Example #1
0
def _dataconvert(args):
    """Convert a tabular data file from one format to another.

    Reads ``args.inpath`` (a local path or a URL) as csv/tsv or xls/xlsx
    and writes it to ``args.outpath`` as csv, json or arff.  An output
    path starting with ``_.`` writes to stdout instead of a file.

    :param args: parsed command-line namespace; uses ``format``,
        ``inpath``, ``outpath``, ``guess_types``, ``sheet``,
        ``encoding`` and ``records``.
    :raises ValueError: if the input or output type is unsupported.
    """
    # Input type: an explicit --format wins, otherwise sniff from the path.
    if args.format:
        intype = args.format
    else:
        intype = guess_type(args.inpath)

    # Output type is always derived from the destination path.
    outtype = guess_type(args.outpath)

    # ARFF attributes are typed, so field-type guessing must be forced on
    # whenever we write ARFF.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    # TSV inputs go through the same csv parser (support restored from the
    # previously commented-out list; presumably the parser sniffs the
    # delimiter — confirm against dcsv.parse).
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']
    if intype in ['text/csv', 'csv'] + tsv_types:
        records, metadata = dcsv.parse(instream, guess_types=args.guess_types)
    elif intype in [
            'application/vnd.ms-excel',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'xls'
    ]:
        import dataconverters.xls
        excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
        records, metadata = dataconverters.xls.parse(
            instream,
            excel_type=excel_type,
            sheet=args.sheet,
            guess_types=args.guess_types,
            encoding=args.encoding)
    else:
        raise ValueError(
            'No support for reading file type %s - support for csv or xls only at present'
            % intype)

    # An outpath of the form '_.<ext>' is the sentinel for "write to stdout".
    if args.outpath.startswith('_.'):
        outstream = sys.stdout
    else:
        outstream = open(args.outpath, 'w')

    # Optionally limit output to the first N records; islice keeps the
    # record stream lazy, so the rest of the input is never parsed.
    if args.records:
        records = itertools.islice(records, int(args.records))

    if outtype == 'text/csv':
        dcsv.write(outstream, records, metadata)
    elif outtype == 'application/json':
        import dataconverters.jsondata as js
        js.write(outstream, records, metadata)
    elif outtype == arff.MIMETYPE:
        arff.write(outstream, records, metadata)
    else:
        raise ValueError('Only support writing to csv and json at present')
Example #2
0
def _dataconvert(args):
    """Parse the input referenced by ``args`` and write it back out in
    the format implied by the output path.

    Supported inputs are csv/tsv (local or URL) and xls/xlsx; supported
    outputs are csv, json and arff.  An output path beginning with
    ``_.`` means stdout.
    """
    # Input type: honour an explicit --format, otherwise sniff the path.
    intype = args.format if args.format else guess_type(args.inpath)

    # Output type comes from the destination path.
    outtype = guess_type(args.outpath)

    # ARFF needs typed attributes, so force field-type guessing on.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    instream = (urllib2.urlopen(args.inpath)
                if is_url_path(args.inpath)
                else open(args.inpath))

    csv_like = ['text/csv', 'csv',
                'tsv', 'text/tsv', 'text/tab-separated-values']
    excel_like = [
        'application/vnd.ms-excel',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'xls',
    ]
    if intype in csv_like:
        records, metadata = dcsv.parse(instream, guess_types=args.guess_types)
    elif intype in excel_like:
        import dataconverters.xls
        if intype == 'application/vnd.ms-excel':
            excel_type = 'xls'
        else:
            excel_type = 'xlsx'
        records, metadata = dataconverters.xls.parse(
            instream,
            excel_type=excel_type,
            sheet=args.sheet,
            guess_types=args.guess_types,
            encoding=args.encoding)
    else:
        raise ValueError(
            'No support for reading file type %s - support for csv or xls only at present' % intype)

    # '_.<ext>' is the sentinel meaning "write to stdout".
    outstream = (sys.stdout
                 if args.outpath.startswith('_.')
                 else open(args.outpath, 'w'))

    # Optionally keep only the first N records (lazy slice).
    if args.records:
        records = itertools.islice(records, int(args.records))

    if outtype == 'text/csv':
        dcsv.write(outstream, records, metadata)
    elif outtype == 'application/json':
        import dataconverters.jsondata as js
        js.write(outstream, records, metadata)
    elif outtype == arff.MIMETYPE:
        arff.write(outstream, records, metadata)
    else:
        raise ValueError('Only support writing to csv and json at present')
Example #3
0
    def test_1(self):
        metadata = {"fields": [{"type": "Integer", "id": u"temperature"}, {"type": "String", "id": u"place"}]}
        records = [
            {"place": u"Cairo", "temperature": 32},
            {"place": u"Alexandria", "temperature": 22},
            {"place": u"Aswan", "temperature": 42},
        ]

        desired_results = """@RELATION testdataset\n
@ATTRIBUTE temperature NUMERIC
@ATTRIBUTE place STRING\n
@DATA\n32,'Cairo'
22,'Alexandria'
42,'Aswan'
"""
        out = StringIO()
        arff.write(out, records, metadata, dataset_name="testdataset")
        out.seek(0)
        result = out.read()

        assert_equal(result, desired_results)
Example #4
0
    def test_1(self):
        metadata = {
            'fields': [{
                'type': 'Integer',
                'id': u'temperature'
            }, {
                'type': 'String',
                'id': u'place'
            }]
        }
        records = [
            {
                'place': u'Cairo',
                'temperature': 32
            },
            {
                'place': u'Alexandria',
                'temperature': 22
            },
            {
                'place': u'Aswan',
                'temperature': 42
            },
        ]

        desired_results = """@RELATION testdataset\n
@ATTRIBUTE temperature NUMERIC
@ATTRIBUTE place STRING\n
@DATA\n32,'Cairo'
22,'Alexandria'
42,'Aswan'
"""
        out = StringIO()
        arff.write(out, records, metadata, dataset_name='testdataset')
        out.seek(0)
        result = out.read()

        assert_equal(result, desired_results)