Example #1
0
def _dataconvert(args):
    """Convert ``args.inpath`` into the format implied by ``args.outpath``.

    The input type is ``args.format`` when that is set, otherwise it is
    guessed from ``args.inpath``. The output type is always guessed from
    ``args.outpath``. The input is fetched with ``urllib2.urlopen`` when
    ``is_url_path`` says it is a URL, and opened as a local file otherwise.
    An ``args.outpath`` starting with ``_.`` writes to ``sys.stdout``
    instead of a file.

    Reads ``args.format``, ``args.inpath``, ``args.outpath``,
    ``args.guess_types``, ``args.sheet``, ``args.encoding`` and
    ``args.records``. Sets ``args.guess_types = True`` when the output
    type is ARFF.

    Raises:
        ValueError: if the input type or the output type is not supported.
            The output file is not created or truncated in that case.
    """
    # What is the type of input file?
    if args.format:
        intype = args.format
    else:
        intype = guess_type(args.inpath)

    # What is the type of output file?
    outtype = guess_type(args.outpath)

    # If outtype is ARFF then we need to guess field-types.
    # Thus we overwrite the args.guess_types to True.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    # Only a file opened here is bound to ``outfile``; sys.stdout never is,
    # so the cleanup below cannot close the process's standard output.
    outfile = None
    try:
        if intype in ['text/csv', 'csv']:
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Choose the writer BEFORE opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError(
                'Only support writing to csv, json and arff at present')

        if args.outpath.startswith('_.'):
            outstream = sys.stdout
        else:
            outstream = outfile = open(args.outpath, 'w')

        # ``records`` may still be reading from ``instream`` at this point,
        # which is why the input is only closed after the write completes.
        write(outstream, records, metadata)
    finally:
        try:
            if outfile is not None:
                outfile.close()
        finally:
            instream.close()
Example #2
0
 def test_1(self):
     # A non-ASCII unicode cell (u'\x9f') is expected to come out of the
     # CSV writer as the two bytes '\xc2\x9f'.
     rows = [{'A': u'\x9f', 'B': 2}, {'A': 2, 'B': 3}]
     meta = {'fields': [{'id': column} for column in ('A', 'B')]}
     expected = '''A,B\r\n\xc2\x9f,2\r\n2,3\r\n'''

     buf = StringIO()
     csvconvert.write(buf, rows, meta)

     # Rewind so the whole of what was written is read back.
     buf.seek(0)
     assert_equal(buf.read(), expected)
Example #3
0
def _dataconvert(args):
    """Convert ``args.inpath`` into the format implied by ``args.outpath``.

    The input type is ``args.format`` when that is set, otherwise it is
    guessed from ``args.inpath``. The output type is always guessed from
    ``args.outpath``. The input is fetched with ``urllib2.urlopen`` when
    ``is_url_path`` says it is a URL, and opened as a local file otherwise.
    An ``args.outpath`` starting with ``_.`` writes to ``sys.stdout``
    instead of a file.

    Reads ``args.format``, ``args.inpath``, ``args.outpath``,
    ``args.guess_types``, ``args.sheet``, ``args.encoding`` and
    ``args.records``. Sets ``args.guess_types = True`` when the output
    type is ARFF.

    Raises:
        ValueError: if the input type or the output type is not supported.
            The output file is not created or truncated in that case.
    """
    # What is the type of input file?
    if args.format:
        intype = args.format
    else:
        intype = guess_type(args.inpath)

    # What is the type of output file?
    outtype = guess_type(args.outpath)

    # If outtype is ARFF then we need to guess field-types.
    # Thus we overwrite the args.guess_types to True.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    # Only a file opened here is bound to ``outfile``; sys.stdout never is,
    # so the cleanup below cannot close the process's standard output.
    outfile = None
    try:
        # NOTE(review): TSV input goes through the same dcsv.parse call as
        # CSV, with no delimiter argument - confirm that dcsv.parse copes
        # with tab-separated input on its own.
        tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']
        if intype in ['text/csv', 'csv'] + tsv_types:
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Choose the writer BEFORE opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError(
                'Only support writing to csv, json and arff at present')

        if args.outpath.startswith('_.'):
            outstream = sys.stdout
        else:
            outstream = outfile = open(args.outpath, 'w')

        # ``records`` may still be reading from ``instream`` at this point,
        # which is why the input is only closed after the write completes.
        write(outstream, records, metadata)
    finally:
        try:
            if outfile is not None:
                outfile.close()
        finally:
            instream.close()
Example #4
0
 def test_1(self):
     # Two integer-valued rows over columns A and B should be written as a
     # header line plus one line per row, each terminated by \r\n.
     meta = {'fields': [{'id': column} for column in ('A', 'B')]}
     rows = [{'A': 1, 'B': 2}, {'A': 2, 'B': 3}]
     expected = '''A,B\r\n1,2\r\n2,3\r\n'''

     buf = StringIO()
     csvconvert.write(buf, rows, meta)

     # Rewind so the whole of what was written is read back.
     buf.seek(0)
     assert_equal(buf.read(), expected)
Example #5
0
def main():
    """Command-line entry point: convert ``inpath`` to ``outpath``.

    Parses ``inpath``, ``outpath``, ``--no-guess-types``, ``--sheet`` and
    ``--records`` from the command line. Input and output types are both
    guessed from the respective paths. The input is fetched with
    ``urllib2.urlopen`` when ``is_url_path`` says it is a URL, and opened
    as a local file otherwise. An ``outpath`` starting with ``_.`` writes
    to ``sys.stdout`` instead of a file.

    Raises:
        ValueError: if the input type or the output type is not supported.
            The output file is not created or truncated in that case.
    """
    description = '''Convert data between formats. Supported formats:

    Input:  csv, tsv, excel (xls, xlsx).
    Output: csv, json

Examples
========

dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv

Help
====
'''
    epilog = '''Copyright Open Knowledge Foundation 2007-2013. Licensed under the MIT license.
Part of the DataConverters project: https://github.com/okfn/dataconverters'''
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('inpath', metavar='inpath', type=str,
                        help='in file path or url')
    parser.add_argument('outpath', metavar='outpath', type=str,
                        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument('--no-guess-types', dest='guess_types',
                        action='store_false',
                        help='''Disable type-guessing (where it is used e.g. with CSVs). Type guessing may significantly affect performance''',
                        default=True)
    parser.add_argument('--sheet', metavar='NUM',
                        help='''Index of sheet in spreadsheet to convert (index starts at 1)''',
                        default=1)
    parser.add_argument('--records', metavar='NUM',
                        help='''Only convert a maximum of NUM records''')

    args = parser.parse_args()
    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    # Only a file opened here is bound to ``outfile``; sys.stdout never is,
    # so the cleanup below cannot close the process's standard output.
    outfile = None
    try:
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Choose the writer BEFORE opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind. The
        # failure is raised (as for unsupported input) rather than printed,
        # so the process does not exit successfully after converting nothing.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            raise ValueError('Only support writing to csv and json at present')

        if args.outpath.startswith('_.'):
            outstream = sys.stdout
        else:
            outstream = outfile = open(args.outpath, 'w')

        # ``records`` may still be reading from ``instream`` at this point,
        # which is why the input is only closed after the write completes.
        write(outstream, records, metadata)
    finally:
        try:
            if outfile is not None:
                outfile.close()
        finally:
            instream.close()
Example #6
0
def main():
    """Command-line entry point: convert ``inpath`` to ``outpath``.

    Parses ``inpath``, ``outpath``, ``--no-guess-types``, ``--sheet`` and
    ``--records`` from the command line. Input and output types are both
    guessed from the respective paths. The input is fetched with
    ``urllib2.urlopen`` when ``is_url_path`` says it is a URL, and opened
    as a local file otherwise. An ``outpath`` starting with ``_.`` writes
    to ``sys.stdout`` instead of a file.

    All records are converted unless ``--records NUM`` is given.

    Raises:
        ValueError: if the input type or the output type is not supported.
            The output file is not created or truncated in that case.
    """
    description = '''Convert data between formats. Supported formats:

    Input:  csv, tsv, excel (xls, xlsx).
    Output: csv, json

Examples
========

dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv

Help
====
'''
    epilog = '''Copyright Open Knowledge Foundation 2007-2013. Licensed under the MIT license.
Part of the DataConverters project: https://github.com/okfn/dataconverters'''
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('inpath', metavar='inpath', type=str,
                        help='in file path or url')
    parser.add_argument('outpath', metavar='outpath', type=str,
                        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument('--no-guess-types', dest='guess_types',
                        action='store_false',
                        help='''Disable type-guessing (where it is used e.g. with CSVs). Type guessing may significantly affect performance''',
                        default=True)
    parser.add_argument('--sheet', metavar='NUM',
                        help='''Index of sheet in spreadsheet to convert (index starts at 1)''',
                        default=1)
    # No default here: a default of 1 would silently cut every conversion
    # down to a single record whenever --records is not passed.
    parser.add_argument('--records', metavar='NUM',
                        help='''Only convert a maximum of NUM records''')

    args = parser.parse_args()
    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    # Only a file opened here is bound to ``outfile``; sys.stdout never is,
    # so the cleanup below cannot close the process's standard output.
    outfile = None
    try:
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Choose the writer BEFORE opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind. The
        # failure is raised (as for unsupported input) rather than printed,
        # so the process does not exit successfully after converting nothing.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            raise ValueError('Only support writing to csv and json at present')

        if args.outpath.startswith('_.'):
            outstream = sys.stdout
        else:
            outstream = outfile = open(args.outpath, 'w')

        # ``records`` may still be reading from ``instream`` at this point,
        # which is why the input is only closed after the write completes.
        write(outstream, records, metadata)
    finally:
        try:
            if outfile is not None:
                outfile.close()
        finally:
            instream.close()