def _dataconvert(args):
    """Convert the data at ``args.inpath`` and write it to ``args.outpath``.

    ``args`` is an attribute container (presumably the parsed command-line
    namespace -- confirm against the caller). The attributes read are:

    * ``format``      -- explicit input type; when falsy the type is guessed
      from ``inpath``.
    * ``inpath``      -- local path or URL of the input.
    * ``outpath``     -- output path; a name starting with ``_.`` selects
      ``sys.stdout`` and the remainder is only used to guess the output type.
    * ``guess_types`` -- passed to the parsers; forced to ``True`` (and
      written back onto ``args``) when the output type is ARFF.
    * ``sheet``, ``encoding`` -- passed to the Excel parser only.
    * ``records``     -- optional maximum number of records to convert.

    Raises:
        ValueError: if the input type or the output type is not supported.
    """
    # What is the type of input file?
    intype = args.format if args.format else guess_type(args.inpath)

    # What is the type of output file?
    outtype = guess_type(args.outpath)

    # If outtype is ARFF then we need to guess field-types.
    # Thus we overwrite the args.guess_types to True.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype in ('text/csv', 'csv'):
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in (
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls',
                'xlsx'):
            import dataconverters.xls
            # The short alias 'xls' must select the legacy reader as well;
            # previously only the MIME type did, so ``--format xls`` was
            # parsed as xlsx.
            if intype in ('application/vnd.ms-excel', 'xls'):
                excel_type = 'xls'
            else:
                excel_type = 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer *before* opening the output so that an unsupported
        # output type does not leave behind an empty/truncated file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError(
                'Only support writing to csv, json and arff at present')

        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close the interpreter's stdout, only files we opened.
            if not to_stdout:
                outstream.close()
    finally:
        instream.close()
def _dataconvert(args):
    """Convert the data at ``args.inpath`` and write it to ``args.outpath``.

    ``args`` is an attribute container (presumably the parsed command-line
    namespace -- confirm against the caller). The attributes read are:

    * ``format``      -- explicit input type; when falsy the type is guessed
      from ``inpath``.
    * ``inpath``      -- local path or URL of the input.
    * ``outpath``     -- output path; a name starting with ``_.`` selects
      ``sys.stdout`` and the remainder is only used to guess the output type.
    * ``guess_types`` -- passed to the parsers; forced to ``True`` (and
      written back onto ``args``) when the output type is ARFF.
    * ``sheet``, ``encoding`` -- passed to the Excel parser only.
    * ``records``     -- optional maximum number of records to convert.

    CSV and TSV inputs are both handed to ``dcsv.parse`` with the same
    arguments.

    Raises:
        ValueError: if the input type or the output type is not supported.
    """
    # What is the type of input file?
    intype = args.format if args.format else guess_type(args.inpath)

    # What is the type of output file?
    outtype = guess_type(args.outpath)

    # If outtype is ARFF then we need to guess field-types.
    # Thus we overwrite the args.guess_types to True.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    csv_types = ['text/csv', 'csv']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']
    xls_types = ['application/vnd.ms-excel', 'xls']
    xlsx_types = [
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'xlsx',
    ]

    try:
        if intype in csv_types + tsv_types:
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in xls_types + xlsx_types:
            import dataconverters.xls
            # The short alias 'xls' must select the legacy reader as well;
            # previously only the MIME type did, so ``--format xls`` was
            # parsed as xlsx.
            excel_type = 'xls' if intype in xls_types else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer *before* opening the output so that an unsupported
        # output type does not leave behind an empty/truncated file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError(
                'Only support writing to csv, json and arff at present')

        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close the interpreter's stdout, only files we opened.
            if not to_stdout:
                outstream.close()
    finally:
        instream.close()
def test_4_empty_title_convert_csv(self):
    """Test parsing a CSV whose header row contains an empty column title.

    The expected metadata shows the untitled column reported as
    ``column_1``.
    """
    path = os.path.join(self.testdata_path, 'simple_empty_title.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, guess_types=False)
        assert_equal(
            [{"id": u"date"}, {"id": u"column_1"}, {"id": u"temperature"},
             {"id": u"place"}],
            metadata['fields'])
        content = list(iterator)
    assert ({u"date": u"2011-01-03", u"place": u"Berkeley",
             u"temperature": u"5", u"column_1": u""} in content)
def test_csv_from_ressource():
    """Download a CSV file from a remote resource and convert it to JSON.

    The result is written to ``some_json.json`` in the current working
    directory. This test needs network access.
    """
    import contextlib

    url = "https://ckannet-storage.commondatastorage.googleapis.com/2013-05-02T185247/Valeurs_ajoutees_par_branches_dactivites_aux_prix_constants_de_1999_en_milliards_FCFA).csv"
    # closing() releases the connection whether or not the URL handle is
    # itself a context manager.
    with contextlib.closing(urllib.urlopen(url)) as instream:
        records, metadata = dcsv.parse(instream, guess_types=True)
        # Write inside the download block: the records may be read lazily
        # from the still-open connection. The ``with`` also makes sure the
        # JSON is flushed and the file closed.
        with open("some_json.json", 'w') as outstream:
            js.write(outstream, records, metadata)
def test_5_header_type(self):
    """Test field types and typed values when parsing with ``header_type=1``."""
    path = os.path.join(self.testdata_path, 'simple.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, header_type=1)
        assert_equal(
            [{'type': 'DateTime', 'id': u'date'},
             {'id': u'temperature', 'type': 'Integer'},
             {'id': u'place', 'type': 'String'}],
            metadata['fields'])
        rows = list(iterator)
    assert_equal(len(rows), 6)
    assert_equal(
        {u'date': datetime.datetime(2011, 1, 3), u'place': u'Berkeley',
         u'temperature': 5},
        rows[5])
def test_1_convert_csv(self):
    """Test parsing a tab-separated file with type guessing disabled.

    All values are expected to come back as unicode strings.
    """
    path = os.path.join(self.testdata_path, 'simple.tsv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(
            stream, delimiter='\t', guess_types=False)
        assert_equal(
            [{'id': u'date'}, {'id': u'temperature'}, {'id': u'place'}],
            metadata['fields'])
        rows = list(iterator)
    assert_equal(len(rows), 6)
    assert ({u'date': u'2011-01-03', u'place': u'Berkeley',
             u'temperature': u'5'} in rows)
def test_2_convert_csv_strict(self):
    """Test parsing a CSV with ``strict_type_guess=True``.

    Field metadata is expected to carry the guessed types and the row
    values are expected to be converted accordingly.
    """
    path = os.path.join(self.testdata_path, 'simple.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, strict_type_guess=True)
        assert_equal(
            [{'id': u'date', 'type': 'DateTime'},
             {'id': u'temperature', 'type': 'Integer'},
             {'id': u'place', 'type': 'String'}],
            metadata['fields'])
        rows = list(iterator)
    assert_equal(len(rows), 6)
    assert ({u'date': datetime.datetime(2011, 1, 3, 0, 0),
             u'place': u'Berkeley', u'temperature': 5} in rows)
def test_2_unicode_csv(self):
    """Test parsing a CSV containing non-ASCII (Spanish) characters."""
    path = os.path.join(self.testdata_path, 'spanish_chars.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, guess_types=False)
        assert_equal(
            [{"id": u"GF_ID"}, {"id": u"FN_ID"}, {"id": u"SF_ID"},
             {"id": u"GF"}, {"id": u"F"}, {"id": u"SF"},
             {"id": u"Gasto total 2011"}, {"id": u"Descripci\u00f3n"}],
            metadata['fields'])
        content = list(iterator)
    # A footnote row: only the first column is filled in.
    expected_row = {
        u"Gasto total 2011": u"",
        u"F": u"",
        u"Descripci\u00f3n": "",
        u"SF_ID": u"",
        u"GF_ID": u"Fuente: Presupuesto de Egresos de"
                  u" la Federaci\u00f3n 2011 An\u00e1lisis de las Funciones y "
                  u"Subfunciones del Gasto Programable por Destino del Gasto "
                  u"(neto) y Manual de Programaci\u00f3n y Presupuesto 2011 "
                  u"Anexo 11 Cat\u00e1logo Funcional ",
        u"GF": "",
        u"FN_ID": u"",
        u"SF": u"",
    }
    assert (expected_row in content)
def test_3_unicode_csv(self):
    """Test parsing a CSV containing non-ASCII (Spanish) characters."""
    path = os.path.join(self.testdata_path, 'spanish_chars.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, guess_types=False)
        assert_equal(
            [{"id": u"GF_ID"}, {"id": u"FN_ID"}, {"id": u"SF_ID"},
             {"id": u"GF"}, {"id": u"F"}, {"id": u"SF"},
             {"id": u"Gasto total 2011"}, {"id": u"Descripci\u00f3n"}],
            metadata['fields'])
        content = list(iterator)
    # A footnote row: only the first column is filled in.
    expected_row = {
        u"Gasto total 2011": u"",
        u"F": u"",
        u"Descripci\u00f3n": "",
        u"SF_ID": u"",
        u"GF_ID": u"Fuente: Presupuesto de Egresos de"
                  u" la Federaci\u00f3n 2011 An\u00e1lisis de las Funciones y "
                  u"Subfunciones del Gasto Programable por Destino del Gasto "
                  u"(neto) y Manual de Programaci\u00f3n y Presupuesto 2011 "
                  u"Anexo 11 Cat\u00e1logo Funcional ",
        u"GF": "",
        u"FN_ID": u"",
        u"SF": u"",
    }
    assert (expected_row in content)
def test_1_convert_csv(self):
    """Test parsing a tab-separated file with type guessing disabled.

    All values are expected to come back as unicode strings.
    """
    path = os.path.join(self.testdata_path, 'simple.tsv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(
            stream, delimiter='\t', guess_types=False)
        assert_equal(
            [{'id': u'date'}, {'id': u'temperature'}, {'id': u'place'}],
            metadata['fields'])
        rows = list(iterator)
    assert_equal(len(rows), 6)
    assert ({u'date': u'2011-01-03', u'place': u'Berkeley',
             u'temperature': u'5'} in rows)
def test_4_empty_title_convert_csv(self):
    """Test parsing a CSV whose header row contains an empty column title.

    The expected metadata shows the untitled column reported as
    ``column_1``.
    """
    path = os.path.join(self.testdata_path, 'simple_empty_title.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, guess_types=False)
        assert_equal(
            [{"id": u"date"}, {"id": u"column_1"}, {"id": u"temperature"},
             {"id": u"place"}],
            metadata['fields'])
        content = list(iterator)
    assert ({u"date": u"2011-01-03", u"place": u"Berkeley",
             u"temperature": u"5", u"column_1": u""} in content)
def test_2_convert_csv_strict(self):
    """Test parsing a CSV with ``strict_type_guess=True``.

    Field metadata is expected to carry the guessed types and the row
    values are expected to be converted accordingly.
    """
    path = os.path.join(self.testdata_path, 'simple.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, strict_type_guess=True)
        assert_equal(
            [{'id': u'date', 'type': 'DateTime'},
             {'id': u'temperature', 'type': 'Integer'},
             {'id': u'place', 'type': 'String'}],
            metadata['fields'])
        rows = list(iterator)
    assert_equal(len(rows), 6)
    assert ({u'date': datetime.datetime(2011, 1, 3, 0, 0),
             u'place': u'Berkeley', u'temperature': 5} in rows)
def test_5_header_type(self):
    """Test field types and typed values when parsing with ``header_type=1``."""
    path = os.path.join(self.testdata_path, 'simple.csv')
    # Rows may be produced lazily, so read them all while the file is still
    # open; the ``with`` block then guarantees the handle is released.
    with open(path) as stream:
        iterator, metadata = csvconvert.parse(stream, header_type=1)
        assert_equal(
            [{'type': 'DateTime', 'id': u'date'},
             {'id': u'temperature', 'type': 'Integer'},
             {'id': u'place', 'type': 'String'}],
            metadata['fields'])
        rows = list(iterator)
    assert_equal(len(rows), 6)
    assert_equal(
        {u'date': datetime.datetime(2011, 1, 3), u'place': u'Berkeley',
         u'temperature': 5},
        rows[5])
def main():
    """Command-line entry point: convert ``inpath`` to ``outpath``.

    Parses ``sys.argv``, guesses the input and output types from the two
    paths, parses the input (CSV or Excel) and writes it as CSV or JSON.
    An output path starting with ``_.`` writes to ``sys.stdout``.

    An unsupported output type prints a message and returns without
    creating the output file.

    Raises:
        ValueError: if the input type is not supported.
    """
    description = (
        'Convert data between formats. Supported formats:\n'
        '\n'
        '    Input: csv, tsv, excel (xls, xlsx).\n'
        '    Output: csv, json\n'
        '\n'
        'Examples\n'
        '========\n'
        '\n'
        'dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv\n'
        '\n'
        'Help\n'
        '====\n'
    )
    epilog = (
        'Copyright Open Knowledge Foundation 2007-2013. '
        'Licensed under the MIT license.\n'
        '\n'
        'Part of the DataConverters project: '
        'https://github.com/okfn/dataconverters'
    )
    # Raw formatter keeps the hand-made line breaks of description/epilog.
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        'inpath', metavar='inpath', type=str,
        help='in file path or url')
    parser.add_argument(
        'outpath', metavar='outpath', type=str,
        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument(
        '--no-guess-types', dest='guess_types', action='store_false',
        default=True,
        help='Disable type-guessing (where it is used e.g. with CSVs). '
             'Type guessing may significantly affect performance')
    parser.add_argument(
        '--sheet', metavar='NUM', default=1,
        help='Index of sheet in spreadsheet to convert (index starts at 1)')
    parser.add_argument(
        '--records', metavar='NUM',
        help='Only convert a maximum of NUM records')

    args = parser.parse_args()

    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in (
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'):
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer *before* opening the output so that an unsupported
        # output type does not leave behind an empty/truncated file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            # Parenthesised single argument prints identically on Python 2
            # and Python 3.
            print('Only support writing to csv and json at present')
            return

        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close the interpreter's stdout, only files we opened.
            if not to_stdout:
                outstream.close()
    finally:
        instream.close()
def main():
    """Command-line entry point: convert ``inpath`` to ``outpath``.

    Parses ``sys.argv``, guesses the input and output types from the two
    paths, parses the input (CSV or Excel) and writes it as CSV or JSON.
    An output path starting with ``_.`` writes to ``sys.stdout``.

    An unsupported output type prints a message and returns without
    creating the output file.

    Raises:
        ValueError: if the input type is not supported.
    """
    description = (
        'Convert data between formats. Supported formats:\n'
        '\n'
        '    Input: csv, tsv, excel (xls, xlsx).\n'
        '    Output: csv, json\n'
        '\n'
        'Examples\n'
        '========\n'
        '\n'
        'dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv\n'
        '\n'
        'Help\n'
        '====\n'
    )
    epilog = (
        'Copyright Open Knowledge Foundation 2007-2013. '
        'Licensed under the MIT license.\n'
        '\n'
        'Part of the DataConverters project: '
        'https://github.com/okfn/dataconverters'
    )
    # Raw formatter keeps the hand-made line breaks of description/epilog.
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        'inpath', metavar='inpath', type=str,
        help='in file path or url')
    parser.add_argument(
        'outpath', metavar='outpath', type=str,
        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument(
        '--no-guess-types', dest='guess_types', action='store_false',
        default=True,
        help='Disable type-guessing (where it is used e.g. with CSVs). '
             'Type guessing may significantly affect performance')
    parser.add_argument(
        '--sheet', metavar='NUM', default=1,
        help='Index of sheet in spreadsheet to convert (index starts at 1)')
    # No default: a default of 1 silently truncated every conversion to a
    # single record unless --records was given explicitly.
    parser.add_argument(
        '--records', metavar='NUM', default=None,
        help='Only convert a maximum of NUM records')

    args = parser.parse_args()

    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in (
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'):
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer *before* opening the output so that an unsupported
        # output type does not leave behind an empty/truncated file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            # Parenthesised single argument prints identically on Python 2
            # and Python 3.
            print('Only support writing to csv and json at present')
            return

        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close the interpreter's stdout, only files we opened.
            if not to_stdout:
                outstream.close()
    finally:
        instream.close()