コード例 #1
0
def _dataconvert(args):
    """Convert the input named by ``args.inpath`` into the file ``args.outpath``.

    The input type is ``args.format`` when given, otherwise it is guessed
    from ``args.inpath``; the output type is always guessed from
    ``args.outpath``.  An output path starting with ``'_.'`` writes to
    standard output instead of a file.

    Attributes read from ``args``: ``format``, ``inpath``, ``outpath``,
    ``guess_types`` (overwritten with ``True`` for ARFF output), ``sheet``,
    ``encoding`` and ``records``.

    Raises:
        ValueError: if the input type or the output type is not handled.
    """
    # An explicit format takes precedence over guessing from the path.
    intype = args.format if args.format else guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    # ARFF output needs field types, so type guessing is forced on.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype in ['text/csv', 'csv']:
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls'
        ]:
            import dataconverters.xls
            # The short name 'xls' denotes the legacy format just like the
            # ms-excel MIME type; only the openxml MIME type is xlsx.
            if intype in ('application/vnd.ms-excel', 'xls'):
                excel_type = 'xls'
            else:
                excel_type = 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer before touching the output path so an unsupported
        # output type does not create or truncate a file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError('Only support writing to csv and json at present')

        if args.outpath.startswith('_.'):
            write(sys.stdout, records, metadata)
        else:
            with open(args.outpath, 'w') as outstream:
                write(outstream, records, metadata)
    finally:
        # The input is closed only after writing has finished, in case the
        # records are read from the stream lazily.
        instream.close()
コード例 #2
0
ファイル: __init__.py プロジェクト: bqevin/dataconverters
def _dataconvert(args):
    """Convert the input named by ``args.inpath`` into the file ``args.outpath``.

    The input type is ``args.format`` when given, otherwise it is guessed
    from ``args.inpath``; the output type is always guessed from
    ``args.outpath``.  An output path starting with ``'_.'`` writes to
    standard output instead of a file.

    Attributes read from ``args``: ``format``, ``inpath``, ``outpath``,
    ``guess_types`` (overwritten with ``True`` for ARFF output), ``sheet``,
    ``encoding`` and ``records``.

    Raises:
        ValueError: if the input type or the output type is not handled.
    """
    # An explicit format takes precedence over guessing from the path.
    intype = args.format if args.format else guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    # ARFF output needs field types, so type guessing is forced on.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']
        if intype in ['text/csv', 'csv'] + tsv_types:
            # NOTE(review): TSV input is parsed without an explicit
            # delimiter - confirm dcsv.parse detects tabs on its own.
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls'
        ]:
            import dataconverters.xls
            # The short name 'xls' denotes the legacy format just like the
            # ms-excel MIME type; only the openxml MIME type is xlsx.
            if intype in ('application/vnd.ms-excel', 'xls'):
                excel_type = 'xls'
            else:
                excel_type = 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer before touching the output path so an unsupported
        # output type does not create or truncate a file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError('Only support writing to csv and json at present')

        if args.outpath.startswith('_.'):
            write(sys.stdout, records, metadata)
        else:
            with open(args.outpath, 'w') as outstream:
                write(outstream, records, metadata)
    finally:
        # The input is closed only after writing has finished, in case the
        # records are read from the stream lazily.
        instream.close()
コード例 #3
0
 def test_4_empty_title_convert_csv(self):
     """Parse a CSV with an empty header cell and check the field ids.

     The expected metadata contains a ``column_1`` id between ``date`` and
     ``temperature``, and the sample row maps it to an empty string.
     """
     path = os.path.join(self.testdata_path, 'simple_empty_title.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file, guess_types=False)
         assert_equal(
             [{"id": u"date"}, {"id": u"column_1"},
              {"id": u"temperature"}, {"id": u"place"}],
             metadata['fields'])
         content = list(iterator)
     assert ({u"date": u"2011-01-03", u"place": u"Berkeley",
              u"temperature": u"5", u"column_1": u""} in content)
コード例 #4
0
def test_csv_from_ressource():
    """Download a remote CSV, parse it and write it out as JSON.

    The CSV is fetched over HTTP, parsed with type guessing enabled and the
    result is written to ``some_json.json`` in the current directory.
    """
    url = "https://ckannet-storage.commondatastorage.googleapis.com/2013-05-02T185247/Valeurs_ajoutees_par_branches_dactivites_aux_prix_constants_de_1999_en_milliards_FCFA).csv"
    instream = urllib.urlopen(url)
    try:
        records, metadata = dcsv.parse(instream, guess_types=True)
        # Closing the output file guarantees the JSON is flushed to disk.
        with open("some_json.json", 'w') as outstream:
            js.write(outstream, records, metadata)
    finally:
        # Close the download only after writing, in case records are read
        # from the stream lazily.
        instream.close()
コード例 #5
0
 def test_5_header_type(self):
     """Parse ``simple.csv`` with ``header_type=1`` and check typed output.

     Verifies the field types reported in the metadata, the number of rows
     and the converted values of the sixth row.
     """
     path = os.path.join(self.testdata_path, 'simple.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file, header_type=1)
         assert_equal(
             [{'type': 'DateTime', 'id': u'date'},
              {'id': u'temperature', 'type': 'Integer'},
              {'id': u'place', 'type': 'String'}],
             metadata['fields'])
         rows = list(iterator)
     assert_equal(len(rows), 6)
     assert_equal(
         {u'date': datetime.datetime(2011, 1, 3), u'place': u'Berkeley',
          u'temperature': 5},
         rows[5])
コード例 #6
0
 def test_1_convert_csv(self):
     """Parse the tab-separated ``simple.tsv`` without type guessing.

     Verifies the field ids in the metadata, the number of rows and that a
     sample row is present with all values left as strings.
     """
     path = os.path.join(self.testdata_path, 'simple.tsv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as tsv_file:
         iterator, metadata = csvconvert.parse(
             tsv_file, delimiter='\t', guess_types=False)
         assert_equal(
             [{'id': u'date'}, {'id': u'temperature'}, {'id': u'place'}],
             metadata['fields'])
         rows = list(iterator)
     assert_equal(len(rows), 6)
     assert ({u'date': u'2011-01-03', u'place': u'Berkeley',
              u'temperature': u'5'} in rows)
コード例 #7
0
 def test_2_convert_csv_strict(self):
     """Parse ``simple.csv`` with ``strict_type_guess=True``.

     Verifies the field types reported in the metadata, the number of rows
     and that a sample row is present with converted values.
     """
     path = os.path.join(self.testdata_path, 'simple.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(
             csv_file, strict_type_guess=True)
         assert_equal(
             [{'id': u'date', 'type': 'DateTime'},
              {'id': u'temperature', 'type': 'Integer'},
              {'id': u'place', 'type': 'String'}],
             metadata['fields'])
         rows = list(iterator)
     assert_equal(len(rows), 6)
     assert ({u'date': datetime.datetime(2011, 1, 3, 0, 0),
              u'place': u'Berkeley', u'temperature': 5} in rows)
コード例 #8
0
 def test_2_unicode_csv(self):
     """Parse ``spanish_chars.csv`` and check non-ASCII text survives.

     Verifies the field ids (including an accented one) and that a row
     holding a long accented footnote in ``GF_ID`` is present.
     """
     path = os.path.join(self.testdata_path, 'spanish_chars.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file, guess_types=False)
         assert_equal(
             [{"id": u"GF_ID"}, {"id": u"FN_ID"}, {"id": u"SF_ID"},
              {"id": u"GF"}, {"id": u"F"}, {"id": u"SF"},
              {"id": u"Gasto total 2011"}, {"id": u"Descripci\u00f3n"}],
             metadata['fields'])
         content = list(iterator)
     assert ({u"Gasto total 2011": u"", u"F": u"", u"Descripci\u00f3n": "",
              u"SF_ID": u"", u"GF_ID": u"Fuente: Presupuesto de Egresos de"
              u" la Federaci\u00f3n 2011 An\u00e1lisis de las Funciones y "
              u"Subfunciones del Gasto Programable por Destino del Gasto "
              u"(neto) y Manual de Programaci\u00f3n y Presupuesto 2011 "
              u"Anexo 11 Cat\u00e1logo Funcional ", u"GF": "", u"FN_ID":
              u"", u"SF": u""} in content)
コード例 #9
0
 def test_3_unicode_csv(self):
     """Parse ``spanish_chars.csv`` and check non-ASCII text survives.

     Verifies the field ids (including an accented one) and that a row
     holding a long accented footnote in ``GF_ID`` is present.
     """
     path = os.path.join(self.testdata_path, 'spanish_chars.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file, guess_types=False)
         assert_equal(
             [{"id": u"GF_ID"}, {"id": u"FN_ID"}, {"id": u"SF_ID"},
              {"id": u"GF"}, {"id": u"F"}, {"id": u"SF"},
              {"id": u"Gasto total 2011"}, {"id": u"Descripci\u00f3n"}],
             metadata['fields'])
         content = list(iterator)
     assert ({
         u"Gasto total 2011": u"",
         u"F": u"",
         u"Descripci\u00f3n": "",
         u"SF_ID": u"",
         u"GF_ID":
         u"Fuente: Presupuesto de Egresos de"
         u" la Federaci\u00f3n 2011 An\u00e1lisis de las Funciones y "
         u"Subfunciones del Gasto Programable por Destino del Gasto "
         u"(neto) y Manual de Programaci\u00f3n y Presupuesto 2011 "
         u"Anexo 11 Cat\u00e1logo Funcional ",
         u"GF": "",
         u"FN_ID": u"",
         u"SF": u""
     } in content)
コード例 #10
0
 def test_1_convert_csv(self):
     """Parse the tab-separated ``simple.tsv`` without type guessing.

     Verifies the field ids in the metadata, the number of rows and that a
     sample row is present with all values left as strings.
     """
     path = os.path.join(self.testdata_path, 'simple.tsv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as tsv_file:
         iterator, metadata = csvconvert.parse(tsv_file,
                                               delimiter='\t',
                                               guess_types=False)
         assert_equal(
             [{'id': u'date'}, {'id': u'temperature'}, {'id': u'place'}],
             metadata['fields'])
         rows = list(iterator)
     assert_equal(len(rows), 6)
     assert ({
         u'date': u'2011-01-03',
         u'place': u'Berkeley',
         u'temperature': u'5'
     } in rows)
コード例 #11
0
 def test_4_empty_title_convert_csv(self):
     """Parse a CSV with an empty header cell and check the field ids.

     The expected metadata contains a ``column_1`` id between ``date`` and
     ``temperature``, and the sample row maps it to an empty string.
     """
     path = os.path.join(self.testdata_path, 'simple_empty_title.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file, guess_types=False)
         assert_equal(
             [{"id": u"date"}, {"id": u"column_1"},
              {"id": u"temperature"}, {"id": u"place"}],
             metadata['fields'])
         content = list(iterator)
     assert ({
         u"date": u"2011-01-03",
         u"place": u"Berkeley",
         u"temperature": u"5",
         u"column_1": u""
     } in content)
コード例 #12
0
 def test_2_convert_csv_strict(self):
     """Parse ``simple.csv`` with ``strict_type_guess=True``.

     Verifies the field types reported in the metadata, the number of rows
     and that a sample row is present with converted values.
     """
     path = os.path.join(self.testdata_path, 'simple.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file,
                                               strict_type_guess=True)
         assert_equal(
             [{'id': u'date', 'type': 'DateTime'},
              {'id': u'temperature', 'type': 'Integer'},
              {'id': u'place', 'type': 'String'}],
             metadata['fields'])
         rows = list(iterator)
     assert_equal(len(rows), 6)
     assert ({
         u'date': datetime.datetime(2011, 1, 3, 0, 0),
         u'place': u'Berkeley',
         u'temperature': 5
     } in rows)
コード例 #13
0
 def test_5_header_type(self):
     """Parse ``simple.csv`` with ``header_type=1`` and check typed output.

     Verifies the field types reported in the metadata, the number of rows
     and the converted values of the sixth row.
     """
     path = os.path.join(self.testdata_path, 'simple.csv')
     # Rows are materialised inside the ``with`` block in case the iterator
     # reads from the file lazily; the handle is closed afterwards.
     with open(path) as csv_file:
         iterator, metadata = csvconvert.parse(csv_file, header_type=1)
         assert_equal(
             [{'type': 'DateTime', 'id': u'date'},
              {'id': u'temperature', 'type': 'Integer'},
              {'id': u'place', 'type': 'String'}],
             metadata['fields'])
         rows = list(iterator)
     assert_equal(len(rows), 6)
     assert_equal(
         {
             u'date': datetime.datetime(2011, 1, 3),
             u'place': u'Berkeley',
             u'temperature': 5
         }, rows[5])
コード例 #14
0
ファイル: cli.py プロジェクト: Web5design/dataconverters
def main():
    """Command-line entry point: convert ``inpath`` to ``outpath``.

    Parses the command line, guesses the input and output types from the two
    paths, parses the input (local file or URL) and writes the records in
    the output format.  An output path starting with ``'_.'`` writes to
    standard output.  An unsupported output type only prints a message.

    Raises:
        ValueError: if the guessed input type is not handled.
    """
    description = '''Convert data between formats. Supported formats:

    Input:  csv, tsv, excel (xls, xlsx).
    Output: csv, json

Examples
========

dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv

Help
====
'''
    epilog = '''Copyright Open Knowledge Foundation 2007-2013. Licensed under the MIT license.
Part of the DataConverters project: https://github.com/okfn/dataconverters'''
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('inpath', metavar='inpath', type=str,
                        help='in file path or url')
    parser.add_argument('outpath', metavar='outpath', type=str,
                        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument('--no-guess-types', dest='guess_types',
                        action='store_false',
                        help='''Disable type-guessing (where it is used e.g. with CSVs). Type guessing may significantly affect performance''',
                        default=True)
    parser.add_argument('--sheet', metavar='NUM',
                        help='''Index of sheet in spreadsheet to convert (index starts at 1)''',
                        default=1)
    parser.add_argument('--records', metavar='NUM',
                        help='''Only convert a maximum of NUM records''')

    args = parser.parse_args()
    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        # NOTE(review): the help text lists tsv input, but only a guessed
        # type of 'text/csv' reaches the CSV parser here - confirm what
        # guess_type returns for .tsv paths.
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer before touching the output path so an unsupported
        # output type does not create or truncate a file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('Only support writing to csv and json at present')
            return

        if args.outpath.startswith('_.'):
            write(sys.stdout, records, metadata)
        else:
            with open(args.outpath, 'w') as outstream:
                write(outstream, records, metadata)
    finally:
        # The input is closed only after writing has finished, in case the
        # records are read from the stream lazily.
        instream.close()
コード例 #15
0
def main():
    """Command-line entry point: convert ``inpath`` to ``outpath``.

    Parses the command line, guesses the input and output types from the two
    paths, parses the input (local file or URL) and writes the records in
    the output format.  An output path starting with ``'_.'`` writes to
    standard output.  An unsupported output type only prints a message.

    All records are converted unless ``--records NUM`` is given.

    Raises:
        ValueError: if the guessed input type is not handled.
    """
    description = '''Convert data between formats. Supported formats:

    Input:  csv, tsv, excel (xls, xlsx).
    Output: csv, json

Examples
========

dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv

Help
====
'''
    epilog = '''Copyright Open Knowledge Foundation 2007-2013. Licensed under the MIT license.
Part of the DataConverters project: https://github.com/okfn/dataconverters'''
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('inpath', metavar='inpath', type=str,
                        help='in file path or url')
    parser.add_argument('outpath', metavar='outpath', type=str,
                        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument('--no-guess-types', dest='guess_types',
                        action='store_false',
                        help='''Disable type-guessing (where it is used e.g. with CSVs). Type guessing may significantly affect performance''',
                        default=True)
    parser.add_argument('--sheet', metavar='NUM',
                        help='''Index of sheet in spreadsheet to convert (index starts at 1)''',
                        default=1)
    # No default: a default of 1 would silently truncate every conversion
    # to a single record when the option is not given.
    parser.add_argument('--records', metavar='NUM',
                        help='''Only convert a maximum of NUM records''')

    args = parser.parse_args()
    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        # NOTE(review): the help text lists tsv input, but only a guessed
        # type of 'text/csv' reaches the CSV parser here - confirm what
        # guess_type returns for .tsv paths.
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        if args.records:
            records = itertools.islice(records, int(args.records))

        # Pick the writer before touching the output path so an unsupported
        # output type does not create or truncate a file.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            # Parenthesised form prints the same text on Python 2 and 3.
            print('Only support writing to csv and json at present')
            return

        if args.outpath.startswith('_.'):
            write(sys.stdout, records, metadata)
        else:
            with open(args.outpath, 'w') as outstream:
                write(outstream, records, metadata)
    finally:
        # The input is closed only after writing has finished, in case the
        # records are read from the stream lazily.
        instream.close()