def test_teecsv_unicode():
    t1 = ((u"name", u"id"),
          (u"Արամ Խաչատրյան", 1),
          (u"Johann Strauß", 2),
          (u"Вагиф Сәмәдоғлу", 3),
          (u"章子怡", 4))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl.wrap(t1)
        .teecsv(f1.name, encoding="utf-8")
        .selectgt("id", 1)
        .tocsv(f2.name, encoding="utf-8"))
    ieq(t1,
        etl.fromcsv(f1.name, encoding="utf-8").convertnumbers())
    ieq(etl.wrap(t1).selectgt("id", 1),
        etl.fromcsv(f2.name, encoding="utf-8").convertnumbers())
def test_teecsv():
    t1 = (("foo", "bar"),
          ("a", 2),
          ("b", 1),
          ("c", 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl.wrap(t1)
        .teecsv(f1.name, encoding="ascii")
        .selectgt("bar", 1)
        .tocsv(f2.name, encoding="ascii"))
    ieq(t1,
        etl.fromcsv(f1.name, encoding="ascii").convertnumbers())
    ieq(etl.wrap(t1).selectgt("bar", 1),
        etl.fromcsv(f2.name, encoding="ascii").convertnumbers())
def test_fromcsv_cachetag():
    """Test the cachetag method on tables returned by fromcsv."""

    # initial data
    f = NamedTemporaryFile(delete=False)
    writer = csv.writer(f)
    table = (("foo", "bar"),
             ("a", 1),
             ("b", 2),
             ("c", 2))
    for row in table:
        writer.writerow(row)
    f.close()

    # cachetag with initial data
    tbl = fromcsv(f.name)
    tag1 = tbl.cachetag()

    # make a change
    with open(f.name, "wb") as o:
        writer = csv.writer(o)
        rows = (("foo", "bar"),
                ("d", 3),
                # ("e", 5),
                ("f", 4))
        for row in rows:
            writer.writerow(row)

    # check cachetag has changed
    tag2 = tbl.cachetag()
    assert tag2 != tag1, (tag2, tag1)
def test_stringsource():
    tbl1 = (('foo', 'bar'),
            ('a', '1'),
            ('b', '2'),
            ('c', '2'))

    # test writing to a string buffer
    ss = StringSource()
    etl.tocsv(tbl1, ss)
    expect = "foo,bar\r\na,1\r\nb,2\r\nc,2\r\n"
    if not PY2:
        expect = expect.encode('ascii')
    actual = ss.getvalue()
    eq_(expect, actual)

    # test reading from a string buffer
    tbl2 = etl.fromcsv(StringSource(actual))
    ieq(tbl1, tbl2)
    ieq(tbl1, tbl2)  # verify can iterate twice

    # test appending
    etl.appendcsv(tbl1, ss)
    actual = ss.getvalue()
    expect = "foo,bar\r\na,1\r\nb,2\r\nc,2\r\na,1\r\nb,2\r\nc,2\r\n"
    if not PY2:
        expect = expect.encode('ascii')
    eq_(expect, actual)
def check_bx_book_ratings(fn):
    """Check the consistency of the BX-Book-Ratings.csv file."""
    tab = etl.fromcsv(fn, delimiter=DELIMITER, quoting=QUOTE_ALL,
                      encoding=ENCODING)
    v = Validator({
        "User-ID": {"type": "string", "required": True},
        "ISBN": {"type": "string", "required": True},
        "Book-Rating": {"type": "string", "validator": validate_rating,
                        "required": True},
    })
    for row_num, r in enumerate(tab.dicts(), 1):
        is_valid = v.validate(r)
        if not is_valid:
            print "row %d -> %r, %r" % (row_num, v.errors, r)
    return
def test_fromcsv_gz():
    """Test the fromcsv function on a gzipped file."""
    f = NamedTemporaryFile(delete=False)
    f.close()
    fn = f.name + '.gz'
    os.rename(f.name, fn)
    fz = gzip.open(fn, 'wb')
    writer = csv.writer(fz, delimiter='\t')
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        writer.writerow(row)
    fz.close()
    actual = fromcsv(fn, delimiter='\t')
    expect = (('foo', 'bar'),
              ('a', '1'),
              ('b', '2'),
              ('c', '2'))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def test_fromcsv_cachetag_strict():
    """Test the cachetag method on tables returned by fromcsv."""

    # initial data
    f = NamedTemporaryFile(delete=False)
    writer = csv.writer(f)
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        writer.writerow(row)
    f.close()

    # cachetag with initial data
    tbl = fromcsv(FileSource(f.name, checksumfun=adler32sum))
    tag1 = tbl.cachetag()

    # make a change, preserving file size
    with open(f.name, 'wb') as o:
        writer = csv.writer(o)
        rows = (('foo', 'bar'),
                ('d', 3),
                ('e', 5),
                ('f', 4))
        for row in rows:
            writer.writerow(row)

    # check cachetag has changed
    tag2 = tbl.cachetag()
    assert tag2 != tag1, (tag2, tag1)
def check_bx_users(fn):
    """Check the consistency of the BX-Users.csv file."""
    tab = etl.fromcsv(fn, delimiter=DELIMITER, quoting=QUOTE_ALL,
                      encoding=ENCODING)
    v = Validator({
        "User-ID": {"type": "string", "regex": USER_ID_PAT, "required": True},
        "Location": {"type": "string", "required": True},
        "Age": {"type": "string", "validator": validate_age, "required": True},
    })
    for row_num, r in enumerate(tab.dicts(), 1):
        is_valid = v.validate(r)
        if not is_valid:
            print "row %d -> %r, %r" % (row_num, v.errors, r)
    return
def test_popensource():
    expect = (('foo', 'bar'),)
    delimiter = ' '
    actual = etl.fromcsv(PopenSource(r'echo foo bar', shell=True),
                         delimiter=delimiter)
    ieq(expect, actual)
def test_teecsv_write_header():
    t1 = (("foo", "bar"),
          ("a", "2"),
          ("b", "1"),
          ("c", "3"))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl.wrap(t1)
        .convertnumbers()
        .teecsv(f1.name, write_header=False, encoding="ascii")
        .selectgt("bar", 1)
        .tocsv(f2.name, encoding="ascii"))
    ieq(t1[1:], etl.fromcsv(f1.name, encoding="ascii"))
    ieq(etl.wrap(t1).convertnumbers().selectgt("bar", 1),
        etl.fromcsv(f2.name, encoding="ascii").convertnumbers())
def really_read_filelines(filename, p_train_data, split_mode):
    # Load the table
    csvfile = fromcsv(filename)
    train_data, test_data = split_dataset(csvfile, p_train_data, split_mode)
    return train_data, test_data
def from_csv(source=None, encoding=None, errors='strict', description=None,
             **csvargs):
    table = etl.fromcsv(source, encoding, errors, **csvargs)
    dataset = Dataset()
    dataset.header = table.header
    dataset.rows = table.records
    # FIXME: configure name, etc.
    return dataset
def check_bx_books(fn):
    """Check the consistency of the BX-Books.csv file."""
    tab = etl.fromcsv(fn, delimiter=DELIMITER, quoting=QUOTE_ALL,
                      encoding=ENCODING)
    v = Validator({
        "ISBN": {"type": "string", "validator": validate_not_null_str,
                 "required": True},
        "Book-Title": {"type": "string", "validator": validate_not_null_str,
                       "required": True},
        "Book-Author": {"type": "string", "validator": validate_not_null_str,
                        "required": True},
        "Year-Of-Publication": {"type": "string", "regex": YEAR_PAT,
                                "required": True},
        "Publisher": {"type": "string", "validator": validate_not_null_str,
                      "required": True},
        "Image-URL-S": {"type": "string", "regex": URL_PAT, "required": True},
        "Image-URL-M": {"type": "string", "regex": URL_PAT, "required": True},
        "Image-URL-L": {"type": "string", "regex": URL_PAT, "required": True},
    })
    for row_num, r in enumerate(tab.dicts(), 1):
        is_valid = v.validate(r)
        if not is_valid:
            print "row %d -> %r, %r" % (row_num, v.errors, r)
    return
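# The custom callables referenced above (validate_rating, validate_age,
# validate_not_null_str) are defined elsewhere in the source. Hypothetical
# sketches of their likely shape, using the legacy Cerberus validator
# signature f(field, value, error); the value ranges are assumptions based on
# the Book-Crossing dataset, not taken from the source:
def validate_not_null_str(field, value, error):
    if not value or not value.strip():
        error(field, "must be a non-empty string")

def validate_rating(field, value, error):
    if not (value.isdigit() and 0 <= int(value) <= 10):
        error(field, "must be an integer between 0 and 10")

def validate_age(field, value, error):
    if value != "NULL" and not value.isdigit():
        error(field, "must be an integer or the literal string 'NULL'")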
def create_config(csvfile, config_name):
    '''Creates a configuration file from a CSV file.'''
    print csvfile
    # only prompt for confirmation if the .ini file already exists;
    # otherwise proceed and create it
    var = 'Yes'
    try:
        open(config_name + ".ini")
        var = raw_input("This file already exists. Do you wish to continue? (Yes/No) ")
    except IOError:
        pass
    if var == 'Yes':
        cfgfile = open(config_name + ".ini", "w")
        examplefile = open(config_name + ".example", "w")
    else:
        print "goodbye"
        sys.exit()

    c = fromcsv(csvfile)
    columns = header(c)
    it = iterdata(c)
    print it.next()
    examplefile.write(str(see(rowslice(c, 2, 3))))
    examplefile.close()

    # add the settings to the structure of the file, and let's write it out...
    Config = ConfigParser.ConfigParser()
    # don't change names to lower case
    Config.optionxform = str
    Config.add_section('FieldTypes')
    Config.add_section('FieldMap')
    for name in columns:
        #Config.set('FieldTypes', c)
        #print name
        new = name
        new = new.split("(", 1)[0].strip()
        # connect words with underscores
        new = new.replace("/", "_")
        new = new.replace(" ", "_")
        new = new.replace("-", "_")
        new = new.lower()
        # try to guess some of the column types from the names
        if "amount" in name:
            print name
            Config.set('FieldMap', name, new + " FLOAT")
        else:
            print name
            Config.set('FieldMap', name, new + " VARCHAR(10)")
    Config.write(cfgfile)
    cfgfile.close()
def test_memorysource_2():
    data = 'foo,bar\r\na,1\r\nb,2\r\nc,2\r\n'
    if not PY2:
        data = data.encode('ascii')
    actual = etl.fromcsv(MemorySource(data))
    expect = (('foo', 'bar'),
              ('a', '1'),
              ('b', '2'),
              ('c', '2'))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def main():
    hours = load_pickles('disneyland-hours.pickle')
    events = load_pickles('disneyland-events.pickle')
    passes = load_pickles('disneyland-passes.pickle')
    forecast = get_forecast()

    spreadsheet = petl.fromcsv(
        'https://spreadsheets.google.com/tq?key=%s&gid=0&tqx=out:csv'
        % DISNEY_SPREADSHEET_KEY)

    events_lookup = {}
    for item in events:
        buff = events_lookup.get(item['date'].date(), {})
        buff[item['park']] = item
        buff['date'] = item['date'].date()
        events_lookup[item['date'].date()] = buff

    for item in spreadsheet.dicts():
        # print item
        sheet_date = du_parse(item['date']).date()
        if events_lookup.has_key(sheet_date):
            e = events_lookup[sheet_date]
            e['disneyland']['crowd_level'] = item['disneyland_crowd_level']
            e['disneyland']['closures'] = [
                x for x in map(unicode.strip,
                               item['disneyland_closures'].split(',')) if x]
            e['disney-california-adventure']['crowd_level'] = \
                item['california_adventure_crowd_level']
            e['disney-california-adventure']['closures'] = [
                x for x in map(unicode.strip,
                               item['california_adventure_closures'].split(',')) if x]

    for item in hours:
        if events_lookup.has_key(item['date'].date()):
            events_lookup[item['date'].date()][item['park']]['hours'] = item

    for item in passes:
        # print item
        if events_lookup.has_key(item['date']):
            events_lookup[item['date']]['passes'] = item

    for date, item in forecast.items():
        if events_lookup.has_key(date.date()):
            events_lookup[date.date()]['forecast'] = item

    f = StringIO()
    from pprint import pprint
    pprint(events_lookup)
    pickle.dump(sorted(events_lookup.values(), key=lambda x: x['date']), f)
    s3_save(f, 'disneyland-merged_data.pickle')
    f.seek(0)
    s3_save(f, 'disney-california-adventure-merged_data.pickle')
def _load_db(self, taiwan_date_str=None, is_delete=False):
    if not (taiwan_date_str is None):
        self.set_trade_date(taiwan_date_str)
    raw_file = self.source_file
    tse = etl.fromcsv(raw_file)
    connection = self._get_connection()  # psycopg2.connect(_CONNECTION)
    if is_delete:
        self.clean_db(self._trade_date)
    # assuming table "quotes" already exists in the database; tse needs to
    # include the header row. Signature for reference:
    # petl.io.db.todb(table, dbo, tablename, schema=None, commit=True,
    #                 create=False, drop=False, constraints=True,
    #                 metadata=None, dialect=None, sample=1000)
    etl.todb(tse, connection, 'quotes', drop=False)  # , truncate=False
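# For context, a minimal self-contained sketch of the same etl.todb pattern
# against SQLite (the table name and columns here are hypothetical); todb
# appends into an existing table unless asked to create or drop it:
import sqlite3
import petl as etl

conn = sqlite3.connect(':memory:')
# todb appends into an existing table, so create it first
conn.execute('CREATE TABLE quotes (symbol TEXT, price REAL)')
tbl = [('symbol', 'price'), ('2330', 598.0), ('2317', 104.5)]
etl.todb(tbl, conn, 'quotes')  # the header row is required, as noted above
print(conn.execute('SELECT * FROM quotes').fetchall())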
def main(argv):
    parser = argparse.ArgumentParser(description='Dumps CSV-formatted ID-related info for every individual profile '
                                                 'in CCB. Pipe into a file to create CSV file.')
    parser.add_argument("--individuals-filename", required=True,
                        help="Input UTF8 CSV with individuals data dumped from Servant Keeper")
    parser.add_argument('--trace', action='store_true',
                        help="If specified, prints tracing/progress messages to stdout")
    args = parser.parse_args()
    assert os.path.isfile(args.individuals_filename), \
        "Error: cannot open file '" + args.individuals_filename + "'"

    trace('RETRIEVING INDIVIDUALS XML FROM CCB...', args.trace, banner=True)
    tmp_filename = http_get2tmp_file('https://ingomar.ccbchurch.com/api.php?srv=individual_profiles',
                                     settings.ccbapi.username, settings.ccbapi.password)
    xml_tree = ET.parse(tmp_filename)
    xml_root = xml_tree.getroot()

    trace('WALKING XML TO CREATE SK2CCB ID MAP DICTIONARY...', args.trace, banner=True)
    sk2ccb_id_map_dict = xml2id_dict(xml_root)
    os.remove(tmp_filename)

    trace('WALKING SK DATA TO IDENTIFY GROUPS TO SET ON PER-INDIVIDUAL BASIS...', args.trace, banner=True)
    table = petl.fromcsv(args.individuals_filename)
    sk_indiv_id2groups = gather_semi_sep_by_indiv_id(table, {
        'Mailing Lists': ['Home Touch', 'Rummage Sale'],
        'Activities': ['Veteran', 'Celebration Singers', 'Wesleyan Choir', 'Praise Team']})
    sk_indiv_id2name = gather_name_by_indiv_id(table)

    trace('RETRIEVING GROUPS XML FROM CCB...', args.trace, banner=True)
    tmp_filename = http_get2tmp_file('https://ingomar.ccbchurch.com/api.php?srv=group_profiles',
                                     settings.ccbapi.username, settings.ccbapi.password)
    xml_tree = ET.parse(tmp_filename)
    xml_root = xml_tree.getroot()
    group_id_map_dict = xml2group_id_dict(xml_root)
    os.remove(tmp_filename)

    for sk_indiv_id in sk_indiv_id2groups:
        if sk_indiv_id in sk2ccb_id_map_dict:
            for group_name in sk_indiv_id2groups[sk_indiv_id]:
                if not group_name in group_id_map_dict:
                    print "*** Cannot find CCB group name '" + group_name + "' in CCB account."
                print "Adding " + sk_indiv_id2name[sk_indiv_id] + " (Individual ID = " + \
                    sk2ccb_id_map_dict[sk_indiv_id] + ") to group '" + group_name + "' (Group ID = " + \
                    group_id_map_dict[group_name] + ")."
                add_person_to_group(sk2ccb_id_map_dict[sk_indiv_id], group_id_map_dict[group_name])
        else:
            groups_trying_to_add = ', '.join(sk_indiv_id2groups[sk_indiv_id])
            print "*** No SK->CCB ID map for '" + str(sk_indiv_id) + "' (" + sk_indiv_id2name[sk_indiv_id] + "), " + \
                "so person not added to " + groups_trying_to_add + "."

    sys.stdout.flush()
    sys.stderr.flush()
def test_gzipsource():
    # setup
    tbl = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    fn = NamedTemporaryFile().name + '.gz'
    expect = b"foo,bar\na,1\nb,2\n"

    # write explicit
    etl.tocsv(tbl, GzipSource(fn), lineterminator='\n')
    actual = gzip.open(fn).read()
    eq_(expect, actual)

    # write implicit
    etl.tocsv(tbl, fn, lineterminator='\n')
    actual = gzip.open(fn).read()
    eq_(expect, actual)

    # read explicit
    tbl2 = etl.fromcsv(GzipSource(fn))
    ieq(tbl, tbl2)

    # read implicit
    tbl2 = etl.fromcsv(fn)
    ieq(tbl, tbl2)
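# For reference, petl picks the source from the filename extension, which is
# what the "implicit" cases above exercise; a short sketch (the file name is
# hypothetical):
import petl as etl

tbl = [('foo', 'bar'), ('a', '1')]
etl.tocsv(tbl, 'data.csv.gz', lineterminator='\n')  # gzip inferred from '.gz'
print(etl.fromcsv('data.csv.gz').nrows())           # reads back transparently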
def xmltodict(xmlFile):
    parser = etree.XMLParser(remove_blank_text=True)
    with open(xmlFile) as f:
        xml = f.read()
    root = objectify.fromstring(xml)
    print type(root)
    d = dict((e.tag, e.text) for e in root['iati-activity'].iterchildren())
    pprint(d)
    # have a look at the format generated with the CSV tool; note that fromcsv
    # expects a file name (or source), not the file contents
    k = fromcsv(datadir + "iati_transformations/iati_download_20131107_simple_aa-activity.csv")
    sys.exit()
def test_issue_231():
    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')

    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)

    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)

    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)
def test_fromcsv():
    """Test the fromcsv function."""
    f = NamedTemporaryFile(delete=False)
    writer = csv.writer(f, delimiter="\t")
    table = (("foo", "bar"),
             ("a", 1),
             ("b", 2),
             ("c", 2))
    for row in table:
        writer.writerow(row)
    f.close()
    actual = fromcsv(f.name, delimiter="\t")
    expect = (("foo", "bar"),
              ("a", "1"),
              ("b", "2"),
              ("c", "2"))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def test_staticmethods():
    data = [b'foo,bar', b'a,1', b'b,2', b'c,2']
    f = NamedTemporaryFile(mode='wb', delete=False)
    f.write(b'\n'.join(data))
    f.close()
    expect = (('foo', 'bar'),
              ('a', '1'),
              ('b', '2'),
              ('c', '2'))
    actual = etl.fromcsv(f.name, encoding='ascii')
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def main(argv, version, org_data_path='data'):
    args = parse_args(argv, version)
    input_fields = InputFields.from_args(args)
    output_fields = OutputFields.from_args(args)
    input = petl.fromcsv(args.input_csv, encoding='utf-8', errors='strict')
    parser = Parser()
    parser.read_csv('data/settlements.csv', report_conflicts=False)
    matches = find_matches(
        input, input_fields, output_fields,
        index_data=org_data_path,
        parse=parser.parse,
        extramatches=args.extramatches,
        differentiating_ambiguity=args.differentiating_ambiguity)
    if args.progress:
        matches = matches.progress()
    matches.tocsv(args.output_csv, encoding='utf-8')
def main(argv): parser = argparse.ArgumentParser() parser.add_argument("--pledges-filename", required=True, help="Input UTF8 CSV with pledges data " "dumped from Servant Keeper") parser.add_argument("--output-filename", required=True, help="Output CSV filename which will be loaded with " "pledges data in CCB import format ") parser.add_argument('--trace', action='store_true', help="If specified, prints tracing/progress messages to " "stdout") args = parser.parse_args() assert os.path.isfile(args.pledges_filename), "Error: cannot open file '" + args.pledges_filename + "'" table = petl.fromcsv(args.pledges_filename) table = petl.rename(table, { 'Frequency': 'SK Frequency', 'Individual ID': 'SK Individual ID' }) table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID']) table = petl.addfield(table, 'Campus', '') table = petl.addfield(table, 'Category Pledged To', lambda rec: rec['Account']) table = petl.addfield(table, 'Amount Pledged', convert_amount) table = petl.addfield(table, 'Total Amount Pledged', lambda rec: rec['Pledged']) table = petl.addfield(table, 'Frequency', lambda rec: {1:'Yearly', 4:'Yearly', 12:'Monthly', 52:'Weekly', 24:'Monthly', 2:'Yearly'}[int(rec['Payments'])]) table = petl.addfield(table, 'Number of Gifts', lambda rec: {'Yearly':1, 'Monthly':12, 'Weekly':52}[rec['Frequency']]) table = petl.addfield(table, 'Length Multiplier', lambda rec: {'Yearly':'Years', 'Monthly':'Months', 'Weekly':'Weeks'}[rec['Frequency']]) table = petl.addfield(table, 'Start Date', lambda rec: {'Operating Income':'2013-01-01', 'Mortgage Principal':'2013-01-01', 'Operating Income 2015':'2015-01-01'}[rec['Account']]) table = petl.addfield(table, 'End Date', lambda rec: {'Operating Income':'2013-12-31', 'Mortgage Principal':'2013-12-31', 'Operating Income 2015':'2015-12-31'}[rec['Account']]) trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True) table.progress(200).tocsv(args.output_filename) trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True) trace('DONE!', args.trace, banner=True)
def test_teetext():
    t1 = (("foo", "bar"),
          ("a", 2),
          ("b", 1),
          ("c", 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    prologue = "foo,bar\n"
    template = "{foo},{bar}\n"
    epilogue = "d,4"
    (etl.wrap(t1)
        .teetext(f1.name, template=template, prologue=prologue,
                 epilogue=epilogue)
        .selectgt("bar", 1)
        .topickle(f2.name))
    ieq(t1 + (("d", 4),), etl.fromcsv(f1.name).convertnumbers())
    ieq(etl.wrap(t1).selectgt("bar", 1), etl.frompickle(f2.name))
def test_teetext_unicode():
    t1 = ((u"foo", u"bar"),
          (u"Արամ Խաչատրյան", 2),
          (u"Johann Strauß", 1),
          (u"Вагиф Сәмәдоғлу", 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    prologue = u"foo,bar\n"
    template = u"{foo},{bar}\n"
    epilogue = u"章子怡,4"
    (etl.wrap(t1)
        .teetext(f1.name, template=template, prologue=prologue,
                 epilogue=epilogue, encoding="utf-8")
        .selectgt("bar", 1)
        .topickle(f2.name))
    ieq(t1 + ((u"章子怡", 4),),
        etl.fromcsv(f1.name, encoding="utf-8").convertnumbers())
    ieq(etl.wrap(t1).selectgt("bar", 1), etl.frompickle(f2.name))
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-ccb-csv-filename", required=True,
                        help="Input CCB CSV loading file to validate")
    parser.add_argument("--output-validation-csv-filename", required=True,
                        help="Output CSV file that'll be created with validation results")
    args = parser.parse_args()

    table = petl.fromcsv(args.input_ccb_csv_filename)
    constraints = [
        {'name': 'max_len_20', 'field': 'Legal Name', 'assertion': max_len(20)},
        {'name': 'max_len_30', 'field': 'How They Heard', 'assertion': max_len(30)},
        {'name': 'max_len_20', 'field': 'Last Name', 'assertion': max_len(20)},
        {'name': 'max_len_100', 'field': 'Mailbox Number', 'assertion': max_len(100)},
        {'name': 'max_len_20', 'field': 'Middle Name', 'assertion': max_len(20)},
        {'name': 'max_len_30', 'field': 'Job Title', 'assertion': max_len(30)},
        {'name': 'max_len_20', 'field': 'First Name', 'assertion': max_len(20)},
        {'name': 'max_len_30', 'field': 'School', 'assertion': max_len(30)},
        {'name': 'max_len_semisep_30', 'field': 'Abilities/Skills', 'assertion': max_len_semisep(30)},
        {'name': 'max_len_semisep_30', 'field': 'Spiritual Gifts', 'assertion': max_len_semisep(30)},
        {'name': 'max_len_semisep_30', 'field': 'Passions', 'assertion': max_len_semisep(30)},
        {'name': 'max_len_100', 'field': 'Transferred Frm', 'assertion': max_len(100)},
        {'name': 'max_len_100', 'field': 'Transferred To', 'assertion': max_len(100)},
        {'name': 'max_len_30', 'field': 'How They Joined', 'assertion': max_len(30)},
        {'name': 'max_len_30', 'field': 'Membership Type', 'assertion': max_len(30)},
        {'name': 'max_len_30', 'field': 'Reason Left Church', 'assertion': max_len(30)},
        {'name': 'max_len_100', 'field': 'Pastr When Join', 'assertion': max_len(100)},
        {'name': 'max_len_100', 'field': 'Pastr When Leav', 'assertion': max_len(100)},
    ]
    validation_table = petl.validate(table, constraints=constraints)
    validation_table.progress(200).tocsv(args.output_validation_csv_filename)
    print "See output file '" + args.output_validation_csv_filename + "' for results"

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()
import petl as etl
import csv

# import the data from the CSV file and convert the column types
table1 = (etl.fromcsv('owid-covid-data.csv')
          .convert('iso_code', 'upper')
          .convert('location', str)
          .convert('date', str)
          .convert('total_cases', int)
          .convert('new_cases', int)
          .convert('total_deaths', int)
          .convert('new_deaths', int)
          .convert('total_cases_per_million', float)
          .convert('new_cases_per_million', float)
          .convert('total_deaths_per_million', float)
          .convert('new_deaths_per_million', float)
          .convert('total_tests', int)
          .convert('new_tests', int)
          .convert('total_tests_per_thousand', float)
          .convert('new_tests_per_thousand', float)
          .convert('tests_units', str))

# declare a list whose first element is the header row;
# the remaining rows will be appended by the for loop below
data1 = [[
    'iso_code', 'location', 'date', 'total_cases', 'new_cases',
    'total_deaths', 'new_deaths', 'total_cases_per_million',
    'new_cases_per_million', 'total_deaths_per_million',
    'new_deaths_per_million', 'total_tests', 'new_tests',
    'total_tests_per_thousand', 'new_tests_per_thousand', 'tests_units'
]]

# cut selects the named columns from the table; it is not strictly necessary
# here, because the fields listed below are all of table1's fields
data = etl.cut(table1, 'iso_code', 'location', 'date', 'total_cases',
               'new_cases', 'total_deaths', 'new_deaths',
               'total_cases_per_million', 'new_cases_per_million',
               'total_deaths_per_million', 'new_deaths_per_million',
               'total_tests', 'new_tests', 'total_tests_per_thousand',
               'new_tests_per_thousand', 'tests_units')
def home():
    data_table = petl.fromcsv(SERVICES_FILE)
    data = [("Business & Trade", 1),
            ("Life Events", 2),
            ("Legal Information & Services", 3),
            ("Employment & Jobs", 4),
            ("Health & Wellbeing", 5)]
    return bottle.template('office_map', data=data)
def Average(lst):
    return sum(lst) / len(lst)


if __name__ == "__main__":
    try:
        DATE = sys.argv[1]
    except:
        print("ERROR! Use con s2_aggregate DATE")
        sys.exit(1)

    # print("Algunos agregados de los resultados, para fecha: ", DATE)
    t1 = (etl.fromcsv(f"var/s1_invisible_prefixes-{DATE}.csv", delimiter="|")
          .convert('visible', int)
          .convert('dark', int)
          .convert('total', int))
    # print(t1.look())

    print("Algunos agregados de los resultados, para fecha: ", DATE)
    print("Total de espacio de todo el pool",
          sum([x[4] for x in list(t1)][1:]))
    print(" - ")

    # number of completely invisible allocations
    n1 = etl.select(t1, lambda r: r['dark'] == r['total'])
    print("numero de asignaciones completamente invisibles", n1.nrows())
    print("total de ips en asignaciones completamente invisibles",
          sum([x[4] for x in list(n1)][1:]))
    print("tamano promedio de asignaciones completamente invisibles",
          Average([x[4] for x in list(n1)][1:]))
"""Return a list of valid NUTS codes.""" with open(GEOCODES_FILE) as stream: lines = csv.DictReader(stream) geocodes = [] for i, line in enumerate(lines): # The first line has an empty NUTS-code if i > 0: geocode = line['NUTS-Code'] geocodes.append(geocode) logging.debug('Loaded %d NUTS geocodes', len(geocodes)) return tuple(geocodes) GEOCODES = list(dicts(fromcsv(GEOCODES_FILE))) def get_all_codelists(): """Return all codelists as a dictionary of dictionaries.""" codelists = {} for codelist_file in os.listdir(CODELISTS_DIR): codelist_name, _ = os.path.splitext(codelist_file) codelist = get_codelist(codelist_name) codelists.update({codelist_name: codelist}) return codelists
def build_cn_raster(landcover_raster, lookup_csv, soils_polygon,
                    soils_hydrogroup_field="SOIL_HYDRO",
                    reference_raster=None, out_cn_raster=None):
    """Build a curve number raster from a landcover raster, a soils polygon,
    and a crosswalk between landcover classes, soil hydro groups, and curve
    numbers.

    :param lookup_csv: crosswalk CSV between landcover class, soil hydro
        group, and curve number
    :param landcover_raster: landcover raster (path or Raster object)
    :param soils_polygon: polygon containing soils with a hydro classification
    :param soils_hydrogroup_field: field holding the hydro group, defaults to
        "SOIL_HYDRO" (from the NRCS soils dataset)
    :param out_cn_raster: path for the output curve number raster
    """
    # GP Environment ----------------------------
    msg("Setting up GP Environment...")
    # if reference_raster is provided, we use it to set the GP environment for
    # subsequent raster operations
    if reference_raster:
        if not isinstance(reference_raster, Raster):
            # read in the reference raster as a Raster object
            reference_raster = Raster(reference_raster)
    else:
        reference_raster = Raster(landcover_raster)

    # set the snap raster, cell size, extent, and coordinate system for
    # subsequent operations
    env.snapRaster = reference_raster
    env.cellSize = reference_raster.meanCellWidth
    env.extent = reference_raster
    env.outputCoordinateSystem = reference_raster
    cs = env.outputCoordinateSystem.exportToString()

    # SOILS -------------------------------------
    msg("Processing Soils...")
    # rasterize the soils polygon
    soils_raster_path = so("soils_raster")
    PolygonToRaster_conversion(soils_polygon, soils_hydrogroup_field,
                               soils_raster_path, "CELL_CENTER")
    soils_raster = Raster(soils_raster_path)

    # use the raster attribute table to build a lookup of raster values to
    # soil hydro codes from the polygon (stored in the RAT after conversion)
    if not soils_raster.hasRAT:
        msg("Soils raster does not have an attribute table. Building...",
            "warning")
        BuildRasterAttributeTable_management(soils_raster, "Overwrite")

    # build a 2D array from the RAT
    fields = ["Value", soils_hydrogroup_field]
    rows = [fields]
    # soils_raster_table = MakeTableView_management(soils_raster_path)
    with SearchCursor(soils_raster_path, fields) as sc:
        for row in sc:
            rows.append([row[0], row[1]])

    # turn that into a dictionary, where key == the soil hydro text and
    # value == the raster cell value
    lookup_from_soils = {v: k for k, v in etl.records(rows)}
    # also capture a list of just the values, used to iterate conditionals later
    soil_values = [v['Value'] for v in etl.records(rows)]

    # LANDCOVER ---------------------------------
    msg("Processing Landcover...")
    if not isinstance(landcover_raster, Raster):
        # read in the landcover raster as a Raster object
        landcover_raster_obj = Raster(landcover_raster)
    landcover_values = []
    with SearchCursor(landcover_raster, ["Value"]) as sc:
        for row in sc:
            landcover_values.append(row[0])

    # LOOKUP TABLE ------------------------------
    msg("Processing Lookup Table...")
    # read the lookup csv, clean it up, and use the lookups from above to
    # limit it to just those values in the rasters
    t = (etl.fromcsv(lookup_csv)
         .convert('utc', int)
         .convert('cn', int)
         .select('soil', lambda v: v in lookup_from_soils)
         .convert('soil', lookup_from_soils)
         .select('utc', lambda v: v in landcover_values))
    # this gets us a table where the landcover class (as a number) corresponds
    # to the correct value in the converted soil raster, with the matching
    # curve number

    # DETERMINE CURVE NUMBERS -------------------
    msg("Assigning Curve Numbers...")
    # use that to reassign cell values via conditional map algebra operations
    cn_rasters = []
    for rec in etl.records(t):
        cn_raster_component = Con(
            (landcover_raster_obj == rec.utc) & (soils_raster == rec.soil),
            rec.cn, 0)
        cn_rasters.append(cn_raster_component)
    cn_raster = CellStatistics(cn_rasters, "MAXIMUM")

    # REPROJECT THE RESULTS ---------------------
    msg("Reprojecting and saving the results....")
    if not out_cn_raster:
        out_cn_raster = so("cn_raster", "random", "in_memory")
    ProjectRaster_management(in_raster=cn_raster, out_raster=out_cn_raster,
                             out_coor_system=cs, resampling_type="NEAREST",
                             cell_size=env.cellSize)
    # cn_raster.save(out_cn_raster)
    return out_cn_raster
look(table)
# a table and field name can also be provided as arguments
look(table1)
table2 = unflatten(table1, 'lines', 3)
look(table2)

# tocsv
table = [['foo', 'bar'],
         ['a', 1],
         ['b', 2],
         ['c', 2]]
from petl import tocsv, look
look(table)
tocsv(table, 'test.csv', delimiter='\t')
# look what it did
from petl import fromcsv
look(fromcsv('test.csv', delimiter='\t'))

# appendcsv
table = [['foo', 'bar'],
         ['d', 7],
         ['e', 42],
         ['f', 12]]
# look at an existing CSV file
from petl import look, fromcsv
testcsv = fromcsv('test.csv', delimiter='\t')
look(testcsv)
# append some data
look(table)
from petl import appendcsv
appendcsv(table, 'test.csv', delimiter='\t')
# look what it did
look(testcsv)
def extract_staging_csv(dataframe):
    """Load processed staging CSV data into a petl table."""
    table = etl.fromcsv(dataframe, encoding='utf8')
    return table
def main(argv): parser = argparse.ArgumentParser() parser.add_argument("--input-csv-filename", required=True, help="Input UTF8 CSV to summarize") parser.add_argument( "--sep-columns", required=False, nargs='*', default=argparse.SUPPRESS, help= "Column names of columns containing comma- or semi-colon-separated values" ) parser.add_argument("--sep-character", required=False, help="Character used to separate values in multi-value " \ "fields. Defaults to ';' if not specified.") parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS, help="Column names to NOT generate stats for") parser.add_argument("--skip-num-rows", required=False, type=int, help="Skip specified number " "of header rows") parser.add_argument( "--first-ccb-column", required=False, help="String name of first CCB column. If " "specified, all preceeding columns will be labeled 'Servant Keeper' and this column " "and all subsequent will be labeled 'CCB'") args = parser.parse_args() if args.first_ccb_column is not None: column_prefix = 'Servant Keeper ' else: column_prefix = '' assert os.path.isfile( args.input_csv_filename ), "Error: cannot open file '" + args.input_csv_filename + "'" table = petl.fromcsv(args.input_csv_filename) # Skip header rows if args.skip_num_rows: skip_num = args.skip_num_rows assert skip_num > 0, "--skip-num-rows value '" + str( skip_num) + "' is invalid. Must be positive." it = iter(table) while skip_num >= 0: row = next(it) skip_num -= 1 table = petl.setheader(table, row) table = petl.tail(table, petl.nrows(table) - args.skip_num_rows) # Print nicely formatted stats for each column sep = '' args_dict = vars(args) skip_columns_specified = 'skip_columns' in args_dict sep_char_specified = 'sep_character' in args_dict for column in petl.header(table): if args.first_ccb_column is not None and column == args.first_ccb_column: column_prefix = 'CCB ' if not skip_columns_specified or column not in args.skip_columns: output_str = column_prefix + "Column '" + column + "'" print sep + output_str print >> sys.stderr, output_str if args.sep_columns is not None and column in args.sep_columns: if sep_char_specified: sep_character = args.sep_character else: sep_character = ';' output_str = num_dict2str( dict_dump(sep_valuecounter(table, column, sep_character))) print output_str else: output_str = num_dict2str(dict_dump(valuecounts(table, column))) print output_str sep = '\n' # Flush to ensure all output is written sys.stdout.flush() sys.stderr.flush()
def createTableFromCSV(csvFile):
    return etl.fromcsv(csvFile)
# import libraries
import folium
import pandas as pd
from bottle import route, run
from bottle import static_file
import petl as etl

table2 = etl.fromcsv('new.csv')

# Make a data frame with dots to show on the map
data = pd.DataFrame({
    'lat': [-25.274398, 20.593684, 30.375321, 35.86166, 32.427908,
            50.503887, -14.235004, 56.130366, 6.428055, 19.2823,
            37.09024, 33.886917, 47.516231, 33.93911, 38.963745],
    'lon': [133.775136, 78.96288, 69.345116, 104.195397, 53.688046,
            4.469936, -51.92528, -106.346771, -9.429499, 166.6470,
            -95.712891, 9.537499, 14.550072, 67.709953, 35.243322],
    'name': ["Australia", "India", "Pakistan", "China", "Iran", "Belgium",
             "Brazil", "Canada", "Liberia", "United States Virgin Islands",
             "United States", "Tanzania", "Austria", "Afghanistan", "Turkey"],
})
data

# Make an empty map
m = folium.Map(location=[20, 0], zoom_start=3.5)
m = folium.Map(location=[48.85, 2.35], tiles="OpenStreetMap", zoom_start=2)

s = 'Name: ' + data.iloc[0]['name']
# coding:utf8

# introduction
example_data = """foo,bar,baz
a,1,2.3
b,4,5.6
c,7,8.9
"""
with open('example.csv', 'w') as f:
    f.write(example_data)

import petl as etl

table1 = etl.fromcsv('example.csv')
print(table1)
table2 = etl.convert(table1, 'foo', 'upper')
print(table2)
table3 = etl.convert(table2, {'bar': int, 'baz': float})
print(table3)
table4 = etl.addfield(table3, 'bar*baz', lambda row: row.baz * row.bar)
print(table4.look())

# pipeline style (method chaining)
table = (
    etl
    .fromcsv('example.csv')
    .convert({'foo': 'upper', 'bar': int, 'baz': float})
    .addfield('bar*bar_new', lambda r: r.baz * r.bar)
    .convert('baz', lambda v, row: v * float(row.bar), pass_row=True)
)
print(table.look())
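# Continuing in the same chained style, a sketch (the output file name is
# hypothetical) showing a pipeline ending in a sink; petl pipelines are lazy,
# and the tocsv() call is what actually pulls rows through the chain:
(etl
    .fromcsv('example.csv')
    .convert({'bar': int, 'baz': float})
    .select(lambda r: r.bar > 1)    # keep rows with bar > 1
    .sort('baz', reverse=True)      # largest baz first
    .tocsv('filtered.csv'))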
def paypal_to_quickbooks(paypal_path, iif_path=None, unprocessed_path=None,
                         start_date=None, end_date=None):
    """Process the PayPal CSV into a QuickBooks IIF file.

    INPUT: paypal.csv
    OUTPUT: output.iif and unprocessed.csv
    """
    etl.config.look_style = 'minimal'
    if iif_path is None:
        # If no iif path was specified, default to the same folder
        # as the input, and filename = 'output.iif'
        iif_path = os.path.join(os.path.dirname(paypal_path), 'output.iif')
    if unprocessed_path is None:
        # If no path for unprocessed trades was specified, default to
        # the same folder as the input, and filename = 'unprocessed.csv'
        unprocessed_path = os.path.join(os.path.dirname(paypal_path), 'unprocessed.csv')

    # TODO: Validate that the cart_payment is associated with exactly
    # cart_payment.Quantity cart_items.

    # --------------------
    # 1. LOAD PAYPAL CSV FILE
    paypal = etl.fromcsv(paypal_path)
    print("Loaded PayPal input file (" + str(paypal.nrows()) + " rows)")
    paypal = cleanup_paypal(paypal)
    if start_date is not None:
        # Eliminate dates prior to start_date
        paypal = paypal.selectge('Date', start_date)
    if end_date is not None:
        # Eliminate dates after end_date
        paypal = paypal.selectle('Date', end_date)
    # Any cancelled trades basically cancel out, so we can eliminate most of
    # them right off the bat.
    paypal = eliminate_cancellations(paypal)

    # --------------------
    # 2. CREATE QUICKBOOKS IIF FILE
    print("Creating output IIF file")
    # We always delete the file and start fresh
    try:
        os.remove(iif_path)
    except FileNotFoundError:
        # If the file wasn't there in the first place, no problem.
        pass
    # Start with the names data, add that to the .IIF file.
    get_customer_names(paypal).appendtsv(iif_path, write_header=True)
    # TicketLeap fees have a header for both the transaction and the split,
    # so I have to write to the IIF file within the function
    paypal = append_TicketLeap_fees(paypal, iif_path)
    # TicketLeap sales receipts make up the bulk of the transactions
    paypal = append_sales_as_deposits(paypal, iif_path)
    # Invoices are for tickets or for membership sales
    paypal = append_invoices(paypal, iif_path)

    # --------------------
    # 3. CREATE UNPROCESSED ROWS FILE
    print("Creating output unprocessed rows CSV file (" + str(paypal.nrows()) + " rows)")
    unprocessed_file = open(unprocessed_path, 'w')
    writer = csv.writer(unprocessed_file, lineterminator='\n')
    writer.writerows(paypal)
    unprocessed_file.close()
def get_table(self) -> etl.Table:
    return etl.fromcsv(self.file.path)
import petl
import os

# Use this file to prepare CSV data from
# webrobot.io's kickstarter data

# change this listdir input to whatever
# your path to your data is
files = os.listdir(".")[0:-1]
print(files)

first = True
for i in files:
    data = petl.fromcsv(i)
    blurbs = petl.cut(data, 'blurb')
    print(petl.look(blurbs))
    if first:
        petl.tocsv(blurbs, 'blurbs.csv')
        first = False
    else:
        petl.appendcsv(blurbs, 'blurbs.csv')
def execute(self, context):
    try:
        nm_arq = 'COSTOS.csv'
        with open(f'{_PROC_FILES}/{nm_arq}', 'wb') as data_from:
            data_from.write(
                self.client_from.get_blob_client(nm_arq).download_blob().readall())
        table = etl.fromcsv(f'{_PROC_FILES}/{nm_arq}', delimiter='|')

        def rowmapper(row):
            strnull = {'NULL': ''}
            return [
                row[0].strip(),
                row[1].strip(),
                row[2].strip(),
                row[3].strip(),
                strnull[row['nr_guia_principal'].strip()] if row['nr_guia_principal'].strip() in strnull else row['nr_guia_principal'].strip(),
                row[5].strip(),
                strnull[row['fk_cid'].strip()] if row['fk_cid'].strip() in strnull else row['fk_cid'].strip(),
                strnull[row['dt_ini_internacao'].strip()] if row['dt_ini_internacao'].strip() in strnull else row['dt_ini_internacao'].strip(),
                strnull[row['dt_fim_internacao'].strip()] if row['dt_fim_internacao'].strip() in strnull else row['dt_fim_internacao'].strip(),
                strnull[row['fk_medicosolicitante'].strip()] if row['fk_medicosolicitante'].strip() in strnull else row['fk_medicosolicitante'].strip(),
                strnull[row['cod_executante'].strip()] if row['cod_executante'].strip() in strnull else row['cod_executante'].strip(),
                row[11].strip(),
                row[12].strip(),
                row[13].strip(),
                row[14].strip(),
                row[15].strip(),
                row[16].strip(),
                strnull[row['qtd_unica'].strip()] if row['qtd_unica'].strip() in strnull else row['qtd_unica'].strip(),
                row[18].strip(),
                row[19].strip(),
                row[20].strip(),
                row[21].strip(),
                row[22].strip(),
                row[23].strip(),
                row[24].strip(),
                strnull[row['tipo_internacao'].strip()] if row['tipo_internacao'].strip() in strnull else row['tipo_internacao'].strip(),
                strnull[row['tipo_acomodacao'].strip()] if row['tipo_acomodacao'].strip() in strnull else row['tipo_acomodacao'].strip(),
                strnull[row['ds_indicacao_clinica'].strip()] if row['ds_indicacao_clinica'].strip() in strnull else row['ds_indicacao_clinica'].strip(),
                row[28].strip(),
                row[29].strip(),
                row[30].strip(),
                strnull[row['partic_medico'].strip()] if row['partic_medico'].strip() in strnull else row['partic_medico'].strip(),
            ]

        table1 = etl.rowmap(table, rowmapper, header=[
            'cod_custo', 'fk_beneficiario', 'mes_competencia', 'nr_guia',
            'nr_guia_principal', 'dt_realizacao', 'fk_cid',
            'dt_ini_internacao', 'dt_fim_internacao', 'fk_medicosolicitante',
            'cod_executante', 'fk_local_atend', 'fk_capa_unico',
            'fk_destino_pgto', 'fk_servico', 'cod_servico_tipo', 'qtd',
            'qtd_unica', 'vl_unitario', 'vl_total', 'vl_coparticipacao',
            'ind_internacao', 'tipo_registro', 'tipo_atendimento',
            'tipo_guia', 'tipo_internacao', 'tipo_acomodacao',
            'ds_indicacao_clinica', 'vl_hm', 'vl_co', 'vl_filme',
            'partic_medico'])
        table2 = etl.addfields(table1, [('fk_operadora', _SOURCE)])
        etl.tocsv(table2, f'{_PROC_FILES}/t{nm_arq}', delimiter='|')
        self.logger.info(f"Destino do arquivo {self.client_to}")
        self.logger.info(f"Container destino {self.container_to}")
    except azure.core.exceptions.ResourceNotFoundError:
        print('Entrou na exceção :)')

    upload_file = f'{_PROC_FILES}/t{nm_arq}'
    try:
        if os.path.isfile(upload_file):
            with open(upload_file, "rb") as data:
                self.client_to.upload_blob(nm_arq, data, overwrite=True)
            self.logger.info(f'{data} carregado')
        else:
            self.logger.info(f't{nm_arq} não foi encontrado no container')
    finally:
        self.logger.info('Tudo Carregado')
def read(csv_file):
    return petl.fromcsv(data.csv_open(path + '/' + csv_file),
                        encoding='utf-8', errors='strict')
def execute(self, context):
    try:
        nm_arq = 'PRESTADORES.csv'
        with open(f'{_PROC_FILES}/{nm_arq}', 'wb') as data_from:
            data_from.write(
                self.client_from.get_blob_client(nm_arq).download_blob().readall())
        table = etl.fromcsv(f'{_PROC_FILES}/{nm_arq}', delimiter='|')

        def rowmapper(row):
            strnull = {'NULL': ''}
            return [
                row[0].strip(),
                row[1].strip(),
                strnull[row['nome_completo'].strip()] if row['nome_completo'].strip() in strnull else row['nome_completo'].strip(),
                row[3].strip(),
                row[4].strip(),
                strnull[row['dat_nascimento'].strip()] if row['dat_nascimento'].strip() in strnull else row['dat_nascimento'].strip(),
                row[6].strip(),
                row[7].strip(),
                row[8].strip(),
                strnull[row['cnae'].strip()] if row['cnae'].strip() in strnull else row['cnae'].strip(),
                row[10].strip(),
                strnull[row['grau'].strip()] if row['grau'].strip() in strnull else row['grau'].strip(),
                row[12].strip(),
                row[13].strip(),
                row[14].strip(),
                row[15].strip(),
                row[16].strip(),
                row[17].strip(),
                row[18].strip(),
                strnull[row['latitude'].strip()] if row['latitude'].strip() in strnull else row['latitude'].strip(),
                strnull[row['longitude'].strip()] if row['longitude'].strip() in strnull else row['longitude'].strip(),
                strnull[row['fone1'].strip()] if row['fone1'].strip() in strnull else row['fone1'].strip(),
                strnull[row['fone2'].strip()] if row['fone2'].strip() in strnull else row['fone2'].strip(),
                strnull[row['fone3'].strip()] if row['fone3'].strip() in strnull else row['fone3'].strip(),
            ]

        table1 = etl.rowmap(table, rowmapper, header=[
            'pk_medico_prestador', 'nome_fantasia', 'nome_completo',
            'cod_crm', 'f_crm', 'dat_nascimento', 'num_cpf_cnpj',
            'tipo_prestador', 'grupo_prestador', 'cnae', 'especialidade',
            'grau', 'endereço', 'numero', 'Complemento', 'bairro',
            'cidade_prestador', 'uf_prestador', 'cep', 'latitude',
            'longitude', 'fone1', 'fone2', 'fone3'])
        table2 = etl.addfields(table1, [('fk_operadora', _SOURCE)])
        etl.tocsv(table2, f'{_PROC_FILES}/t{nm_arq}', delimiter='|')
        self.logger.info(f"Destino do arquivo {self.client_to}")
        self.logger.info(f"Container destino {self.container_to}")
    except azure.core.exceptions.ResourceNotFoundError:
        print('Entrou na exceção :)')

    upload_file = f'{_PROC_FILES}/t{nm_arq}'
    try:
        if os.path.isfile(upload_file):
            with open(upload_file, "rb") as data:
                self.client_to.upload_blob(nm_arq, data, overwrite=True)
            self.logger.info(f'{data} carregado')
        else:
            self.logger.info(f't{nm_arq} não foi encontrado no container')
    finally:
        self.logger.info('Tudo Carregado')
                  standardize_nulls(a['std_high_num'])) else None) \
    .addfield('change_stdir', lambda a: 1 if a['no_address'] != 1 and str(standardize_nulls(a['stdir'])) != str(
        standardize_nulls(a['std_street_predir'])) else None) \
    .addfield('change_stnam', lambda a: 1 if a['no_address'] != 1 and str(standardize_nulls(a['stnam'])) != str(
        standardize_nulls(a['std_street_name'])) else None) \
    .addfield('change_stdes', lambda a: 1 if a['no_address'] != 1 and str(standardize_nulls(a['stdes'])) != str(
        standardize_nulls(a['std_street_suffix'])) else None) \
    .addfield('change_stdessuf', lambda a: 1 if a['no_address'] != 1 and str(standardize_nulls(a['stdessuf'])) != str(
        standardize_nulls(a['std_address_postdir'])) else 0) \
    .tocsv('dor_parcel_address_analysis.csv', write_header=True)

# get address_summary rows with dor_parcel_id as array:
address_summary_rows = address_summary_out_table \
    .addfield('dor_parcel_id_array', lambda d: d.dor_parcel_id.split('|') if d.dor_parcel_id else []) \
    .addfield('opa_account_num_array', lambda d: d.opa_account_num.split('|') if d.opa_account_num else [])

dor_parcel_address_analysis = etl.fromcsv('dor_parcel_address_analysis.csv')
mapreg_count_map = {}
address_count_map = {}
dor_parcel_header = dor_parcel_address_analysis[0]

# count_maps
for row in dor_parcel_address_analysis[1:]:
    dict_row = dict(zip(dor_parcel_header, row))
    mapreg = dict_row['mapreg']
    std_street_address = dict_row['std_street_address']
    if mapreg not in mapreg_count_map:
        mapreg_count_map[mapreg] = 1
    else:
        mapreg_count_map[mapreg] += 1
    if std_street_address not in address_count_map:
        address_count_map[std_street_address] = 1
today = parser.parse(inputstr + '00+0000').astimezone(tz.tzlocal())

dataset = {}
for i in ('temp', 'hvel', 'zcor'):
    dataset[i] = netCDF4.MFDataset([
        os.path.join(head, 'ocm3_{0}_{1}_H{2}.nc'.format(i, inputstr, j))
        for j in ('-23_00', '01_24', '25_48', '49_72')])

find = [-1, -5, -10]
depth = petl.fromcolumns([find], ['depth'])
points = petl.fromcsv(os.path.join(home, 'var', 'Points')).convert({
    'lat': float,
    'lon': float,
    'ncpos': int})
pPoints = [int(x) for x in open(os.path.join(home, 'var', 'PengHu'))]

nodelist = points.values('ncpos')
zcorlist = dataset['zcor']['zcor'][:].take(nodelist, 2)
templist = dataset['temp']['temp'][:].take(nodelist, 2)
ulist = dataset['hvel']['u'][:].take(nodelist, 2)
vlist = dataset['hvel']['v'][:].take(nodelist, 2)

tempout = numpy.ndarray((timelen, 3, 189), numpy.float32)
speedout = numpy.ndarray((timelen, 3, 189), numpy.float32)
dirout = numpy.ndarray((timelen, 3, 189), numpy.float32)
import petl as etl
import sqlite3
from textblob import TextBlob
import plotly.plotly as py
import plotly
from plotly.graph_objs import *

conn = sqlite3.connect('billboard_top100.db')
cur = conn.cursor()

billboard_top100 = etl.fromcsv("billboard_lyrics_1964-2015-2.csv",
                               encoding="ISO-8859-1")

create_songs = ("CREATE TABLE IF NOT EXISTS Songs "
                "(id INTEGER PRIMARY KEY AUTOINCREMENT, Rank TEXT, Song TEXT, "
                "Artist TEXT, Year TEXT, Lyrics TEXT)")
cur.execute(create_songs)

cut_data = etl.cut(billboard_top100, "Rank", "Song", "Artist", "Year", "Lyrics")
etl.todb(cut_data, conn, 'Songs')

select_lyrics = "SELECT Year, Lyrics from Songs"
cur.execute(select_lyrics)
lyrics_list = cur.fetchall()

lyrics_dict = {}
for (x, y) in lyrics_list:
    if x not in lyrics_dict.keys():
        lyrics_dict[x] = [y]
    else:
        lyrics_dict[x].append(y)
def retrieve_rna_data(self):
    """Parse the 'rna_tissue' csv file: RNA levels in 56 cell lines and
    37 tissues based on RNA-seq from HPA.

    :return: dict
    """
    self.logger.info('get rna tissue rows into dicts')
    self.logger.debug('melting rna level table into geneid tissue level')

    t_level = (petl.fromcsv(URLZSource(self.rna_level_url), delimiter='\t')
               .melt(key='ID', variablefield='tissue', valuefield='rna_level')
               .rename({'ID': 'gene'})
               .addfield('tissue_label',
                         lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
               .addfield('tissue_code',
                         lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('anatomical_systems',
                         lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('organs',
                         lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
               .cutout('tissue'))

    t_value = (petl.fromcsv(URLZSource(self.rna_value_url), delimiter='\t')
               .melt(key='ID', variablefield='tissue', valuefield='rna_value')
               .rename({'ID': 'gene'})
               .addfield('tissue_label',
                         lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
               .addfield('tissue_code',
                         lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('rna_unit', 'TPM')
               .cutout('tissue'))

    t_zscore = (petl.fromcsv(URLZSource(self.rna_zscore_url), delimiter='\t')
                .melt(key='ID', variablefield='tissue', valuefield='zscore_level')
                .rename({'ID': 'gene'})
                .addfield('tissue_label',
                          lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
                .addfield('tissue_code',
                          lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
                .cutout('tissue'))

    t_vl = petl.join(t_level, t_value,
                     key=('gene', 'tissue_code', 'tissue_label'),
                     presorted=True)
    t_join = (petl.join(t_vl, t_zscore,
                        key=('gene', 'tissue_code', 'tissue_label'),
                        presorted=True)
              .aggregate('gene',
                         aggregation={
                             'data': (('tissue_code', 'tissue_label',
                                       'rna_level', 'rna_value', 'rna_unit',
                                       'anatomical_systems', 'organs',
                                       'zscore_level'), list)},
                         presorted=True))
    return t_join
## Purpose: Combine the newly created datasets from tureen_txt_manip.py and tureen_csv_manip.py into a single .db (database) file.
## The resulting database can then be used to visualize the trend of wins for all the English Premier League teams for each season,
## from season 2000-2001 to season 2016-2017.

# Import modules
import csv
import petl as etl
import sqlite3

# Needed for Windows encoding, otherwise will not print appropriate symbols
import codecs
import sys
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)

# Convert CSV files to petl tables for further data manipulation
chelsea_data = etl.fromcsv('chelseaResults.csv', encoding='utf-8')
manu_data = etl.fromcsv('manunitedResults.csv', encoding='utf-8')
mancity_data = etl.fromcsv('mancityResults.csv', encoding='utf-8')
arsenal_data = etl.fromcsv('arsenalResults.csv', encoding='utf-8')
tottenham_data = etl.fromcsv('tottenhamResults.csv', encoding='utf-8')
liverpool_data = etl.fromcsv('liverpoolResults.csv', encoding='utf-8')
everton_data = etl.fromcsv('evertonResults.csv', encoding='utf-8')

print(chelsea_data)

## Connect to SQLite and initialize a database
conn = sqlite3.connect('epl_teamWins.db')
cur = conn.cursor()

## Create table for Chelsea F.C.
createCFC = """CREATE TABLE IF NOT EXISTS Chelsea
               (Year int primary key not null, ChelseaWins int)"""
# List of source and target files for merge
healthcsvfile = './datafiles/Vic_Health_Care.csv'
locationxmlfile = './datafiles/Vic_Locations.xml'
mergedcsvfile = './datafiles/practice_locations.csv'

# xmlfields is a dictionary mapping XML element names to output field names
xmlfields = {'Town_name': 'Town', 'Latitude': 'Lat', 'Longitude': 'Lon'}  # type: Dict[str, str]
xmlparent = 'Town_location'
initialrow = ['Practice_Name', 'Latitude', 'Longitude', 'Town', 'State', 'Post_Code']

# tables in memory created from xml and csv files
csvtable = petl.fromcsv(healthcsvfile)
xmltable = petl.fromxml(locationxmlfile, xmlparent, xmlfields)

# Find the row in xmltable matching town from csv
lktbl = petl.lookupone(xmltable, 'Town_name')  # type: Union[Dict[Any, Union[tuple[Any], Tuple[Any]]], Any]
nmdtbl = petl.namedtuples(csvtable)

finaltabl = [initialrow]
for lin in nmdtbl:
    tabl = lktbl[lin.Town]
    latitude = tabl[0]
    longitude = tabl[1]
    insertline = (str(lin.Practice_Name) + ',' + latitude + ',' + longitude + ',' +
                  str(lin.Town) + ',' + str(lin.State) + ',' + str(lin.Postcode)).split(',')
    print insertline
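# As an aside, the manual lookup-and-concatenate merge above can also be
# expressed with petl's own join; a sketch under the assumption that the
# 'Town' and 'Town_name' fields line up as in the code above:
merged = petl.join(csvtable,
                   petl.rename(xmltable, 'Town_name', 'Town'),
                   key='Town')
petl.tocsv(merged, mergedcsvfile)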
def read_csv(file):
    return etl.fromcsv(file)
        return 'AS'
    elif (country == 'MS Zaandam'):
        return 'NA'
    elif (country == 'Kosovo') or (country == 'Holy See'):
        return 'EU'
    else:
        return 'N/A'


# Sources for the data we are going to read
uri_confirmed = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
uri_death = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
uri_recovered = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# Process confirmed cases first
t_confirmed = etl.fromcsv(uri_confirmed)

# Rename the headers
t_confirmed = etl.rename(t_confirmed, {'Country/Region': 'Country'})

# Adjust the data types
# From column 5 onwards the data type is integer (the number of people/cases).
# We also take the opportunity to change the header date format from 1/23/20
# to 2020-01-23.
headers = etl.fieldnames(t_confirmed)
i = 0
for header in headers:
    if i >= 4:
        t_confirmed = etl.convert(t_confirmed, header, int)  # fix the data type
        fecha = datetime.datetime.strptime(
            header, '%m/%d/%y')  # compute the date in the correct format
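# The fragment above cuts off inside the loop. A self-contained sketch of the
# same header-rewrite idea (the tiny table here is made up), showing how a
# m/d/yy column header can be renamed to ISO format with etl.rename:
import datetime
import petl as etl

t = [('Country', '1/23/20', '1/24/20'), ('Peru', 0, 1)]
for i, header in enumerate(etl.fieldnames(t)):
    if i >= 1:  # date columns start after the key columns
        fecha = datetime.datetime.strptime(header, '%m/%d/%y')
        t = etl.rename(t, header, fecha.strftime('%Y-%m-%d'))
print(etl.header(t))  # ('Country', '2020-01-23', '2020-01-24')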
import pickle
import petl as etl
import csv

cols = [[0, 1, 2], ['a', 'b', 'c']]
table1 = etl.fromcolumns(cols)
print(table1)

########################### CSV Reading ###############################
table2 = etl.fromcsv('AAPL.csv')
print(table2['Date'])
print(table2)
etl.tocsv(table1, 'example.csv')  # writing to a CSV file

########################## Reading from Pickle files ####################
"""
What is pickle?
Pickling is a way to convert a python object (list, dict, etc.) into a
character stream. The idea is that this character stream contains all the
information necessary to reconstruct the object in another python script.
"""

# Creating a pickle file
a = ['test value', 'test value 2', 'test value 3']
file_Name = "testfile"
# open the file for writing
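# A minimal sketch of how the truncated pickle example presumably continues
# (this dump/load round trip is an assumption based on the comments above,
# not part of the source):
fileObject = open(file_Name, 'wb')  # open the file for writing
pickle.dump(a, fileObject)          # serialize the list to the file
fileObject.close()

with open(file_Name, 'rb') as fileObject:
    b = pickle.load(fileObject)     # reconstruct the object
print(b == a)                       # True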
def petl_view(self) -> petl.Table:
    """``petl.Table`` built from stored CSV ``file``."""
    return petl.fromcsv(source=self.file)
def do_etl(bx_users_fn, bx_books_fn, bx_book_ratings_fn, db_fn):
    """Run the ETL procedures over all the BX csv files, populating a single
    sqlite .db file.

    Arguments:
    bx_users_fn -- path to the BX-Users.csv file.
    bx_books_fn -- path to the BX-Books.csv file.
    bx_book_ratings_fn -- path to the BX-Book-Ratings.csv file.
    db_fn -- path to the output sqlite .db file.

    Returns: None
    """
    errargs = {"errorvalue": ERROR_VALUE}

    # Transform BX-Users.csv
    tab = etl.fromcsv(bx_users_fn, delimiter=DELIMITER, escapechar=ESCAPECHAR,
                      quoting=QUOTE_ALL, encoding=ENCODING)
    tab = tab.rename({"User-ID": "user_id",
                      "Location": "location",
                      "Age": "age"})
    users_tab = (tab
                 .convert("user_id", (lambda s: s.strip()), **errargs)
                 .convert("location", clean_text, **errargs)
                 .convert("age", (lambda s: int(s)), **errargs))

    # Transform BX-Books.csv
    tab = etl.fromcsv(bx_books_fn, delimiter=DELIMITER, escapechar=ESCAPECHAR,
                      quoting=QUOTE_ALL, encoding=ENCODING)
    tab = tab.rename({"ISBN": "isbn",
                      "Book-Title": "title",
                      "Book-Author": "author",
                      "Year-Of-Publication": "year",
                      "Publisher": "publisher",
                      "Image-URL-S": "img_url_s",
                      "Image-URL-M": "img_url_m",
                      "Image-URL-L": "img_url_l"})
    books_tab = (tab
                 .convert("isbn", clean_isbn, **errargs)
                 .convert("title", clean_text, **errargs)
                 .convert("author", clean_text, **errargs)
                 .convert("year", clean_year, **errargs)
                 .convert("publisher", clean_text, **errargs)
                 .convert("img_url_s", (lambda s: s.strip()), **errargs)
                 .convert("img_url_m", (lambda s: s.strip()), **errargs)
                 .convert("img_url_l", (lambda s: s.strip()), **errargs))

    # Transform BX-Book-Ratings.csv
    tab = etl.fromcsv(bx_book_ratings_fn, delimiter=DELIMITER,
                      escapechar=ESCAPECHAR, quoting=QUOTE_ALL,
                      encoding=ENCODING)
    tab = tab.rename({"User-ID": "user_id",
                      "ISBN": "isbn",
                      "Book-Rating": "rating"})
    ratings_tab = (tab
                   .convert("user_id", (lambda s: s.strip()), **errargs)
                   .convert("isbn", clean_isbn, **errargs)
                   .convert("rating", clean_rating, **errargs))

    with sqlite3.connect(db_fn) as conn:
        # Load each table into the sqlite db.
        users_tab.todb(conn, "users", create=True)
        books_tab.todb(conn, "books", create=True, drop=True)
        ratings_tab.todb(conn, "ratings", create=True)
        conn.commit()
    return
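# The cleaning helpers above (clean_text, clean_isbn, clean_year,
# clean_rating) are defined elsewhere in the source. Purely hypothetical
# sketches of their likely shape, for orientation only; note that raising an
# exception causes petl's convert(..., errorvalue=ERROR_VALUE) to substitute
# the error value:
def clean_text(s):
    return ' '.join(s.strip().split())  # collapse runs of whitespace

def clean_isbn(s):
    return s.strip().upper().replace('-', '')

def clean_year(s):
    year = int(s)
    if year <= 0:  # the bound is a guess, not from the source
        raise ValueError(s)
    return year

def clean_rating(s):
    rating = int(s)
    if not 0 <= rating <= 10:  # BX ratings range over 0-10
        raise ValueError(s)
    return rating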
def execute(self, context):
    try:
        nm_arq = 'BENEFICIARIO.csv'
        with open(f'{_PROC_FILES}/{nm_arq}', 'wb') as data_from:
            data_from.write(
                self.client_from.get_blob_client(nm_arq).download_blob().readall())
        table = etl.fromcsv(f'{_PROC_FILES}/{nm_arq}', delimiter='|')

        def rowmapper(row):
            transmf = {'MASCULINO': 'M', 'FEMININO': 'F'}
            strnull = {'NULL': ''}
            return [
                row[0].strip(),
                row[1].strip(),
                row[2].strip(),
                row[3].strip(),
                row['nome'].strip(),
                row[5].strip(),
                row[6].strip(),
                row[7].strip(),
                row[8].strip(),
                strnull[row['copart_percentual'].strip()] if row['copart_percentual'].strip() in strnull else row['copart_percentual'].strip(),
                strnull[row['limite_copart'].strip()] if row['limite_copart'].strip() in strnull else row['limite_copart'].strip(),
                row[11].strip(),
                row[12].strip(),
                row[13].strip(),
                row[14].strip(),
                strnull[row['dt_exclusao'].strip()] if row['dt_exclusao'].strip() in strnull else row['dt_exclusao'].strip(),
                row[16],
                transmf[row['sexo'].strip()] if row['sexo'].strip() in transmf else None,
                row[18].strip(),
                row[19].strip(),
                row[20].strip(),
                row[21].strip(),
                row[22].strip(),
                row[23].strip(),
                row[24].strip(),
                row[25].strip(),
                row[26].strip(),
                row[27].strip(),
                row[28].strip(),
                row[29].strip(),
            ]

        table1 = etl.rowmap(table, rowmapper, header=[
            'pk_beneficiario', 'nr_beneficiario', 'nr_beneficiario_tit',
            'fk_empresa', 'nome', 'cpf', 'dt_nascimento', 'cod_plano',
            'descricao_plano', 'copart_percentual', 'limite_copart',
            'tipo_acomodacao', 'abrangencia_plano', 'grau_dependencia',
            'dt_inclusao', 'dt_exclusao', 'nome_mae', 'sexo',
            'tipo_contrato', 'endereco', 'bairro', 'cidade', 'uf', 'cep',
            'tipo_cliente', 'nr_cartaonacionalsaude', 'plano_regulamentado',
            'descricao_entidade', 'tipo_pessoa', 'acao_judicial'])
        table2 = etl.addfields(table1, [('fk_operadora', _SOURCE)])
        etl.tocsv(table2, f'{_PROC_FILES}/t{nm_arq}', delimiter='|')
        self.logger.info(f"Destino do arquivo {self.client_to}")
        self.logger.info(f"Container destino {self.container_to}")
    except azure.core.exceptions.ResourceNotFoundError:
        print('Entrou na exceção :)')

    upload_file = f'{_PROC_FILES}/t{nm_arq}'
    try:
        if os.path.isfile(upload_file):
            with open(upload_file, "rb") as data:
                self.client_to.upload_blob(nm_arq, data, overwrite=True)
            self.logger.info(f'{data} carregado')
        else:
            self.logger.info(f't{nm_arq} não foi encontrado no container')
    finally:
        self.logger.info('Tudo Carregado')