def test_values():
    """Test the values function."""

    table = (('foo', 'bar', 'baz'),
             ('a', 1, True),
             ('b', 2),
             ('b', 7, False))

    actual = values(table, 'foo')
    expect = ('a', 'b', 'b')
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, 'bar')
    expect = (1, 2, 7)
    ieq(expect, actual)
    ieq(expect, actual)

    # old style signature for multiple fields, still supported
    actual = values(table, ('foo', 'bar'))
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual)
    ieq(expect, actual)

    # as of 0.24 new style signature for multiple fields
    actual = values(table, 'foo', 'bar')
    expect = (('a', 1), ('b', 2), ('b', 7))
    ieq(expect, actual)
    ieq(expect, actual)

    actual = values(table, 'baz')
    expect = (True, None, False)
    ieq(expect, actual)
    ieq(expect, actual)
def get_delta(source_table, target_table, key='id'):
    source_table_headers = etl.header(source_table)
    target_table_headers = etl.header(target_table)

    if source_table_headers != target_table_headers:
        raise Exception(
            'Source table columns do not match target table columns')

    source_ids = etl.cut(source_table, key)
    target_ids = etl.cut(target_table, key)
    added_ids_table, _ = etl.diff(source_ids, target_ids)

    merged_table = etl.merge(source_table, target_table, key=key)

    load_frame = etl.todataframe(
        etl.selectin(target_table, key, etl.values(added_ids_table, key)))
    print(load_frame)

    for row in etl.data(merged_table):
        for i, col in enumerate(row):
            if isinstance(col, etl.transform.reductions.Conflict):
                changes = tuple(col)
                print('For car {}, {} changed from {} to {}'.format(
                    row[0], source_table_headers[i], changes[1], changes[0]))

                row_dict = dict(zip(source_table_headers, list(row)))
                row_dict[source_table_headers[i]] = changes[0]
                row_dict = {key: [val] for (key, val) in row_dict.items()}
                print(row_dict)

                df = pd.DataFrame(row_dict)
                load_frame = load_frame.append(df, ignore_index=True)
                break

    return etl.fromdataframe(load_frame)
def _has_observations(self, record):
    sql = '''select count(PointID) from dbo.WaterLevelsContinuous_Pressure
             where PointID=%s'''
    pid = record['PointID']
    table = petl.fromdb(nm_aquifier_connection(), sql, (pid, ))
    nobs = petl.values(table, '')[0]
    print(f'{pid} has nobs={nobs}')
    return bool(nobs)
def get_max_dt(self):
    """
    Gets the current maximum date in the table
    :return:
    """
    sql = 'select max(dt) as max_dt from ol_transactions'
    self.log("SQL: {0}".format(sql))
    table = petl.fromdb(self.connection, sql)
    max_dt = petl.values(table, 'max_dt')[0]
    return max_dt
def get_calls_per_office(parsons_table):
    target_list = []
    for targets in petl.values(parsons_table.table, "target_names"):
        for target in targets:
            target_list.append(target)

    counter = collections.Counter(target_list)
    calls_counter = dict(counter)
    calls_per_office = [{"name": key, "num_calls": value}
                        for key, value in calls_counter.items()]
    return Table(calls_per_office)
def test_minimal_start():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(), 'test 1', datetime(2000, 1, 1, 0, 15)],
        [timedelta(), 'test 1', datetime(2000, 1, 1, 20, 15)],
    ]
    result = group_entries_by_day(data)
    assert set(petl.values(result, 'start')) == {datetime(2000, 1, 1, 0, 15)}
def sep_valuecounter(table, col_name, sep_char=';'):
    dict_sep = {}
    for value in petl.values(table, col_name):
        if value.strip() == '':
            continue
        else:
            for sep in value.split(sep_char):
                sep_str = sep.strip()
                if sep_str not in dict_sep:
                    dict_sep[sep_str] = 0
                dict_sep[sep_str] += 1
    return dict_sep
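A minimal usage sketch for sep_valuecounter, assuming petl is importable and the function above is in scope; the inline table is hypothetical:

import petl

# hypothetical table with a ';'-separated 'tags' column
example = [['tags'],
           ['red; blue'],
           ['blue'],
           ['']]

# blanks are skipped; each separated value is counted per occurrence
print(sep_valuecounter(example, 'tags'))  # {'red': 1, 'blue': 2}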
def get_min_dt(self, last):
    """
    Gets the minimum date considering previous extractions from the table.
    :param last:
    :return:
    """
    if last is None or len(last) == 0:
        sql = "select min(dt) as min_dt from ol_transactions"
    else:
        sql = "select min(dt) as min_dt from ol_transactions where dt >= '{0}'".format(last)
    self.log("SQL: {0}".format(sql))
    table = petl.fromdb(self.connection, sql)
    extract_dt = petl.values(table, 'min_dt')[0]
    return extract_dt
def test_sum_duration():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(minutes=1), 'test 1', datetime(2000, 1, 1, 15, 15)],
        [timedelta(minutes=1), 'test 1', datetime(2000, 1, 1, 20, 15)],
        [timedelta(hours=2), 'test 1', datetime(2000, 1, 20, 15, 15)],
        [timedelta(hours=1), 'test 1', datetime(2000, 1, 20, 20, 15)],
    ]
    result = group_entries_by_day(data)
    assert set(petl.values(result, 'dur')) == {timedelta(minutes=2),
                                               timedelta(hours=3)}
def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    column = petl.values(table, col_name)
    nrows = petl.nrows(table)
    non_blanks = petl.select(table, '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict
def load_split_transaction_details(list_split_details_files):
    dict_split_transaction_details = {}
    if list_split_details_files is not None:
        for split_details_file in list_split_details_files:
            assert os.path.isfile(split_details_file), \
                "Error: cannot open file '" + split_details_file + "'"
            table = petl.fromcsv(split_details_file)
            account_name = get_account_name(petl.values(table, 'Account'))
            for row in petl.records(table):
                if row['Account'] == 'Split Transaction':
                    string_key = row['Individual ID'] + ',' + row['Batch Date']
                    if string_key not in dict_split_transaction_details:
                        dict_split_transaction_details[string_key] = []
                    dict_split_transaction_details[string_key].append({
                        'Account': account_name,
                        'Amount': row['Amount'],
                        'Tax': row['Tax']
                    })
    return dict_split_transaction_details
def get_column_max_width(self, column):
    """
    Return the maximum width of the column.

    `Args:`
        column: str
            The column name.

    `Returns:`
        int
    """
    max_width = 0

    for v in petl.values(self.table, column):
        if len(str(v)) > max_width:
            max_width = len(str(v))

    return max_width
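As a rough sanity check of the same idea outside the class, assuming a plain petl table rather than self.table; the sample rows are hypothetical:

import petl

tbl = [['name'], ['ann'], ['beatrice']]
# longest stringified value in the 'name' column
print(max(len(str(v)) for v in petl.values(tbl, 'name')))  # 8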
def phone(self, table):
    """
    Match based on a list of up to 500 phone numbers.

    `Args:`
        table: parsons table
            See :ref:`parsons-table`. One row per phone number,
            up to 500 phone numbers.
    `Returns:`
        See :ref:`parsons-table` for output options.
    """
    url = self.connection.uri + 'person/phone-search'
    args = {'phones': list(petl.values(table.table, 0))}
    return Table(
        self.connection.request(url, args=args, raw=True)['result'])
def process_records(self):
    """
    Handles querying and extraction
    :return:
    """
    rows = petl.values(self.table, 'dt', 'total', 'duration')
    row_count = 0
    measurements = []
    properties = {'app_id': self.app_id}
    source = "littledog.com"

    for row in rows:
        timestamp = int(row[0].strftime('%s'))
        total = int(row[1])
        duration = int(row[2])
        logging.debug("Add Measurements => dt: {0}, total: {1}, duration: {2} ".format(
            timestamp, total, duration))
        row_count += 1

        measurements.append(Measurement(metric='ONLINE_TRANSACTION_COUNT',
                                        source=source,
                                        value=total,
                                        timestamp=timestamp,
                                        properties=properties))
        measurements.append(Measurement(metric='ONLINE_TRANSACTION_TIME',
                                        source=source,
                                        value=duration,
                                        timestamp=timestamp,
                                        properties=properties))

        # Send when we have batch of 10 measurements
        if row_count == 10:
            # send measurements
            self.send_measurements(measurements)
            measurements = []
            row_count = 0

    # If there are any remaining measurements send them on
    if len(measurements) > 0:
        self.api.measurement_create_batch(measurements)
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')
    key_dep_dict = {}
    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')
    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
def split_dataset(dataset, p_train_data, split_mode):
    fields = list(fieldnames(dataset))
    size_dataset = len(values(dataset, fields[0]))
    size_train_data = int(round(size_dataset * p_train_data))
    size_test_data = abs(size_train_data - size_dataset)

    if split_mode == 'normal':
        train_data = head(dataset, size_train_data - 1)
        if size_test_data == 0:
            test_data = []
        else:
            test_data = tail(dataset, size_test_data - 1)

    # TODO: shuffle mode still to be implemented
    return train_data, test_data
tempfile = NamedTemporaryFile(mode='w',
                              dir=base_path + "/content/data/",
                              delete=False)
table = etl.fromcsv(file_name,
                    delimiter='\t',
                    skipinitialspace=True,
                    quoting=csv.QUOTE_NONE,
                    quotechar='',
                    escapechar='')
key_dep_dict = {}
print(file_name)
for constraint in value:
    child_dep, parent_dep = constraint.split(':')
    data = etl.values(table, child_dep, parent_dep)
    for d in data:
        key_dep_set = {d[1]}
        key_dep_dict.update({d[0]: key_dep_set})

key_dep_list = toposort_flatten(key_dep_dict)
table = etl.addfield(
    table, 'pwb_index',
    lambda rec: int(key_dep_list.index(rec[child_dep])))
table = etl.sort(table, 'pwb_index')
table = etl.cutout(table, 'pwb_index')

writer = csv.writer(tempfile, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='',
try:
    # if the user provided a database password
    if args.database_pass:
        conn = psycopg2.connect(
            "dbname={} user={} password={} host='localhost'".format(
                args.database_name, args.database_user, args.database_pass))
    # otherwise, connect without a password
    else:
        conn = psycopg2.connect("dbname={} user={} host='localhost'".format(
            args.database_name, args.database_user))
except:
    print("Unable to connect to the database")
    exit()

# get these fields from the CSV
for record in etl.values(records, args.from_field_name, args.to_field_name):
    with conn:
        with conn.cursor() as curs:
            # resource_type_id 2 is metadata for items
            sql = "SELECT text_value FROM metadatavalue WHERE resource_type_id=2 AND metadata_field_id=%s AND text_value=%s"
            curs.execute(sql, (args.metadata_field_id, record[0]))
            records_to_fix = curs.rowcount
            print("Fixing {} occurrences of: {}".format(records_to_fix, record[0]))

    with conn:
        with conn.cursor() as curs:
            sql = "UPDATE metadatavalue SET text_value=%s WHERE resource_type_id=2 AND metadata_field_id=%s AND text_value=%s"
            curs.execute(sql, (record[1], args.metadata_field_id, record[0]))
            rows_updated = curs.rowcount
            print("> {} occurrences updated".format(rows_updated))
def extract_values(dataset, field_position):
    fields = list(fieldnames(dataset))
    field_values = values(dataset, field_position)
    return field_values
def _get_field_sample(resource_sample, field):
    """Return a subset of the relevant data column."""
    sample_table = petl.fromdicts(resource_sample)
    sample_column = list(petl.values(sample_table, field['name']))
    return sample_column
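A hypothetical call to _get_field_sample, assuming petl is importable, a list-of-dicts sample, and a field descriptor with a 'name' key as used above:

sample = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}]
print(_get_field_sample(sample, {'name': 'id'}))  # [1, 2]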
def compact(s):
    shift = (s['role']['name'][0:14] + " " +
             s['shift']['start'][11:16].lstrip("0") + " " +
             s['user']['firstname'] + " " +
             s['user']['lastname'][0:1])
    # shift["Notes"] = s['shift']['notes']
    return shift


data = [mini(s) for s in db]
pprint(data)

table = etl.fromdicts(data)
pprint(etl.look(table))

# Using set, get a list of distinct departments
departments = set(etl.values(table, "Department"))
pprint(departments)

for d in departments:
    print(d)
    table2 = etl.select(table, 'Department', lambda v: v == d).facet("Day")
    report = (
        etl
        .empty()
        .addcolumn('12-13', list(table2["2017-12-13"].values("Shift")))
        .addcolumn('12-14', list(table2["2017-12-14"].values("Shift")))
        .addcolumn('12-15', list(table2["2017-12-15"].values("Shift")))
        .addcolumn('12-16', list(table2["2017-12-16"].values("Shift")))
        .addcolumn('12-17', list(table2["2017-12-17"].values("Shift")))
    )
# Select the data from the table on the basis of the current date.
# The variable num contains only the rows for 2020-04-30 from each country,
# so the latest data is filtered out.
num = etl.select(data, 'date', lambda r: r == '2020-04-30')

# sort is used to order the unsorted data by iso_code, which makes it
# easier to join the data in further steps
table1_sort = etl.sort(num, key='iso_code')

# counter variable declared to count the number of countries
count = 0

# the values function is used to read the data from the table
for i in etl.values(table1_sort, 'iso_code', 'location', 'date', 'total_cases',
                    'new_cases', 'total_deaths', 'new_deaths',
                    'total_cases_per_million', 'new_cases_per_million',
                    'total_deaths_per_million', 'new_deaths_per_million',
                    'total_tests', 'new_tests', 'total_tests_per_thousand',
                    'new_tests_per_thousand', 'tests_units'):
    # take 15 countries from the table for integration; the limit is 16
    # because the first row (count == 0) contains unnecessary data
    if count == 16:
        break
    # data1 was already declared with a header above, so the other rows
    # are appended here
    data1.append(i)
    count = count + 1

# remove the unnecessary row from the list, which was in data1[1]
data1.pop(1)
def transform(mmj_menu_items, mmj_categories, prices,
              organization_id, source_db, debug):
    """
    Transform data
    """
    # source data table
    source_dt = utils.view_to_list(mmj_menu_items)

    cut_menu_data = [
        'id', 'vendor_id', 'menu_id', 'dispensary_id', 'strain_id',
        'created_at', 'updated_at', 'category_id', 'name', 'sativa',
        'indica', 'on_hold', 'product_type', 'image_file_name',
        'medicine_amount', 'product_type'
    ]

    cut_prices = [
        'menu_item_id', 'dispensary_id', 'price_half_gram', 'price_gram',
        'price_two_gram', 'price_eigth', 'price_quarter', 'price_half',
        'price_ounce'
    ]

    # Cut out all the fields we don't need to load
    menu_items = etl.cut(source_dt, cut_menu_data)
    prices_data = etl.cut(prices, cut_prices)

    menu_items = (etl.addfield(menu_items, 'createdAtEpoch')
                  .addfield('unitOfMeasure')
                  .addfield('locationProductDetails')
                  .addfield('keys')
                  .addfield('restockLevel'))

    # Two-step transform and cut. First we need to cut the name
    # and id from the source data to map to.
    cut_source_cats = etl.cut(mmj_categories, 'name', 'id', 'measurement')
    source_values = etl.values(cut_source_cats, 'name', 'id')

    # Then we need a dict of categories to compare against.
    # id is stored to match against when transforming and mapping categories
    mmj_categories = dict([(value, id) for (value, id) in source_values])

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['createdAtEpoch'] = lambda x: utils.create_epoch(x.created_at)
    mappings['name'] = 'name'
    mappings['shareOnWM'] = lambda x: _wm_integration(x.id, source_db)
    """
    1 = Units
    2 = Grams (weight)
    """
    mappings['unitOfMeasure'] = \
        lambda x: _map_uom(x.category_id, source_db)

    fields = etl.fieldmap(menu_items, mappings)
    data = etl.merge(menu_items, fields, key='id')

    items = []
    for item in etl.dicts(data):

        breakpoint_pricing = (etl.select(
            prices_data,
            lambda x: x.dispensary_id == item['dispensary_id'])
            .rename({'price_eigth': 'price_eighth'})
            .cutout('menu_item_id'))

        # Set image url for load to download
        url = None
        if debug and item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-development.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])
        elif item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-production.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])

        item['image_file_name'] = url

        item['categoryId'] = _map_categories(item['category_id'],
                                             item['sativa'], item['indica'],
                                             mmj_categories, menu_items)
        item['keys'] = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'menu_id': item['menu_id'],
            'vendor_id': item['vendor_id'],
            'strain_id': item['strain_id'],
            'category_id': item['category_id']
        }

        # set a default netMJ value if the menu item is a unit product
        if item['unitOfMeasure'] == 2:
            item['netMarijuana'] = int(item['medicine_amount'])

        # copy the keys before deleting so the dict isn't mutated mid-iteration
        for key in list(item['keys'].keys()):
            if not item['keys'][key]:
                del item['keys'][key]

        item['locationProductDetails'] = {
            'id': item['id'],
            'active': _active(item['on_hold'])
        }

        item['restockLevel'] = _restock_level(item['dispensary_id'],
                                              item['product_type'], source_db)

        if item['shareOnWM'] is None:
            item['shareOnWM'] = False

        for price in etl.dicts(breakpoint_pricing):
            try:
                price_two_gram = price['price_two_gram']
            except KeyError:
                price_two_gram = 0.0

            item['locationProductDetails']['weightPricing'] = {
                'price_half_gram': utils.dollars_to_cents(price['price_half_gram']),
                'price_two_gram': utils.dollars_to_cents(price_two_gram),
                'price_gram': utils.dollars_to_cents(price['price_gram']),
                'price_eighth': utils.dollars_to_cents(price['price_eighth']),
                'price_quarter': utils.dollars_to_cents(price['price_quarter']),
                'price_half': utils.dollars_to_cents(price['price_half']),
                'price_ounce': utils.dollars_to_cents(price['price_ounce'])
            }

        del item['vendor_id']
        del item['indica']
        del item['dispensary_id']
        del item['id']
        del item['strain_id']
        del item['on_hold']
        del item['menu_id']
        del item['sativa']
        del item['category_id']
        del item['updated_at']
        del item['created_at']
        del item['product_type']

        if item['image_file_name'] is None:
            del item['image_file_name']

        # set up final structure for API
        items.append(item)

    # Remove inactive items (rebuild the list rather than removing while iterating)
    items = [item for item in items
             if item['locationProductDetails']['active'] is not False]

    if debug:
        result = json.dumps(items, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return items
from __future__ import division, print_function, absolute_import, \ unicode_literals # values() ########## import petl as etl table1 = [['foo', 'bar'], ['a', True], ['b'], ['b', True], ['c', False]] foo = etl.values(table1, 'foo') foo list(foo) bar = etl.values(table1, 'bar') bar list(bar) # values from multiple fields table2 = [['foo', 'bar', 'baz'], [1, 'a', True], [2, 'bb', True], [3, 'd', False]] foobaz = etl.values(table2, 'foo', 'baz') foobaz list(foobaz) # header() ########## import petl as etl table = [['foo', 'bar'], ['a', 1], ['b', 2]] etl.header(table) # fieldnames() ##############
def convert_folder(base_source_dir, base_target_dir, tmp_dir, tika=False,
                   ocr=False, merge=False, tsv_source_path=None,
                   tsv_target_path=None, make_unique=True, sample=False,
                   zip=False):
    # WAIT: Add an option in the GUI to choose whether to run OCR
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False
    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False
    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'
    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does mime show directly whether a file is PDF/A, or must extra fields
    #       be checked in the two calls below? Pre-check with Tika and Siegfried?
    # TODO: Is this tsv check needed here? The check is done before this function
    #       is called, so it may be unnecessary.
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a test that the tsv file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table,
                       {
                           'filename': 'source_file_path',
                           'tika_batch_fs_relative_path': 'source_file_path',
                           'filesize': 'file_size',
                           'mime': 'mime_type',
                           'Content_Type': 'mime_type',
                           'Version': 'version'
                       },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a fully reliable check for embedded documents,
    #       since '#' can actually occur in file names

    row_count = etl.nrows(table)
    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' don't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check on file size before and after conversion
    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime', 'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'text/plain' if row.id == 'x-fmt/18' else v,
                        pass_row=True)
    # Update for missing mime types where id is known:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'application/xml' if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Does not work when Tika is used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")
            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(txt_target_path,
                            result + ': ' + source_file_path +
                            ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]
            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(str(count), encoding='ascii'))
                ).decode('utf8').replace('=', '').lower() + '.' + norm_ext

            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path, mime_type, function,
                                      target_dir, tmp_dir, None, norm_ext,
                                      version, ocr, keep_original, zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(txt_target_path,
                                result + ': ' + source_file_path +
                                ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(txt_target_path,
                                result + ': ' + source_file_path +
                                ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(txt_target_path,
                                result + ': ' + source_file_path +
                                ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that
        #       cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
    # TODO: Add an option so that if merge = true all files are copied to a
    #       top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + \
                  txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + \
                      txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals
    # TODO: Fix so this is used instead for a final summary when multiple
    #       folders have been converted
# Open CSV file
stores = etl.fromcsv('stores.csv')

# Open XML document
locations = etl.fromxml('locations.xml', 'store',
                        {'Name': 'Name', 'Lat': 'Lat', 'Lon': 'Lon'})
print(locations)

# Set output
output_table = [["ID", "Name", "Suburb", "State", "Postcode"]]
store_id = 1

# Read through the store.csv to generate output_table
store = etl.cut(stores, 'Name', 'Suburb', 'State', 'Postcode').distinct()
print(store)

for s in etl.values(store, 'Name', 'Suburb', 'State', 'Postcode'):
    # s is a (Name, Suburb, State, Postcode) tuple; unpack it so each row
    # matches the five-column header above
    output_table.append([store_id, *s])
    store_id += 1
print(output_table)

# Merge and join XML and CSV together
merge_output = etl.join(stores, locations, key="Name")
print(merge_output)

store_table = etl.cut(merge_output, 'ID', 'Name', 'Suburb', 'State',
                      'Postcode', 'Lat', 'Lon')
print(etl.head(store_table, 5))

# Export to CSV file
etl.tocsv(merge_output, 'store_locations.csv')
        emails.append(rec['first_name'] + '.' + rec['last_name'] + '@mycompany.com')
    data2 = etl.addcolumn(data, 'email', emails)
else:
    data2 = data
etl.todb(data2, mysql_engine, table, create=True)

# load CSV file
data = etl.fromcsv(source=socialmedia_csv)
recs = etl.records(data)

# determine employee numbers
empnos = []
for rec in recs:
    sub = etl.fromdb(
        sqlite_engine,
        "SELECT emp_no FROM employees " +
        "where last_name = '" + rec['last_name'] + "' " +
        "and first_name = '" + rec['first_name'] + "' " +
        "and birth_date = '" + rec['birth_date'] + "' " +
        "order by birth_date desc " +
        "limit 1")
    vals = etl.values(sub, 'emp_no')
    if len(vals) > 0:
        empnos.append(vals[0])
    else:
        empnos.append(-1)  # dummy

# adding column gets ignored??
data2 = etl.addcolumn(data, 'emp_no', empnos)
etl.todb(data2, mysql_engine, 'socialmedia', create=True)

end_time = time.time()
print("execution time", end_time - start_time)