def get_delta(source_table, target_table, key='id'):
    """Build the table of rows to load: added rows plus rows with changed values.

    The two tables must share an identical header.  Rows of ``target_table``
    whose ``key`` value appears in the first table returned by
    ``etl.diff(source_ids, target_ids)`` are taken unchanged.  Every row of
    ``etl.merge(source_table, target_table, key=key)`` that contains at least
    one ``Conflict`` cell is then appended, with each conflicting cell
    replaced by a single value.

    Args:
        source_table: First petl table of the comparison.
        target_table: Second petl table of the comparison.
        key: Name of the field identifying a row (default ``'id'``).

    Returns:
        A petl table built from the accumulated pandas DataFrame.

    Raises:
        Exception: If the headers of the two tables differ.
    """
    source_table_headers = etl.header(source_table)
    target_table_headers = etl.header(target_table)
    if source_table_headers != target_table_headers:
        raise Exception(
            'Source table columns do not match target table columns')

    source_ids = etl.cut(source_table, key)
    target_ids = etl.cut(target_table, key)
    added_ids_table, _ = etl.diff(source_ids, target_ids)
    merged_table = etl.merge(source_table, target_table, key=key)

    # Materialise the ids once so the membership test inside selectin does
    # not have to walk the ids table again for every target row.
    added_ids = set(etl.values(added_ids_table, key))
    load_frame = etl.todataframe(etl.selectin(target_table, key, added_ids))
    print(load_frame)

    conflict_type = etl.transform.reductions.Conflict
    changed_frames = []
    for row in etl.data(merged_table):
        if not any(isinstance(cell, conflict_type) for cell in row):
            continue

        row_dict = dict(zip(source_table_headers, row))
        # Resolve every conflicting column of the row; stopping at the first
        # one would leave Conflict objects in the remaining columns.
        for i, cell in enumerate(row):
            if not isinstance(cell, conflict_type):
                continue
            # NOTE(review): petl's Conflict appears to be a frozenset
            # subclass, so the order of tuple(cell) may not reliably map to
            # "new" / "old" -- confirm before trusting this message.
            changes = tuple(cell)
            print('For car {}, {} changed from {} to {}'.format(
                row[0], source_table_headers[i], changes[1], changes[0]))
            row_dict[source_table_headers[i]] = changes[0]

        # One-row frame: every column maps to a single-element list.
        row_dict = {name: [val] for (name, val) in row_dict.items()}
        print(row_dict)
        changed_frames.append(pd.DataFrame(row_dict))

    if changed_frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once.
        load_frame = pd.concat([load_frame] + changed_frames,
                               ignore_index=True)
    return etl.fromdataframe(load_frame)
def dataPreProcessing(fileName):
    """Split a loan CSV into a feature file and a label file.

    Reads ``fileName``, drops a fixed set of columns, keeps the rows whose
    ``term`` is ``' 36 months'`` and whose ``loan_status`` is not the empty
    string, then writes ``featureFileCsv.csv`` and ``labelsFileCsv.csv``.
    Both outputs are sorted by ``id`` and have the ``id`` column removed.

    Args:
        fileName: Path of the input CSV file.

    Returns:
        A 2-tuple holding the results of the two ``tocsv`` calls
        (features first, labels second).
        NOTE(review): petl's ``tocsv`` presumably returns ``None`` -- confirm
        whether callers rely on these values.
    """
    inputData = fromcsv(fileName)
    trimmed = cutout(inputData, 'member_id', 'grade', 'sub_grade',
                     'emp_title', 'url', 'desc', 'title', 'accept_d',
                     'exp_d', 'list_d', 'issue_d', 'purpose', 'addr_city',
                     'addr_state', 'earliest_cr_line', 'last_pymnt_d',
                     'next_pymnt_d', 'last_credit_pull_d')

    # Compare by value: `is not ""` tested object identity, which is not a
    # reliable way to detect an empty string.
    selected = select(
        trimmed,
        lambda i: i['term'] == ' 36 months' and i['loan_status'] != "")

    # Labels: loan_status ordered by id, with id dropped afterwards.
    labelMapping = OrderedDict()
    labelMapping['loan_status'] = 'loan_status'
    labelMapping['id'] = 'id'
    labels = cutout(sort(fieldmap(selected, labelMapping), 'id'), 'id')

    # Features: recode the categorical columns.
    # NOTE(review): each mapping key is used as the output field name and the
    # first tuple element as the source field -- the pairs below
    # ('home_ownership' <- 'ownership', ...) look reversed; confirm.
    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['home_ownership'] = 'ownership', {
        'MORTGAGE': '-1',
        'RENT': '0',
        'OWN': '1'
    }
    mappings['emp_length'] = 'empLength', {'n/a': 0}
    mappings['is_inc_v'] = 'verificationStatus', {
        'Source Verified': 1,
        'Verified': 0,
        'Not Verified': -1
    }
    mappings['pymnt_plan'] = 'paymentPlan', {'n': 0, 'y': 1}
    mappings['initial_list_status'] = 'listStatus', {'f': 0, 'w': 1}
    recoded = fieldmap(selected, mappings)

    # Remaining columns, joined back to the recoded ones on id.
    remaining = cutout(selected, 'home_ownership', 'is_inc_v', 'pymnt_plan',
                       'initial_list_status', 'term', 'loan_status')
    features = cutout(sort(merge(recoded, remaining, key='id'), 'id'), 'id')

    featureFileCsv = tocsv(features, 'featureFileCsv.csv')
    labelsFileCsv = tocsv(labels, 'labelsFileCsv.csv')
    return featureFileCsv, labelsFileCsv
# Notebook-only preview (IPython display helper):
# locs_only_in_a.displayall(caption='a only')

# Rows of b_locs that have no counterpart in a_locs.
locs_only_in_b = b_locs.complement(a_locs)
b_only = locs_only_in_b.nrows()
print("B only rows: {}".format(b_only))

# Write the missing locations to CSV: the B-only set when A has none,
# otherwise the A-only set.
if not a_only > 0:
    locs_only_in_b.tocsv('missing_locations_b.csv')
else:
    locs_only_in_a.tocsv('missing_locations_a.csv')

# Merge A and B on (Chr, Pos) to surface conflicting values.
ab_merge = etl.merge(
    a_conv,
    b_conv,
    key=('Chr', 'Pos'),
)
# Notebook-only preview (IPython display helper):
# ab_merge.display(caption='ab_merge',
#                  td_styles=lambda v: highlight if isinstance(v, etl.Conflict) else '')

# Stack both tables, tagging each row with its origin in a leading
# 'source' column, then list the conflicting rows.
ab = etl.cat(
    a_conv.addfield('source', 'a', index=0),
    b_conv.addfield('source', 'b', index=0),
)
ab_conflicts = ab.conflicts(key=('Chr', 'Pos'), exclude='source')
# Notebook-only preview (IPython display helper):
# ab_conflicts.display(10)

# Conflicts restricted to the 'Mut' column.
ab_conflicts_mut = ab.conflicts(key=('Chr', 'Pos'), include='Mut')
# Names used by the two examples below, imported in one place.
from petl import aggregate, look, merge

# ---- merge ----
table1 = [
    ['foo', 'bar', 'baz'],
    [1, 'A', True],
    [2, 'B', None],
    [4, 'C', True],
]
table2 = [
    ['bar', 'baz', 'quux'],
    ['A', True, 42.0],
    ['B', False, 79.3],
    ['C', False, 12.4],
]
look(table1)
look(table2)
# Merge the two tables using 'bar' as the key.
table3 = merge(table1, table2, key='bar')
look(table3)

# ---- aggregate ----
table1 = [
    ['foo', 'bar', 'baz'],
    ['a', 3, True],
    ['a', 7, False],
    ['b', 2, True],
    ['b', 2, False],
    ['b', 9, False],
    ['c', 4, True],
]
look(table1)
# Aggregate whole rows: `len` is applied per 'foo' group.
table2 = aggregate(table1, 'foo', len)
# Names used by the merge example below.
from petl import look, merge

# ---- merge ----
table1 = [
    ['foo', 'bar', 'baz'],
    [1, 'A', True],
    [2, 'B', None],
    [4, 'C', True],
]
table2 = [
    ['bar', 'baz', 'quux'],
    ['A', True, 42.0],
    ['B', False, 79.3],
    ['C', False, 12.4],
]
look(table1)
look(table2)
# Merge the two tables using 'bar' as the key.
table3 = merge(table1, table2, key='bar')
look(table3)

# ---- aggregate ----
# Input table for the aggregate example; the last two rows are
# deliberately shorter than the header.
table1 = [
    ['foo', 'bar'],
    ['a', 3],
    ['a', 7],
    ['b', 2],
    ['b', 1],
    ['b', 9],
    ['c', 4],
    ['d', 3],
    ['d'],
    ['e'],
]
def transform(mmj_menu_items, mmj_categories, prices, organization_id,
              source_db, debug):
    """Transform menu-item rows into a list of payload dicts.

    Args:
        mmj_menu_items: Source menu items; converted with
            ``utils.view_to_list`` before use.
        mmj_categories: petl table of categories with at least the fields
            ``name``, ``id`` and ``measurement``.
        prices: petl table of price rows (see ``cut_prices`` below).
        organization_id: Not used by this function.
        source_db: Handle passed through to the ``_wm_integration``,
            ``_map_uom`` and ``_restock_level`` helpers.
        debug: When truthy, image URLs point at the development bucket and
            the result is printed as JSON.

    Returns:
        A list of dicts, one per menu item, excluding items whose
        ``locationProductDetails['active']`` is ``False``.
    """
    # source data table
    source_dt = utils.view_to_list(mmj_menu_items)

    # NOTE(review): 'product_type' is listed twice -- confirm whether the
    # duplicate is intentional.
    cut_menu_data = [
        'id', 'vendor_id', 'menu_id', 'dispensary_id', 'strain_id',
        'created_at', 'updated_at', 'category_id', 'name', 'sativa',
        'indica', 'on_hold', 'product_type', 'image_file_name',
        'medicine_amount', 'product_type'
    ]
    cut_prices = [
        'menu_item_id', 'dispensary_id', 'price_half_gram', 'price_gram',
        'price_two_gram', 'price_eigth', 'price_quarter', 'price_half',
        'price_ounce'
    ]

    # Cut out all the fields we don't need to load
    menu_items = etl.cut(source_dt, cut_menu_data)
    prices_data = etl.cut(prices, cut_prices)

    menu_items = (etl.addfield(menu_items, 'createdAtEpoch')
                  .addfield('unitOfMeasure')
                  .addfield('locationProductDetails')
                  .addfield('keys')
                  .addfield('restockLevel'))

    # Two-step transform and cut. First we need to cut the name
    # and id from the source data to map to.
    cut_source_cats = etl.cut(mmj_categories, 'name', 'id', 'measurement')
    source_values = etl.values(cut_source_cats, 'name', 'id')

    # Then we need a dict of categories to compare against.
    # id is stored to match against when transforming and mapping categories
    mmj_categories = dict(source_values)

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['createdAtEpoch'] = lambda x: utils.create_epoch(x.created_at)
    mappings['name'] = 'name'
    mappings['shareOnWM'] = lambda x: _wm_integration(x.id, source_db)
    # unitOfMeasure legend:
    #   1 = Units
    #   2 = Grams (weight)
    mappings['unitOfMeasure'] = \
        lambda x: _map_uom(x.category_id, source_db)

    fields = etl.fieldmap(menu_items, mappings)
    data = etl.merge(menu_items, fields, key='id')

    # Source columns that must not appear in the final payload.
    consumed_fields = (
        'vendor_id', 'indica', 'dispensary_id', 'id', 'strain_id',
        'on_hold', 'menu_id', 'sativa', 'category_id', 'updated_at',
        'created_at', 'product_type',
    )

    items = []
    for item in etl.dicts(data):
        # NOTE(review): prices are selected by dispensary only, not by menu
        # item, and every matching row overwrites 'weightPricing' below, so
        # the last row wins -- confirm this is intended.
        dispensary_id = item['dispensary_id']
        breakpoint_pricing = (
            etl.select(
                prices_data,
                lambda rec, wanted=dispensary_id:
                    rec.dispensary_id == wanted)
            .rename({'price_eigth': 'price_eighth'})
            .cutout('menu_item_id'))

        # Set image url for load to download
        url = None
        if item['image_file_name'] is not None:
            if debug:
                template = ("https://wm-mmjmenu-images-development.s3."
                            "amazonaws.com/menu_items/images/{0}/large/"
                            "{1}")
            else:
                template = ("https://wm-mmjmenu-images-production.s3."
                            "amazonaws.com/menu_items/images/{0}/large/"
                            "{1}")
            url = template.format(item['id'], item['image_file_name'])
        item['image_file_name'] = url

        item['categoryId'] = _map_categories(item['category_id'],
                                             item['sativa'], item['indica'],
                                             mmj_categories, menu_items)

        # Keep only the source identifiers that carry a truthy value.
        # Built as a new dict: deleting while iterating the dict's own keys
        # raises RuntimeError on Python 3.
        source_keys = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'menu_id': item['menu_id'],
            'vendor_id': item['vendor_id'],
            'strain_id': item['strain_id'],
            'category_id': item['category_id']
        }
        item['keys'] = {k: v for k, v in source_keys.items() if v}

        # set a default netMJ value when unitOfMeasure is 2
        # (value comparison; `is 2` tested object identity).
        # NOTE(review): the original comment said "unit product" while the
        # legend above maps 2 to grams -- confirm which one is meant.
        if item['unitOfMeasure'] == 2:
            item['netMarijuana'] = int(item['medicine_amount'])

        item['locationProductDetails'] = {
            'id': item['id'],
            'active': _active(item['on_hold'])
        }

        item['restockLevel'] = _restock_level(item['dispensary_id'],
                                              item['product_type'],
                                              source_db)

        if item['shareOnWM'] is None:
            item['shareOnWM'] = False

        for price in etl.dicts(breakpoint_pricing):
            # A missing two-gram price falls back to 0.0.
            price_two_gram = price.get('price_two_gram', 0.0)
            item['locationProductDetails']['weightPricing'] = {
                'price_half_gram':
                    utils.dollars_to_cents(price['price_half_gram']),
                'price_two_gram': utils.dollars_to_cents(price_two_gram),
                'price_gram': utils.dollars_to_cents(price['price_gram']),
                'price_eighth':
                    utils.dollars_to_cents(price['price_eighth']),
                'price_quarter':
                    utils.dollars_to_cents(price['price_quarter']),
                'price_half': utils.dollars_to_cents(price['price_half']),
                'price_ounce': utils.dollars_to_cents(price['price_ounce'])
            }

        for field in consumed_fields:
            del item[field]
        if item['image_file_name'] is None:
            del item['image_file_name']

        # set up final structure for API
        items.append(item)

    # Remove inactive items.  A new list is built because removing from a
    # list while iterating over it skips the element after each removal.
    items = [
        item for item in items
        if item['locationProductDetails']['active'] is not False
    ]

    if debug:
        result = json.dumps(items, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return items
def transform(source_data, organization_id, debug):
    """Transform vendor rows into a list of payload dicts.

    Args:
        source_data: Source vendors; converted with ``utils.view_to_list``
            before use.
        organization_id: Not used by this function.
        debug: When truthy, the result is printed as JSON.

    Returns:
        A list of dicts, one per vendor.
    """
    # source data table
    source_dt = utils.view_to_list(source_data)
    cut_data = [
        'id', 'dispensary_id', 'mmjvenu_id', 'name', 'phone_number',
        'email', 'country', 'state', 'city', 'address', 'zip_code',
        'liscense_no', 'confirmed', 'website'
    ]
    vendor_data = etl.cut(source_dt, cut_data)

    vendor_mappings = OrderedDict()
    vendor_mappings['id'] = 'id'
    vendor_mappings['dispensary_id'] = 'dispensary_id'
    vendor_mappings['address'] = 'address'
    # field renames
    vendor_mappings['accountStatus'] = \
        lambda x: "ACTIVE" if x.confirmed == 1 else "INACTIVE"
    vendor_mappings['phone'] = 'phone_number'
    vendor_mappings['licenceNumber'] = 'liscense_no'
    vendor_mappings['zip'] = 'zip_code'

    vendors_fields = etl.fieldmap(vendor_data, vendor_mappings)
    merged_vendors = etl.merge(vendor_data, vendors_fields, key='id')

    # Source columns that are mapped elsewhere and no longer required,
    # followed by the foreign keys.
    consumed_fields = (
        'zip', 'state', 'country', 'city', 'zip_code', 'phone_number',
        'confirmed', 'liscense_no',
        'mmjvenu_id', 'id', 'dispensary_id',
    )

    vendors = []
    for item in etl.dicts(merged_vendors):
        # Nest the flat address columns, or drop the field when absent.
        if item['address'] is not None:
            item['address'] = {
                'line1': item['address'],
                'line2': None,
                'city': item['city'],
                'state': item['state'],
                'zip': item['zip'],
                'country': item['country'],
            }
        else:
            del item['address']

        # NOTE(review): a single missing value removes all three fields,
        # including the ones that are populated -- confirm this all-or-none
        # behaviour is intended.
        if (item['licenceNumber'] is None or item['email'] is None
                or item['website'] is None):
            del item['licenceNumber']
            del item['email']
            del item['website']

        if item['phone'] is not None:
            item['phone'] = [{
                'name': 'business',
                'number': item['phone'],
                'default': True
            }]
        else:
            del item['phone']

        # Keep only the source identifiers that carry a truthy value.
        # Built as a new dict: deleting while iterating the dict's own keys
        # raises RuntimeError on Python 3.
        source_keys = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'mmjvenu_id': item['mmjvenu_id']
        }
        item['keys'] = {k: v for k, v in source_keys.items() if v}

        # mutate dict and remove fields that are mapped and no longer
        # required, then the fk's
        for field in consumed_fields:
            del item[field]

        # set up final structure for API
        vendors.append(item)

    if debug:
        result = json.dumps(vendors, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return vendors
def transform(mmj_employees, organization_id, debug, fake_email, source_db):
    """Transform employee rows into a list of payload dicts.

    Args:
        mmj_employees: Source employees; converted with
            ``utils.view_to_list`` before use.
        organization_id: Not used by this function.
        debug: Passed to ``_set_email``; when truthy the result is also
            printed as JSON.
        fake_email: Passed to ``_set_email``.
        source_db: Handle passed through to the ``_assign_role`` and
            ``_active`` helpers.

    Returns:
        A list of dicts, one per employee.
    """
    # source data table
    source_dt = utils.view_to_list(mmj_employees)
    cut_data = [
        'id', 'email', 'first_name', 'organization_id', 'last_name',
        'created_at', 'updated_at', 'login'
    ]
    employee_data = etl.cut(source_dt, cut_data)

    employees = (etl.addfield(employee_data, 'keys')
                 .addfield('name')
                 .addfield('role')
                 .addfield('dateOfBirth'))

    # Placeholder date of birth assigned to every employee.  Written without
    # leading zeros: literals such as `01` are a SyntaxError on Python 3.
    placeholder_dob = datetime.datetime(year=1970, month=1, day=1, hour=2,
                                        minute=30)

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['name'] = \
        lambda name: _set_name(name.first_name, name.last_name, name.login)
    # Roles:
    #   1 = site-admin
    #   2 = site-admin
    #   3 = store-manager
    #   4 = budtender
    mappings['role'] = lambda x: _assign_role(x.id, source_db)
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['dateOfBirth'] = lambda _: placeholder_dob
    mappings['organization_id'] = 'organization_id'  # keep mmj org
    mappings['accountStatus'] = lambda x: _active(x.id, source_db)

    fields = etl.fieldmap(employees, mappings)
    merged_employees = etl.merge(employees, fields, key='id')

    # Source columns that must not appear in the final payload.
    consumed_fields = (
        'login', 'first_name', 'last_name', 'created_at', 'id',
        'organization_id',
    )

    mapped_employees = []
    for item in etl.dicts(merged_employees):
        # Keep only the source identifiers that carry a truthy value.
        # Built as a new dict: deleting while iterating the dict's own keys
        # raises RuntimeError on Python 3.
        source_keys = {
            'id': item['id'],
            'organization_id': item['organization_id']
        }
        item['keys'] = {k: v for k, v in source_keys.items() if v}

        item['email'] = _set_email(item['email'], fake_email, debug)

        for field in consumed_fields:
            del item[field]

        # set up final structure for API
        mapped_employees.append(item)

    if debug:
        result = json.dumps(mapped_employees, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return mapped_employees
# Modules used by the three examples below, imported once.
import operator

import petl as etl

# mergeduplicates()
###################
table1 = [
    ['foo', 'bar', 'baz'],
    ['A', 1, 2.7],
    ['B', 2, None],
    ['D', 3, 9.4],
    ['B', None, 7.8],
    ['E', None, 42.0],
    ['D', 3, 12.3],
    ['A', 2, None],
]
# Merge rows that share the same 'foo' value.
table2 = etl.mergeduplicates(table1, 'foo')
table2

# merge()
#########
table1 = [
    ['foo', 'bar', 'baz'],
    [1, 'A', True],
    [2, 'B', None],
    [4, 'C', True],
]
table2 = [
    ['bar', 'baz', 'quux'],
    ['A', True, 42.0],
    ['B', False, 79.3],
    ['C', False, 12.4],
]
# Merge the two tables using 'bar' as the key.
table3 = etl.merge(table1, table2, key='bar')
table3

# fold()
########
table1 = [
    ['id', 'count'],
    [1, 3],
    [1, 5],
    [2, 4],
    [2, 8],
]
# Reduce 'count' with operator.add per 'id'; the rows are declared as
# already sorted by the key.
table2 = etl.fold(table1, 'id', operator.add, 'count', presorted=True)
table2
def transform(dispensary_details, pricing, organization_id, debug,
              source_db):
    """Transform dispensary settings rows into a single settings dict.

    Args:
        dispensary_details: Source settings; converted with
            ``utils.view_to_list`` before use.
        pricing: Source default pricing; converted with
            ``utils.view_to_list`` before use.
        organization_id: Value stored in the ``organizationId`` field of
            every row.
        debug: When truthy, image URLs point at the development bucket and
            the result is printed as JSON.
        source_db: Handle passed through to the ``_get_taxes`` and
            ``_medical_limits`` helpers.

    Returns:
        A dict.  Every processed row is merged into it with ``dict.update``,
        so when several rows are present later rows overwrite earlier ones.
    """
    # source data table
    general_settings = utils.view_to_list(dispensary_details)
    pricing_detail = utils.view_to_list(pricing)

    dispensary_cut_data = [
        'id', 'dispensary_id', 'menu_show_tax', 'logo_file_name',
        'inactivity_logout', 'calculate_even_totals',
        'require_customer_referrer', 'membership_fee_enabled', 'pp_enabled',
        'pp_global_dollars_to_points', 'pp_global_points_to_dollars',
        'pp_points_per_referral', 'allow_unpaid_visits',
        'red_flags_enabled', 'mmjrevu_api_key'
    ]
    pricing_cut_data = [
        'id', 'price_half_gram', 'price_gram', 'price_two_gram',
        'price_eigth', 'price_quarter', 'price_half', 'price_ounce'
    ]

    dispensary_settings_data = etl.cut(general_settings,
                                       dispensary_cut_data)
    pricing_data = etl.cut(pricing_detail, pricing_cut_data)

    settings_table = etl.addfield(dispensary_settings_data,
                                  'organizationId')

    mappings = OrderedDict()
    mappings['id'] = 'id'
    # fieldmap reads a bare string or int mapping as a reference to a source
    # field, so the constant has to be supplied through a callable.
    mappings['organizationId'] = lambda _row: organization_id

    settings_fields = etl.fieldmap(settings_table, mappings)
    merged_settings = (
        etl.merge(settings_table, settings_fields, key='id').rename({
            # Global -> General -> SESSION TIMEOUT DURATION
            'inactivity_logout': 'sessionTimeoutDuration',
            # Global -> Logo
            'logo_file_name': 'image',
            # Global -> Members -> Membership Level
            'membership_fee_enabled': 'membershipLevelsEnabled',
            'pp_global_dollars_to_points': 'dollarsPerPoint',
            'pp_global_points_to_dollars': 'pointsPerDollar',
            'pp_points_per_referral': 'referralPoints',
            # <Location> -> Sales -> TAXES IN
            'menu_show_tax': 'enableTaxesIn',
            # <Location> -> Sales -> PRICE ROUNDING
            'calculate_even_totals': 'hasPriceRounding',
            # <Location> -> Members -> REFERRER REQUIRED
            'require_customer_referrer': 'mandatoryReferral',
            # <Location> -> Members -> PAID VISITS
            'allow_unpaid_visits': 'paidVisitsEnabled',
            # <Location> -> Members -> MEDICAL MEMBERS
            'red_flags_enabled': 'hasLimits',
            # <Location> -> General -> STORE LOCATIONS
            'mmjrevu_api_key': 'apiKey'
        }))

    # Flat columns that are nested elsewhere in the payload, plus the fk's.
    consumed_fields = (
        'id', 'dispensary_id', 'membershipLevelsEnabled', 'enableTaxesIn',
        'hasLimits', 'hasPriceRounding', 'dollarsPerPoint',
        'mandatoryReferral', 'paidVisitsEnabled', 'pointsPerDollar',
        'pp_enabled', 'referralPoints',
    )

    settings = {}
    for item in etl.dicts(merged_settings):
        # Keep only the source identifiers that carry a truthy value.
        # Built as a new dict: deleting while iterating the dict's own keys
        # raises RuntimeError on Python 3.
        source_keys = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id']
        }
        item['keys'] = {k: v for k, v in source_keys.items() if v}

        # if not item['sessionTimeoutDuration'] >= 30:
        #     del item['sessionTimeoutDuration']

        url = None
        if item['image'] is not None:
            if debug:
                template = ("https://wm-mmjmenu-images-development.s3."
                            "amazonaws.com/logos/{0}/original/"
                            "{1}")
            else:
                template = ("https://wm-mmjmenu-images-production.s3."
                            "amazonaws.com/logos/{0}/original/"
                            "{1}")
            url = template.format(item['id'], item['image'])
        item['image'] = url

        # Member settings nested - crm.member.settings
        if item['pp_enabled']:
            item['crm_member_settings'] = {
                'membershipLevel': {
                    'membershipLevelsEnabled':
                        utils.true_or_false(item['membershipLevelsEnabled']),
                    'levelName': 'Unnamed',
                    'dollarsPerPoint': item['dollarsPerPoint'],
                    'pointsPerDollar': item['pointsPerDollar'],
                    'referralPoints': item['referralPoints']
                }
            }

        # Location settings nested.
        if item['apiKey']:
            item['location_specific'] = {'apiKey': item['apiKey']}
        else:
            item['location_specific'] = {}

        item['location_specific']['members'] = {
            'paidVisitsEnabled':
                utils.true_or_false(item['paidVisitsEnabled']),
            'mandatoryReferral':
                utils.true_or_false(item['mandatoryReferral'])
        }
        item['location_specific']['sales'] = {
            'enableTaxesIn': utils.true_or_false(item['enableTaxesIn']),
            'hasPriceRounding':
                utils.true_or_false(item['hasPriceRounding'])
        }

        # sales.settings.taxes
        # NOTE(review): each tax overwrites the previous 'taxes' entry, so
        # only the last one is kept -- confirm a single tax is expected.
        item['sales_settings_taxes'] = {}
        for tax in _get_taxes(item['dispensary_id'], source_db):
            item['sales_settings_taxes']['taxes'] = {
                'code': tax['name'],
                'percent': tax['amount'] / 100,
                'type': 'sales'
            }

        # NOTE(review): pricing rows are not filtered per dispensary and
        # each row replaces the previous 'inventory' entry, so the last row
        # wins -- confirm.
        for price_row in etl.dicts(pricing_data):
            item['location_specific']['inventory'] = {
                'weightPricing': {
                    'name': 'Default',
                    'defaultTier': True,
                    'breakpoints': {
                        'price_half_gram': utils.dollars_to_cents(
                            price_row['price_half_gram']),
                        'price_gram': utils.dollars_to_cents(
                            price_row['price_gram']),
                        'price_two_gram': utils.dollars_to_cents(
                            price_row['price_two_gram']),
                        'price_eighth': utils.dollars_to_cents(
                            price_row['price_eigth']),
                        'price_quarter': utils.dollars_to_cents(
                            price_row['price_quarter']),
                        'price_half': utils.dollars_to_cents(
                            price_row['price_half']),
                        'price_ounce': utils.dollars_to_cents(
                            price_row['price_ounce']),
                    }
                }
            }

        # monthly purchase limit is two week limit x2
        if item['hasLimits'] == 1:
            for limits in _medical_limits(item['dispensary_id'], source_db):
                item['location_specific']['members']['medicalLimits'] = {
                    'hasLimits': True,
                    'dailyPurchaseLimit':
                        int(limits['daily_purchase_limit']),
                    'visitPurchaseLimit':
                        int(limits['visit_purchase_limit']),
                    'dailyVisitLimit': int(limits['daily_visit_limit']),
                    'monthlyPurchaseLimit':
                        int(limits['two_week_purchase_limit'] * 2)
                }

        # NOTE(review): a missing value in either field removes both,
        # including the one that is populated -- confirm this is intended.
        if item['image'] is None or item['apiKey'] is None:
            del item['image']
            del item['apiKey']

        # delete fk's and the flat columns that were nested above
        for field in consumed_fields:
            del item[field]

        # set up final structure for API
        settings.update(item)

    if debug:
        result = json.dumps(settings, sort_keys=True, indent=4,
                            default=utils.json_serial)
        print(result)

    return settings
# Final expression of the preceding example.
table2

# Modules used by the two examples below, imported once.
import operator

import petl as etl

# merge()
#########
table1 = [
    ['foo', 'bar', 'baz'],
    [1, 'A', True],
    [2, 'B', None],
    [4, 'C', True],
]
table2 = [
    ['bar', 'baz', 'quux'],
    ['A', True, 42.0],
    ['B', False, 79.3],
    ['C', False, 12.4],
]
# Merge the two tables using 'bar' as the key.
table3 = etl.merge(table1, table2, key='bar')
table3

# fold()
########
table1 = [
    ['id', 'count'],
    [1, 3],
    [1, 5],
    [2, 4],
    [2, 8],
]
# Reduce 'count' with operator.add per 'id'; the rows are declared as
# already sorted by the key.
table2 = etl.fold(table1, 'id', operator.add, 'count', presorted=True)