def incorrect_email_address():  # migration-first-ver
    df = cst.read_customer_file()
    # emails only in retail schema
    df = df.query('SECTOR < 1007 | SECTOR == 1901')
    df['VALID_EMAIL'] = df['EMAIL.1'].map(vld.valid_email)
    # df.query('VALID_EMAIL == 0')[['CUSTOMER.CODE', 'EMAIL.1']].to_excel(
    #     'wrong-empty-emails.xlsx', index=None, header=True)
    df_bad_emails = df.query('VALID_EMAIL == 0')[[
        'CUSTOMER.CODE', 'EMAIL.1', 'PHONE.1', 'SMS.1', 'NAME.1', 'NAME.2',
        'ADDRESS'
    ]]
    df_acc = acc.read_accounts_file()
    df_m = pd.merge(df_acc,
                    df_bad_emails,
                    left_on='CUSTOMER',
                    right_on='CUSTOMER.CODE')
    df_m = df_m[[
        'CUSTOMER', 'ACCOUNT', 'PHONE.1', 'SMS.1', 'NAME.1', 'NAME.2',
        'ADDRESS', 'EMAIL.1'
    ]]
    df_m.to_excel('accounts_wrong_email.xlsx', index=None, header=True)
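
# A minimal, self-contained sketch of the validate-then-export pattern used in
# incorrect_email_address() above. The regex check below is a hypothetical
# stand-in for vld.valid_email, which is assumed to return 1 for a valid
# address and 0 otherwise.
def _demo_email_validation():
    import re
    import pandas as pd

    def _valid_email(value):
        # crude format check; the real vld.valid_email may be stricter
        return 1 if re.match(r'^[^@\s]+@[^@\s]+\.[^@\s]+$', str(value)) else 0

    toy = pd.DataFrame({'CUSTOMER.CODE': ['1', '2', '3'],
                        'EMAIL.1': ['a@b.com', 'not-an-email', None]})
    toy['VALID_EMAIL'] = toy['EMAIL.1'].map(_valid_email)
    # keep only the rows that failed validation, as the migration report does
    print(toy.query('VALID_EMAIL == 0')[['CUSTOMER.CODE', 'EMAIL.1']])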
def dictionary_producer(target, dictionary):
    df = None
    if target == ProductFiles.CUSTOMER:
        df = cst.read_customer_file()
    elif target == ProductFiles.ACCOUNT:
        df = acc.read_accounts_file()
    elif target == ProductFiles.LEGAL_DOC:
        df = get_leagal_doc_set()
    else:
        print('Invalid argument - target')
        return
    # print(df.shape)
    print(dictionary, df[dictionary].dropna().unique().tolist())
    print(dictionary, len(df[dictionary].dropna().unique().tolist()))
    # dict_list = df['LEGAL.DOC.NAME'].map(
    #     cln.document_type).dropna().unique().tolist()
    # print(list(filter(lambda x: x.find('RESIDE') > -1, df.columns.tolist())))
    # print a mapping entry for each unique value of the dictionary column
    dict_list = df[dictionary].dropna().unique().tolist()
    for x in dict_list:
        # if len(x.split('|')) > 2:
        #     print(x)
        print(f'\'{x}\':\'VOD_{x}\',')
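
# A toy sketch of the output dictionary_producer() prints: each unique value
# of a column becomes a '<value>':'VOD_<value>' line that can be pasted into a
# mapping dict literal. Column name and values here are made up.
def _demo_dictionary_producer():
    import pandas as pd
    toy = pd.DataFrame({'TITLE': ['MR', 'MRS', None, 'MR']})
    for x in toy['TITLE'].dropna().unique().tolist():
        print(f'\'{x}\':\'VOD_{x}\',')
    # prints:
    # 'MR':'VOD_MR',
    # 'MRS':'VOD_MRS',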
def save_xlsx(target=ProductFiles.CUSTOMER):
    df = None
    if target == ProductFiles.CUSTOMER:
        df = cst.read_customer_file()
    elif target == ProductFiles.ACCOUNT:
        df = acc.read_accounts_file()
    else:
        print('Invalid argument - target')
        return
    print(df.shape)
    # print(df.columns)
    f_name = f"./output/{target}_{datetime.now().strftime('%Y-%m-%d_%H%M%S')}.xlsx"
    df.to_excel(f_name, index=None, header=True)
def combine_customer_account():
    df_cst = cst.read_customer_file()
    # choose which one
    df_acc = acc.read_accounts_file('RETAIL')
    # keep only accounts without a REGROUPE value
    df_acc = df_acc[df_acc['REGROUPE'].isnull()]
    df_acc = df_acc.drop_duplicates(['CUSTOMER'], keep='first')
    df_merged = pd.merge(df_acc,
                         df_cst,
                         left_on='CUSTOMER',
                         right_on='CUSTOMER.CODE')
    # df_merged = df_acc[~df_acc['CUSTOMER'].isin(df_cst['CUSTOMER.CODE'])]
    print('Shape is:', df_merged.shape)
    valid_birthday_tax(df_merged)
def accounts(client_type='SME'):
    df = acc.read_accounts_file()
    if client_type == 'SME':
        df = df[df['SECTOR.CU.NATCLI'].isin(['BA', 'CO', 'ET', 'OR'])]
    else:
        df = df[df['SECTOR.CU.NATCLI'].isin(['RE'])]
    # print('COLUMN LIST', df.columns)
    df[['CUSTOMER', 'ACCOUNT', 'SECTOR.CU.NATCLI',
        'CURRENCY']].to_excel(f'acc_{client_type.lower()}.xlsx',
                              index=None,
                              header=True)
    print(df['ACCOUNT'].count(), len(df['ACCOUNT'].unique().tolist()))
    # get only the duplicated rows; this way is cost effective
    # df_s = pd.concat(g for _, g in df.groupby("CUSTOMER") if len(g) > 1)
    customers = df["CUSTOMER"]
    # df_s = df[customers.isin(customers[customers.duplicated()])].sort_values("CUSTOMER")
    # print('Unique customers num:', len(df_s['CUSTOMER'].unique()))
    # print('Duplicated customers num:', len(df_s))
    df_s = df[customers.duplicated(keep=False)].sort_values("CUSTOMER")
    print('Unique customers num:', len(df['CUSTOMER'].unique().tolist()))
    print('Duplicated customers num:', len(df_s))
    print('Duplicated customers unique num:',
          len(df_s['CUSTOMER'].unique().tolist()))
    f_name = f'./output/duplicated_acc.id_{date.today().isoformat()}_{client_type}.xlsx'
    print(df_s.groupby(['CUSTOMER']).size().reset_index(name='count'))
    df_s.groupby(
        ['CUSTOMER']).size().reset_index(name='NO.OF.ACCOUNTS').sort_values(
            ['NO.OF.ACCOUNTS'], ascending=False).to_excel(
                f'./output/{client_type}_CUSTOMER_MORE_ACCOUNTS.xlsx',
                index=None)
    df_s[[
        'CUSTOMER', 'ACCOUNT', 'SECTOR.CU.NATCLI', 'CURRENCY', 'REGROUPE',
        'IS.RELATION.OF'
    ]].sort_values(['CUSTOMER']).to_excel(f_name, index=None, header=True)
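
# A self-contained illustration (toy data, not from the migration files) of
# the duplicated(keep=False) idiom accounts() relies on: it flags every row of
# a customer that owns more than one account, not just the repeated rows.
def _demo_duplicated_accounts():
    import pandas as pd
    toy = pd.DataFrame({'CUSTOMER': ['1', '2', '2', '3', '3', '3'],
                        'ACCOUNT': ['A', 'B', 'C', 'D', 'E', 'F']})
    dupes = toy[toy['CUSTOMER'].duplicated(keep=False)].sort_values('CUSTOMER')
    # customer 1 is dropped; customers 2 and 3 keep all of their rows
    print(dupes.groupby('CUSTOMER').size().reset_index(name='NO.OF.ACCOUNTS'))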
def combain_regroupe():
    df_cst = cst.read_customer_file()
    df_cst['CUSTOMER.CODE'] = df_cst['CUSTOMER.CODE'].astype(str)
    df_acc = acc.read_accounts_file()
    # only accounts with a REGROUPE value are interesting
    df_acc = df_acc[df_acc['REGROUPE'].notnull()]
    df_acc['CUSTOMER'] = df_acc['CUSTOMER'].astype(str)
    df_reg = df_acc['REGROUPE'].str.split('~', expand=True)
    df_reg = df_reg.rename(columns=lambda x: 'REGROUPE_' + str(x))
    print(df_acc.shape)
    df_acc = pd.concat([df_reg, df_acc], axis=1).reindex(df_reg.index)
    print(df_acc.shape)
    # df_acc.to_excel('./output/regroupe.xlsx', index=None, header=True)
    # column to check
    regroupe = 'REGROUPE_2'
    df_merged = pd.merge(df_acc,
                         df_cst,
                         left_on=regroupe,
                         right_on='CUSTOMER.CODE',
                         how='outer',
                         indicator=True)
    df_merged = df_merged[df_merged[regroupe].notnull()]
    print("after notnull", df_merged.shape)
    # left_merged = 'both'  # left_only
    left_merged = 'left_only'
    df_regr1 = df_merged.query('_merge == @left_merged')[[
        'ACCOUNT', 'CUSTOMER.CODE', 'REGROUPE', regroupe
    ]].reset_index(drop=True)
    print(df_regr1.shape, df_regr1)
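
# A toy sketch of the pattern combain_regroupe() uses: split a delimited
# column into REGROUPE_N columns, then outer-merge with indicator=True so the
# 'left_only' rows reveal REGROUPE codes missing from the customer file.
# All data below is made up.
def _demo_regroupe_check():
    import pandas as pd
    acc_toy = pd.DataFrame({'ACCOUNT': ['A1', 'A2'],
                            'REGROUPE': ['100~200~300', '100~201~999']})
    cst_toy = pd.DataFrame({'CUSTOMER.CODE': ['300', '301']})
    reg = acc_toy['REGROUPE'].str.split('~', expand=True)
    reg = reg.rename(columns=lambda x: 'REGROUPE_' + str(x))
    merged = pd.merge(pd.concat([reg, acc_toy], axis=1), cst_toy,
                      left_on='REGROUPE_2', right_on='CUSTOMER.CODE',
                      how='outer', indicator=True)
    # '999' has no matching customer, so it shows up as 'left_only'
    print(merged.query('_merge == "left_only"')[['ACCOUNT', 'REGROUPE_2']])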
def convert_retail_to_json():
    """Read retail customers and produce a JSON payload."""
    df = cst.read_customer_file()
    # ########################## print statistics - get only RETAIL CUSTOMERS
    df = df.query('SECTOR < 1007 | SECTOR == 1901')
    # second, take only those with active accounts
    df_acc = acc.read_accounts_file('RETAIL')
    ut.log_dataset_data(df_acc)
    df_acc = df_acc.drop_duplicates(['CUSTOMER'], keep='first')
    ut.log_dataset_data(df_acc)
    # df_acc = df_acc[df_acc['REGROUPE'].isnull()]
    df_acc['CUSTOMER'] = df_acc['CUSTOMER'].astype(str)
    ut.log_dataset_data(df_acc)
    # .drop('CUSTOMER', axis=1, inplace=True)
    acc_columns = ['CUSTOMER', 'REGROUPE_0', 'REGROUPE_1', 'REGROUPE_2']
    df_basic = pd.merge(df,
                        df_acc[acc_columns],
                        left_on='CUSTOMER.CODE',
                        right_on='CUSTOMER')
    ut.log_dataset_data(df_basic)
    df_regr_0 = pd.merge(df,
                         df_acc[acc_columns],
                         left_on='CUSTOMER.CODE',
                         right_on='REGROUPE_0')
    ut.log_dataset_data(df_regr_0)
    df_regr_1 = pd.merge(df,
                         df_acc[acc_columns],
                         left_on='CUSTOMER.CODE',
                         right_on='REGROUPE_1')
    ut.log_dataset_data(df_regr_1)
    df_regr_2 = pd.merge(df,
                         df_acc[acc_columns],
                         left_on='CUSTOMER.CODE',
                         right_on='REGROUPE_2')
    ut.log_dataset_data(df_regr_2)
    df = pd.concat([df_basic, df_regr_0, df_regr_1, df_regr_2])
    ut.log_dataset_data(df)
    df = df.drop_duplicates(['CUSTOMER.CODE'], keep='first')
    ut.log_dataset_data(df)
    if IS_TEST_ENVIRONMENT:
        df = df.head(TEST_PROBE)
    ut.log_dataset_data(df)
    # ########################## rename columns which do not need to be converted
    df.rename(columns={'GENDER': 'sex',
                       'NATIONALITY': 'nationality',
                       'DOMICILE': 'country_code',
                       'CUSTOMER.CODE': 'prospect_id'}, inplace=True)
    df = df.reset_index()
    # ################ CONVERSION MAPPING STARTS ########
    # After discussion with business
    df['country_code'] = df['RESIDENCE']
    if IS_TEST_ENVIRONMENT:
        df['first_name'] = df['sex'].map(fp.first_name)
        df['last_name'] = df['FAMILY.NAME'].map(fp.last_name)
        df['national_register_number'] = df['TAX.ID'].map(
            fp.national_register_number)
    else:
        df['first_name'] = df['GIVEN.NAMES']
        df['last_name'] = df['FAMILY.NAME']
        # TODO! Waiting for final decision - what to do if tax.id is empty
        df['national_register_number'] = df['TAX.ID']
    df['full_name'] = (df['first_name'] + ' ' + df['last_name']).astype(str)
    # DONE - Decision Jan Grybos: Retail - 1001, SME - 2001
    # df['customer_segment_id'] = df['TARGET'].map(dm.segment_id)
    df['customer_segment_id'] = '1001'
    # First approach was taking the documents from a separate file
    # from file_testr import get_leagal_doc_set
    # documents_dict = get_leagal_doc_set().to_dict(orient='index')
    # df['identity_documents'] = df.apply(
    #     lambda x: cnv.identity_documents_array(
    #         x['prospect_id'], x['country_code'], documents_dict), axis=1)
    df['identity_documents'] = df.apply(
        lambda x: cnv.identity_documents_array_splitted(
            x['country_code'], x['prospect_id'], str(x['LEGAL.DOC.NAME']),
            str(x['LEGAL.ID']), str(x['LEGAL.ISS.DATE']),
            str(x['LEGAL.EXP.DATE'])),
        axis=1)
    # #### tax_residence_main_country - nested
    # Business - A.Bujalska decided that data is empty 02.10.2019
    df['tax_residence_main_country'] = df.apply(
        lambda x: {'date': "",
                   'tin': x['national_register_number'],
                   'country': x['RESIDENCE']},
        axis=1)
    # Business - A.Bujalska decided that the array is empty 02.10.2019
    df['tax_residence_other_countries'] = np.empty((len(df), 0)).tolist()
    # residential_address - nested
    df['ra_street_name'], df['ra_street_number'], df['ra_apartment'] = zip(
        *df['STREET'].map(cnv.street_converter))
    if IS_TEST_ENVIRONMENT:
        df['ra_city'] = df['TOWN.COUNTRY'].map(fp.city)
        df['ra_postal'] = df['POST.CODE'].map(fp.postalcode)
    else:
        df['ra_city'] = df['TOWN.COUNTRY']
        df['ra_postal'] = df['POST.CODE']
    df['ra_country'] = df['RESIDENCE']
    df['residential_address'] = df.apply(
        lambda x: {'street_name': x.ra_street_name,
                   'street_number': x.ra_street_number,
                   'apartment': x.ra_apartment,
                   'city': x.ra_city,
                   'postal': x.ra_postal,
                   'country': x.ra_country},
        axis=1)
    # mailing_address = residential_address
    df['mailing_address'] = df['residential_address']
    # phone_number, phone_number_prefix
    df['phone_number'] = df.apply(
        lambda x: cnv.combine_phone_numbers(x['SMS.1'], x['PHONE.1']), axis=1)
    # let's add the country prefix to the GSM number
    df['CL_GSM_ADDED_PREFIX'] = (
        df['country_code'] + df['phone_number']).astype(str).map(
            cln.phone_prefix_updater)
    # grab more data from what is supposed to be a valid GSM number
    df['CL_GSM_VALID'], df['phone_number_prefix'], df['phone_number'], df[
        'CL_GSM_POSSIBLE'] = zip(
            *df['CL_GSM_ADDED_PREFIX'].map(cnv.phone_segmenter))
    if IS_TEST_ENVIRONMENT:
        df['email_address'] = df.apply(
            lambda x: fp.email(x['EMAIL.1'], x['prospect_id']), axis=1)
        df['birth_place'] = df['L.CU.POB'].map(fp.city)
        df['birth_date'] = df['DATE.OF.BIRTH'].map(fp.date_of_birth)
    else:
        df['email_address'] = df.apply(
            lambda x: cnv.email(x['EMAIL.1'], x['prospect_id']), axis=1)
        df['birth_place'] = df['L.CU.POB']
        df['birth_date'] = df['DATE.OF.BIRTH'].map(cnv.date_converter)
    df['title'] = df['TITLE'].map(dm.title)
    df['us_person'] = df.apply(
        lambda x: cnv.us_person(x['RESIDENCE'], x['nationality']), axis=1)
    # Business decision: tax_residence_other_countries - nested - does not exist 1.10.2019
    df['pep'] = df['L.CU.PEP'].map(cnv.pep)
    # Tomasz Mierzwinski - Confluence 03.10.2019
    df['customer_kyc_risk'] = df['CALC.RISK.CLASS'].map(dm.customer_kyc_risk)
    # TODO!! - field does not exist
    df["customer_role"] = "OWNER"
    df['marital_status_id'] = df['MARITAL.STATUS'].map(dm.marital_status_id)
    df['residence'] = df['RESIDENCE'].map(dm.residence_mapper)
    df['occupation'] = df['EMPLOYMENT.STATUS'].map(dm.occupation)
    df['rooted'] = False
    # language mapping [2, 3, 1, 4]: 1 FR, 2 NL, 3 EN, 4 IT
    df['display_and_messaging_language'] = df['LANGUAGE'].map(dm.display_lang)
    df['legal_language'] = df['display_and_messaging_language']
    # dictionary SRC.TO.BMPB - field added by Khalid 3.10.2019 - total number: 86 records!
    df["source_of_funds"] = df['SRC.TO.BMPB'].map(dm.source_of_funds)
    # TODO! - dictionary to be mapped: L.CU.SRC.WEALTH - no data in customer
    df['source_of_wealth'] = "GAMBLING"
    df['normalized_email_address'] = df['email_address'].map(
        cln.normalize_email)
    # Tomasz Motyl decided - only ACTIVE clients are migrated
    # df['CUSTOMER.STATUS'].map(dm.customer_status)
    df['customer_status'] = 'ACTIVE'
    # fields added after first tests
    df["pin_code_set"] = False
    df["face_id_set"] = False
    df["touch_id_set"] = False
    df['agreements'] = np.empty((len(df), 0)).tolist()
    df["consents"] = np.empty((len(df), 0)).tolist()
    # these fields will no longer be empty arrays -
    # Flora Fred decided to have them all filled in
    df['agreements'] = df['agreements'].apply(
        lambda x: ut.agreements_producer('RETAIL'))
    df['consents'] = df['consents'].apply(lambda x: ut.consents_producer())
    columns = ['first_name', 'last_name', 'full_name', 'sex',
               'national_register_number',
               'identity_documents',
               # 'document_type_list',  # finally it is identity_documents
               'customer_segment_id', 'nationality',
               'tax_residence_main_country', 'tax_residence_other_countries',
               'residential_address', 'mailing_address',
               'phone_number_prefix', 'phone_number', 'email_address',
               'us_person', 'birth_date', 'birth_place', 'pep',
               'customer_kyc_risk', 'customer_role', 'residence',
               'occupation', 'rooted', 'pin_code_set', 'face_id_set',
               'touch_id_set', 'agreements', 'consents', 'source_of_funds',
               'normalized_email_address', 'legal_language',
               'display_and_messaging_language', 'source_of_wealth',
               'prospect_id']
    # request from Florentyna Frend
    # df['first_not_empty_doc_id'], df['first_not_empty_doc_type'], df['first_not_empty_doc_expiration'] = zip(
    #     *df.apply(lambda x: cnv.identity_documents_array_splitted(
    #         x['country_code'], x['prospect_id'], str(x['LEGAL.DOC.NAME']),
    #         str(x['LEGAL.ID']), str(x['LEGAL.ISS.DATE']),
    #         str(x['LEGAL.EXP.DATE']), True), axis=1))
    # df_flora = df[['national_register_number', 'prospect_id', 'country_code',
    #                'first_not_empty_doc_id', 'first_not_empty_doc_type',
    #                'first_not_empty_doc_expiration']]
    # df_flora['national_register_number'] = df_flora['national_register_number'].map(
    #     lambda x: None if (x is not None and len(str(x)) < 3) else x)
    # df_flora = df_flora[df_flora['national_register_number'].isnull()]
    # df_flora.to_excel('flora_tax_id.xlsx', index=None, header=True)
    res = df[columns].to_json(orient='records')
    res = json.loads(res)
    with open(ut.file_name('customer_retail'), 'w', encoding='UTF-8') as f:
        json.dump(res, f, indent=4)
    print(f'Num of customers with no phones is: {fp.missed_phone_counter}')
    print(f'Num of customers with no tax.id\'s is: {fp.missed_tax_id_counter}')
    print(f'Num of customers with no emails is: {fp.missed_email_counter}')
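
# A minimal sketch of the serialization step at the end of
# convert_retail_to_json(): to_json(orient='records') emits one object per
# row, nested dict columns included, and the loads/dump round trip is what
# produces the pretty-printed payload file. Toy data only.
def _demo_json_payload():
    import json
    import pandas as pd
    toy = pd.DataFrame({'prospect_id': ['1'],
                        'residential_address': [{'city': 'Brussels',
                                                 'postal': '1000'}]})
    records = json.loads(toy.to_json(orient='records'))
    print(json.dumps(records, indent=4))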
def combine_relations(client_type='SME', save_relations=False,
                      skip_merge=True):
    df_cst = cst.read_customer_file()
    df_cst['CUSTOMER'] = df_cst['CUSTOMER.CODE'].astype(str)
    df = acc.read_accounts_file()
    df['CUSTOMER'] = df['CUSTOMER'].astype(str)
    df_acc = df[df['IS.RELATION.OF'].notnull()]
    if client_type == 'SME':
        df_acc = df_acc[df_acc['SECTOR.CU.NATCLI'].isin(
            ['BA', 'CO', 'ET', 'OR'])]
    else:
        df_acc = df_acc[df_acc['SECTOR.CU.NATCLI'].isin(['RE'])]
    # print(f"Number of unique clients: {len(df_acc['CUSTOMER'].unique())}")
    df_rel_to = df_acc['IS.RELATION.OF'].str.split('|', expand=True)
    df_rel_to.rename(columns=lambda x: str(x) + '_RELATION.OF'
                     if x > 9 else '0' + str(x) + '_RELATION.OF',
                     inplace=True)
    # if we want excluded data
    # excluded_clients(df_rel_to, df_cst, df_acc)
    df_rel_type = df_acc['IS.RELATION.TYPE'].str.split('|', expand=True)
    df_rel_type.rename(columns=lambda x: str(x) + '_RELATION.TYPE'
                       if x > 9 else '0' + str(x) + '_RELATION.TYPE',
                       inplace=True)
    df_rat = pd.concat([df_rel_to, df_rel_type],
                       axis=1).reindex(df_rel_to.index)
    df_cst = df_cst[['CUSTOMER.CODE', 'NAME.1']]
    cust_dict = df_cst.set_index('CUSTOMER.CODE').to_dict()['NAME.1']
    i = 0
    for column in df_rel_to.columns.tolist():
        st = '0' + str(i) if i <= 9 else str(i)
        df_rat[f'{st}_CUSTOMER_NAME'] = df_rat[column].map(
            lambda x: cust_dict.get(x))
        i += 1
    # sort columns to see who is in a relation, and how
    df_rat = df_rat.reindex(sorted(df_rat.columns), axis=1)
    df_acc = pd.concat([df_rat, df_acc], axis=1).reindex(df_rat.index)
    columns = ['CUSTOMER', 'ACCOUNT', 'ACCOUNT.TITLE.1', 'REGROUPE',
               'IS.RELATION.OF'] + df_rat.columns.tolist()
    df_acc = df_acc[columns]
    if save_relations is True:
        df_acc.to_excel(f'./output/relations_{client_type}.xlsx',
                        index=None,
                        header=True)
    # i = 0
    # for column in df_rel_to.columns.tolist():
    #     df_merged = pd.merge(df_cst, df_acc,
    #                          left_on='CUSTOMER.CODE', right_on=column)
    #     cols = [column, 'NAME.1', df_rel_type.columns.tolist()[i]]
    #     # print(df_merged[cols])
    #     df_merged[cols].to_excel(
    #         f'./output/relations_merge_{client_type}_{i}.xlsx',
    #         index=None, header=True, sheet_name='RELATIONS')
    #     i += 1
    # print(df_merged)
    # how_merged = 'both'
    # print(df_merged.query('_merge == @how_merged'))
    # df_merged.query('_merge == @how_merged').to_excel(
    #     f'./output/relations_merge_{client_type}.xlsx',
    #     index=None, header=True)
    if skip_merge is True:
        return
    # column to check (the split columns are named 'NN_RELATION.OF')
    rattache = '06_RELATION.OF'
    df_merged = pd.merge(df_acc,
                         df_cst,
                         left_on=rattache,
                         right_on='CUSTOMER.CODE',
                         how='outer',
                         indicator=True)
    df_merged = df_merged[df_merged[rattache].notnull()]
    # left_merged = 'both'  # left_only
    left_merged = 'left_only'
    df_rat1 = df_merged.query('_merge == @left_merged')[[
        'ACCOUNT', 'REGROUPE', 'CUSTOMER.CODE', 'IS.RELATION.OF', rattache
    ]].reset_index(drop=True)
    print(df_rat1.shape, df_rat1.head(70))
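
# A hypothetical driver sketch. These calls mirror the defaults of the
# functions above, but how (and whether) this module is invoked directly is
# an assumption.
if __name__ == '__main__':
    save_xlsx(ProductFiles.CUSTOMER)
    accounts('SME')
    combine_relations('SME', save_relations=True)
    convert_retail_to_json()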