Example #1
def incorrect_email_address():

    # migration-first-ver
    df = cst.read_customer_file()
    # emails only in retail schema
    df = df.query('SECTOR < 1007 | SECTOR == 1901')

    df['VALID_EMAIL'] = df['EMAIL.1'].map(vld.valid_email)

    # df.query('VALID_EMAIL == 0')[['CUSTOMER.CODE', 'EMAIL.1']].to_excel('wrong-empty-emails.xlsx', index = None, header=True)
    df_bad_emails = df.query('VALID_EMAIL == 0')[[
        'CUSTOMER.CODE', 'EMAIL.1', 'PHONE.1', 'SMS.1', 'NAME.1', 'NAME.2',
        'ADDRESS'
    ]]

    df_acc = acc.read_accounts_file()

    df_m = pd.merge(df_acc,
                    df_bad_emails,
                    left_on='CUSTOMER',
                    right_on='CUSTOMER.CODE')
    df_m = df_m[[
        'CUSTOMER', 'ACCOUNT', 'PHONE.1', 'SMS.1', 'NAME.1', 'NAME.2',
        'ADDRESS', 'EMAIL.1'
    ]]
    df_m.to_excel('accounts_wrong_email.xlsx', index=None, header=True)
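
The validator module vld is not shown in these examples. Below is a minimal sketch of what a valid_email-style check might look like; the regex and the 0/1 return convention are assumptions inferred from the VALID_EMAIL == 0 query above, not the original implementation.

import re

# Hypothetical stand-in for vld.valid_email: returns 1 for a plausible
# address and 0 otherwise, so df.query('VALID_EMAIL == 0') selects the bad rows.
EMAIL_RE = re.compile(r'^[^@\s]+@[^@\s]+\.[^@\s]+$')

def valid_email(value):
    # non-strings (including NaN from empty cells) count as invalid
    if not isinstance(value, str):
        return 0
    return 1 if EMAIL_RE.match(value.strip()) else 0
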
Example #2
def dictionary_producer(target, dictionary):

    df = None
    if target == ProductFiles.CUSTOMER:
        df = cst.read_customer_file()
    elif target == ProductFiles.ACCOUNT:
        df = acc.read_accounts_file()
    elif target == ProductFiles.LEGAL_DOC:
        df = get_leagal_doc_set()
    else:
        print('Invalid argument - target')
        return

    # print(df.shape)

    print(dictionary, df[dictionary].dropna().unique().tolist())

    print(dictionary, len(df[dictionary].dropna().unique().tolist()))

    # dict_list = df['LEGAL.DOC.NAME'].map(
    #     cln.document_type).dropna().unique().tolist()

    # print(list(filter(lambda x: x.find('RESIDE') > -1 , df.columns.tolist())))

    # print for dictionary
    dict_list = df[dictionary].dropna().unique().tolist()
    for x in dict_list:
        # if len(x.split('|')) > 2:
        #     print(x)
        print(f'\'{x}\':\'VOD_{x}\',')
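
The pattern above prints the unique non-null values of a column as ready-to-paste dictionary entries. A toy version of the same steps (the column name and values below are invented for illustration):

import pandas as pd

# toy data standing in for one of the product files
df = pd.DataFrame({'SECTOR': ['1001', '1002', None, '1001', '1901']})

# same steps as dictionary_producer: drop NaN, de-duplicate, print as
# '<value>':'VOD_<value>' lines that can be pasted into a mapping dict
for x in df['SECTOR'].dropna().unique().tolist():
    print(f"'{x}':'VOD_{x}',")
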
Example #3
def save_xlsx(target=ProductFiles.CUSTOMER):
    df = None
    if target == ProductFiles.CUSTOMER:
        df = cst.read_customer_file()
    elif target == ProductFiles.ACCOUNT:
        df = acc.read_accounts_file()
    else:
        print('Invalid argument - target')
        return
    print(df.shape)
    # print(df.columns)
    f_name = f"./output/{target}_{datetime.now().strftime('%Y-%m-%d_%H%M%S')}.xlsx"
    df.to_excel(f_name, index=None, header=True)
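
ProductFiles is referenced throughout these examples but never defined in them. A minimal sketch of how such an enum could look; the member names are taken from the code above, everything else is an assumption:

from enum import Enum, auto

class ProductFiles(Enum):
    CUSTOMER = auto()
    ACCOUNT = auto()
    LEGAL_DOC = auto()

# e.g. save_xlsx(ProductFiles.ACCOUNT) would then write a file named
# ./output/ProductFiles.ACCOUNT_<timestamp>.xlsx (the enum's str() is used in the f-string)
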
Example #4
def combine_customer_account():
    df_cst = cst.read_customer_file()
    # choose which one
    df_acc = acc.read_accounts_file('RETAIL')

    df_acc = df_acc[df_acc['REGROUPE'].isnull()]
    df_acc = df_acc.drop_duplicates(['CUSTOMER'], keep='first')

    df_merged = pd.merge(df_acc,
                         df_cst,
                         left_on='CUSTOMER',
                         right_on='CUSTOMER.CODE')
    # df_merged = df_acc[~df_acc['CUSTOMER'].isin(df_cst['CUSTOMER.CODE'])]
    print('Shape is:', df_merged.shape)

    valid_birthday_tax(df_merged)
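
pd.merge defaults to an inner join, so the merge above keeps only accounts whose CUSTOMER also appears as CUSTOMER.CODE in the customer file. A toy illustration (data invented):

import pandas as pd

df_acc = pd.DataFrame({'CUSTOMER': ['1', '2', '3'], 'ACCOUNT': ['A1', 'A2', 'A3']})
df_cst = pd.DataFrame({'CUSTOMER.CODE': ['1', '3'], 'NAME.1': ['Ann', 'Bob']})

merged = pd.merge(df_acc, df_cst, left_on='CUSTOMER', right_on='CUSTOMER.CODE')
print(merged.shape)  # (2, 4): customer '2' is dropped by the inner join
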
Example #5
def accounts(client_type='SME'):

    df = acc.read_accounts_file()

    if client_type == 'SME':
        df = df[df['SECTOR.CU.NATCLI'].isin(['BA', 'CO', 'ET', 'OR'])]
    else:
        df = df[df['SECTOR.CU.NATCLI'].isin(['RE'])]

    # print('COLUMN LIST', df.columns)
    df[['CUSTOMER', 'ACCOUNT', 'SECTOR.CU.NATCLI',
        'CURRENCY']].to_excel(f'acc_{client_type.lower()}.xlsx',
                              index=None,
                              header=True)

    print(df['ACCOUNT'].count(), len(df['ACCOUNT'].unique().tolist()))

    # get only duplicated list
    # this way is cost-effective
    # df_s = pd.concat(g for _, g in df.groupby("CUSTOMER") if len(g) > 1)
    customers = df["CUSTOMER"]
    # df_s = df[customers.isin(customers[customers.duplicated()])].sort_values("CUSTOMER")
    # print('Unique customers num:', len(df_s['CUSTOMER'].unique()))
    # print('Duplicated customers num:', len(df_s))
    df_s = df[customers.duplicated(keep=False)].sort_values("CUSTOMER")
    print('Unique customers num:', len(df['CUSTOMER'].unique().tolist()))
    print('Duplicated customers num:', len(df_s))
    print('Duplicated customers unique num:',
          len(df_s['CUSTOMER'].unique().tolist()))

    f_name = f'./output/duplicated_acc.id_{date.today().isoformat()}_{client_type}.xlsx'
    print(df_s.groupby(['CUSTOMER']).size().reset_index(name='count'))
    df_s.groupby(
        ['CUSTOMER']).size().reset_index(name='NO.OF.ACCOUNTS').sort_values(
            ['NO.OF.ACCOUNTS'], ascending=False).to_excel(
                f'./output/{client_type}_CUSTOMER_MORE_ACCOUNTS.xlsx',
                index=None)

    df_s[[
        'CUSTOMER', 'ACCOUNT', 'SECTOR.CU.NATCLI', 'CURRENCY', 'REGROUPE',
        'IS.RELATION.OF'
    ]].sort_values(['CUSTOMER']).to_excel(f_name, index=None, header=True)
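
duplicated(keep=False) marks every row of a repeated CUSTOMER, not just the later occurrences, which is what makes the per-customer account counts above possible. A toy version of the same steps (data invented):

import pandas as pd

df = pd.DataFrame({'CUSTOMER': ['10', '10', '20', '30', '30', '30'],
                   'ACCOUNT':  ['A', 'B', 'C', 'D', 'E', 'F']})

customers = df['CUSTOMER']
# keep=False flags all members of each duplicated group
df_s = df[customers.duplicated(keep=False)].sort_values('CUSTOMER')
print(len(df_s))  # 5 rows: both '10' rows and all three '30' rows

# accounts per customer, largest first
counts = (df_s.groupby(['CUSTOMER']).size()
              .reset_index(name='NO.OF.ACCOUNTS')
              .sort_values(['NO.OF.ACCOUNTS'], ascending=False))
print(counts)
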
Example #6
def combain_regroupe():
    df_cst = cst.read_customer_file()
    df_cst['CUSTOMER.CODE'] = df_cst['CUSTOMER.CODE'].astype(str)
    df_acc = acc.read_accounts_file()

    # only rows with REGROUPE are interesting
    df_acc = df_acc[df_acc['REGROUPE'].notnull()]
    df_acc['CUSTOMER'] = df_acc['CUSTOMER'].astype(str)

    df_reg = df_acc['REGROUPE'].str.split('~', expand=True)
    df_reg = df_reg.rename(columns=lambda x: 'REGROUPE_' + str(x))

    print(df_acc.shape)
    df_acc = pd.concat([df_reg, df_acc], axis=1).reindex(df_reg.index)
    print(df_acc.shape)

    # df_acc.to_excel('./output/regroupe.xlsx', index = None, header=True)

    # column to check
    regroupe = 'REGROUPE_2'

    df_merged = pd.merge(df_acc,
                         df_cst,
                         left_on=regroupe,
                         right_on='CUSTOMER.CODE',
                         how='outer',
                         indicator=True)
    df_merged = df_merged[df_merged[regroupe].notnull()]

    print("after notnull", df_merged.shape)

    # left_merged = 'both' #left_only
    left_merged = 'left_only'
    df_regr1 = df_merged.query('_merge == @left_merged')[[
        'ACCOUNT', 'CUSTOMER.CODE', 'REGROUPE', regroupe
    ]].reset_index(drop=True)
    print(df_regr1.shape, df_regr1)
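
The how='outer', indicator=True merge adds a _merge column whose 'left_only' value flags REGROUPE codes with no matching customer record. A toy illustration of the pattern (data invented):

import pandas as pd

df_acc = pd.DataFrame({'ACCOUNT': ['A1', 'A2'],
                       'REGROUPE_2': ['100', '200']})
df_cst = pd.DataFrame({'CUSTOMER.CODE': ['100'], 'NAME.1': ['Ann']})

df_m = pd.merge(df_acc, df_cst, left_on='REGROUPE_2',
                right_on='CUSTOMER.CODE', how='outer', indicator=True)

# rows present only on the account side -> codes that have no customer record
left_merged = 'left_only'
print(df_m.query('_merge == @left_merged')[['ACCOUNT', 'REGROUPE_2']])
# account A2 with REGROUPE_2 '200' is the unmatched one
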
Example #7
def convert_retail_to_json():
    """
    Read the retail customer file and produce the JSON payload.
    """

    df = read_customer_file()
    # ########################## keep only RETAIL customers

    df = df.query('SECTOR < 1007 | SECTOR == 1901')

    # second, take only those with active accounts
    df_acc = acc.read_accounts_file('RETAIL')
    
    ut.log_dataset_data(df_acc)
    df_acc = df_acc.drop_duplicates(['CUSTOMER'], keep='first')
    ut.log_dataset_data(df_acc)
    
    # df_acc = df_acc[df_acc['REGROUPE'].isnull()]
    
    df_acc['CUSTOMER'] = df_acc['CUSTOMER'].astype(str)
    ut.log_dataset_data(df_acc)

    # .drop('CUSTOMER', axis=1, inplace=True)
    acc_columns = ['CUSTOMER','REGROUPE_0', 'REGROUPE_1','REGROUPE_2']
    df_basic = pd.merge(df, df_acc[acc_columns],
                  left_on='CUSTOMER.CODE', right_on='CUSTOMER')
    ut.log_dataset_data(df_basic)
    df_regr_0 = pd.merge(df, df_acc[acc_columns],
                  left_on='CUSTOMER.CODE', right_on='REGROUPE_0')
    ut.log_dataset_data(df_regr_0)

    df_regr_1 = pd.merge(df, df_acc[acc_columns],
                  left_on='CUSTOMER.CODE', right_on='REGROUPE_1')
    ut.log_dataset_data(df_regr_1)

    df_regr_2 = pd.merge(df, df_acc[acc_columns],
                  left_on='CUSTOMER.CODE', right_on='REGROUPE_2')
    ut.log_dataset_data(df_regr_2)
    
    df = pd.concat([df_basic, df_regr_0, df_regr_1,df_regr_2])
    ut.log_dataset_data(df)
    
    df = df.drop_duplicates(['CUSTOMER.CODE'], keep='first')
    ut.log_dataset_data(df)

    if IS_TEST_ENVIRONMENT:
        df = df.head(TEST_PROBE)
        ut.log_dataset_data(df)

    # ########################## rename columns which do not need to be converted
    df.rename(columns={'GENDER': 'sex', 'NATIONALITY':
                       'nationality', 'DOMICILE': 'country_code',
                       'CUSTOMER.CODE': 'prospect_id'}, inplace=True)
    df = df.reset_index()
    # ################ CONVERSION MAPPING STARTS ########
    
    # After discussion with business
    df['country_code'] = df['RESIDENCE']

    if IS_TEST_ENVIRONMENT:
        df['first_name'] = df['sex'].map(fp.first_name)
        df['last_name'] = df['FAMILY.NAME'].map(fp.last_name)
        df['national_register_number'] = df['TAX.ID'].map(
            fp.national_register_number)
    else:
        df['first_name'] = df['GIVEN.NAMES']
        df['last_name'] = df['FAMILY.NAME']
        
        # TODO ! Waiting for final decision - what to do if tax.id is empty
        df['national_register_number'] = df['TAX.ID']

    df['full_name'] = (df['first_name'] + ' ' + df['last_name']).astype(str)

    # DONE - Decision Jan Grybos Retail - 1001, SME 2001
    # df['customer_segment_id'] = df['TARGET'].map(dm.segment_id)
    df['customer_segment_id'] = '1001'

    # First approach was taking it from a separate file
    # from file_testr import get_leagal_doc_set
    # documents_dict = get_leagal_doc_set().to_dict(orient='index')
    # df['identity_documents'] = df.apply(lambda x:
    #                                     cnv.identity_documents_array(x['prospect_id'], 
    #                                         x['country_code'], documents_dict), axis=1)
    
    df['identity_documents'] = df.apply(lambda x:
                                        cnv.identity_documents_array_splitted( 
                                            x['country_code'], x['prospect_id'], str(x['LEGAL.DOC.NAME']),
                                            str(x['LEGAL.ID']),str(x['LEGAL.ISS.DATE']),str(x['LEGAL.EXP.DATE'])), axis=1)

    
    # ####tax_residence_main_country nested
    # Business - A.Bujalska decided that data is empty 02.10.2019
    df['tax_residence_main_country'] = df.apply(lambda x:
                                                {'date': "",
                                                 'tin': x['national_register_number'],
                                                 'country': x['RESIDENCE']}, axis=1)

    # business: A.Bujalska decided that the array is empty 02.10.2019
    df['tax_residence_other_countries'] = np.empty((len(df), 0)).tolist()

    
    ## residential_address - nested
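    # street_converter is expected to return a (street_name, street_number, apartment)
    # tuple; zip(*...) transposes the mapped tuples into three separate columns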
    df['ra_street_name'],  df['ra_street_number'], df['ra_apartment'] = zip(
        *df['STREET'].map(cnv.street_converter))
    
    if IS_TEST_ENVIRONMENT:
        df['ra_city'] = df['TOWN.COUNTRY'].map(fp.city)
        df['ra_postal'] = df['POST.CODE'].map(fp.postalcode)
    else:
        df['ra_city'] = df['TOWN.COUNTRY']
        df['ra_postal'] = df['POST.CODE']
    
    df['ra_country'] = df['RESIDENCE']

    df['residential_address'] = df.apply(lambda x:
                                         {'street_name': x.ra_street_name,
                                          'street_number': x.ra_street_number,
                                          'apartment': x.ra_apartment,
                                          'city': x.ra_city,
                                          'postal': x.ra_postal,
                                          'country': x.ra_country}, axis=1)

    # mailing_address = residential_address
    df['mailing_address'] = df['residential_address']

    # phone_number  phone_number_prefix
    df['phone_number'] = df.apply(lambda x: cnv.combine_phone_numbers(x['SMS.1'], x['PHONE.1']), axis=1)

    # let's add country to the GSM
    df['CL_GSM_ADDED_PREFIX'] = (
        df['country_code'] + df['phone_number']).astype(str).map(cln.phone_prefix_updater)
    # grab more data from the supposed-to-be-valid GSM number
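    # phone_segmenter is expected to return a 4-tuple (valid flag, prefix, number,
    # possible flag); zip(*...) unpacks it into the four columns below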
    
    df['CL_GSM_VALID'], df['phone_number_prefix'], df['phone_number'], df['CL_GSM_POSSIBLE'] = zip(
        *df['CL_GSM_ADDED_PREFIX'].map(cnv.phone_segmenter))

    if IS_TEST_ENVIRONMENT:
        df['email_address'] = df.apply(lambda x: fp.email(
            x['EMAIL.1'], x['prospect_id']), axis=1)
        df['birth_place'] = df['L.CU.POB'].map(fp.city)
        df['birth_date'] = df['DATE.OF.BIRTH'].map(fp.date_of_birth)
    else:
        df['email_address'] = df.apply(lambda x: cnv.email(x['EMAIL.1'], x['prospect_id']), axis=1)
        df['birth_place'] = df['L.CU.POB']
        df['birth_date'] = df['DATE.OF.BIRTH'].map(cnv.date_converter)

    df['title'] = df['TITLE'].map(dm.title)
    df['us_person'] = df.apply(lambda x: cnv.us_person(
        x['RESIDENCE'], x['nationality']), axis=1)

    # Business decision: tax_residence_other_countries - nested - does not exist 1.10.2019

    df['pep'] = df['L.CU.PEP'].map(cnv.pep)
    # Tomasz Mierzwinski - Confluence 03.10.2019
    df['customer_kyc_risk'] = df['CALC.RISK.CLASS'].map(dm.customer_kyc_risk)

    # TODO!! - field does not exist
    df["customer_role"] = "OWNER"

    df['marital_status_id'] = df['MARITAL.STATUS'].map(dm.marital_status_id)
    df['residence'] = df['RESIDENCE'].map(dm.residence_mapper)
    df['occupation'] = df['EMPLOYMENT.STATUS'].map(dm.occupation)

    df['rooted'] = False
    # mapping: [2, 3, 1, 4] -> 1 FR, 2 NL, 3 EN, 4 IT
    df['display_and_messaging_language'] = df['LANGUAGE'].map(dm.display_lang)
    df['legal_language'] = df['display_and_messaging_language']

    # dictionary SRC.TO.BMPB - field added by Khalid 3.10.2019 - total number 86 records!
    df["source_of_funds"] = df['SRC.TO.BMPB'].map(dm.source_of_funds)
    
    # TODO! - dictionary to be mapped: L.CU.SRC.WEALTH - no data in customer
    df['source_of_wealth'] = "GAMBLING"

    df['normalized_email_address'] = df['email_address'].map(
        cln.normalize_email)

    # Tomasz Motyl decided - only ACTIVE clients are migrated
    # df['CUSTOMER.STATUS'].map(dm.customer_status)
    df['customer_status'] = 'ACTIVE'

    # fields after first tests
    df["pin_code_set"] = False
    df["face_id_set"] = False
    df["touch_id_set"] = False


    df['agreements'] = np.empty((len(df), 0)).tolist()
    df["consents"] = np.empty((len(df), 0)).tolist()

    # these fields will no longer be empty arrays
    # Flora Fred decided to have them all filled in
    df['agreements'] = df['agreements'].apply(lambda x: ut.agreements_producer('RETAIL'))
    df['consents'] = df['consents'].apply(lambda x: ut.consents_producer())
    
    columns = ['first_name', 'last_name', 'full_name', 'sex', 'national_register_number',
               'identity_documents',
            #    'document_type_list', # finally  it is identity_documents
               'customer_segment_id', 'nationality',
               'tax_residence_main_country',
               'tax_residence_other_countries',
               'residential_address',
               'mailing_address',
               'phone_number_prefix', 'phone_number', 'email_address', 'us_person',
               'birth_date', 'birth_place', 'pep', 'customer_kyc_risk', 'customer_role', 'residence', 'occupation', 'rooted',
               'pin_code_set', 'face_id_set', 'touch_id_set', 'agreements', 'consents',
               'source_of_funds', 'normalized_email_address', 'legal_language', 'display_and_messaging_language', 'source_of_wealth',
               'prospect_id'
               ]

   
    
    # request from Florentyna Frend
    # df['first_not_empty_doc_id'], df['first_not_empty_doc_type'], df['first_not_empty_doc_expiration']= zip(*df.apply(lambda x: cnv.identity_documents_array_splitted( 
    #                                         x['country_code'], x['prospect_id'], str(x['LEGAL.DOC.NAME']),
    #                                         str(x['LEGAL.ID']),str(x['LEGAL.ISS.DATE']),str(x['LEGAL.EXP.DATE']), True), axis=1))
    
    # df_flora = df[['national_register_number', 'prospect_id', 'country_code', 'first_not_empty_doc_id', 'first_not_empty_doc_type','first_not_empty_doc_expiration']]
    # df_flora['national_register_number'] = df_flora['national_register_number'].map(lambda x: None if (x is not None and len(str(x)) < 3) else x)
    # df_flora = df_flora[df_flora['national_register_number'].isnull()]
    # df_flora.to_excel('flora_tax_id.xlsx', index = None, header=True)

    res = df[columns].to_json(orient='records')
    res = json.loads(res)
    with open(ut.file_name('customer_retail'), 'w', encoding='UTF-8') as f:
        json.dump(res, f, indent=4)

    print(f'Num of customers with no phone: {fp.missed_phone_counter}')
    print(f'Num of customers with no tax.id: {fp.missed_tax_id_counter}')
    print(f'Num of customers with no email: {fp.missed_email_counter}')
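
The to_json(orient='records') / json.loads / json.dump round trip at the end is what turns DataFrame cells holding plain Python dicts and lists (e.g. residential_address, agreements) into nested JSON objects. A small self-contained version of the same trick; the data and the output file name are invented:

import json
import pandas as pd

df = pd.DataFrame({'prospect_id': ['100200'],
                   'full_name': ['Ann Example']})
# nested structures kept as plain Python objects in the cells
df['residential_address'] = [{'street_name': 'Main', 'city': 'Warsaw'}]
df['agreements'] = [[]]

# serialize row-wise, parse back to native objects, then pretty-print to disk
records = json.loads(df.to_json(orient='records'))
with open('customer_sample.json', 'w', encoding='UTF-8') as f:
    json.dump(records, f, indent=4)
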
Example #8
def combine_relations(client_type='SME',
                      save_relations=False,
                      skip_marge=True):
    df_cst = cst.read_customer_file()
    df_cst['CUSTOMER'] = df_cst['CUSTOMER.CODE'].astype(str)

    df = acc.read_accounts_file()
    df['CUSTOMER'] = df['CUSTOMER'].astype(str)

    df_acc = df[df['IS.RELATION.OF'].notnull()]

    if client_type == 'SME':
        df_acc = df_acc[df_acc['SECTOR.CU.NATCLI'].isin(
            ['BA', 'CO', 'ET', 'OR'])]
    else:
        df_acc = df_acc[df_acc['SECTOR.CU.NATCLI'].isin(['RE'])]

    # print(f"Number of unique clients: {len(df_acc['CUSTOMER'].unique())}")

    df_rel_to = df_acc['IS.RELATION.OF'].str.split('|', expand=True)
    df_rel_to.rename(columns=lambda x: str(x) + '_RELATION.OF'
                     if x > 9 else '0' + str(x) + '_RELATION.OF',
                     inplace=True)

    # if we want excluded data
    # excluded_clients(df_rel_to, df_cst, df_acc)

    df_rel_type = df_acc['IS.RELATION.TYPE'].str.split('|', expand=True)
    df_rel_type.rename(columns=lambda x: str(x) + '_RELATION.TYPE'
                       if x > 9 else '0' + str(x) + '_RELATION.TYPE',
                       inplace=True)

    df_rat = pd.concat([df_rel_to, df_rel_type],
                       axis=1).reindex(df_rel_to.index)

    df_cst = df_cst[['CUSTOMER.CODE', 'NAME.1']]
    cust_dict = df_cst.set_index('CUSTOMER.CODE').to_dict()['NAME.1']

    # zero-pad the index the same way as the renamed relation columns
    for i, column in enumerate(df_rel_to.columns.tolist()):
        st = str(i).zfill(2)
        df_rat[f'{st}_CUSTOMER_NAME'] = df_rat[column].map(cust_dict.get)

    # sort columns to see who is in the relation, and how
    df_rat = df_rat.reindex(sorted(df_rat.columns), axis=1)

    df_acc = pd.concat([df_rat, df_acc], axis=1).reindex(df_rat.index)
    columns = ['CUSTOMER', 'ACCOUNT', 'ACCOUNT.TITLE.1'
               ] + df_rat.columns.tolist()
    df_acc = df_acc[columns]
    if save_relations is True:
        df_acc.to_excel(f'./output/relations_{client_type}.xlsx',
                        index=None,
                        header=True)

    # i = 0
    # for column in df_rel_to.columns.tolist():
    #     df_merged = pd.merge(df_cst, df_acc, left_on='CUSTOMER.CODE', right_on=column)
    #     cols = [column, 'NAME.1', df_rel_type.columns.tolist()[i]]
    #     # print(df_merged[cols])
    #     df_merged[cols].to_excel(f'./output/relations_merge_{client_type}_{i}.xlsx', index = None, header=True, sheet_name='RELATIONS')
    #     i +=1

    # print(df_merged)
    # how_merged = 'both'
    # print(df_merged.query('_merge == @how_merged'))
    # df_merged.query('_merge == @how_merged').to_excel(f'./output/relations_merge_{client_type}.xlsx', index = None, header=True)

    if skip_marge is True:
        return

    # column to check
    rattache = 'IS.RELATION.OF_6'

    df_merged = pd.merge(df_acc,
                         df_cst,
                         left_on=rattache,
                         right_on='CUSTOMER.CODE',
                         how='outer',
                         indicator=True)
    df_merged = df_merged[df_merged[rattache].notnull()]

    # left_merged = 'both' #left_only
    left_merged = 'left_only'

    df_rat1 = df_merged.query('_merge == @left_merged')[[
        'ACCOUNT', 'REGROUPE', 'CUSTOMER.CODE', 'IS.RELATION.OF', rattache
    ]].reset_index(drop=True)
    print(df_rat1.shape, df_rat1.head(70))
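
The str.split('|', expand=True) plus zero-padded renaming is what lets the relation columns sort correctly later on. A toy version of the expansion (data invented):

import pandas as pd

df_acc = pd.DataFrame({'IS.RELATION.OF': ['100|200|300', '400']})

df_rel_to = df_acc['IS.RELATION.OF'].str.split('|', expand=True)
# zero-pad the generated 0..N column labels so they sort as text
df_rel_to = df_rel_to.rename(columns=lambda x: str(x).zfill(2) + '_RELATION.OF')
print(df_rel_to.columns.tolist())
# ['00_RELATION.OF', '01_RELATION.OF', '02_RELATION.OF']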