def df_encoder(state='TX', sel_feature='primary_merchant_name'):
    '''
    Returns
    -------
    [df, df_null, embedding_maps[sel_feature]]
    '''

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    states = [
        'NJ', 'AL', 'WA', 'NM', 'TX', 'AR', 'NY', 'CT', 'MI', 'CO', 'CA', 'PA',
        'IN', 'OK', 'MD', 'AK', 'VA', 'GA', 'NC', 'TN', 'OH', 'IL', 'FL', 'AZ',
        'DC', 'LA', 'KY', 'KS', 'IA', 'SC', 'WI', 'DE', 'HI', 'MT', 'MO', 'NV',
        'ID', 'MN', 'MS', 'OR', 'UT', 'NH', 'MA', 'WV', 'NE', 'ND', 'RI', 'VT',
        'WY', 'ME', 'SD', 'PR', 'GU'
    ]

    fields = [
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date', 'account_type', 'account_source_type',
        'account_score', 'user_score', 'panel_file_created_date'
    ]

    try:
        if state in states:
            filter_query = f"SELECT {', '.join(field for field in fields)} \
                            FROM bank_record WHERE unique_mem_id IN ( \
                             SELECT unique_mem_id \
                             FROM user_demographic \
                             WHERE state = '{state}' LIMIT 1)"

            transaction_query = execute_read_query(connection, filter_query)
            df = pd.DataFrame(transaction_query, columns=fields)
            print(f"{len(df)} transactions for random user in state: {state}.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback

    # first conversion from date to datetime objects; then conversion to unix
    df['panel_file_created_date'] = pd.to_datetime(
        df['panel_file_created_date'])
    df['optimized_transaction_date'] = pd.to_datetime(
        df['optimized_transaction_date'])

    # set optimized transaction_date as index for later
    df.set_index('optimized_transaction_date', drop=False, inplace=True)
    '''
    After successfully loading the data, columns that are of no importance have been removed and missing values replaced
    Then the dataframe is ready to be encoded to get rid of all non-numerical data
    '''
    try:
        # Include below if need unique ID's later:
        df['unique_mem_id'] = df['unique_mem_id'].astype('str',
                                                         errors='ignore')
        df['unique_bank_account_id'] = df['unique_bank_account_id'].astype(
            'str', errors='ignore')
        df['unique_bank_transaction_id'] = df[
            'unique_bank_transaction_id'].astype('str', errors='ignore')
        df['amount'] = df['amount'].astype('float64')
        df['transaction_base_type'] = df['transaction_base_type'].replace(
            to_replace=["debit", "credit"], value=[1, 0])
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # attempt to convert date objects to unix timestamps as numeric value (fl64) if they have no missing values; otherwise they are being dropped
    date_features = ['optimized_transaction_date', 'panel_file_created_date']
    try:
        for feature in date_features:
            if df[feature].isnull().sum() == 0:
                df[feature] = df[feature].apply(lambda x: dt.timestamp(x))
            else:
                df = df.drop(columns=feature, axis=1)
                print(f"Column {feature} dropped")

    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")
    '''
    The columns PRIMARY_MERCHANT_NAME; CITY, STATE, DESCRIPTION, TRANSACTION_CATEGORY_NAME, CURRENCY
    are encoded manually and cleared of empty values
    '''
    encoding_features = [
        'primary_merchant_name', 'city', 'state', 'description',
        'transaction_category_name', 'transaction_origin', 'currency'
    ]

    # dropping currency if there is only one
    if len(df['currency'].value_counts()) == 1:
        df = df.drop(columns=['currency'], axis=1)
        encoding_features.remove('currency')

    UNKNOWN_TOKEN = '<unknown>'
    embedding_maps = {}
    for feature in encoding_features:
        unique_list = df[feature].unique().astype('str').tolist()
        unique_list.append(UNKNOWN_TOKEN)
        le = LabelEncoder()
        le.fit_transform(unique_list)
        embedding_maps[feature] = dict(
            zip(le.classes_, le.transform(le.classes_)))

        # APPLICATION TO OUR DATASET
        df[feature] = df[feature].apply(
            lambda x: x if x in embedding_maps[feature] else UNKNOWN_TOKEN)
        df[feature] = df[feature].map(lambda x: le.transform([x])[0]
                                      if type(x) == str else x)

    #drop all features left with empty (NaN) values
    df = df.dropna()

    # extract rows with empty value of selected feature into own db
    df_null = df[df[sel_feature] == 0]
    df = df[df[sel_feature] != 0]

    return [df, df_null, embedding_maps[sel_feature]]
Esempio n. 2
0
def db_insert_section(section=1):

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    fields = [
        'unique_mem_id', 'unique_bank_account_id',
        'unique_bank_transaction_id', 'amount', 'currency', 'description',
        'transaction_base_type', 'transaction_category_name',
        'primary_merchant_name', 'city', 'state', 'transaction_origin',
        'optimized_transaction_date', 'account_type', 'account_source_type',
        'account_score', 'user_score', 'panel_file_created_date'
    ]

    try:
        filter_query = f"(SELECT {', '.join(field for field in fields)} FROM card_record \
                        WHERE unique_mem_id IN (SELECT unique_mem_id FROM user_demographic\
                        ORDER BY unique_mem_id ASC LIMIT 10000 OFFSET\
                        {10000*(section-1)})) UNION ALL (SELECT {', '.join(field for field in fields)} \
                         FROM bank_record WHERE unique_mem_id IN (SELECT unique_mem_id FROM \
                          user_demographic ORDER BY unique_mem_id ASC LIMIT 10000 OFFSET {10000*(section-1)}))"

        transaction_query = execute_read_query(connection, filter_query)
        main_df = pd.DataFrame(transaction_query, columns=fields)
        print(f"{len(transaction_query)} transactions.")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback

    for num, user in enumerate(main_df.groupby('unique_mem_id')):
        print(
            f"user {user[0]}, {num+1}/10000 users, {round(((num+1)/10000)*100, 2)}%."
        )

        # running functions in order
        df, df_null, embedding_map = df_encoder(
            df=main_df, sel_feature='primary_merchant_name')

        X_features, X_test, X_train, y_test, y_train = split_data(
            full_df=df, null_df=df_null)

        grid_search_svc = svc_class(X_train, X_test, y_train, y_test)

        prediction_df, predictions = add_pred(grid_search=grid_search_svc,
                                              X_features=X_features,
                                              label='primary_merchant_name')

        merchants = []

        for i in predictions:
            for val, enc in embedding_map.items():
                if enc in predictions:
                    merchants.append(val)
                else:
                    merchants.append("unseen in training")
        merchants_short = merchants[:len(predictions)]

        # attach merchants to prediction_df; list will be turned into df column
        prediction_df = prediction_df.assign(merchants=merchants_short)

        card_ess = ['Rewards', 'Transfers', 'Refunds/Adjustments', 'Gifts']
        card_non_ess = [
            'Groceries', 'Automotive/Fuel', 'Home Improvement', 'Travel',
            'Restaurants', 'Healthcare/Medical', 'Credit Card Payments',
            'Electronics/General Merchandise', 'Entertainment/Recreation',
            'Postage/Shipping', 'Other Expenses', 'Personal/Family',
            'Service Charges/Fees', 'Services/Supplies', 'Utilities',
            'Office Expenses', 'Cable/Satellite/Telecom',
            'Subscriptions/Renewals', 'Insurance'
        ]
        bank_ess = [
            'Deposits', 'Salary/Regular Income', 'Transfers',
            'Investment/Retirement Income', 'Rewards', 'Other Income',
            'Refunds/Adjustments', 'Interest Income', 'Gifts',
            'Expense Reimbursement'
        ]
        bank_non_ess = [
            'Service Charges/Fees', 'Credit Card Payments', 'Utilities',
            'Healthcare/Medical', 'Loans', 'Check Payment',
            'Electronics/General Merchandise', 'Groceries', 'Automotive/Fuel',
            'Restaurants', 'Personal/Family', 'Entertainment/Recreation',
            'Services/Supplies', 'Other Expenses', 'ATM/Cash Withdrawals',
            'Cable/Satellite/Telecom', 'Postage/Shipping', 'Insurance',
            'Travel', 'Taxes', 'Home Improvement', 'Education',
            'Charitable Giving', 'Subscriptions/Renewals', 'Rent',
            'Office Expenses', 'Mortgage'
        ]

        # Iterate through rows and mark transactions
        try:
            tr_class = pd.Series([], dtype='object')
            for index, i in enumerate(
                    prediction_df['transaction_category_name']):
                if i in bank_ess:
                    tr_class[index] = "essential"
                elif i in card_ess:
                    tr_class[index] = "essential"
                elif i in bank_non_ess:
                    tr_class[index] = "non_essential"
                elif i in card_non_ess:
                    tr_class[index] = "non_essential"
                else:
                    tr_class[index] = "unknown"
            prediction_df = prediction_df.assign(
                tr_essentiality=tr_class.values)
        except BaseException as error:
            print(f"column is already existing or following: {error}")

        cash_env = [
            'Rewards', 'Transfers', 'Refunds/Adjustments', 'Interest Income',
            'Restaurants', 'Electronics/General Merchandise',
            'Entertainment/Recreation', 'Postage/Shipping', 'Other Expenses',
            'Other Income', 'Expense Reimbursement', 'Personal/Family',
            'Travel', 'Office Expenses', 'Deposits', 'Salary/Regular Income',
            'Investment/Retirement Income', 'ATM/Cash Withdrawals'
        ]

        bill_env = [
            'Check Payment', 'Rent', 'Mortgage', 'Subscriptions/Renewals',
            'Healthcare/Medical', 'Credit Card Payments',
            'Service Charges/Fees', 'Services/Supplies', 'Utilities',
            'Insurance', 'Taxes', 'Home Improvement', 'Cable/Satellite/Telecom'
        ]

        car_env = ['Automotive/Fuel']

        gro_env = ['Groceries']

        chari_env = ['Charitable Giving', 'Gifts']

        #iterate through rows and create a new columns with a note that it is either an expense or income
        try:
            envel_cat = pd.Series([], dtype='object')
            for index, i in enumerate(
                    prediction_df['transaction_category_name']):
                if i in cash_env:
                    envel_cat[index] = "cash"
                elif i in bill_env:
                    envel_cat[index] = "bill"
                elif i in car_env:
                    envel_cat[index] = "automobile"
                elif i in gro_env:
                    envel_cat[index] = "groceries"
                elif i in chari_env:
                    envel_cat[index] = "charity"
                else:
                    envel_cat[index] = "not_classified"
            prediction_df = prediction_df.assign(
                envelope_category=envel_cat.values)
        except BaseException as error:
            print(f"envelope column is already existing or following {error}")

        db_name = "postgres"
        db_user = "******"
        db_pw = "envel"
        db_host = "0.0.0.0"
        db_port = "5432"
        '''
        Always use %s placeholder for queries; psycopg2 will convert most data automatically
        For special cases or conversion problems use adapters or "AsIs"
        Enclose the tuples in square brackets or leave without square brackets (no performance diff)
        Dataframes have to be split up into tuples and passed as list
        '''

        try:
            connection = create_connection(db_name=db_name,
                                           db_user=db_user,
                                           db_password=db_pw,
                                           db_host=db_host,
                                           db_port=db_port)
            print("Sesson_1\n-------------")
            cursor = connection.cursor()
            sql_insert_query = """
            INSERT INTO test (test_col, test_col_2, test_col_3, test_col_4, test_col_5,
                              test_col_6, test_col_7, test_col_8, test_col_9, test_col_10,
                              test_col_11, test_col_12, test_col_13, test_col_14,
                              test_col_15, test_col_16, test_col_17, test_col_18,
                              test_col_19, test_col_20)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    %s, %s, %s, %s, %s, %s, %s);
            """

            for i in prediction_df.itertuples():
                # executemany() to insert multiple rows rows
                # one-element-tuple with (i, )
                cursor.execute(sql_insert_query, ([
                    i.unique_mem_id, i.unique_bank_account_id,
                    i.unique_bank_transaction_id, i.amount, i.description,
                    i.transaction_base_type, i.transaction_category_name,
                    i.city, i.state, i.transaction_origin,
                    i.optimized_transaction_date, i.account_type,
                    i.account_source_type, i.account_score, i.user_score,
                    i.panel_file_created_date, i.primary_merchant_name,
                    i.merchants, i.tr_essentiality, i.envelope_category
                ]))

            connection.commit()
            print(len(prediction_df), "record(s) inserted successfully.")

        except (Exception, psycopg2.Error) as error:
            print("Failed inserting record; {}".format(error))

        finally:
            # closing database connection.
            if (connection):
                cursor.close()
                connection.close()
                print("Operation completed.\nPostgreSQL connection is closed.")
        print("=========================")
        return 'insert done'
Esempio n. 3
0
def pull_df(rng=4, spending_report=False, plots=False):
    '''
    Parameters
    ----------
    rng : int, Random Seed for user picker. The default is 4.
    spending_report : bool, Save a spending report in directory if True. Default is False.
    plots : bool, Plots various graphs if True. Default is False.
    Returns
    -------
    df.
    '''

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    # establish connection to get user IDs for all users in MA
    filter_query = f"SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'"
    transaction_query = execute_read_query(connection, filter_query)
    query_df = pd.DataFrame(
        transaction_query,
        columns=['unique_mem_id', 'state', 'city', 'income_class'])

    try:
        for i in pd.Series(query_df['unique_mem_id'].unique()).sample(
                n=1, random_state=rng):
            filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{i}'"
            transaction_query = execute_read_query(connection, filter_query)
            df = pd.DataFrame(
                transaction_query,
                columns=[
                    'unique_mem_id', 'unique_bank_account_id',
                    'unique_bank_transaction_id', 'amount', 'currency',
                    'description', 'transaction_date', 'post_date',
                    'transaction_base_type', 'transaction_category_name',
                    'primary_merchant_name', 'secondary_merchant_name', 'city',
                    'state', 'zip_code', 'transaction_origin',
                    'factual_category', 'factual_id', 'file_created_date',
                    'optimized_transaction_date', 'yodlee_transaction_status',
                    'mcc_raw', 'mcc_inferred', 'swipe_date',
                    'panel_file_created_date', 'update_type', 'is_outlier',
                    'change_source', 'account_type', 'account_source_type',
                    'account_score', 'user_score', 'lag', 'is_duplicate'
                ])
            print(f"User {i} has {len(df)} transactions on record.")
            # all these columns are empty or almost empty and contain no viable information
            df = df.drop(columns=[
                'secondary_merchant_name', 'swipe_date', 'update_type',
                'is_outlier', 'is_duplicate', 'change_source', 'lag',
                'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category',
                'zip_code', 'yodlee_transaction_status', 'file_created_date',
                'panel_file_created_date', 'account_source_type',
                'account_type', 'account_score', 'user_score', 'post_date',
                'transaction_date'
            ],
                         axis=1)
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback
    '''
    Plotting of various relations
    The Counter object keeps track of permutations in a dictionary which can then be read and
    used as labels
    '''
    if plots:
        # Pie chart States
        state_ct = Counter(list(df['state']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels, values = zip(*state_ct.items())
        # Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values,
               labels=labels,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        #ax.title('Transaction locations of user {df[unique_mem_id][0]}')
        ax.legend(loc='center right')
        plt.show()

        # Pie chart transaction type
        trans_ct = Counter(list(df['transaction_category_name']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels_2, values_2 = zip(*trans_ct.items())
        #Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values_2,
               labels=labels_2,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        ax.title("Transaction categories")
        ax.legend(loc='center right')
        plt.show()
    '''
    Generate a spending report of the unaltered dataframe
    Use the datetime columns just defined
    This report measures either the sum or mean of transactions happening
    on various days of the week/or wihtin a week or a month over the course of the year
    '''

    df['optimized_transaction_date'] = pd.to_datetime(
        df['optimized_transaction_date'])

    # set optimized transaction_date as index for later
    df.set_index('optimized_transaction_date', drop=False, inplace=True)

    df = df.drop(['unique_bank_account_id', 'unique_bank_transaction_id'],
                 axis=1)
    # generate the spending report with the above randomly picked user ID
    if spending_report:
        create_spending_report(df=df.copy())

    return df
Esempio n. 4
0
def df_encoder(rng=4,
               spending_report=False,
               plots=False,
               include_lag_features=True):
    '''
    Parameters
    ----------
    rng : int, Random Seed for user picker. The default is 4.
    spending_report : bool, Save a spending report in directory if True. Default is False.
    plots : bool, Plots various graphs if True. Default is False.
    include_lag_features : include lag feature 'amount' to database with 3, 7, and 30 day rolls. Default is True

    Returns
    -------
    bank_df.
    '''

    connection = create_connection(db_name=acc.YDB_name,
                                   db_user=acc.YDB_user,
                                   db_password=acc.YDB_password,
                                   db_host=acc.YDB_host,
                                   db_port=acc.YDB_port)

    # establish connection to get user IDs for all users in MA
    filter_query = f"SELECT unique_mem_id, state, city, income_class FROM user_demographic WHERE state = 'MA'"
    transaction_query = execute_read_query(connection, filter_query)
    query_df = pd.DataFrame(
        transaction_query,
        columns=['unique_mem_id', 'state', 'city', 'income_class'])

    # dateframe to gather bank data from one randomly chosen user
    # test user 1= 4
    # test user 2= 8
    try:
        for i in pd.Series(query_df['unique_mem_id'].unique()).sample(
                n=1, random_state=rng):
            print(i)
            filter_query = f"SELECT * FROM bank_record WHERE unique_mem_id = '{i}'"
            transaction_query = execute_read_query(connection, filter_query)
            bank_df = pd.DataFrame(
                transaction_query,
                columns=[
                    'unique_mem_id', 'unique_bank_account_id',
                    'unique_bank_transaction_id', 'amount', 'currency',
                    'description', 'transaction_date', 'post_date',
                    'transaction_base_type', 'transaction_category_name',
                    'primary_merchant_name', 'secondary_merchant_name', 'city',
                    'state', 'zip_code', 'transaction_origin',
                    'factual_category', 'factual_id', 'file_created_date',
                    'optimized_transaction_date', 'yodlee_transaction_status',
                    'mcc_raw', 'mcc_inferred', 'swipe_date',
                    'panel_file_created_date', 'update_type', 'is_outlier',
                    'change_source', 'account_type', 'account_source_type',
                    'account_score', 'user_score', 'lag', 'is_duplicate'
                ])
            print(f"User {i} has {len(bank_df)} transactions on record.")
            #all these columns are empty or almost empty and contain no viable information
            bank_df = bank_df.drop(columns=[
                'secondary_merchant_name', 'swipe_date', 'update_type',
                'is_outlier', 'is_duplicate', 'change_source', 'lag',
                'mcc_inferred', 'mcc_raw', 'factual_id', 'factual_category',
                'zip_code', 'yodlee_transaction_status', 'file_created_date',
                'panel_file_created_date', 'account_type',
                'account_source_type', 'account_score'
            ],
                                   axis=1)
    except OperationalError as e:
        print(f"The error '{e}' occurred")
        connection.rollback
    '''
    Plotting of various relations
    The Counter object keeps track of permutations in a dictionary which can then be read and
    used as labels
    '''
    if plots:
        # Pie chart States
        state_ct = Counter(list(bank_df['state']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels, values = zip(*state_ct.items())
        # Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values,
               labels=labels,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        #ax.title('Transaction locations of user {bank_df[unique_mem_id][0]}')
        ax.legend(loc='center right')
        plt.show()

        # Pie chart transaction type
        trans_ct = Counter(list(bank_df['transaction_category_name']))
        # The * operator can be used in conjunction with zip() to unzip the list.
        labels_2, values_2 = zip(*trans_ct.items())
        #Pie chart, where the slices will be ordered and plotted counter-clockwise:
        fig1, ax = plt.subplots(figsize=(20, 12))
        ax.pie(values_2,
               labels=labels_2,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        # Equal aspect ratio ensures that pie is drawn as a circle.
        ax.axis('equal')
        #ax.title('Transaction categories of user {bank_df[unique_mem_id][0]}')
        ax.legend(loc='center right')
        plt.show()
    '''
    Generate a spending report of the unaltered dataframe
    Use the datetime columns just defined
    This report measures either the sum or mean of transactions happening
    on various days of the week/or wihtin a week or a month  over the course of the year
    '''
    # convert all date col from date to datetime objects
    # date objects will block Select K Best if not converted
    # first conversion from date to datetime objects; then conversion to unix
    bank_df['post_date'] = pd.to_datetime(bank_df['post_date'])
    bank_df['transaction_date'] = pd.to_datetime(bank_df['transaction_date'])
    bank_df['optimized_transaction_date'] = pd.to_datetime(
        bank_df['optimized_transaction_date'])
    bank_df['file_created_date'] = pd.to_datetime(bank_df['file_created_date'])
    bank_df['panel_file_created_date'] = pd.to_datetime(
        bank_df['panel_file_created_date'])

    # set optimized transaction_date as index for later
    bank_df.set_index('optimized_transaction_date', drop=False, inplace=True)

    # generate the spending report with the above randomly picked user ID
    if spending_report:
        create_spending_report(df=bank_df.copy())
    '''
    After successfully loading the data, columns that are of no importance have been removed and missing values replaced
    Then the dataframe is ready to be encoded to get rid of all non-numerical data
    '''
    try:
        # Include below if need unique ID's later:
        # bank_df['unique_mem_id'] = bank_df['unique_mem_id'].astype(
        #     'str', errors='ignore')
        # bank_df['unique_bank_account_id'] = bank_df['unique_bank_account_id'].astype(
        #     'str', errors='ignore')
        # bank_df['unique_bank_transaction_id'] = bank_df['unique_bank_transaction_id'].astype(
        #     'str', errors='ignore')
        bank_df['amount'] = bank_df['amount'].astype('float64')
        bank_df['transaction_base_type'] = bank_df[
            'transaction_base_type'].replace(to_replace=["debit", "credit"],
                                             value=[1, 0])
    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")

    # attempt to convert date objects to unix timestamps as numeric value (fl64) if they have no missing values; otherwise they are being dropped
    date_features = [
        'post_date', 'transaction_date', 'optimized_transaction_date',
        'file_created_date', 'panel_file_created_date'
    ]
    try:
        for feature in date_features:
            if bank_df[feature].isnull().sum() == 0:
                bank_df[feature] = bank_df[feature].apply(
                    lambda x: dt.timestamp(x))
            else:
                bank_df = bank_df.drop(columns=feature, axis=1)
                print(f"Column {feature} dropped")

    except (TypeError, OSError, ValueError) as e:
        print(f"Problem with conversion: {e}")
    '''
    The columns PRIMARY_MERCHANT_NAME; CITY, STATE, DESCRIPTION, TRANSACTION_CATEGORY_NAME, CURRENCY
    are encoded manually and cleared of empty values
    '''
    encoding_features = [
        'primary_merchant_name', 'city', 'state', 'description',
        'transaction_category_name', 'transaction_origin', 'currency'
    ]
    UNKNOWN_TOKEN = '<unknown>'
    embedding_maps = {}
    for feature in encoding_features:
        unique_list = bank_df[feature].unique().astype('str').tolist()
        unique_list.append(UNKNOWN_TOKEN)
        le = LabelEncoder()
        le.fit_transform(unique_list)
        embedding_maps[feature] = dict(
            zip(le.classes_, le.transform(le.classes_)))

        # APPLICATION TO OUR DATASET
        bank_df[feature] = bank_df[feature].apply(
            lambda x: x if x in embedding_maps[feature] else UNKNOWN_TOKEN)
        bank_df[feature] = bank_df[feature].map(lambda x: le.transform([x])[0]
                                                if type(x) == str else x)

    # dropping currency if there is only one
    if len(bank_df['currency'].value_counts()) == 1:
        bank_df = bank_df.drop(columns=['currency'], axis=1)
    '''
    IMPORTANT
    The lagging features produce NaN for the first two rows due to unavailability
    of values
    NaNs need to be dropped to make scaling and selection of features working
    '''
    if include_lag_features:
        #FEATURE ENGINEERING
        #typical engineered features based on lagging metrics
        #mean + stdev of past 3d/7d/30d/ + rolling volume
        date_index = bank_df.index.values
        bank_df.reset_index(drop=True, inplace=True)
        #pick lag features to iterate through and calculate features
        lag_features = ["amount"]
        #set up time frames; how many days/months back/forth
        t1 = 3
        t2 = 7
        t3 = 30
        #rolling values for all columns ready to be processed
        bank_df_rolled_3d = bank_df[lag_features].rolling(window=t1,
                                                          min_periods=0)
        bank_df_rolled_7d = bank_df[lag_features].rolling(window=t2,
                                                          min_periods=0)
        bank_df_rolled_30d = bank_df[lag_features].rolling(window=t3,
                                                           min_periods=0)

        #calculate the mean with a shifting time window
        bank_df_mean_3d = bank_df_rolled_3d.mean().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_mean_7d = bank_df_rolled_7d.mean().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_mean_30d = bank_df_rolled_30d.mean().shift(
            periods=1).reset_index().astype(np.float32)

        #calculate the std dev with a shifting time window
        bank_df_std_3d = bank_df_rolled_3d.std().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_std_7d = bank_df_rolled_7d.std().shift(
            periods=1).reset_index().astype(np.float32)
        bank_df_std_30d = bank_df_rolled_30d.std().shift(
            periods=1).reset_index().astype(np.float32)

        for feature in lag_features:
            bank_df[f"{feature}_mean_lag{t1}"] = bank_df_mean_3d[feature]
            bank_df[f"{feature}_mean_lag{t2}"] = bank_df_mean_7d[feature]
            bank_df[f"{feature}_mean_lag{t3}"] = bank_df_mean_30d[feature]
            bank_df[f"{feature}_std_lag{t1}"] = bank_df_std_3d[feature]
            bank_df[f"{feature}_std_lag{t2}"] = bank_df_std_7d[feature]
            bank_df[f"{feature}_std_lag{t3}"] = bank_df_std_30d[feature]

        bank_df.set_index(date_index, drop=False, inplace=True)

    #drop all features left with empty (NaN) values
    bank_df = bank_df.dropna()
    #drop user IDs to avoid overfitting with useless information
    bank_df = bank_df.drop([
        'unique_mem_id', 'unique_bank_account_id', 'unique_bank_transaction_id'
    ],
                           axis=1)

    if plots:
        # seaborn plots
        ax_desc = bank_df['description'].astype('int64', errors='ignore')
        ax_amount = bank_df['amount'].astype('int64', errors='ignore')
        sns.pairplot(bank_df)
        sns.boxplot(x=ax_desc, y=ax_amount)
        sns.heatmap(bank_df)

    return bank_df