# Shared imports assumed by the feature-engineering steps below; load_from_cache,
# save_to_cache, encode_average, encode_categoricals, out_of_fold_predict, mlogloss
# and _clean_desc are project-local helpers that are not defined in this file.
import logging

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from textblob import TextBlob

logger = logging.getLogger(__name__)


def fte_sales_channels(train, test, y, db_conn, folds, cache_file):

  cache_key_train = 'fte_sales_channels_train'
  cache_key_test = 'fte_sales_channels_test'

  # Check if cache file exists and if data for this step is cached
  train_cached, test_cached = load_from_cache(cache_file, cache_key_train, cache_key_test)
  if train_cached is not None and test_cached is not None:
      logger.info('fte_sales_channels - Cache found, will use cached data')
      train = pd.concat([train, train_cached], axis = 1, copy = False)
      test = pd.concat([test, test_cached], axis = 1, copy = False)
      return train, test, y, db_conn, folds, cache_file

  logger.info('fte_sales_channels - Cache not found, will recompute from scratch')

  ########################################################

  def _trans(df, table):
    df['p_avg_seller_size'] = encode_average(df, db_conn, table, 'previous_application', 'SELLERPLACE_AREA')
    df['p_avg_channel_size'] = encode_average(df, db_conn, table, 'previous_application', 'CHANNEL_TYPE')

  _trans(train, "application_train")
  _trans(test, "application_test")

  ########################################################

  columns = ['p_avg_seller_size', 'p_avg_channel_size']
  logger.info(f'Caching features in {cache_file}')
  train_cache = train[columns]
  test_cache = test[columns]
  save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache, test_cache)

  return train, test, y, db_conn, folds, cache_file
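
# load_from_cache / save_to_cache are project-local helpers that are not defined in
# this file. A minimal sketch of what they might look like, assuming a single pickled
# dict keyed by the cache_key strings (an assumption, not the project's actual
# implementation, which could just as well use HDF5 or joblib):
def load_from_cache_sketch(cache_file, cache_key_train, cache_key_test):
    import os
    import pickle
    # Return (None, None) when the cache file or either key is missing,
    # so callers fall through to the recompute branch.
    if not os.path.exists(cache_file):
        return None, None
    with open(cache_file, 'rb') as f:
        cache = pickle.load(f)
    return cache.get(cache_key_train), cache.get(cache_key_test)


def save_to_cache_sketch(cache_file, cache_key_train, cache_key_test, train_data, test_data):
    import os
    import pickle
    # Merge the newly computed features into the existing cache under their step keys.
    cache = {}
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            cache = pickle.load(f)
    cache[cache_key_train] = train_data
    cache[cache_key_test] = test_data
    with open(cache_file, 'wb') as f:
        pickle.dump(cache, f)
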
def fte_pos_cash_aggregate(train, test, y, db_conn, folds, cache_file):

    cache_key_train = 'fte_pos_cash_aggregate_train'
    cache_key_test = 'fte_pos_cash_aggregate_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info(
            'fte_pos_cash_aggregate - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_pos_cash_aggregate - Cache not found, will recompute from scratch'
    )

    ########################################################

    # SQLite doesn't have a stddev aggregate function, so fall back to Pandas
    # Note that the table has ~10M rows
    pos_cash = pd.read_sql_query(
        'select * FROM POS_CASH_balance ORDER BY SK_ID_CURR ASC;', db_conn)

    # Create the aggregate
    agg_POS_CASH = pos_cash.groupby('SK_ID_CURR').agg(
        dict(
            MONTHS_BALANCE=["sum", "mean", "max", "min", "std"],
            CNT_INSTALMENT=["sum", "mean", "max", "min", "std"],
            CNT_INSTALMENT_FUTURE=["sum", "mean", "max", "min", "std"],
            SK_DPD=["sum", "mean", "max", "std"],  # dropping min
            SK_DPD_DEF=["sum", "mean", "max", "min", "std"],
            SK_ID_CURR='count'))
    agg_POS_CASH.columns = pd.Index(
        [e[0] + "_" + e[1] for e in agg_POS_CASH.columns.tolist()])

    train = train.merge(agg_POS_CASH,
                        left_on='SK_ID_CURR',
                        right_index=True,
                        how='left',
                        copy=False)
    test = test.merge(agg_POS_CASH,
                      left_on='SK_ID_CURR',
                      right_index=True,
                      how='left',
                      copy=False)

    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[agg_POS_CASH.columns]
    test_cache = test[agg_POS_CASH.columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
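
# For reference, the column flattening in fte_pos_cash_aggregate turns the
# (column, statistic) MultiIndex produced by groupby().agg() into flat names such as
# 'MONTHS_BALANCE_sum', 'SK_DPD_max' or 'SK_ID_CURR_count'. A tiny self-contained
# illustration of that idiom on toy data (not the Home Credit tables):
def _demo_flatten_agg_columns():
    toy = pd.DataFrame({'SK_ID_CURR': [1, 1, 2], 'SK_DPD': [0, 3, 5]})
    agg = toy.groupby('SK_ID_CURR').agg({'SK_DPD': ["sum", "max"]})
    agg.columns = pd.Index([e[0] + "_" + e[1] for e in agg.columns.tolist()])
    return agg.columns.tolist()  # ['SK_DPD_sum', 'SK_DPD_max']
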
def fte_missed_installments(train, test, y, db_conn, folds, cache_file):

    cache_key_train = 'fte_missed_installments_train'
    cache_key_test = 'fte_missed_installments_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info(
            'fte_missed_installments - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_missed_installments - Cache not found, will recompute from scratch'
    )

    ########################################################

    query = """
  select
    SK_ID_CURR,
    AMT_INSTALMENT - AMT_PAYMENT AS DIFF_EXPECTED_PMT,
    DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT AS DAYS_LATE
  from
    installments_payments
  """

    installments_diff = pd.read_sql_query(query, db_conn)
    agg_installments_diff = installments_diff.groupby('SK_ID_CURR').agg(
        ["sum", "mean", "max", "min", "std", "count"])
    agg_installments_diff.columns = pd.Index(
        [e[0] + "_" + e[1] for e in agg_installments_diff.columns.tolist()])

    train = train.merge(agg_installments_diff,
                        left_on='SK_ID_CURR',
                        right_index=True,
                        how='left',
                        copy=False)
    test = test.merge(agg_installments_diff,
                      left_on='SK_ID_CURR',
                      right_index=True,
                      how='left',
                      copy=False)

    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[agg_installments_diff.columns]
    test_cache = test[agg_installments_diff.columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def fte_app_categoricals(train, test, y, db_conn, folds, cache_file):

    cache_key_train = 'fte_app_categoricals_train'
    cache_key_test = 'fte_app_categoricals_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_app_categoricals - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_app_categoricals - Cache not found, will recompute from scratch')

    ########################################################

    def _trans(df, table):
        df['NAME_CONTRACT_TYPE'] = encode_categoricals(df, db_conn, table,
                                                       'NAME_CONTRACT_TYPE')
        df['NAME_TYPE_SUITE'] = encode_categoricals(df, db_conn, table,
                                                    'NAME_TYPE_SUITE')
        df['OCCUPATION_TYPE'] = encode_categoricals(df, db_conn, table,
                                                    'OCCUPATION_TYPE')
        df['ORGANIZATION_TYPE'] = encode_categoricals(df, db_conn, table,
                                                      'ORGANIZATION_TYPE')
        df['NAME_INCOME_TYPE'] = encode_categoricals(df, db_conn, table,
                                                     'NAME_INCOME_TYPE')
        df['NAME_EDUCATION_TYPE'] = encode_categoricals(
            df, db_conn, table, 'NAME_EDUCATION_TYPE')
        df['NAME_FAMILY_STATUS'] = encode_categoricals(df, db_conn, table,
                                                       'NAME_FAMILY_STATUS')
        df['NAME_HOUSING_TYPE'] = encode_categoricals(df, db_conn, table,
                                                      'NAME_HOUSING_TYPE')

    _trans(train, "application_train")
    _trans(test, "application_test")

    ########################################################

    columns = [
        'NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE',
        'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE'
    ]
    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
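
# encode_categoricals is a project-local helper whose implementation is not shown
# here. A plausible stand-in, assuming it reads the raw categorical column from the
# SQLite table (aligned on SK_ID_CURR order, like the other queries in this file) and
# integer-encodes it; a real implementation should fit the encoding on train and test
# jointly so the codes stay consistent between the two tables:
def encode_categoricals_sketch(df, db_conn, table, column):
    # df is kept only to mirror the real helper's call signature; it is unused here.
    raw = pd.read_sql_query(
        f'SELECT {column} FROM {table} ORDER BY SK_ID_CURR ASC;', db_conn)
    # Category codes; missing values become -1.
    return raw[column].astype('category').cat.codes.values
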
def tr_sentiment(train, test, y, folds, cache_file):

    print("############# Sentiment Analysis step ################")
    cache_key_train = 'nlp_sentiment_train'
    cache_key_test = 'nlp_sentiment_test'

    # Check if cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')

    def _trans(df):
        return {
            'sentiment_polarity':
            df['description'].apply(lambda x: TextBlob(x).sentiment.polarity),
            'sentiment_subjectivity':
            df['description'].apply(
                lambda x: TextBlob(x).sentiment.subjectivity)
        }

    nlp_sentiment_train = _trans(train)
    nlp_sentiment_test = _trans(test)

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  nlp_sentiment_train, nlp_sentiment_test)

    print('Adding features to dataframe')
    train_out = train.assign(**nlp_sentiment_train)
    test_out = test.assign(**nlp_sentiment_test)

    return train_out, test_out, y, folds, cache_file
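
# TextBlob's sentiment API (used in tr_sentiment above) returns a namedtuple whose
# polarity lies in [-1.0, 1.0] and whose subjectivity lies in [0.0, 1.0]; a quick
# standalone check:
def _demo_textblob_sentiment():
    from textblob import TextBlob  # same dependency as tr_sentiment
    blob = TextBlob("Bright, spacious apartment with a great view.")
    print(blob.sentiment.polarity)      # positive for this text
    print(blob.sentiment.subjectivity)  # 0 = objective, 1 = subjective
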
def fte_prev_app_process(train, test, y, db_conn, folds, cache_file):

  cache_key_train = 'fte_prev_app_process_train'
  cache_key_test = 'fte_prev_app_process_test'

  # Check if cache file exists and if data for this step is cached
  train_cached, test_cached = load_from_cache(cache_file, cache_key_train, cache_key_test)
  if train_cached is not None and test_cached is not None:
      logger.info('fte_prev_app_process - Cache found, will use cached data')
      train = pd.concat([train, train_cached], axis = 1, copy = False)
      test = pd.concat([test, test_cached], axis = 1, copy = False)
      return train, test, y, db_conn, folds, cache_file

  logger.info('fte_prev_app_process - Cache not found, will recompute from scratch')

  ########################################################

  def _trans(df, table, columns):
    query = f"""
    select
      avg(-p.DAYS_DECISION) / 365.25 p_avg_years_decision,
      avg(case(p.NAME_CONTRACT_TYPE)
        when 'Revolving loans' Then NULL
        else -p.DAYS_FIRST_DUE
        end
      ) / 365.25 p_avg_years_first_due,
      avg(case(p.NAME_CONTRACT_TYPE)
        when 'Revolving loans' Then NULL
        else -p.DAYS_LAST_DUE_1ST_VERSION
        end
      ) / 365.25 p_avg_years_last_due_1st_version,
      avg(p.HOUR_APPR_PROCESS_START) p_avg_hour_start,
      sum(case(p.NAME_CONTRACT_TYPE)
        when 'Revolving loans' Then 1
        else 0
        end
      ) p_count_revolving_loans
    from
      {table} app
    left join
      previous_application p on app.SK_ID_CURR = p.SK_ID_CURR
    GROUP BY
      app.SK_ID_CURR
    ORDER BY
      app.SK_ID_CURR ASC;
    """

    df[columns] = pd.read_sql_query(query, db_conn)

    # TODO add currency, otherwise credit is not comparable

  columns = ['p_avg_years_decision',
        'p_avg_years_first_due',
        'p_avg_years_last_due_1st_version',
        'p_avg_hour_start',
        'p_count_revolving_loans'
        ]

  _trans(train, "application_train", columns)
  _trans(test, "application_test", columns)

  ########################################################

  logger.info(f'Caching features in {cache_file}')
  train_cache = train[columns]
  test_cache = test[columns]
  save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache, test_cache)

  return train, test, y, db_conn, folds, cache_file
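
# The "case ... then NULL" pattern in fte_prev_app_process works because SQL aggregate
# functions skip NULLs: mapping 'Revolving loans' rows to NULL excludes them from the
# per-client averages. A tiny in-memory check of that behaviour:
def _demo_sql_avg_skips_nulls():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE t (v REAL);')
    conn.executemany('INSERT INTO t VALUES (?);', [(10.0,), (None,), (20.0,)])
    # avg ignores the NULL row: (10 + 20) / 2 = 15.0, not (10 + 0 + 20) / 3
    result = conn.execute('SELECT avg(v) FROM t;').fetchone()[0]
    conn.close()
    return result  # 15.0
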
def fte_prev_credit_situation(train, test, y, db_conn, folds, cache_file):

  cache_key_train = 'fte_prev_credit_situation_train'
  cache_key_test = 'fte_prev_credit_situation_test'

  # Check if cache file exists and if data for this step is cached
  train_cached, test_cached = load_from_cache(cache_file, cache_key_train, cache_key_test)
  if train_cached is not None and test_cached is not None:
      logger.info('fte_prev_credit_situation - Cache found, will use cached data')
      train = pd.concat([train, train_cached], axis = 1, copy = False)
      test = pd.concat([test, test_cached], axis = 1, copy = False)
      return train, test, y, db_conn, folds, cache_file

  logger.info('fte_prev_credit_situation - Cache not found, will recompute from scratch')

  ########################################################

  def _trans(df, table, columns):
    query = f"""
    select
      IFNULL(sum(case(p.NAME_CONTRACT_TYPE) when 'Consumer loans' then 1 else 0 end), 0) AS p_total_consumer_loans,
      IFNULL(sum(case(p.NAME_CONTRACT_TYPE) when 'Cash loans' then 1 else 0 end), 0) AS p_total_cash_loans,
      IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Refused' then 1 else 0 end), 0) AS p_total_application_refused,
      IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Canceled' then 1 else 0 end), 0) AS p_total_application_canceled,
      IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Approved' then 1 else 0 end), 0) AS p_total_application_approved,
      -- IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Unused offer' then 1 else 0 end), 0) AS p_total_application_unused,
      IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Refused' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_refused,
      -- IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Canceled' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_canceled,
      IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Approved' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_approved,
      IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Unused offer' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_unused,
      avg(case(p.NAME_CONTRACT_STATUS) when 'Approved' then 1.0 when 'Unused offer' then 1.0 else 0.0 end) AS p_ratio_app_approved_total,
      IFNULL(avg(p.AMT_APPLICATION - p.AMT_CREDIT), 0) AS p_avg_diff_asked_offered,
      IFNULL(avg(p.CNT_PAYMENT), 0) AS p_avg_payment_schedule,
      IFNULL(avg(p.AMT_ANNUITY), 0) AS p_avg_annuity,
      IFNULL(avg(p.CNT_PAYMENT), 0) / app.AMT_ANNUITY AS p_ratio_annuity_p_app,
      avg(p.AMT_DOWN_PAYMENT) AS p_avg_down_payment
    from
      {table} app
    left join
      previous_application p on app.SK_ID_CURR = p.SK_ID_CURR
    GROUP BY
      app.SK_ID_CURR
    ORDER BY
      app.SK_ID_CURR ASC;
    """

    df[columns] = pd.read_sql_query(query, db_conn)

    # TODO add currency, otherwise credit is not comparable

  columns = ['p_total_consumer_loans',
        'p_total_cash_loans',
        'p_total_application_refused',
        'p_total_application_canceled',
        'p_total_application_approved',
        # 'p_total_application_unused',
        'p_avg_application_refused',
        # 'p_avg_application_canceled',
        'p_avg_application_approved',
        'p_avg_application_unused',
        'p_ratio_app_approved_total',
        'p_avg_diff_asked_offered',
        'p_avg_payment_schedule',
        'p_avg_annuity',
        'p_ratio_annuity_p_app',
        'p_avg_down_payment']

  _trans(train, "application_train", columns)
  _trans(test, "application_test", columns)

  ########################################################

  logger.info(f'Caching features in {cache_file}')
  train_cache = train[columns]
  test_cache = test[columns]
  save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache, test_cache)

  return train, test, y, db_conn, folds, cache_file
def tr_tfidf_lsa_lgb(train, test, y, folds, cache_file):
    print("############# TF-IDF + LSA step ################")
    cache_key_train = 'tfidf_lsa_lgb_train'
    cache_key_test = 'tfidf_lsa_lgb_test'

    # Check if cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')
    vectorizer = TfidfVectorizer(max_features=2**16,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=True)

    train_raw, test_raw = _clean_desc(train, test)
    train_vect = vectorizer.fit_transform(train_raw['CleanDesc'])
    test_vect = vectorizer.transform(test_raw['CleanDesc'])
    # print(vectorizer.get_feature_names())

    svd = TruncatedSVD(100)  #Recommended 100 dimensions for LSA
    lsa = make_pipeline(
        svd,
        # Normalizer(copy=False)  # Not needed for tree ensembles, and leaky on CV
    )

    # Fit the SVD on the training data, then project both train and test onto the LSA space.
    X_train_lsa = lsa.fit_transform(train_vect)
    X_test_lsa = lsa.transform(test_vect)
    #explained_variance = svd.explained_variance_ratio_.sum()
    #print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

    le = LabelEncoder()
    y_encode = le.fit_transform(y)

    # Split train into train + validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_lsa,
                                                      y_encode,
                                                      test_size=0.2,
                                                      random_state=42)

    # train
    gbm = LGBMClassifier(n_estimators=2048,
                         seed=42,
                         objective='multiclass',
                         colsample_bytree=0.8,
                         subsample=0.8)

    # Predict out-of-fold train data
    print('Start training - Number of folds: ', len(folds))
    train_predictions = out_of_fold_predict(gbm, X_train_lsa, y_encode, folds)

    tfidf_train_names = {
        'tfidf_' + le.classes_[0]: [row[0] for row in train_predictions],
        'tfidf_' + le.classes_[1]: [row[1] for row in train_predictions],
        'tfidf_' + le.classes_[2]: [row[2] for row in train_predictions]
    }

    gbm.fit(X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            early_stopping_rounds=50,
            verbose=False)

    # Now validate the predictions using the previously split validation set
    print('Start validating TF-IDF + LSA...')
    # predict
    y_pred = gbm.predict_proba(X_val, num_iteration=gbm.best_iteration)
    # eval
    print('We stopped at boosting round: ', gbm.best_iteration)
    print('The mlogloss of prediction is:', mlogloss(y_val, y_pred))

    # Now compute predictions for the actual test data with the fitted model
    print('Start predicting TF-IDF + LSA...')
    test_predictions = gbm.predict_proba(X_test_lsa,
                                         num_iteration=gbm.best_iteration)

    tfidf_test_names = {
        'tfidf_' + le.classes_[0]: [row[0] for row in test_predictions],
        'tfidf_' + le.classes_[1]: [row[1] for row in test_predictions],
        'tfidf_' + le.classes_[2]: [row[2] for row in test_predictions]
    }

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  tfidf_train_names, tfidf_test_names)

    print('Adding features to dataframe')
    train_out = train.assign(**tfidf_train_names)
    test_out = test.assign(**tfidf_test_names)

    return train_out, test_out, y, folds, cache_file
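
# out_of_fold_predict is a project-local helper. A minimal sketch of the usual
# out-of-fold stacking pattern it appears to implement (an assumption, not the
# project's exact code): fit a clone of the estimator on each training fold and
# predict class probabilities for the held-out fold, so every training row gets a
# prediction from a model that never saw it.
def out_of_fold_predict_sketch(estimator, X, y, folds):
    import numpy as np
    from sklearn.base import clone
    oof = np.zeros((X.shape[0], len(np.unique(y))))
    for train_idx, valid_idx in folds:
        est = clone(estimator)
        est.fit(X[train_idx], y[train_idx])
        oof[valid_idx] = est.predict_proba(X[valid_idx])
    return oof
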
def fte_bureau_credit_situation(train, test, y, db_conn, folds, cache_file):

    cache_key_train = 'fte_bureau_credit_situation_train'
    cache_key_test = 'fte_bureau_credit_situation_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info(
            'fte_bureau_credit_situation - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_bureau_credit_situation - Cache not found, will recompute from scratch'
    )

    ########################################################

    def _trans(df, table, columns):
        query = f"""
    select
      IFNULL(count(b.SK_ID_BUREAU), 0) AS b_total_prev_applications,
      IFNULL(sum(case CREDIT_ACTIVE when 'Active' then 1 else 0 end), 0) AS b_current_active_credit,
      IFNULL(sum(case CREDIT_ACTIVE when 'Sold' then 1 else 0 end), 0) AS b_sold_credit,
      IFNULL(sum(case CREDIT_ACTIVE when 'Bad debt' then 1 else 0 end), 0) AS b_bad_debt_credit,
      IFNULL(avg(case CREDIT_ACTIVE when 'Active' then 1 else 0 end), 0) AS b_active_applications_ratio,
      IFNULL(count(distinct CREDIT_TYPE), 0) AS b_nb_distinct_credit_type,
      IFNULL(sum(AMT_CREDIT_SUM), 0) AS b_total_prev_credit,
      IFNULL(sum(case CREDIT_ACTIVE when 'Active' then AMT_CREDIT_SUM else 0 end), 0) AS b_active_credit_amount,
      IFNULL(sum(AMT_CREDIT_SUM_DEBT), 0) AS b_current_debt,
      IFNULL(sum(AMT_CREDIT_MAX_OVERDUE), 0) AS b_current_overdue,
      IFNULL(max(-DAYS_CREDIT), 99 * 365.25) / 365.25 AS b_first_credit_years_ago,
      IFNULL(min(-DAYS_CREDIT), 99 * 365.25) / 365.25 AS b_last_credit_years_ago,
      IFNULL(max(DAYS_CREDIT_ENDDATE), -99 * 365.25) / 365.25 AS b_existing_credit_close_date,
      IFNULL(max(-DAYS_ENDDATE_FACT), 99 * 365.25) / 365.25 AS b_years_since_no_card_credit,
      -- IFNULL(min(-DAYS_CREDIT_UPDATE), 99 * 365.25) AS b_last_DAYS_CREDIT_UPDATE,
      case CREDIT_CURRENCY
        when 'currency_1' then 0
        when 'currency_2' then 1
        when 'currency_3' then 2
        else 3
      end b_currency,
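      -- note: CREDIT_CURRENCY is not aggregated, so SQLite fills this bare column
      -- from an arbitrary row of each SK_ID_CURR group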
      IFNULL(sum(CNT_CREDIT_PROLONG), 0) AS b_credit_days_extension
    from
      {table} app
    left join
      bureau b on app.SK_ID_CURR = b.SK_ID_CURR
    GROUP BY
      app.SK_ID_CURR
    ORDER BY
      app.SK_ID_CURR ASC;
    """

        df[columns] = pd.read_sql_query(query, db_conn)

        # TODO add currency, otherwise credit is not comparable

    columns = [
        'b_total_prev_applications',
        'b_current_active_credit',
        'b_sold_credit',
        'b_bad_debt_credit',
        'b_active_applications_ratio',
        'b_nb_distinct_credit_type',
        'b_total_prev_credit',
        'b_active_credit_amount',
        'b_current_debt',
        'b_current_overdue',
        'b_first_credit_years_ago',
        'b_last_credit_years_ago',
        'b_existing_credit_close_date',
        'b_years_since_no_card_credit',
        # 'b_last_DAYS_CREDIT_UPDATE',
        'b_currency',
        'b_credit_days_extension'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)

    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def tr_managerskill(train, test, y, folds, cache_file):
    print("\n\n############# Manager skill step ################")
    cache_key_train = 'managerskill_train'
    cache_key_test = 'managerskill_test'

    # Check if cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')
    lb = LabelBinarizer(sparse_output=True)
    lb.fit(list(train['manager_id'].values) + list(test['manager_id'].values))

    X_train_mngr = lb.transform(train['manager_id']).astype(np.float32)
    X_test_mngr = lb.transform(test['manager_id']).astype(np.float32)

    le = LabelEncoder()
    y_encode = le.fit_transform(y)

    # Split train into train + validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_mngr,
                                                      y_encode,
                                                      test_size=0.2,
                                                      random_state=42)

    # train
    gbm = LGBMClassifier(n_estimators=2048,
                         seed=42,
                         objective='multiclass',
                         colsample_bytree=0.8,
                         subsample=0.8)

    # Predict out-of-fold train data
    print('Start training - Number of folds: ', len(folds))
    train_predictions = out_of_fold_predict(gbm, X_train_mngr, y_encode, folds)

    mngr_train_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in train_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in train_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in train_predictions],
    }
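    # Manager skill score: weight the out-of-fold probability of the 'high' class
    # twice as much as the 'medium' class.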
    mngr_train_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_train_names['mngr_high'],
                                    mngr_train_names['mngr_medium'])
    ]

    gbm.fit(X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            early_stopping_rounds=50,
            verbose=False)

    # Now validate the predictions using the previously split validation set
    print('Start validating Manager skill...')
    # predict
    y_pred = gbm.predict_proba(X_val, num_iteration=gbm.best_iteration)
    # eval
    print('We stopped at boosting round: ', gbm.best_iteration)
    print('The mlogloss of prediction is:', mlogloss(y_val, y_pred))

    # Now compute predictions for the actual test data with the fitted model
    print('Start predicting Manager skill...')
    test_predictions = gbm.predict_proba(X_test_mngr,
                                         num_iteration=gbm.best_iteration)

    mngr_test_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in test_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in test_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in test_predictions]
    }
    mngr_test_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_test_names['mngr_high'],
                                    mngr_test_names['mngr_medium'])
    ]

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  mngr_train_names, mngr_test_names)

    print('Adding features to dataframe')
    train_out = train.assign(**mngr_train_names)
    test_out = test.assign(**mngr_test_names)

    return train_out, test_out, y, folds, cache_file
def fte_application(train, test, y, db_conn, folds, cache_file):

    cache_key_train = 'fte_application_train'
    cache_key_test = 'fte_application_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_application - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_application - Cache not found, will recompute from scratch')

    ########################################################

    def _trans(df, table, columns):
        query = f"""
    select
      OBS_30_CNT_SOCIAL_CIRCLE,
      DEF_30_CNT_SOCIAL_CIRCLE,
      OBS_60_CNT_SOCIAL_CIRCLE,
      DEF_60_CNT_SOCIAL_CIRCLE,
      REGION_POPULATION_RELATIVE,
      --REGION_RATING_CLIENT,
      REGION_RATING_CLIENT_W_CITY,
      REG_CITY_NOT_LIVE_CITY,
      --REG_CITY_NOT_WORK_CITY,
      --REG_REGION_NOT_LIVE_REGION
      --REG_REGION_NOT_WORK_REGION,
      --LIVE_REGION_NOT_WORK_REGION,
      CNT_CHILDREN,
      CNT_FAM_MEMBERS,
      case(CODE_GENDER) when "F" THEN 1 else 0 end isWoman,
      -- case(FLAG_OWN_CAR) when "Y" THEN 1 else 0 end ownCar,
      case(FLAG_OWN_REALTY) when "Y" THEN 1 else 0 end ownRealEstate,
      FLAG_DOCUMENT_3,
      FLOORSMAX_AVG,
      FLOORSMIN_AVG,
      YEARS_BUILD_AVG,
      LIVINGAREA_MODE,
      YEARS_BEGINEXPLUATATION_MODE,
      LANDAREA_MODE,
      BASEMENTAREA_MODE,
      TOTALAREA_MODE,
      APARTMENTS_AVG,
      COMMONAREA_AVG,
      NONLIVINGAREA_MEDI,
      --LIVINGAPARTMENTS_MODE,
      --YEARS_BUILD_MODE,
      --ENTRANCES_AVG,
      --AMT_REQ_CREDIT_BUREAU_DAY,
      --AMT_REQ_CREDIT_BUREAU_HOUR,
      AMT_REQ_CREDIT_BUREAU_MON,
      AMT_REQ_CREDIT_BUREAU_QRT,
      --AMT_REQ_CREDIT_BUREAU_WEEK,
      AMT_REQ_CREDIT_BUREAU_YEAR,
      FLAG_MOBIL,
      --FLAG_EMP_PHONE,
      FLAG_WORK_PHONE,
      --FLAG_CONT_MOBILE,
      --FLAG_PHONE,
      --FLAG_EMAIL,
      EXT_SOURCE_1,
      EXT_SOURCE_2,
      EXT_SOURCE_3
    from
      {table}
    ORDER BY
      SK_ID_CURR ASC
    """

        df[columns] = pd.read_sql_query(query, db_conn)

    columns = [
        'OBS_30_CNT_SOCIAL_CIRCLE',
        'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE',
        'DEF_60_CNT_SOCIAL_CIRCLE',
        'REGION_POPULATION_RELATIVE',
        # 'REGION_RATING_CLIENT',
        'REGION_RATING_CLIENT_W_CITY',
        'REG_CITY_NOT_LIVE_CITY',
        # 'REG_CITY_NOT_WORK_CITY',
        # 'REG_REGION_NOT_LIVE_REGION',
        # 'REG_REGION_NOT_WORK_REGION',
        # 'LIVE_REGION_NOT_WORK_REGION',
        'CNT_CHILDREN',
        'CNT_FAM_MEMBERS',
        'isWoman',
        # 'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'FLAG_DOCUMENT_3',
        'FLOORSMAX_AVG',
        'FLOORSMIN_AVG',
        'YEARS_BUILD_AVG',
        'LIVINGAREA_MODE',
        'YEARS_BEGINEXPLUATATION_MODE',
        'LANDAREA_MODE',
        'BASEMENTAREA_MODE',
        'TOTALAREA_MODE',
        'APARTMENTS_AVG',
        'COMMONAREA_AVG',
        'NONLIVINGAREA_MEDI',
        # 'LIVINGAPARTMENTS_MODE',
        # 'YEARS_BUILD_MODE',
        # 'ENTRANCES_AVG',
        # 'AMT_REQ_CREDIT_BUREAU_DAY',
        # 'AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT',
        # 'AMT_REQ_CREDIT_BUREAU_WEEK',
        'AMT_REQ_CREDIT_BUREAU_YEAR',
        'FLAG_MOBIL',
        #'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        #'FLAG_CONT_MOBILE',
        #'FLAG_PHONE',
        #'FLAG_EMAIL',
        'EXT_SOURCE_1',
        'EXT_SOURCE_2',
        'EXT_SOURCE_3'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)

    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def fte_withdrawals(train, test, y, db_conn, folds, cache_file):

    cache_key_train = 'fte_withdrawals_train'
    cache_key_test = 'fte_withdrawals_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_withdrawals - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_withdrawals - Cache not found, will recompute from scratch')

    ########################################################

    def _trans(df, table, columns):
        query = f"""
    with instalments_per_app AS (
      select
        SK_ID_CURR,
        max(CNT_INSTALMENT_MATURE_CUM) AS nb_installments
      from
        credit_card_balance
      GROUP BY
        SK_ID_CURR, SK_ID_PREV
      ORDER BY
        SK_ID_CURR
      )
    select
      count(distinct SK_ID_PREV) as cb_nb_of_credit_cards,
      avg(CNT_DRAWINGS_ATM_CURRENT) as cb_avg_atm_withdrawal_count,
      avg(AMT_DRAWINGS_ATM_CURRENT) as cb_avg_atm_withdrawal_amount,
      sum(AMT_DRAWINGS_ATM_CURRENT) as cb_sum_atm_withdrawal_amount,
      avg(CNT_DRAWINGS_CURRENT) as cb_avg_withdrawal_count,
      avg(AMT_DRAWINGS_CURRENT) as cb_avg_withdrawal_amount,
      avg(CNT_DRAWINGS_POS_CURRENT) as cb_avg_pos_withdrawal_count,
      avg(AMT_DRAWINGS_POS_CURRENT) as cb_avg_pos_withdrawal_amount,
      avg(SK_DPD) as cb_avg_day_past_due,
      avg(SK_DPD_DEF) as cb_avg_day_past_due_tolerated
      --sum(nb_installments) as instalments_per_app
    FROM
      {table} app
    left join
      credit_card_balance ccb
        on app.SK_ID_CURR = ccb.SK_ID_CURR
    left join
      instalments_per_app ipa
        on ipa.SK_ID_CURR = ccb.SK_ID_CURR
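    -- note: only the commented-out sum(nb_installments) column uses this join; since
    -- ipa has one row per SK_ID_PREV, it also duplicates ccb rows for clients with
    -- several cards, which inflates the sum() aggregate above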
    --where
    --  MONTHS_BALANCE BETWEEN -3 and -1
    group by
      ccb.sk_id_curr
    ORDER by
      app.SK_ID_CURR ASC
    """

        df[columns] = pd.read_sql_query(query, db_conn)

    columns = [
        'cb_nb_of_credit_cards', 'cb_avg_atm_withdrawal_count',
        'cb_avg_atm_withdrawal_amount', 'cb_sum_atm_withdrawal_amount',
        'cb_avg_withdrawal_count', 'cb_avg_withdrawal_amount',
        'cb_avg_pos_withdrawal_count', 'cb_avg_pos_withdrawal_amount',
        'cb_avg_day_past_due', 'cb_avg_day_past_due_tolerated'
        # 'instalments_per_app'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)

    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def fte_pos_cash_current_status(train, test, y, db_conn, folds, cache_file):
    ## Count previous POS/cash credits by current status (Active / Completed / Amortized debt)

    cache_key_train = 'fte_pos_cash_current_status_train'
    cache_key_test = 'fte_pos_cash_current_status_test'

    # Check if cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info(
            'fte_pos_cash_current_status - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info(
        'fte_pos_cash_current_status - Cache not found, will recompute from scratch'
    )

    ########################################################

    # In SQLite we avoid joining on temporary tables/subqueries/WITH statements as they are
    # not indexed and super slow. ORDER BY on the following result is very slow too
    # (~>1min vs 20ms for just dumping the result on an i5-5257U)

    query = """
  SELECT
    SK_ID_CURR, -- SK_ID_PREV,
    NAME_CONTRACT_STATUS AS pos_cash_NAME_CONTRACT_STATUS
  FROM
    POS_CASH_balance
  GROUP BY
    SK_ID_PREV
  HAVING
    MONTHS_BALANCE = max(MONTHS_BALANCE)
  """

    pos_cash_current = pd.read_sql_query(query, db_conn)
    # Pivot
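    # get_dummies creates one indicator column per contract status, and the
    # groupby().sum() then counts, per SK_ID_CURR, how many of the client's
    # POS/cash credits currently sit in each status.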
    pos_cash_current = pd.get_dummies(pos_cash_current,
                                      columns=[
                                          'pos_cash_NAME_CONTRACT_STATUS'
                                      ]).groupby('SK_ID_CURR').sum()

    # TODO: add a proper feature selection phase
    pos_cash_current = pos_cash_current[[
        'pos_cash_NAME_CONTRACT_STATUS_Active',
        'pos_cash_NAME_CONTRACT_STATUS_Completed'
    ]]

    train = train.merge(pos_cash_current,
                        left_on='SK_ID_CURR',
                        right_index=True,
                        how='left',
                        copy=False)
    test = test.merge(pos_cash_current,
                      left_on='SK_ID_CURR',
                      right_index=True,
                      how='left',
                      copy=False)

    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[pos_cash_current.columns]
    test_cache = test[pos_cash_current.columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
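
# Each fte_* step above takes and returns the same 6-tuple (the tr_* steps use a
# 5-tuple without db_conn), so a feature-engineering pipeline can be run by threading
# that tuple through an ordered list of steps. A minimal runner sketch; the step list
# in the usage example is illustrative, not the project's actual pipeline definition:
def run_pipeline_sketch(steps, train, test, y, db_conn, folds, cache_file):
    state = (train, test, y, db_conn, folds, cache_file)
    for step in steps:
        # Every step either loads its features from cache_file or recomputes and
        # caches them, then passes the augmented train/test frames along.
        state = step(*state)
    return state

# Example usage (hypothetical step order):
# train, test, y, db_conn, folds, cache_file = run_pipeline_sketch(
#     [fte_application, fte_app_categoricals, fte_pos_cash_aggregate,
#      fte_missed_installments, fte_bureau_credit_situation, fte_withdrawals],
#     train, test, y, db_conn, folds, cache_file)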