Example #1
def get_mean_feature(tr,
                     val,
                     cols,
                     target,
                     tr_filename="../output/tr_tmp.pkl",
                     val_filename="../output/val_tmp.pkl",
                     cv=5,
                     thresh=2,
                     seed=786,
                     rewrite=False):
    all_cols = cols + [target]

    mean_enc = TargetEncoder(cols=cols, targetcol=target, func='mean')
    cvlist = KFold(n_splits=cv, shuffle=True,
                   random_state=seed).split(tr[target])

    tr_data = cross_val_predict(mean_enc,
                                tr[all_cols],
                                tr[target],
                                cv=cvlist,
                                method='transform',
                                verbose=1)
    val_data = mean_enc.fit(tr[all_cols]).transform(val[all_cols])

    return tr_data, val_data
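The `TargetEncoder` used throughout these examples is a project-specific class that is not shown on this page. As a rough guide to the interface the examples rely on, here is a minimal sketch, assuming a scikit-learn style transformer that aggregates `targetcol` per key of `cols`; the real class likely adds smoothing, thresholds, and caching, so treat this version as hypothetical:

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class TargetEncoder(BaseEstimator, TransformerMixin):
    # Hypothetical minimal version, not the original class.
    def __init__(self, cols, targetcol, func='mean'):
        self.cols = cols
        self.targetcol = targetcol
        self.func = func

    def fit(self, X, y=None):
        # Learn one aggregate of the target per group key.
        self.mapping_ = X.groupby(self.cols)[self.targetcol].agg(self.func)
        return self

    def transform(self, X):
        # Map each row's key to its aggregate; unseen keys become NaN.
        if len(self.cols) > 1:
            keys = pd.MultiIndex.from_frame(X[self.cols])
        else:
            keys = X[self.cols[0]]
        return self.mapping_.reindex(keys).values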
Example #2
def clean_data(df):
    df = df.copy()
    # Task duration; zero durations are treated as missing.
    df['time_elapsed'] = df['task_end_timestamp'] - df['task_start_timestamp']
    df['time_elapsed'] = df['time_elapsed'].replace(0, np.nan)
    part_column = df.part
    te = TargetEncoder('part')
    df = te.fit_transform(df, df['time_elapsed'])
    df['part_original'] = part_column
    df = impute_model_kNN(df, 5)
    return df
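Unlike the other examples, this `TargetEncoder` takes the column positionally and receives the target as a second argument to `fit_transform`, which matches the `category_encoders` API rather than the custom class sketched above. A hedged usage sketch under that assumption, with a made-up toy frame:

import pandas as pd
import category_encoders as ce

toy = pd.DataFrame({"part": ["a", "a", "b", "b", "c"],
                    "time_elapsed": [1.0, 3.0, 2.0, 4.0, 5.0]})
enc = ce.TargetEncoder(cols=["part"])
encoded = enc.fit_transform(toy, toy["time_elapsed"])
print(encoded["part"])  # 'part' replaced by smoothed per-category target means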
Example #3
def get_expanding_count(tr,
                        val,
                        cols,
                        target,
                        tr_filename="../output/tr_tmp.pkl",
                        val_filename="../output/val_tmp.pkl",
                        seed=786,
                        rewrite=False):
    col_name = "_".join(cols) + '_expcount'
    all_cols = cols + [target]
    # Expanding count shifted by one so the current row is excluded; the
    # shift happens inside each group so counts never leak across groups.
    tr[col_name] = (tr.groupby(cols)[target]
                      .transform(lambda s: s.expanding(min_periods=1)
                                 .count().shift())
                      .fillna(0))

    exp_mean = TargetEncoder(cols=cols, targetcol=target, func='count')
    exp_mean.fit(tr[all_cols])
    val[col_name] = exp_mean.transform(val[all_cols])
    val[col_name] = val[col_name].fillna(0)

    return tr[col_name].values, val[col_name].values
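For intuition, a tiny worked example of the shifted expanding count on made-up data: each row receives the number of earlier rows in its own group.

import pandas as pd

toy = pd.DataFrame({"k": ["a", "a", "a", "b"], "y": [1, 2, 3, 4]})
prior = (toy.groupby("k")["y"]
            .transform(lambda g: g.expanding(min_periods=1).count().shift())
            .fillna(0))
print(prior.tolist())  # [0.0, 1.0, 2.0, 0.0]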
Example #4
def get_count_feature(tr,
                      val,
                      cols,
                      target,
                      use_supp=False,
                      test_supp=None,
                      tr_filename="../output/tr_tmp.npy",
                      val_filename="../output/val_tmp.npy",
                      seed=786,
                      rewrite=False):
    all_cols = cols + [target]

    cnt_enc = TargetEncoder(cols=cols, targetcol=target, func='count')
    # Counts carry no target information, so fitting on train plus
    # validation (or supplementary test) rows does not leak the target.
    if use_supp and test_supp is not None:
        cnt_enc.fit(
            pd.concat([tr[all_cols],
                       test_supp[all_cols]]).reset_index(drop=True))
    else:
        cnt_enc.fit(
            pd.concat([tr[all_cols], val[all_cols]]).reset_index(drop=True))
    tr_data = cnt_enc.transform(tr[all_cols])
    val_data = cnt_enc.transform(val[all_cols])

    return tr_data, val_data
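A design note on the contrast with Example #1: mean encoding consumes the target's values, so its train-side features are built out-of-fold via cross_val_predict(..., method='transform'), whereas a count encoding only tallies rows per key and can be fit on train and validation (or supplementary test data) together without leaking the target.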
Example #5
def get_std_feature(tr,
                    val,
                    cols,
                    target,
                    tr_filename="../output/tr_tmp.npy",
                    val_filename="../output/val_tmp.npy",
                    seed=786,
                    rewrite=False):
    all_cols = cols + [target]

    std_enc = TargetEncoder(cols=cols, targetcol=target, func='std')
    std_enc.fit(
        pd.concat([tr[all_cols], val[all_cols]]).reset_index(drop=True))
    tr_data = std_enc.transform(tr[all_cols])
    val_data = std_enc.transform(val[all_cols])

    return tr_data, val_data
Example #6
def get_unq_feature(tr,
                    val,
                    cols,
                    target,
                    tr_filename="../output/tr_tmp.pkl",
                    val_filename="../output/val_tmp.pkl",
                    seed=786,
                    rewrite=False):
    col_name = "_".join(cols) + '_unq_' + target
    all_cols = cols + [target]
    unq_cnt = TargetEncoder(cols=cols, targetcol=target, func='nunique')
    unq_cnt.fit(
        pd.concat([tr[all_cols], val[all_cols]]).reset_index(drop=True))
    tr[col_name] = unq_cnt.transform(tr[all_cols])
    val[col_name] = unq_cnt.transform(val[all_cols])

    tr[col_name] = tr[col_name].fillna(0)
    val[col_name] = val[col_name].fillna(0)

    return (tr[col_name].values.astype(np.int32),
            val[col_name].values.astype(np.int32))
Example #7
    X_imaget1_cat = train["image_top_1"].map(image_top_1_cat).values.reshape(
        -1, 1)
    X_imaget1_cat_test = test["image_top_1"].map(
        image_top_1_cat).values.reshape((-1, 1))
    #pipe = make_pipeline(TfidfVectorizer(ngram_range=(1,1), max_features=100000),
    #                      TruncatedSVD(20))
    #X_tsvd = pipe.fit_transform(train["description"].astype(str))
    #X_tsvd_test = pipe.transform(test["description"].astype(str))
    train["deal_label"] = ContinousBinning(
        bin_array=[-1, 0.05, 0.4, 0.7, 1.1]).fit_transform(
            train["deal_probability"])
    train["deal_label"] = LabelEncoder().fit_transform(train["deal_label"])

    cat_deal_count = TargetEncoder(cols=["category_name"],
                                   targetcol="deal_label",
                                   func=np.bincount)
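    # func=np.bincount produces one class-count vector per category, e.g.
    # np.bincount([0, 2, 2, 3]) -> array([1, 0, 2, 1]). With the four
    # deal_label bins defined above, a complete vector has length 4, which
    # is what the len(arr) == 4 guards below check for.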
    X_cat_bins = cross_val_predict(cat_deal_count,
                                   train,
                                   y=train["deal_probability"],
                                   cv=cvlist,
                                   method="transform") / int(
                                       (1 - 1 / NFOLDS) * len(train))
    X_cat_bins = [
        arr.tolist() if len(arr) == 4 else [0, 0, 0, 0] for arr in X_cat_bins
    ]
    X_cat_bins = np.vstack(X_cat_bins)

    X_cat_bins_test = cat_deal_count.fit(train).transform(test) / len(train)
    X_cat_bins_test = [
        arr.tolist() if len(arr) == 4 else [0, 0, 0, 0]
        for arr in X_cat_bins_test
    ]
    X_cat_bins_test = np.vstack(X_cat_bins_test)
Example #8
    for df in train, test:  #, train_active, test_active:
        df['city'] = df['region'].astype(str) + "_" + df["city"].astype(str)
        # Must be in place: rebinding the loop variable would not touch
        # train/test themselves.
        df.fillna(-1, inplace=True)

    y = train['deal_probability'].values
    cvlist = list(KFold(10, shuffle=True, random_state=123).split(y))

    logger.info("Done. Read data with shape {} and {}".format(
        train.shape, test.shape))

    #########################################################
    ##  Count encode base features                         ##
    #########################################################
    logger.info("Generating count encoding features for base features")
    for col in CAT_COLS:
        trenc = TargetEncoder(cols=[col], targetcol='item_id', func='count')
        try:
            cols = [col] + ['item_id']
            trenc.fit(pd.concat([train[cols], test[cols]
                                 ]))  #train_active[cols], test_active[cols]]))
            X_train = trenc.transform(train)
            X_test = trenc.transform(test)

            logger.info("Saving count features for {}".format(col))
            np.save("../utility/X_train_{}_counts.npy".format(col), X_train)
            np.save("../utility/X_test_{}_counts.npy".format(col), X_test)
        except Exception:
            logger.info("Could not build count features for {}".format(col))
            continue

    #########################################################
    for df in train, test:
        df['city'] = df['region'].astype(str) + "_" + df["city"].astype(str)
        df["param_1_2_3"] = df["param_1"].astype(str) + "_" + \
                            df["param_2"].astype(str) + "_" + \
                            df["param_3"].astype(str)
        df.fillna(-1, inplace=True)

    y = train['deal_probability'].values
    cvlist = list(KFold(5, shuffle=True, random_state=123).split(y))

    logger.info("Done. Read data with shape {} and {}".format(train.shape, test.shape))
    #del train, test

    ################### Unique features #######################################
    logger.info("Get unique counts for different combinations")
    trenc = TargetEncoder(cols=["parent_category_name", "param_1_2_3"],
                          targetcol = "deal_probability",
                          func= 'nunique')
    train["pcat_p123_deal_nunq"] = cross_val_predict(trenc, train, y, cv=cvlist, method='transform')
    test["pcat_p123_deal_nunq"] = trenc.fit(train).transform(test)

    trenc = TargetEncoder(cols=["category_name", "param_1_2_3"],
                          targetcol = "deal_probability",
                          func= 'nunique')
    train["cat_p123_deal_nunq"] = cross_val_predict(trenc, train, y, cv=cvlist, method='transform')
    test["cat_p123_deal_nunq"] = trenc.fit(train).transform(test)

    trenc = TargetEncoder(cols=["city", "param_1_2_3"],
                          targetcol = "deal_probability",
                          func= 'nunique')
    train["city_p123_deal_nunq"] = cross_val_predict(trenc, train, y, cv=cvlist, method='transform')
    test["city_p123_deal_nunq"] = trenc.fit(train).transform(test)
    pipe2_features = ["cat_rel_price", "image_top_rel_price",
                      "param1_rel_price",
                      #"cat_city_rel_price",
                      #"image_city_rel_price"
                     ]


    #########################Target mean features #############################
    logger.info("Processing Targt mean encoding features")
    pipe3 = make_pipeline(
                make_union(
                    make_pipeline(TargetEncoder(cols=['category_name'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    make_pipeline(TargetEncoder(cols=['city'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    make_pipeline(TargetEncoder(cols=['image_top_1'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    make_pipeline(TargetEncoder(cols=['user_id'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    #make_pipeline(TargetEncoder(cols=['param_2'], targetcol='deal_probability', func='mean'),
                    #             FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    make_pipeline(TargetEncoder(cols=['region','parent_category_name'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    make_pipeline(TargetEncoder(cols=['city','category_name'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    make_pipeline(TargetEncoder(cols=['city','image_top_1'], targetcol='deal_probability', func='mean'),
                                 FunctionTransformer(np.reshape, validate=False, kw_args={"newshape":(-1,1)})),
                    # (further union members may have been elided here)
    ))

lgb_params = {
    'reg_lambda': 0.0005,
    'min_data_in_leaf': 100,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'objective': 'binary',
    'num_leaves': 256,
    'verbose': 10
}

# Attach a custom probability-correction helper (defined elsewhere) to Pipeline
Pipeline.predict_proba_corr = predict_proba_corr

cat_feats = [c for c in feats if 'cat_' in c]

fs_sub = FeatureSubsetTransformer(feats)

# 'en' is presumably the category_encoders package; this TargetEncoder variant
# appends the encoded columns to the original frame (add_to_orig=True).
en_target = TargetEncoder(cols=cat_feats, add_to_orig=True)
en_onehot = en.OneHotEncoder(cols=cat_feats)
en_binary = en.BinaryEncoder(cols=cat_feats)

for enc in ['target', 'onehot', 'binary', 'none']:
    lgb1 = lgb.LGBMClassifier(**lgb_params)
    feat_name = "base_feats" + '_' + enc
    if enc == 'target':
        pipe = Pipeline([(feat_name, make_pipeline(fs_sub, en_target)),
                         ('lgb', lgb1)])
    elif enc == 'binary':
        pipe = Pipeline([(feat_name, make_pipeline(fs_sub, en_binary)),
                         ('lgb', lgb1)])
    elif enc == 'onehot':
        pipe = Pipeline([(feat_name, make_pipeline(fs_sub, en_onehot)),