# Run "ann_alt_ngram_wm": bagged NnRegression on the stacked feature blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*, y, ...)
# must already exist when this fragment runs.

# 'anti_1234' feature blocks, one load() call per split.
X_anti_1234 = load('data/train_anti_1234_c1_r')
X_test_anti_1234 = load('data/test_anti_1234_c1_r')

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann_alt_ngram_wm", X, y, X_test,
    generate_test=True, xempty=None,
)
# Run "ann_alt_ngram_wm" (tail of the script): only the 'product_simscore'
# blocks are loaded here; every other feature block, and y, must already be
# defined when this fragment runs.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann_alt_ngram_wm", X, y, X_test,
    generate_test=True, xempty=None,
)
# Run "ann10b_ver2": bagged NnRegression on the stacked feature blocks,
# including the 'train1234_7' / 'test1234_7' blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*,
# X_anti_1234, y, ...) must already exist when this fragment runs.

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann10b_ver2", X, y, X_test,
    generate_test=True, xempty=None,
)
# Run "ann_tfidf": bagged NnRegression on the stacked feature blocks plus
# the TF-IDF CSVs. Unlike the sibling runs, X2 / X2_test are not part of
# the stacked matrices here.
# Names that are read but not assigned here (X_extra, X_1234*, X_anti_1234,
# y, ...) must already exist when this fragment runs.

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF CSVs are read with header=None, so their first row is data.
tf_train = pd.read_csv('data/Tf_idf_train.csv', header=None).values
tf_test = pd.read_csv('data/Tf_idf_test.csv', header=None).values

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train, tf_train,
))
X_test = np.hstack((
    X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test, tf_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off; this
# run additionally pins the ensemble's own random_state to 1.
b = BaggingRegressor(
    nn, 10, bootstrap=False, verbose=0, random_state=1,
)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann_tfidf", X, y, X_test,
    generate_test=True, xempty=None,
)
Exemple #5
0
# Run "ann10b_noamazon" (tail of the script): only the 'product_simscore'
# and 'train1234_7' / 'test1234_7' blocks are loaded here; every other
# feature block, and y, must already be defined when this fragment runs.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann10b_noamazon", X, y, X_test,
    generate_test=True, xempty=None,
)
# Run "RF": a random-forest regressor on the stacked feature blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*,
# X_union_f, X_anti_1234, y, ...) must already exist when this fragment runs.

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features come back from load() as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Sanity output: shapes of the stacked matrices and of the target.
print(X.shape, X_test.shape, y.shape)

# Seed NumPy's global RNG before the model object is created.
np.random.seed(44)
# The name `nn` is kept from the sibling runs even though it holds a forest.
# 100 is passed positionally (n_estimators, presumably - this looks like
# scikit-learn's RandomForestRegressor; confirm against the imports).
nn = RandomForestRegressor(100)

# The forest goes to the project helper directly, without a bagging wrapper.
cv_generate(
    nn, "RF", X, y, X_test,
    generate_test=True, xempty=None,
)
Exemple #7
0
# Run "ANN_2level_pred4": bagged NnRegression on the stacked feature blocks
# plus the columns of the 'stacking_master_*_V4_rerun' CSVs. X2 / X2_test
# are not part of the stacked matrices here.
# Names that are read but not assigned here (X_extra, X_1234*, X_anti_1234,
# y, ...) must already exist when this fragment runs.
X_test_anti_1234 = load('data/test_anti_1234_c1_r')

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# Stacking CSVs: the 'Unnamed: 0' column is dropped before the remaining
# columns are taken as an array.
predicted_train = (
    pd.read_csv('input/stacking_master_train_V4_rerun.csv')
    .drop('Unnamed: 0', axis=1)
    .values
)
predicted_test = (
    pd.read_csv('input/stacking_master_test_V4_rerun.csv')
    .drop('Unnamed: 0', axis=1)
    .values
)

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, predicted_train, prod1234_train,
))
X_test = np.hstack((
    X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, predicted_test, prod1234_test,
))

# This run seeds NumPy's global RNG with 43 (the sibling runs use 44).
np.random.seed(43)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off and the
# ensemble's own random_state pinned to 1.
b = BaggingRegressor(
    nn, 10, bootstrap=False, verbose=0, random_state=1,
)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ANN_2level_pred4", X, y, X_test,
    generate_test=True, xempty=None,
)
Exemple #8
0
        # Tail of a function whose header is outside this view: tune `clf`
        # by grid search, report the winner, then bag it and pass it to
        # cv_generate. `clf`, `param_grid`, `X`, `y` and `X_test` are
        # defined earlier, outside the visible lines.

        # Wrap quadratic_weighted_kappa as a scorer for which larger values
        # are better.
        kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa,
                                           greater_is_better=True)

        # Grid search over param_grid with cv=5, scored by the kappa scorer;
        # n_jobs=-1 and refit=True are passed through to GridSearchCV.
        model = grid_search.GridSearchCV(estimator=clf,
                                         param_grid=param_grid,
                                         scoring=kappa_scorer,
                                         verbose=10,
                                         n_jobs=-1,
                                         iid=True,
                                         refit=True,
                                         cv=5)
        model.fit(X, y)

        # NOTE(review): none of the four results of this 75/25 split is used
        # in the lines visible here.
        X_train, X_test_, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.25, random_state=0)

        # Estimator object of the entry at index 1 of clf.steps (presumably
        # clf is a Pipeline - confirm); not used in the lines visible here.
        svd_ = clf.steps[1][1]
        #print svd_.dual_coef_

        # Report the best cross-validation score and, for every key of
        # param_grid (sorted), the value held by the best estimator.
        print("Best score: %0.3f" % model.best_score_)
        print("Best parameters set:")
        best_parameters = model.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

        best_model = model.best_estimator_
        #best_model.fit(X,y)

        # Ensemble of 5 copies of the best estimator with bootstrap sampling
        # switched off, handed to the project helper as "SVM5b_final".
        b = BaggingRegressor(best_model, 5, bootstrap=False, verbose=10)
        cv_generate(b, "SVM5b_final", X, y, X_test)
# Run "ann_wm_c1r2": bagged NnRegression on the stacked feature blocks,
# without the 'product_simscore' and TF-IDF CSVs.
# Names that are read but not assigned here (X2, X_extra, X_1234, X_1234_2,
# y, ...) must already exist when this fragment runs.

# '1234_3' and 'anti_1234' feature blocks, one load() call per split.
X_1234_3 = load('data/train1234_3_c1_r')
X_test_1234_3 = load('data/test1234_3_c1_r')
X_anti_1234 = load('data/train_anti_1234_c1_r')
X_test_anti_1234 = load('data/test_anti_1234_c1_r')

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# Neither the 'product_simscore' CSVs (data/product_simscore_*.csv) nor the
# TF-IDF CSVs (data/Tf_idf_*.csv) are loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann_wm_c1r2", X, y, X_test,
    generate_test=True, xempty=None,
)
Exemple #10
0
# Run "ann_1234_7_ver2": bagged NnRegression on the stacked feature blocks,
# including the 'train1234_7' / 'test1234_7' blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*,
# X_anti_1234, y, ...) must already exist when this fragment runs.

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann_1234_7_ver2", X, y, X_test,
    generate_test=True, xempty=None,
)
Exemple #11
0
# Run "KNN-5-bagging": a bagged k-nearest-neighbours model on the stacked
# feature blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*,
# X_union_f, X_anti_1234, train_alt, test_alt, y, ...) must already exist
# when this fragment runs.
train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features come back from load() as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Sanity output: shapes of the stacked matrices and of the target.
print(X.shape, X_test.shape, y.shape)

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
# The name `nn` is kept from the sibling runs; here it holds a default
# KNeighborsClassifier.
nn = KNeighborsClassifier()
# NOTE(review): a classifier is wrapped in BaggingRegressor here - confirm
# that this classifier/regressor mix is intended.
b = BaggingRegressor(nn, 5, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "KNN-5-bagging", X, y, X_test,
    generate_test=True, xempty=None,
)
# Run "ann10b_noamazon": bagged NnRegression on the stacked feature blocks,
# including the 'train1234_7' / 'test1234_7' blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*,
# X_anti_1234, y, ...) must already exist when this fragment runs.

# load() hands back a (train, test) pair for the union features.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features also come back as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0,
)
# Ensemble of 10 copies of nn with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "ann10b_noamazon", X, y, X_test,
    generate_test=True, xempty=None,
)
                #'learner__C': [15,10],'learner__gamma':[0,0.001]}
                #C=6.0 and ngram=1,4 gives the best performance for the data we have right now



                # Tail of a function whose header is outside this view: tune
                # `clf` by grid search, report the winner, then bag it and
                # pass it to cv_generate. `clf`, `param_grid`, `X`, `y` and
                # `X_test` are defined earlier, outside the visible lines.

                # Wrap quadratic_weighted_kappa as a scorer for which larger
                # values are better.
                kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)


                # Grid search over param_grid with cv=5, scored by the kappa
                # scorer; n_jobs=-1 and refit=True are passed through.
                model = grid_search.GridSearchCV(estimator = clf, param_grid=param_grid, scoring=kappa_scorer,
                                                                         verbose=10, n_jobs=-1, iid=True, refit=True, cv=5)
                model.fit(X, y)


                # NOTE(review): none of the four results of this 75/25 split
                # is used in the lines visible here.
                X_train, X_test_, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)


                # Estimator object of the entry at index 1 of clf.steps
                # (presumably clf is a Pipeline - confirm); not used in the
                # lines visible here.
                svd_=clf.steps[1][1]
                #print svd_.dual_coef_

                # Report the best cross-validation score and, for every key
                # of param_grid (sorted), the value held by the best estimator.
                print("Best score: %0.3f" % model.best_score_)
                print("Best parameters set:")
                best_parameters = model.best_estimator_.get_params()
                for param_name in sorted(param_grid.keys()):
                        print("\t%s: %r" % (param_name, best_parameters[param_name]))

                best_model = model.best_estimator_
                #best_model.fit(X,y)

                # Ensemble of 5 copies of the best estimator with bootstrap
                # sampling switched off, handed to the project helper as
                # "SVM5b_final".
                b = BaggingRegressor(best_model,5, bootstrap=False, verbose=10)
                cv_generate(b, "SVM5b_final", X, y, X_test)
# Run "KNN-5-bagging": a bagged k-nearest-neighbours model on the stacked
# feature blocks.
# Names that are read but not assigned here (X2, X_extra, X_1234*,
# X_union_f, X_anti_1234, y, ...) must already exist when this fragment runs.

# One CSV carries both splits: rows before index 10158 are kept as the train
# part, all remaining rows as the test part.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv'
).values
train_alt, test_alt = train_test_alt[:10158], train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')

# 'wm' features come back from load() as a (train, test) pair.
train_wm, test_wm = load('data/wm_features')

# 'product_simscore' blocks, read from CSV and reduced to plain arrays.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# loaded for this run.

# Join every feature block with np.hstack; train and test list their blocks
# in the same order.
# (assumes each block is 2-D with one row per sample - TODO confirm)
X = np.hstack((
    X2, X_extra, X_1234, X_1234_2, X_1234_3,
    X_union_f, X_anti_1234,
    train_alt, train_ngram, train_wm,
    psim_train, prod1234_train,
))
X_test = np.hstack((
    X2_test, X_extra_test, X_test_1234, X_test_1234_2, X_test_1234_3,
    X_test_union_f, X_test_anti_1234,
    test_alt, test_ngram, test_wm,
    psim_test, prod1234_test,
))

# Sanity output: shapes of the stacked matrices and of the target.
print(X.shape, X_test.shape, y.shape)

# Seed NumPy's global RNG before the model objects are created.
np.random.seed(44)
# The name `nn` is kept from the sibling runs; here it holds a default
# KNeighborsClassifier.
nn = KNeighborsClassifier()
# NOTE(review): a classifier is wrapped in BaggingRegressor here - confirm
# that this classifier/regressor mix is intended.
b = BaggingRegressor(nn, 5, bootstrap=False, verbose=0)

# Hand the ensemble and the data to the project helper under this run name.
cv_generate(
    b, "KNN-5-bagging", X, y, X_test,
    generate_test=True, xempty=None,
)