# Feature loading and model run tagged "ann_alt_ngram_wm".
# X2, X_extra, X_1234*, their *_test counterparts and y are expected to be
# defined before this point in the script.
X_anti_1234 = load('data/train_anti_1234_c1_r')
X_test_anti_1234 = load('data/test_anti_1234_c1_r')
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Stack every feature block column-wise; train and test must list the
# blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann_alt_ngram_wm", X, y, X_test,
            generate_test=True, xempty=None)
# Tail of the "ann_alt_ngram_wm" run: product sim-score features are loaded
# and stacked with blocks that are expected to be defined earlier in the
# script (X2, X_extra, X_1234*, X_union_f, X_anti_1234, train_alt, ...).
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann_alt_ngram_wm", X, y, X_test,
            generate_test=True, xempty=None)
# Feature loading and model run tagged "ann10b_ver2".
# X2, X_extra, X_1234*, X_anti_1234, their *_test counterparts and y are
# expected to be defined before this point in the script.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann10b_ver2", X, y, X_test,
            generate_test=True, xempty=None)
# Feature loading and model run tagged "ann_tfidf".
# X_extra, X_1234*, X_anti_1234, their *_test counterparts and y are
# expected to be defined before this point in the script.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# The TF-IDF CSVs are read with header=None, so the first row is data.
tf_train = pd.read_csv('data/Tf_idf_train.csv', header=None).values
tf_test = pd.read_csv('data/Tf_idf_test.csv', header=None).values

# Column-wise stack; train and test list the blocks in the same order.
# X2 / X2_test are not stacked in this run.
X = np.hstack((
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
    tf_train,
))
X_test = np.hstack((
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
    tf_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off; the
# bagging wrapper gets its own fixed random_state.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0, random_state=1)
cv_generate(b, "ann_tfidf", X, y, X_test,
            generate_test=True, xempty=None)
# Tail of the "ann10b_noamazon" run. All feature blocks other than the
# four loaded here are expected to be defined earlier in the script.
psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann10b_noamazon", X, y, X_test,
            generate_test=True, xempty=None)
# Feature loading and random-forest run tagged "RF".
# X2, X_extra, X_1234*, X_union_f, X_anti_1234, their *_test counterparts
# and y are expected to be defined before this point in the script.

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))
# Quick sanity print of the stacked shapes before fitting.
print(X.shape, X_test.shape, y.shape)

np.random.seed(44)
# The estimator is kept under the name `nn` even though it is a random
# forest; 100 is passed as the first positional argument.
nn = RandomForestRegressor(100)
cv_generate(nn, "RF", X, y, X_test, generate_test=True, xempty=None)
# Feature loading and model run tagged "ANN_2level_pred4".
# X_extra, X_1234*, X_anti_1234 (train side), their *_test counterparts and
# y are expected to be defined before this point in the script.
X_test_anti_1234 = load('data/test_anti_1234_c1_r')
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# Stacking inputs read from CSV; the 'Unnamed: 0' column is dropped before
# taking the values.
# NOTE(review): these look like first-level model predictions -- confirm.
predicted_train = (
    pd.read_csv('input/stacking_master_train_V4_rerun.csv')
    .drop('Unnamed: 0', axis=1)
    .values
)
predicted_test = (
    pd.read_csv('input/stacking_master_test_V4_rerun.csv')
    .drop('Unnamed: 0', axis=1)
    .values
)

# Column-wise stack; train and test list the blocks in the same order.
# X2 / X2_test are not stacked in this run.
X = np.hstack((
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    predicted_train,
    prod1234_train,
))
X_test = np.hstack((
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    predicted_test,
    prod1234_test,
))

np.random.seed(43)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off; the
# bagging wrapper gets its own fixed random_state.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0, random_state=1)
cv_generate(b, "ANN_2level_pred4", X, y, X_test,
            generate_test=True, xempty=None)
# Grid-search `clf` over `param_grid` using quadratic weighted kappa as the
# score, then bag the best estimator for the "SVM5b_final" run.
# clf, param_grid, X, y and X_test are expected to be defined earlier.
kappa_scorer = metrics.make_scorer(
    quadratic_weighted_kappa, greater_is_better=True)
model = grid_search.GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=kappa_scorer,
    verbose=10,
    n_jobs=-1,
    iid=True,
    refit=True,
    cv=5)
model.fit(X, y)

# NOTE(review): the hold-out split and `svd_` below are not read anywhere
# in this fragment; kept in case later code depends on them.
X_train, X_test_, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)
svd_ = clf.steps[1][1]  # estimator object of the second entry in clf.steps
# print svd_.dual_coef_

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

best_model = model.best_estimator_
# best_model.fit(X,y)
# Bag 5 copies of the tuned estimator with bootstrap sampling switched off.
b = BaggingRegressor(best_model, 5, bootstrap=False, verbose=10)
cv_generate(b, "SVM5b_final", X, y, X_test)
# Feature loading and model run tagged "ann_wm_c1r2".
# X2, X_extra, X_1234, X_1234_2, their *_test counterparts and y are
# expected to be defined before this point in the script.
X_1234_3 = load('data/train1234_3_c1_r')
X_test_1234_3 = load('data/test1234_3_c1_r')
X_anti_1234 = load('data/train_anti_1234_c1_r')
X_test_anti_1234 = load('data/test_anti_1234_c1_r')
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

# Product sim-score features (data/product_simscore_*.csv) and TF-IDF
# features (data/Tf_idf_*.csv) are not used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann_wm_c1r2", X, y, X_test,
            generate_test=True, xempty=None)
# Feature loading and model run tagged "ann_1234_7_ver2".
# X2, X_extra, X_1234*, X_anti_1234, their *_test counterparts and y are
# expected to be defined before this point in the script.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann_1234_7_ver2", X, y, X_test,
            generate_test=True, xempty=None)
# Feature loading and k-nearest-neighbours run tagged "KNN-5-bagging".
# X2, X_extra, X_1234*, X_union_f, X_anti_1234, train_alt / test_alt, their
# *_test counterparts and y are expected to be defined earlier.
train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))
# Quick sanity print of the stacked shapes before fitting.
print(X.shape, X_test.shape, y.shape)

np.random.seed(44)
# NOTE(review): a *classifier* is wrapped in BaggingRegressor here, so the
# ensemble output is presumably an average of predicted class labels.
# Confirm this is intended rather than KNeighborsRegressor.
nn = KNeighborsClassifier()
b = BaggingRegressor(nn, 5, bootstrap=False, verbose=0)
cv_generate(b, "KNN-5-bagging", X, y, X_test,
            generate_test=True, xempty=None)
# Feature loading and model run tagged "ann10b_noamazon".
# X2, X_extra, X_1234*, X_anti_1234, their *_test counterparts and y are
# expected to be defined before this point in the script.
X_union_f, X_test_union_f = load('data/XXunion_f_norm')

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))

np.random.seed(44)
nn = NnRegression(
    nb_epoch=40,
    dropx=[0.3, 0.5, 0.5],
    nb_neuronx=[1024, 512],
    validation_split=0.,
    verbose=0)
# Bag 10 copies of the network with bootstrap sampling switched off.
b = BaggingRegressor(nn, 10, bootstrap=False, verbose=0)
cv_generate(b, "ann10b_noamazon", X, y, X_test,
            generate_test=True, xempty=None)
# Leftover alternative grid values from the (partly out-of-view) param_grid:
#'learner__C': [15,10],'learner__gamma':[0,0.001]}
# C=6.0 and ngram=1,4 gives the best performance for the data we have
# right now.

# Grid-search `clf` over `param_grid` using quadratic weighted kappa as the
# score, then bag the best estimator for the "SVM5b_final" run.
# clf, param_grid, X, y and X_test are expected to be defined earlier.
kappa_scorer = metrics.make_scorer(
    quadratic_weighted_kappa, greater_is_better=True)
model = grid_search.GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=kappa_scorer,
    verbose=10,
    n_jobs=-1,
    iid=True,
    refit=True,
    cv=5)
model.fit(X, y)

# NOTE(review): the hold-out split and `svd_` below are not read anywhere
# in this fragment; kept in case later code depends on them.
X_train, X_test_, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=0)
svd_ = clf.steps[1][1]  # estimator object of the second entry in clf.steps
# print svd_.dual_coef_

print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

best_model = model.best_estimator_
# best_model.fit(X,y)
# Bag 5 copies of the tuned estimator with bootstrap sampling switched off.
b = BaggingRegressor(best_model, 5, bootstrap=False, verbose=10)
cv_generate(b, "SVM5b_final", X, y, X_test)
# Feature loading and k-nearest-neighbours run tagged "KNN-5-bagging".
# X2, X_extra, X_1234*, X_union_f, X_anti_1234, their *_test counterparts
# and y are expected to be defined before this point in the script.

# One CSV holds both train and test rows; the first 10158 rows go to train.
# NOTE(review): 10158 is presumably the training-set size -- confirm.
train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
test_ngram = load('data/test1234_ngram_r')
train_wm, test_wm = load('data/wm_features')

psim_train = pd.read_csv('data/product_simscore_train.csv').values
psim_test = pd.read_csv('data/product_simscore_test.csv').values
prod1234_train = load('data/train1234_7')
prod1234_test = load('data/test1234_7')

# TF-IDF features (data/Tf_idf_train.csv / data/Tf_idf_test.csv) are not
# used in this run.

# Column-wise stack; train and test list the blocks in the same order.
X = np.hstack((
    X2,
    X_extra,
    X_1234,
    X_1234_2,
    X_1234_3,
    X_union_f,
    X_anti_1234,
    train_alt,
    train_ngram,
    train_wm,
    psim_train,
    prod1234_train,
))
X_test = np.hstack((
    X2_test,
    X_extra_test,
    X_test_1234,
    X_test_1234_2,
    X_test_1234_3,
    X_test_union_f,
    X_test_anti_1234,
    test_alt,
    test_ngram,
    test_wm,
    psim_test,
    prod1234_test,
))
# Quick sanity print of the stacked shapes before fitting.
print(X.shape, X_test.shape, y.shape)

np.random.seed(44)
# NOTE(review): a *classifier* is wrapped in BaggingRegressor here, so the
# ensemble output is presumably an average of predicted class labels.
# Confirm this is intended rather than KNeighborsRegressor.
nn = KNeighborsClassifier()
b = BaggingRegressor(nn, 5, bootstrap=False, verbose=0)
cv_generate(b, "KNN-5-bagging", X, y, X_test,
            generate_test=True, xempty=None)