## Re-training ##
#################
# Regenerate each feature with the vectorizer/SVD fitted on the full training
# frame and applied to the test frame. Every matrix is pickled under
# "<config.feat_folder>/All".
print("For training and testing...")
path = "%s/All" % config.feat_folder
for feat_name, column_name in zip(feat_names, column_names):
    print("generate %s feat" % feat_name)

    # Fit on the training column only; the fitted vectorizer is reused for test.
    tfv = getTFV(ngram_range=ngram_range)
    X_tfidf_train = tfv.fit_transform(dfTrain[column_name])
    X_tfidf_test = tfv.transform(dfTest[column_name])
    tfidf_outputs = (
        ("%s/train.%s.feat.pkl" % (path, feat_name), X_tfidf_train),
        ("%s/test.%s.feat.pkl" % (path, feat_name), X_tfidf_test),
    )
    for out_file, matrix in tfidf_outputs:
        with open(out_file, "wb") as f:
            # Protocol -1 selects the highest pickle protocol available.
            cPickle.dump(matrix, f, -1)

    ## svd
    # Reduce the matrices above to svd_n_components dimensions; the
    # decomposition is likewise fitted on train and applied to test.
    svd = TruncatedSVD(n_components=svd_n_components, n_iter=15)
    X_svd_train = svd.fit_transform(X_tfidf_train)
    X_svd_test = svd.transform(X_tfidf_test)
    svd_outputs = (
        ("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components), X_svd_train),
        ("%s/test.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components), X_svd_test),
    )
    for out_file, matrix in svd_outputs:
        with open(out_file, "wb") as f:
            cPickle.dump(matrix, f, -1)

print("Done.")

## save feat names
print("Feature names are stored in %s" % feat_name_file)
# Extend the list in place with one "<name>_individual_svd<k>" entry per
# existing feature, so the dumped file lists both the base and SVD features.
svd_feat_names = [
    "%s_individual_svd%d" % (base_name, svd_n_components)
    for base_name in feat_names
]
feat_names += svd_feat_names
dump_feat_name(feat_names, feat_name_file)

print("All Done.")
with open("%s/valid.%s.feat.pkl" % (path, feat_name), "wb") as f: cPickle.dump(X_valid, f, -1) print X_train.shape print X_valid.shape ## extract statistical distance features if stats_feat_flag: dfTrain2 = dfTrain.iloc[trainInd].copy() dfValid = dfTrain.iloc[validInd].copy() extract_statistical_distance_feat(path, dfTrain2, dfValid, "valid", feat_names) ## save feat names print("Feature names are stored in %s" % feat_name_file) ## dump feat name dump_feat_name(feat_names, feat_name_file) print("----time elapsed----", str(timedelta(seconds=time.time() - start_time))) print("Done.") #=========================================== ###For testing... print sys.argv[1] if sys.argv[1] == "Testing": if sys.argv[2] == "All": Ntest = range(config.test_subset_number) else: exec("Ntest =" + sys.argv[2]) path = "%s/All" % config.feat_folder ## use full version for X_train
# for fold, (trainInd, validInd) in enumerate(skf[run]): print("Run: %d, Fold: %d" % (run + 1, fold + 1)) path = "%s/Run%d/Fold%d" % (config.feat_folder, run + 1, fold + 1) dfTrain2 = dfTrain.iloc[trainInd].copy() dfValid = dfTrain.iloc[validInd].copy() print dfTrain2.shape, dfValid.shape ## extract feat new_feat_names = extract_feat(path, dfTrain2, dfValid, "valid", feat_names, column_names) print("----time elapsed----", str(timedelta(seconds=time.time() - start_time))) ## dump feat name dump_feat_name(new_feat_names, feat_name_file) print("Done.") #=========================================== ###For testing... if sys.argv[1] == "Testing": if sys.argv[2] == "All": Ntest = range(config.test_subset_number) else: exec("Ntest =" + sys.argv[2]) for vec_type in vec_types: print "Processing vector %s" % vec_type feat_names = ["question1", "question2"] feat_names = [ name + "_%s_%s_vocabulary" % (vec_type, vocabulary_type)