def extract_tfidf_feat(df):
    df["all_text"] = list(df.apply(cat_text, axis=1))
    vec_types = ["tfidf", "bow"]
    feat_names = ["question1", "question2"]
    for vec_type in vec_types:
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=(1, 3))
        elif vec_type == "bow":
            vec = getBOW(ngram_range=(1, 3))
        # get common vocabulary
        vec.fit(df["all_text"])
        vocabulary = vec.vocabulary_
        print("generate ngram %s feat for %s" % (vec_type, feat_names[0]))
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=(1, 3), vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=(1, 3), vocabulary=vocabulary)
        # fit the common vocabulary on each specific question
        # note: fit_transform refits per column (e.g. the IDF weights for
        # tfidf); only the vocabulary itself is shared between q1 and q2
        q1_vec = vec.fit_transform(df[feat_names[0]])
        # with open("%s/train.%s.%s.pkl" % (config.processed_data_path, feat_names[0], vec_type), "wb") as f:
        #     cPickle.dump(q1_vec, f, -1)
        q2_vec = vec.fit_transform(df[feat_names[1]])
        # with open("%s/train.%s.%s.pkl" % (config.processed_data_path, feat_names[1], vec_type), "wb") as f:
        #     cPickle.dump(q2_vec, f, -1)
        print("q1_vec has shape: %s, while q2_vec has shape: %s" % (q1_vec.shape, q2_vec.shape))
        # calculate the cosine similarity of the two vectors
        print("generate common %s cosine sim feat for q1 and q2" % vec_type)
        df["%s_cos_of_q1_q2" % vec_type] = np.asarray(map(cosine_sim, q1_vec, q2_vec))[:, np.newaxis]
        # calculate the SVD cosine similarity of the two vectors
        # print("generate svd %s cosine sim feat for q1 and q2" % vec_type)
        # vertically stack q1 and q2
        # q1_q2_vec = vstack([q1_vec, q2_vec])
        # for n_components in svd_n_components:
        #     svd = TruncatedSVD(n_components=n_components, n_iter=15)
        #     svd.fit(q1_q2_vec)
        #     q1_svd_vec = svd.transform(q1_vec)
        #     q2_svd_vec = svd.transform(q2_vec)
        #     print("q1_svd_vec has shape: %s, while q2_svd_vec has shape: %s" % (q1_svd_vec.shape, q2_svd_vec.shape))
        #     df["svd%s_%s_cos_of_q1_q2" % (n_components, vec_type)] = np.asarray(map(cosine_sim, q1_svd_vec, q2_svd_vec))[:, np.newaxis]
    return df
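
## ---------------------------------------------------------------------------
## The function above relies on several helpers that are not defined in this
## section (cat_text, getTFV, getBOW, cosine_sim), and the scripts assume the
## usual imports elsewhere (numpy as np, os, cPickle, copy.copy,
## scipy.sparse.vstack, sklearn.decomposition.TruncatedSVD). A minimal sketch
## of what the helpers presumably look like, assuming getTFV/getBOW are thin
## wrappers around scikit-learn's vectorizers and cosine_sim compares two
## 1 x n_features rows; the exact signatures are assumptions, not the original
## implementation.
## ---------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def cat_text(row):
    ## concatenate the two question fields into one document (assumed helper)
    return " ".join([str(row["question1"]), str(row["question2"])])


def getTFV(ngram_range=(1, 1), vocabulary=None):
    ## thin wrapper around TfidfVectorizer (assumed signature)
    return TfidfVectorizer(ngram_range=ngram_range, vocabulary=vocabulary)


def getBOW(ngram_range=(1, 1), vocabulary=None):
    ## thin wrapper around CountVectorizer (assumed signature)
    return CountVectorizer(ngram_range=ngram_range, vocabulary=vocabulary)


def cosine_sim(x, y):
    ## cosine similarity between two 1 x n_features rows (sparse or dense)
    return cosine_similarity(x, y)[0][0]
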
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    new_feat_names = copy(feat_names)
    # first fit a bow/tfidf on the all_text to get
    # the common vocabulary to ensure query/title/description
    # get the same-length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None

    for feat_name, column_name in zip(feat_names, column_names):
        ########################
        # basic bow/tfidf feat #
        ########################
        print "generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)

        if stats_feat_flag:
            ###################################
            # bow/tfidf cosine sim stats feat #
            ###################################
            # get the indices of pooled samples
            relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
            query_relevance_indices_dict = get_sample_indices_by_relevance(dfTrain, "qid")
            # skip the query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                # train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_train, dfTrain["id"].values, relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_train, dfTrain["id"].values,
                    query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                # test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_test, dfTest["id"].values, relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_test, dfTest["id"].values,
                    query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                # update feat names
                new_feat_names.append("%s_cosine_sim_stats_feat_by_relevance" % feat_name)
                new_feat_names.append("%s_cosine_sim_stats_feat_by_query_relevance" % feat_name)

    ###################
    # cosine sim feat #
    ###################
    for i in range(len(feat_names) - 1):
        for j in range(i + 1, len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (
                vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]), "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]), "rb") as f:
                    obs_vec = cPickle.load(f)
                sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                # dump feat
                with open("%s/%s.%s_%s_%s_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type), "wb") as f:
                    cPickle.dump(sim, f, -1)
            # update feat names
            new_feat_names.append("%s_%s_%s_cosine_sim" % (feat_names[i], feat_names[j], vec_type))

    ################
    # SVD features #
    ################
    # we fit the SVD on stacked query/title/description bow/tfidf
    # for further cosine similarity computation
    for i, feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        # load bow/tfidf (for less coding...)
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_common_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            # update feat names
            new_feat_names.append("%s_common_svd%d" % (feat_name, n_components))

            if stats_feat_flag:
                #######################################
                # bow/tfidf-svd cosine sim stats feat #
                #######################################
                if column_name in ["product_title", "product_description"]:
                    print "generate common %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    # train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        query_relevance_indices_dict, dfTrain["qid"].values)
                    with open("%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open("%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    # test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        query_relevance_indices_dict, dfTest["qid"].values)
                    with open("%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open("%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                    # update feat names
                    new_feat_names.append("%s_common_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components))
                    new_feat_names.append("%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components))

        ###################
        # cosine sim feat #
        ###################
        for i in range(len(feat_names) - 1):
            for j in range(i + 1, len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (
                    vec_type, n_components, feat_names[i], feat_names[j])
                for mod in ["train", mode]:
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)
                    sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                    # dump feat
                    with open("%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type, n_components), "wb") as f:
                        cPickle.dump(sim, f, -1)
                # update feat names
                new_feat_names.append("%s_%s_%s_common_svd%d_cosine_sim" % (feat_names[i], feat_names[j], vec_type, n_components))

        #######################
        # Individual SVD feat #
        #######################
        # generate individual svd feat
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_individual_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            # update feat names
            new_feat_names.append("%s_individual_svd%d" % (feat_name, n_components))

            if stats_feat_flag:
                #######################################
                # bow/tfidf-svd cosine sim stats feat #
                #######################################
                if column_name in ["product_title", "product_description"]:
                    print "generate individual %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    # train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        query_relevance_indices_dict, dfTrain["qid"].values)
                    with open("%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open("%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    # test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        query_relevance_indices_dict, dfTest["qid"].values)
                    with open("%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open("%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                    # update feat names
                    new_feat_names.append("%s_individual_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components))
                    new_feat_names.append("%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components))

    """
    ######################
    # bow/tfidf-tsne feat #
    ######################
    # generate t-sne feat
    for n_components in tsne_n_components:
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-tsne%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            tsne = TSNE(n_components=n_components, init='pca', random_state=2015, metric="cosine")
            X = vstack([X_vec_train, X_vec_test])
            Y = tsne.fit_transform(X)
            num_train = X_vec_train.shape[0]
            X_tsne_train = Y[:num_train]
            X_tsne_test = Y[num_train:]
            with open("%s/train.%s_individual_tsne%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_train, f, -1)
            with open("%s/%s.%s_individual_tsne%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_test, f, -1)

            ###############################################
            # bow/tfidf-tsne euclidean distance stats feat #
            ###############################################
            if column_name in ["product_title", "product_description"]:
                print "generate individual %s-tsne%d stats feat for %s" % (vec_type, n_components, column_name)
                # train
                euclidean_dist_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_train, dfTrain["id"].values, relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_train, dfTrain["id"].values,
                    query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_train, f, -1)
                # test
                euclidean_dist_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_test, dfTest["id"].values, relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_test, dfTest["id"].values,
                    query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_test, f, -1)
                # update feat names
                new_feat_names.append("%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance" % (feat_name, n_components))
                new_feat_names.append("%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance" % (feat_name, n_components))
    """
    return new_feat_names
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    print "Get feature names..."
    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure query/title/description
    ## get the same-length bow/tfidf for computing the similarity
    print "Process vocab..."
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None
    ## get the indices of pooled samples
    relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
    query_relevance_indices_dict = get_sample_indices_by_relevance(dfTrain, "qid")
    for feat_name, column_name in zip(feat_names, column_names):
        print "Working on %s for %s" % (feat_name, column_name)
        if (os.path.isfile("%s/train.%s.feat.pkl" % (path, feat_name))
                and os.path.isfile("%s/%s.%s.feat.pkl" % (path, mode, feat_name))):
            continue
        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "Generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        ## always fit/compute the matrices (the stats feat below needs them);
        ## only skip the dump when a cached file already exists
        X_train = vec.fit_transform(dfTrain[column_name])
        if not os.path.isfile("%s/train.%s.feat.pkl" % (path, feat_name)):
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
                cPickle.dump(X_train, f, -1)
        X_test = vec.transform(dfTest[column_name])
        if not os.path.isfile("%s/%s.%s.feat.pkl" % (path, mode, feat_name)):
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
                cPickle.dump(X_test, f, -1)
        if stats_feat_flag:
            #####################################
            ## bow/tfidf cosine sim stats feat ##
            #####################################
            if (os.path.isfile("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name))
                    and os.path.isfile("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name))
                    and os.path.isfile("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name))
                    and os.path.isfile("%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name))):
                continue
            ## skip the query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                ## train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_train, dfTrain["id"].values, relevance_indices_dict)
                with open("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_train, dfTrain["id"].values,
                    query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                ## test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_test, dfTest["id"].values, relevance_indices_dict)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_test, dfTest["id"].values,
                    query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                ## update feat names
                new_feat_names.append("%s_cosine_sim_stats_feat_by_relevance" % feat_name)
                new_feat_names.append("%s_cosine_sim_stats_feat_by_query_relevance" % feat_name)
    return new_feat_names
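
## ---------------------------------------------------------------------------
## generate_dist_stats_feat is also undefined here. A simplified sketch of the
## idea, assuming it summarizes each row's distances to every pooled training
## group with a few order statistics; the real feature layout (per-class
## blocks, id/qid-aware pooling that excludes the row itself) is assumed away.
## ---------------------------------------------------------------------------
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances


def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test,
                             indices_dict, qids_test=None):
    ## distances from every test row to every training row
    dist = pairwise_distances(X_test, X_train, metric=metric)
    groups = sorted(indices_dict.keys())
    stats = np.zeros((X_test.shape[0], 5 * len(groups)), dtype=float)
    for g, key in enumerate(groups):
        d = dist[:, indices_dict[key]]
        ## (mean, std, min, median, max) of the distances per pooled group
        stats[:, 5 * g:5 * (g + 1)] = np.column_stack([
            d.mean(axis=1), d.std(axis=1), d.min(axis=1),
            np.median(d, axis=1), d.max(axis=1)])
    return stats
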
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    print 'inside fun', vec_type
    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure question1/question2
    ## get the same-length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None

    for feat_name, column_name in zip(feat_names, column_names):
        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "generate %s feat for %s\n" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)

    #####################
    ## cosine sim feat ##
    #####################
    for i in range(len(feat_names) - 1):
        for j in range(i + 1, len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (
                vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]), "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]), "rb") as f:
                    obs_vec = cPickle.load(f)
                sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                # sim = np.asarray(map(cosine_sim, target_vec, obs_vec)).reshape(-1, 1)
                ## dump feat
                with open("%s/%s.%s_%s_%s_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type), "wb") as f:
                    cPickle.dump(sim, f, -1)
            ## update feat names
            new_feat_names.append("%s_%s_%s_cosine_sim" % (feat_names[i], feat_names[j], vec_type))

    ##################
    ## SVD features ##
    ##################
    ## we fit the SVD on stacked question1/question2 bow/tfidf
    ## for further cosine similarity computation
    for i, feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        ## load bow/tfidf (for less coding...)
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_common_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append("%s_common_svd%d" % (feat_name, n_components))

        #####################
        ## cosine sim feat ##
        #####################
        for i in range(len(feat_names) - 1):
            for j in range(i + 1, len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (
                    vec_type, n_components, feat_names[i], feat_names[j])
                for mod in ["train", mode]:
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)
                    sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                    ## dump feat
                    with open("%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type, n_components), "wb") as f:
                        cPickle.dump(sim, f, -1)
                ## update feat names
                new_feat_names.append("%s_%s_%s_common_svd%d_cosine_sim" % (feat_names[i], feat_names[j], vec_type, n_components))

        #########################
        ## Individual SVD feat ##
        #########################
        ## generate individual svd feat
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_individual_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append("%s_individual_svd%d" % (feat_name, n_components))

    return new_feat_names
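
## ---------------------------------------------------------------------------
## All of these extract_feat variants read module-level configuration rather
## than taking it as parameters. A hypothetical driver, assuming dfTrain and
## dfTest are prepared DataFrames with question1/question2/all_text columns;
## the config values and output path below are illustrative, not taken from
## the original code.
## ---------------------------------------------------------------------------
vec_type = "tfidf"            ## or "bow"
vocabulary_type = "common"    ## or "individual"
ngram_range = (1, 3)
svd_n_components = [100]      ## example value
stats_feat_flag = False

feat_names = ["question1", "question2"]
column_names = ["question1", "question2"]
new_feat_names = extract_feat("Feat/processed", dfTrain, dfTest, "test",
                              feat_names, column_names)
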
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure query/title/description
    ## get the same-length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None

    for feat_name, column_name in zip(feat_names, column_names):
        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)

        if stats_feat_flag:
            #####################################
            ## bow/tfidf cosine sim stats feat ##
            #####################################
            ## get the indices of pooled samples
            relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
            query_relevance_indices_dict = get_sample_indices_by_relevance(dfTrain, "qid")
            ## skip the query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                ## train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_train, dfTrain["id"].values, relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_train, dfTrain["id"].values,
                    query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                ## test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_test, dfTest["id"].values, relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values,
                    X_test, dfTest["id"].values,
                    query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                ## update feat names
                new_feat_names.append("%s_cosine_sim_stats_feat_by_relevance" % feat_name)
                new_feat_names.append("%s_cosine_sim_stats_feat_by_query_relevance" % feat_name)

    #####################
    ## cosine sim feat ##
    #####################
    for i in range(len(feat_names) - 1):
        for j in range(i + 1, len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (
                vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]), "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]), "rb") as f:
                    obs_vec = cPickle.load(f)
                sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                ## dump feat
                with open("%s/%s.%s_%s_%s_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type), "wb") as f:
                    cPickle.dump(sim, f, -1)
            ## update feat names
            new_feat_names.append("%s_%s_%s_cosine_sim" % (feat_names[i], feat_names[j], vec_type))

    ##################
    ## SVD features ##
    ##################
    ## we fit the SVD on stacked query/title/description bow/tfidf
    ## for further cosine similarity computation
    for i, feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        ## load bow/tfidf (for less coding...)
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_common_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append("%s_common_svd%d" % (feat_name, n_components))

            if stats_feat_flag:
                #########################################
                ## bow/tfidf-svd cosine sim stats feat ##
                #########################################
                if column_name in ["product_title", "product_description"]:
                    print "generate common %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    ## train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        query_relevance_indices_dict, dfTrain["qid"].values)
                    with open("%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open("%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    ## test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        query_relevance_indices_dict, dfTest["qid"].values)
                    with open("%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open("%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                    ## update feat names
                    new_feat_names.append("%s_common_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components))
                    new_feat_names.append("%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components))

        #####################
        ## cosine sim feat ##
        #####################
        for i in range(len(feat_names) - 1):
            for j in range(i + 1, len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (
                    vec_type, n_components, feat_names[i], feat_names[j])
                for mod in ["train", mode]:
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)
                    sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                    ## dump feat
                    with open("%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type, n_components), "wb") as f:
                        cPickle.dump(sim, f, -1)
                ## update feat names
                new_feat_names.append("%s_%s_%s_common_svd%d_cosine_sim" % (feat_names[i], feat_names[j], vec_type, n_components))

        #########################
        ## Individual SVD feat ##
        #########################
        ## generate individual svd feat
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_individual_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append("%s_individual_svd%d" % (feat_name, n_components))

            if stats_feat_flag:
                #########################################
                ## bow/tfidf-svd cosine sim stats feat ##
                #########################################
                if column_name in ["product_title", "product_description"]:
                    print "generate individual %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    ## train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        query_relevance_indices_dict, dfTrain["qid"].values)
                    with open("%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open("%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    ## test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values, relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        query_relevance_indices_dict, dfTest["qid"].values)
                    with open("%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open("%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)
                    ## update feat names
                    new_feat_names.append("%s_individual_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components))
                    new_feat_names.append("%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components))

    """
    #########################
    ## bow/tfidf-tsne feat ##
    #########################
    ## generate t-sne feat
    for n_components in tsne_n_components:
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-tsne%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            tsne = TSNE(n_components=n_components, init='pca', random_state=2015, metric="cosine")
            X = vstack([X_vec_train, X_vec_test])
            Y = tsne.fit_transform(X)
            num_train = X_vec_train.shape[0]
            X_tsne_train = Y[:num_train]
            X_tsne_test = Y[num_train:]
            with open("%s/train.%s_individual_tsne%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_train, f, -1)
            with open("%s/%s.%s_individual_tsne%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_test, f, -1)

            ##################################################
            ## bow/tfidf-tsne euclidean distance stats feat ##
            ##################################################
            if column_name in ["product_title", "product_description"]:
                print "generate individual %s-tsne%d stats feat for %s" % (vec_type, n_components, column_name)
                ## train
                euclidean_dist_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_train, dfTrain["id"].values, relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_train, dfTrain["id"].values,
                    query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_train, f, -1)
                ## test
                euclidean_dist_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_test, dfTest["id"].values, relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "euclidean", X_tsne_train, dfTrain["id"].values,
                    X_tsne_test, dfTest["id"].values,
                    query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_test, f, -1)
                ## update feat names
                new_feat_names.append("%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance" % (feat_name, n_components))
                new_feat_names.append("%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance" % (feat_name, n_components))
    """
    return new_feat_names