Example 1
0
def extract_tfidf_feat(df):
    """Add tfidf/bow cosine-similarity features between question1 and question2.

    For each vectorizer type ("tfidf", "bow"), a common vocabulary is fitted
    on the concatenated text of both questions, each question column is then
    vectorized with that shared vocabulary (so both share one dimensionality),
    and the row-wise cosine similarity is stored in a new column named
    "<vec_type>_cos_of_q1_q2".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "question1" and "question2" columns.  Gains an
        "all_text" column as a side effect (output of ``cat_text`` per row).

    Returns
    -------
    pandas.DataFrame
        The same frame, mutated in place, with the new similarity columns.

    Notes
    -----
    ``cat_text``, ``getTFV``, ``getBOW`` and ``cosine_sim`` are project
    helpers defined elsewhere in the module.
    """
    # cat_text presumably concatenates question1/question2 per row -- defined elsewhere.
    df["all_text"] = list(df.apply(cat_text, axis=1))
    vec_types = ["tfidf", "bow"]
    feat_names = ["question1", "question2"]
    for vec_type in vec_types:
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=(1, 3))
        elif vec_type == "bow":
            vec = getBOW(ngram_range=(1, 3))

        # Fit on the combined text so q1 and q2 share one vocabulary (and
        # therefore one vector dimensionality) for the cosine computation.
        vec.fit(df["all_text"])
        vocabulary = vec.vocabulary_
        print("generate ngram %s feat for %s" % (vec_type, feat_names[0]))
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=(1, 3), vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=(1, 3), vocabulary=vocabulary)

        # Fit the fixed common vocabulary on each question individually.
        q1_vec = vec.fit_transform(df[feat_names[0]])
        q2_vec = vec.fit_transform(df[feat_names[1]])
        print("q1_vec has shape: %s, while q2_vec has shape: %s" % (q1_vec.shape, q2_vec.shape))

        # Cosine similarity of the two vectorized questions.
        # BUG FIX: on Python 3, map() returns a lazy iterator, so the original
        # np.asarray(map(...)) produced a 0-d object array and the [:, np.newaxis]
        # indexing failed; materialize with list() first.
        print("generate common %s cosine sim feat for q1 and q2" % vec_type)
        df["%s_cos_of_q1_q2" % vec_type] = np.asarray(list(map(cosine_sim, q1_vec, q2_vec)))[:, np.newaxis]

    return df
Example 3
0
    # NOTE(review): orphaned fragment of a larger cross-validation routine --
    # feat_names, column_names, dfTrain, skf, ngram_range, svd_n_components,
    # config, cPickle, getTFV and TruncatedSVD are all defined outside this
    # snippet.  The bare `print` statement below marks this as Python 2 code,
    # while the print() calls above it are Python-3 style -- mixed versions.
    ###########
    # Cross validation #
    ###########
    print("For cross-validation...")
    for run in range(config.n_runs):
        # use 33% for training and 67 % for validation
        # so we switch trainInd and validInd
        for fold, (validInd, trainInd) in enumerate(skf[run]):
            print("Run: %d, Fold: %d" % (run + 1, fold + 1))
            # per-run / per-fold output folder for the pickled features
            path = "%s/Run%d/Fold%d" % (config.feat_folder, run + 1, fold + 1)

            for feat_name, column_name in zip(feat_names, column_names):
                print "generate %s feat" % feat_name
                # tfidf: fit on the training fold only, then transform the
                # validation fold with the fitted vectorizer (no leakage)
                tfv = getTFV(ngram_range=ngram_range)
                X_tfidf_train = tfv.fit_transform(
                    dfTrain.iloc[trainInd][column_name])
                X_tfidf_valid = tfv.transform(
                    dfTrain.iloc[validInd][column_name])
                with open("%s/train.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    cPickle.dump(X_tfidf_train, f, -1)
                with open("%s/valid.%s.feat.pkl" % (path, feat_name),
                          "wb") as f:
                    cPickle.dump(X_tfidf_valid, f, -1)

                # svd: fit on the train-fold tfidf, project the validation fold
                svd = TruncatedSVD(n_components=svd_n_components, n_iter=15)
                X_svd_train = svd.fit_transform(X_tfidf_train)
                X_svd_test = svd.transform(X_tfidf_valid)
Example 4
0
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    """Generate bow/tfidf, cosine-similarity and SVD features and pickle them.

    Fits vectorizers on ``dfTrain`` columns, transforms both ``dfTrain`` and
    ``dfTest``, dumps every feature matrix under ``path`` via cPickle, and
    returns the accumulated list of generated feature names.

    NOTE(review): ``vec_type``, ``vocabulary_type``, ``ngram_range``,
    ``svd_n_components`` and ``stats_feat_flag`` are read from an enclosing
    or module scope not visible in this snippet -- confirm they are defined
    before this is called.  The bare ``print`` statements mark this function
    as Python 2 code.
    """

    new_feat_names = copy(feat_names)
    # first fit a bow/tfidf on the all_text to get
    # the common vocabulary to ensure query/title/description
    # has the same length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        # None lets each vectorizer learn its own per-column vocabulary.
        vocabulary = None
    for feat_name, column_name in zip(feat_names, column_names):

        #############
        # basic bow/tfidf feat #
        #############
        print "generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)

        if stats_feat_flag:
            ###################
            # bow/tfidf cosine sim stats feat #
            ###################
            # get the indices of pooled samples
            relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
            query_relevance_indices_dict = get_sample_indices_by_relevance(
                dfTrain, "qid")
            # skip query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                # train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values, X_train,
                    dfTrain["id"].values, relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values, X_train,
                    dfTrain["id"].values, query_relevance_indices_dict,
                    dfTrain["qid"].values)
                with open(
                        "%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl"
                        % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f,
                                 -1)
                with open(
                        "%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                        % (path, feat_name), "wb") as f:
                    cPickle.dump(
                        cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                # test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values, X_test,
                    dfTest["id"].values, relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values, X_test,
                    dfTest["id"].values, query_relevance_indices_dict,
                    dfTest["qid"].values)
                with open(
                        "%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl"
                        % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f,
                                 -1)
                with open(
                        "%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                        % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test,
                                 f, -1)

                # update feat names
                new_feat_names.append("%s_cosine_sim_stats_feat_by_relevance" %
                                      feat_name)
                new_feat_names.append(
                    "%s_cosine_sim_stats_feat_by_query_relevance" % feat_name)

    ###########
    # cosine sim feat #
    ###########
    for i in range(len(feat_names) - 1):
        for j in range(i + 1, len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (
                vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]),
                          "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]),
                          "rb") as f:
                    obs_vec = cPickle.load(f)
                # map() returns a list on Python 2 (as here); Python 3 would
                # need list(map(...)) before np.asarray.
                sim = np.asarray(map(cosine_sim, target_vec,
                                     obs_vec))[:, np.newaxis]
                # dump feat
                with open(
                        "%s/%s.%s_%s_%s_cosine_sim.feat.pkl" %
                    (path, mod, feat_names[i], feat_names[j], vec_type),
                        "wb") as f:
                    cPickle.dump(sim, f, -1)
            # update feat names
            new_feat_names.append("%s_%s_%s_cosine_sim" %
                                  (feat_names[i], feat_names[j], vec_type))

    #########
    # SVD features #
    #########
    # we fit svd use stacked query/title/description bow/tfidf for further cosine simalirity computation
    for i, feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        # load bow/tfidf (for less coding...)
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (
                vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name),
                      "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open(
                    "%s/train.%s_common_svd%d.feat.pkl" %
                (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open(
                    "%s/%s.%s_common_svd%d.feat.pkl" %
                (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)

            # update feat names
            new_feat_names.append("%s_common_svd%d" %
                                  (feat_name, n_components))

            # NOTE(review): relevance_indices_dict / query_relevance_indices_dict
            # are only assigned inside the stats branch of the first loop above;
            # if that branch never executed (stats_feat_flag toggled, or no
            # title/description column), the stats code below raises NameError.
            if stats_feat_flag:
                ###################
                # bow/tfidf-svd cosine sim stats feat #
                ###################
                if column_name in ["product_title", "product_description"]:
                    print "generate common %s-svd%d stats feat for %s" % (
                        vec_type, n_components, column_name)
                    # train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        query_relevance_indices_dict, dfTrain["qid"].values)
                    with open(
                            "%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                            % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train,
                                     f, -1)
                    with open(
                            "%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                            % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(
                            cosine_sim_stats_feat_by_query_relevance_train, f,
                            -1)
                    # test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        query_relevance_indices_dict, dfTest["qid"].values)
                    with open(
                            "%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                            % (path, mode, feat_name, n_components),
                            "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test,
                                     f, -1)
                    with open(
                            "%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                            % (path, mode, feat_name, n_components),
                            "wb") as f:
                        cPickle.dump(
                            cosine_sim_stats_feat_by_query_relevance_test, f,
                            -1)

                    # update feat names
                    new_feat_names.append(
                        "%s_common_svd%d_cosine_sim_stats_feat_by_relevance" %
                        (feat_name, n_components))
                    new_feat_names.append(
                        "%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance"
                        % (feat_name, n_components))

        ###########
        # cosine sim feat #
        ###########
        for i in range(len(feat_names) - 1):
            for j in range(i + 1, len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (
                    vec_type, n_components, feat_names[i], feat_names[j])
                for mod in ["train", mode]:
                    with open(
                            "%s/%s.%s_common_svd%d.feat.pkl" %
                        (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open(
                            "%s/%s.%s_common_svd%d.feat.pkl" %
                        (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)
                    sim = np.asarray(map(cosine_sim, target_vec,
                                         obs_vec))[:, np.newaxis]
                    # dump feat
                    with open(
                            "%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl" %
                        (path, mod, feat_names[i], feat_names[j], vec_type,
                         n_components), "wb") as f:
                        cPickle.dump(sim, f, -1)
                # update feat names
                new_feat_names.append(
                    "%s_%s_%s_common_svd%d_cosine_sim" %
                    (feat_names[i], feat_names[j], vec_type, n_components))

        #############
        # Individual SVD feat #
        #############
        # generate individual svd feat
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (
                vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name),
                      "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open(
                    "%s/train.%s_individual_svd%d.feat.pkl" %
                (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open(
                    "%s/%s.%s_individual_svd%d.feat.pkl" %
                (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            # update feat names
            new_feat_names.append("%s_individual_svd%d" %
                                  (feat_name, n_components))

            if stats_feat_flag:
                #####################
                # bow/tfidf-svd cosine sim stats feat #
                #####################
                if column_name in ["product_title", "product_description"]:
                    print "generate individual %s-svd%d stats feat for %s" % (
                        vec_type, n_components, column_name)
                    # train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_train, dfTrain["id"].values,
                        query_relevance_indices_dict, dfTrain["qid"].values)
                    with open(
                            "%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                            % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train,
                                     f, -1)
                    with open(
                            "%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                            % (path, feat_name, n_components), "wb") as f:
                        cPickle.dump(
                            cosine_sim_stats_feat_by_query_relevance_train, f,
                            -1)
                    # test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        relevance_indices_dict)
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine", X_svd_train, dfTrain["id"].values,
                        X_svd_test, dfTest["id"].values,
                        query_relevance_indices_dict, dfTest["qid"].values)
                    with open(
                            "%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                            % (path, mode, feat_name, n_components),
                            "wb") as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test,
                                     f, -1)
                    with open(
                            "%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                            % (path, mode, feat_name, n_components),
                            "wb") as f:
                        cPickle.dump(
                            cosine_sim_stats_feat_by_query_relevance_test, f,
                            -1)

                    # update feat names
                    new_feat_names.append(
                        "%s_individual_svd%d_cosine_sim_stats_feat_by_relevance"
                        % (feat_name, n_components))
                    new_feat_names.append(
                        "%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance"
                        % (feat_name, n_components))
    # NOTE(review): the triple-quoted block below is disabled t-SNE feature
    # code kept as a string literal -- it is never executed.
    """
    #############
    # bow/tfidf-tsne feat #
    #############
    # generate t-sne feat
    for n_components in tsne_n_components:
        for feat_name,column_name in zip(feat_names, column_names):
            print "generate individual %s-tsne%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            tsne = TSNE(n_components=n_components, init='pca', random_state=2015, metric="cosine")
            X = vstack([X_vec_train, X_vec_test])
            Y = tsne.fit_transform(X)
            num_train = X_vec_train.shape[0]
            X_tsne_train = Y[:num_train]
            X_tsne_test = Y[num_train:]
            with open("%s/train.%s_individual_tsne%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_train, f, -1)
            with open("%s/%s.%s_individual_tsne%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_test, f, -1)

            #########################
            # bow/tfidf-tsne euclidean distance stats feat #
            #########################
            if column_name in ["product_title", "product_description"]:
                print "generate individual %s-tsne%d stats feat for %s" % (vec_type, n_components, column_name)
                # train
                euclidean_dist_stats_feat_by_relevance_train = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                    X_tsne_train, dfTrain["id"].values,
                                                                    relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_train = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                            X_tsne_train, dfTrain["id"].values,
                                                                            query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_train, f, -1)
                # test
                euclidean_dist_stats_feat_by_relevance_test = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                    X_tsne_test, dfTest["id"].values,
                                                                    relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_test = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                            X_tsne_test, dfTest["id"].values,
                                                                            query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_test, f, -1)

                # update feat names
                new_feat_names.append( "%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance" % (feat_name, n_components) )
                new_feat_names.append( "%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance" % (feat_name, n_components) )
    """

    return new_feat_names
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    """Resumable variant: generate bow/tfidf and cosine-sim stats features.

    Skips work whose pickle files already exist under ``path``.

    NOTE(review): this snippet is truncated -- the function body continues
    beyond the last visible line and no return statement is shown here.
    ``vocabulary_type``, ``vec_type``, ``ngram_range`` and ``stats_feat_flag``
    are read from an enclosing or module scope.  Python 2 code (bare
    ``print`` statements).
    """

    print "Get feature names..."
    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure query/title/description
    ## has the same length bow/tfidf for computing the similarity
    print "Process vocab..."
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None

    ## get the indices of pooled samples
    relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
    query_relevance_indices_dict = get_sample_indices_by_relevance(dfTrain, "qid")

    for feat_name,column_name in zip(feat_names, column_names):

        print "Working on %s for %s" % (feat_name, column_name)
        if (os.path.isfile("%s/train.%s.feat.pkl" % (path, feat_name)) and
            os.path.isfile("%s/%s.%s.feat.pkl" % (path, mode, feat_name))):
            continue

        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "Generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)

        ## NOTE(review): if only the train pickle exists, this `continue`
        ## skips generating the test pickle entirely -- likely a bug; the
        ## guard at the top of the loop already handles the both-exist case.
        if (os.path.isfile("%s/train.%s.feat.pkl" % (path, feat_name))):
            continue

        X_train = vec.fit_transform(dfTrain[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)

        ## NOTE(review): syntax error on the next line -- unbalanced
        ## parentheses (missing "(" after "if", or extra ")" before the
        ## colon); compare the guard at the top of this loop.
        if os.path.isfile("%s/%s.%s.feat.pkl" % (path, mode, feat_name))):
            continue

        X_test = vec.transform(dfTest[column_name])
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)
        
        if stats_feat_flag:

            #####################################
            ## bow/tfidf cosine sim stats feat ##
            #####################################

            if (os.path.isfile("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name)) and
                os.path.isfile("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name)) and
                os.path.isfile("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name)) and
                os.path.isfile("%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name))):
                continue

            ## skip query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                ## train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                    X_train, dfTrain["id"].values,
                                                                    relevance_indices_dict)
                with open("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                            X_train, dfTrain["id"].values,
                                                                            query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                ## test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                    X_test, dfTest["id"].values,
                                                                    relevance_indices_dict)
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat("cosine", X_train, dfTrain["id"].values,
                                                                            X_test, dfTest["id"].values,
                                                                            query_relevance_indices_dict, dfTest["qid"].values)
Example 6
0
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    print 'inside fun', vec_type
    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure question1/question2
    ## has the same length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None
    for feat_name, column_name in zip(feat_names, column_names):

        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "generate %s feat for %s\n" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)

    #####################
    ## cosine sim feat ##
    #####################
    for i in range(len(feat_names) - 1):
        for j in range(i + 1, len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (
                vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]),
                          "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]),
                          "rb") as f:
                    obs_vec = cPickle.load(f)
                sim = np.asarray(map(cosine_sim, target_vec,
                                     obs_vec))[:, np.newaxis]

                #               sim = np.asarray(map(cosine_sim, target_vec, obs_vec)).reshape(-1,1)

                ## dump feat
                with open(
                        "%s/%s.%s_%s_%s_cosine_sim.feat.pkl" %
                    (path, mod, feat_names[i], feat_names[j], vec_type),
                        "wb") as f:
                    cPickle.dump(sim, f, -1)
            ## update feat names
            new_feat_names.append("%s_%s_%s_cosine_sim" %
                                  (feat_names[i], feat_names[j], vec_type))

    ##################
    ## SVD features ##
    ##################
    ## we fit svd use stacked question1/question2 bow/tfidf for further cosine simalirity computation
    for i, feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        ## load bow/tfidf (for less coding...)
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (
                vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name),
                      "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)

            with open(
                    "%s/train.%s_common_svd%d.feat.pkl" %
                (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open(
                    "%s/%s.%s_common_svd%d.feat.pkl" %
                (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)

            ## update feat names
            new_feat_names.append("%s_common_svd%d" %
                                  (feat_name, n_components))

        #####################
        ## cosine sim feat ##
        #####################
        for i in range(len(feat_names) - 1):
            for j in range(i + 1, len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (
                    vec_type, n_components, feat_names[i], feat_names[j])
                for mod in ["train", mode]:
                    with open(
                            "%s/%s.%s_common_svd%d.feat.pkl" %
                        (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open(
                            "%s/%s.%s_common_svd%d.feat.pkl" %
                        (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)

                    sim = np.asarray(map(cosine_sim, target_vec,
                                         obs_vec))[:, np.newaxis]
                    ## dump feat
                    with open(
                            "%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl" %
                        (path, mod, feat_names[i], feat_names[j], vec_type,
                         n_components), "wb") as f:
                        cPickle.dump(sim, f, -1)
                ## update feat names
                new_feat_names.append(
                    "%s_%s_%s_common_svd%d_cosine_sim" %
                    (feat_names[i], feat_names[j], vec_type, n_components))

        #########################
        ## Individual SVD feat ##
        #########################
        ## generate individual svd feat
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (
                vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name),
                      "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open(
                    "%s/train.%s_individual_svd%d.feat.pkl" %
                (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open(
                    "%s/%s.%s_individual_svd%d.feat.pkl" %
                (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append("%s_individual_svd%d" %
                                  (feat_name, n_components))
    return new_feat_names
def extract_feat(path, dfTrain, dfTest, mode, feat_names, column_names):
    """Generate bow/tfidf, cosine-sim, SVD and distance-stats features and
    dump every feature matrix as a pickle under *path*.

    NOTE(review): this redefines the earlier ``extract_feat`` in this file.
    Reads module-level globals ``vec_type`` ("tfidf"/"bow"),
    ``vocabulary_type`` ("common"/"individual"), ``ngram_range``,
    ``svd_n_components`` and ``stats_feat_flag``; helpers ``getTFV``,
    ``getBOW``, ``cosine_sim``, ``get_sample_indices_by_relevance`` and
    ``generate_dist_stats_feat`` are defined elsewhere in the project.

    path         -- output directory for the ``*.feat.pkl`` dumps
    dfTrain      -- training DataFrame (needs "all_text" when
                    vocabulary_type == "common"; "id"/"qid" columns are
                    used when stats_feat_flag is set)
    dfTest       -- test DataFrame with the same columns
    mode         -- filename prefix used for the test-side dumps
    feat_names   -- feature name per entry of column_names
    column_names -- text columns to vectorize

    Returns the updated list of feature names.
    """

    new_feat_names = copy(feat_names)
    ## first fit a bow/tfidf on the all_text to get
    ## the common vocabulary to ensure query/title/description
    ## has the same length bow/tfidf for computing the similarity
    if vocabulary_type == "common":
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range)
        vec.fit(dfTrain["all_text"])
        vocabulary = vec.vocabulary_
    elif vocabulary_type == "individual":
        vocabulary = None
    for feat_name, column_name in zip(feat_names, column_names):

        ##########################
        ## basic bow/tfidf feat ##
        ##########################
        print "generate %s feat for %s" % (vec_type, column_name)
        if vec_type == "tfidf":
            vec = getTFV(ngram_range=ngram_range, vocabulary=vocabulary)
        elif vec_type == "bow":
            vec = getBOW(ngram_range=ngram_range, vocabulary=vocabulary)
        X_train = vec.fit_transform(dfTrain[column_name])
        X_test = vec.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_train, f, -1)
        with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "wb") as f:
            cPickle.dump(X_test, f, -1)

        if stats_feat_flag:
            #####################################
            ## bow/tfidf cosine sim stats feat ##
            #####################################
            ## get the indices of pooled samples
            # NOTE(review): rebuilt on every loop iteration although dfTrain
            # does not change; the dicts are also read later by the SVD
            # stats sections below (which are guarded by the same flag).
            relevance_indices_dict = get_sample_indices_by_relevance(dfTrain)
            query_relevance_indices_dict = get_sample_indices_by_relevance(dfTrain, "qid")
            ## skip query part
            if column_name in ["product_title", "product_description"]:
                print "generate %s stats feat for %s" % (vec_type, column_name)
                ## train
                cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values, X_train, dfTrain["id"].values, relevance_indices_dict
                )
                cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                    "cosine",
                    X_train,
                    dfTrain["id"].values,
                    X_train,
                    dfTrain["id"].values,
                    query_relevance_indices_dict,
                    dfTrain["qid"].values,
                )
                with open("%s/train.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                with open(
                    "%s/train.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name), "wb"
                ) as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                ## test
                cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                    "cosine", X_train, dfTrain["id"].values, X_test, dfTest["id"].values, relevance_indices_dict
                )
                cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                    "cosine",
                    X_train,
                    dfTrain["id"].values,
                    X_test,
                    dfTest["id"].values,
                    query_relevance_indices_dict,
                    dfTest["qid"].values,
                )
                with open("%s/%s.%s_cosine_sim_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name), "wb") as f:
                    cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                with open(
                    "%s/%s.%s_cosine_sim_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name), "wb"
                ) as f:
                    cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)

                ## update feat names
                new_feat_names.append("%s_cosine_sim_stats_feat_by_relevance" % feat_name)
                new_feat_names.append("%s_cosine_sim_stats_feat_by_query_relevance" % feat_name)

    #####################
    ## cosine sim feat ##
    #####################
    for i in range(len(feat_names) - 1):
        for j in range(i + 1, len(feat_names)):
            print "generate common %s cosine sim feat for %s and %s" % (vec_type, feat_names[i], feat_names[j])
            for mod in ["train", mode]:
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[i]), "rb") as f:
                    target_vec = cPickle.load(f)
                with open("%s/%s.%s.feat.pkl" % (path, mod, feat_names[j]), "rb") as f:
                    obs_vec = cPickle.load(f)
                sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                ## dump feat
                with open(
                    "%s/%s.%s_%s_%s_cosine_sim.feat.pkl" % (path, mod, feat_names[i], feat_names[j], vec_type), "wb"
                ) as f:
                    cPickle.dump(sim, f, -1)
            ## update feat names
            new_feat_names.append("%s_%s_%s_cosine_sim" % (feat_names[i], feat_names[j], vec_type))

    ##################
    ## SVD features ##
    ##################
    ## we fit svd use stacked query/title/description bow/tfidf for further cosine simalirity computation
    for i, feat_name in enumerate(feat_names):
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
            X_vec_train = cPickle.load(f)
        if i == 0:
            X_vec_all_train = X_vec_train
        else:
            X_vec_all_train = vstack([X_vec_all_train, X_vec_train])

    for n_components in svd_n_components:
        svd = TruncatedSVD(n_components=n_components, n_iter=15)
        svd.fit(X_vec_all_train)
        ## load bow/tfidf (for less coding...)
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate common %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            X_svd_train = svd.transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_common_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)

            ## update feat names
            new_feat_names.append("%s_common_svd%d" % (feat_name, n_components))

            if stats_feat_flag:
                #####################################
                ## bow/tfidf-svd cosine sim stats feat ##
                #####################################
                if column_name in ["product_title", "product_description"]:
                    print "generate common %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    ## train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_train,
                        dfTrain["id"].values,
                        relevance_indices_dict,
                    )
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_train,
                        dfTrain["id"].values,
                        query_relevance_indices_dict,
                        dfTrain["qid"].values,
                    )
                    with open(
                        "%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                        % (path, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open(
                        "%s/train.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                        % (path, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    ## test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_test,
                        dfTest["id"].values,
                        relevance_indices_dict,
                    )
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_test,
                        dfTest["id"].values,
                        query_relevance_indices_dict,
                        dfTest["qid"].values,
                    )
                    with open(
                        "%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                        % (path, mode, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open(
                        "%s/%s.%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                        % (path, mode, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)

                    ## update feat names
                    new_feat_names.append(
                        "%s_common_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components)
                    )
                    new_feat_names.append(
                        "%s_common_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components)
                    )

        #####################
        ## cosine sim feat ##
        #####################
        for i in range(len(feat_names) - 1):
            for j in range(i + 1, len(feat_names)):
                print "generate common %s-svd%d cosine sim feat for %s and %s" % (
                    vec_type,
                    n_components,
                    feat_names[i],
                    feat_names[j],
                )
                for mod in ["train", mode]:
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[i], n_components), "rb") as f:
                        target_vec = cPickle.load(f)
                    with open("%s/%s.%s_common_svd%d.feat.pkl" % (path, mod, feat_names[j], n_components), "rb") as f:
                        obs_vec = cPickle.load(f)
                    sim = np.asarray(map(cosine_sim, target_vec, obs_vec))[:, np.newaxis]
                    ## dump feat
                    with open(
                        "%s/%s.%s_%s_%s_common_svd%d_cosine_sim.feat.pkl"
                        % (path, mod, feat_names[i], feat_names[j], vec_type, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(sim, f, -1)
                ## update feat names
                new_feat_names.append(
                    "%s_%s_%s_common_svd%d_cosine_sim" % (feat_names[i], feat_names[j], vec_type, n_components)
                )

        #########################
        ## Individual SVD feat ##
        #########################
        ## generate individual svd feat
        for feat_name, column_name in zip(feat_names, column_names):
            print "generate individual %s-svd%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            svd = TruncatedSVD(n_components=n_components, n_iter=15)
            X_svd_train = svd.fit_transform(X_vec_train)
            X_svd_test = svd.transform(X_vec_test)
            with open("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_train, f, -1)
            with open("%s/%s.%s_individual_svd%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_svd_test, f, -1)
            ## update feat names
            new_feat_names.append("%s_individual_svd%d" % (feat_name, n_components))

            if stats_feat_flag:
                #########################################
                ## bow/tfidf-svd cosine sim stats feat ##
                #########################################
                if column_name in ["product_title", "product_description"]:
                    print "generate individual %s-svd%d stats feat for %s" % (vec_type, n_components, column_name)
                    ## train
                    cosine_sim_stats_feat_by_relevance_train = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_train,
                        dfTrain["id"].values,
                        relevance_indices_dict,
                    )
                    cosine_sim_stats_feat_by_query_relevance_train = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_train,
                        dfTrain["id"].values,
                        query_relevance_indices_dict,
                        dfTrain["qid"].values,
                    )
                    with open(
                        "%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                        % (path, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_train, f, -1)
                    with open(
                        "%s/train.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                        % (path, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_train, f, -1)
                    ## test
                    cosine_sim_stats_feat_by_relevance_test = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_test,
                        dfTest["id"].values,
                        relevance_indices_dict,
                    )
                    cosine_sim_stats_feat_by_query_relevance_test = generate_dist_stats_feat(
                        "cosine",
                        X_svd_train,
                        dfTrain["id"].values,
                        X_svd_test,
                        dfTest["id"].values,
                        query_relevance_indices_dict,
                        dfTest["qid"].values,
                    )
                    with open(
                        "%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_relevance.feat.pkl"
                        % (path, mode, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_relevance_test, f, -1)
                    with open(
                        "%s/%s.%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance.feat.pkl"
                        % (path, mode, feat_name, n_components),
                        "wb",
                    ) as f:
                        cPickle.dump(cosine_sim_stats_feat_by_query_relevance_test, f, -1)

                    ## update feat names
                    new_feat_names.append(
                        "%s_individual_svd%d_cosine_sim_stats_feat_by_relevance" % (feat_name, n_components)
                    )
                    new_feat_names.append(
                        "%s_individual_svd%d_cosine_sim_stats_feat_by_query_relevance" % (feat_name, n_components)
                    )

    """
    #########################
    ## bow/tfidf-tsne feat ##
    #########################
    ## generate t-sne feat
    for n_components in tsne_n_components:
        for feat_name,column_name in zip(feat_names, column_names):
            print "generate individual %s-tsne%d feat for %s" % (vec_type, n_components, column_name)
            with open("%s/train.%s.feat.pkl" % (path, feat_name), "rb") as f:
                X_vec_train = cPickle.load(f)
            with open("%s/%s.%s.feat.pkl" % (path, mode, feat_name), "rb") as f:
                X_vec_test = cPickle.load(f)
            tsne = TSNE(n_components=n_components, init='pca', random_state=2015, metric="cosine")
            X = vstack([X_vec_train, X_vec_test])
            Y = tsne.fit_transform(X)
            num_train = X_vec_train.shape[0]
            X_tsne_train = Y[:num_train]
            X_tsne_test = Y[num_train:]
            with open("%s/train.%s_individual_tsne%d.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_train, f, -1)
            with open("%s/%s.%s_individual_tsne%d.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                cPickle.dump(X_tsne_test, f, -1)

            ##################################################
            ## bow/tfidf-tsne euclidean distance stats feat ##
            ##################################################
            if column_name in ["product_title", "product_description"]:
                print "generate individual %s-tsne%d stats feat for %s" % (vec_type, n_components, column_name)
                ## train
                euclidean_dist_stats_feat_by_relevance_train = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                    X_tsne_train, dfTrain["id"].values,
                                                                    relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_train = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                            X_tsne_train, dfTrain["id"].values,
                                                                            query_relevance_indices_dict, dfTrain["qid"].values)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_train, f, -1)
                with open("%s/train.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_train, f, -1)
                ## test
                euclidean_dist_stats_feat_by_relevance_test = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                    X_tsne_test, dfTest["id"].values,
                                                                    relevance_indices_dict)
                euclidean_dist_stats_feat_by_query_relevance_test = generate_dist_stats_feat("euclidean", X_tsne_train, dfTrain["id"].values,
                                                                            X_tsne_test, dfTest["id"].values,
                                                                            query_relevance_indices_dict, dfTest["qid"].values)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_relevance_test, f, -1)
                with open("%s/%s.%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance.feat.pkl" % (path, mode, feat_name, n_components), "wb") as f:
                    cPickle.dump(euclidean_dist_stats_feat_by_query_relevance_test, f, -1)

                ## update feat names
                new_feat_names.append( "%s_individual_tsne%d_euclidean_dist_stats_feat_by_relevance" % (feat_name, n_components) )
                new_feat_names.append( "%s_individual_tsne%d_euclidean_dist_stats_feat_by_query_relevance" % (feat_name, n_components) )
    """

    return new_feat_names
print "Calculating cosine similiarity..."


def calc_cosine_dist(text_a, text_b, vect):
    """Return the cosine distance between two texts under the fitted
    vectorizer *vect* (expects a .transform method over a list of docs)."""
    row_a = vect.transform([text_a])
    row_b = vect.transform([text_b])
    # pairwise_distances yields a 1x1 matrix for single-row inputs
    return pairwise_distances(row_a, row_b, metric='cosine')[0][0]


def calc_set_intersection(text_a, text_b):
    """Return the fraction of text_a's unique whitespace-split tokens
    that also occur in text_b.

    text_a, text_b -- plain strings; tokens are obtained via str.split()
    Returns a float in [0, 1]; returns 0.0 when text_a has no tokens
    (the previous version raised ZeroDivisionError on empty input).
    """
    a = set(text_a.split())
    b = set(text_b.split())
    if not a:  # guard: empty/whitespace-only text_a would divide by zero
        return 0.0
    return len(a.intersection(b)) * 1.0 / len(a)


# Fit three separate 1-2gram TF-IDF vectorizers on the concatenated
# train+test corpora (getTFV, train_df and test_df are defined elsewhere):
#   tfv_orig -- raw search_term + product_title text
#   tfv_stem -- stemmed query + stemmed title
#   tfv_desc -- stemmed query + stemmed description
start_time = time.time()

tfv_orig = getTFV(ngram_range=(1, 2))
tfv_stem = getTFV(ngram_range=(1, 2))
tfv_desc = getTFV(ngram_range=(1, 2))

tfv_orig.fit(list(train_df['search_term'].values) + list(test_df['search_term'].values) +
             list(train_df['product_title'].values) + list(test_df['product_title'].values))
tfv_stem.fit(list(train_df['query_stem'].values) + list(test_df['query_stem'].values) +
             list(train_df['title_stem'].values) + list(test_df['title_stem'].values))
tfv_desc.fit(list(train_df['query_stem'].values) + list(test_df['query_stem'].values) +
             list(train_df['desc_stem'].values) + list(test_df['desc_stem'].values))

print("Vectorizing data cost--- %s seconds ---" % (time.time() - start_time))

# reset the timer for the next (timed) stage
start_time = time.time()

# for training set
    extract_tfidf_features(df_train)
    extract_tfidf_features(df_test)

    ngram_range = (1, 1)
    svd_n_components = 10
    n_iter = 5

    #################
    ## Re-training ##
    #################
    print("For training and testing...")
    nd_train = []
    nd_test = []
    for i, (feat_name, column_name) in enumerate(zip(feat_names, column_names)):
        print("Generate %s feat" % feat_name)
        tfv = nlp_utils.getTFV(ngram_range=ngram_range)

        X_tfidf_train = tfv.fit_transform(df_train[column_name])
        print(type(X_tfidf_train))
        print("X_tfidf_train shape: {0}".format(X_tfidf_train.shape))

        X_tfidf_test = tfv.transform(df_test[column_name])
        print(type(X_tfidf_test))
        print("X_tfidf_test shape: {0}".format(X_tfidf_test.shape))

        ## svd
        svd = TruncatedSVD(n_components=svd_n_components, n_iter=n_iter)

        X_svd_train = svd.fit_transform(X_tfidf_train)
        print(type(X_svd_train))
        print("X_svd_train shape: {0}".format(X_svd_train.shape))