コード例 #1
0
ファイル: feature_match.py プロジェクト: qianteng/Quora_HD
def main():
    """Generate match-based pairwise features for the question pair,
    in both directions (q1-vs-q2 and q2-vs-q1)."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_match_%s.log" % time_utils._timestamp())
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        MatchQueryCount,
        MatchQueryRatio,
        LongestMatchSize,
        LongestMatchRatio,
    ]
    # symmetric field pairs: question1 in question2, then the reverse
    field_pairs = [
        (['question1'], ['question2']),
        (['question2'], ['question1']),
    ]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            # these generators take no extra parameters
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, [],
                                        config.FEAT_DIR, logger)
            pf.go()
コード例 #2
0
def main():
    """Generate intersect-position pairwise features, query-vs-document
    and document-vs-query."""
    logname = "generate_feature_intersect_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [IntersectPosition_Ngram, IntersectNormPosition_Ngram]
    query_fields = ["search_term", "search_term_product_name",
                    "search_term_alt", "search_term_auto_corrected"]
    # only the second document-side field is used (slice [1:2])
    doc_fields = ["product_title", "product_title_product_name",
                  "product_description", "product_attribute",
                  "product_brand", "product_color"][1:2]
    ## query in document, then document in query
    field_pairs = [(query_fields, doc_fields), (doc_fields, query_fields)]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            for ngram in (1, 2, 3):
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields,
                                            [ngram, aggregation_mode],
                                            config.FEAT_DIR, logger)
                pf.go()
def run_ngram_jaccard():
    """Generate Jaccard/Dice n-gram distance features (query vs. document)."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp())
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # only the first two query-side fields are used (slice [:2])
    obs_fields = ["search_term", "search_term_product_name"]
    target_fields = [
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color"
    ]
    for generator in (JaccardCoef_Ngram, DiceDistance_Ngram):
        for ngram in (1, 2, 3):
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, [ngram],
                                        config.FEAT_DIR, logger)
            pf.go()
コード例 #4
0
def run_count():
    """Generate first/last n-gram intersect count and ratio features for
    the question pair, in both directions."""
    logname = ("generate_feature_first_last_ngram_count_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]

    # question1 in question2, then the reverse
    field_pairs = [
        (["question1"], ["question2"]),
        (["question2"], ["question1"]),
    ]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            for ngram in (1, 2, 3):
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, [ngram],
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #5
0
def main():
    """Generate doc2vec pairwise features (cosine similarity / RMSE).

    NOTE: uses data BEFORE stemming (lemmatized only), matching the text
    the doc2vec model directory naming suggests it was trained on.
    """
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" %
        (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Homedepot")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs,
                                               model_prefixes):
        ## load model; on any failure, skip this model (best-effort)
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            # BUGFIX: this was a separate `if`, so a ".bin" path fell
            # through to the `else` branch below and was reloaded,
            # clobbering the binary-format model just loaded.
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir +
                                                           ".sent_label")
        except Exception:
            # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; behavior otherwise unchanged
            continue

        ## pairwise features
        # NOTE(review): doc2vec_model_sent_label is bound only in the
        # `else` branch above — a ".bin"/".txt" model would raise
        # NameError below; confirm those formats are never configured.
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            # Doc2Vec_Vdiff,
        ]
        obs_fields_list = [["search_term"]]  # was [... , "search_term_alt"][:1]
        target_fields_list = [[
            "product_title", "product_description", "product_attribute",
            "product_brand", "product_color"
        ]]
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [
                    doc2vec_model, doc2vec_model_sent_label, model_prefix
                ]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #6
0
def main():
    """Generate intersect-position features for the question pair,
    in both directions."""
    logname = "generate_feature_intersect_position_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [IntersectPosition_Ngram, IntersectNormPosition_Ngram]
    ## question1 in question2, then the reverse
    field_pairs = [
        (["question1"], ["question2"]),
        (["question2"], ["question1"]),
    ]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            for ngram in (1, 2, 3):
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields,
                                            [ngram, aggregation_mode],
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #7
0
def run_position():
    """Generate first/last intersect position features for the question
    pair, in both directions, over all n-gram settings."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_first_last_ngram_position_%s.log"
        % time_utils._timestamp())
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]

    field_pairs = [
        (["question1"], ["question2"]),
        (["question2"], ["question1"]),
    ]
    ngrams = [1, 2, 3, 12, 123]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            for ngram in ngrams:
                pf = PairwiseFeatureWrapper(
                    generator, dfAll, obs_fields, target_fields,
                    [ngram, aggregation_mode], config.FEAT_DIR, logger)
                pf.go()
コード例 #8
0
def run_tsne_lsa_ngram():
    """Generate t-SNE-over-LSA n-gram features: standalone per-field
    features, then pairwise question1-vs-question2 features."""
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## standalone: word 3-gram and char 4-gram
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # BUGFIX: removed dead assignment `ngrams_list = [[1,2,3], [2,3,4,5]]`
    # that was immediately overwritten by the line below.
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR,
                                          logger, force_corr=True)
            sf.go()

    ## pairwise: question1 vs question2
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = [["question1"]]
    target_fields_list = [["question2"]]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
コード例 #9
0
def run_lsa_ngram_pair():
    """Symmetric in obs and target"""
    logname = ("generate_feature_lsa_ngram_pair_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## question1 in question2 (symmetric, so one direction suffices)
    field_pairs = [(['question1'], ['question2'])]
    for obs_fields, target_fields in field_pairs:
        for ngram in (1, 2, 3):
            for generator in (LSA_Word_Ngram_Pair,):
                pf = PairwiseFeatureWrapper(
                    generator, dfAll, obs_fields, target_fields,
                    [ngram, config.SVD_DIM, config.SVD_N_ITER],
                    config.FEAT_DIR, logger, force_corr=True)
                pf.go()
                # release the wrapper promptly; these features can be large
                del pf
                gc.collect()
コード例 #10
0
def run_tsne_lsa_ngram():
    """Generate t-SNE-over-LSA n-gram features: standalone per-field
    features, then pairwise query-vs-document features."""
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # this column is not used by these generators
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    ## standalone: word 1/2/3-grams and char 4-gram
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # BUGFIX: removed dead assignment `ngrams_list = [[1,2,3], [2,3,4,5]]`
    # that was immediately overwritten by the line below.
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields = ["search_term", "search_term_alt",
                  "search_term_auto_corrected", "product_title",
                  "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR,
                                          logger, force_corr=True)
            sf.go()

    ## pairwise: query fields vs. document fields
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = [["search_term", "search_term_alt",
                        "search_term_auto_corrected"]]
    target_fields_list = [["product_title", "product_description"]]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
コード例 #11
0
def run_lsa_ngram_cooc():
    """Generate LSA word n-gram co-occurrence features
    (search_term vs. product_title)."""
    logname = ("generate_feature_lsa_ngram_cooc_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # this column is not used by these generators
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    # only the first query-side and doc-side fields are used (slice [:1])
    obs_fields = ["search_term"]
    target_fields = ["product_title"]
    for obs_ngram in (1, 2):
        for target_ngram in (1, 2):
            for generator in (LSA_Word_Ngram_Cooc,):
                param_list = [obs_ngram, target_ngram,
                              config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #12
0
def run_tfidf_ngram_cosinesim():
    """Symmetric in obs and target"""
    logname = ("generate_feature_tfidf_ngram_cosinesim_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # word-level 2/3-grams and char-level 4-gram
    plan = [(TFIDF_Word_Ngram_CosineSim, [2, 3]),
            (TFIDF_Char_Ngram_CosineSim, [4])]
    obs_fields = ['question1']
    target_fields = ['question2']
    for generator, ngrams in plan:
        for ngram in ngrams:
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, [ngram],
                                        config.FEAT_DIR, logger,
                                        force_corr=True)
            pf.go()
            # release the wrapper promptly; tf-idf matrices can be large
            del pf
            gc.collect()
コード例 #13
0
def run_count():
    """Generate first/last n-gram intersect count and ratio features,
    query-vs-document and document-vs-query."""
    logname = ("generate_feature_first_last_ngram_count_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]

    query_fields = ["search_term", "search_term_product_name",
                    "search_term_alt", "search_term_auto_corrected"]
    doc_fields = ["product_title", "product_title_product_name",
                  "product_description", "product_attribute",
                  "product_brand", "product_color"]
    ## query in document, then document in query
    field_pairs = [(query_fields, doc_fields), (doc_fields, query_fields)]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            for ngram in (1, 2, 3):
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, [ngram],
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #14
0
def main():
    """Generate WordNet path-similarity features for the question pair."""
    logname = ("generate_feature_wordnet_similarity_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in
    # final submission; only the path similarity is kept (slice [:1]).
    generators = [WordNet_Path_Similarity]
    # only search_term and product_title are used in final submission
    field_pairs = [(["question1"], ["question2"])]
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in field_pairs:
        for generator in generators:
            pf = PairwiseFeatureWrapper(
                generator, dfAll, obs_fields, target_fields,
                [aggregation_mode_prev, aggregation_mode],
                config.FEAT_DIR, logger)
            pf.go()
コード例 #15
0
def run_compression_distance():
    """Generate compression-distance features (query vs. document)."""
    logname = ("generate_feature_compression_distance_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields = ["search_term", "search_term_product_name",
                  "search_term_alt", "search_term_auto_corrected"]
    target_fields = ["product_title", "product_title_product_name",
                     "product_description", "product_attribute",
                     "product_brand", "product_color"]
    # CompressionDistance takes no extra parameters
    pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields,
                                target_fields, [], config.FEAT_DIR, logger)
    pf.go()
def run_compression_distance():
    """Generate compression-distance features, plain and n-gram variants.

    BUGFIX: `ngrams`, `aggregation_mode_prev` and `aggregation_mode` were
    referenced in the n-gram loop but never defined in this function,
    which raised NameError at runtime.  They are defined below with the
    values used by the sibling feature scripts in this file —
    TODO(review): confirm against the original project source.
    """
    logname = ("generate_feature_compression_distance_%s.log"
               % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    # only the first two query-side fields are used (slice [:2])
    obs_fields_list.append(["search_term", "search_term_product_name"])
    target_fields_list.append([
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color"
    ])
    ngrams = [1, 2, 3]
    # double aggregation for the n-gram variant
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        ## plain compression distance
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields,
                                    target_fields, [], config.FEAT_DIR,
                                    logger)
        pf.go()
        ## n-gram compression distance with double aggregation
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll,
                                        obs_fields, target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
コード例 #17
0
ファイル: feature_distance.py プロジェクト: ebernhardson/l2r
def run_ngram_jaccard():
    """Generate Jaccard/Dice n-gram features against single-valued fields,
    then against multi-valued fields with per-value aggregation."""
    logger = logging_utils._get_logger(config.LOG_DIR,
                                       "generate_feature_ngram_jaccard.log")
    dfAll = table_utils._read(config.ALL_DATA)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    ngrams = [1, 2, 3]
    dedup = True

    # single valued fields
    obs_fields = ["query", "norm_query"]
    target_fields = ["hit_title", "opening_text"]
    for generator in generators:
        for ngram in ngrams:
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, [ngram],
                                        config.FEAT_DIR, logger, dedup)
            pf.go()

    # multi-valued fields: scores aggregated across each field's values
    target_fields = ["category", "template", "heading",
                     "outgoing_link", "external_link", "redirect.title",
                     "auxiliary_text"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for generator in generators:
        multi_gen = MultiTargetEstimatorWrapper(generator)
        for ngram in ngrams:
            pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields,
                                        target_fields,
                                        [ngram, aggregation_mode],
                                        config.FEAT_DIR, logger, dedup)
            pf.go()
コード例 #18
0
def run_char_dist_sim():
    """Generate character-distribution similarity features for the
    question pair."""
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [CharDistribution_Ratio, CharDistribution_CosineSim,
                  CharDistribution_KL]
    for obs_fields, target_fields in [(["question1"], ["question2"])]:
        for generator in generators:
            # these generators take no extra parameters
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, [],
                                        config.FEAT_DIR, logger)
            pf.go()
コード例 #19
0
ファイル: feature_doc2vec.py プロジェクト: qianteng/Quora_HD
def main():
    """Generate doc2vec pairwise features for the question pair
    (cosine similarity, RMSE, vector difference)."""
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Quora dataset: question1/question2
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Quora-doc2vec-D%d-min_count%d.model" %
        (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Quora")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs,
                                               model_prefixes):
        ## load model; on any failure, skip this model (best-effort)
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            # BUGFIX: this was a separate `if`, so a ".bin" path fell
            # through to the `else` branch below and was reloaded,
            # clobbering the binary-format model just loaded.
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir +
                                                           ".sent_label")
        except Exception:
            # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; behavior otherwise unchanged
            continue

        ## pairwise features
        # NOTE(review): doc2vec_model_sent_label is bound only in the
        # `else` branch above — a ".bin"/".txt" model would raise
        # NameError below; confirm those formats are never configured.
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = [["question1"]]
        target_fields_list = [["question2"]]
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [
                    doc2vec_model, doc2vec_model_sent_label, model_prefix
                ]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #20
0
ファイル: feature_distance.py プロジェクト: qianteng/Quora_HD
def run_ngram_jaccard():
    """Symmetric in obs and target. """
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    for obs_fields, target_fields in [(["question1"], ["question2"])]:
        for generator in (JaccardCoef_Ngram, DiceDistance_Ngram):
            for ngram in (1, 2, 3):
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, [ngram],
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #21
0
def run_char_dist_sim():
    """Generate character-distribution similarity features
    (query fields vs. document fields)."""
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # this column is not used by these generators
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [CharDistribution_Ratio, CharDistribution_CosineSim,
                  CharDistribution_KL]
    obs_fields = ["search_term", "search_term_alt",
                  "search_term_auto_corrected"]
    target_fields = ["product_title", "product_description",
                     "product_attribute"]
    for generator in generators:
        # these generators take no extra parameters
        pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                    target_fields, [], config.FEAT_DIR,
                                    logger)
        pf.go()
コード例 #22
0
def run_lsa_ngram_pair():
    """Generate paired LSA word n-gram features for the question pair."""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields = ["question1"]
    target_fields = ["question2"]
    for ngram in (1, 2):
        for generator in (LSA_Word_Ngram_Pair,):
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
コード例 #23
0
def main():
    """Generate doc2vec pairwise features (cosine similarity, RMSE,
    vector difference) for query vs. document fields."""
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"
        % (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Homedepot")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs,
                                               model_prefixes):
        ## load model; on any failure, skip this model (best-effort)
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            # BUGFIX: this was a separate `if`, so a ".bin" path fell
            # through to the `else` branch below and was reloaded,
            # clobbering the binary-format model just loaded.
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(
                    doc2vec_model_dir + ".sent_label")
        except Exception:
            # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; behavior otherwise unchanged
            continue

        ## pairwise features
        # NOTE(review): doc2vec_model_sent_label is bound only in the
        # `else` branch above — a ".bin"/".txt" model would raise
        # NameError below; confirm those formats are never configured.
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = [["search_term", "search_term_alt"]]
        target_fields_list = [["product_title", "product_description",
                               "product_attribute", "product_brand",
                               "product_color"]]
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label,
                              model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #24
0
def run_ngram_jaccard():
    """Generate Jaccard/Dice n-gram features (query vs. document)."""
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # only the first two query-side fields are used (slice [:2])
    obs_fields = ["search_term", "search_term_product_name"]
    target_fields = ["product_title", "product_title_product_name",
                     "product_description", "product_attribute",
                     "product_brand", "product_color"]
    for generator in (JaccardCoef_Ngram, DiceDistance_Ngram):
        for ngram in (1, 2, 3):
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, [ngram],
                                        config.FEAT_DIR, logger)
            pf.go()
コード例 #25
0
def run_tfidf_ngram_cosinesim():
    """Generate TF-IDF n-gram cosine similarity features between
    question1 and question2.

    Uses word 3-grams and char 4-grams only; the full grids would be
    [1, 2, 3] / [2, 3, 4, 5] but were trimmed, presumably for runtime.
    """
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # fix: removed the dead first assignment that was immediately overwritten
    ngrams_list = [[3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["question1"] )
    target_fields_list.append( ["question2"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        # each generator is paired with its own n-gram grid
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
コード例 #26
0
def main(which):
    """Generate co-occurrence statistic features for the feature type named
    by `which` ("tf" | "norm_tf" | "tfidf" | "norm_tfidf" | "bm25").

    Runs both query-in-document and document-in-query directions, then a
    second pass over the product-name field pair.
    """
    log_file = "generate_feature_stat_cooc_tfidf_%s_%s.log" % (which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, log_file)
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # dispatch table instead of if/elif chain; unknown names yield no work,
    # matching the original behavior
    generator_by_name = {
        "tf": StatCoocTF_Ngram,
        "norm_tf": StatCoocNormTF_Ngram,
        "tfidf": StatCoocTFIDF_Ngram,
        "norm_tfidf": StatCoocNormTFIDF_Ngram,
        "bm25": StatCoocBM25_Ngram,
    }
    generators = [generator_by_name[which]] if which in generator_by_name else []

    agg_modes = ["mean", "std", "max", "min", "median"]
    ngram_sizes = [1, 2, 3, 12, 123][:3]
    query_side = ["search_term", "search_term_alt", "search_term_auto_corrected"][:1]
    doc_side = ["product_title", "product_title_product_name", "product_description",
                "product_attribute", "product_brand", "product_color"]
    # query-in-document first, then document-in-query
    for obs_fields, target_fields in ((query_side, doc_side), (doc_side, query_side)):
        for generator in generators:
            for n in ngram_sizes:
                PairwiseFeatureWrapper(generator, df, obs_fields, target_fields,
                                       [n, agg_modes], config.FEAT_DIR, logger).go()

    # product-name pair: 1/2-grams only; since product_name is of length 2,
    # the bigram case has a single item and all aggregations coincide,
    # so "mean" alone suffices there
    for generator in generators:
        for n in (1, 2):
            params = [n, "mean"] if n == 2 else [n, agg_modes]
            PairwiseFeatureWrapper(generator, df, ["search_term_product_name"],
                                   ["product_title_product_name"],
                                   params, config.FEAT_DIR, logger).go()
コード例 #27
0
def run_tsne_lsa_ngram():
    """Generate t-SNE-over-LSA features.

    Standalone pass: embed each text field (word 3-grams / char 4-grams).
    Pairwise pass: LSA pair embeddings of (query, title/description) for
    word 1- and 2-grams.
    """
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # drop the raw attribute-list column; it is not plain text
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # fix: removed the dead first assignment ([[1, 2, 3], [2, 3, 4, 5]])
    # that was immediately overwritten
    ngrams_list = [[3], [4]]
    obs_fields = ["search_term", "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator,
                                          dfAll,
                                          obs_fields,
                                          param_list,
                                          config.FEAT_DIR,
                                          logger,
                                          force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = [["search_term", "search_term_alt", "search_term_auto_corrected"][:1]]
    target_fields_list = [["product_title", "product_description"]]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator,
                                            dfAll,
                                            obs_fields,
                                            target_fields,
                                            param_list,
                                            config.FEAT_DIR,
                                            logger,
                                            force_corr=True)
                pf.go()
コード例 #28
0
def main():
    """Generate group-relevance n-gram Jaccard distance features, both per
    CV fold (written to Run<i>/ directories) and on the full training set
    (written to All/)."""
    logname = "generate_feature_group_distance_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    n_iter = len(split)

    # coarse relevance grid; the complete grid would additionally include
    # the quarter levels (1.25, 1.5, 1.75, 2.25, 2.5, 2.75)
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv: fit the grouping on each fold's training split only
    for i in range(n_iter):
        trainInd = split[i][0]  # validation indices of the fold are not needed here
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [
                        dfAll["id"], dfTrain2, target_field, relevance, ngram,
                        aggregation_mode
                    ]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard,
                                                dfAll, obs_fields,
                                                [target_field], param_list,
                                                sub_feature_dir, logger)
                    pf.go()

    ## for all: fit the grouping on the entire training set
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [
                    dfAll["id"], dfTrain, target_field, relevance, ngram,
                    aggregation_mode
                ]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard,
                                            dfAll, obs_fields, [target_field],
                                            param_list, sub_feature_dir,
                                            logger)
                pf.go()
コード例 #29
0
def run_position():
    """Generate first/last intersect position features (raw and normalized)
    for query/document field pairs, in both directions."""
    log_name = ("generate_feature_first_last_ngram_position_%s.log"
                % time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, log_name)
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    feature_classes = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]

    # only the first two query-side fields are used ([:2])
    query_side = ["search_term", "search_term_product_name", "search_term_alt",
                  "search_term_auto_corrected"][:2]
    doc_side = ["product_title", "product_title_product_name", "product_description",
                "product_attribute", "product_brand", "product_color"]
    ngram_sizes = [1, 2, 3, 12, 123][:3]
    agg_modes = ["mean", "std", "max", "min", "median"]
    # query-in-document first, then document-in-query
    for obs_fields, target_fields in ((query_side, doc_side), (doc_side, query_side)):
        for feature_cls in feature_classes:
            for n in ngram_sizes:
                PairwiseFeatureWrapper(feature_cls, df, obs_fields, target_fields,
                                       [n, agg_modes], config.FEAT_DIR, logger).go()
コード例 #30
0
def run_tfidf_ngram_cosinesim():
    """Generate TF-IDF n-gram cosine similarity features for the query-side
    fields vs title/description/attribute.

    Word 1/2/3-grams and char 4-grams; the full char grid would be
    [2, 3, 4, 5] but was trimmed, presumably for runtime.
    """
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # drop the raw attribute-list column; it is not plain text
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # fix: removed the dead first assignment ([[1,2,3], [2,3,4,5]])
    # that was immediately overwritten
    ngrams_list = [[1,2,3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        # each generator is paired with its own n-gram grid
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
コード例 #31
0
def run_edit_distance():
    """Generate edit-distance features: whole-string EditDistance plus
    doubly-aggregated EditDistance_Ngram for 1/2/3-grams."""
    logname = "generate_feature_edit_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    # only search_term_product_name on the query side ([1:2])
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][1:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    # double aggregation: first within each pair, then across pairs
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        # fix: the wrapper was constructed but .go() was never called, so the
        # whole-string EditDistance feature was silently never generated
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
コード例 #32
0
def run_lsa_ngram_cooc():
    """Generate LSA word n-gram co-occurrence features for every combination
    of observation/target n-gram sizes (1 and 2)."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # drop the raw attribute-list column; it is not plain text
    df.drop(["product_attribute_list"], inplace=True, axis=1)

    query_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    doc_fields = ["product_title", "product_description"]
    for obs_n in (1, 2):
        for target_n in (1, 2):
            for feature_cls in (LSA_Word_Ngram_Cooc,):
                params = [obs_n, target_n, config.SVD_DIM, config.SVD_N_ITER]
                PairwiseFeatureWrapper(feature_cls, df, query_fields, doc_fields,
                                       params, config.FEAT_DIR, logger).go()
コード例 #33
0
def main():
    """Generate intersect / co-occurrence count and ratio n-gram features,
    in both query-in-document and document-in-query directions."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_intersect_count_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    feature_classes = [
        IntersectCount_Ngram,
        IntersectRatio_Ngram,
        CooccurrenceCount_Ngram,
        CooccurrenceRatio_Ngram,
    ]
    # only the first two query-side fields are used ([:2])
    query_side = ["search_term", "search_term_product_name", "search_term_alt",
                  "search_term_auto_corrected"][:2]
    doc_side = ["product_title", "product_title_product_name", "product_description",
                "product_attribute", "product_brand", "product_color"]
    ngram_sizes = [1, 2, 3, 12, 123][:3]
    # query-in-document first, then document-in-query
    for obs_fields, target_fields in ((query_side, doc_side), (doc_side, query_side)):
        for feature_cls in feature_classes:
            for n in ngram_sizes:
                PairwiseFeatureWrapper(feature_cls, df, obs_fields, target_fields,
                                       [n], config.FEAT_DIR, logger).go()
コード例 #34
0
def main():
    """Generate basic standalone and pairwise features over the
    Quora question-pair data."""
    logger = logging_utils._get_logger(
        config.LOG_DIR, "generate_feature_basic_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    question_fields = ["question1", "question2"]

    ## per-question basic statistics (DocIdOneHot not used)
    for feature_cls in (DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio):
        StandaloneFeatureWrapper(feature_cls, df, question_fields, [],
                                 config.FEAT_DIR, logger).go()

    ## echo of the row id
    for feature_cls in (DocIdEcho,):
        StandaloneFeatureWrapper(feature_cls, df, ["id"], [],
                                 config.FEAT_DIR, logger).go()

    ## qid-based pairwise features
    for obs_fields, target_fields in ((["qid1"], ["qid2"]),):
        for feature_cls in (MaxValue, DiffValue):
            PairwiseFeatureWrapper(feature_cls, df, obs_fields, target_fields,
                                   [], config.FEAT_DIR, logger).go()

    ## length ratio between the two questions
    for obs_fields, target_fields in ((["question1"], ["question2"]),):
        for feature_cls in (DocLenRatio,):
            PairwiseFeatureWrapper(feature_cls, df, obs_fields, target_fields,
                                   [], config.FEAT_DIR, logger).go()

    ## unique n-gram counts / ratios per question
    for feature_cls in (UniqueCount_Ngram, UniqueRatio_Ngram):
        for n in (1, 2, 3, 4, 5, 12, 123):
            StandaloneFeatureWrapper(feature_cls, df, question_fields, [n],
                                     config.FEAT_DIR, logger).go()
コード例 #35
0
def run_char_dist_sim():
    """Generate character-distribution similarity features
    (ratio, cosine similarity, KL divergence)."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # drop the raw attribute-list column; it is not plain text
    df.drop(["product_attribute_list"], inplace=True, axis=1)

    # only search_term on the query side ([:1])
    query_side = ["search_term", "search_term_alt", "search_term_auto_corrected"][:1]
    doc_side = ["product_title", "product_description", "product_attribute"]
    for feature_cls in (CharDistribution_Ratio, CharDistribution_CosineSim,
                        CharDistribution_KL):
        PairwiseFeatureWrapper(feature_cls, df, query_side, doc_side, [],
                               config.FEAT_DIR, logger).go()
コード例 #36
0
def main():
    """Generate TF-IDF cosine-similarity features from Elasticsearch
    term vectors."""
    logger = logging_utils._get_logger(config.LOG_DIR, "generate_feature_tfidf.log")

    query_fields = ['query', 'norm_query']
    termvec_fields = [field + '_termvec' for field in config.ES_TERM_FIELDS]

    dfAll = table_utils._read(config.ALL_DATA)
    # shelves opened read-only; kept open for the duration of the run
    docs = table_utils._open_shelve_read(config.ES_PAGE_TERM_VEC_SHELVE)
    queries = table_utils._open_shelve_read(config.ES_QUERY_TERM_VEC_SHELVE)

    dedup = True
    for generator in (ES_TFIDF_Unigram_TopN_CosineSim,):
        for target_field in termvec_fields:
            obs_termvec = [field + '_' + target_field for field in query_fields]
            # TODO: why iterate obs_fields instead of passing all at once?
            PairwiseFeatureWrapper(generator, dfAll, obs_termvec, [target_field],
                                   [], config.FEAT_DIR, logger, dedup).go()
コード例 #37
0
def main():
    """Generate WordNet path / LCH / WUP similarity features.

    NOTE: runs on lemmatized data BEFORE STEMMING, since WordNet lookups
    need real word forms.
    """
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    feature_classes = (
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    )
    query_fields = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    doc_fields = ["product_title", "product_description", "product_attribute"]
    # double aggregation: first within each pair, then across pairs
    inner_agg = ["mean", "max", "min", "median"]
    outer_agg = ["mean", "std", "max", "min", "median"]
    for feature_cls in feature_classes:
        PairwiseFeatureWrapper(feature_cls, df, query_fields, doc_fields,
                               [inner_agg, outer_agg], config.FEAT_DIR, logger).go()
コード例 #38
0
def main():
    """Generate match features: query-in-document matches over the text
    fields, then matches against the structured product_attribute_list."""
    logger = logging_utils._get_logger(
        config.LOG_DIR, "generate_feature_match_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # only the first two query-side fields are used ([:2])
    query_side = ["search_term", "search_term_product_name", "search_term_alt",
                  "search_term_auto_corrected"][:2]
    doc_side = ["product_title", "product_title_product_name", "product_description",
                "product_attribute", "product_brand", "product_color"]
    for feature_cls in (MatchQueryCount, MatchQueryRatio,
                        LongestMatchSize, LongestMatchRatio):
        PairwiseFeatureWrapper(feature_cls, df, query_side, doc_side, [],
                               config.FEAT_DIR, logger).go()

    # product_attribute_list based matches (search_term only, [:1])
    attr_query = ["search_term", "search_term_alt", "search_term_auto_corrected"][:1]
    for feature_cls in (MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch):
        PairwiseFeatureWrapper(feature_cls, df, attr_query,
                               ["product_attribute_list"], [],
                               config.FEAT_DIR, logger).go()
コード例 #39
0
def main(which):
    """Generate the co-occurrence statistic feature named by `which`
    ("tf" | "norm_tf" | "tfidf" | "norm_tfidf" | "bm25") for the question
    pair, in both directions."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_stat_cooc_tfidf_%s_%s.log" % (which, time_utils._timestamp()))
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # dispatch table instead of if/elif chain; unknown names yield no work,
    # matching the original behavior
    dispatch = {
        "tf": StatCoocTF_Ngram,
        "norm_tf": StatCoocNormTF_Ngram,
        "tfidf": StatCoocTFIDF_Ngram,
        "norm_tfidf": StatCoocNormTFIDF_Ngram,
        "bm25": StatCoocBM25_Ngram,
    }
    generators = [dispatch[which]] if which in dispatch else []

    ngram_sizes = [1, 2, 3, 12, 123][:3]
    agg_modes = ["mean", "std", "max", "min", "median"]
    # question1 vs question2, then the reverse direction
    for obs_fields, target_fields in ((["question1"], ["question2"]),
                                      (["question2"], ["question1"])):
        for generator in generators:
            for n in ngram_sizes:
                PairwiseFeatureWrapper(generator, df, obs_fields, target_fields,
                                       [n, agg_modes], config.FEAT_DIR, logger).go()
コード例 #40
0
def run_tfidf_ngram_cosinesim():
    """Generate TF-IDF n-gram cosine similarity features for search_term vs
    title/description/attribute.

    Word 3-grams and char 4-grams only; the full grids would be
    [1, 2, 3] / [2, 3, 4, 5] but were trimmed, presumably for runtime.
    """
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # drop the raw attribute-list column; it is not plain text
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # fix: removed the dead first assignment ([[1, 2, 3], [2, 3, 4, 5]])
    # that was immediately overwritten
    ngrams_list = [[3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(
        ["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
    target_fields_list.append(
        ["product_title", "product_description", "product_attribute"])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        # each generator is paired with its own n-gram grid
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
コード例 #41
0
def run_compression_distance():
    """Generate compression-distance features between question1 and
    question2: the whole-string CompressionDistance plus the
    doubly-aggregated n-gram variant."""
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # fix: these were referenced below but never defined (NameError at
    # runtime); values mirror the other aggregated n-gram distance runners
    # in this project (e.g. run_edit_distance)
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["question1"] )
    target_fields_list.append( ["question2"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
コード例 #42
0
def main(which):
    """Generate co-occurrence statistic features for every feature type in
    `which` (a comma-separated subset of tf,norm_tf,tfidf,norm_tfidf,bm25).

    Runs both query-in-document and document-in-query directions, then a
    second pass over the product-name field pair.
    """
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_stat_cooc_tfidf_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # dispatch table instead of if/elif chain; unknown names are skipped,
    # matching the original behavior, and order of `which` is preserved
    dispatch = {
        "tf": StatCoocTF_Ngram,
        "norm_tf": StatCoocNormTF_Ngram,
        "tfidf": StatCoocTFIDF_Ngram,
        "norm_tfidf": StatCoocNormTFIDF_Ngram,
        "bm25": StatCoocBM25_Ngram,
    }
    generators = [dispatch[w] for w in which.split(",") if w in dispatch]

    agg_modes = ["mean", "std", "max", "min", "median"]
    query_side = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    doc_side = ["product_title", "product_title_product_name", "product_description",
                "product_attribute", "product_brand", "product_color"]
    ngram_sizes = [1, 2, 3, 12, 123][:3]
    # query-in-document first, then document-in-query
    for obs_fields, target_fields in ((query_side, doc_side), (doc_side, query_side)):
        for generator in generators:
            for n in ngram_sizes:
                PairwiseFeatureWrapper(generator, df, obs_fields, target_fields,
                                       [n, agg_modes], config.FEAT_DIR, logger).go()

    # product-name pair: 1/2-grams only; since product_name is of length 2,
    # the bigram case has a single item and all aggregations coincide,
    # so "mean" alone suffices there
    for generator in generators:
        for n in (1, 2):
            params = [n, "mean"] if n == 2 else [n, agg_modes]
            PairwiseFeatureWrapper(generator, df, ["search_term_product_name"],
                                   ["product_title_product_name"],
                                   params, config.FEAT_DIR, logger).go()
コード例 #43
0
def main():
    """Generate group-relevance n-gram Jaccard distance features, both per
    CV fold (written to Run<i>/ directories) and on the full training set
    (written to All/)."""
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    # coarse relevance grid; the complete grid would additionally include
    # the quarter levels (1.25, 1.5, 1.75, 2.25, 2.5, 2.75)
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv: fit the grouping on each fold's training split only
    for i in range(n_iter):
        trainInd = split[i][0]  # validation indices of the fold are not needed here
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for all: fit the grouping on the entire training set
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go()
コード例 #44
0
def main():
    """Generate match features over all query-side fields, then matches
    against the structured product_attribute_list."""
    logger = logging_utils._get_logger(
        config.LOG_DIR, "generate_feature_match_%s.log" % time_utils._timestamp())
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    query_side = ["search_term", "search_term_product_name", "search_term_alt",
                  "search_term_auto_corrected"]
    doc_side = ["product_title", "product_title_product_name", "product_description",
                "product_attribute", "product_brand", "product_color"]
    for feature_cls in (MatchQueryCount, MatchQueryRatio,
                        LongestMatchSize, LongestMatchRatio):
        PairwiseFeatureWrapper(feature_cls, df, query_side, doc_side, [],
                               config.FEAT_DIR, logger).go()

    # product_attribute_list based matches
    attr_query = ["search_term", "search_term_alt", "search_term_auto_corrected"]
    for feature_cls in (MatchAttrCount, MatchAttrRatio, IsIndoorOutdoorMatch):
        PairwiseFeatureWrapper(feature_cls, df, attr_query,
                               ["product_attribute_list"], [],
                               config.FEAT_DIR, logger).go()
コード例 #45
0
def main(which):
    """Generate word2vec-based pairwise features.

    ``which`` selects the embedding source: ``"homedepot"`` (self-trained),
    ``"wikipedia"`` (GloVe converted to word2vec text format), or
    ``"google"`` (GoogleNews binary vectors). Any other value results in an
    empty model list, so the function is a no-op.
    """
    logname = "generate_feature_word2vec_%s_%s.log"%(which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING (pretrained embeddings were built on unstemmed text)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    # Parallel lists: path to the model file and the prefix used in feature names.
    word2vec_model_dirs = []
    model_prefixes = []
    if which == "homedepot":
        ## word2vec model trained with Homedepot dataset: brand/color/query/title/description
        word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
        model_prefixes.append( "Homedepot" )
    elif which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append( config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt" )
        model_prefixes.append( "Wikipedia" )
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin" )
        model_prefixes.append( "GoogleNews" )

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes):
        ## load model
        # File extension decides the loader: .bin = binary word2vec format,
        # .txt = text word2vec format, anything else = native gensim model.
        # NOTE(review): Word2Vec.load_word2vec_format was removed in gensim 1.0;
        # modern gensim uses KeyedVectors.load_word2vec_format — confirm the
        # pinned gensim version before upgrading.
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir)
        # NOTE(review): bare `except:` silently skips a model on ANY failure
        # (including KeyboardInterrupt); consider `except Exception` + logging.
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "product_title", "product_description"]
        # generator = Word2Vec_Centroid_Vector
        # param_list = [word2vec_model, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise similarity/importance features: query field vs document fields
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity, 
            Word2Vec_N_Similarity_Imp, 
            Word2Vec_Centroid_RMSE, 
            Word2Vec_Centroid_RMSE_IMP,
            # # not used in final submission
            # Word2Vec_Centroid_Vdiff, 
        ]
        obs_fields_list = []
        target_fields_list = []
        # [:1] keeps only "search_term" — the alt/auto-corrected variants are skipped here
        obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation: first aggregate per-token sims with *_prev modes,
        # then aggregate those with the outer modes
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()