Example no. 1
def process(lang, pivot):
	print "[%s]: process for language %s" % (time_utils._timestamp(), lang)
	linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
	templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
	articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
	mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot], index_col="template")
	template1 = []; template2 = []
	article1 = []; article2 = []; ontology = []
	for template in templateDict:
		articles = templateDict[template]
		for article in articles:
			if article in linkDict:
				tmp = linkDict[article]
				template1.append(template)
				article1.append(article)
				article2.append(tmp)
				if tmp in articleDict:
					templateList = articleDict[tmp]
				else:
					templateList = []
				c = ""
				t = ""
				for Template in templateList:
					if Template in mapping.index:
						c = mapping.at[Template, "ontology"]
						t = Template
				template2.append(t)
				ontology.append(c)

	data = {"template1":template1, "article1":article1, "template2":template2, \
			"article2":article2, "ontology":ontology}
	df = pd.DataFrame(data)
	df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
	print "[%s]: processing complete" % time_utils._timestamp()
Example no. 2
def main():
    print "[%s]: generate ontology hierarchy tree" % (time_utils._timestamp())
    G = g.Graph()
    G.parse(config.ONTOLOGY, format="n3")

    q = '''
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?child ?parent
WHERE {
	?child rdfs:subClassOf ?parent .
}'''

    results = G.query(q)
    ontologyDict = {}
    for row in results:
        child = str(row[0])
        parent = str(row[1])
        if parent in ontologyDict:
            ontologyDict[parent].append(child)
        else:
            ontologyDict[parent] = [
                child,
            ]
    pkl_utils._save(config.ONTOLOGY_TREE, ontologyDict)
    print "[%s]: generation complete" % time_utils._timestamp()
Example no. 3
def getILL(lang, target):
    print "[%s]: generate ILL dict from language %s to language %s" % (
        time_utils._timestamp(), lang, target)
    infile = open(config.ILL[lang])
    prefix1 = config.LANG_PREFIX[lang]
    prefix2 = config.LANG_PREFIX[target]
    len1 = len(prefix1)
    len2 = len(prefix2)
    linkDict = {}
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        lang1 = row[0][1:-1]
        lang2 = row[2][1:-1]
        if prefix1 not in lang1:
            continue
        if prefix2 not in lang2:
            continue
        lang1 = lang1[len1:]
        lang2 = lang2[len2:]
        linkDict[lang1] = lang2
    print "%d links in total" % len(linkDict)
    pkl_utils._save(config.ILL_DICT["%s2%s" % (lang, target)], linkDict)
    print "[%s]: generation complete" % time_utils._timestamp()
Example no. 4
def Article2Template(lang="en"):
	print "[%s]: generate article2template dict for language %s" % (time_utils._timestamp(), lang)
	infile = open(config.ARTICLE_TEMPLATES[lang])
	prefix = config.LANG_PREFIX[lang]
	len_prefix = len(prefix)
	articleDict = {}
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		article = row[0][1:-1]
		template = row[2][1:-1]
		article = article[len_prefix:]
		template = template[len_prefix:]

		if "/" in template:
			continue

		if article in articleDict:
			articleDict[article].append(template)
		else:
			articleDict[article] = [template, ]
	print "%d articles in total" % len(articleDict)
	pkl_utils._save(config.ARTICLE2TEMPLATE[lang], articleDict)
	print "[%s]: generation complete" % time_utils._timestamp()
Example no. 5
def Article2Template(lang="en"):
    print "[%s]: generate article2template dict for language %s" % (
        time_utils._timestamp(), lang)
    infile = open(config.ARTICLE_TEMPLATES[lang])
    prefix = config.LANG_PREFIX[lang]
    len_prefix = len(prefix)
    articleDict = {}
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        article = row[0][1:-1]
        template = row[2][1:-1]
        article = article[len_prefix:]
        template = template[len_prefix:]

        if "/" in template:
            continue

        if article in articleDict:
            articleDict[article].append(template)
        else:
            articleDict[article] = [
                template,
            ]
    print "%d articles in total" % len(articleDict)
    pkl_utils._save(config.ARTICLE2TEMPLATE[lang], articleDict)
    print "[%s]: generation complete" % time_utils._timestamp()
Example no. 6
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[3], [4]]  # reduced grid; the full grid was [[1,2,3], [2,3,4,5]]
    obs_fields = ["question1", "question2"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["question1"] )
    target_fields_list.append( ["question2"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectPosition_Ngram, 
        IntersectNormPosition_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example no. 8
def run_position():
    logname = "generate_feature_first_last_ngram_position_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectPosition_Ngram,
        LastIntersectPosition_Ngram,
        FirstIntersectNormPosition_Ngram,
        LastIntersectNormPosition_Ngram,
    ]

    obs_fields_list = [["question1"], ["question2"]]
    target_fields_list = [["question2"], ["question1"]]
    ngrams = [1, 2, 3, 12, 123]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
Example no. 9
def run_lsa_ngram_pair():
    """Symmetric in obs and target"""
    logname = "generate_feature_lsa_ngram_pair_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [LSA_Word_Ngram_Pair]
    ngrams = [1, 2, 3]
    obs_fields_list = []
    target_fields_list = []
    ## question1 in question2
    obs_fields_list.append(['question1'])
    target_fields_list.append(['question2'])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator,
                                            dfAll,
                                            obs_fields,
                                            target_fields,
                                            param_list,
                                            config.FEAT_DIR,
                                            logger,
                                            force_corr=True)
                pf.go()
                del pf
                gc.collect()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append([
        "search_term", "search_term_product_name", "search_term_alt",
        "search_term_auto_corrected"
    ][:2])
    target_fields_list.append([
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color"
    ])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append([
        "search_term", "search_term_product_name", "search_term_alt",
        "search_term_auto_corrected"
    ][:2])
    target_fields_list.append([
        "product_title", "product_title_product_name", "product_description",
        "product_attribute", "product_brand", "product_color"
    ])
    # assumed defaults: the snippet used these names without defining them,
    # mirroring the other feature scripts in this collection
    ngrams = [1, 2, 3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields,
                                    target_fields, param_list, config.FEAT_DIR,
                                    logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll,
                                        obs_fields, target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
Example no. 12
def main():
    logname = "generate_feature_doc2vec_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append(
        config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model" %
        (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
    model_prefixes.append("Homedepot")
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs,
                                               model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=True)
            if ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(
                    doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir +
                                                           ".sent_label")
        except Exception:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim,
            Doc2Vec_RMSE,
            # Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append(["search_term", "search_term_alt"][:1])
        target_fields_list.append([
            "product_title", "product_description", "product_attribute",
            "product_brand", "product_color"
        ])
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [
                    doc2vec_model, doc2vec_model_sent_label, model_prefix
                ]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
    def __init__(self, model_folder, model_list, subm_prefix, 
                weight_opt_max_evals=10, w_min=-1., w_max=1., 
                inst_subsample=0.5, inst_subsample_replacement=False, 
                inst_splitter=None,
                model_subsample=1.0, model_subsample_replacement=True,
                bagging_size=10, init_top_k=5, epsilon=0.00001, 
                multiprocessing=False, multiprocessing_num_cores=1,
                enable_extreme=True, random_seed=0):

        self.model_folder = model_folder
        self.model_list = model_list
        self.subm_prefix = subm_prefix
        self.weight_opt_max_evals = weight_opt_max_evals
        self.w_min = w_min
        self.w_max = w_max
        assert inst_subsample > 0 and inst_subsample <= 1.
        self.inst_subsample = inst_subsample
        self.inst_subsample_replacement = inst_subsample_replacement
        self.inst_splitter = inst_splitter
        assert model_subsample > 0
        assert (type(model_subsample) == int) or (model_subsample <= 1.)
        self.model_subsample = model_subsample
        self.model_subsample_replacement = model_subsample_replacement
        self.bagging_size = bagging_size
        self.init_top_k = init_top_k
        self.epsilon = epsilon
        self.multiprocessing = multiprocessing
        self.multiprocessing_num_cores = multiprocessing_num_cores
        self.enable_extreme = enable_extreme
        self.random_seed = random_seed
        logname = "ensemble_selection_%s.log"%time_utils._timestamp()
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        self.n_models = len(self.model_list)
Example no. 14
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name, 
        options.feature_name, logger, options.max_evals, verbose=True, refit_once=options.refit_once)
    optimizer.run()
def __init__(self,
             feature_list,
             feature_name,
             feature_suffix=".csv",
             feature_level=2,
             meta_feature_dict=None,
             corr_threshold=0):
    self.feature_name = feature_name
    self.feature_list = feature_list
    self.feature_suffix = feature_suffix
    self.feature_level = feature_level
    # for meta features; None default avoids sharing one mutable dict across calls
    self.meta_feature_dict = meta_feature_dict if meta_feature_dict is not None else {}
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.feature_names = []
    self.has_basic = 1 if self.meta_feature_dict else 0
    logname = "feature_combiner_%s_%s.log" % (feature_name,
                                              time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    if self.feature_level == 2:
        self.splitter = splitter_level2
    elif self.feature_level == 3:
        self.splitter = splitter_level3
    self.n_iter = n_iter  # n_iter is a module-level global in the source project
    self.splitter_prev = [0] * self.n_iter
Example no. 16
def main():
    logname = "generate_feature_match_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        MatchQueryCount,
        MatchQueryRatio,
        LongestMatchSize,
        LongestMatchRatio,
    ]
    obs_fields_list = []
    target_fields_list = []
    ## question1 in question2
    obs_fields_list.append(['question1'])
    target_fields_list.append(['question2'])
    ## question2 in question1
    obs_fields_list.append(['question2'])
    target_fields_list.append(['question1'])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram, 
        LastIntersectCount_Ngram, 
        FirstIntersectRatio_Ngram, 
        LastIntersectRatio_Ngram, 
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example no. 18
def run_tfidf_ngram_cosinesim():
    """Symmetric in obs and target"""
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    ngrams_list = [[2, 3], [4]]
    obs_fields_list = [['question1']]
    target_fields_list = [['question2']]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator, ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator,
                                            dfAll,
                                            obs_fields,
                                            target_fields,
                                            param_list,
                                            config.FEAT_DIR,
                                            logger,
                                            force_corr=True)
                pf.go()
                del pf
                gc.collect()
Example no. 19
def main():

    # source domain
    print("load svhn")
    svhn_images_train, _ = DataLoader.load_svhn(SVHN_DIR, "train_32x32.mat")
    svhn_images_test, svhn_labels_test = DataLoader.load_svhn(SVHN_DIR, "test_32x32.mat")
    svhn_images_extra, svhn_labels_extra = DataLoader.load_svhn(SVHN_DIR, "extra_32x32.mat")

    auxiliary_data = {
        "X_train": svhn_images_extra,
        "y_train": svhn_labels_extra,
        "X_test": svhn_images_test,
        "y_test": svhn_labels_test,
    }

    # target domain
    print("load mnist")
    if not os.path.isfile(os.path.join(MNIST_DIR, "train.pkl")):
        DataLoader.prepare_mnist(MNIST_DIR, "train")
    mnist_images_train, _ = DataLoader.load_mnist(MNIST_DIR, "train")

    # dtn model
    print("init dtn")
    os_utils._makedirs(params["summary_dir"], force=True)
    os_utils._makedirs(params["log_dir"])
    logger = log_utils._get_logger(params["log_dir"], "tf-%s.log" % time_utils._timestamp())
    model = DomainTransferNet(params, logger)

    print("fit dtn")
    model.fit(auxiliary_data, Xs_train=svhn_images_train, Xt_train=mnist_images_train)

    print("evaluate dtn")
    model.evaluate(Xs=svhn_images_train, sample_batch=100, batch_size=100, sample_dir=SAMPLE_DIR)
Example no. 20
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram,
        LastIntersectCount_Ngram,
        FirstIntersectRatio_Ngram,
        LastIntersectRatio_Ngram,
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    ## document in query
    obs_fields_list.append(["question2"])
    target_fields_list.append(["question1"])
    ngrams = [1, 2, 3, 12, 123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
Example no. 21
def main():
    logname = "generate_feature_wordnet_similarity_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission
    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ][:1]
    obs_fields_list = []
    target_fields_list = []
    # only question1 and question2 are used
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = [aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                        target_fields, param_list,
                                        config.FEAT_DIR, logger)
            pf.go()
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [4]]  # reduced char-ngram grid; the full grid was [[1,2,3], [2,3,4,5]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectPosition_Ngram, 
        IntersectNormPosition_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["question1"] )
    target_fields_list.append( ["question2"] )
    ## document in query
    obs_fields_list.append( ["question2"] )
    target_fields_list.append( ["question1"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example no. 24
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append(
        ["search_term", "search_term_alt", "search_term_auto_corrected"][:1])
    target_fields_list.append(["product_title", "product_description"][:1])
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [
                        obs_ngram, target_ngram, config.SVD_DIM,
                        config.SVD_N_ITER
                    ]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
def main():
    logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
    sf.go()
def main():
    logname = "generate_feature_group_relevance_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields,
                                      param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields,
                                  param_list, sub_feature_dir, logger)
    sf.go()
Example no. 27
def parse_args(parser):
    parser.add_option("-d", "--dim", default=1, type=int, dest="lsa_columns",
        help="lsa_columns")
    parser.add_option("-o", "--outfile", default="feature_conf_%s.py"%time_utils._timestamp(),
        type="string", dest="outfile", help="outfile")

    (options, args) = parser.parse_args()
    return options, args
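
A self-contained usage sketch of the optparse pattern above; the fixed default stands in for the timestamped name, and all values here are illustrative:

from optparse import OptionParser

def demo_parse_args(parser):
    parser.add_option("-d", "--dim", default=1, type="int",
        dest="lsa_columns", help="number of LSA columns")
    parser.add_option("-o", "--outfile", default="feature_conf.py",
        type="string", dest="outfile", help="output file")
    return parser.parse_args()

options, args = demo_parse_args(OptionParser())
print(options.lsa_columns, options.outfile)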
Example no. 29
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name,
                              options.feature_name, logger, options.max_evals, verbose=True,
                              refit_once=options.refit_once, plot_importance=options.plot_importance)
    optimizer.run()
Example no. 30
def main():
    logname = "generate_feature_intersect_count_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # Ngram
    generators = [
        IntersectCount_Ngram,
        IntersectRatio_Ngram,
    ]
    obs_fields_list = [['question1'], ['question2']]
    target_fields_list = [['question2'], ['question1']]
    ngrams = [1, 2, 3, 4, 5, 12, 123]  # only these values are available, see ngram_utils.py
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()

    # Ngram symmetric
    generators = [
        CooccurrenceCount_Ngram,
        CooccurrenceRatio_Ngram,
        #CooccurrenceCount_Nterm,    # not used in Quora project, takes long to run
        #CooccurrenceRatio_Nterm,
    ]
    obs_fields_list = [['question1']]
    target_fields_list = [['question2']]
    ngrams = [1, 2, 3, 4, 5, 12, 123]  # only these values are available, see ngram_utils.py
    nterms = [2, 3, 4]  # only 1,2,3,4 available (uniterms are the same as unigrams), see ngram_utils.py
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            if generator.__name__[-5:] == 'Ngram':
                for ngram in ngrams:
                    param_list = [ngram]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
            elif generator.__name__[-5:] == 'Nterm':
                for nterm in nterms:
                    param_list = [nterm]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                                target_fields, param_list,
                                                config.FEAT_DIR, logger)
                    pf.go()
            else:
                print("Wrong Generator")
                pass
def parse_args(parser):
    parser.add_option("-l", "--level", default=2, 
        type="int", dest="level", help="level")
    parser.add_option("-t", "--top", default=10, 
        type="int", dest="topN", help="top-N")
    parser.add_option("-o", "--outfile", 
        default="stacking_feature_conf_%s.py"%time_utils._timestamp(),
        type="string", dest="outfile", help="outfile")
    (options, args) = parser.parse_args()
    return options, args
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    for w in which.split(","):
        if w == "tf":
            generators.append( StatCoocTF_Ngram )
        elif w == "norm_tf":
            generators.append( StatCoocNormTF_Ngram )
        elif w == "tfidf":
            generators.append( StatCoocTFIDF_Ngram )
        elif w == "norm_tfidf":
            generators.append( StatCoocNormTFIDF_Ngram )
        elif w == "bm25":
            generators.append( StatCoocBM25_Ngram )


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference 
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example no. 33
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s_%s.log"%(which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    if which == "tf":
        generators.append( StatCoocTF_Ngram )
    elif which == "norm_tf":
        generators.append( StatCoocNormTF_Ngram )
    elif which == "tfidf":
        generators.append( StatCoocTFIDF_Ngram )
    elif which == "norm_tfidf":
        generators.append( StatCoocNormTFIDF_Ngram )
    elif which == "bm25":
        generators.append( StatCoocBM25_Ngram )


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference 
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Example no. 34
def __init__(self, feature_dict, feature_name, feature_suffix=".pkl", corr_threshold=0):
    self.feature_name = feature_name
    self.feature_dict = feature_dict
    self.feature_suffix = feature_suffix
    self.corr_threshold = corr_threshold
    self.feature_names_basic = []
    self.feature_names_cv = []
    self.basic_only = 0
    logname = "feature_combiner_%s_%s.log" % (feature_name, time_utils._timestamp())
    self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
    self.splitter = splitter_level1
    self.n_iter = n_iter  # n_iter is a module-level global in the source project
def main():
    logname = "generate_feature_group_distance_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    n_iter = len(split)

    relevances_complete = [
        1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3
    ]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [
                        dfAll["id"], dfTrain2, target_field, relevance, ngram,
                        aggregation_mode
                    ]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard,
                                                dfAll, obs_fields,
                                                [target_field], param_list,
                                                sub_feature_dir, logger)
                    pf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [
                    dfAll["id"], dfTrain, target_field, relevance, ngram,
                    aggregation_mode
                ]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard,
                                            dfAll, obs_fields, [target_field],
                                            param_list, sub_feature_dir,
                                            logger)
                pf.go()
Example no. 36
def main(options):

    os_utils._makedirs("../logs")
    os_utils._makedirs("../output")
    os_utils._makedirs(params["offline_model_dir"])
    os_utils._makedirs(params["pb_model_dir"])
    logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp())

    params["granularity"] = options.granularity

    # save path
    model_name = "augmentation_%s_%s_%s"%(str(options.augmentation), options.granularity, options.model)
    path = config.SUB_DIR + "/" + model_name
    os_utils._makedirs(path)

    # load data
    X_dev, X_valid, Q, X_itest = get_train_valid_test_data(options.augmentation)
    # validation
    model = get_model(options.model)(params, logger, init_embedding_matrix=init_embedding_matrix)
    if os.path.exists(params["offline_model_dir"] + "/checkpoint"):
        print('restoring model.......')
        model.restore_session()
    train_model = True
    if train_model:    
        print('training model...')
        model.fit(X_dev, Q, validation_data=X_valid, shuffle=True)
        print('ready to save model....')
        model.save_session()
        print('model save done!')

    y_pred_itest = model.predict_proba(X_itest, Q).flatten()
    #print('build saving.....')
    if not os.path.exists(params["pb_model_dir"]+'/1'):
        build_model.build_save(model,str(1),params["pb_model_dir"])
    #acu
    assert len(y_pred_itest) == len(X_itest["label"])
    print(len(y_pred_itest))
    print(len(X_itest["label"]))
    count = 0
    for i in range(len(y_pred_itest)):
        score = y_pred_itest[i]
        if score > 0.5:
            prob = 1
        else:
            prob = 0
        if prob == X_itest["label"][i]:
            count += 1
    print(count/len(y_pred_itest))
    # save for stacking
    df = pd.DataFrame({"y_pred": y_pred_itest, "y_true": X_itest["label"]})
    df.to_csv(path + "/valid.csv", index=False, header=True)
    print('save done!')
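
The thresholded-accuracy loop above can also be written in vectorized form; a sketch assuming numpy arrays, with sample values not taken from the original project:

import numpy as np

y_pred = np.array([0.9, 0.2, 0.7])
y_true = np.array([1, 0, 0])
acc = ((y_pred > 0.5).astype(int) == y_true).mean()
print(acc)  # 2 of 3 correct -> 0.666...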
Example no. 37
def getExistingMapping(lang="en"):
	print "[%s]: parse existing mapping for language %s" % (time_utils._timestamp(), lang)
	G = g.Graph()
	G.parse(config.EXISTING_MAPPING[lang], format="n3")

	q = '''
PREFIX rr: <http://www.w3.org/ns/r2rml#>

SELECT ?template ?class
WHERE {
	?template rr:subjectMap ?mapping .
	?mapping rr:class ?class .
}
'''
	results = G.query(q)
	mapping = [row[0] for row in results]
	ontology = [row[1] for row in results]
	df = pd.DataFrame({'mapping':mapping, 'ontology':ontology})

	df["template"] = df["mapping"].apply(lambda x: config.TEMPLATE_NAME[lang] + x[47:])
	df.to_csv(config.EXISTING_MAPPING_OUTPUT[lang], index=False)
	print "[%s]: parsing complete" % time_utils._timestamp()
Example no. 38
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
Example no. 39
def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["search_term", "product_title", "product_description", 
                "product_attribute", "product_brand", "product_color"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## for product_uid
    generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["search_term", "product_title", "product_description", 
    "product_attribute", "product_brand", "product_color"]
    ngrams = [1,2,3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()

    ## for product_attribute_list
    generators = [
        AttrCount, 
        AttrBulletCount, 
        AttrBulletRatio, 
        AttrNonBulletCount, 
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
Example no. 40
def process(lang, pivot):
    print "[%s]: process for language %s" % (time_utils._timestamp(), lang)
    linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
    templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
    articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
    mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot],
                          index_col="template")
    template1 = []
    template2 = []
    article1 = []
    article2 = []
    ontology = []
    for template in templateDict:
        articles = templateDict[template]
        for article in articles:
            if article in linkDict:
                tmp = linkDict[article]
                template1.append(template)
                article1.append(article)
                article2.append(tmp)
                if tmp in articleDict:
                    templateList = articleDict[tmp]
                else:
                    templateList = []
                c = ""
                t = ""
                for Template in templateList:
                    if Template in mapping.index:
                        c = mapping.at[Template, "ontology"]
                        t = Template
                template2.append(t)
                ontology.append(c)

    data = {"template1":template1, "article1":article1, "template2":template2, \
      "article2":article2, "ontology":ontology}
    df = pd.DataFrame(data)
    df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
    print "[%s]: processing complete" % time_utils._timestamp()
Example no. 41
def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]   #DocIdOneHot not used
    obs_fields = ["question1", "question2"] 
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## id
    generators = [DocIdEcho]
    obs_fields = ["id"] 
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## qid
    generators = [MaxValue, DiffValue]
    obs_fields_list = [['qid1']]
    target_fields_list = [['qid2']]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()

    ## DocLenRatio
    generators = [DocLenRatio]
    obs_fields_list = [['question1']]
    target_fields_list = [['question2']]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
                
    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["question1", "question2"]
    ngrams = [1, 2, 3, 4, 5, 12, 123]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()
Example no. 42
def main(conf, learner_name, exp_name):
    task_mode = exp_name
    feature_name = conf.name
    max_evals = 10
    refit_once = True
    logname = "%s_[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        exp_name, feature_name, learner_name, time_utils._timestamp())


    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(task_mode, learner_name,
                              conf, logger, max_evals, verbose=True,
                              refit_once=refit_once, plot_importance=False)
    optimizer.run()
Example no. 43
def run_lsa_ngram():
    logname = "generate_feature_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [LSA_Word_Ngram, LSA_Char_Ngram]
    ngrams_list = [[3], [4]]  # reduced grid; the full grid was [[1,2,3], [2,3,4,5]]
    obs_fields = ["question1", "question2"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()
Example no. 44
def getILL(lang, target):
	print "[%s]: generate ILL dict from language %s to language %s" % (time_utils._timestamp(), lang, target)
	infile = open(config.ILL[lang])
	prefix1 = config.LANG_PREFIX[lang]
	prefix2 = config.LANG_PREFIX[target]
	len1 = len(prefix1)
	len2 = len(prefix2)
	linkDict = {}
	for line in infile.readlines():
		if line[0] != "<":
			continue
		row = line.split()
		lang1 = row[0][1:-1]
		lang2 = row[2][1:-1]
		if prefix1 not in lang1:
			continue
		if prefix2 not in lang2:
			continue
		lang1 = lang1[len1:]
		lang2 = lang2[len2:]
		linkDict[lang1] = lang2
	print "%d links in total" % len(linkDict)
	pkl_utils._save(config.ILL_DICT["%s2%s" % (lang, target)], linkDict)
	print "[%s]: generation complete" % time_utils._timestamp()
Example no. 45
def parse_args(parser):
    parser.add_option("-l", "--level", default=1, type="int", 
        dest="feature_level", help="feature level, e.g., 1, 2, 3")
    parser.add_option("-c", "--config", default="feature_conf", type="string", 
        dest="feature_conf", help="feature config name")
    parser.add_option("-n", "--name", default="basic%s"%time_utils._timestamp(), 
        type="string", dest="feature_name", help="feature name")
    parser.add_option("-s", "--suffix", default=".pkl", type="string", 
        dest="feature_suffix", help="feature suffix")
    parser.add_option("-m", "--meta_config", default="feature_conf_meta", 
        type="string", dest="feature_conf_meta", help="meta feature config name")
    parser.add_option("-t", "--threshold", default=0.0, type="float", 
        dest="corr_threshold", help="correlation threshold for dropping features")
    (options, args) = parser.parse_args()
    return options, args
Example no. 46
def getExistingMapping(lang="en"):
    print "[%s]: parse existing mapping for language %s" % (
        time_utils._timestamp(), lang)
    G = g.Graph()
    G.parse(config.EXISTING_MAPPING[lang], format="n3")

    q = '''
PREFIX rr: <http://www.w3.org/ns/r2rml#>

SELECT ?template ?class
WHERE {
	?template rr:subjectMap ?mapping .
	?mapping rr:class ?class .
}
'''
    results = G.query(q)
    mapping = [row[0] for row in results]
    ontology = [row[1] for row in results]
    df = pd.DataFrame({'mapping': mapping, 'ontology': ontology})

    df["template"] = df["mapping"].apply(
        lambda x: config.TEMPLATE_NAME[lang] + x[47:])
    df.to_csv(config.EXISTING_MAPPING_OUTPUT[lang], index=False)
    print "[%s]: parsing complete" % time_utils._timestamp()
Esempio n. 48
0
def select_feature(conf, learner_name, exp_name):
    task_mode = exp_name
    feature_name = conf.name
    max_evals = 20
    refit_once = True
    logname = "%s_[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        exp_name, feature_name, learner_name, time_utils._timestamp())


    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(task_mode, learner_name,
                              conf, logger, max_evals, verbose=True,
                              refit_once=refit_once, plot_importance=False)
    given_predictors =['0_ip','0_app','0_device','0_os','0_channel','0_day','0_hour','0_next_click','0_next_click_shift']
    optimizer.select_features(given_predictors)
Esempio n. 49
0
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
Esempio n. 50
0
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
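# Sketch of the set statistics presumably behind JaccardCoef_Ngram and
# DiceDistance_Ngram: compare the n-gram sets of two token lists (helper
# names hypothetical, not the project's implementation):
def _ngram_set(tokens, n):
    return set(tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))

def jaccard_coef(a, b, n):
    A, B = _ngram_set(a, n), _ngram_set(b, n)
    return len(A & B) / float(len(A | B)) if (A or B) else 0.

def dice_coef(a, b, n):
    A, B = _ngram_set(a, n), _ngram_set(b, n)
    return 2. * len(A & B) / float(len(A) + len(B)) if (A or B) else 0.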
Esempio n. 51
0
def main():
    logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
    model_prefixes.append( "Homedepot" )
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True)
            elif ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label")
        except:
            # skip models that cannot be loaded
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim, 
            Doc2Vec_RMSE, 
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt"] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
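# The pairwise doc2vec generators above presumably reduce to vector
# comparisons like these (a sketch, not the project's implementation):
import numpy as np

def cosine_sim(u, v):
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(np.dot(u, v) / denom) if denom else 0.

def rmse(u, v):
    return float(np.sqrt(np.mean((np.asarray(u) - np.asarray(v)) ** 2)))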
Esempio n. 52
0
def run_edit_distance():
    logname = "generate_feature_edit_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][1:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
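# For context, the plain EditDistance feature is presumably a normalized
# Levenshtein distance; a sketch using the python-Levenshtein package:
import Levenshtein

def edit_distance_ratio(a, b):
    return Levenshtein.distance(a, b) / float(max(len(a), len(b), 1))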
Esempio n. 53
0
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing    
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
Esempio n. 54
0
def run_tfidf_ngram_cosinesim():
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # the full n-gram setting, immediately overridden by the reduced one below
    # ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator,ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
Esempio n. 55
0
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
Esempio n. 56
0
 def __init__(self, feature_list, feature_name, feature_suffix=".csv",
             feature_level=2, meta_feature_dict={}, corr_threshold=0):
     self.feature_name = feature_name
     self.feature_list = feature_list
     self.feature_suffix = feature_suffix
     self.feature_level = feature_level
     # for meta features
     self.meta_feature_dict = meta_feature_dict
     self.corr_threshold = corr_threshold
     self.feature_names_basic = []
     self.feature_names_cv = []
     self.has_basic = 1 if self.meta_feature_dict else 0
     logname = "feature_combiner_%s_%s.log"%(feature_name, time_utils._timestamp())
     self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
     if self.feature_level == 2:
         self.splitter = splitter_level2
     elif self.feature_level == 3:
         self.splitter = splitter_level3
     self.n_iter = n_iter
     self.splitter_prev = [0]*self.n_iter
Esempio n. 57
0
def main():
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    # full grade list kept for reference; only the coarser subset below is used
    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go()
Esempio n. 58
0
def main():
    logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = [aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
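# A minimal NLTK example of the path similarity these generators aggregate
# over synset pairs (synsets chosen purely for illustration):
from nltk.corpus import wordnet as wn

dog = wn.synset("dog.n.01")
cat = wn.synset("cat.n.01")
print(dog.path_similarity(cat))  # 0.2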
Esempio n. 59
0
def main():
    logname = "generate_feature_match_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    
    generators = [
        MatchQueryCount, 
        MatchQueryRatio, 
        LongestMatchSize,
        LongestMatchRatio
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()

    # product_attribute_list
    generators = [
        MatchAttrCount, 
        MatchAttrRatio, 
        IsIndoorOutdoorMatch, 
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_attribute_list"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
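# Sketch of the assumed semantics of MatchQueryCount / MatchQueryRatio:
# occurrences of the whole query inside the target text; the normalization
# below is a guess, not the project's exact definition.
import re

def match_query_count(obs, target):
    return len(re.findall(re.escape(obs), target))

def match_query_ratio(obs, target):
    return match_query_count(obs, target) / float(len(target.split()) + 1)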
Esempio n. 60
0
#------------------- Process Attributes -------------------
def _split_attr_to_text(text):
    attrs = text.split(config.ATTR_SEPARATOR)
    return " ".join(attrs)

def _split_attr_to_list(text):
    attrs = text.split(config.ATTR_SEPARATOR)        
    if len(attrs) == 1:
        # missing
        return [[attrs[0], attrs[0]]]
    else:
        return [[n,v] for n,v in zip(attrs[::2], attrs[1::2])]
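# Illustrative behaviour of the two helpers above, assuming config.ATTR_SEPARATOR
# is " | " (the actual separator lives in the project's config):
#   _split_attr_to_text("color | red | material | wood")
#       -> "color red material wood"
#   _split_attr_to_list("color | red | material | wood")
#       -> [["color", "red"], ["material", "wood"]]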


#-------------------------- Main --------------------------
now = time_utils._timestamp()

def main():

    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...