def run_tsne_lsa_ngram():
    """Generate t-SNE features on top of LSA of word/char n-grams.

    Standalone features are computed per observation field; pairwise
    features are computed between the query fields and product fields.
    Features are written to ``config.FEAT_DIR`` by the wrappers.
    """
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # product_attribute_list is not used by these text generators
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    # standalone features
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # NOTE: the full grid would be [[1,2,3], [2,3,4,5]]; it was overridden
    # to this restricted grid (removed a dead, immediately-overwritten
    # assignment that carried the full grid).
    ngrams_list = [[1, 2, 3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected",
                  "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    # pairwise features between query fields and product text fields
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = [
        ["search_term", "search_term_alt", "search_term_auto_corrected"],
    ]
    target_fields_list = [
        ["product_title", "product_description"],
    ]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
def run_tsne_lsa_ngram():
    """Generate t-SNE features on top of LSA of word/char n-grams.

    Standalone features per question field, then pairwise features
    between question1 and question2.
    """
    logname = "generate_feature_tsne_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    # standalone features
    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # NOTE: the full grid would be [[1,2,3], [2,3,4,5]]; restricted to one
    # n-gram size per generator (removed a dead, immediately-overwritten
    # assignment that carried the full grid).
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    # pairwise features between the two questions
    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = [["question1"]]
    target_fields_list = [["question2"]]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields,
                                            param_list, config.FEAT_DIR, logger,
                                            force_corr=True)
                pf.go()
def main():
    """Generate identity (pass-through) features from ES document data.

    First copies plain document fields under several numeric transforms,
    then does the same for sub-fields of the term-vector data.
    """
    logger = logging_utils._get_logger(config.LOG_DIR, "generate_feature_ident.log")
    dfAll = table_utils._read(config.ALL_DATA)
    dedup = True
    # None presumably means "identity" (no transform) — confirm in Ident
    transforms = [None, np.log, np.log10, np.sqrt]

    # Copies of data from ES docs. Note that multi-valued fields are first
    # converted into their length.
    doc_fields = ["incoming_links", "popularity_score", "text_bytes",
                  "category", "template", "heading", "outgoing_link",
                  "external_link", "redirect.title", "auxiliary_text"]
    for transform in transforms:
        StandaloneFeatureWrapper(Ident, dfAll, doc_fields, [transform],
                                 config.FEAT_DIR, logger, dedup).go()

    # Sub-fields from termvec data: plain, query_ and norm_query_ variants,
    # in that order.
    termvec_fields = ["%s%s_termvec" % (prefix, field)
                      for prefix in ("", "query_", "norm_query_")
                      for field in config.ES_TERM_FIELDS]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for es_field in ["score", "term_freq", "ttf", "doc_freq"]:
        for transform in transforms:
            StandaloneFeatureWrapper(SubFieldIdent, dfAll, termvec_fields,
                                     [es_field, transform, aggregation_mode],
                                     config.FEAT_DIR, logger, dedup).go()
def main():
    """Generate group-relevance features.

    One feature set is produced per CV fold (fit on that fold's training
    rows only), plus one fit on the entire training set.
    """
    logname = "generate_feature_group_relevance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)

    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    # only product_title is used (== ["search_term", "product_title"][1:])
    obs_fields = ["search_term", "product_title"][1:]

    ## for cv: fit each fold's features on that fold's training indices
    for i in range(len(split)):
        trainInd = split[i][0]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i + 1)
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list,
                                 sub_feature_dir, logger).go()

    ## for all: fit on the whole training data
    sub_feature_dir = "%s/All" % config.FEAT_DIR
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list,
                             sub_feature_dir, logger).go()
def run_lsa_ngram():
    """Generate LSA (truncated SVD) features over word/char n-grams of the
    two question fields."""
    logname = "generate_feature_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    generators = [LSA_Word_Ngram, LSA_Char_Ngram]
    # NOTE: the full grid would be [[1,2,3], [2,3,4,5]]; restricted to one
    # n-gram size per generator (removed a dead, immediately-overwritten
    # assignment that carried the full grid).
    ngrams_list = [[3], [4]]
    obs_fields = ["question1", "question2"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger)
            sf.go()
def run_lsa_ngram():
    """Generate LSA (truncated SVD) features over word/char n-grams of the
    query and product text fields."""
    logname = "generate_feature_lsa_ngram_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    # product_attribute_list is not used by these text generators
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    generators = [LSA_Word_Ngram, LSA_Char_Ngram]
    # NOTE: the full grid would be [[1, 2, 3], [2, 3, 4, 5]]; restricted to
    # one n-gram size per generator (removed a dead, immediately-overwritten
    # assignment that carried the full grid).
    ngrams_list = [[3], [4]]
    # obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    obs_fields = ["search_term", "product_title", "product_description"]
    for generator, ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                          config.FEAT_DIR, logger)
            sf.go()
def main():
    """Generate query-quality features.

    Compares the search query across preprocessing stages (raw,
    lemmatized, product-name-extracted, optionally auto-corrected,
    stemmed) pairwise, then adds an is-in-Google-dict feature.
    """
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Collect one query corpus per preprocessing stage, in stage order.
    corpora = []
    suffixes = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    corpora.append(dfAll["search_term"].values)
    suffixes.append("raw")
    # after lemmatization
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    corpora.append(dfAll["search_term"].values)
    suffixes.append("lemmatized")
    # after extracting product_name in search_term
    corpora.append(dfAll["search_term_product_name"].values)
    suffixes.append("product_name")
    # after auto correction (column is optional)
    if "search_term_auto_corrected" in dfAll.columns:
        corpora.append(dfAll["search_term_auto_corrected"].values)
        suffixes.append("corrected")
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    corpora.append(dfAll["search_term"].values)
    suffixes.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    # one feature per unordered stage pair (i < j)
    for i in range(len(suffixes) - 1):
        for j in range(i + 1, len(suffixes)):
            ext = QueryQuality(corpora[i], corpora[j])
            x = ext.transform()
            dim = 1  # hard-coded: QueryQuality output is treated as 1D
            fname = "%s_%s_x_%s_%dD" % (ext._get_feat_name(),
                                        suffixes[i], suffixes[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR,
                                         fname + config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))

    # whether the raw query appears in the Google dictionary
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, ["search_term"], [],
                                  config.FEAT_DIR, logger)
    sf.go()
def main():
    """Generate query-quality features.

    Compares the search query across preprocessing stages (raw,
    lemmatized, product-name-extracted, optionally auto-corrected,
    stemmed) pairwise, then adds an is-in-Google-dict feature.
    """
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction (column is optional)
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    # one feature per unordered stage pair (i < j)
    for i in range(len(query_suffix) - 1):
        for j in range(i + 1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            # BUG FIX: `ext.__name__()` raised AttributeError — instances
            # have no __name__ attribute; use the class name instead.
            fname = "%s_%s_x_%s_%dD" % (ext.__class__.__name__,
                                        query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR,
                                         fname + config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))

    # whether the raw query appears in the Google dictionary
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields,
                                  param_list, config.FEAT_DIR, logger)
    sf.go()
def main():
    """Generate group-relevance aggregation features.

    Runs once per CV fold (fit on the fold's training rows only) and once
    on the full training set.
    """
    logname = "generate_feature_group_relevance_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)

    def _generate(df_fit, feat_dir):
        # only product_title is used (== ["search_term", "product_title"][1:])
        fields = ["search_term", "product_title"][1:]
        aggs = ["mean", "std", "max", "min", "median", "size"]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, fields,
                                      [dfAll["id"], df_fit, aggs],
                                      feat_dir, logger)
        sf.go()

    ## for cv
    for i in range(len(split)):
        trainInd = split[i][0]
        _generate(dfTrain.iloc[trainInd].copy(),
                  "%s/Run%d" % (config.FEAT_DIR, i + 1))

    ## for all
    _generate(dfTrain, "%s/All" % config.FEAT_DIR)
def main():
    """Generate basic document statistics and unique n-gram features for
    the two question fields."""
    logname = "generate_feature_basic_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    question_fields = ["question1", "question2"]

    ## basic per-document statistics
    for gen in [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]:
        StandaloneFeatureWrapper(gen, dfAll, question_fields, [],
                                 config.FEAT_DIR, logger).go()

    ## unique n-gram counts/ratios for n = 1..3
    for gen in [UniqueCount_Ngram, UniqueRatio_Ngram]:
        for n in [1, 2, 3]:
            StandaloneFeatureWrapper(gen, dfAll, question_fields, [n],
                                     config.FEAT_DIR, logger).go()
def main():
    """Generate basic per-document statistics for the HomeDepot fields."""
    logname = "generate_feature_basic_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    text_fields = ["search_term", "product_title", "product_description",
                   "product_attribute", "product_brand", "product_color"]

    ## basic statistics on all text fields
    for gen in [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]:
        StandaloneFeatureWrapper(gen, dfAll, text_fields, [],
                                 config.FEAT_DIR, logger).go()

    ## features keyed on product_uid
    for gen in [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2,
                ProductUidDummy3]:
        StandaloneFeatureWrapper(gen, dfAll, ["product_uid"], [],
                                 config.FEAT_DIR, logger).go()

    ## unique n-gram counts/ratios for n = 1..3
    for gen in [UniqueCount_Ngram, UniqueRatio_Ngram]:
        for n in [1, 2, 3]:
            StandaloneFeatureWrapper(gen, dfAll, text_fields, [n],
                                     config.FEAT_DIR, logger).go()

    ## features derived from product_attribute_list
    attr_gens = [
        AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount,
        AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth,
        AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor,
    ]
    for gen in attr_gens:
        StandaloneFeatureWrapper(gen, dfAll, ["product_attribute_list"], [],
                                 config.FEAT_DIR, logger).go()
def main():
    """Generate basic text-statistics features over query and page fields.

    Single-valued fields get the statistics directly; multi-valued fields
    get the same statistics aggregated via MultiObjEstimatorWrapper.
    """
    logname = "generate_feature_basic.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    scalar_fields = ["query", "norm_query", "hit_title", "opening_text"]
    multi_fields = ["category", "template", "heading", "outgoing_link",
                    "external_link", "redirect.title", "auxiliary_text"]
    aggregations = ["mean", "std", "max", "min", "median"]

    # basic statistics on single-valued text fields
    generators = [DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    for generator in generators:
        # DocFreq is the one generator run without dedup — presumably its
        # counts need the duplicate rows; confirm against DocFreq itself.
        # (idiom fix: was `False if generator == DocFreq else True`)
        dedup = generator != DocFreq
        sf = StandaloneFeatureWrapper(generator, dfAll, scalar_fields, [],
                                      config.FEAT_DIR, logger, dedup)
        sf.go()

    # same statistics aggregated over multi-valued fields
    for generator in generators:
        multi_gen = MultiObjEstimatorWrapper(generator)
        dedup = generator != DocFreq
        sf = StandaloneFeatureWrapper(multi_gen, dfAll, multi_fields,
                                      [aggregations],
                                      config.FEAT_DIR, logger, dedup)
        sf.go()

    # unique n-gram count/ratio on single-valued fields
    ngrams = [1, 2, 3]
    for generator in [UniqueCount_Ngram, UniqueRatio_Ngram]:
        for ngram in ngrams:
            sf = StandaloneFeatureWrapper(generator, dfAll, scalar_fields,
                                          [ngram], config.FEAT_DIR, logger,
                                          True)
            sf.go()

    # unique n-gram count/ratio aggregated over multi-valued fields
    for generator in [UniqueCount_Ngram, UniqueRatio_Ngram]:
        for ngram in ngrams:
            multi_gen = MultiObjEstimatorWrapper(generator)
            sf = StandaloneFeatureWrapper(multi_gen, dfAll, multi_fields,
                                          [ngram, aggregations],
                                          config.FEAT_DIR, logger, True)
            sf.go()
def main():
    """Generate basic count/ratio features over the HomeDepot text fields."""
    logger = logging_utils._get_logger(
        config.LOG_DIR,
        "generate_feature_basic_%s.log" % time_utils._timestamp())
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    def _run_all(generator_list, fields, param_lists):
        # run every generator with every parameter list on the given fields
        for generator in generator_list:
            for param_list in param_lists:
                sf = StandaloneFeatureWrapper(generator, dfAll, fields,
                                              param_list, config.FEAT_DIR,
                                              logger)
                sf.go()

    text_fields = [
        "search_term", "product_title", "product_description",
        "product_attribute", "product_brand", "product_color"
    ]

    ## basic document statistics
    _run_all([DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio],
             text_fields, [[]])

    ## for product_uid
    _run_all([DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2,
              ProductUidDummy3], ["product_uid"], [[]])

    ## unique n-gram counts and ratios, n = 1..3
    _run_all([UniqueCount_Ngram, UniqueRatio_Ngram], text_fields,
             [[1], [2], [3]])

    ## for product_attribute_list
    _run_all([
        AttrCount, AttrBulletCount, AttrBulletRatio, AttrNonBulletCount,
        AttrNonBulletRatio, AttrHasProductHeight, AttrHasProductWidth,
        AttrHasProductLength, AttrHasProductDepth, AttrHasIndoorOutdoor,
    ], ["product_attribute_list"], [[]])
def main():
    """Generate basic features for the Quora question-pair data."""
    logname = "generate_feature_basic_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    question_fields = ["question1", "question2"]

    ## basic per-document statistics (DocIdOneHot not used)
    for gen in [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]:
        StandaloneFeatureWrapper(gen, dfAll, question_fields, [],
                                 config.FEAT_DIR, logger).go()

    ## echo the row id
    StandaloneFeatureWrapper(DocIdEcho, dfAll, ["id"], [],
                             config.FEAT_DIR, logger).go()

    ## pairwise features on the qid columns
    for gen in [MaxValue, DiffValue]:
        PairwiseFeatureWrapper(gen, dfAll, ["qid1"], ["qid2"], [],
                               config.FEAT_DIR, logger).go()

    ## length ratio between the two questions
    PairwiseFeatureWrapper(DocLenRatio, dfAll, ["question1"], ["question2"],
                           [], config.FEAT_DIR, logger).go()

    ## unique n-gram counts/ratios; codes 12 and 123 presumably denote
    ## combined-gram settings — confirm in the n-gram utilities
    for gen in [UniqueCount_Ngram, UniqueRatio_Ngram]:
        for ngram in [1, 2, 3, 4, 5, 12, 123]:
            StandaloneFeatureWrapper(gen, dfAll, question_fields, [ngram],
                                     config.FEAT_DIR, logger).go()