def go(self):
    """Run the pairwise feature generator over every (obs, target) field pair.

    For each available pair of columns the extractor is built, transformed,
    pickled under a descriptive file name, and its correlation with the
    training labels is logged.
    """
    y_train = self.dfAll["relevance"].values[:TRAIN_SIZE]
    for obs_field in self.obs_fields:
        if obs_field not in self.dfAll.columns:
            self.logger.info("Skip %s" % obs_field)
            continue
        obs_corpus = self.dfAll[obs_field].values
        for target_field in self.target_fields:
            if target_field not in self.dfAll.columns:
                self.logger.info("Skip %s" % target_field)
                continue
            target_corpus = self.dfAll[target_field].values
            extractor = self.generator(obs_corpus, target_corpus, *self.param_list)
            x = extractor.transform()
            feat_names = extractor.__name__()
            if isinstance(feat_names, list):
                # A list of names means one 1-D feature per column of x.
                for col, feat_name in enumerate(feat_names):
                    dim = 1
                    fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
                    out_path = os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX)
                    pkl_utils._save(out_path, x[:, col])
                    corr = np_utils._corr(x[:TRAIN_SIZE, col], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
            else:
                dim = np_utils._dim(x)
                fname = "%s_%s_x_%s_%dD" % (feat_names, obs_field, target_field, dim)
                out_path = os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX)
                pkl_utils._save(out_path, x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                elif self.force_corr:
                    # Per-column correlations are costly; only when forced.
                    for col in range(dim):
                        corr = np_utils._corr(x[:TRAIN_SIZE, col], y_train)
                        self.logger.info("%s (%d/%dD): corr = %.6f" % (fname, col + 1, dim, corr))
def Article2Template(lang="en"):
    """Build and pickle a {article: [template, ...]} dict for one language.

    Reads the N-Triples-style article/template dump, strips the language
    prefix from both URIs, skips sub-templates (names containing "/"),
    and saves the mapping with pkl_utils.

    Fixes: converted Python 2 ``print`` statements to ``print()`` calls
    (consistent with the rest of the file) and closed the input file via
    a context manager.
    """
    print("[%s]: generate article2template dict for language %s" % (time_utils._timestamp(), lang))
    prefix = config.LANG_PREFIX[lang]
    len_prefix = len(prefix)
    articleDict = {}
    with open(config.ARTICLE_TEMPLATES[lang]) as infile:
        for line in infile:
            # Only triple lines start with '<'; skip comments/headers.
            if line[0] != "<":
                continue
            row = line.split()
            article = row[0][1:-1][len_prefix:]
            template = row[2][1:-1][len_prefix:]
            if "/" in template:
                continue
            articleDict.setdefault(article, []).append(template)
    print("%d articles in total" % len(articleDict))
    pkl_utils._save(config.ARTICLE2TEMPLATE[lang], articleDict)
    print("[%s]: generation complete" % time_utils._timestamp())
def convert(self):
    """Drop identifier/label columns from the CSV and pickle each remaining column."""
    df = pd.read_csv(self.fname)
    # Only drop the columns that actually exist in this particular file.
    drop_candidates = ["id", "product_uid", "relevance", "search_term", "product_title"]
    present = [c for c in drop_candidates if c in df.columns]
    df.drop(present, axis=1, inplace=True)
    for column in df.columns:
        out_path = "%s/TuringTest_%s_%s.pkl" % (config.FEAT_DIR, self.name, column)
        pkl_utils._save(out_path, df[column].values)
def main():
    """Build the three nested levels of train/valid splits for stacking."""
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")

    # ----- level 1 -----
    splitter = HomedepotSplitter(dfTrain=dfTrain, dfTest=dfTest,
                                 n_iter=config.N_RUNS,
                                 random_state=config.RANDOM_SEED,
                                 verbose=True, plot=True,
                                 # tune these params to get a close distribution
                                 split_param=[0.5, 0.25, 0.5])
    splitter.split()
    splitter.save("%s/splits_level1.pkl" % config.SPLIT_DIR)
    splits_level1 = splitter.splits

    # ----- level 2: re-split each level-1 validation fold -----
    splits_level1 = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
    splits_level2 = [0] * config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        sub = HomedepotSplitter(dfTrain=dfValid, dfTest=dfTest,
                                n_iter=1, random_state=run, verbose=True,
                                # tune these params to get a close distribution
                                split_param=[0.5, 0.15, 0.6])
        sub.split()
        splits_level2[run] = sub.splits[0]
    pkl_utils._save("%s/splits_level2.pkl" % config.SPLIT_DIR, splits_level2)

    # ----- level 3: re-split each level-2 validation fold -----
    splits_level2 = pkl_utils._load("%s/splits_level2.pkl" % config.SPLIT_DIR)
    splits_level3 = [0] * config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        sub = HomedepotSplitter(dfTrain=dfValid, dfTest=dfTest,
                                n_iter=1, random_state=run, verbose=True,
                                # tune these params to get a close distribution
                                split_param=[0.5, 0.15, 0.7])
        sub.split()
        splits_level3[run] = sub.splits[0]
    pkl_utils._save("%s/splits_level3.pkl" % config.SPLIT_DIR, splits_level3)
def main():
    """Convert a fixed set of TSNE feature CSV files into pickled arrays.

    Fixes: ``pd.read_csv(fname, index=False)`` passed an ``index`` keyword
    that ``read_csv`` does not accept (it belongs to ``to_csv``); modern
    pandas raises a TypeError on it. The keyword is simply dropped.
    """
    fnames = [
        "TSNE_LSA100_Word_Unigram_Pair_search_term_x_product_title_100D",
        "TSNE_LSA100_Word_Bigram_Pair_search_term_x_product_title_100D",
        "TSNE_LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D",
        "TSNE_LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D",
    ]
    fnames = [os.path.join(config.FEAT_DIR, fname + ".csv") for fname in fnames]
    for fname in fnames:
        df = pd.read_csv(fname)
        f = df.values
        # Replace the ".csv" suffix with ".pkl" for the output file.
        pkl_utils._save(fname[:-4] + ".pkl", f)
def save(self):
    """Pickle the combined feature matrices plus split metadata to FEAT_DIR/Combine."""
    payload = {
        "X_train_basic": self.X_train_basic,
        "y_train_cv": self.y_train_cv,
        "X_train_cv": self.X_train_cv,
        "X_test": self.X_test,
        "id_test": self.id_test,
        "splitter_prev": self.splitter_prev,
        "splitter": self.splitter,
        "n_iter": self.n_iter,
        "has_basic": self.has_basic,
    }
    out_path = os.path.join(config.FEAT_DIR + "/Combine",
                            self.feature_name + config.FEAT_FILE_SUFFIX)
    pkl_utils._save(out_path, payload)
    self.logger.info("Save to %s" % out_path)
def main():
    """Generate query-quality features comparing the query at each processing stage."""
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Collect the search_term column at successive processing stages.
    corpora = []
    suffixes = []
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    corpora.append(dfAll["search_term"].values)
    suffixes.append("raw")
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    corpora.append(dfAll["search_term"].values)
    suffixes.append("lemmatized")
    # after extracting product_name in search_term
    corpora.append(dfAll["search_term_product_name"].values)
    suffixes.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction (column only exists when correction ran)
        corpora.append(dfAll["search_term_auto_corrected"].values)
        suffixes.append("corrected")
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    corpora.append(dfAll["search_term"].values)
    suffixes.append("stemmed")
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    # Pairwise quality features between every two stages.
    n_stages = len(suffixes)
    for a in range(n_stages - 1):
        for b in range(a + 1, n_stages):
            ext = QueryQuality(corpora[a], corpora[b])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD" % (ext._get_feat_name(), suffixes[a], suffixes[b], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))

    # Standalone feature on the raw query.
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
def go(self):
    """Generate pairwise features for every usable (obs, target) column pair.

    Each extractor output is pickled under a descriptive name and its
    correlation with the ``is_duplicate`` training labels is logged.
    """
    y_train = self.dfAll["is_duplicate"].values[:TRAIN_SIZE]
    for obs_field in self.obs_fields:
        if obs_field not in self.dfAll.columns:
            self.logger.info("Skip %s" % obs_field)
            continue
        obs_corpus = self.dfAll[obs_field].values
        for target_field in self.target_fields:
            if target_field not in self.dfAll.columns:
                self.logger.info("Skip %s" % target_field)
                continue
            target_corpus = self.dfAll[target_field].values
            ext = self.generator(obs_corpus, target_corpus, *self.param_list)
            x = ext.transform()
            if isinstance(ext.__name__(), list):
                # Multiple names -> one 1-D feature per column of x.
                for idx, feat_name in enumerate(ext.__name__()):
                    dim = 1
                    fname = "%s_%s_x_%s_%dD" % (feat_name, obs_field, target_field, dim)
                    pkl_utils._save(
                        os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX),
                        x[:, idx])
                    corr = np_utils._corr(x[:TRAIN_SIZE, idx], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
            else:
                dim = np_utils._dim(x)
                fname = "%s_%s_x_%s_%dD" % (ext.__name__(), obs_field, target_field, dim)
                pkl_utils._save(
                    os.path.join(self.feat_dir, fname + config.FEAT_FILE_SUFFIX), x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    self.logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                elif self.force_corr:
                    # Only compute per-column correlation when explicitly requested.
                    for idx in range(dim):
                        corr = np_utils._corr(x[:TRAIN_SIZE, idx], y_train)
                        self.logger.info("%s (%d/%dD): corr = %.6f" % (fname, idx + 1, dim, corr))
def main():
    """Generate query-quality features (``__name__`` / ``_dim`` variant)."""
    logname = "generate_feature_query_quality_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw query text
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after lemmatization
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name from the query
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after spelling auto-correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    total = len(query_suffix)
    for i in range(total - 1):
        for j in range(i + 1, total):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            fname = "%s_%s_x_%s_%dD" % (ext.__name__(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))

    # Standalone feature on the raw query.
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
def main():
    """Create the three nested levels of Quora train/valid splits."""
    dfTrain = pd.read_csv(config.TRAIN_DATA)
    dfTest = pd.read_csv(config.TEST_DATA)

    # ----- level 1 -----
    level1 = QuoraSplitter(dfTrain=dfTrain, dfTest=dfTest,
                           n_iter=config.N_RUNS,
                           random_state=config.RANDOM_SEED,
                           verbose=True)
    level1.split()
    level1.save(config.SPLIT_DIR + "/splits_level1.pkl")

    # ----- level 2: re-split every level-1 validation fold -----
    splits_level1 = pkl_utils._load(config.SPLIT_DIR + "/splits_level1.pkl")
    splits_level2 = [0] * config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        sub = QuoraSplitter(dfTrain=dfValid, dfTest=dfTest,
                            n_iter=config.N_RUNS,
                            random_state=run,
                            verbose=True)
        sub.split()
        splits_level2[run] = sub.splits[-1]
    pkl_utils._save(config.SPLIT_DIR + "/splits_level2.pkl", splits_level2)

    # ----- level 3: re-split every level-2 validation fold -----
    splits_level2 = pkl_utils._load(config.SPLIT_DIR + "/splits_level2.pkl")
    splits_level3 = [0] * config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        sub = QuoraSplitter(dfTrain=dfValid, dfTest=dfTest,
                            n_iter=config.N_RUNS,
                            random_state=run,
                            verbose=True)
        sub.split()
        splits_level3[run] = sub.splits[-1]
    pkl_utils._save(config.SPLIT_DIR + "/splits_level3.pkl", splits_level3)
def main():
    """Query the ontology for subClassOf pairs and pickle a parent->children dict.

    Fixes: converted Python 2 ``print`` statements to ``print()`` calls for
    consistency with the rest of the file.
    """
    print("[%s]: generate ontology hierarchy tree" % (time_utils._timestamp()))
    G = g.Graph()
    G.parse(config.ONTOLOGY, format="n3")
    # SPARQL: every (child, parent) pair connected by rdfs:subClassOf.
    q = '''
        PREFIX rr: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?child ?parent
        WHERE {
            ?child rr:subClassOf ?parent .
        }'''
    results = G.query(q)
    ontologyDict = {}
    for row in results:
        child = str(row[0])
        parent = str(row[1])
        ontologyDict.setdefault(parent, []).append(child)
    pkl_utils._save(config.ONTOLOGY_TREE, ontologyDict)
    print("[%s]: generation complete" % time_utils._timestamp())
def getILL(lang, target):
    """Build and pickle the inter-language-link dict mapping `lang` URIs to `target` URIs.

    Fixes: converted Python 2 ``print`` statements to ``print()`` calls and
    closed the input file via a context manager.
    """
    print("[%s]: generate ILL dict from language %s to language %s" % (time_utils._timestamp(), lang, target))
    prefix1 = config.LANG_PREFIX[lang]
    prefix2 = config.LANG_PREFIX[target]
    len1 = len(prefix1)
    len2 = len(prefix2)
    linkDict = {}
    with open(config.ILL[lang]) as infile:
        for line in infile:
            # Only triple lines start with '<'.
            if line[0] != "<":
                continue
            row = line.split()
            lang1 = row[0][1:-1]
            lang2 = row[2][1:-1]
            # Skip links whose URIs are not in the expected namespaces.
            if prefix1 not in lang1:
                continue
            if prefix2 not in lang2:
                continue
            linkDict[lang1[len1:]] = lang2[len2:]
    print("%d links in total" % len(linkDict))
    pkl_utils._save(config.ILL_DICT["%s2%s" % (lang, target)], linkDict)
    print("[%s]: generation complete" % time_utils._timestamp())
def main():
    """Build TF-IDF + SVD text features for title and description and pickle them.

    BUG FIX: the original fit ``tfidf_title`` on *description* text (train
    concatenated with itself) and ``tfidf_description`` on *title* text (test
    concatenated with itself). Each vectorizer is now fit on its own column
    over train + test, consistent with how the SVD models are fit below.
    """
    FNAME = "feature_text"
    logname = "%s_%s.log" % (FNAME, now)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    stop_words = set(stopwords.words('russian'))
    train, test = dl.load_data()
    logger.info("Generating title & description text features ...")
    t0 = time()

    tfidf_title = TfidfVectorizer(stop_words=stop_words,
                                  max_features=config.MAX_TEXT_FEATURES)
    tfidf_description = TfidfVectorizer(stop_words=stop_words,
                                        max_features=config.MAX_TEXT_FEATURES)

    # TfidfVectorizer cannot handle NaN; use a single space as filler.
    train['description'] = train['description'].fillna(' ')
    test['description'] = test['description'].fillna(' ')
    train['title'] = train['title'].fillna(' ')
    test['title'] = test['title'].fillna(' ')

    # Fit each vectorizer on its own column over the full corpus (train + test).
    tfidf_title.fit(pd.concat([train['title'], test['title']]))
    tfidf_description.fit(pd.concat([train['description'], test['description']]))

    train_title_tfidf = tfidf_title.transform(train['title'])
    test_title_tfidf = tfidf_title.transform(test['title'])
    train_description_tfidf = tfidf_description.transform(train['description'])
    test_description_tfidf = tfidf_description.transform(test['description'])

    # Reduce the sparse TF-IDF matrices with truncated SVD (LSA).
    svd_title = TruncatedSVD(n_components=config.SVD_N_COMP, algorithm='arpack')
    svd_title.fit(tfidf_title.transform(pd.concat([train['title'], test['title']])))
    svd_description = TruncatedSVD(n_components=config.SVD_N_COMP, algorithm='arpack')
    svd_description.fit(
        tfidf_description.transform(
            pd.concat([train['description'], test['description']])))

    train_description_svd = pd.DataFrame(
        svd_description.transform(train_description_tfidf))
    test_description_svd = pd.DataFrame(
        svd_description.transform(test_description_tfidf))
    train_description_svd.columns = [
        'svd_description_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    test_description_svd.columns = [
        'svd_description_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    train_title_svd = pd.DataFrame(svd_title.transform(train_title_tfidf))
    test_title_svd = pd.DataFrame(svd_title.transform(test_title_tfidf))
    train_title_svd.columns = [
        'svd_title_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    test_title_svd.columns = [
        'svd_title_' + str(i + 1) for i in range(config.SVD_N_COMP)
    ]
    gc.collect()

    logger.info(FNAME + ' took: %s minutes' % round((time() - t0) / 60, 1))
    logger.info('Train SVD title shape: %s & Test SVD title shape: %s' %
                (train_title_svd.shape, test_title_svd.shape))
    logger.info(
        'Train SVD description shape: %s & Test SVD description shape: %s' %
        (train_description_svd.shape, test_description_svd.shape))

    # Save the concatenated SVD features for train and test.
    train_fname = os.path.join(config.DATA_FEATURES_DIR,
                               "train_" + FNAME + config.FEAT_FILE_SUFFIX)
    test_fname = os.path.join(config.DATA_FEATURES_DIR,
                              "test_" + FNAME + config.FEAT_FILE_SUFFIX)
    logger.info("Save to %s" % train_fname)
    pkl_utils._save(
        train_fname, pd.concat([train_title_svd, train_description_svd], axis=1))
    logger.info("Save to %s" % test_fname)
    pkl_utils._save(test_fname,
                    pd.concat([test_title_svd, test_description_svd], axis=1))
    gc.collect()
def main():
    """Aggregate precomputed distance features within document groups and pickle the stats.

    For every (group id, distance feature, obs field, target field) combination,
    loads the 1-D distance feature, computes mean/max/min within the group, and
    saves each resulting feature with its correlation against the labels.

    Fixes: the bare ``except:`` (which swallowed even KeyboardInterrupt/SystemExit)
    is narrowed to ``except Exception`` — missing feature files are still skipped.
    """
    logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"]

    # Distance feature families to aggregate; bigram/trigram variants are
    # intentionally disabled (kept as comments for easy re-enabling).
    match_list = [
        "MatchQueryCount",
        "MatchQueryRatio",
        "LongestMatchRatio",
    ]
    tfidf_list = [
        "StatCoocTF_Unigram_Mean", "StatCoocTF_Unigram_Max", "StatCoocTF_Unigram_Min",
        # "StatCoocNormTF_Unigram_Mean", "StatCoocNormTF_Unigram_Max", "StatCoocNormTF_Unigram_Min",
        "StatCoocTFIDF_Unigram_Mean", "StatCoocTFIDF_Unigram_Max", "StatCoocTFIDF_Unigram_Min",
        "StatCoocBM25_Unigram_Mean", "StatCoocBM25_Unigram_Max", "StatCoocBM25_Unigram_Min",
        # "StatCoocTF_Bigram_Mean", "StatCoocTF_Bigram_Max", "StatCoocTF_Bigram_Min",
        # "StatCoocNormTF_Bigram_Mean", "StatCoocNormTF_Bigram_Max", "StatCoocNormTF_Bigram_Min",
        # "StatCoocTFIDF_Bigram_Mean", "StatCoocTFIDF_Bigram_Max", "StatCoocTFIDF_Bigram_Min",
        # "StatCoocBM25_Bigram_Mean", "StatCoocBM25_Bigram_Max", "StatCoocBM25_Bigram_Min",
        # "StatCoocTF_Trigram_Mean", "StatCoocTF_Trigram_Max", "StatCoocTF_Trigram_Min",
        # "StatCoocNormTF_Trigram_Mean", "StatCoocNormTF_Trigram_Max", "StatCoocNormTF_Trigram_Min",
        # "StatCoocTFIDF_Trigram_Mean", "StatCoocTFIDF_Trigram_Max", "StatCoocTFIDF_Trigram_Min",
        # "StatCoocBM25_Trigram_Mean", "StatCoocBM25_Trigram_Max", "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [
        "IntersectCount_Unigram",
        "IntersectRatio_Unigram",
        # "IntersectCount_Bigram", "IntersectRatio_Bigram",
        # "IntersectCount_Trigram", "IntersectRatio_Trigram",
    ]
    first_last_ngram_list = [
        "FirstIntersectCount_Unigram",
        "FirstIntersectRatio_Unigram",
        "LastIntersectCount_Unigram",
        "LastIntersectRatio_Unigram",
        # "FirstIntersectCount_Bigram", "FirstIntersectRatio_Bigram",
        # "LastIntersectCount_Bigram", "LastIntersectRatio_Bigram",
        # "FirstIntersectCount_Trigram", "FirstIntersectRatio_Trigram",
        # "LastIntersectCount_Trigram", "LastIntersectRatio_Trigram",
    ]
    cooccurrence_ngram_count_list = [
        "CooccurrenceCount_Unigram",
        "CooccurrenceRatio_Unigram",
        # "CooccurrenceCount_Bigram", "CooccurrenceRatio_Bigram",
        # "CooccurrenceCount_Trigram", "CooccurrenceRatio_Trigram",
    ]
    ngram_jaccard_list = [
        "JaccardCoef_Unigram",
        # "JaccardCoef_Bigram", "JaccardCoef_Trigram",
        "DiceDistance_Unigram",
        # "DiceDistance_Bigram", "DiceDistance_Trigram",
    ]
    char_dist_sim_list = [
        "CharDistribution_CosineSim",
        "CharDistribution_KL",
    ]
    tfidf_word_ngram_cosinesim_list = [
        "TFIDF_Word_Unigram_CosineSim",
        # "TFIDF_Word_Bigram_CosineSim", "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
        # "TFIDF_Char_Bigram_CosineSim", "TFIDF_Char_Trigram_CosineSim",
        "TFIDF_Char_Fourgram_CosineSim",
        # "TFIDF_Char_Fivegram_CosineSim",
    ]
    lsa_word_ngram_cosinesim_list = [
        "LSA100_Word_Unigram_CosineSim",
        # "LSA100_Word_Bigram_CosineSim", "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
        # "LSA100_Char_Bigram_CosineSim", "LSA100_Char_Trigram_CosineSim",
        "LSA100_Char_Fourgram_CosineSim",
        # "LSA100_Char_Fivegram_CosineSim",
    ]
    doc2vec_list = [
        "Doc2Vec_Homedepot_D100_CosineSim",
    ]
    word2vec_list = [
        "Word2Vec_N_Similarity",
        "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]
    distance_generator_list = \
        match_list + \
        tfidf_list + \
        intersect_ngram_count_list + \
        first_last_ngram_list + \
        cooccurrence_ngram_count_list + \
        ngram_jaccard_list + \
        tfidf_word_ngram_cosinesim_list + \
        tfidf_char_ngram_cosinesim_list + \
        lsa_word_ngram_cosinesim_list + \
        lsa_char_ngram_cosinesim_list + \
        char_dist_sim_list + \
        word2vec_list + \
        doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term"])
    target_fields_list.append(["product_title", "product_title_product_name"])
    aggregation_mode = ["mean", "max", "min"]

    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s" % (distance_generator, obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name + "_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i, feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD" % (feat_name, dim)
                                    pkl_utils._save(os.path.join(config.FEAT_DIR, fname + config.FEAT_FILE_SUFFIX), x[:, i])
                                    corr = np_utils._corr(x[:TRAIN_SIZE, i], y_train)
                                    logger.info("%s (%dD): corr = %.6f" % (fname, dim, corr))
                        except Exception:
                            # Typically the distance feature file does not exist.
                            logger.info("Skip %s" % dist_name)
def main():
    """Aggregate distance features within document groups (formatted copy of the
    same routine) and pickle the resulting group statistics.

    Fixes: the bare ``except:`` — which also swallowed KeyboardInterrupt and
    SystemExit — is narrowed to ``except Exception``; missing feature files
    are still skipped with a log line.
    """
    logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    group_id_names = [
        "DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"
    ]

    # Distance feature families; bigram/trigram variants deliberately disabled.
    match_list = [
        "MatchQueryCount",
        "MatchQueryRatio",
        "LongestMatchRatio",
    ]
    tfidf_list = [
        "StatCoocTF_Unigram_Mean", "StatCoocTF_Unigram_Max", "StatCoocTF_Unigram_Min",
        # "StatCoocNormTF_Unigram_Mean", "StatCoocNormTF_Unigram_Max", "StatCoocNormTF_Unigram_Min",
        "StatCoocTFIDF_Unigram_Mean", "StatCoocTFIDF_Unigram_Max", "StatCoocTFIDF_Unigram_Min",
        "StatCoocBM25_Unigram_Mean", "StatCoocBM25_Unigram_Max", "StatCoocBM25_Unigram_Min",
        # "StatCoocTF_Bigram_Mean", "StatCoocTF_Bigram_Max", "StatCoocTF_Bigram_Min",
        # "StatCoocNormTF_Bigram_Mean", "StatCoocNormTF_Bigram_Max", "StatCoocNormTF_Bigram_Min",
        # "StatCoocTFIDF_Bigram_Mean", "StatCoocTFIDF_Bigram_Max", "StatCoocTFIDF_Bigram_Min",
        # "StatCoocBM25_Bigram_Mean", "StatCoocBM25_Bigram_Max", "StatCoocBM25_Bigram_Min",
        # "StatCoocTF_Trigram_Mean", "StatCoocTF_Trigram_Max", "StatCoocTF_Trigram_Min",
        # "StatCoocNormTF_Trigram_Mean", "StatCoocNormTF_Trigram_Max", "StatCoocNormTF_Trigram_Min",
        # "StatCoocTFIDF_Trigram_Mean", "StatCoocTFIDF_Trigram_Max", "StatCoocTFIDF_Trigram_Min",
        # "StatCoocBM25_Trigram_Mean", "StatCoocBM25_Trigram_Max", "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [
        "IntersectCount_Unigram",
        "IntersectRatio_Unigram",
        # "IntersectCount_Bigram", "IntersectRatio_Bigram",
        # "IntersectCount_Trigram", "IntersectRatio_Trigram",
    ]
    first_last_ngram_list = [
        "FirstIntersectCount_Unigram",
        "FirstIntersectRatio_Unigram",
        "LastIntersectCount_Unigram",
        "LastIntersectRatio_Unigram",
        # "FirstIntersectCount_Bigram", "FirstIntersectRatio_Bigram",
        # "LastIntersectCount_Bigram", "LastIntersectRatio_Bigram",
        # "FirstIntersectCount_Trigram", "FirstIntersectRatio_Trigram",
        # "LastIntersectCount_Trigram", "LastIntersectRatio_Trigram",
    ]
    cooccurrence_ngram_count_list = [
        "CooccurrenceCount_Unigram",
        "CooccurrenceRatio_Unigram",
        # "CooccurrenceCount_Bigram", "CooccurrenceRatio_Bigram",
        # "CooccurrenceCount_Trigram", "CooccurrenceRatio_Trigram",
    ]
    ngram_jaccard_list = [
        "JaccardCoef_Unigram",
        # "JaccardCoef_Bigram", "JaccardCoef_Trigram",
        "DiceDistance_Unigram",
        # "DiceDistance_Bigram", "DiceDistance_Trigram",
    ]
    char_dist_sim_list = [
        "CharDistribution_CosineSim",
        "CharDistribution_KL",
    ]
    tfidf_word_ngram_cosinesim_list = [
        "TFIDF_Word_Unigram_CosineSim",
        # "TFIDF_Word_Bigram_CosineSim", "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
        # "TFIDF_Char_Bigram_CosineSim", "TFIDF_Char_Trigram_CosineSim",
        "TFIDF_Char_Fourgram_CosineSim",
        # "TFIDF_Char_Fivegram_CosineSim",
    ]
    lsa_word_ngram_cosinesim_list = [
        "LSA100_Word_Unigram_CosineSim",
        # "LSA100_Word_Bigram_CosineSim", "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
        # "LSA100_Char_Bigram_CosineSim", "LSA100_Char_Trigram_CosineSim",
        "LSA100_Char_Fourgram_CosineSim",
        # "LSA100_Char_Fivegram_CosineSim",
    ]
    doc2vec_list = [
        "Doc2Vec_Homedepot_D100_CosineSim",
    ]
    word2vec_list = [
        "Word2Vec_N_Similarity",
        "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]
    distance_generator_list = \
        match_list + \
        tfidf_list + \
        intersect_ngram_count_list + \
        first_last_ngram_list + \
        cooccurrence_ngram_count_list + \
        ngram_jaccard_list + \
        tfidf_word_ngram_cosinesim_list + \
        tfidf_char_ngram_cosinesim_list + \
        lsa_word_ngram_cosinesim_list + \
        lsa_char_ngram_cosinesim_list + \
        char_dist_sim_list + \
        word2vec_list + \
        doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term"])
    target_fields_list.append(["product_title", "product_title_product_name"])
    aggregation_mode = ["mean", "max", "min"]

    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(
            os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list,
                                                 target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s" % (distance_generator,
                                                    obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(
                                os.path.join(config.FEAT_DIR,
                                             dist_name + "_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list,
                                                    dist_name, group_id_name,
                                                    aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i, feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD" % (feat_name, dim)
                                    pkl_utils._save(
                                        os.path.join(
                                            config.FEAT_DIR,
                                            fname + config.FEAT_FILE_SUFFIX),
                                        x[:, i])
                                    corr = np_utils._corr(
                                        x[:TRAIN_SIZE, i], y_train)
                                    logger.info("%s (%dD): corr = %.6f" %
                                                (fname, dim, corr))
                        except Exception:
                            # Usually the 1-D distance feature file is absent.
                            logger.info("Skip %s" % dist_name)
def parse(lang="en"):
    """Parse the DBpedia instance-type and object-triple dumps into a relational tensor.

    Builds one sparse NxN adjacency matrix per predicate (plus one for rdf:type),
    together with index dicts for entities, instances, predicates and types, and
    pickles everything under the config paths.

    Fixes: log-message typos ("instanes" -> "instances", "entites" -> "entities")
    and input files now closed via context managers.
    """
    _log.info("starting parsing")
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    type_entries = []
    entitySet = set()
    typeSet = set()
    with open(config.INSTANCE_TYPES[lang]) as infile:
        for line in infile:
            # Only triple lines start with '<'.
            if line[0] != "<":
                continue
            row = line.split()
            instance = row[0][1:-1]
            ontology = row[2][1:-1]
            type_entries.append((instance, ontology))
            entitySet.add(ontology)
            typeSet.add(ontology)
    typeDict = {y: x for x, y in enumerate(typeSet)}
    # At this point entitySet and typeSet hold the same ontology classes.
    cnt_type = len(typeSet)
    _log.info("%d types" % cnt_type)

    relationDict = {}
    instanceSet = set()
    with open(config.OBJECTS[lang]) as infile:
        for line in infile:
            if line[0] != "<":
                continue
            row = line.split()
            subject = row[0][1:-1]
            predicate = row[1][1:-1]
            target = row[2][1:-1]
            entitySet.add(subject)
            entitySet.add(target)
            instanceSet.add(subject)
            instanceSet.add(target)
            relationDict.setdefault(predicate, []).append((subject, target))
    instanceDict = {y: x for x, y in enumerate(instanceSet)}
    entityDict = {y: x for x, y in enumerate(entitySet)}
    cnt_ins = len(instanceSet)
    N = len(entitySet)
    _log.info("%d instances" % cnt_ins)
    _log.info("%d entities" % N)

    # One sparse adjacency matrix per predicate.
    tensor = []
    predicateDict = {}
    cnt = 0
    for predicate in relationDict:
        entries = relationDict[predicate]
        rows = [entityDict[entry[0]] for entry in entries]
        cols = [entityDict[entry[1]] for entry in entries]
        data = [1 for entry in entries]
        mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
        tensor.append(mat)
        predicateDict[predicate] = cnt
        cnt += 1

    # rdf:type slice: only type assertions whose subject also appears in the
    # object triples (i.e. is a known instance).
    type_entries = [entry for entry in type_entries if entry[0] in instanceSet]
    rows = [entityDict[entry[0]] for entry in type_entries]
    cols = [entityDict[entry[1]] for entry in type_entries]
    data = [1 for entry in type_entries]
    mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
    tensor.append(mat)
    predicateDict[rdf_type] = cnt
    _log.info("%d relations" % (cnt + 1))

    pkl_utils._save(config.TENSOR[lang], tensor)
    pkl_utils._save(config.ENTITY[lang], entityDict)
    pkl_utils._save(config.PREDICATE[lang], predicateDict)
    pkl_utils._save(config.INSTANCE[lang], instanceDict)
    pkl_utils._save(config.TYPE[lang], typeDict)
    pkl_utils._save(config.TYPE_MATRIX[lang], (rows, cols))
    _log.info("parsing complete")
def main():
    """Merge the raw Home Depot train/test/attribute/description files and pickle them.

    BUG FIX: ``dfBrand["product_brand"]`` was cast to str *after* merging into
    ``dfAll``, so the cast never affected the merged data; it is now performed
    before the merge, mirroring how ``dfColor`` is handled.
    """
    # load provided data
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")
    dfAttr = pd.read_csv(config.ATTR_DATA)
    dfDesc = pd.read_csv(config.DESC_DATA)
    #
    print("Train Mean: %.6f" % np.mean(dfTrain["relevance"]))
    print("Train Var: %.6f" % np.var(dfTrain["relevance"]))
    #
    dfTest["relevance"] = np.zeros((config.TEST_SIZE))
    dfAttr.dropna(how="all", inplace=True)
    dfAttr["value"] = dfAttr["value"].astype(str)

    # concat train and test
    dfAll = pd.concat((dfTrain, dfTest), ignore_index=True)
    del dfTrain
    del dfTest
    gc.collect()

    # merge product description
    dfAll = pd.merge(dfAll, dfDesc, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfDesc
    gc.collect()

    # merge product brand (cast to str BEFORE merging so dfAll gets the cast values)
    dfBrand = dfAttr[dfAttr.name == "MFG Brand Name"][[
        "product_uid", "value"
    ]].rename(columns={"value": "product_brand"})
    dfBrand["product_brand"] = dfBrand["product_brand"].values.astype(str)
    dfAll = pd.merge(dfAll, dfBrand, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfBrand
    gc.collect()

    # merge product color: several attribute names may carry color info
    color_columns = [
        "product_color", "Color Family", "Color/Finish", "Color/Finish Family"
    ]
    dfColor = dfAttr[dfAttr.name.isin(color_columns)][[
        "product_uid", "value"
    ]].rename(columns={"value": "product_color"})
    dfColor.dropna(how="all", inplace=True)
    _agg_color = lambda df: " ".join(list(set(df["product_color"])))
    dfColor = dfColor.groupby("product_uid").apply(_agg_color)
    dfColor = dfColor.reset_index(name="product_color")
    dfColor["product_color"] = dfColor["product_color"].values.astype(str)
    dfAll = pd.merge(dfAll, dfColor, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfColor
    gc.collect()

    # merge product attribute: concat all name/value pairs per product
    _agg_attr = lambda df: config.ATTR_SEPARATOR.join(df[
        "name"] + config.ATTR_SEPARATOR + df["value"])
    dfAttr = dfAttr.groupby("product_uid").apply(_agg_attr)
    dfAttr = dfAttr.reset_index(name="product_attribute_concat")
    dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfAttr
    gc.collect()

    # save data
    if config.TASK == "sample":
        dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy()
    pkl_utils._save(config.ALL_DATA_RAW, dfAll)

    # info
    dfInfo = dfAll[["id", "relevance"]].copy()
    pkl_utils._save(config.INFO_DATA, dfInfo)
def main():
    """Clean the Quora question columns with a processor chain, then stem, pickling both stages."""
    # ----- setup -----
    logname = "data_processor_%s.log" % now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    columns_to_proc = [
        "question1",
        "question2",
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # Cleaning pipeline; order matters (see LowerUpperCaseSplitter /
    # UnitConverter for why UnitConverter appears before the splitter).
    processors = [
        UnicodeConverter(),
        LowerCaseConverter(),
        UnitConverter(),
        LowerUpperCaseSplitter(),
        # WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser="html.parser"),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"),
        Stemmer(stemmer_type="porter")
    ][0:1]

    # Quick smoke test on a single string.
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    # ----- process -----
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    # Lemmatization pass.
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED)
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll)

    # Stemming pass on top of the lemmatized data.
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED_STEMMED)
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll)
def main():
    """Assemble the raw modeling frame from the provided csv files.

    Loads train/test plus the attribute and description tables, concatenates
    train and test, then left-joins product description, brand, color and a
    flattened attribute string onto every row.  Pickles the merged frame to
    config.ALL_DATA_RAW and the (id, relevance) pair to config.INFO_DATA.
    """
    # load provided data
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")
    dfAttr = pd.read_csv(config.ATTR_DATA)
    dfDesc = pd.read_csv(config.DESC_DATA)

    # basic stats of the training target
    print("Train Mean: %.6f"%np.mean(dfTrain["relevance"]))
    print("Train Var: %.6f"%np.var(dfTrain["relevance"]))

    # test rows get a dummy relevance so train/test can be concatenated
    dfTest["relevance"] = np.zeros((config.TEST_SIZE))
    dfAttr.dropna(how="all", inplace=True)
    dfAttr["value"] = dfAttr["value"].astype(str)

    # concat train and test
    dfAll = pd.concat((dfTrain, dfTest), ignore_index=True)
    del dfTrain
    del dfTest
    gc.collect()

    # merge product description
    dfAll = pd.merge(dfAll, dfDesc, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfDesc
    gc.collect()

    # merge product brand
    dfBrand = dfAttr[dfAttr.name=="MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "product_brand"})
    # BUG FIX: cast to str BEFORE the merge; the original cast dfBrand after
    # merging, so the converted values never reached dfAll (dead code).
    # This mirrors how dfColor is handled below.
    dfBrand["product_brand"] = dfBrand["product_brand"].values.astype(str)
    dfAll = pd.merge(dfAll, dfBrand, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfBrand
    gc.collect()

    # merge product color: several attribute names all describe color,
    # aggregate the distinct values per product into one space-joined string
    color_columns = ["product_color", "Color Family", "Color/Finish", "Color/Finish Family"]
    dfColor = dfAttr[dfAttr.name.isin(color_columns)][["product_uid", "value"]].rename(columns={"value": "product_color"})
    dfColor.dropna(how="all", inplace=True)
    _agg_color = lambda df: " ".join(list(set(df["product_color"])))
    dfColor = dfColor.groupby("product_uid").apply(_agg_color)
    dfColor = dfColor.reset_index(name="product_color")
    dfColor["product_color"] = dfColor["product_color"].values.astype(str)
    dfAll = pd.merge(dfAll, dfColor, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfColor
    gc.collect()

    # merge product attribute: flatten all name/value pairs per product into
    # one ATTR_SEPARATOR-delimited string (split again downstream)
    _agg_attr = lambda df: config.ATTR_SEPARATOR.join(df["name"] + config.ATTR_SEPARATOR + df["value"])
    dfAttr = dfAttr.groupby("product_uid").apply(_agg_attr)
    dfAttr = dfAttr.reset_index(name="product_attribute_concat")
    dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left")
    dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
    del dfAttr
    gc.collect()

    # save data (optionally truncated for the fast "sample" task)
    if config.TASK == "sample":
        dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy()
    pkl_utils._save(config.ALL_DATA_RAW, dfAll)

    # info: keep just the id/relevance pair for later lookups
    dfInfo = dfAll[["id", "relevance"]].copy()
    pkl_utils._save(config.INFO_DATA, dfInfo)
def group(input_data, output_data, if_sample=False):
    """Bucket sentences into fixed-size bags keyed by (relation, head, tail).

    Reads a tab-separated file with columns r/e1/x1/y1/e2/x2/y2/s, groups the
    rows by (r, e1, e2), then pads small groups (by cycling their sentences)
    or splits large ones so every emitted bag holds exactly config.BAG_SIZE
    sentences.  The parallel lists (words, positions, heads, tails, labels)
    are pickled to `output_data`.  With `if_sample`, stops after ~10000 bags.
    """
    col_names = ["r", "e1", "x1", "y1", "e2", "x2", "y2", "s"]
    frame = pd.read_csv(input_data, sep="\t", names=col_names)

    words, positions, heads, tails, labels = [], [], [], [], []
    bag_size = config.BAG_SIZE

    processed = 0
    for key, bag in frame.groupby(["r", "e1", "e2"]):
        if if_sample and processed > 10000:
            break
        processed += 1
        if processed % 1000 == 0:
            print(processed)

        bag = bag.reset_index(drop=True)
        label, head, tail = key
        n_rows = bag.shape[0]

        sent_list = [bag.s[i] for i in range(n_rows)]
        pos_list = [[bag.x1[i], bag.y1[i], bag.x2[i], bag.y2[i]]
                    for i in range(n_rows)]

        def _emit(ws, ps):
            # record one complete bag together with its metadata
            words.append(ws)
            positions.append(ps)
            heads.append(head)
            tails.append(tail)
            labels.append(label)

        if n_rows < bag_size:
            # small group: cycle its sentences until the bag is full
            filled = n_rows
            padded_words = sent_list[:]
            padded_pos = pos_list[:]
            while filled + n_rows < bag_size:
                filled += n_rows
                padded_words += sent_list
                padded_pos += pos_list
            padded_words += sent_list[:bag_size - filled]
            padded_pos += pos_list[:bag_size - filled]
            _emit(padded_words, padded_pos)
        else:
            # large group: slice off full bags, then one final bag taken from
            # the tail (which may overlap the previous slice)
            offset = 0
            while offset + bag_size < n_rows:
                _emit(sent_list[offset:offset + bag_size],
                      pos_list[offset:offset + bag_size])
                offset += bag_size
            _emit(sent_list[-bag_size:], pos_list[-bag_size:])

    pkl_utils._save(output_data,
                    (words, positions,
                     np.array(heads), np.array(tails), np.array(labels)))
def save(self, model_dir, model_name):
    """Persist the wrapped model to model_dir/model_name and pickle the
    sentence labels next to it (same path with a .sent_label suffix)."""
    target = os.path.join(model_dir, model_name)
    # the model object knows how to serialize itself
    self.model.save(target)
    label_path = "%s.sent_label" % target
    pkl_utils._save(label_path, self.sentences.sent_label)
def parse(lang="en"):
    """Parse DBpedia-style instance-type and object-triple dumps into an
    integer-encoded triple dataset with train/valid/test splits, pickled to
    config.DATA_DICT[lang].
    """
    dataDict = {}
    infile = open(config.INSTANCE_TYPES[lang])
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    type_entries = []
    entitySet = set()
    typeSet = set()
    # each useful line looks like: <instance> <rdf:type> <ontology> .
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        instance = row[0][1:-1]   # strip surrounding <>
        ontology = row[2][1:-1]
        type_entries.append((instance, ontology))
        # NOTE(review): only the ontology side is added here; instances enter
        # entitySet later via the OBJECTS file — confirm this is intended.
        entitySet.add(ontology)
        typeSet.add(ontology)
    typeDict = {y:x for x, y in enumerate(typeSet)}
    infile.close()
    # at this point entitySet holds only types, so this count equals len(typeSet)
    cnt_type = len(entitySet)
    _log.info("%d types" % cnt_type)
    infile = open(config.OBJECTS[lang])
    relationDict = {}   # predicate -> list of (subject, target) pairs
    instanceSet = set()
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        subject = row[0][1:-1]
        predicate = row[1][1:-1]
        target = row[2][1:-1]
        entitySet.add(subject)
        entitySet.add(target)
        instanceSet.add(subject)
        instanceSet.add(target)
        if predicate in relationDict:
            relationDict[predicate].append((subject, target))
        else:
            relationDict[predicate] = [(subject, target)]
    # NOTE(review): instanceDict is built but never used in this function
    instanceDict = {y:x for x, y in enumerate(instanceSet)}
    entityDict = {y:x for x, y in enumerate(entitySet)}
    infile.close()
    cnt_ins = len(instanceSet)
    N = len(entitySet)
    # (log messages contain typos "instanes"/"entites"; left untouched here)
    _log.info("%d instanes" % cnt_ins)
    _log.info("%d entites" % N)
    # encode every (subject, object, predicate-id) as an integer triple
    triples = []
    predicateDict = {}
    cnt = 0
    for predicate in relationDict:
        entries = relationDict[predicate]
        sub = [entityDict[entry[0]] for entry in entries]
        obj = [entityDict[entry[1]] for entry in entries]
        pred = [cnt for entry in entries]
        triples.extend(zip(sub, obj, pred))
        predicateDict[cnt] = predicate
        cnt += 1
    # rdf:type triples become one extra relation, restricted to known instances
    type_entries = [entry for entry in type_entries if entry[0] in instanceSet]
    sub = [entityDict[entry[0]] for entry in type_entries]
    obj = [entityDict[entry[1]] for entry in type_entries]
    pred = [cnt for entry in type_entries]
    triples.extend(zip(sub, obj, pred))
    predicateDict[cnt] = rdf_type
    # Series allows fancy-indexing by the shuffled position list below
    triples = pd.Series(triples)
    _log.info("%d relations" % (cnt+1))
    _log.info("%d triples" % len(triples))
    dataDict["entities"] = list(entitySet)
    # NOTE(review): under Python 3 this stores a dict_values view, which is
    # not picklable — presumably should be list(predicateDict.values()); verify.
    dataDict["relations"] = predicateDict.values()
    # random split: last 20k shuffled positions become valid+test (10k each)
    IDX = list(range(len(triples)))
    shuffle(IDX)
    dataDict["train_subs"] = list(triples[IDX[:-20000]])
    dataDict["valid_subs"] = list(triples[IDX[-20000:-10000]])
    dataDict["test_subs"] = list(triples[IDX[-10000:]])
    pkl_utils._save(config.DATA_DICT[lang], dataDict)
    _log.info("train size: %d" % len(dataDict["train_subs"]))
    _log.info("valid size: %d" % len(dataDict["valid_subs"]))
    _log.info("test size: %d" % len(dataDict["test_subs"]))
    _log.info("parsing complete")
# --- notebook-converted script tail: final attribute merge and save ---
# (continues from earlier cells that built dfAll and the grouped dfAttr)
dfAttr = dfAttr.reset_index(name="product_attribute_concat")
dfAll = pd.merge(dfAll, dfAttr, on="product_uid", how="left")
dfAll.fillna(config.MISSING_VALUE_STRING, inplace=True)
del dfAttr
gc.collect()

# In[17]:

# notebook-only peek at the merged frame (result unused when run as a script)
dfAll.head()

# In[ ]:

# save data
if config.TASK == 'sample':
    dfAll = dfAll.iloc[:config.SAMPLE_SIZE].copy(
    )  # in this case ".copy" is redundant
pkl_utils._save(config.ALL_DATA_RAW, dfAll)
# info: keep just the id/relevance pair for later lookups
dfInfo = dfAll[['id', 'relevance']].copy()
pkl_utils._save(config.INFO_DATA, dfInfo)

# In[ ]:

# regenerate the .py version of this notebook when run interactively
if os.path.isfile('data_preparer.ipynb'):
    get_ipython().system('jupyter nbconvert --to script data_preparer.ipynb')

# In[ ]:

# In[ ]:
def main():
    """Clean all text columns of the merged raw frame: optional Google query
    correction, a lemmatizing processor chain, optional query expansion and
    auto spell-correction, then stemming — pickling a snapshot after each
    major stage (ALL_DATA_LEMMATIZED, ALL_DATA_LEMMATIZED_STEMMED).

    Relies on the module-level `now` timestamp and processor classes imported
    at file level.
    """
    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand",
        "product_color",
        "product_title",
        "search_term",
    ]
    if config.PLATFORM == "Linux":
        # one worker per column; other platforms keep the configured default
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors (order matters: each feeds the next)
    processors = [
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(),
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser="html.parser"),
        Lemmatizer(),
    ]
    # [0:1] keeps only the snowball stemmer
    stemmers = [
        Stemmer(stemmer_type="snowball"),
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple smoke test of the processor chain
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])

    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info("Run GoogleQuerySpellingChecker at search_term")
        checker = GoogleQuerySpellingChecker()
        dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)

    ## clean using a list of processors (in place, in parallel)
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])

    # query expansion
    # stopwords go through the same processor chain so they match cleaned text
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(processors)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
        dfAll["search_term_alt"] = qe.build()
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_alt"]])

    # save data (product_attribute_concat is dropped: it has been split above)
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll["search_term_auto_corrected"] = list(dfAll["search_term"].apply(checker.correct))
        # the corrected column must also be stemmed below
        columns_to_proc += ["search_term_auto_corrected"]
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data (re-save the lemmatized snapshot with the corrected column)
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## clean using stemmers (applied on top of the lemmatized text)
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)

    # query expansion (redone on the stemmed text)
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(stemmers)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
        dfAll["search_term_alt"] = qe.build()
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_alt"]])

    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def main():
    """Clean all text columns of the merged raw frame (f-string variant):
    optional Google query correction, a lemmatizing processor chain, optional
    query expansion and auto spell-correction, then stemming — pickling a
    snapshot after each major stage.
    """
    ### 1. Record Time
    now = time_utils._timestamp()

    ###########
    ## Setup ##
    ###########
    logname = f'data_processor_{now}.log'
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process.
    # Choose the columns by check data_preparer.ipynb. In the end, the notebook will show the clean data frame.
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # 'product_attribute_list',
        'product_attribute_concat',
        'product_description',
        'product_brand',
        'product_color',
        'product_title',
        'search_term',
    ]
    if config.PLATFORM == 'Linux':
        # one worker per column; other platforms keep the configured default
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors (order matters: each feeds the next)
    processors = [
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        # It makes little difference either way, unless we could stop a number
        # followed by the preposition "in" from being rewritten as the unit "in.(inch)"
        UnitConverter(),
        LowerUpperCaseSplitter(),
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser='html.parser'),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type='snowball'),
        Stemmer(stemmer_type='porter')
    ][0:1]  # means only use Stemmer(stemmer_type='snowball')

    ## simple test of the processor chain
    text = '1/2 inch rubber lep tips Bullet07'
    print('Original:')
    print(text)
    list_processor = ListProcessor(processors)
    print('After:')
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]
    if config.TASK == 'sample':
        dfAll = dfAll.iloc[0:config.SAMPLE_SIZE]
    print(f'data length: {len(dfAll)}')

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll['search_term_product_name'] = dfAll['search_term'].apply(
        ext.transform)
    dfAll['product_title_product_name'] = dfAll['product_title'].apply(
        ext.transform)
    if config.TASK == 'sample':
        print(dfAll[[
            'search_term', 'search_term_product_name',
            'product_title_product_name'
        ]])

    ## clean using GoogleQuerySpellingChecker(Chenglong team not used in final submission)
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info('Run GoogleQuerySpellingChecker at search_term')
        checker = GoogleQuerySpellingChecker()
        dfAll['search_term'] = dfAll['search_term'].apply(checker.correct)

    ## clean using a list of processors (in place, in parallel)
    df_processor = DataFrameParallelProcessor(processors,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_list)
    if config.TASK == 'sample':
        print(dfAll[['product_attribute', 'product_attribute_list']])

    # query expansion (Chenglong team decided to remove the feature which might be a major cause of overfitting.)
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(processors)
        # Stop words must go through the same data processing. E.g. the
        # NumberDigitMapper step replaces 'one' with '1', so a stop word
        # 'one' must be replaced with '1' too.
        base_stopwords = set(list_processor.process(list(
            config.STOP_WORDS)))  # a set of stop words
        qe = QueryExpansion(dfAll,
                            ngram=3,
                            stopwords_threshold=0.9,
                            base_stopwords=base_stopwords)
        dfAll['search_term_alt'] = qe.build()
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_alt']])

    # save data (product_attribute_concat is dropped: it has been split above)
    logger.info(f'Save to {config.ALL_DATA_LEMMATIZED}')
    columns_to_save = [
        col for col in dfAll.columns if col != 'product_attribute_concat'
    ]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## auto correcting query(Chenglong team not used in final submission)
    if config.AUTO_CORRECTING_QUERY:
        logger.info('Run AutoSpellingChecker at search_term')
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll['search_term'].apply(
            checker.correct))
        # the corrected column must also be stemmed below
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_auto_corrected']])
        # save query_correction_map and spelling checker
        fname = '%s/auto_spelling_checker_query_correction_map_%s.log' % (
            config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data (re-save the lemmatized snapshot with the corrected column)
        logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED)
        columns_to_save = [
            col for col in dfAll.columns if col != 'product_attribute_concat'
        ]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## clean using stemmers (applied on top of the lemmatized text)
    df_processor = DataFrameParallelProcessor(stemmers,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_list)

    # query expansion (redone on the stemmed text)
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(stemmers)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll,
                            ngram=3,
                            stopwords_threshold=0.9,
                            base_stopwords=base_stopwords)
        dfAll['search_term_alt'] = qe.build()
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_alt']])

    # save data
    logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [
        col for col in dfAll.columns if col != 'product_attribute_concat'
    ]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def save(self, model_dir, model_name):
    """Write the wrapped model under model_dir and pickle its sentence
    labels to the same path with a .sent_label suffix."""
    fname = os.path.join(model_dir, model_name)
    labels = self.sentences.sent_label
    self.model.save(fname)
    pkl_utils._save("%s.sent_label"%fname, labels)
def main():
    """Clean all text columns of the merged raw frame: Google query
    correction and query expansion run unconditionally in this variant
    (sibling versions guard them with config flags), followed by the
    lemmatizing processor chain, optional auto spell-correction, then
    stemming — pickling a snapshot after each major stage.
    """
    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand",
        "product_color",
        "product_title",
        "search_term",
    ]
    if config.PLATFORM == "Linux":
        # one worker per column; other platforms keep the configured default
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors (order matters: each feeds the next)
    processors = [
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(),
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser="html.parser"),
        Lemmatizer(),
    ]
    # [0:1] keeps only the snowball stemmer
    stemmers = [
        Stemmer(stemmer_type="snowball"),
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple smoke test of the processor chain
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])

    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    # NOTE(review): unconditional here — other variants gate this behind
    # config.GOOGLE_CORRECTING_QUERY; confirm this is intended.
    logger.info("Run GoogleQuerySpellingChecker at search_term")
    checker = GoogleQuerySpellingChecker()
    dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)

    ## clean using a list of processors (in place, in parallel)
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])

    # query expansion (also unconditional in this variant)
    # stopwords go through the same processor chain so they match cleaned text
    list_processor = ListProcessor(processors)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])

    # save data (product_attribute_concat is dropped: it has been split above)
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll["search_term"].apply(checker.correct))
        # the corrected column must also be stemmed below
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data (re-save the lemmatized snapshot with the corrected column)
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## clean using stemmers (applied on top of the lemmatized text)
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)

    # query expansion (redone on the stemmed text)
    list_processor = ListProcessor(stemmers)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])

    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
def save(self, fname):
    """Pickle this object's splits to `fname`."""
    current_splits = self.splits
    pkl_utils._save(fname, current_splits)
def parse(lang="en"):
    """Parse DBpedia-style instance-type and object-triple dumps into a list
    of per-predicate sparse adjacency matrices (one N x N csr_matrix per
    relation, plus one for rdf:type), pickling the tensor and the lookup
    dictionaries under the config paths for `lang`.
    """
    infile = open(config.INSTANCE_TYPES[lang])
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    type_entries = []
    entitySet = set()
    typeSet = set()
    # each useful line looks like: <instance> <rdf:type> <ontology> .
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        instance = row[0][1:-1]   # strip surrounding <>
        ontology = row[2][1:-1]
        type_entries.append((instance, ontology))
        # NOTE(review): only the ontology side is added here; instances enter
        # entitySet later via the OBJECTS file — confirm this is intended.
        entitySet.add(ontology)
        typeSet.add(ontology)
    typeDict = {y:x for x, y in enumerate(typeSet)}
    infile.close()
    # at this point entitySet holds only types, so this count equals len(typeSet)
    cnt_type = len(entitySet)
    _log.info("%d types" % cnt_type)
    infile = open(config.OBJECTS[lang])
    relationDict = {}   # predicate -> list of (subject, target) pairs
    instanceSet = set()
    for line in infile.readlines():
        if line[0] != "<":
            continue
        row = line.split()
        subject = row[0][1:-1]
        predicate = row[1][1:-1]
        target = row[2][1:-1]
        entitySet.add(subject)
        entitySet.add(target)
        instanceSet.add(subject)
        instanceSet.add(target)
        if predicate in relationDict:
            relationDict[predicate].append((subject, target))
        else:
            relationDict[predicate] = [(subject, target)]
    instanceDict = {y:x for x, y in enumerate(instanceSet)}
    entityDict = {y:x for x, y in enumerate(entitySet)}
    infile.close()
    cnt_ins = len(instanceSet)
    N = len(entitySet)
    # (log messages contain typos "instanes"/"entites"; left untouched here)
    _log.info("%d instanes" % cnt_ins)
    _log.info("%d entites" % N)
    # one N x N binary adjacency matrix per predicate
    tensor = []
    predicateDict = {}
    cnt = 0
    for predicate in relationDict:
        entries = relationDict[predicate]
        rows = [entityDict[entry[0]] for entry in entries]
        cols = [entityDict[entry[1]] for entry in entries]
        data = [1 for entry in entries]
        mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
        tensor.append(mat)
        predicateDict[cnt] = predicate
        cnt += 1
    # rdf:type becomes one extra slice, restricted to known instances
    type_entries = [entry for entry in type_entries if entry[0] in instanceSet]
    rows = [entityDict[entry[0]] for entry in type_entries]
    cols = [entityDict[entry[1]] for entry in type_entries]
    data = [1 for entry in type_entries]
    mat = spsp.csr_matrix((data, (rows, cols)), (N, N))
    tensor.append(mat)
    predicateDict[cnt] = rdf_type
    _log.info("%d relations" % (cnt+1))
    pkl_utils._save(config.TENSOR[lang], tensor)
    pkl_utils._save(config.ENTITY[lang], entityDict)
    pkl_utils._save(config.PREDICATE[lang], predicateDict)
    pkl_utils._save(config.INSTANCE[lang], instanceDict)
    pkl_utils._save(config.TYPE[lang], typeDict)
    # rows/cols still hold the rdf:type coordinates computed just above
    pkl_utils._save(config.TYPE_MATRIX[lang], (rows, cols))
    _log.info("parsing complete")