def assemble_counts(train):
    X = []
    titles = []
    for i in range(len(train.id)):
        # Clean, strip HTML, keep alphanumerics, and stem both query and title
        query = correct_string(train['query'][i].lower())
        title = correct_string(train.product_title[i].lower())
        query = " ".join(BeautifulSoup(query).get_text(" ").split(" "))
        title = " ".join(BeautifulSoup(title).get_text(" ").split(" "))
        query = re.sub("[^a-zA-Z0-9]", " ", query)
        title = re.sub("[^a-zA-Z0-9]", " ", title)
        query = " ".join([stemmer.stem(z) for z in query.split(" ")])
        title = " ".join([stemmer.stem(z) for z in title.split(" ")])
        query = " ".join(query.split())
        title = " ".join(title.split())

        # dist_qt = compression_distance(query, title)
        dist_qt2 = 1 - seq_matcher(None, query, title).ratio()
        query_len = len(query.split())
        title_len = len(title.split())

        tmp_title = title
        word_counter_qt = 0
        lev_dist_arr = []
        for q in query.split():
            lev_dist_q = []
            for t in title.split():
                lev_dist = seq_matcher(None, q, t).ratio()
                if lev_dist > 0.9:
                    word_counter_qt += 1
                    tmp_title += ' ' + q  # add such words to title to increase their weights in tfidf
                lev_dist_q.append(lev_dist)
            lev_dist_arr.append(lev_dist_q)

        # Does the last query word (near-)match any title word?
        last_word_in = 0
        for t in title.split():
            lev_dist = seq_matcher(None, query.split()[-1], t).ratio()
            if lev_dist > 0.9:
                last_word_in = 1

        # One minus the mean best per-query-word similarity
        lev_max = 0
        for item in lev_dist_arr:
            lev_max_q = max(item)
            lev_max += lev_max_q
        lev_max = 1 - lev_max / len(lev_dist_arr)

        word_counter_qt_norm = word_counter_qt / query_len
        X.append([query_len, title_len, word_counter_qt, lev_max,
                  last_word_in, word_counter_qt_norm, dist_qt2])
        titles.append(tmp_title)
    X = np.array(X).astype(float)  # np.float is removed in recent NumPy
    return X, np.array(titles)
def distance_between_query_and_title():
    for cata in catagories:
        with open("%s/query_unigram_%s.pickle" % (ngramFolderPath, cata), "rb") as f:
            query_unigram = pickle.load(f)
        with open("%s/title_unigram_%s.pickle" % (ngramFolderPath, cata), "rb") as f:
            title_unigram = pickle.load(f)
        sz = len(query_unigram)
        compression_distance = np.zeros([sz, 1])
        edit_distance = np.zeros([sz, 1])
        mean_maximum_edit_distance = np.zeros([sz, 1])
        for i in range(sz):
            query = " ".join(query_unigram[i])
            title = " ".join(title_unigram[i])
            compression_distance[i][0] = compressionDistance(query, title)
            edit_distance[i][0] = 1 - seq_matcher(None, query, title).ratio()
            # Best per-query-word similarity against any title word
            lev_dist_arr = []
            for q in query.split():
                lev_dist_q = []
                for t in title.split():
                    lev_dist = seq_matcher(None, q, t).ratio()
                    lev_dist_q.append(lev_dist)
                lev_dist_arr.append(lev_dist_q)
            lev_max = 0
            for item in lev_dist_arr:
                lev_max_q = max(item)
                lev_max += lev_max_q
            lev_max = 1 - lev_max / len(lev_dist_arr)
            mean_maximum_edit_distance[i][0] = lev_max
            if i % 1000 == 0:
                print(i)
        with open("%s/compression_distance_%s.pickle" % (folderPath, cata), "wb") as f:
            pickle.dump(compression_distance, f)
        with open("%s/edit_distance_%s.pickle" % (folderPath, cata), "wb") as f:
            pickle.dump(edit_distance, f)
        with open("%s/mean_maximum_edit_distance_%s.pickle" % (folderPath, cata), "wb") as f:
            pickle.dump(mean_maximum_edit_distance, f)
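# The compressionDistance / compression_distance helper called above (and in
# assemble_counts2 below) is not defined in these snippets. A minimal sketch of
# one plausible stand-in, assuming it is a zlib-based normalized compression
# distance; the project's actual helper may differ.
import zlib

def compression_distance_sketch(x, y):
    # Hypothetical stand-in: normalized compression distance computed from the
    # zlib-compressed lengths of x, y, and their concatenation.
    cx = len(zlib.compress(x.encode("utf-8")))
    cy = len(zlib.compress(y.encode("utf-8")))
    cxy = len(zlib.compress((x + " " + y).encode("utf-8")))
    return (cxy - min(cx, cy)) / max(cx, cy)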
def assemble_counts2(train):
    X = []
    queries = []
    for i in range(len(train.id)):
        query = train['query'][i]
        title = train.product_title[i]
        dist_qt = compression_distance(query, title)
        dist_qt2 = 1 - seq_matcher(None, query, title).ratio()
        query_len = len(query.split())

        lev_dist_arr = []
        word_rank_list = []
        word_q_ind = 0
        word_counter_qt = 0
        for q in query.split():
            word_q_ind += 1
            lev_dist_q = []
            for t in title.split():
                lev_dist = seq_matcher(None, q, t).ratio()
                if lev_dist > 0.9:
                    word_counter_qt += 1
                    word_rank_list.append(word_q_ind)
                    # tmp_title += ' ' + q  # add such words to title to increase their weights in tfidf
                lev_dist_q.append(lev_dist)
            lev_dist_arr.append(lev_dist_q)

        # Position of the earliest matching query word, flipped so that earlier
        # matches get a larger value (0 if nothing matched)
        if word_counter_qt == 0:
            maxrank = 0
        else:
            maxrank = 26 - min(word_rank_list)

        lev_max = 0
        for item in lev_dist_arr:
            lev_max_q = max(item)
            lev_max += lev_max_q
        lev_max = 1 - lev_max / len(lev_dist_arr)

        word_counter_qt_norm = word_counter_qt / query_len
        X.append([word_counter_qt, dist_qt, dist_qt2, lev_max,
                  word_counter_qt_norm, maxrank])
        queries.append(query)
    X = np.array(X).astype(float)  # np.float is removed in recent NumPy
    return X, np.array(queries)
def last_word_from_query_present_title():
    for cata in catagories:
        with open("%s/query_unigram_%s.pickle" % (ngramFolderPath, cata), "rb") as f:
            query_unigram = pickle.load(f)
        with open("%s/title_unigram_%s.pickle" % (ngramFolderPath, cata), "rb") as f:
            title_unigram = pickle.load(f)
        sz = len(query_unigram)
        output = np.zeros([sz, 1])
        for i in range(sz):
            cnt = 0
            for word in title_unigram[i]:
                lev_dist = seq_matcher(None, query_unigram[i][-1], word).ratio()
                if lev_dist > 0.9:
                    cnt = 1
                    break
            output[i][0] = cnt
        print(output.shape)
        with open("%s/last_word_from_query_present_title_%s.pickle" % (folderPath, cata), "wb") as f:
            pickle.dump(output, f)
def gen_count_word_query_in_title():
    for cata in catagories:
        with open("%s/query_unigram_%s.pickle" % (ngramFolderPath, cata), "rb") as f:
            query_unigram = pickle.load(f)
        with open("%s/title_unigram_%s.pickle" % (ngramFolderPath, cata), "rb") as f:
            title_unigram = pickle.load(f)
        sz = len(query_unigram)
        output = np.zeros([sz, 1])
        for i in range(sz):
            cnt = 0
            for qword in query_unigram[i]:
                for tword in title_unigram[i]:
                    lev_dist = seq_matcher(None, qword, tword).ratio()
                    if lev_dist > 0.9:
                        cnt += 1
            output[i][0] = cnt
        print(output.shape)
        with open("%s/count_word_query_in_title_%s.pickle" % (folderPath, cata), "wb") as f:
            pickle.dump(output, f)
def last_word(query, title):
    if len(query) == 0 or len(title) == 0:
        return 0
    for t in title:
        dist = seq_matcher(None, query[-1], t).ratio()
        if dist > 0.9:
            return 1
    return 0
def edist_norm(query, title):
    w = 0
    for q in query:
        for t in title:
            lev_dist = seq_matcher(None, q, t).ratio()
            if lev_dist > 0.9:
                w += 1
    return try_divide(w, len(query))
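# last_word and edist_norm take tokenized query/title lists and rely on
# external names: seq_matcher (assumed here to be difflib.SequenceMatcher) and
# try_divide (assumed to be a zero-safe division helper). A minimal usage
# sketch under those assumptions:
from difflib import SequenceMatcher as seq_matcher

def try_divide(x, y):
    # Hypothetical stand-in for the project's try_divide helper:
    # return 0 when the denominator is zero.
    return x / y if y != 0 else 0.0

query = "red toaster".split()
title = "red 2 slice toaster oven".split()
print(last_word(query, title))   # 1: the last query word matches a title word
print(edist_norm(query, title))  # 1.0: both query words have a near-exact title match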
def get_difflib_features(df):
    logging.info('get difflib features')
    feat = pd.DataFrame(index=df.index)
    seq_distances = []
    for a, b in zip(df.search_term, df.product_title):
        # Compare only the alphanumeric characters of each string
        a = ''.join([c for c in a if c.isalnum()])
        b = ''.join([c for c in b if c.isalnum()])
        seq_distances.append(seq_matcher(None, a, b).ratio())
    feat['seq_match_ratio'] = 1.0 - np.array(seq_distances)
    return feat
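# get_difflib_features expects a DataFrame with search_term and product_title
# columns and uses the module-level names pd, np, logging, and seq_matcher. A
# small usage sketch with those assumed names spelled out (the example rows are
# made up):
import logging
import numpy as np
import pandas as pd
from difflib import SequenceMatcher as seq_matcher

df = pd.DataFrame({
    "search_term": ["red toaster", "cordless drill"],
    "product_title": ["Red 2-Slice Toaster", "20V Cordless Drill Kit"],
})
print(get_difflib_features(df))  # seq_match_ratio: lower values mean closer strings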
def mean_dist(data, col1, col2):
    mean_edit_s_t = []
    for i in range(len(data)):
        search = data[col1][i]
        title = data[col2][i]
        max_edit_s_t_arr = []
        for s in search.split():
            max_edit_s_t = []
            for t in title.split():
                a = seq_matcher(None, s, t).ratio()
                max_edit_s_t.append(a)
            max_edit_s_t_arr.append(max_edit_s_t)
        # Mean of the best per-search-word similarity against the title
        l = 0
        for item in max_edit_s_t_arr:
            l = l + max(item)
        mean_edit_s_t.append(l / len(max_edit_s_t_arr))
    return mean_edit_s_t
def compute_one_edit_distance(row):
    query = row['search_term']
    title = row['product_title']
    return 1 - seq_matcher(None, query, title).ratio()
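# compute_one_edit_distance is written for row-wise use; any mapping that
# supports row['search_term'] / row['product_title'] lookups works. A small
# sketch, assuming a DataFrame like the one in the example above:
row = {"search_term": "red toaster", "product_title": "Red 2-Slice Toaster"}
print(compute_one_edit_distance(row))

# Applied to a whole frame, one distance per row:
# df["edit_distance"] = df.apply(compute_one_edit_distance, axis=1)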
def assemble_counts(train, m='train'):
    X = []
    titles = []
    queries = []
    weights = []
    train['isdesc'] = 1  # Description present flag
    train.loc[train['product_description'].isnull(), 'isdesc'] = 0
    for i in range(len(train.id)):
        # Clean, strip HTML, keep alphanumerics, and stem both query and title
        query = correct_string(train['query'][i].lower())
        title = correct_string(train.product_title[i].lower())
        query = " ".join(BeautifulSoup(query).get_text(" ").split(" "))
        title = " ".join(BeautifulSoup(title).get_text(" ").split(" "))
        query = text.re.sub("[^a-zA-Z0-9]", " ", query)
        title = text.re.sub("[^a-zA-Z0-9]", " ", title)
        query = " ".join([stemmer.stem(z) for z in query.split(" ")])
        title = " ".join([stemmer.stem(z) for z in title.split(" ")])
        query = " ".join(query.split())
        title = " ".join(title.split())

        dist_qt = compression_distance(query, title)
        dist_qt2 = 1 - seq_matcher(None, query, title).ratio()
        query_len = len(query.split())
        title_len = len(title.split())
        isdesc = train.isdesc[i]

        tmp_title = title
        word_counter_qt = 0
        lev_dist_arr = []
        for q in query.split():
            lev_dist_q = []
            for t in title.split():
                lev_dist = seq_matcher(None, q, t).ratio()
                if lev_dist > 0.9:
                    word_counter_qt += 1
                    # tmp_title += ' ' + q  # add such words to title to increase their weights in tfidf
                lev_dist_q.append(lev_dist)
            lev_dist_arr.append(lev_dist_q)

        # Does the last query word (near-)match any title word?
        last_word_in = 0
        for t in title.split():
            lev_dist = seq_matcher(None, query.split()[-1], t).ratio()
            if lev_dist > 0.9:
                last_word_in = 1

        # One minus the mean best per-query-word similarity
        lev_max = 0
        for item in lev_dist_arr:
            lev_max_q = max(item)
            lev_max += lev_max_q
        lev_max = 1 - lev_max / len(lev_dist_arr)

        word_counter_qt_norm = word_counter_qt / query_len
        X.append([query_len, title_len, isdesc, word_counter_qt, dist_qt,
                  dist_qt2, lev_max, last_word_in, word_counter_qt_norm])
        titles.append(tmp_title)
        queries.append(query)
        if m == 'train':
            # Down-weight rows where rater agreement was low
            weights.append(1 / (float(train["relevance_variance"][i]) + 1.0))
    X = np.array(X).astype(float)  # np.float is removed in recent NumPy
    if m == 'train':
        return X, np.array(weights).astype(float), np.array(titles), np.array(queries)
    else:
        return X, np.array(titles), np.array(queries)
def edist(q, t):
    return 1 - seq_matcher(None, q, t).ratio()
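# edist is the same 1 - ratio() distance used throughout; as elsewhere,
# seq_matcher is assumed to be difflib.SequenceMatcher. A quick check:
from difflib import SequenceMatcher as seq_matcher

print(edist("angle bracket", "angled bracket"))  # close to 0 for near-identical strings
print(edist("angle bracket", "led strip"))       # closer to 1 for unrelated strings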