            sublinear_tf=1, stop_words='english')),
        ])),
    ])),
    ('svd', TruncatedSVD(n_components=250, random_state=45)),
    #('scl', StandardScaler())
])


# In[7]:

X2 = trf.fit_transform(X)
X2.shape


# In[8]:

X2_test = trf.transform(X_test)
X2_test.shape


# In[9]:

#dump("data/XXtestY250_clean", (X2, X2_test, y))
#dump("data/XXtestY250_am_t0", (X2, X2_test, y))
dump("data/XXtestY250_r2td", (X2, X2_test, y))
#dump("data/XXtestY250_ver1", (X2, X2_test, y))
#dump("data/XXtestY250_clean_description", (X2, X2_test, y))
#X2, X2_test, y = load("data/XXtestY250")

np.savetxt("data/X250_3.csv", X2, delimiter=",")
np.savetxt("data/X250_test_3.csv", X2_test, delimiter=",")
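# The head of the `trf` pipeline is cut off above; the following is only a
# minimal sketch of its likely shape (the column names, the ItemSelector
# helper, and any TF-IDF settings beyond those visible above are assumptions):

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline

class ItemSelector(BaseEstimator, TransformerMixin):
    """Pick one text column out of a DataFrame (hypothetical helper)."""
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.key]

trf_sketch = Pipeline([
    ('union', FeatureUnion([
        ('title', Pipeline([
            ('sel', ItemSelector('product_title')),
            ('tfidf', TfidfVectorizer(sublinear_tf=1, stop_words='english')),
        ])),
        ('description', Pipeline([
            ('sel', ItemSelector('product_description')),
            ('tfidf', TfidfVectorizer(sublinear_tf=1, stop_words='english')),
        ])),
    ])),
    ('svd', TruncatedSVD(n_components=250, random_state=45)),
])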
# In[10]:

def union_features_one(x):
    # Count token overlaps between the query, title, and description.
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))
    qt = len(q & t)
    # The original tail of this function is cut off; a plausible completion
    # that returns the remaining overlap counts and set sizes:
    qd = len(q & d)
    return qt, qd, len(q), len(t), len(d)
manx = set(['man', 'boy', 'men'])
# The original definition of `womanx` is missing here; an analogous word set
# is assumed:
womanx = set(['woman', 'girl', 'women'])

def is_man(s):
    wx = set(process_str(s))
    return len(manx & wx) > 0

def is_woman(s):
    wx = set(process_str(s))
    return len(womanx & wx) > 0

def wm_opposite(row):
    # Flag query/title pairs whose gender words contradict each other,
    # e.g. a "men" query matched against a women-only product title.
    m_q = is_man(row['query'])
    w_q = is_woman(row['query'])
    m_t = is_man(row['product_title'])
    w_t = is_woman(row['product_title'])
    if m_q and w_t and not m_t:
        return 1
    elif w_q and m_t and not w_t:
        return 1
    return 0

# use .values to get a NumPy array before reshaping (a pandas Series has no
# reshape method)
train_wm = train.apply(wm_opposite, axis=1).values.reshape(-1, 1)
test_wm = test.apply(wm_opposite, axis=1).values.reshape(-1, 1)
train_wm.sum(), test_wm.sum(), train_wm.shape, test_wm.shape

dump('data/wm_features', (train_wm, test_wm))
np.savetxt('data/train_wm.csv', train_wm, delimiter=',')
np.savetxt('data/test_wm.csv', test_wm, delimiter=',')
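# A quick sanity check of the mismatch flag on toy rows. This is only a
# sketch: the toy data is made up, and the expected output assumes
# process_str (defined earlier in the notebook) tokenizes roughly by
# lowercased words:
import pandas as pd

toy = pd.DataFrame([
    {'query': 'men running shoes', 'product_title': 'women running shoe'},  # opposite gender -> 1
    {'query': 'men running shoes', 'product_title': 'men running shoe'},    # same gender     -> 0
])
toy.apply(wm_opposite, axis=1).tolist()  # expected: [1, 0]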
# imports used below (likely already in scope earlier in the notebook)
import difflib
import re

from bs4 import BeautifulSoup
from nltk import bigrams


def autocorrect_query(query, train=None, test=None, cutoff=0.8, warning_on=True):
    """Autocorrect a query based on the training set."""
    # Gather all rows (train and test) that share this query; columns 2 and 3
    # appear to hold product_title and product_description.
    train_data = train.values[train['query'].values == query, :]
    test_data = test.values[test['query'].values == query, :]
    s = ""
    for r in train_data:
        s = "%s %s %s" % (s, BeautifulSoup(r[2]).get_text(" ", strip=True),
                          BeautifulSoup(r[3]).get_text(" ", strip=True))
    for r in test_data:
        s = "%s %s %s" % (s, BeautifulSoup(r[2]).get_text(" ", strip=True),
                          BeautifulSoup(r[3]).get_text(" ", strip=True))
    # Candidate vocabulary: unigrams plus bigrams of the pooled text.
    s = re.findall(r'[\'\"\w]+', s.lower())
    s_bigram = [' '.join(i) for i in bigrams(s)]
    s.extend(s_bigram)
    corrected_query = []
    for q in query.lower().split():
        if len(q) <= 2:
            corrected_query.append(q)
            continue
        # Replace each query word with its closest match in the vocabulary.
        corrected_word = difflib.get_close_matches(q, s, n=1, cutoff=cutoff)
        if len(corrected_word) > 0:
            corrected_query.append(corrected_word[0])
        else:
            if warning_on:
                print("WARNING: cannot find matched word for '%s' -> used the original word" % q)
            corrected_query.append(q)
    return ' '.join(corrected_query)


query_map = build_query_correction_map()  # defined elsewhere in the notebook
dump('data/query_auto_correct', query_map)
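# How the saved map would be applied later in the pipeline (a sketch; it
# assumes build_query_correction_map returned a {raw query -> corrected query}
# dict and that `load` is the counterpart of the `dump` helper used above):
query_map = load('data/query_auto_correct')
train['query'] = train['query'].map(lambda q: query_map.get(q, q))
test['query'] = test['query'].map(lambda q: query_map.get(q, q))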
def create_similarity_features(train, row, id=None):
    # For each relevance level 1-4, pool the tokens of training rows that
    # share this row's query, then count the overlap with this row's own
    # title and description tokens (`id` excludes the row itself on train).
    tx1, dx1 = get_str_for_query(train, row['query'], row['product_title'], row['product_description'], 1, id)
    tx2, dx2 = get_str_for_query(train, row['query'], row['product_title'], row['product_description'], 2, id)
    tx3, dx3 = get_str_for_query(train, row['query'], row['product_title'], row['product_description'], 3, id)
    tx4, dx4 = get_str_for_query(train, row['query'], row['product_title'], row['product_description'], 4, id)
    our_tx = set(process_str(row['product_title']))
    our_dx = set(process_str(row['product_description']))
    return (len(tx1 & our_tx), len(dx1 & our_dx),
            len(tx2 & our_tx), len(dx2 & our_dx),
            len(tx3 & our_tx), len(dx3 & our_dx),
            len(tx4 & our_tx), len(dx4 & our_dx))


# In[8]:

train_fx = np.array([create_similarity_features(train, train.iloc[i], i) for i in range(train.shape[0])])
dump('data/train1234_c1_r', train_fx)
test_fx = np.array([create_similarity_features(train, test.iloc[i]) for i in range(test.shape[0])])
dump('data/test1234_c1_r', test_fx)


# ## Similarity less strict

# In[9]:

def get_str_for_query2(train, q, product_title, product_description, median_relevance, id=None):
    df = train[(train['query'] == q) &
               (train['median_relevance'] == median_relevance) &
               (train.index != id)]
    title_set = set()
    for e in df['product_title'].values:
        title_set |= set(process_str(e))
    # The rest of this function is cut off; a plausible completion that
    # mirrors the title handling for descriptions:
    desc_set = set()
    for e in df['product_description'].values:
        desc_set |= set(process_str(e))
    return title_set, desc_set
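# The "less strict" variant would presumably be wired up the same way as the
# strict one above; a sketch only (the function name and its use of
# get_str_for_query2 are assumptions):
def create_similarity_features2(train, row, id=None):
    fx = []
    for level in (1, 2, 3, 4):
        tx, dx = get_str_for_query2(train, row['query'], row['product_title'],
                                    row['product_description'], level, id)
        fx.append(len(tx & set(process_str(row['product_title']))))
        fx.append(len(dx & set(process_str(row['product_description']))))
    return tuple(fx)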
if re.search("[0-9]+", w): continue if len(w)>7: for i in range(3, len(w)-2): pf = -1 w1 = w[:i] w2 = w[i:] f1 = wx.get(w1, 0) f2 = wx.get(w2, 0) if f1>10 and f2>10 and (f1+f2)>pf: w1 = stem_one(w1) w2 = stem_one(w2) rwx[w] = (w1, w2) pf = f1+f2 dump('data/word_to_2_replacer', rwx) numbers = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'] replace = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70, 80, 90] replace = [str(n) for n in replace] numbers = [stem_one(w) for w in numbers] num_to_num = dict(zip(numbers, replace)) dump('data/num_to_num', num_to_num)