Exemple #1
0
                               sublinear_tf=1,
                               stop_words='english')),
          ])),
     ])),
    ('svd', TruncatedSVD(n_components=250, random_state=45)),
    #('scl', StandardScaler())
])

# In[7]:

# Fit the transformer on the training inputs and keep the transformed matrix.
# NOTE(review): `trf` is assigned above this chunk; presumably it is the
# pipeline that ends in TruncatedSVD(n_components=250, random_state=45) — confirm.
X2 = trf.fit_transform(X)
# Bare expression: displayed as the cell output when run in the notebook.
X2.shape

# In[8]:

# Apply the already-fitted transformer to the test inputs (transform only, no refit).
X2_test = trf.transform(X_test)
X2_test.shape

# In[9]:

# Alternative output names from other runs, left disabled.
#dump("data/XXtestY250_clean", (X2, X2_test, y))
#dump("data/XXtestY250_am_t0", (X2, X2_test, y))

# Store both matrices together with `y` under the currently active name.
# `dump` is not defined in this chunk; presumably a project serialization helper.
dump("data/XXtestY250_r2td", (X2, X2_test, y))

#dump("data/XXtestY250_ver1", (X2, X2_test, y))
#dump("data/XXtestY250_clean_description", (X2, X2_test, y))
#X2, X2_test, y = load("data/XXtestY250")
# Also write each matrix as comma-separated text (the "_3" file pair).
np.savetxt("data/X250_3.csv", X2, delimiter=",")
np.savetxt("data/X250_test_3.csv", X2_test, delimiter=",")
# Fit the transformer on the training inputs and keep the transformed matrix.
# NOTE(review): `trf`, `X`, `X_test` and `y` are defined outside this chunk.
X2 = trf.fit_transform(X)
# Bare expression: displayed as the cell output when run in the notebook.
X2.shape


# In[8]:

# Reuse the fitted transformer on the test inputs (transform only, no refit).
X2_test = trf.transform(X_test)
X2_test.shape


# In[9]:

# Alternative output names from other runs, left disabled.
#dump("data/XXtestY250_clean", (X2, X2_test, y))
#dump("data/XXtestY250_am_t0", (X2, X2_test, y))

# Store both matrices together with `y` under the currently active name.
# `dump` is not defined in this chunk; presumably a project serialization helper.
dump("data/XXtestY250_r2td", (X2, X2_test, y))

#dump("data/XXtestY250_ver1", (X2, X2_test, y))
#dump("data/XXtestY250_clean_description", (X2, X2_test, y))
#X2, X2_test, y = load("data/XXtestY250")
# Also write each matrix as comma-separated text.
np.savetxt("data/X250.csv", X2, delimiter=",")
np.savetxt("data/X250_test.csv", X2_test, delimiter=",")


# In[10]:

def union_features_one(x):
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))
    qt = len(q & t)
    ])),
    ('svd', TruncatedSVD(n_components=250, random_state=44)),
    #('scl', StandardScaler())
])


# In[7]:

# Fit the transformer on the training inputs and keep the transformed matrix.
# NOTE(review): `trf` is assigned above this chunk; presumably it is the
# pipeline that ends in TruncatedSVD(n_components=250, random_state=44) — confirm.
X2 = trf.fit_transform(X)
# Bare expression: displayed as the cell output when run in the notebook.
X2.shape


# In[8]:

# Reuse the fitted transformer on the test inputs (transform only, no refit).
X2_test = trf.transform(X_test)
X2_test.shape


# In[9]:

# Alternative output names from other runs, left disabled.
#dump("data/XXtestY250_clean", (X2, X2_test, y))
#dump("data/XXtestY250_am_t0", (X2, X2_test, y))

# Store both matrices together with `y` under the currently active name.
# `dump` is not defined in this chunk; presumably a project serialization helper.
dump("data/XXtestY250_r2td", (X2, X2_test, y))

#dump("data/XXtestY250_ver1", (X2, X2_test, y))
#dump("data/XXtestY250_clean_description", (X2, X2_test, y))
#X2, X2_test, y = load("data/XXtestY250")
# Also write each matrix as comma-separated text (the "_2" file pair).
np.savetxt("data/X250_2.csv", X2, delimiter=",")
np.savetxt("data/X250_test_2.csv", X2_test, delimiter=",")
# Tokens that mark a text as referring to men.
manx = {'man', 'boy', 'men'}

def is_man(s):
    """Return True when `s` contains at least one token from `manx`.

    `process_str` (defined outside this chunk) turns `s` into the tokens
    that are matched against the set.
    """
    return bool(manx & set(process_str(s)))

def is_woman(s):
    """Return True when `s` contains at least one token from `womanx`.

    Both `womanx` and `process_str` are defined outside this chunk.
    """
    tokens = set(process_str(s))
    return bool(womanx & tokens)

def wm_opposite(row):
    """Return 1 when query and product title point at opposite genders, else 0.

    A row is flagged when the query matches one gender while the title
    matches the other gender and does not also match the query's gender.
    `row` must support lookup by 'query' and 'product_title'.
    """
    query = row['query']
    m_q, w_q = is_man(query), is_woman(query)
    title = row['product_title']
    m_t, w_t = is_man(title), is_woman(title)

    man_query_woman_title = m_q and w_t and not m_t
    woman_query_man_title = w_q and m_t and not w_t
    return 1 if (man_query_woman_title or woman_query_man_title) else 0

# Compute the opposite-gender flag for every row of both frames.
# `DataFrame.apply(..., axis=1)` returns a pandas Series, and `Series.reshape`
# was deprecated in pandas 0.19 and later removed. Reshaping the underlying
# NumPy array gives the same (n_rows, 1) column on old and new pandas alike.
train_wm = train.apply(wm_opposite, axis=1).values.reshape(-1, 1)
test_wm = test.apply(wm_opposite, axis=1).values.reshape(-1, 1)
# Bare expression: shows flag counts and array shapes as the cell output.
train_wm.sum(), test_wm.sum(), train_wm.shape, test_wm.shape

# `dump` is not defined in this chunk; presumably a project serialization helper.
dump('data/wm_features', (train_wm, test_wm))

# Also write each flag column as comma-separated text.
np.savetxt('data/train_wm.csv', train_wm, delimiter=',')
np.savetxt('data/test_wm.csv', test_wm, delimiter=',')
    wx = set(process_str(s))
    return len(manx & wx) > 0


def is_woman(s):
    """Tell whether any token of `s` belongs to the `womanx` word set.

    `process_str` and `womanx` come from outside this chunk.
    """
    tokens = set(process_str(s))
    if womanx & tokens:
        return True
    return False


def wm_opposite(row):
    """Flag a gender mismatch between a row's query and its product title.

    Returns 1 if the query matches `is_man` while the title matches only
    `is_woman`, or the query matches `is_woman` while the title matches only
    `is_man`; returns 0 otherwise.
    """
    query_male = is_man(row['query'])
    query_female = is_woman(row['query'])
    title_male = is_man(row['product_title'])
    title_female = is_woman(row['product_title'])

    # Query asks for men, title is women-only.
    if query_male and title_female and not title_male:
        return 1
    # Query asks for women, title is men-only.
    if query_female and title_male and not title_female:
        return 1
    return 0


# Row-wise opposite-gender flags for the train and test frames.
# The result of `DataFrame.apply(..., axis=1)` is a pandas Series; calling
# `.reshape` on a Series is deprecated since pandas 0.19 and no longer exists
# in current releases, so go through `.values` to reshape the NumPy array
# into an (n_rows, 1) column instead.
train_wm = train.apply(wm_opposite, axis=1).values.reshape(-1, 1)
test_wm = test.apply(wm_opposite, axis=1).values.reshape(-1, 1)
# Bare expression: shows flag counts and array shapes as the cell output.
train_wm.sum(), test_wm.sum(), train_wm.shape, test_wm.shape

# `dump` is not defined in this chunk; presumably a project serialization helper.
dump('data/wm_features', (train_wm, test_wm))

# Also write each flag column as comma-separated text.
np.savetxt('data/train_wm.csv', train_wm, delimiter=',')
np.savetxt('data/test_wm.csv', test_wm, delimiter=',')
def autocorrect_query(query, train=None, test=None, cutoff=0.8, warning_on=True):
    """
    Autocorrect a query using the words of the products listed for it.

    Every row of `train` and `test` whose 'query' column equals `query`
    contributes the text of its columns 2 and 3 (HTML stripped with
    BeautifulSoup). The lower-cased words of that text plus their bigrams
    form the candidate vocabulary, and each query word longer than two
    characters is replaced by its closest candidate.

    Parameters
    ----------
    query : str
        The raw search query.
    train, test : DataFrame-like or None
        Frames with a 'query' column. Columns 2 and 3 are presumably the
        product title and description — confirm against the caller.
        A frame left as None contributes no text (it used to raise
        AttributeError).
    cutoff : float
        Minimum similarity passed to `difflib.get_close_matches`.
    warning_on : bool
        When True, print a warning for each word that has no close match.

    Returns
    -------
    str
        The corrected query, lower-cased and joined with single spaces.
    """
    # Collect the text pieces in a list and join once, instead of rebuilding
    # an ever-growing string for every row.
    pieces = []
    for frame in (train, test):
        if frame is None:
            continue
        rows = frame.values[frame['query'].values == query, :]
        for r in rows:
            # NOTE(review): no parser is passed to BeautifulSoup, so the
            # parser used depends on what is installed — confirm this is intended.
            pieces.append(BeautifulSoup(r[2]).get_text(" ", strip=True))
            pieces.append(BeautifulSoup(r[3]).get_text(" ", strip=True))

    words = re.findall(r'[\'\"\w]+', " ".join(pieces).lower())
    candidates = words + [' '.join(pair) for pair in bigrams(words)]
    # Repeated candidates cannot change the single best match, but they make
    # difflib rescore the same string again and again; drop them once here.
    candidates = list(dict.fromkeys(candidates))

    corrected_query = []
    for q in query.lower().split():
        # Very short words are kept as they are.
        if len(q) <= 2:
            corrected_query.append(q)
            continue
        corrected_word = difflib.get_close_matches(q, candidates, n=1, cutoff=cutoff)
        if corrected_word:
            corrected_query.append(corrected_word[0])
        else:
            if warning_on:
                print ("WARNING: cannot find matched word for '%s' -> used the original word"%(q))
            corrected_query.append(q)
    return ' '.join(corrected_query)
# Build the query-correction map and store it.
# NOTE(review): `build_query_correction_map` and `dump` are not defined in this
# chunk; presumably the former applies `autocorrect_query` to the known queries
# and the latter serializes its second argument to the given path — confirm.
query_map = build_query_correction_map()
dump('data/query_auto_correct', query_map)
Exemple #7
0
# In[7]:

# Fit the transformer on the training inputs and keep the transformed matrix.
# NOTE(review): `trf`, `X`, `X_test` and `y` are defined outside this chunk.
X2 = trf.fit_transform(X)
# Bare expression: displayed as the cell output when run in the notebook.
X2.shape

# In[8]:

# Reuse the fitted transformer on the test inputs (transform only, no refit).
X2_test = trf.transform(X_test)
X2_test.shape

# In[9]:

# Alternative output names from other runs, left disabled.
#dump("data/XXtestY250_clean", (X2, X2_test, y))
#dump("data/XXtestY250_am_t0", (X2, X2_test, y))

# Store both matrices together with `y` under the currently active name.
# `dump` is not defined in this chunk; presumably a project serialization helper.
dump("data/XXtestY250_r2td", (X2, X2_test, y))

#dump("data/XXtestY250_ver1", (X2, X2_test, y))
#dump("data/XXtestY250_clean_description", (X2, X2_test, y))
#X2, X2_test, y = load("data/XXtestY250")
# Also write each matrix as comma-separated text.
np.savetxt("data/X250.csv", X2, delimiter=",")
np.savetxt("data/X250_test.csv", X2_test, delimiter=",")

# In[10]:


def union_features_one(x):
    q = set(process_str_replace(x['query']))
    t = set(process_str_replace(x['product_title']))
    d = set(process_str_replace(x['product_description']))
    qt = len(q & t)
def create_similarity_features(train, row, id=None):
    """Count token overlaps between a row and reference sets for values 1-4.

    For each value 1, 2, 3 and 4, `get_str_for_query` is called with the
    row's query, title and description and returns a (title, description)
    pair. That helper is defined outside this chunk; the pair is presumably
    two token sets built from training rows with that median relevance —
    confirm. Each element of the pair is intersected with the row's own
    processed title / description tokens.

    Parameters
    ----------
    train : the training frame handed through to `get_str_for_query`.
    row : mapping with 'query', 'product_title' and 'product_description'.
    id : forwarded unchanged to `get_str_for_query` (default None).

    Returns
    -------
    tuple of 8 ints
        (title overlap, description overlap) for value 1, then 2, 3 and 4.
    """
    references = [
        get_str_for_query(train, row['query'], row['product_title'],
                          row['product_description'], relevance, id)
        for relevance in (1, 2, 3, 4)
    ]
    title_tokens = set(process_str(row['product_title']))
    description_tokens = set(process_str(row['product_description']))

    overlaps = []
    for ref_titles, ref_descriptions in references:
        overlaps.append(len(ref_titles & title_tokens))
        overlaps.append(len(ref_descriptions & description_tokens))
    return tuple(overlaps)


# In[8]:

# Similarity features for every training row. The row position `i` is passed
# as `id`; presumably so the row itself is left out of its own reference
# sets — confirm against `get_str_for_query`, which is outside this chunk.
# `range` replaces the Python-2-only `xrange` so the cell runs on Python 2 and 3.
train_fx = np.array([create_similarity_features(train, train.iloc[i], i) for i in range(train.shape[0])])
dump('data/train1234_c1_r', train_fx)
# Test rows are compared against the training frame with no id passed.
test_fx = np.array([create_similarity_features(train, test.iloc[i]) for i in range(test.shape[0])])
dump('data/test1234_c1_r', test_fx)


# ## Similarity less strict

# In[9]:

def get_str_for_query2(train, q, product_title, product_description, median_relevance, id=None):
    df = train[(train['query']==q) &
               (train['median_relevance']==median_relevance) &
               (train.index!=id)]
    title_set = set()
    for e in df['product_title'].values:
        title_set |= set(process_str(e))
Exemple #9
0
    if re.search("[0-9]+", w):
        continue
    if len(w)>7:
        for i in range(3, len(w)-2):
            pf = -1
            w1 = w[:i]
            w2 = w[i:]
            f1 = wx.get(w1, 0)
            f2 = wx.get(w2, 0)
            if f1>10 and f2>10 and (f1+f2)>pf:
                w1 = stem_one(w1)
                w2 = stem_one(w2)
                rwx[w] = (w1, w2)
                pf = f1+f2

dump('data/word_to_2_replacer', rwx)


# Spelled-out numbers: zero through twenty one by one, then the tens up to ninety.
numbers = ('zero one two three four five six seven eight nine '
           'ten eleven twelve thirteen fourteen fifteen '
           'sixteen seventeen eighteen nineteen twenty '
           'thirty forty fifty sixty seventy eighty ninety').split()
# Digit strings in the same order as `numbers`: "0".."20", then "30".."90".
replace = [str(n) for n in list(range(21)) + list(range(30, 100, 10))]

# Run the words through `stem_one` (defined outside this chunk) so the map is
# keyed by stemmed words; presumably to match stemmed tokens — confirm.
numbers = [stem_one(w) for w in numbers]

# Stemmed number word -> digit string.
num_to_num = {word: digits for word, digits in zip(numbers, replace)}
dump('data/num_to_num', num_to_num)