Ejemplo n.º 1
0
        print ind, doc

# Call the print_doc helper (defined earlier in the file) with argument 250.
print_doc(250)

## use document membership as feature

# `membership` is consumed as three consecutive segments: the first
# df_all.shape[0] entries are the product descriptions, the next
# df_all.shape[0] entries are the product titles, and everything after
# that belongs to the search terms.
_n_rows = df_all.shape[0]
product_description_mem = membership[0:_n_rows]
product_title_mem = membership[_n_rows:2 * _n_rows]
search_term_mem = membership[2 * _n_rows:]

# Compare each product-side membership with the search-term membership
# (assumes `==` is element-wise here, i.e. array-like inputs -- TODO confirm).
description_match_search_member = (product_description_mem == search_term_mem)
title_match_search_member = (product_title_mem == search_term_mem)

# Store both comparison results as new columns aligned on df_all's index.
for _column, _flags in (
    ('description_match_search_member', description_match_search_member),
    ('title_match_search_member', title_match_search_member),
):
    df_all[_column] = pd.Series(_flags, index=df_all.index)
df_all = add_ref_feature(df_all)

## train new model

# Columns removed before the feature matrices are built.
_dropped_columns = [
    'search_term',
    'product_title',
    'product_description',
    'product_uid',
    'len_of_query',
]
df_all = df_all.drop(_dropped_columns, axis=1)

# The first num_train rows are the training set; the remainder is the test set.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
id_test = df_test['id']

# 'id' and 'relevance' are excluded from the feature matrices;
# 'relevance' is the training target.
_non_feature_columns = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(_non_feature_columns, axis=1).values
X_test = df_test.drop(_non_feature_columns, axis=1).values

# Save the target and both feature matrices to a compressed .npz archive.
_features_path = '/home/steven/Projects/HomeDepot/data/mat_fac_mem_features.npz'
np.savez_compressed(_features_path, y_train=y_train, X_train=X_train, X_test=X_test)

rf = RandomForestRegressor(
    n_estimators=700,
    max_depth=6,
    random_state=0,
)
Ejemplo n.º 2
0
# Rows [n, 2n) of whole_cop_rep are taken as the product-title
# representations and rows from 2n onward as the search-term
# representations, where n is the number of rows in df_all.
_n_rows = df_all.shape[0]
product_title_rep = whole_cop_rep[_n_rows:2 * _n_rows, :]
search_term_rep = whole_cop_rep[2 * _n_rows:, :]


def matrix_cosine(mat1, mat2):
    """Return the row-wise cosine similarity between two 2-D arrays.

    Row ``i`` of the result is the cosine of the angle between ``mat1[i]``
    and ``mat2[i]``: their inner product divided by both Euclidean norms.

    Args:
        mat1: 2-D array-like; one vector per row.
        mat2: 2-D array-like with rows paired to those of ``mat1``.

    Returns:
        1-D array with one similarity per row. Rows where either vector has
        zero norm yield ``0.0`` instead of NaN/inf, so all-zero
        representations no longer inject NaN into downstream features or
        trigger divide-by-zero warnings. NaN inputs still propagate as NaN.
    """
    norm1 = np.linalg.norm(mat1, axis=1)
    norm2 = np.linalg.norm(mat2, axis=1)
    inner = np.sum(mat1 * mat2, axis=1)

    # `!= 0` (rather than `> 0`) keeps NaN norms in the "valid" set so that
    # NaN inputs are still reported as NaN rather than silently zeroed.
    valid = (norm1 != 0) & (norm2 != 0)

    # Start from zeros: entries skipped by `where=` keep this value.
    cosine = np.zeros(inner.shape, dtype=np.result_type(inner, norm1, norm2))
    # Divide by each norm in turn (as the original expression did) to keep
    # the same floating-point behaviour for well-defined rows.
    np.divide(inner, norm1, out=cosine, where=valid)
    np.divide(cosine, norm2, out=cosine, where=valid)
    return cosine

# Row-wise cosine similarity of each product-side representation against
# the search-term representation.
description_search_similarity = matrix_cosine(product_description_rep, search_term_rep)
title_search_similarity = matrix_cosine(product_title_rep, search_term_rep)

# Store both similarity vectors as new columns aligned on df_all's index.
for _column, _similarity in (
    ('description_search_similarity', description_search_similarity),
    ('title_search_similarity', title_search_similarity),
):
    df_all[_column] = pd.Series(_similarity, index=df_all.index)

# Both names end up bound to the frame returned by add_ref_feature.
df_all = df_all_exp = add_ref_feature(df_all)

# Columns removed before the feature matrices are built.
_dropped_columns = [
    'search_term',
    'product_title',
    'product_description',
    'product_uid',
]
df_all = df_all.drop(_dropped_columns, axis=1)

# The first num_train rows are the training set; the remainder is the test set.
df_train, df_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
id_test = df_test['id']

# 'id' and 'relevance' are excluded from the feature matrices;
# 'relevance' is the training target.
_non_feature_columns = ['id', 'relevance']
y_train = df_train['relevance'].values
X_train = df_train.drop(_non_feature_columns, axis=1).values
X_test = df_test.drop(_non_feature_columns, axis=1).values

# Replace NaN entries in the training features and targets with 0, in place.
for _array in (X_train, y_train):
    _array[np.isnan(_array)] = 0