print ind, doc print_doc(250) ## use document membership as feature product_description_mem = membership[0: df_all.shape[0]] product_title_mem = membership[df_all.shape[0]: (2 * df_all.shape[0])] search_term_mem = membership[(2 * df_all.shape[0]):] description_match_search_member = product_description_mem == search_term_mem title_match_search_member = product_title_mem == search_term_mem df_all['description_match_search_member'] = pd.Series(description_match_search_member, index=df_all.index) df_all['title_match_search_member'] = pd.Series(title_match_search_member, index=df_all.index) df_all = add_ref_feature(df_all) ## train new model df_all = df_all.drop(['search_term','product_title','product_description', 'product_uid', 'len_of_query'],axis=1) df_train = df_all.iloc[:num_train] df_test = df_all.iloc[num_train:] id_test = df_test['id'] y_train = df_train['relevance'].values X_train = df_train.drop(['id', 'relevance'], axis=1).values X_test = df_test.drop(['id', 'relevance'], axis=1).values np.savez_compressed('/home/steven/Projects/HomeDepot/data/mat_fac_mem_features.npz', y_train=y_train, X_train=X_train, X_test=X_test) rf = RandomForestRegressor(n_estimators=700, max_depth=6, random_state=0)
# Slice the title and search-term representations out of whole_cop_rep: rows
# [n, 2n) are titles and rows [2n, end) are search terms, with
# n = df_all.shape[0].
# NOTE(review): product_description_rep is used below but defined before this
# chunk -- presumably whole_cop_rep[0:n, :]; confirm against the lines above.
product_title_rep = whole_cop_rep[df_all.shape[0]: (2 * df_all.shape[0]), :]
search_term_rep = whole_cop_rep[(2 * df_all.shape[0]):, :]


def matrix_cosine(mat1, mat2):
    """Return the cosine similarity between corresponding rows of two matrices.

    Args:
        mat1: 2-D array (norms and sums are taken along axis 1).
        mat2: 2-D array that multiplies element-wise with ``mat1``.

    Returns:
        1-D array holding one similarity value per row.  A row pair in which
        either side has zero norm yields 0 rather than NaN.
    """
    norm1 = np.linalg.norm(mat1, axis=1)
    norm2 = np.linalg.norm(mat2, axis=1)
    inner = np.sum(mat1 * mat2, axis=1)
    # An all-zero row has norm 0, so the plain division is 0/0 = NaN and emits
    # a RuntimeWarning.  Silence the warning and define that case as 0; NaNs
    # that come from NaN inputs are left untouched.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity = inner / norm1 / norm2
    similarity[(norm1 == 0) | (norm2 == 0)] = 0
    return similarity


# Row-wise similarity of each product description / title to its search term.
description_search_similarity = matrix_cosine(product_description_rep, search_term_rep)
title_search_similarity = matrix_cosine(product_title_rep, search_term_rep)

# Attach both similarity vectors as new columns aligned on df_all's index.
df_all['description_search_similarity'] = pd.Series(description_search_similarity, index=df_all.index)
df_all['title_search_similarity'] = pd.Series(title_search_similarity, index=df_all.index)

df_all_exp = add_ref_feature(df_all)
df_all = df_all_exp

# Drop the raw text / identifier columns before building feature matrices.
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_uid'], axis=1)

# The first num_train rows are the training set; everything after is test.
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
id_test = df_test['id']
y_train = df_train['relevance'].values
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

# remove nan
# NOTE(review): X_test is not cleaned within this chunk -- confirm it is
# handled further down before prediction.
X_train[np.isnan(X_train)] = 0
y_train[np.isnan(y_train)] = 0