def get_text_predictions(df, max_features=25_000):
    # df is expected to be a cuDF DataFrame with 'title' and 'posting_id' columns.
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    text_embeddings = model.fit_transform(df['title']).toarray()

    print('Finding similar titles...')
    CHUNK = 1024 * 4
    CTS = len(df) // CHUNK
    if len(df) % CHUNK != 0:
        CTS += 1

    preds = []
    for j in range(CTS):
        a = j * CHUNK
        b = min((j + 1) * CHUNK, len(df))
        print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b - a):
            IDX = cupy.where(cts[k, ] > 0.7)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    del model, text_embeddings
    gc.collect()
    return preds
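
# Why cupy.matmul above yields cosine similarity: cuML's TfidfVectorizer
# defaults to norm='l2', so every embedding row has unit length and the dot
# product of two rows equals the cosine of the angle between them. A minimal
# sanity check (a sketch assuming a RAPIDS environment; the titles are made up):
import cudf
import cupy
from cuml.feature_extraction.text import TfidfVectorizer

titles = cudf.Series(['red cotton shirt', 'red cotton t-shirt', 'steel water bottle'])
emb = TfidfVectorizer(binary=True).fit_transform(titles).toarray()
print(cupy.linalg.norm(emb, axis=1))  # every row norm is ~1.0
print(cupy.matmul(emb, emb.T))        # pairwise cosine similarities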
def test_tfidf_vectorizer_idf_setter():
    orig = TfidfVectorizer(use_idf=True)
    orig.fit(DOCS_GPU)
    copy = TfidfVectorizer(vocabulary=orig.vocabulary_, use_idf=True)
    copy.idf_ = orig.idf_[0]
    cp.testing.assert_array_almost_equal(
        copy.transform(DOCS_GPU).todense(),
        orig.transform(DOCS_GPU).todense()
    )
def test_tfidf_vectorizer(norm, use_idf, smooth_idf, sublinear_tf):
    tfidf_mat = TfidfVectorizer(
        norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf
    ).fit_transform(DOCS_GPU)
    ref = SkTfidfVect(
        norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf
    ).fit_transform(DOCS)
    cp.testing.assert_array_almost_equal(tfidf_mat.todense(), ref.toarray())
def test_tfidf_vectorizer_get_feature_names():
    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform(Series(corpus))
    output = [
        'and', 'document', 'first', 'is', 'one',
        'second', 'the', 'third', 'this'
    ]
    assert vectorizer.get_feature_names().to_arrow().to_pylist() == output
def get_tfidf_y_pred(n_batches: int, threshold: float) -> List[List[str]]:
    torch.cuda.empty_cache()
    model = TfidfVectorizer(stop_words="english", binary=True, max_features=10000)
    features = model.fit_transform(data.test["title"])
    posting_ids = data.test["posting_id"].values
    n_rows = features.shape[0]
    batch_idxs = get_batch_idxs(n_rows, n_batches)
    y_pred = get_y_pred(features, batch_idxs, posting_ids, threshold)
    del model
    del features
    gc.collect()
    torch.cuda.empty_cache()
    return y_pred
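
# get_batch_idxs and get_y_pred are not shown in this snippet. A plausible
# minimal sketch of the batching helper (hypothetical name and behavior,
# for illustration only):
import math
from typing import List, Tuple

def get_batch_idxs(n_rows: int, n_batches: int) -> List[Tuple[int, int]]:
    # Split [0, n_rows) into n_batches contiguous (start, end) ranges.
    size = math.ceil(n_rows / n_batches)
    return [(start, min(start + size, n_rows)) for start in range(0, n_rows, size)]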
def TFIDF_feature(df, min_PCA=5000):
    if IS_GPU:
        df_cu = cudf.DataFrame(df)
    else:
        df_cu = df

    max_features = 15000
    n_components = min(min_PCA, len(df_cu))

    nlp_model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    text_embeddings = nlp_model.fit_transform(df_cu['title']).toarray()

    pca = PCA(n_components=n_components)
    if IS_GPU:
        text_embeddings = pca.fit_transform(text_embeddings).get()
    else:
        text_embeddings = pca.fit_transform(text_embeddings)

    print(f'Our title text embedding shape is {text_embeddings.shape}')
    return text_embeddings
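
# Usage sketch for TFIDF_feature (assumes the imports and IS_GPU flag used
# above; the three-row frame is made up). With 3 rows, n_components becomes
# min(5000, 3) = 3, so the returned embedding has shape (3, 3):
import pandas as pd

toy = pd.DataFrame({'title': ['red cotton shirt', 'blue cotton shirt', 'steel water bottle']})
embeddings = TFIDF_feature(toy)  # prints: Our title text embedding shape is (3, 3)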
def get_text_predictions(df, max_features=25_000):
    # df is expected to be a cuDF DataFrame with 'title' and 'posting_id' columns.
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    text_embeddings = model.fit_transform(df['title']).toarray()

    preds = []
    CHUNK = 1024 * 4
    print('Finding similar titles...')
    CTS = len(df) // CHUNK
    if len(df) % CHUNK != 0:
        CTS += 1

    for j in range(CTS):
        a = j * CHUNK
        b = min((j + 1) * CHUNK, len(df))
        print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b - a):
            # Two-tier threshold: match with a loose cutoff first, then tighten
            # when the loose cutoff already yields at least two candidates.
            IDX = cupy.where(cts[k, ] > 0.7705)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values
            if len(o) >= 2:
                IDX_b = cupy.where(cts[k, ] > 0.80105)[0]
                o_b = df.iloc[cupy.asnumpy(IDX_b)].posting_id.to_pandas().values
                # Keep the stricter matches only if they still form a group.
                if len(o_b) >= 2:
                    preds.append(o_b)
                else:
                    preds.append(o)
            else:
                # Fall back to an even looser cutoff and keep at most two matches.
                IDX = cupy.where(cts[k, ] > 0.6555)[0]
                o = df.iloc[cupy.asnumpy(IDX)].posting_id.to_pandas().values
                preds.append(o[:2])

    del model, text_embeddings
    gc.collect()
    return preds
def test_tfidf_vectorizer_setters():
    tv = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False, sublinear_tf=False)
    tv.norm = 'l1'
    assert tv._tfidf.norm == 'l1'
    tv.use_idf = True
    assert tv._tfidf.use_idf
    tv.smooth_idf = True
    assert tv._tfidf.smooth_idf
    tv.sublinear_tf = True
    assert tv._tfidf.sublinear_tf
def find_similar_titles_with_rapids_knn():
    """
    First we extract text embeddings using RAPIDS cuML's TfidfVectorizer. This
    turns every title into a one-hot encoding of the words present. We then
    compare the one-hot encodings with RAPIDS cuML KNN to find titles that
    are similar.
    :return:
    """
    # LOAD TRAIN ONTO THE GPU WITH CUDF
    train_gf = cudf.read_csv('../input/shopee-product-matching/train.csv')
    print('train shape is', train_gf.shape)
    train_gf.head()

    # Extract text embeddings with RAPIDS TfidfVectorizer.
    # TfidfVectorizer returns a cupy sparse matrix; we convert it to a dense
    # cupy matrix and feed that into RAPIDS cuML KNN.
    model = TfidfVectorizer(stop_words='english', binary=True)
    text_embeddings = model.fit_transform(train_gf.title).toarray()
    print('text embeddings shape is', text_embeddings.shape)

    # After fitting KNN, display some example rows of train and their 10
    # closest other titles in train (based on word-count one-hot encoding).
    KNN = 50
    model = NearestNeighbors(n_neighbors=KNN)
    model.fit(text_embeddings)
    distances, indices = model.kneighbors(text_embeddings)

    for k in range(5):
        plt.figure(figsize=(20, 3))
        plt.plot(np.arange(50), cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Text Distance From Train Row %i to Other Train Rows' % k, size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()
        print(train_gf.loc[cupy.asnumpy(indices[k, :10]), ['title', 'label_group']])
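
# To turn the neighbor lists into match groups, a natural follow-up (a sketch,
# not part of the function above; THRESHOLD is a hypothetical cutoff) is to
# keep only the neighbors whose distance falls under a threshold:
THRESHOLD = 0.75  # assumed value; in practice tuned on a validation split
matches = [indices[k][distances[k] < THRESHOLD] for k in range(len(indices))]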
# del imagefeat, imgmodel

# In[14]:

train['oof_cnn'] = preds
if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_cnn'), axis=1)
    print('CV score for baseline =', train.f1.mean())

# # title TFIDF

# In[15]:

# from sklearn.feature_extraction.text import TfidfVectorizer
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embeddings = model.fit_transform(train_gf.title).toarray()
print('text embeddings shape', text_embeddings.shape)

# In[16]:

preds = []
CHUNK = 1024 * 4
print('Finding similar titles...')
CTS = len(train) // CHUNK
if len(train) % CHUNK != 0:
    CTS += 1
for j in range(CTS):
    a = j * CHUNK
    b = (j + 1) * CHUNK
def text_embedding_extraction():
    print('Computing text embeddings...')
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
    text_embeddings = model.fit_transform(test_gf.title).toarray()
    print('text embeddings shape', text_embeddings.shape)
def get_text_embeddings(cu_df):
    model = TfidfVectorizer(stop_words='english', binary=True)
    text_embeddings = model.fit_transform(cu_df.title).toarray()
    del model
    return text_embeddings
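
# Usage sketch for get_text_embeddings (assumes a cuDF frame with a 'title'
# column, as in the other snippets; the CSV path matches the one used above):
cu_df = cudf.read_csv('../input/shopee-product-matching/train.csv')
embeddings = get_text_embeddings(cu_df)  # dense cupy array, one row per title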
test["preds1"] = image_predictions # # 2.TEXT(使用TFIDF对商品标题做匹配) # In[16]: # 导入tfidf相关的库 import cudf, cuml, cupy from cuml.feature_extraction.text import TfidfVectorizer from cuml.neighbors import NearestNeighbors print('RAPIDS', cuml.__version__) # In[19]: test_gf = cudf.read_csv(DATA_PATH + 'test.csv') # 再次导入test.csv model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000) #创建tfidf模型 text_embeddings = model.fit_transform( test_gf.title).toarray() #使用tfidf模型对test数据中的商品标题训练 print('text embeddings shape', text_embeddings.shape) # 获得text的features # In[21]: # 分块做匹配。因为数据量大,无法一次性做两两匹配(会超内存)。 preds2 = [] CHUNK = 1024 * 4 # 每个分块的大小 print('Finding similar titles...') CTS = len(test) // CHUNK if len(test) % CHUNK != 0: CTS += 1 for j in range(CTS):