def get_text_predictions(df, max_features=25_000):
    """Return, for each row, the posting ids of rows whose title looks similar.

    A binary TF-IDF matrix is built from the titles and every row is compared
    against all rows in chunks of 4096. Rows whose similarity score exceeds
    0.7 are reported as matches.

    NOTE: the titles are read from the module-level ``df_cu`` rather than from
    ``df``; ``df`` supplies only the row count and the ``posting_id`` lookup.
    Presumably both hold the same rows in the same order -- verify at the
    call site.

    Args:
        df: frame supporting ``len``, ``.iloc`` and a ``posting_id`` column.
        max_features: vocabulary size cap forwarded to ``TfidfVectorizer``.

    Returns:
        A list with one entry per row; each entry is the array of matching
        ``posting_id`` values.
    """
    vectorizer = TfidfVectorizer(stop_words='english',
                                 binary=True,
                                 max_features=max_features)
    text_embeddings = vectorizer.fit_transform(df_cu['title']).toarray()

    print('Finding similar titles...')
    chunk_size = 1024 * 4
    n_rows = len(df)

    preds = []
    # Walk the rows one chunk at a time so the similarity matrix stays
    # chunk_size x n_rows instead of n_rows x n_rows.
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        print('chunk', start, 'to', stop)

        # COSINE SIMILARITY DISTANCE
        # (a plain dot product; equals cosine similarity assuming the
        # vectorizer L2-normalises its rows -- TODO confirm)
        similarities = cupy.matmul(text_embeddings,
                                   text_embeddings[start:stop].T).T
        for row in range(stop - start):
            matched = cupy.where(similarities[row, ] > 0.7)[0]
            preds.append(df.iloc[cupy.asnumpy(matched)].posting_id.values)

    del vectorizer, text_embeddings
    gc.collect()
    return preds
def test_tfidf_vectorizer_idf_setter():
    """Assigning ``idf_`` on a new vectorizer must reproduce the fitted one.

    A second vectorizer is built from the fitted vocabulary, receives the
    fitted IDF weights through the ``idf_`` setter, and must then transform
    ``DOCS_GPU`` to (almost) the same dense matrix as the fitted vectorizer.
    """
    fitted = TfidfVectorizer(use_idf=True)
    fitted.fit(DOCS_GPU)

    clone = TfidfVectorizer(vocabulary=fitted.vocabulary_, use_idf=True)
    clone.idf_ = fitted.idf_[0]

    actual = clone.transform(DOCS_GPU).todense()
    expected = fitted.transform(DOCS_GPU).todense()
    cp.testing.assert_array_almost_equal(actual, expected)
def test_tfidf_vectorizer(norm, use_idf, smooth_idf, sublinear_tf):
    """Compare ``TfidfVectorizer`` against the ``SkTfidfVect`` reference.

    Both vectorizers are built with the same four options; the first is
    fitted on ``DOCS_GPU`` and the reference on ``DOCS``. Their dense outputs
    must be (almost) equal.
    """
    options = dict(norm=norm, use_idf=use_idf,
                   smooth_idf=smooth_idf, sublinear_tf=sublinear_tf)

    actual = TfidfVectorizer(**options).fit_transform(DOCS_GPU)
    expected = SkTfidfVect(**options).fit_transform(DOCS)

    cp.testing.assert_array_almost_equal(actual.todense(), expected.toarray())
def test_tfidf_vectorizer_get_feature_names():
    """``get_feature_names`` must return the expected nine-word vocabulary."""
    documents = Series([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ])
    expected_vocabulary = [
        'and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
        'this'
    ]

    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform(documents)

    feature_names = vectorizer.get_feature_names().to_arrow().to_pylist()
    assert feature_names == expected_vocabulary
def get_tfidf_y_pred(n_batches: int, threshold: int) -> List[List[str]]:
    """Build TF-IDF title features for ``data.test`` and derive predictions.

    The features come from a binary, English-stop-word ``TfidfVectorizer``
    capped at 10000 terms. The actual matching is delegated to
    ``get_batch_idxs`` and ``get_y_pred``, which are defined elsewhere.

    Args:
        n_batches: forwarded to ``get_batch_idxs`` with the number of rows.
        threshold: forwarded unchanged to ``get_y_pred``.

    Returns:
        Whatever ``get_y_pred`` returns for the test titles.
    """
    torch.cuda.empty_cache()

    vectorizer = TfidfVectorizer(stop_words="english",
                                 binary=True,
                                 max_features=10000)
    features = vectorizer.fit_transform(data.test["title"])
    posting_ids = data.test["posting_id"].values

    batch_idxs = get_batch_idxs(features.shape[0], n_batches)
    y_pred = get_y_pred(features, batch_idxs, posting_ids, threshold)

    # Drop the large intermediates before asking torch to release cached memory.
    del vectorizer, features
    gc.collect()
    torch.cuda.empty_cache()
    return y_pred
Exemple #6
0
def TFIDF_feateure(df, min_PCA = 5000):
    """Embed the ``title`` column with TF-IDF and reduce it with PCA.

    When the module-level ``IS_GPU`` flag is set, ``df`` is first wrapped in a
    ``cudf.DataFrame`` and the PCA result is copied back with ``.get()``;
    otherwise ``df`` is used as given.

    Args:
        df: frame with a ``title`` column.
        min_PCA: upper bound on the number of PCA components. The number
            actually used is also capped by the number of rows and by the
            size of the fitted vocabulary.

    Returns:
        The PCA-reduced title embeddings, one row per input row.
    """
    df_cu = cudf.DataFrame(df) if IS_GPU else df

    max_features = 15000
    nlp_model = TfidfVectorizer(stop_words='english', binary=True,
                                max_features=max_features)
    text_embeddings = nlp_model.fit_transform(df_cu['title']).toarray()

    # PCA cannot produce more components than there are samples or features.
    # The vocabulary may be smaller than both min_PCA and the row count, so
    # clamp against the matrix that was actually built.
    n_samples, n_features = text_embeddings.shape
    n_components = min(min_PCA, n_samples, n_features)

    pca = PCA(n_components=n_components)
    text_embeddings = pca.fit_transform(text_embeddings)
    if IS_GPU:
        # Copy the result back from the device.
        text_embeddings = text_embeddings.get()

    print(f'Our title text embedding shape is {text_embeddings.shape}')
    return text_embeddings
def get_text_predictions(df, max_features=25_000):
    """Return, for each row, the posting ids of rows whose title looks similar.

    Titles are embedded with a binary TF-IDF vectorizer and compared chunk by
    chunk. Three similarity thresholds are applied per row:

    * base (0.7705): if fewer than two rows pass, fall back to the looser
      threshold (0.6555) and keep at most the first two matches;
    * otherwise try the stricter threshold (0.80105) and use its matches when
      it still yields at least two rows, else keep the base matches.

    NOTE: titles and posting ids are read from the module-level ``df_cu``;
    ``df`` supplies only the row count. Presumably both hold the same rows in
    the same order -- verify at the call site.

    Args:
        df: frame whose length gives the number of rows to process.
        max_features: vocabulary size cap forwarded to ``TfidfVectorizer``.

    Returns:
        A list with one array of ``posting_id`` values per row.
    """
    base_threshold = 0.7705
    strict_threshold = 0.80105
    loose_threshold = 0.6555

    model = TfidfVectorizer(stop_words='english',
                            binary=True,
                            max_features=max_features)
    text_embeddings = model.fit_transform(df_cu['title']).toarray()

    # Copy the ids to the host once instead of once per row and threshold.
    posting_ids = df_cu.posting_id.to_pandas().values

    def _ids_above(similarities, threshold):
        """Posting ids of the rows whose similarity exceeds ``threshold``."""
        idx = cupy.where(similarities > threshold)[0]
        return posting_ids[cupy.asnumpy(idx)]

    preds = []
    chunk_size = 1024 * 4
    n_rows = len(df)

    print('Finding similar titles...')
    for a in range(0, n_rows, chunk_size):
        b = min(a + chunk_size, n_rows)
        print('chunk', a, 'to', b)

        # COSINE SIMILARITY DISTANCE
        # (a plain dot product; equals cosine similarity assuming the
        # vectorizer L2-normalises its rows -- TODO confirm)
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b - a):
            row = cts[k, ]
            base_matches = _ids_above(row, base_threshold)
            if len(base_matches) >= 2:
                strict_matches = _ids_above(row, strict_threshold)
                # The original re-tested the base matches here, which made
                # the fallback below unreachable; test the strict matches.
                if len(strict_matches) >= 2:
                    preds.append(strict_matches)
                else:
                    preds.append(base_matches)
            else:
                preds.append(_ids_above(row, loose_threshold)[:2])

    del model, text_embeddings
    gc.collect()
    return preds
def test_tfidf_vectorizer_setters():
    """Attribute assignments must propagate to the wrapped ``_tfidf`` object."""
    tv = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False,
                         sublinear_tf=False)

    tv.norm = 'l1'
    assert tv._tfidf.norm == 'l1'

    # Each boolean option starts False; flip it and check the inner value.
    for flag in ('use_idf', 'smooth_idf', 'sublinear_tf'):
        setattr(tv, flag, True)
        assert getattr(tv._tfidf, flag)
Exemple #9
0
def find_similar_titles_with_rapids_knn():
    """Plot and print the nearest-neighbour titles for a few train rows.

    The train CSV is loaded with cuDF and the titles are embedded with a
    binary ``TfidfVectorizer`` (converted from sparse to dense with
    ``.toarray()``). A ``NearestNeighbors`` model with ``KNN`` neighbours is
    fitted on the embeddings and queried with the same embeddings.

    For each of the first five train rows the function plots the distances to
    its ``KNN`` neighbours and prints the ``title`` and ``label_group`` of
    the ten closest ones.

    :return: None; the output is the plots and the printed rows.
    """
    # LOAD TRAIN ONTO THE GPU WITH CUDF
    train_gf = cudf.read_csv('../input/shopee-product-matching/train.csv')
    print('train shape is', train_gf.shape)

    # Extract text embeddings, then densify them for the KNN model.
    vectorizer = TfidfVectorizer(stop_words='english', binary=True)
    text_embeddings = vectorizer.fit_transform(train_gf.title).toarray()
    print('text embeddings shape is', text_embeddings.shape)
    # The original rebound a single name here, releasing the vectorizer.
    del vectorizer

    KNN = 50
    knn_model = NearestNeighbors(n_neighbors=KNN)
    knn_model.fit(text_embeddings)
    distances, indices = knn_model.kneighbors(text_embeddings)

    # Tie the x-axis to KNN so changing the neighbour count cannot make the
    # x and y lengths disagree.
    neighbor_rank = np.arange(KNN)
    for k in range(5):
        plt.figure(figsize=(20, 3))
        plt.plot(neighbor_rank, cupy.asnumpy(distances[k, ]), 'o-')
        plt.title('Text Distance From Train Row %i to Other Train Rows' % k,
                  size=16)
        plt.ylabel('Distance to Train Row %i' % k, size=14)
        plt.xlabel('Index Sorted by Distance to Train Row %i' % k, size=14)
        plt.show()

        # Show the ten closest titles with their group labels.
        print(train_gf.loc[cupy.asnumpy(indices[k, :10]),
                           ['title', 'label_group']])
Exemple #10
0
# del imagefeat, imgmodel

# In[14]:

# Store the predictions computed earlier in the notebook as a column of
# `train` (presumably the CNN image matches, given the column name -- confirm).
train['oof_cnn'] = preds

# Optionally score those predictions row by row. `getMetric` is defined
# elsewhere; it is called with the column name and its result is applied to
# each row.
if COMPUTE_CV:
    train['f1'] = train.apply(getMetric('oof_cnn'), axis=1)
    print('CV score for baseline =', train.f1.mean())

# # title TFIDF

# In[15]:

# from sklearn.feature_extraction.text import TfidfVectorizer
# Binary TF-IDF over the titles with no stop-word filtering and at most
# 25000 terms; the sparse result is converted to a dense array.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embeddings = model.fit_transform(train_gf.title).toarray()
print('text embeddings shape', text_embeddings.shape)

# In[16]:

# `preds` is reset here and reused for the title-based matches.
preds = []
CHUNK = 1024 * 4

print('Finding similar titles...')
# Number of chunks needed to cover every row: ceil(len(train) / CHUNK).
# NOTE(review): the row count comes from `train` while the embeddings come
# from `train_gf`; presumably both hold the same rows -- confirm.
CTS = len(train) // CHUNK
if len(train) % CHUNK != 0: CTS += 1
for j in range(CTS):

    # Half-open row range [a, b) of chunk j.
    a = j * CHUNK
    b = (j + 1) * CHUNK
    # NOTE(review): the loop body is cut off after this line in this excerpt.
Exemple #11
0
def text_embedding_extraction():
    """Compute dense TF-IDF embeddings for the titles in ``test_gf``.

    Uses a binary ``TfidfVectorizer`` with English stop words and at most
    25000 terms, fitted on the module-level ``test_gf.title``.

    Returns:
        The dense embedding matrix. The original computed it and then
        discarded it; returning it is harmless to callers that ignored the
        previous ``None`` result.
    """
    print('Computing text embeddings...')
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
    text_embeddings = model.fit_transform(test_gf.title).toarray()
    print('text embeddings shape', text_embeddings.shape)
    return text_embeddings
Exemple #12
0
def get_text_embeddings(cu_df):
    """Return dense binary TF-IDF embeddings of ``cu_df.title``.

    Args:
        cu_df: frame exposing a ``title`` column.

    Returns:
        The vectorizer output converted to a dense array with ``.toarray()``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', binary=True)
    sparse_embeddings = vectorizer.fit_transform(cu_df.title)
    dense_embeddings = sparse_embeddings.toarray()
    # The fitted vectorizer is not needed once the matrix exists.
    del vectorizer
    return dense_embeddings
Exemple #13
0
# Store the image-based predictions computed earlier as a column of `test`.
test["preds1"] = image_predictions

# # 2. TEXT (match product titles using TF-IDF)

# In[16]:

# Import the TF-IDF related libraries
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
print('RAPIDS', cuml.__version__)

# In[19]:

test_gf = cudf.read_csv(DATA_PATH + 'test.csv')  # load test.csv again, this time with cuDF
model = TfidfVectorizer(stop_words='english', binary=True,
                        max_features=25_000)  # create the TF-IDF model
text_embeddings = model.fit_transform(
    test_gf.title).toarray()  # fit the TF-IDF model on the product titles of the test data
print('text embeddings shape', text_embeddings.shape)  # the resulting text features

# In[21]:

# Match in chunks: the data is large, so all-pairs matching cannot be done in
# one pass (it would run out of memory).
preds2 = []
CHUNK = 1024 * 4  # size of each chunk

print('Finding similar titles...')
# Number of chunks needed to cover every row: ceil(len(test) / CHUNK).
# NOTE(review): the row count comes from `test` while the embeddings come
# from `test_gf`; presumably both hold the same rows -- confirm.
CTS = len(test) // CHUNK
if len(test) % CHUNK != 0:
    CTS += 1
for j in range(CTS):