def awesome_cossim_topn_wrapper(A,
                                B,
                                ntop,
                                lower_bound=0,
                                use_threads=False,
                                n_jobs=1,
                                return_best_ntop=False,
                                test_nnz_max=-1,
                                expect_best_ntop=None):
    """
    This function is running awesome_cossim_topn()
    with and without return_best_ntop and checking if we get the expected result and if both results are the same.
    It has the same signature, but has an extra parameter: expect_best_ntop
    """

    result1, best_ntop = awesome_cossim_topn(A, B, ntop, lower_bound,
                                             use_threads, n_jobs, True,
                                             test_nnz_max)

    assert expect_best_ntop == best_ntop

    result2 = awesome_cossim_topn(A, B, ntop, lower_bound, use_threads, n_jobs,
                                  False, test_nnz_max)

    assert (result1 != result2).nnz == 0  # the two CSR matrices are identical

    return result1
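
A minimal smoke-test sketch for the wrapper above. The matrices and the expected best_ntop value are illustrative assumptions, not taken from the original test suite.

import numpy as np
from scipy.sparse import csr_matrix

def test_awesome_cossim_topn_wrapper_smoke():
    a = csr_matrix(np.array([[1.0, 0.0],
                             [0.0, 1.0]]))
    b = csr_matrix(np.array([[1.0, 1.0],
                             [0.0, 1.0]]))
    # a.dot(b) == [[1, 1], [0, 1]]: the densest row of the full product holds
    # two entries, so best_ntop is expected to be 2.
    result = awesome_cossim_topn_wrapper(a, b, ntop=2, expect_best_ntop=2)
    assert result.shape == (2, 2)
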
def helper_awesome_cossim_topn_dense(a_dense, b_dense):
    dense_result = np.dot(a_dense, np.transpose(b_dense))  # dot product
    sparse_result = csr_matrix(dense_result)
    sparse_result_top3 = [
        get_n_top_sparse(row, NUM_CANDIDATES) for row in sparse_result
    ]  # get ntop using the old method

    pruned_dense_result = dense_result.copy()
    pruned_dense_result[
        pruned_dense_result < PRUNE_THRESHOLD] = 0  # prune low similarity
    pruned_sparse_result = csr_matrix(pruned_dense_result)
    pruned_sparse_result_top3 = [
        get_n_top_sparse(row, NUM_CANDIDATES) for row in pruned_sparse_result
    ]

    a_csr = csr_matrix(a_dense)
    b_csr_t = csr_matrix(b_dense).T

    awesome_result = awesome_cossim_topn(a_csr, b_csr_t, len(b_dense), 0.0)
    awesome_result_top3 = awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES,
                                              0.0)
    awesome_result_top3 = [
        list(zip(row.indices, row.data)) if len(row.data) > 0 else None
        for row in awesome_result_top3
    ]  # make comparable, normally not needed

    pruned_awesome_result = awesome_cossim_topn(a_csr, b_csr_t, len(b_dense),
                                                PRUNE_THRESHOLD)
    pruned_awesome_result_top3 = awesome_cossim_topn(a_csr, b_csr_t,
                                                     NUM_CANDIDATES,
                                                     PRUNE_THRESHOLD)
    pruned_awesome_result_top3 = [
        list(zip(row.indices, row.data)) if len(row.data) > 0 else None
        for row in pruned_awesome_result_top3
    ]

    # no candidate selection, no pruning
    assert awesome_result.nnz == sparse_result.nnz
    # no candidate selection, below PRUNE_THRESHOLD similarity pruned
    assert pruned_awesome_result.nnz == pruned_sparse_result.nnz

    all_none1 = np.all(pd.isnull(awesome_result_top3)) and np.all(
        pd.isnull(sparse_result_top3))
    all_none2 = np.all(pd.isnull(pruned_awesome_result_top3)) and np.all(
        pd.isnull(pruned_sparse_result_top3))

    # top NUM_CANDIDATES candidates selected, no pruning
    if not all_none1:
        np.testing.assert_array_almost_equal(awesome_result_top3,
                                             sparse_result_top3)
    else:
        assert len(awesome_result_top3) == len(sparse_result_top3)
    # top NUM_CANDIDATES candidates selected, below PRUNE_THRESHOLD similarity pruned
    if not all_none2:
        np.testing.assert_array_almost_equal(pruned_awesome_result_top3,
                                             pruned_sparse_result_top3)
    else:
        assert len(pruned_awesome_result_top3) == len(pruned_sparse_result_top3)
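
The helper above relies on NUM_CANDIDATES and PRUNE_THRESHOLD (module-level constants: a small integer and a float similarity cut-off) and on get_n_top_sparse, none of which are shown here. A sketch of what get_n_top_sparse is assumed to do: return the top-n entries of a one-row sparse matrix as (column index, value) pairs, best first, or None for an empty row.

import numpy as np

def get_n_top_sparse(mat, n_top=10):
    # Number of stored entries in this 1 x n sparse row.
    length = mat.getnnz()
    if length == 0:
        return None
    if length <= n_top:
        result = zip(mat.indices, mat.data)
    else:
        # Pick the n_top largest values without fully sorting the row.
        arg_idx = np.argpartition(mat.data, -n_top)[-n_top:]
        result = zip(mat.indices[arg_idx], mat.data[arg_idx])
    return sorted(result, key=lambda x: -x[1])
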
Example #3
 def run(self, indices):
     if (len(indices) <= 5):
         return super().run(indices)
     df = self.df.iloc[indices]
     try:  # name vectorization
         tfidf_vectorizer = TfidfVectorizer(
             ngram_range=self.ngram_range,
             max_df=self.max_name_vectorization_word_frequency,
             min_df=self.min_name_vectorization_word_frequency,
             token_pattern=r'(\S+)')
         tf_idf_matrix = tfidf_vectorizer.fit_transform(
             df.standardized_name)
     except ValueError:  # relax the frequency constraints and try again
         tfidf_vectorizer = TfidfVectorizer(
             ngram_range=self.ngram_range,
             max_df=1.0,
             min_df=0,
             token_pattern=r'(\S+)')
         tf_idf_matrix = tfidf_vectorizer.fit_transform(
             df.standardized_name)
     sparse_matrix = awesome_cossim_topn(tf_idf_matrix,
                                         tf_idf_matrix.transpose(),
                                         self.topn_by_cosine_similarity,
                                         self.min_cosine_similarity)
     non_zeros = sparse_matrix.nonzero()
     sparserows = non_zeros[0]
     sparsecols = non_zeros[1]
     top_n_rows = min(self.topn_matches_to_apply_model_to, sparsecols.size)
     left = list(itemgetter(*sparserows[:top_n_rows])(indices))
     right = list(itemgetter(*sparsecols[:top_n_rows])(indices))
     return super().filter_bydistance(np.dstack((left, right))[0])
Example #4
 def _get_cosine_matrix(self, vals):
     tf_idf_matrix = self._get_tf_idf_matrix(vals)
     if self._topn is None:
         topn = vals.size
     else:
         topn = self._topn
     return awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(),
                                topn, self._match_threshold)
Example #5
def cosine_similarity_on_rows_numpy_and_csr_2(matrix, ntop=10):
    """
    Cosine similarity for numpy arrays and sparse matrices.
    It use Dask as well as Cython in the back, being faster than the others.
    Input:
        - matrix (np.array/sparse): matrix to compute the cosine distance in the rows
    Output:
        - matrix (np.array): cosine distance matrix
    """
    mat = matrix.astype(np.float, copy=True)
    return awesome_cossim_topn(A=mat, B=mat.transpose(), ntop=ntop)
Example #6
def get_similarity(matrix1, matrix2, n_top, min_similarity, zero_diagonal=False):
    matrix1 = normalize(matrix1)
    matrix2 = normalize(matrix2)

    matrix1 = csr_matrix(matrix1).astype(float)
    matrix2 = csr_matrix(matrix2).astype(float)
    similarity_matrix = awesome_cossim_topn(matrix1, matrix2.T, ntop=n_top, lower_bound=min_similarity)
    # set diagonal to zero
    if zero_diagonal:
        similarity_matrix.setdiag(0)

    return similarity_matrix
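
A small illustrative call of get_similarity, assuming the imports the snippet relies on (sklearn.preprocessing.normalize, scipy.sparse.csr_matrix) are in scope; the embedding matrix is made up.

import numpy as np

emb = np.array([[1.0, 0.0, 0.0],
                [0.9, 0.1, 0.0],
                [0.0, 0.0, 1.0]])

# Top-2 most similar rows of emb to itself, with the trivial self-match on the
# diagonal zeroed out.
sims = get_similarity(emb, emb, n_top=2, min_similarity=0.1, zero_diagonal=True)
print(sims.toarray())
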
Example #7
def train_knn(args):

    df = pd.read_csv(args.input, names=['query', 'category'])

    logger.info("Lemmatizing and preparing data for KNN model")
    df['query_lem'] = df['query'].apply(lemmatize)
    df_train, df_test = train_test_split(df,
                                         test_size=0.2,
                                         stratify=df.category.values,
                                         random_state=42)

    logger.info("Fitting vectorizer")
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(df_train.query_lem.values)
    logger.info("Scoring 20% holdout sample")
    test_features = vectorizer.transform(df_test.query_lem.values)
    matches = awesome_cossim_topn(test_features, train_features.transpose(),
                                  20, 0.01)
    ind = np.argwhere(matches)
    i1 = ind[:, 0]
    i2 = ind[:, 1]

    df_test.reset_index(inplace=True)
    df_test = df_test.rename(columns={'index': 'index1'})

    index1 = np.take(df_test['index1'].values, i1)
    categories = np.take(df_train.category.values, i2)

    df2 = pd.DataFrame(data={'index1': index1, 'cat': categories})

    #     most frequent category
    pred = df2[['index1', 'cat'
                ]].groupby(['index1']).agg(lambda x: scipy.stats.mode(x)[0])
    pred = pred.reset_index()
    pred = pred.sort_values('index1')

    pred1 = pred.merge(df_test[['index1', 'category'
                                ]].rename(columns={'category': 'cat_true'}),
                       on='index1',
                       how='left')
    pred1['match'] = (pred1.cat == pred1.cat_true).astype(int)
    logger.info("Hold out sample accuracy %s",
                np.round(pred1['match'].mean(), 2))
    logger.info(
        "Hold out sample f1 score %s",
        np.round(
            f1_score(pred1.cat_true.values,
                     pred1.cat.values,
                     average='weighted'), 2))
    logger.info(
        "Hold out sample sklearn accuracy %s",
        np.round(accuracy_score(pred1.cat_true.values, pred1.cat.values), 2))
Example #8
            def subgroup_match(subdf):
                if not df2groups.indices.get(subdf.name, None) is None:
                    sub_tfidf1 = tfidf1[subdf.index.values]
                    sub_tfidf2 = tfidf2[df2groups.indices[subdf.name]]
                    co = awesome_cossim_topn(sub_tfidf1, sub_tfidf2.transpose(), ntop=ntop, lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).tocoo()

                    # 2) now use the Levenshtein distance to find the best match
                    for row in set(co.row):
                        rowcol = co.col[co.row==row]
                        argmatch, lev_dist = levenshtein_best_match(subdf.iloc[row][column2match_approx],
                            df2.iloc[df2groups.indices[subdf.name][rowcol]][column2match_approx])
                        if lev_dist >= lev_lower_bound:
                            matches[subdf.index.values[row]] = df2groups.indices[subdf.name][rowcol[argmatch]]
Example #9
    def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix:
        """Builds the cossine similarity matrix of two csr matrices"""
        tf_idf_matrix_1 = master_matrix
        tf_idf_matrix_2 = duplicate_matrix.transpose()

        optional_kwargs = dict()
        if self._config.number_of_processes > 1:
            optional_kwargs = {
                'use_threads': True,
                'n_jobs': self._config.number_of_processes
            }

        return awesome_cossim_topn(tf_idf_matrix_1, tf_idf_matrix_2,
                                   self._config.max_n_matches,
                                   self._config.min_similarity,
                                   **optional_kwargs)
Example #10
def identify_anisong_tf(title, artist=None):
    #fuzzy match title with song titles from database using cosine similarity over TF-IDF matrix
    #
    if artist is not None:
        pass
    else:
        tf_idf_matrix_test = vectorizer.transform([title])
        matches = awesome_cossim_topn(tf_idf_matrix_test,
                                      tf_idf_matrix_songs.transpose(), 1, 0)
        song2 = songs[matches.nonzero()[1][0]]
        confidence = int(matches.data[0] * 100)
        #print(confidence)
        #print(song2)
        return conn.execute(
            f"select anime,type,start_ep,end_ep from anisong where rowid = {matches.nonzero()[1][0]}"
        ).fetchone() + (confidence, )
Example #11
def get_fuzzy_matches(words, targets, n=2, lower_bound=0.85):
    """
    Fuzzy matching of single-token lexicon entries using TF-IDF matrices of character-level n-grams.

    :param words: list of tokens to find fuzzy matches in
    :param targets: list of targets to match to
    :param n: number of characters in the n-grams
    :param lower_bound: lower bound for cosine similarity
    :return: list of (fuzzy match, matched target, cosine similarity) tuples
    """
    vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(n, n), use_idf=False)
    vectorizer.fit(words + targets)
    tf_idf_words = vectorizer.transform(words)
    tf_idf_lexicon = vectorizer.transform(targets).transpose()
    matches = awesome_cossim_topn(tf_idf_words, tf_idf_lexicon, ntop=1, lower_bound=lower_bound).tocoo()

    return [(words[row], targets[col], val) for row, col, val in zip(matches.row, matches.col, matches.data)]
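
A small, hypothetical invocation of get_fuzzy_matches; the word and target lists are invented for illustration.

words = ["aspirin", "asprin", "ibuprofen"]
targets = ["aspirin", "ibuprofen"]
for match, target, score in get_fuzzy_matches(words, targets, n=2, lower_bound=0.5):
    print(f"{match!r} -> {target!r} (cosine similarity {score:.2f})")
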
Example #12
    def preprocess_text(self, df_clean):
        self.text_to_preprocess = self.remove_unneeded_text()
        df_dirty = {"name": self.text_to_preprocess.split()}

        vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
        tf_idf_matrix_clean = vectorizer.fit_transform(df_clean['name'])
        tf_idf_matrix_dirty = vectorizer.transform(df_dirty['name'])

        matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0)

        matches_df = get_matches_df(matches, df_dirty['name'], df_clean['name'], top=0)
        matches_df = matches_df.loc[matches_df['similarity'] >= 0.85]
        for index, row in matches_df.iterrows():
            if row["wrong_word"] != row["right_word"]:
                if self.word_filter(row["right_word"]):
                    if row["similarity"] > 0.95:
                        self.text_to_preprocess = self.text_to_preprocess.replace(row["wrong_word"], row["right_word"])
                else:
                    self.text_to_preprocess = self.text_to_preprocess.replace(row["wrong_word"], row["right_word"])

        return self.self_written_preprocess_rules()
Example #13
def score_knn(args):

    df_test = pd.read_csv(args.input, names=['query'])
    df_train = pd.read_csv('./data/trainSet.csv', names=['query', 'category'])
    df_test['query_lem'] = df_test['query'].apply(lemmatize)
    df_train['query_lem'] = df_train['query'].apply(lemmatize)

    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(df_train.query_lem.values)
    test_features = vectorizer.transform(df_test.query_lem.values)
    matches = awesome_cossim_topn(test_features, train_features.transpose(),
                                  20, 0.01)
    ind = np.argwhere(matches)
    i1 = ind[:, 0]
    i2 = ind[:, 1]

    df_test.reset_index(inplace=True)
    df_test = df_test.rename(columns={'index': 'index1'})

    index1 = np.take(df_test['index1'].values, i1)
    query = np.take(df_test['query'].values, i1)
    categories = np.take(df_train.category.values, i2)

    df2 = pd.DataFrame(data={
        'query': query,
        'index1': index1,
        'cat': categories
    })

    #     most frequent category
    pred = df2[['query', 'index1',
                'cat']].groupby(['query', 'index1'
                                 ]).agg(lambda x: scipy.stats.mode(x)[0])
    pred = pred.reset_index()
    pred = pred.sort_values('index1')
    pred[['query', 'cat']].to_csv('./data/pred_knn.csv',
                                  index=None,
                                  header=False)
    logger.info("Finished scoring test sample with KNN")
Example #14
def match_to_fda(dest_path: str, to_compare: str):
    print("Loading FDA dictionary vectorized data...")
    fda = load_npz("../output/fda_dict_vectorized.npz")

    print("Generating CSR matrix with matches...")

    # cossim_matrix = load_npz(to_compare)
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)

    with open(file="../data/s1_drug_name_list_unique.csv") as csvfile:
        for name in csvfile:
            if "drug_name" in name:
                continue
            v_name = vectorizer.fit_transform(list(name))
            best_match = awesome_cossim_topn(v_name.reshape(fda.shape),
                                             fda,
                                             1,
                                             0.85,
                                             use_threads=True,
                                             n_jobs=8)
            print(best_match)
            break
Example #15
def generate_csr_matrix(src_path: str,
                        dest_path: str,
                        topn=10,
                        lower_bound=0.85):
    print("Reading data...")
    name_list: pd.DataFrame = pd.read_csv(src_path)
    name_list: pd.Series = name_list.iloc[:, 0]
    name_list: list = name_list.tolist()

    print("Vectorizing...")
    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
    tf_idf_matrix = vectorizer.fit_transform(name_list)

    print("Generating CSR matrix with matches...")
    matches: csr_matrix = awesome_cossim_topn(tf_idf_matrix,
                                              tf_idf_matrix.transpose(),
                                              topn,
                                              lower_bound,
                                              use_threads=True,
                                              n_jobs=8)

    print("Saving CSR matrix...")
    save_npz(file=dest_path, matrix=matches)
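
Several of these examples (e.g. #12, #14, #15 and #24) pass analyzer=ngrams to TfidfVectorizer without showing its definition. A character-trigram analyzer along the following lines is the usual choice; the exact cleanup regex is an assumption.

import re

def ngrams(string, n=3):
    # Strip punctuation that only adds noise, then emit overlapping
    # n-character chunks of the remaining string.
    string = re.sub(r'[,\-./]', '', string)
    return [''.join(gram) for gram in zip(*[string[i:] for i in range(n)])]
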
Example #16
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn

N = 10
a = rand(100, 1000000, density=0.005, format='csr')
b = rand(1000000, 200, density=0.005, format='csr')

# Use standard implementation

c = awesome_cossim_topn(a, b, 5, 0.01)

# Use parallel implementation with 4 threads

d = awesome_cossim_topn(a, b, 5, 0.01, use_threads=True, n_jobs=4)


Example #17
def awesome_cossim_topn_7_threads(a, b, N, thresh):
    return awesome_cossim_topn(a, b, N, thresh, True, 7, True)
Example #18
def awesome_cossim_topn_1_thread(a, b, N, thresh):
    return awesome_cossim_topn(a, b, N, thresh, True, 1, True)
Example #19
def awesome_cossim_topn_0_threads(a, b, N, thresh):
    return awesome_cossim_topn(a, b, N, thresh, False, 1, True)
Example #20
    a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab))
    a = a_sparse.tocsr()
    # a = a.astype(np.float32)

    row = rng1.randint(n_duplicates, size=nnz_b)
    cols = rng1.randint(nr_vocab, size=nnz_b)
    data = rng1.rand(nnz_b)

    b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab))
    b = b_sparse.T.tocsr()
    # b = b.astype(np.float32)

    # first run without profiling to bring the memory up to the same level
    # for all subsequent profiled runs:
    C, C_ntop = awesome_cossim_topn(a, b, N, thresh, True, 7, True)

    print("Sampling non-parallelized sparse_dot_topn function ... ",
          end='',
          flush=True)
    C, C_ntop = awesome_cossim_topn_0_threads(a, b, N, thresh)
    print("Finished.")

    print("Sampling threaded function with 1 thread ... ", end='', flush=True)
    C, C_ntop = awesome_cossim_topn_1_thread(a, b, N, thresh)
    print("Finished.")

    print("Sampling threaded function with 2 threads ... ", end='', flush=True)
    C, C_ntop = awesome_cossim_topn_2_threads(a, b, N, thresh)
    print("Finished.")
Example #21
def align_publications(df1, df2=None, columns2match_exact=['Year'], column2match_approx='Title', ntop=1, cosine_lower_bound=0.75,
    use_threads=False, n_jobs=2, lev_lower_bound=0.9, show_progress=False):
    """
    Fast way to match publications between two datasets.  We first match subsets of exact values
    between the two DataFrames, as specified by `columns2match_exact`.
    We then use a fast approximate string matching to match values in `columns2match_approx` to within a threshold.

    Parameters
    ----------
    :param df1 : DataFrame
        A DataFrame with the publication information.

    :param df2 : DataFrame, Optional
        Another DataFrame with the publication information.  If None, then df1 is used again.

    :param columns2match_exact : list, Default: ['Year']
        The columns to match exactly between DataFrames.

    :param column2match_approx : str, Default: 'Title'
        The column to match approximately between DataFrames.

    :param ntop : int, Default 1
        The number of best matches from df2 to return for each row of df1.

    :param cosine_lower_bound : float, Default 0.75
        The lower bound for cosine similarity when doing a fuzzy string match.

    :param use_threads : bool, Default False
        Use multithreading when calculating cosine similarity for fuzzy string matching.

    :param n_jobs : int, Optional, Default 2
        If use_threads is True, the number of threads to use in the parallel calculation.

    :param lev_lower_bound : float, Default 0.9
        The lower bound on the score returned by levenshtein_best_match for a match to be accepted.

    :param show_progress : bool, Default False
        If True, show a progress bar tracking the calculation.

    """
    # we can do an exact match from merge
    if (columns2match_exact is not None and len(columns2match_exact) > 0) and (column2match_approx is None or len(column2match_approx) == 0):
        # get the index name and reset the index to force it as a column
        indexcol = df2.index.name
        df2 = df2.reset_index(drop=False)
        # now merge the dataframes and drop duplicates giving an exact match
        mdf = df1.merge(df2[columns2match_exact + [indexcol]], how='left', on=columns2match_exact)
        mdf.drop_duplicates(subset=columns2match_exact, keep='first', inplace=True)
        return mdf[indexcol]

    # otherwise, if there is a column to match approximately then we need to prep for fuzzy matching
    elif len(column2match_approx) > 0:

        # we take a two-step approach to fuzzy matching
        # 1) first we employ a super fast but not very accurate cosine-similarity
        #    matching to narrow down the possible pair-wise matches
        #    for each string, we create feature vectors from 3-char counts
        tfidf = TfidfVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
        tfidf1 = tfidf.fit_transform(df1[column2match_approx])
        tfidf2 = tfidf.transform(df2[column2match_approx])

        matches = np.empty(tfidf1.shape[0])
        matches[:] = np.NaN

        # if there are no columns to match exactly
        if (columns2match_exact is None or len(columns2match_exact) == 0):

            # 1) first do the all-to-all cosine similarity and extract up to the ntop best possible matches
            co = awesome_cossim_topn(tfidf1, tfidf2.T, ntop=ntop, lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).tocoo()

            # 2) now use the Levenshtein
            for row in tqdm(set(co.row), desc="Align Publications", disable=not show_progress):
                rowcol = co.col[co.row==row]
                argmatch, lev_dist = levenshtein_best_match(df1.loc[row, column2match_approx], df2.iloc[rowcol][column2match_approx])
                if lev_dist >= lev_lower_bound:
                    matches[row] = rowcol[argmatch]


        else:

            df2groups = df2.groupby(columns2match_exact)

            def subgroup_match(subdf):
                if not df2groups.indices.get(subdf.name, None) is None:
                    sub_tfidf1 = tfidf1[subdf.index.values]
                    sub_tfidf2 = tfidf2[df2groups.indices[subdf.name]]
                    co = awesome_cossim_topn(sub_tfidf1, sub_tfidf2.transpose(), ntop=ntop, lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).tocoo()

                    # 2) now use the Levenshtein distance to find the best match
                    for row in set(co.row):
                        rowcol = co.col[co.row==row]
                        argmatch, lev_dist = levenshtein_best_match(subdf.iloc[row][column2match_approx],
                            df2.iloc[df2groups.indices[subdf.name][rowcol]][column2match_approx])
                        if lev_dist >= lev_lower_bound:
                            matches[subdf.index.values[row]] = df2groups.indices[subdf.name][rowcol[argmatch]]

            # register our pandas apply with tqdm for a progress bar
            tqdm.pandas(desc='Publication Matches', disable= not show_progress)

            df1.groupby(columns2match_exact, group_keys=True).progress_apply(subgroup_match)

        return matches
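
A hypothetical invocation of align_publications; the DataFrames are invented, and levenshtein_best_match and tqdm must be importable in this module's scope, as in the original source.

import pandas as pd

df_a = pd.DataFrame({'Year': [2019, 2020],
                     'Title': ['Sparse matrix multiplication at scale',
                               'Approximate string matching with TF-IDF']})
df_b = pd.DataFrame({'Year': [2019, 2020],
                     'Title': ['Sparse matrix multiplication at scale.',
                               'Approximate string-matching with TFIDF']})

# For each row of df_a, the index of the best-matching row of df_b
# (NaN where nothing clears the cosine and Levenshtein thresholds).
matches = align_publications(df_a, df_b, columns2match_exact=['Year'],
                             column2match_approx='Title', ntop=1,
                             cosine_lower_bound=0.5, lev_lower_bound=0.8)
print(matches)
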
Example #22
        add_vals_to_lookup(group, row, col)
    else:
        # if we get here, we need to add a new group.
        # The name is arbitrary, so just make it the row
        add_vals_to_lookup(row, row, col)


# Grab the column you'd like to group, filter out duplicate values
# and make sure the values are Unicode
vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
vals = df['NewName'].unique().astype('U')

# Build the matrix!!!
tfidf_matrix = vectorizer.fit_transform(vals)

cosine_matrix = awesome_cossim_topn(tfidf_matrix, tfidf_matrix.transpose(),
                                    vals.size, 0.8)

# Build a coordinate matrix
coo_matrix = cosine_matrix.tocoo()

# for each row and column in coo_matrix
# if they're not the same string add them to the group lookup
for row, col in zip(coo_matrix.row, coo_matrix.col):
    if row != col:
        add_pair_to_lookup(vals[row], vals[col])

df['Group'] = df['NewName'].map(group_lookup)  #.fillna(df['NewName'])
print(df['Group'].isna().sum())
#df.to_csv('./dol-data-grouped.csv')

###
Example #23
    # sublinear_tf=True
)
features = Pipeline(
    steps=[
        ('vect', vect),
        ('tfidf', tfidf)
    ]
)

tfidf_matrix = features.fit_transform(tmp)

cosine_matrix = awesome_cossim_topn(
    tfidf_matrix,
    tfidf_matrix.transpose(),
  # vals.size,
    10,
    0.1,
    use_threads=True,
    n_jobs=3
)

pprint(cosine_matrix)

pprint(get_csr_ntop_idx_data(cosine_matrix[2], 5))

pprint(Article.query.get(3).summary)
pprint(Article.query.get(11).summary)



Example #24
time_lev_2 = total / 10
print(
    f"Time saved over Levenshtein distance database: {time_lev_1 - time_lev_2}s ({time_lev_1/time_lev_2}x speedup)"
)

print("TF-IDF matching")

conn.row_factory = lambda cursor, row: row[0]
songs = conn.execute("select title_en from anisong").fetchall()

vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_songs = vectorizer.fit_transform(songs)

total = 0
for i in range(1, 11):
    start = time.time()
    tf_idf_matrix_test = vectorizer.transform([test_song])
    matches = awesome_cossim_topn(tf_idf_matrix_test,
                                  tf_idf_matrix_songs.transpose(), 1, 0)
    print(matches)
    end = time.time()
    duration = end - start
    total += duration
    print(f"Iteration {i}: {duration}")

song2 = songs[matches.nonzero()[1][0]]
print(f"Matched {song2} expected {expected_song} certainty {matches.data[0]}")

print(f"Avg duration: {total/10}")
time_tf = total / 10
#print(f"Time saved over Levenshtein distance: {time_lev_2 - time_tf}s ({time_lev_2/time_tf}x speedup)")
Example #25
def cosine_similarity(from_vector: np.ndarray,
                      to_vector: np.ndarray,
                      from_list: List[str],
                      to_list: List[str],
                      nbest,
                      min_similarity: float = 0,
                      method: str = "sparse") -> pd.DataFrame:
    """ Calculate similarity between two matrices/vectors and return best matches

    Arguments:
        from_vector: the matrix or vector representing the embedded strings to map from
        to_vector: the matrix or vector representing the embedded strings to map to
        from_list: The list from which you want mappings
        to_list: The list where you want to map to
        nbest: The number of best matches to return for each entry in from_list
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        method: The method/package for calculating the cosine similarity.
                Options: "sparse", "sklearn", "knn".
                Sparse is the fastest and most memory efficient but requires a
                package that might be difficult to install.
                Sklearn is a bit slower than sparse and requires significantly more memory as
                the distance matrix is not sparse
                Knn uses 1-nearest neighbor to extract the most similar strings
                it is significantly slower than both methods but requires little memory

    Returns:
        matches:  The best matches between the lists of strings


    Usage:

    Make sure to fill the `to_vector` and `from_vector` with vector representations
    of `to_list` and `from_list` respectively:

    ```python
    from polyfuzz.models import extract_best_matches
    indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
    ```
    """
    if nbest is not None:
        if int(nbest) > len(to_list):
            raise ValueError('nbest must not exceed the length of to_list')
    else:
        nbest = 1
        
    # Slower but uses less memory
    if method == "knn":

        if from_list == to_list:
            knn = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
            distances = distances[:, 1]
            indices = indices[:, 1]

        else:
            knn = NearestNeighbors(n_neighbors=1, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)

        similarity = [round(1 - distance, 3) for distance in distances.flatten()]

    # Fast, but has some installation issues
    elif _HAVE_SPARSE_DOT and method == "sparse":
        if isinstance(to_vector, np.ndarray):
            to_vector = csr_matrix(to_vector)
        if isinstance(from_vector, np.ndarray):
            from_vector = csr_matrix(from_vector)

        # There is a bug in awesome_cossim_topn: when to_vector and from_vector
        # have the same shape, setting topn to 1 does not work. Apparently, you
        # need to set it to at least 2 for it to work.
        if int(nbest) <= 1:
            similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, 2, min_similarity)
        elif int(nbest) > 1:
            similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, nbest, min_similarity)
            
        if from_list == to_list:
            similarity_matrix = similarity_matrix.tolil()
            similarity_matrix.setdiag(0.)
            similarity_matrix = similarity_matrix.tocsr()
        
        if int(nbest) <= 1 and method == "sparse":
            indices = np.array(similarity_matrix.argmax(axis=1).T).flatten()
            similarity = similarity_matrix.max(axis=1).toarray().T.flatten()
        elif int(nbest) > 1 and method == "sparse":
            dense = similarity_matrix.toarray()
            order = np.argsort(dense, axis=1)
            similarity = np.flip(np.take_along_axis(dense, order, axis=1)[:, -nbest:], axis=1)
            indices = np.flip(order[:, -nbest:], axis=1)
            
    # Faster than knn and slower than sparse but uses more memory
    else:
        similarity_matrix = scikit_cosine_similarity(from_vector, to_vector)

        if from_list == to_list:
            np.fill_diagonal(similarity_matrix, 0)

        indices = similarity_matrix.argmax(axis=1)
        similarity = similarity_matrix.max(axis=1)

    # Convert results to df
    if int(nbest) <= 1:
        matches = [to_list[idx] for idx in indices.flatten()]
        matches = pd.DataFrame(np.vstack((from_list, matches, similarity)).T, columns=["From", "To", "Similarity"])
        matches.Similarity = matches.Similarity.astype(float)
        matches.loc[matches.Similarity < 0.001, "To"] = None
    else:
        matches = [np.array([to_list[idx] for idx in l]) for l in indices] ##In progress
        column = ["To"]
        for i in range(nbest - 1):
            column.append("BestMatch" + "__" + str(i+1))
        column.append("Similarity")
        for j in range(nbest - 1):
            column.append("Similarity" + "__" + str(j+1))
        matches = pd.concat([pd.DataFrame({'From' : from_list}), pd.DataFrame(np.hstack((matches, similarity)), columns= column)], axis =1)
        matches.Similarity = matches.Similarity.astype(float)
        matches.loc[matches.Similarity < 0.001, "To"] = None
        for i in range(nbest - 1):
            matches.loc[matches.Similarity < 0.001, "BestMatch" + "__" + str(i+1)] = None
        
    return matches
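
The comment in the sparse branch above notes that ntop=1 misbehaves when both inputs have the same shape, so the code requests two candidates per row and keeps the best via argmax/max. That workaround can be exercised on its own like this (illustrative random data):

import numpy as np
from scipy.sparse import rand

x = rand(50, 300, density=0.1, format='csr')
sims = awesome_cossim_topn(x, x.T.tocsr(), 2, 0.0)  # ask for two candidates per row
best_idx = np.asarray(sims.argmax(axis=1)).ravel()  # keep only the best column index
best_val = sims.max(axis=1).toarray().ravel()       # ... and its value
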
Example #26
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import rand
from sparse_dot_topn import awesome_cossim_topn

N = 10
a = rand(100, 1000000, density=0.005, format='csr')
b = rand(1000000, 200, density=0.005, format='csr')

c = awesome_cossim_topn(a, b, 5, 0.01)
Example #27
def sparse_dot_product(A, B, ntop, lower_bound):
    '''Dot product of two sparse matrices.'''
    return awesome_cossim_topn(A, B, ntop=ntop, lower_bound=lower_bound)
Example #28
def cosine_similarity(from_vector: np.ndarray,
                      to_vector: np.ndarray,
                      from_list: List[str],
                      to_list: List[str],
                      min_similarity: float = 0.75,
                      top_n: int = 1,
                      method: str = "sparse") -> pd.DataFrame:
    """ Calculate similarity between two matrices/vectors and return best matches

    Arguments:
        from_vector: the matrix or vector representing the embedded strings to map from
        to_vector: the matrix or vector representing the embedded strings to map to
        from_list: The list from which you want mappings
        to_list: The list where you want to map to
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        top_n: The number of best matches you want returned
        method: The method/package for calculating the cosine similarity.
                Options: "sparse", "sklearn", "knn".
                Sparse is the fastest and most memory efficient but requires a
                package that might be difficult to install.
                Sklearn is a bit slower than sparse and requires significantly more memory as
                the distance matrix is not sparse
                Knn uses 1-nearest neighbor to extract the most similar strings
                it is significantly slower than both methods but requires little memory

    Returns:
        matches:  The best matches between the lists of strings


    Usage:

    Make sure to fill the `to_vector` and `from_vector` with vector representations
    of `to_list` and `from_list` respectively:

    ```python
    from polyfuzz.models import extract_best_matches
    indices, similarity = extract_best_matches(from_vector, to_vector, method="sparse")
    ```
    """
    if to_list is not None:
        if top_n > len(set(to_list)):
            top_n = len(set(to_list))
    
    # Slower but uses less memory
    if method == "knn":

        if to_list is None:
            knn = NearestNeighbors(n_neighbors=top_n+1, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)
            distances = distances[:, 1:]
            indices = indices[:, 1:]
        else:
            knn = NearestNeighbors(n_neighbors=top_n, n_jobs=-1, metric='cosine').fit(to_vector)
            distances, indices = knn.kneighbors(from_vector)

        similarities = [np.round(1 - distances[:, i], 3) for i in range(distances.shape[1])]

    # Fast, but has some installation issues
    elif _HAVE_SPARSE_DOT and method == "sparse":
        if isinstance(to_vector, np.ndarray):
            to_vector = csr_matrix(to_vector)
        if isinstance(from_vector, np.ndarray):
            from_vector = csr_matrix(from_vector)

        # There is a bug in awesome_cossim_topn: when to_vector and from_vector
        # have the same shape, setting topn to 1 does not work. Apparently, you
        # need to set it to at least 2 for it to work.
        similarity_matrix = awesome_cossim_topn(from_vector, to_vector.T, top_n+1, min_similarity)

        if to_list is None:
            similarity_matrix = similarity_matrix.tolil()
            similarity_matrix.setdiag(0.)
            similarity_matrix = similarity_matrix.tocsr()

        indices = _top_n_idx_sparse(similarity_matrix, top_n)
        similarities = _top_n_similarities_sparse(similarity_matrix, indices)
        indices = np.array(np.nan_to_num(np.array(indices, dtype=float), nan=0), dtype=int)

    # Faster than knn and slower than sparse but uses more memory
    else:
        similarity_matrix = scikit_cosine_similarity(from_vector, to_vector)

        if to_list is None:
            np.fill_diagonal(similarity_matrix, 0)

        indices = np.flip(np.argsort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
        similarities = np.flip(np.sort(similarity_matrix, axis=-1), axis=1)[:, :top_n]
        similarities = [np.round(similarities[:, i], 3) for i in range(similarities.shape[1])]

    # Convert results to df
    if to_list is None:
        to_list = from_list.copy()
        
    columns = (["From"] +
               ["To" if i == 0 else f"To_{i+1}" for i in range(top_n)] +
               ["Similarity" if i ==0 else f"Similarity_{i+1}" for i in range(top_n)])
    matches = [[to_list[idx] for idx in indices[:, i]] for i in range(indices.shape[1])]
    matches = pd.DataFrame(np.vstack(([from_list], matches, similarities)).T, columns = columns)

    # Update column order
    columns = [["From", "To", "Similarity"]] + [[f"To_{i+2}", f"Similarity_{i+2}"] for i in range((top_n-1))]
    matches = matches.loc[:, [title for column in columns for title in column]]

    # Update types
    for column in matches.columns:
        if "Similarity" in column:
            matches[column] = matches[column].astype(float)
            matches.loc[matches[column] < 0.001, column] = float(0)
            matches.loc[matches[column] < 0.001, column.replace("Similarity", "To")] = None

    return matches
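
A hypothetical end-to-end call of the variant above, assuming sparse_dot_topn and this function's module-level helpers (_top_n_idx_sparse, _top_n_similarities_sparse) are available; the strings and the character-trigram vectorizer are illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer

from_list = ["apple inc", "mikrosoft", "googel"]
to_list = ["apple", "microsoft", "google", "amazon"]

vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 3)).fit(from_list + to_list)
matches = cosine_similarity(vectorizer.transform(from_list).toarray(),
                            vectorizer.transform(to_list).toarray(),
                            from_list, to_list,
                            min_similarity=0.1, top_n=2, method="sparse")
print(matches[["From", "To", "Similarity"]])
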
Example #29
 def _get_cosine_matrix(self, vals):
     tf_idf_matrix = self._get_tf_idf_matrix(vals)
     return awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(),
                                vals.size, self._match_threshold)
Example #30
def cos_sim_query(query_vector, query_space, n_neighbors=50, lower_bound=0.0, beta = 1, gamma = 1, n_jobs = None, n_batches = 100):
    '''Make a cosine-similarity query of query_vector on query_space.
    beta is a weighting factor such that query_space = normalize(query_space^beta);
    beta greater than one ensures that higher-magnitude components receive more importance when querying.
    Returns idx, sim.
    '''

    query_vector, query_space = copy.deepcopy(query_vector), copy.deepcopy(query_space)
    query_vector, query_space = transform_similarity_weights(query_vector, query_space, beta, gamma)

    print(f'Querying {n_neighbors} nearest neighbors, this can take a while...')
    if not scipy.sparse.issparse(query_vector):
        query_vector = scipy.sparse.csr_matrix(query_vector)

    if not scipy.sparse.issparse(query_space):
        query_space = scipy.sparse.csr_matrix(query_space)
    try:
        query_space = query_space.T
        if n_jobs is None:
            batches = make_batches(query_vector, batch_size = np.ceil(query_vector.shape[0]/n_batches).astype(int))
            sim_matrix = [awesome_cossim_topn(qv, query_space, ntop=n_neighbors, lower_bound=lower_bound) for qv in tqdm(batches)]
            sim_matrix = scipy.sparse.vstack(sim_matrix)
        else:
            batches = make_batches(query_vector, batch_size = np.ceil(query_vector.shape[0]/n_batches).astype(int))

            sim_matrix = Parallel(n_jobs=n_jobs, verbose=1,
                                   **_joblib_parallel_args(prefer="threads"))(
                    delayed(awesome_cossim_topn)(qv, query_space,
                                             ntop=n_neighbors, lower_bound=lower_bound)
                    for qv in batches)

            sim_matrix = scipy.sparse.vstack(sim_matrix)

        sim_matrix = scipy.sparse.csr_matrix(sim_matrix)
        print('Postprocessing query results...')
        idx = []
        sim = []
        arr_sizes = []
        for d in sim_matrix:
            s = d.data
            i = d.nonzero()[1]
            sim.append(s)
            idx.append(i)
            arr_sizes.append(len(s))

        max_size = max(arr_sizes)
        idx = np.array([pad_to_shape(i, max_size) for i in idx]).astype(int)
        sim = np.array([pad_to_shape(s, max_size) for s in sim])
        if idx.shape[1] == 0:
            raise ValueError('No similarity greater than lower_bound found. Choose a lower threshold.')
        return  idx, sim

    except NameError:  # in case sparse_dot_topn is not installed
        print('''sparse_dot_topn not installed. Neighbors query will use
        sklearn NearestNeighbors, which may take a while for a sparse matrix query''')
        dist, idx = (
            NearestNeighbors(n_neighbors = n_neighbors, radius = 1 - lower_bound, metric = 'cosine', n_jobs = -1)
            .fit(query_space)
            .kneighbors(query_vector)
        )
        return idx, 1 - dist # <- cos_sim = 1 - cos_dist
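
cos_sim_query relies on a few helpers that are not shown in this excerpt (transform_similarity_weights, make_batches, pad_to_shape). Plausible sketches of the latter two, stated as assumptions rather than the original implementations:

import numpy as np

def make_batches(matrix, batch_size):
    # Split a sparse (or dense) matrix into row-wise batches of at most batch_size rows.
    return [matrix[i:i + batch_size] for i in range(0, matrix.shape[0], batch_size)]

def pad_to_shape(arr, size, fill_value=0):
    # Right-pad a 1-D array with fill_value so all rows can be stacked to the same length.
    out = np.full(size, fill_value, dtype=arr.dtype)
    out[:len(arr)] = arr
    return out
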