def autocomplete():
    # Query string sent by the client, e.g. /autocomplete?q=wa
    search = request.args.get('q')
    # autocmplete_label_dict is built elsewhere from the class labels,
    # e.g. ['Beer', 'Wine', 'Soda', 'Juice', 'Water']
    completer = AutoComplete(words=autocmplete_label_dict)
    # search() returns a list of result groups; flatten it into a single list
    groups = completer.search(word=str(search), max_cost=3, size=6)
    results = [item for sublist in groups for item in sublist]
    return jsonify(matching_results=results)
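# A minimal, self-contained sketch of the search/flatten step above, using a
# hypothetical label dictionary in place of autocmplete_label_dict (which is
# built elsewhere). It only illustrates that search() returns a list of result
# groups; the exact grouping and order depend on the library's ranking.
from fast_autocomplete import AutoComplete

labels = {'Beer': {}, 'Wine': {}, 'Soda': {}, 'Juice': {}, 'Water': {}}
completer = AutoComplete(words=labels)

groups = completer.search(word='wa', max_cost=3, size=6)   # e.g. [['water']]
flat = [match for group in groups for match in group]      # e.g. ['water']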
def test_special_characters(self):
    words = {'abcd(efgh)ijk': {}, 'u (2 off)': {}}
    autocomplete = AutoComplete(
        words=words,
        valid_chars_for_string=string.ascii_letters + string.punctuation)
    # result = autocomplete.search(word='abcd(efgh)')
    # assert [['abcd(efgh)ijk']] == result
    result2 = autocomplete.search(word='u (2 o')
    assert [['u (2 off)']] == result2
class SM_Autocomplete:
    def __init__(self):
        # One word per line; strip the trailing newline so it does not
        # become part of the indexed word.
        with open("shaker_dictionary.txt", "r", encoding="utf8") as f:
            # with stream(__name__, 'shaker_dictionary.txt') as f:
            words = {line.strip(): {} for line in f.readlines()}
        self.autocomplete = AutoComplete(words=words)

        with open("authors.txt", "r", encoding="utf8") as f:
            # with stream(__name__, 'authors.txt') as f:
            names = {line.strip(): {} for line in f.readlines()}
        self.authors = AutoComplete(words=names)

    def general(self, s):
        return sorted(self.autocomplete.search(word=s, max_cost=3, size=10))

    def author(self, s):
        return sorted(self.authors.search(word=s, max_cost=3, size=10))
def video_loop(self):
    ok, frame = self.vs.read()
    if ok:
        cv2image = cv2.flip(frame, 1)
        x1 = int(0.5 * frame.shape[1])
        y1 = 10
        x2 = frame.shape[1] - 10
        y2 = int(0.5 * frame.shape[1])
        cv2.rectangle(cv2image, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1), (255, 0, 0), 1)
        cv2image = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGBA)
        self.current_image = Image.fromarray(cv2image)
        imgtk = ImageTk.PhotoImage(image=self.current_image)
        self.panel.imgtk = imgtk
        self.panel.config(image=imgtk)
        cv2image = cv2image[y1:y2, x1:x2]
        gray = cv2.cvtColor(cv2image, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (5, 5), 2)
        th3 = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, 11, 2)
        ret, res = cv2.threshold(th3, 70, 255,
                                 cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        self.predict(res)
        # self.current_image2 = Image.fromarray(res)
        # imgtk = ImageTk.PhotoImage(image=self.current_image2)
        # self.panel2.imgtk = imgtk
        # self.panel2.config(image=imgtk)
        self.panel3.config(text=self.current_symbol, font=("Courier", 10))
        self.panel4.config(text=self.word, font=("Courier", 10))
        self.panel5.config(text=self.str, font=("Courier", 10))
        predicts = self.word
        autocomplete = AutoComplete(words=self.words)
        self.a = autocomplete.search(word=predicts, max_cost=2, size=2)
        print("Initial {0}: ,Suggest : {1}".format(predicts, self.a))
        if len(self.a) > 0:
            self.bt1.config(text=self.a[0][0], font=("Courier", 10))
        else:
            self.bt1.config(text="None")
        if len(self.a) > 1:
            self.bt2.config(text=self.a[1][0], font=("Courier", 10))
        else:
            self.bt2.config(text="None")
        if len(self.a) > 2:
            self.bt3.config(text=self.a[2][0], font=("Courier", 10))
        else:
            self.bt3.config(text="None")
    self.root.after(60, self.video_loop)
def test__find_and_sort(self, word, max_cost, size, expected_find_results,
                        expected_steps, expected_find_and_sort_results):
    expected_results = expected_find_and_sort_results
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
    results = auto_complete._find_and_sort(word, max_cost, size)
    results = list(results)
    search_results = auto_complete.search(word, max_cost, size)
    print_results(locals())
    assert expected_results == results
    if word.strip():
        assert expected_results == search_results
    else:
        assert [] == search_results
def test_update_count_of_word(self, word, update_dict, expected_results,
                              expected_new_count):
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS,
                                 full_stop_words=['bmw', 'alfa romeo'])
    if update_dict:
        new_count = auto_complete.update_count_of_word(**update_dict)
        assert expected_new_count == new_count
        assert expected_new_count == auto_complete.get_count_of_word(
            update_dict['word'])
    results = auto_complete.search(word, max_cost=2, size=4)
    print_results(locals())
    assert expected_results == results
# Fragment from a notebook cell: the lines below sit inside the per-row loop of
# get_words(), where `make` has already been read from the same CSV row.
        model = line['model']
        count = line['count']
        if make != model:
            # local_words = [model, '{}{}'.format(make, model)]
            # print(local_words)
            # while local_words:
            #     word = local_words.pop()
            #     if word not in words:
            words['{}{}'.format(make, model)] = {}
            # if make not in words:
            #     words[make] = {}
    return words


synonyms = {
    "alfa romeo 4c coupe": ["the alfa", "hello"],
    "bmw": ["beemer", "bimmer"]
}

words = get_words("autocomp.csv")
autocomplete = AutoComplete(words=words, synonyms=synonyms)
autocomplete.search(word='the ', max_cost=3, size=5)

# %%
autocomplete.update_count_of_word(word='toyota aygo', count=10000)
autocomplete.get_count_of_word('toyota aygo')

# %%
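# A small sketch (hypothetical words and counts) of the count API used in the
# cell above: 'count' can be seeded in the word context at build time,
# update_count_of_word() overwrites it later (as the tests elsewhere in this
# collection assert), and higher counts are expected to rank a word earlier in
# search() results.
from fast_autocomplete import AutoComplete

demo_words = {
    'toyota aygo': {'count': 5},
    'toyota avensis': {'count': 100},
}
demo = AutoComplete(words=demo_words)
demo.update_count_of_word(word='toyota aygo', count=10000)
print(demo.get_count_of_word('toyota aygo'))           # -> 10000
print(demo.search(word='toyota', max_cost=2, size=2))  # 'toyota aygo' expected to rank first now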
def test_autocomplete_synonym_part_of_another_word(self):
    words = {'cartoon': {}, 'vehicle': {}}
    synonyms = {'vehicle': ['car']}
    autocomplete = AutoComplete(words=words, synonyms=synonyms)
    result = autocomplete.search(word='ca')
    assert [['vehicle'], ['cartoon']] == result
class RelatedArticles():
    @staticmethod
    def get_filtered_by_date(articles, distances, days=0, months=0, years=1):
        print(len(articles), len(distances))
        filter_date = (datetime.now() -
                       relativedelta(days=days, months=months, years=years)).date()
        filtered_articles = []
        filtered_distances = []
        for i, article in enumerate(articles):
            if article.publish_date > filter_date:
                filtered_articles.append(article)
                filtered_distances.append(distances[i])
        return filtered_articles, np.array(filtered_distances)

    @staticmethod
    def article2text(article):
        title = CromaGNI.preprocess_aws_data(article['title'])
        text = CromaGNI.preprocess_aws_data(article['text'])
        text = title + '\n' + text
        return text

    @staticmethod
    def doc2tokens(doc):
        tokens = []
        i = 0
        while i < len(doc):
            t = doc[i]
            tx = t.text
            # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
            if t.ent_iob_ == 'O':
                ent_tex = tx
                i += 1
                if (not t.is_space and '@' not in t.text) or '\n' in t.text:  # and t.text != '\n'):
                    if t.is_digit:
                        tokens.append('__DIGIT__')
                    elif '$' in tx:
                        tokens.append('__CURRENCY__')
                    else:
                        tokens.append(ent_tex)
            else:
                ent_tex = ''
                while t.ent_iob_ != 'O':
                    if t.pos_ == 'DET' and t.ent_iob_ == 'B':
                        # It is an article
                        tokens.append(tx)
                    else:
                        ent_tex = ent_tex + ' ' + tx
                    i += 1
                    if i < len(doc):
                        t = doc[i]
                        tx = t.text
                    else:
                        break
                ent_tex = ent_tex.strip().replace(' - ', '-')
                tokens.append(ent_tex)
        return tokens

    def __init__(self, spacy_model_path=None, gensim_model_path=None,
                 faiss_indexes_path=None, faiss_indexes_tfidf_path=None,
                 token2tfidf_path=None):
        self.faiss_indexes_path = faiss_indexes_path
        self.faiss_indexes_tfidf_path = faiss_indexes_tfidf_path
        if spacy_model_path is not None:
            self.nlp = spacy.load(spacy_model_path)
        if gensim_model_path is not None:
            self.w2v_model = KeyedVectors.load(gensim_model_path, mmap='r')
            model_words, self.model_synonyms = self.prepare_autocomplete()
            self.autocomplete_model = AutoComplete(words=model_words)
        if faiss_indexes_path is not None and os.path.exists(faiss_indexes_path):
            self.faiss_indexes = faiss.read_index(faiss_indexes_path)
        else:
            self.faiss_indexes = None
        if faiss_indexes_tfidf_path is not None and os.path.exists(faiss_indexes_tfidf_path):
            self.faiss_indexes_tfidf = faiss.read_index(faiss_indexes_tfidf_path)
        else:
            self.faiss_indexes_tfidf = None
        if token2tfidf_path is not None:
            self.token2tfidf = np.load(token2tfidf_path, allow_pickle=True).item()
        else:
            self.token2tfidf = None

    # def save_training_tokens(self, publication_name, chunk_size=50_000):
    #     dst_folder = f'training_data_{publication_name}_{chunk_size}/'
    #     if not os.path.exists(dst_folder):
    #         os.makedirs(dst_folder)
    #     articles = Article.objects(publication=Publication.objects(name=publication_name).get()).order_by('-publish_date')
    #     N = articles.count()
    #     N_chunks = np.ceil(N / chunk_size)
    #     sentences = []
    #     ids = []
    #     chunk = 0
    #     for i, article in enumerate(articles):
    #         if i % chunk_size == 0 and i != 0:
    #             chunk += 1
    #             file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #             np.save(file_name, sentences)
    #             sentences = []
    #             print()
    #             print(f'{file_name} saved!')
    #             file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #             np.save(file_name_ids, ids)
    #             ids = []
    #         text = RelatedArticles.article2text(article)
    #         print(f'\r{i}/{N}', end=' ')
    #         doc = self.nlp(text)
    #         sentences.append(RelatedArticles.doc2tokens(doc))
    #         ids.append(str(article['id']))
    #     chunk += 1
    #     file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #     np.save(file_name, sentences)
    #     sentences = []
    #     print()
    #     print(f'{file_name} saved!')
    #     file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #     np.save(file_name_ids, ids)
    #     ids = []

    def get_autocomplete_words_list(self, text):
        autocomplets = self.autocomplete_model.search(text, size=10)
        near_words = []
        for word in autocomplets:
            near_words = near_words + self.model_synonyms[word[0]]
        return near_words

    def get_similar(self, word, topn=10):
        words = []
        distances = []
        for word, distance in self.w2v_model.wv.most_similar(word, topn=topn):
            words.append(word)
            distances.append(distance)
        return words, distances

    def get_related_articles(self, article, years=1, months=0, days=0, radius=0.89):
        id_form_article_id = article['faiss_index']
        if id_form_article_id is None:  # Not in faiss db already
            vector = self.article2vect(article)  # np.array([article_to_faiss_vect(article, nlp_custom, w2v_model)])
        else:
            vector = np.array([self.faiss_indexes.index.reconstruct(id_form_article_id)])
        articles, distances = self.get_related_articles_from_vector(
            vector, years=years, months=months, days=days, radius=radius)
        if id_form_article_id is None:
            articles = list(articles)
            articles.insert(0, article)
            distances = list(distances)
            distances.insert(0, 1.0)
        return articles, distances

    def tokens2vect(self, art_arry, tfidf=True):
        if self.token2tfidf is None:
            tfidf = False
        word_vect_dim = self.w2v_model.wv.vector_size
        v = np.zeros(word_vect_dim)
        if tfidf:
            v_tfidf = np.zeros(word_vect_dim)
        for word in art_arry:
            if word in self.w2v_model.wv.vocab:
                if tfidf:
                    wordtfidf = self.token2tfidf.get(word, 0)
                    v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(word) * wordtfidf
                v = v + self.w2v_model.wv.get_vector(word)
            else:
                words = word.split(' ')
                if len(words) > 1:
                    for word in words:
                        if word in self.w2v_model.wv.vocab:
                            v = v + self.w2v_model.wv.get_vector(word)
                            if tfidf:
                                wordtfidf = self.token2tfidf.get(word, 0)
                                v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(word) * wordtfidf
        norm = np.linalg.norm(v)
        if norm == 0:
            v = np.zeros(word_vect_dim)
        else:
            v = v / norm
        if tfidf:
            norm_tfidf = np.linalg.norm(v_tfidf)
            if norm_tfidf == 0:
                v_tfidf = np.zeros(word_vect_dim)
            else:
                v_tfidf = v_tfidf / norm_tfidf
            return v.astype('float32'), v_tfidf.astype('float32')
        else:
            return v.astype('float32')

    def doc2vect(self, doc):
        tokens = RelatedArticles.doc2tokens(doc)
        return self.tokens2vect(tokens)

    def text2doc(self, text):
        return self.nlp(text)

    def text2vect(self, text):
        doc = self.text2doc(text)
        return self.doc2vect(doc)

    def article2vect(self, article):
        text = RelatedArticles.article2text(article)
        return self.text2vect(text)

    def get_related_articles_from_vector(self, vector, radius=0.89, k=None, fr=0,
                                         filter_by_date=True, years=1, months=0, days=0):
        indexes = []
        distances = []
        if type(vector) == tuple:
            # a fix is still needed here for the tfidf case
            vector = vector[0]
        if len(vector.shape) == 1:
            vector = np.array([vector])
        if self.faiss_indexes is not None:
            if k is None:
                """ returns by radius """
                lims, D, I = self.faiss_indexes.range_search(vector, radius)
                j = 0
                distances = D[lims[j]:lims[j + 1]][fr:]
                sorted_idx = np.argsort(distances)[::-1]
                distances = distances[sorted_idx]
                indexes = I[lims[j]:lims[j + 1]][fr:][sorted_idx]
            else:
                """ returns k related """
                D, I = self.faiss_indexes.search(vector, k)
                distances = D[0][fr:]
                indexes = I[0][fr:]
        articles = []
        for idx in indexes:
            art_ = Article.objects(faiss_index=idx).first()
            if art_ is not None:
                articles.append(art_)
        if len(articles) > 0 and filter_by_date:
            articles, distances = RelatedArticles.get_filtered_by_date(
                articles, distances, years=years, months=months, days=days)
        return articles, distances
    # def add_faiss_vectors(self, articles, old_faiss_ids_f, old_faiss_indexes_f,
    #                       old_faiss_indexes_tfidf_f, new_faiss_ids_f,
    #                       new_faiss_indexes_f, new_faiss_indexes_tfidf_f):
    #     if new_faiss_indexes_tfidf_f is not None:
    #         tfidf = True
    #     else:
    #         tfidf = False
    #     # Read faiss indexes and mongoids
    #     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
    #         faiss_articles_ids = []
    #         faiss_index2 = None
    #         faiss_index2_tfidf = None
    #     else:
    #         faiss_articles_ids = np.load(old_faiss_ids_f)
    #         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
    #         faiss_index2_tfidf = faiss.read_index(old_faiss_indexes_tfidf_f)
    #     N_vects = len(articles)
    #     # Get wordvectors
    #     word_vect_dim = self.w2v_model.wv.vector_size
    #     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     if tfidf:
    #         xb_tfidf = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     new_article_ids = []
    #     i = 0
    #     j = 0
    #     while j < N_vects:
    #         article = articles[i]
    #         if str(article.id) not in faiss_articles_ids:
    #             new_article_ids.append(str(article.id))
    #             if tfidf:
    #                 xb[j, :], xb_tfidf[j, :] = self.article2vect(article)
    #             else:
    #                 xb[j, :] = self.article2vect(article)
    #             j += 1
    #         i += 1
    #         print(f'\r{i}, {j} / {N_vects}', end='')
    #     # Update articles ids
    #     all_articles_ids = list(faiss_articles_ids) + new_article_ids
    #     np.save(new_faiss_ids_f, all_articles_ids)
    #     if len(faiss_articles_ids) == 0:
    #         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
    #     else:
    #         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal
    #     if len(faiss_articles_ids) == 0:
    #         index = faiss.IndexFlatIP(word_vect_dim)
    #         faiss_index2 = faiss.IndexIDMap(index)
    #         if tfidf:
    #             index_tfidf = faiss.IndexFlatIP(word_vect_dim)
    #             faiss_index2_tfidf = faiss.IndexIDMap(index_tfidf)
    #     faiss_index2.add_with_ids(xb, ids)
    #     faiss.write_index(faiss_index2, new_faiss_indexes_f)
    #     if tfidf:
    #         faiss_index2_tfidf.add_with_ids(xb_tfidf, ids)
    #         faiss.write_index(faiss_index2_tfidf, new_faiss_indexes_tfidf_f)

    def prepare_autocomplete(self):
        words = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower in words:
                if g.count > words[lower]['count']:
                    words[lower] = {'count': g.count}
            else:
                words[lower] = {'count': g.count}
        synonyms = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower not in synonyms:
                synonyms[lower] = []
            synonyms[lower].append(word)
        return words, synonyms

    def add_faiss_vectors(self, articles, tfidf=True):
        total_vectors = 0
        vector_size = self.w2v_model.wv.vector_size
        if self.faiss_indexes is None:
            index = faiss.IndexFlatIP(vector_size)
            self.faiss_indexes = faiss.IndexIDMap(index)
            if tfidf:
                index_tfidf = faiss.IndexFlatIP(vector_size)
                self.faiss_indexes_tfidf = faiss.IndexIDMap(index_tfidf)
        total_vectors = self.faiss_indexes.ntotal
        total_vectors_tfidf = self.faiss_indexes_tfidf.ntotal
        xb = []
        xb_tfidf = []
        faiss_count = total_vectors
        faiss_count_tfidf = total_vectors_tfidf
        ids = []
        ids_tfidf = []
        for article in articles:
            if article['faiss_index'] is None:
                vect, vect_tfidf = self.article2vect(article)
                xb.append(vect)
                xb_tfidf.append(vect_tfidf)
                article['faiss_index'] = faiss_count
                article['faiss_index_tfidf'] = faiss_count_tfidf
                ids.append(faiss_count)
                ids_tfidf.append(faiss_count_tfidf)
                article.save()
                faiss_count = faiss_count + 1
                faiss_count_tfidf = faiss_count_tfidf + 1
        if len(ids) == 0:
            # nothing was added because everything was already there
            return total_vectors, len(ids)
        xb = np.array(xb, dtype='float32')
        xb_tfidf = np.array(xb_tfidf, dtype='float32')
        ids = np.array(ids, dtype='int64')
        ids_tfidf = np.array(ids_tfidf, dtype='int64')
        self.faiss_indexes.add_with_ids(xb, ids)
        faiss.write_index(self.faiss_indexes, self.faiss_indexes_path)
        self.faiss_indexes_tfidf.add_with_ids(xb_tfidf, ids_tfidf)
        faiss.write_index(self.faiss_indexes_tfidf, self.faiss_indexes_tfidf_path)
        return total_vectors, len(ids)


# def article_to_faiss_vect(article, nlp, w2v_model):
#     # article2vect
#     title = CromaGNI.preprocess_aws_data(article['title'])
#     text = CromaGNI.preprocess_aws_data(article['text'])
#     text = title + '\n' + text
#     doc = nlp(text)
#     return get_sentence_vect(doc, w2v_model)

# def get_related_aticles(vector, faiss_indexes, faiss_article_ids, Article, radius=0.89,
#                         k=None, fr=0, filter_by_date=True, years=1, months=0, days=0):
#     if k is None:
#         lims, D, I = faiss_indexes.range_search(vector, radius)
#         j = 0
#         distances = D[lims[j]:lims[j+1]][fr:]
#         sorted_idx = np.argsort(distances)[::-1]
#         distances = distances[sorted_idx]
#         indexes = I[lims[j]:lims[j+1]][fr:][sorted_idx]
#     else:
#         D, I = faiss_indexes.search(vector, k)
#         distances = D[0][fr:]
#         indexes = I[0][fr:]
#     articles = []
#     for idx in indexes:
#         articles.append(Article.objects(id=faiss_article_ids[idx]).first())
#     if filter_by_date:
#         articles, distances = get_filtered_by_date(articles, distances, years=years, months=months, days=days)
#     return articles, distances

# def add_faiss_vectors(old_faiss_ids_f, old_faiss_indexes_f, new_faiss_ids_f, new_faiss_indexes_f,
#                       articles, w2v_model, nlp_ner, N_vects=10000):
#     # Read faiss indexes and mongoids
#     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
#         faiss_articles_ids = []
#         faiss_index2 = None
#     else:
#         faiss_articles_ids = np.load(old_faiss_ids_f)
#         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
#     # Get wordvectors
#     word_vect_dim = w2v_model.wv.vector_size
#     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
#     new_article_ids = []
#     i = 0
#     j = 0
#     while j < N_vects:
#         article = articles[i]
#         if str(article.id) not in faiss_articles_ids:
#             new_article_ids.append(str(article.id))
#             xb[j, :] = article_to_faiss_vect(article, nlp_ner, w2v_model)
#             j += 1
#         i += 1
#         print(f'\r{i}, {j}', end='')
#     # Update articles ids
#     all_articles_ids = list(faiss_articles_ids) + new_article_ids
#     np.save(new_faiss_ids_f, all_articles_ids)
#     if len(faiss_articles_ids) == 0:
#         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
#     else:
#         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal
#     if len(faiss_articles_ids) == 0:
#         index = faiss.IndexFlatIP(word_vect_dim)
#         faiss_index2 = faiss.IndexIDMap(index)
#     faiss_index2.add_with_ids(xb, ids)
#     faiss.write_index(faiss_index2, new_faiss_indexes_f)

# def array_to_sentence_vect(art_arry, w2v_model):
#     word_vect_dim = w2v_model.wv.vector_size
#     v = np.zeros(word_vect_dim)
#     for word in art_arry:
#         if word in w2v_model.wv.vocab:
#             v = v + w2v_model.wv.get_vector(word)
#         else:
#             words = word.split(' ')
#             if len(words) > 1:
#                 for word in words:
#                     if word in w2v_model.wv.vocab:
#                         v = v + w2v_model.wv.get_vector(word)
#     norm = np.linalg.norm(v)
#     if norm == 0:
#         return np.zeros(word_vect_dim)
#     else:
#         return v / np.linalg.norm(v)

# def word2vect_encode(doc):
#     tokens = []
#     i = 0
#     while i < len(doc):
#         t = doc[i]
#         tx = t.text
#         # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
#         if t.ent_iob_ == 'O':
#             ent_tex = tx
#             i += 1
#             if (not t.is_space and '@' not in t.text):
#                 if t.is_digit:
#                     tokens.append('__DIGIT__')
#                 elif '$' in tx:
#                     tokens.append('__CURRENCY__')
#                 else:
#                     tokens.append(ent_tex)
#         else:
#             ent_tex = ''
#             while t.ent_iob_ != 'O':
#                 if t.pos_ == 'DET' and t.ent_iob_ == 'B':
#                     # It is an article
#                     tokens.append(tx)
#                 else:
#                     ent_tex = ent_tex + ' ' + tx
#                 i += 1
#                 if i < len(doc):
#                     t = doc[i]
#                     tx = t.text
#                 else:
#                     break
#             ent_tex = ent_tex.strip().replace(' - ', '-')
#             tokens.append(ent_tex)
#     return tokens

# def get_sentence_vect(doc, w2v_model):
#     tokens = word2vect_encode(doc)
#     return array_to_sentence_vect(tokens, w2v_model).astype('float32')
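# A stubbed sketch of the autocomplete round trip used by RelatedArticles:
# prepare_autocomplete() indexes lowercased vocabulary words (keeping the
# highest count per spelling), model_synonyms maps each lowercased form back to
# the original-cased tokens, and get_autocomplete_words_list() expands the top
# matches through that map. The words and synonyms below are hypothetical
# stand-ins for a real gensim vocabulary.
from fast_autocomplete import AutoComplete

model_words = {'toyota': {'count': 120}, 'new york': {'count': 300}}
model_synonyms = {'toyota': ['Toyota'], 'new york': ['New York', 'NEW YORK']}

autocomplete_model = AutoComplete(words=model_words)
near_words = []
for match in autocomplete_model.search('new', size=10):
    near_words += model_synonyms[match[0]]
print(near_words)  # expected: the original-cased forms, e.g. ['New York', 'NEW YORK']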
def execute_algorithm(self, data: dict, current_string: str):
    print(data)
    autocomplete = AutoComplete(words=data)
    print(autocomplete)
    # word -> what to search by, max_cost -> maximum edit distance allowed for
    # fuzzy matches, size -> number of results to propagate back
    return autocomplete.search(word=current_string, max_cost=3, size=3)
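# A minimal sketch of what the method above is expected to return for a
# hypothetical `data` dict: search() yields at most `size` result groups as a
# list of lists of words, and max_cost bounds the edit distance used for fuzzy
# matching. Exact grouping and order depend on the library's ranking.
from fast_autocomplete import AutoComplete

data = {'berlin': {}, 'bern': {}, 'barcelona': {}}
completer = AutoComplete(words=data)
print(completer.search(word='ber', max_cost=3, size=3))  # e.g. [['bern'], ['berlin']]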