def autocomplete():
    # Query string sent by the client, e.g. /autocomplete?q=wa
    search = request.args.get('q')
    # autocmplete_label_dict is built elsewhere from the class labels,
    # e.g. ['Beer', 'Wine', 'Soda', 'Juice', 'Water']
    completer = AutoComplete(words=autocmplete_label_dict)
    # search() returns a list of result groups; flatten it into a single list
    groups = completer.search(word=str(search), max_cost=3, size=6)
    results = [item for sublist in groups for item in sublist]
    return jsonify(matching_results=results)
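# A minimal, self-contained sketch of the search/flatten step above, using a
# hypothetical label dictionary in place of autocmplete_label_dict (which is
# built elsewhere). It only illustrates that search() returns a list of result
# groups; the exact grouping and order depend on the library's ranking.
from fast_autocomplete import AutoComplete

labels = {'Beer': {}, 'Wine': {}, 'Soda': {}, 'Juice': {}, 'Water': {}}
completer = AutoComplete(words=labels)

groups = completer.search(word='wa', max_cost=3, size=6)   # e.g. [['water']]
flat = [match for group in groups for match in group]      # e.g. ['water']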
def test_special_characters(self):
    words = {'abcd(efgh)ijk': {}, 'u (2 off)': {}}
    autocomplete = AutoComplete(
        words=words,
        valid_chars_for_string=string.ascii_letters + string.punctuation)
    # result = autocomplete.search(word='abcd(efgh)')
    # assert [['abcd(efgh)ijk']] == result
    result2 = autocomplete.search(word='u (2 o')
    assert [['u (2 off)']] == result2
class SM_Autocomplete:
    def __init__(self):
        # One word per line; strip the trailing newline so it does not
        # become part of the indexed word.
        with open("shaker_dictionary.txt", "r", encoding="utf8") as f:
            # with stream(__name__, 'shaker_dictionary.txt') as f:
            words = {line.strip(): {} for line in f.readlines()}
        self.autocomplete = AutoComplete(words=words)

        with open("authors.txt", "r", encoding="utf8") as f:
            # with stream(__name__, 'authors.txt') as f:
            names = {line.strip(): {} for line in f.readlines()}
        self.authors = AutoComplete(words=names)

    def general(self, s):
        return sorted(self.autocomplete.search(word=s, max_cost=3, size=10))

    def author(self, s):
        return sorted(self.authors.search(word=s, max_cost=3, size=10))
def video_loop(self):
    ok, frame = self.vs.read()
    if ok:
        cv2image = cv2.flip(frame, 1)
        x1 = int(0.5 * frame.shape[1])
        y1 = 10
        x2 = frame.shape[1] - 10
        y2 = int(0.5 * frame.shape[1])
        cv2.rectangle(cv2image, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1), (255, 0, 0), 1)
        cv2image = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGBA)
        self.current_image = Image.fromarray(cv2image)
        imgtk = ImageTk.PhotoImage(image=self.current_image)
        self.panel.imgtk = imgtk
        self.panel.config(image=imgtk)
        cv2image = cv2image[y1:y2, x1:x2]
        gray = cv2.cvtColor(cv2image, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (5, 5), 2)
        th3 = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, 11, 2)
        ret, res = cv2.threshold(th3, 70, 255,
                                 cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        self.predict(res)
        # self.current_image2 = Image.fromarray(res)
        # imgtk = ImageTk.PhotoImage(image=self.current_image2)
        # self.panel2.imgtk = imgtk
        # self.panel2.config(image=imgtk)
        self.panel3.config(text=self.current_symbol, font=("Courier", 10))
        self.panel4.config(text=self.word, font=("Courier", 10))
        self.panel5.config(text=self.str, font=("Courier", 10))
        predicts = self.word
        autocomplete = AutoComplete(words=self.words)
        self.a = autocomplete.search(word=predicts, max_cost=2, size=2)
        print("Initial {0}: ,Suggest : {1}".format(predicts, self.a))
        if len(self.a) > 0:
            self.bt1.config(text=self.a[0][0], font=("Courier", 10))
        else:
            self.bt1.config(text="None")
        if len(self.a) > 1:
            self.bt2.config(text=self.a[1][0], font=("Courier", 10))
        else:
            self.bt2.config(text="None")
        if len(self.a) > 2:
            self.bt3.config(text=self.a[2][0], font=("Courier", 10))
        else:
            self.bt3.config(text="None")
    self.root.after(60, self.video_loop)
def test__find_and_sort(self, word, max_cost, size, expected_find_results,
                        expected_steps, expected_find_and_sort_results):
    expected_results = expected_find_and_sort_results
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
    results = auto_complete._find_and_sort(word, max_cost, size)
    results = list(results)
    search_results = auto_complete.search(word, max_cost, size)
    print_results(locals())
    assert expected_results == results
    if word.strip():
        assert expected_results == search_results
    else:
        assert [] == search_results
def test_update_count_of_word(self, word, update_dict, expected_results,
                              expected_new_count):
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS,
                                 full_stop_words=['bmw', 'alfa romeo'])
    if update_dict:
        new_count = auto_complete.update_count_of_word(**update_dict)
        assert expected_new_count == new_count
        assert expected_new_count == auto_complete.get_count_of_word(
            update_dict['word'])
    results = auto_complete.search(word, max_cost=2, size=4)
    print_results(locals())
    assert expected_results == results
# Fragment from a notebook cell: the lines below sit inside the per-row loop of
# get_words(), where `make` has already been read from the same CSV row.
        model = line['model']
        count = line['count']
        if make != model:
            # local_words = [model, '{}{}'.format(make, model)]
            # print(local_words)
            # while local_words:
            #     word = local_words.pop()
            #     if word not in words:
            words['{}{}'.format(make, model)] = {}
            # if make not in words:
            #     words[make] = {}
    return words


synonyms = {
    "alfa romeo 4c coupe": ["the alfa", "hello"],
    "bmw": ["beemer", "bimmer"]
}

words = get_words("autocomp.csv")
autocomplete = AutoComplete(words=words, synonyms=synonyms)
autocomplete.search(word='the ', max_cost=3, size=5)

# %%
autocomplete.update_count_of_word(word='toyota aygo', count=10000)
autocomplete.get_count_of_word('toyota aygo')

# %%
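# A small sketch (hypothetical words and counts) of the count API used in the
# cell above: 'count' can be seeded in the word context at build time,
# update_count_of_word() overwrites it later (as the tests elsewhere in this
# collection assert), and higher counts are expected to rank a word earlier in
# search() results.
from fast_autocomplete import AutoComplete

demo_words = {
    'toyota aygo': {'count': 5},
    'toyota avensis': {'count': 100},
}
demo = AutoComplete(words=demo_words)
demo.update_count_of_word(word='toyota aygo', count=10000)
print(demo.get_count_of_word('toyota aygo'))           # -> 10000
print(demo.search(word='toyota', max_cost=2, size=2))  # 'toyota aygo' expected to rank first now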
def test_autocomplete_synonym_part_of_another_word(self):
    words = {'cartoon': {}, 'vehicle': {}}
    synonyms = {'vehicle': ['car']}
    autocomplete = AutoComplete(words=words, synonyms=synonyms)
    result = autocomplete.search(word='ca')
    assert [['vehicle'], ['cartoon']] == result
class RelatedArticles():
    @staticmethod
    def get_filtered_by_date(articles, distances, days=0, months=0, years=1):
        print(len(articles), len(distances))
        filter_date = (datetime.now() -
                       relativedelta(days=days, months=months, years=years)).date()
        filtered_articles = []
        filtered_distances = []
        for i, article in enumerate(articles):
            if article.publish_date > filter_date:
                filtered_articles.append(article)
                filtered_distances.append(distances[i])
        return filtered_articles, np.array(filtered_distances)

    @staticmethod
    def article2text(article):
        title = CromaGNI.preprocess_aws_data(article['title'])
        text = CromaGNI.preprocess_aws_data(article['text'])
        text = title + '\n' + text
        return text

    @staticmethod
    def doc2tokens(doc):
        tokens = []
        i = 0
        while i < len(doc):
            t = doc[i]
            tx = t.text
            # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
            if t.ent_iob_ == 'O':
                ent_tex = tx
                i += 1
                if (not t.is_space and '@' not in t.text) or '\n' in t.text:  # and t.text != '\n'):
                    if t.is_digit:
                        tokens.append('__DIGIT__')
                    elif '$' in tx:
                        tokens.append('__CURRENCY__')
                    else:
                        tokens.append(ent_tex)
            else:
                ent_tex = ''
                while t.ent_iob_ != 'O':
                    if t.pos_ == 'DET' and t.ent_iob_ == 'B':
                        # It is an article
                        tokens.append(tx)
                    else:
                        ent_tex = ent_tex + ' ' + tx
                    i += 1
                    if i < len(doc):
                        t = doc[i]
                        tx = t.text
                    else:
                        break
                ent_tex = ent_tex.strip().replace(' - ', '-')
                tokens.append(ent_tex)
        return tokens

    def __init__(self, spacy_model_path=None, gensim_model_path=None,
                 faiss_indexes_path=None, faiss_indexes_tfidf_path=None,
                 token2tfidf_path=None):
        self.faiss_indexes_path = faiss_indexes_path
        self.faiss_indexes_tfidf_path = faiss_indexes_tfidf_path
        if spacy_model_path is not None:
            self.nlp = spacy.load(spacy_model_path)
        if gensim_model_path is not None:
            self.w2v_model = KeyedVectors.load(gensim_model_path, mmap='r')
            model_words, self.model_synonyms = self.prepare_autocomplete()
            self.autocomplete_model = AutoComplete(words=model_words)
        if faiss_indexes_path is not None and os.path.exists(faiss_indexes_path):
            self.faiss_indexes = faiss.read_index(faiss_indexes_path)
        else:
            self.faiss_indexes = None
        if faiss_indexes_tfidf_path is not None and os.path.exists(faiss_indexes_tfidf_path):
            self.faiss_indexes_tfidf = faiss.read_index(faiss_indexes_tfidf_path)
        else:
            self.faiss_indexes_tfidf = None
        if token2tfidf_path is not None:
            self.token2tfidf = np.load(token2tfidf_path, allow_pickle=True).item()
        else:
            self.token2tfidf = None

    # def save_training_tokens(self, publication_name, chunk_size=50_000):
    #     dst_folder = f'training_data_{publication_name}_{chunk_size}/'
    #     if not os.path.exists(dst_folder):
    #         os.makedirs(dst_folder)
    #     articles = Article.objects(publication=Publication.objects(name=publication_name).get()).order_by('-publish_date')
    #     N = articles.count()
    #     N_chunks = np.ceil(N / chunk_size)
    #     sentences = []
    #     ids = []
    #     chunk = 0
    #     for i, article in enumerate(articles):
    #         if i % chunk_size == 0 and i != 0:
    #             chunk += 1
    #             file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #             np.save(file_name, sentences)
    #             sentences = []
    #             print()
    #             print(f'{file_name} saved!')
    #             file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #             np.save(file_name_ids, ids)
    #             ids = []
    #         text = RelatedArticles.article2text(article)
    #         print(f'\r{i}/{N}', end=' ')
    #         doc = self.nlp(text)
    #         sentences.append(RelatedArticles.doc2tokens(doc))
    #         ids.append(str(article['id']))
    #     chunk += 1
    #     file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #     np.save(file_name, sentences)
    #     sentences = []
    #     print()
    #     print(f'{file_name} saved!')
    #     file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #     np.save(file_name_ids, ids)
    #     ids = []

    def get_autocomplete_words_list(self, text):
        autocomplets = self.autocomplete_model.search(text, size=10)
        near_words = []
        for word in autocomplets:
            near_words = near_words + self.model_synonyms[word[0]]
        return near_words

    def get_similar(self, word, topn=10):
        words = []
        distances = []
        for word, distance in self.w2v_model.wv.most_similar(word, topn=topn):
            words.append(word)
            distances.append(distance)
        return words, distances

    def get_related_articles(self, article, years=1, months=0, days=0, radius=0.89):
        id_form_article_id = article['faiss_index']
        if id_form_article_id is None:  # Not in faiss db already
            vector = self.article2vect(article)  # np.array([article_to_faiss_vect(article, nlp_custom, w2v_model)])
        else:
            vector = np.array([self.faiss_indexes.index.reconstruct(id_form_article_id)])
        articles, distances = self.get_related_articles_from_vector(
            vector, years=years, months=months, days=days, radius=radius)
        if id_form_article_id is None:
            articles = list(articles)
            articles.insert(0, article)
            distances = list(distances)
            distances.insert(0, 1.0)
        return articles, distances

    def tokens2vect(self, art_arry, tfidf=True):
        if self.token2tfidf is None:
            tfidf = False
        word_vect_dim = self.w2v_model.wv.vector_size
        v = np.zeros(word_vect_dim)
        if tfidf:
            v_tfidf = np.zeros(word_vect_dim)
        for word in art_arry:
            if word in self.w2v_model.wv.vocab:
                if tfidf:
                    wordtfidf = self.token2tfidf.get(word, 0)
                    v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(word) * wordtfidf
                v = v + self.w2v_model.wv.get_vector(word)
            else:
                words = word.split(' ')
                if len(words) > 1:
                    for word in words:
                        if word in self.w2v_model.wv.vocab:
                            v = v + self.w2v_model.wv.get_vector(word)
                            if tfidf:
                                wordtfidf = self.token2tfidf.get(word, 0)
                                v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(word) * wordtfidf
        norm = np.linalg.norm(v)
        if norm == 0:
            v = np.zeros(word_vect_dim)
        else:
            v = v / norm
        if tfidf:
            norm_tfidf = np.linalg.norm(v_tfidf)
            if norm_tfidf == 0:
                v_tfidf = np.zeros(word_vect_dim)
            else:
                v_tfidf = v_tfidf / norm_tfidf
            return v.astype('float32'), v_tfidf.astype('float32')
        else:
            return v.astype('float32')

    def doc2vect(self, doc):
        tokens = RelatedArticles.doc2tokens(doc)
        return self.tokens2vect(tokens)

    def text2doc(self, text):
        return self.nlp(text)

    def text2vect(self, text):
        doc = self.text2doc(text)
        return self.doc2vect(doc)

    def article2vect(self, article):
        text = RelatedArticles.article2text(article)
        return self.text2vect(text)

    def get_related_articles_from_vector(self, vector, radius=0.89, k=None, fr=0,
                                         filter_by_date=True, years=1, months=0, days=0):
        indexes = []
        distances = []
        if type(vector) == tuple:
            # a fix is still needed here for the tfidf case
            vector = vector[0]
        if len(vector.shape) == 1:
            vector = np.array([vector])
        if self.faiss_indexes is not None:
            if k is None:
                """ returns by radius """
                lims, D, I = self.faiss_indexes.range_search(vector, radius)
                j = 0
                distances = D[lims[j]:lims[j + 1]][fr:]
                sorted_idx = np.argsort(distances)[::-1]
                distances = distances[sorted_idx]
                indexes = I[lims[j]:lims[j + 1]][fr:][sorted_idx]
            else:
                """ returns k related """
                D, I = self.faiss_indexes.search(vector, k)
                distances = D[0][fr:]
                indexes = I[0][fr:]
        articles = []
        for idx in indexes:
            art_ = Article.objects(faiss_index=idx).first()
            if art_ is not None:
                articles.append(art_)
        if len(articles) > 0 and filter_by_date:
            articles, distances = RelatedArticles.get_filtered_by_date(
                articles, distances, years=years, months=months, days=days)
        return articles, distances
    # def add_faiss_vectors(self, articles, old_faiss_ids_f, old_faiss_indexes_f,
    #                       old_faiss_indexes_tfidf_f, new_faiss_ids_f,
    #                       new_faiss_indexes_f, new_faiss_indexes_tfidf_f):
    #     if new_faiss_indexes_tfidf_f is not None:
    #         tfidf = True
    #     else:
    #         tfidf = False
    #     # Read faiss indexes and mongoids
    #     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
    #         faiss_articles_ids = []
    #         faiss_index2 = None
    #         faiss_index2_tfidf = None
    #     else:
    #         faiss_articles_ids = np.load(old_faiss_ids_f)
    #         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
    #         faiss_index2_tfidf = faiss.read_index(old_faiss_indexes_tfidf_f)
    #     N_vects = len(articles)
    #     # Get wordvectors
    #     word_vect_dim = self.w2v_model.wv.vector_size
    #     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     if tfidf:
    #         xb_tfidf = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     new_article_ids = []
    #     i = 0
    #     j = 0
    #     while j < N_vects:
    #         article = articles[i]
    #         if str(article.id) not in faiss_articles_ids:
    #             new_article_ids.append(str(article.id))
    #             if tfidf:
    #                 xb[j, :], xb_tfidf[j, :] = self.article2vect(article)
    #             else:
    #                 xb[j, :] = self.article2vect(article)
    #             j += 1
    #         i += 1
    #         print(f'\r{i}, {j} / {N_vects}', end='')
    #     # Update articles ids
    #     all_articles_ids = list(faiss_articles_ids) + new_article_ids
    #     np.save(new_faiss_ids_f, all_articles_ids)
    #     if len(faiss_articles_ids) == 0:
    #         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
    #     else:
    #         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal
    #     if len(faiss_articles_ids) == 0:
    #         index = faiss.IndexFlatIP(word_vect_dim)
    #         faiss_index2 = faiss.IndexIDMap(index)
    #         if tfidf:
    #             index_tfidf = faiss.IndexFlatIP(word_vect_dim)
    #             faiss_index2_tfidf = faiss.IndexIDMap(index_tfidf)
    #     faiss_index2.add_with_ids(xb, ids)
    #     faiss.write_index(faiss_index2, new_faiss_indexes_f)
    #     if tfidf:
    #         faiss_index2_tfidf.add_with_ids(xb_tfidf, ids)
    #         faiss.write_index(faiss_index2_tfidf, new_faiss_indexes_tfidf_f)

    def prepare_autocomplete(self):
        words = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower in words:
                if g.count > words[lower]['count']:
                    words[lower] = {'count': g.count}
            else:
                words[lower] = {'count': g.count}
        synonyms = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower not in synonyms:
                synonyms[lower] = []
            synonyms[lower].append(word)
        return words, synonyms

    def add_faiss_vectors(self, articles, tfidf=True):
        total_vectors = 0
        vector_size = self.w2v_model.wv.vector_size
        if self.faiss_indexes is None:
            index = faiss.IndexFlatIP(vector_size)
            self.faiss_indexes = faiss.IndexIDMap(index)
            if tfidf:
                index_tfidf = faiss.IndexFlatIP(vector_size)
                self.faiss_indexes_tfidf = faiss.IndexIDMap(index_tfidf)
        total_vectors = self.faiss_indexes.ntotal
        total_vectors_tfidf = self.faiss_indexes_tfidf.ntotal
        xb = []
        xb_tfidf = []
        faiss_count = total_vectors
        faiss_count_tfidf = total_vectors_tfidf
        ids = []
        ids_tfidf = []
        for article in articles:
            if article['faiss_index'] is None:
                vect, vect_tfidf = self.article2vect(article)
                xb.append(vect)
                xb_tfidf.append(vect_tfidf)
                article['faiss_index'] = faiss_count
                article['faiss_index_tfidf'] = faiss_count_tfidf
                ids.append(faiss_count)
                ids_tfidf.append(faiss_count_tfidf)
                article.save()
                faiss_count = faiss_count + 1
                faiss_count_tfidf = faiss_count_tfidf + 1
        if len(ids) == 0:
            # nothing was added because everything was already there
            return total_vectors, len(ids)
        xb = np.array(xb, dtype='float32')
        xb_tfidf = np.array(xb_tfidf, dtype='float32')
        ids = np.array(ids, dtype='int64')
        ids_tfidf = np.array(ids_tfidf, dtype='int64')
        self.faiss_indexes.add_with_ids(xb, ids)
        faiss.write_index(self.faiss_indexes, self.faiss_indexes_path)
        self.faiss_indexes_tfidf.add_with_ids(xb_tfidf, ids_tfidf)
        faiss.write_index(self.faiss_indexes_tfidf, self.faiss_indexes_tfidf_path)
        return total_vectors, len(ids)


# def article_to_faiss_vect(article, nlp, w2v_model):
#     # article2vect
#     title = CromaGNI.preprocess_aws_data(article['title'])
#     text = CromaGNI.preprocess_aws_data(article['text'])
#     text = title + '\n' + text
#     doc = nlp(text)
#     return get_sentence_vect(doc, w2v_model)

# def get_related_aticles(vector, faiss_indexes, faiss_article_ids, Article, radius=0.89,
#                         k=None, fr=0, filter_by_date=True, years=1, months=0, days=0):
#     if k is None:
#         lims, D, I = faiss_indexes.range_search(vector, radius)
#         j = 0
#         distances = D[lims[j]:lims[j+1]][fr:]
#         sorted_idx = np.argsort(distances)[::-1]
#         distances = distances[sorted_idx]
#         indexes = I[lims[j]:lims[j+1]][fr:][sorted_idx]
#     else:
#         D, I = faiss_indexes.search(vector, k)
#         distances = D[0][fr:]
#         indexes = I[0][fr:]
#     articles = []
#     for idx in indexes:
#         articles.append(Article.objects(id=faiss_article_ids[idx]).first())
#     if filter_by_date:
#         articles, distances = get_filtered_by_date(articles, distances, years=years, months=months, days=days)
#     return articles, distances

# def add_faiss_vectors(old_faiss_ids_f, old_faiss_indexes_f, new_faiss_ids_f, new_faiss_indexes_f,
#                       articles, w2v_model, nlp_ner, N_vects=10000):
#     # Read faiss indexes and mongoids
#     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
#         faiss_articles_ids = []
#         faiss_index2 = None
#     else:
#         faiss_articles_ids = np.load(old_faiss_ids_f)
#         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
#     # Get wordvectors
#     word_vect_dim = w2v_model.wv.vector_size
#     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
#     new_article_ids = []
#     i = 0
#     j = 0
#     while j < N_vects:
#         article = articles[i]
#         if str(article.id) not in faiss_articles_ids:
#             new_article_ids.append(str(article.id))
#             xb[j, :] = article_to_faiss_vect(article, nlp_ner, w2v_model)
#             j += 1
#         i += 1
#         print(f'\r{i}, {j}', end='')
#     # Update articles ids
#     all_articles_ids = list(faiss_articles_ids) + new_article_ids
#     np.save(new_faiss_ids_f, all_articles_ids)
#     if len(faiss_articles_ids) == 0:
#         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
#     else:
#         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal
#     if len(faiss_articles_ids) == 0:
#         index = faiss.IndexFlatIP(word_vect_dim)
#         faiss_index2 = faiss.IndexIDMap(index)
#     faiss_index2.add_with_ids(xb, ids)
#     faiss.write_index(faiss_index2, new_faiss_indexes_f)

# def array_to_sentence_vect(art_arry, w2v_model):
#     word_vect_dim = w2v_model.wv.vector_size
#     v = np.zeros(word_vect_dim)
#     for word in art_arry:
#         if word in w2v_model.wv.vocab:
#             v = v + w2v_model.wv.get_vector(word)
#         else:
#             words = word.split(' ')
#             if len(words) > 1:
#                 for word in words:
#                     if word in w2v_model.wv.vocab:
#                         v = v + w2v_model.wv.get_vector(word)
#     norm = np.linalg.norm(v)
#     if norm == 0:
#         return np.zeros(word_vect_dim)
#     else:
#         return v / np.linalg.norm(v)

# def word2vect_encode(doc):
#     tokens = []
#     i = 0
#     while i < len(doc):
#         t = doc[i]
#         tx = t.text
#         # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
#         if t.ent_iob_ == 'O':
#             ent_tex = tx
#             i += 1
#             if (not t.is_space and '@' not in t.text):
#                 if t.is_digit:
#                     tokens.append('__DIGIT__')
#                 elif '$' in tx:
#                     tokens.append('__CURRENCY__')
#                 else:
#                     tokens.append(ent_tex)
#         else:
#             ent_tex = ''
#             while t.ent_iob_ != 'O':
#                 if t.pos_ == 'DET' and t.ent_iob_ == 'B':
#                     # It is an article
#                     tokens.append(tx)
#                 else:
#                     ent_tex = ent_tex + ' ' + tx
#                 i += 1
#                 if i < len(doc):
#                     t = doc[i]
#                     tx = t.text
#                 else:
#                     break
#             ent_tex = ent_tex.strip().replace(' - ', '-')
#             tokens.append(ent_tex)
#     return tokens

# def get_sentence_vect(doc, w2v_model):
#     tokens = word2vect_encode(doc)
#     return array_to_sentence_vect(tokens, w2v_model).astype('float32')
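# A stubbed sketch of the autocomplete round trip used by RelatedArticles:
# prepare_autocomplete() indexes lowercased vocabulary words (keeping the
# highest count per spelling), model_synonyms maps each lowercased form back to
# the original-cased tokens, and get_autocomplete_words_list() expands the top
# matches through that map. The words and synonyms below are hypothetical
# stand-ins for a real gensim vocabulary.
from fast_autocomplete import AutoComplete

model_words = {'toyota': {'count': 120}, 'new york': {'count': 300}}
model_synonyms = {'toyota': ['Toyota'], 'new york': ['New York', 'NEW YORK']}

autocomplete_model = AutoComplete(words=model_words)
near_words = []
for match in autocomplete_model.search('new', size=10):
    near_words += model_synonyms[match[0]]
print(near_words)  # expected: the original-cased forms, e.g. ['New York', 'NEW YORK']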
def execute_algorithm(self, data: dict, current_string: str):
    print(data)
    autocomplete = AutoComplete(words=data)
    print(autocomplete)
    # word -> what to search by, max_cost -> maximum edit distance allowed for
    # fuzzy matches, size -> number of results to propagate back
    return autocomplete.search(word=current_string, max_cost=3, size=3)
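# A minimal sketch of what the method above is expected to return for a
# hypothetical `data` dict: search() yields at most `size` result groups as a
# list of lists of words, and max_cost bounds the edit distance used for fuzzy
# matching. Exact grouping and order depend on the library's ranking.
from fast_autocomplete import AutoComplete

data = {'berlin': {}, 'bern': {}, 'barcelona': {}}
completer = AutoComplete(words=data)
print(completer.search(word='ber', max_cost=3, size=3))  # e.g. [['bern'], ['berlin']]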