def test_get_word_context(self, word, expected_results):
    auto_complete = AutoComplete(
        words=WIKIPEDIA_WORDS,
        synonyms=SYNONYMS,
        full_stop_words=['bmw', 'alfa romeo'])
    results = auto_complete.get_word_context(word)
    print_results(locals())
    assert expected_results == results
def test_search_without_synonyms(self, word, max_cost, size, expected_results):
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS)
    results, find_steps = auto_complete._find(word, max_cost, size)
    results = dict(results)
    print_results(locals())
    assert expected_results == results
def test_get_all_descendent_words_for_condition(self, word, expected_results):
    auto_complete = AutoComplete(
        words=WIKIPEDIA_WORDS,
        synonyms=SYNONYMS,
        full_stop_words=['bmw', 'alfa romeo'])
    results = auto_complete.get_tokens_flat_list(word, max_cost=0, size=3)
    print_results(locals())
    assert expected_results == results
def test_find(self, word, max_cost, size, expected_find_results,
              expected_steps, expected_find_and_sort_results):
    expected_results = expected_find_results
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
    results, find_steps = auto_complete._find(word, max_cost, size)
    results = dict(results)
    print_results(locals())
    assert expected_results == results
    assert expected_steps == find_steps
def test_search_unicode_without_synonyms(self, word, max_cost, size, expected_results):
    auto_complete = AutoComplete(
        words=SHORT_WORDS_UNICODE,
        valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی')
    results, find_steps = auto_complete._find(word, max_cost, size)
    results = dict(results)
    print_results(locals())
    assert expected_results == results
def test_special_characters(self):
    words = {'abcd(efgh)ijk': {}, 'u (2 off)': {}}
    autocomplete = AutoComplete(
        words=words,
        valid_chars_for_string=string.ascii_letters + string.punctuation)
    # result = autocomplete.search(word='abcd(efgh)')
    # assert [['abcd(efgh)ijk']] == result
    result2 = autocomplete.search(word='u (2 o')
    assert [['u (2 off)']] == result2
def test_get_descendants_nodes(self, word, expected_results):
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
    matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(word)
    size = 2
    found_words_gen = node.get_descendants_nodes(size=size)
    found_words = [_node.word for _node in found_words_gen][:size + 1]
    print(f'word: {word}')
    print(f'expected_results: {expected_results}')
    print(f'found_words: {found_words}')
    assert expected_results == list(found_words)
def test_get_all_descendent_words_for_condition1(self, word, expected_results):
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)

    def condition(word_info):
        return 'model' in word_info

    size = 10
    results = auto_complete.get_all_descendent_words_for_condition(
        word=word, size=size, condition=condition)
    print_results(locals())
def video_loop(self):
    ok, frame = self.vs.read()
    if ok:
        cv2image = cv2.flip(frame, 1)
        x1 = int(0.5 * frame.shape[1])
        y1 = 10
        x2 = frame.shape[1] - 10
        y2 = int(0.5 * frame.shape[1])
        cv2.rectangle(cv2image, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1), (255, 0, 0), 1)
        cv2image = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGBA)
        self.current_image = Image.fromarray(cv2image)
        imgtk = ImageTk.PhotoImage(image=self.current_image)
        self.panel.imgtk = imgtk
        self.panel.config(image=imgtk)
        # Crop the region of interest and binarize it before prediction.
        cv2image = cv2image[y1:y2, x1:x2]
        gray = cv2.cvtColor(cv2image, cv2.COLOR_BGR2GRAY)
        blur = cv2.GaussianBlur(gray, (5, 5), 2)
        th3 = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY_INV, 11, 2)
        ret, res = cv2.threshold(th3, 70, 255,
                                 cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        self.predict(res)
        # self.current_image2 = Image.fromarray(res)
        # imgtk = ImageTk.PhotoImage(image=self.current_image2)
        # self.panel2.imgtk = imgtk
        # self.panel2.config(image=imgtk)
        self.panel3.config(text=self.current_symbol, font=("Courier", 10))
        self.panel4.config(text=self.word, font=("Courier", 10))
        self.panel5.config(text=self.str, font=("Courier", 10))
        predicts = self.word
        # Note: building the AutoComplete index on every frame is expensive;
        # it could be constructed once in __init__ and reused here.
        autocomplete = AutoComplete(words=self.words)
        self.a = autocomplete.search(word=predicts, max_cost=2, size=2)
        print("Initial: {0}, Suggest: {1}".format(predicts, self.a))
        if len(self.a) > 0:
            self.bt1.config(text=self.a[0][0], font=("Courier", 10))
        else:
            self.bt1.config(text="None")
        if len(self.a) > 1:
            self.bt2.config(text=self.a[1][0], font=("Courier", 10))
        else:
            self.bt2.config(text="None")
        if len(self.a) > 2:
            self.bt3.config(text=self.a[2][0], font=("Courier", 10))
        else:
            self.bt3.config(text="None")
    self.root.after(60, self.video_loop)
def __init__(self): with open("shaker_dictionary.txt", "r", encoding="utf8") as f: #with stream(__name__, 'shaker_dictionary.txt') as f: words = f.readlines() words = dict(zip(words, [dict()] * len(words))) self.autocomplete = AutoComplete(words=words) with open("authors.txt", "r", encoding="utf8") as f: #with stream(__name__, 'authors.txt') as f: names = f.readlines() names = dict(zip(names, [dict()] * len(names))) self.authors = AutoComplete(words=names)
def test_update_count_of_word(self, word, update_dict, expected_results, expected_new_count):
    auto_complete = AutoComplete(
        words=WIKIPEDIA_WORDS,
        synonyms=SYNONYMS,
        full_stop_words=['bmw', 'alfa romeo'])
    if update_dict:
        new_count = auto_complete.update_count_of_word(**update_dict)
        assert expected_new_count == new_count
        assert expected_new_count == auto_complete.get_count_of_word(update_dict['word'])
    results = auto_complete.search(word, max_cost=2, size=4)
    print_results(locals())
    assert expected_results == results
def test__find_and_sort(self, word, max_cost, size, expected_find_results,
                        expected_steps, expected_find_and_sort_results):
    expected_results = expected_find_and_sort_results
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
    results = auto_complete._find_and_sort(word, max_cost, size)
    results = list(results)
    search_results = auto_complete.search(word, max_cost, size)
    print_results(locals())
    assert expected_results == results
    if word.strip():
        assert expected_results == search_results
    else:
        assert [] == search_results
def autocomplete():
    search = request.args.get('q')
    print(request.args.get('term'))
    print('search is ---------')
    print(str(search))
    autocomplete = AutoComplete(words=autocmplete_label_dict)
    print(autocomplete.search(word=str(search), max_cost=3, size=3))
    t = autocomplete.search(word=str(search), max_cost=3, size=6)
    # search() returns a list of lists, so flatten it into a single list of labels.
    flatten = [item for sublist in t for item in sublist]
    print(flatten)
    # results = autocomplete.search(word=str(search), max_cost=3, size=3)  # class_labels
    # ['Beer', 'Wine', 'Soda', 'Juice', 'Water']
    results = flatten
    print(results)
    return jsonify(matching_results=results)
class SM_Autocomplete:
    def __init__(self):
        with open("shaker_dictionary.txt", "r", encoding="utf8") as f:
            # with stream(__name__, 'shaker_dictionary.txt') as f:
            # Strip trailing newlines so the stored words match what users type,
            # and give each word its own (empty) context dict.
            words = [line.strip() for line in f.readlines()]
        words = {word: {} for word in words}
        self.autocomplete = AutoComplete(words=words)
        with open("authors.txt", "r", encoding="utf8") as f:
            # with stream(__name__, 'authors.txt') as f:
            names = [line.strip() for line in f.readlines()]
        names = {name: {} for name in names}
        self.authors = AutoComplete(words=names)

    def general(self, s):
        return sorted(self.autocomplete.search(word=s, max_cost=3, size=10))

    def author(self, s):
        return sorted(self.authors.search(word=s, max_cost=3, size=10))
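# A minimal usage sketch of the class above (not part of the original module);
# it assumes shaker_dictionary.txt and authors.txt sit next to the script, and
# the query strings are made up for illustration.
if __name__ == '__main__':
    completer = SM_Autocomplete()
    print(completer.general('lov'))   # dictionary-word suggestions
    print(completer.author('shak'))   # author-name suggestions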
def test_prefix_autofill(self, word, expected_matched_prefix_of_last_word,
                         expected_rest_of_word, expected_matched_words,
                         expected_node_path):
    auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
    matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(word)
    print(f'word: {word}')
    print(f'expected_matched_prefix_of_last_word: {expected_matched_prefix_of_last_word}')
    print(f'matched_prefix_of_last_word: {matched_prefix_of_last_word}')
    print(f'expected_rest_of_word: {expected_rest_of_word}')
    print(f'rest_of_word: {rest_of_word}')
    print(f'node: {node}')
    print(f'expected_matched_words: {expected_matched_words}')
    print(f'matched_words: {matched_words}')
    expected_node = auto_complete._dwg
    for k in expected_node_path.split(','):
        expected_node = expected_node[k]
    assert expected_node is node
    assert expected_matched_prefix_of_last_word == matched_prefix_of_last_word
    assert expected_rest_of_word == rest_of_word
    assert expected_matched_words == matched_words
def test_immutable_info(self, word):
    auto_complete = AutoComplete(words=SHORT_WORDS, synonyms=SYNONYMS)
    auto_complete_immutable = AutoComplete(
        words=SHORT_WORDS_IMMUTABLE_INFO, synonyms=SYNONYMS)
    search_results = auto_complete._find(word, max_cost=3, size=3)
    search_results_immutable = auto_complete_immutable._find(word, max_cost=3, size=3)
    print_results(locals())
    assert search_results_immutable == search_results
def load_autocomplete_trie(self):
    directory = glob('bigData/*')
    words = set()
    for file_name in directory:
        if file_name == 'bigData/bigtext.txt':
            continue
        with open(file_name, 'r') as file:
            for line in file:
                line = line.strip()
                words.add(line)
    words = {word: {} for word in words}
    self.Trie = AutoComplete(words=words)
def _build_search_dictionary(self):
    '''Takes a JournalFile and returns an AutoComplete object.'''
    words = {}
    synonyms = {}
    del self._vardict['sdict']
    for count, entry in enumerate(self._vardict['journal_file']):
        title = entry['Title']
        sub_title = entry['subTitle']
        subsub_title = entry['subsubTitle']
        words[title.lower()] = {'Title': title, 'Index': count}
        synonyms[title] = [sub_title.lower(), subsub_title.lower()]
        for abbrev in entry['Abbreviations']:
            if abbrev:
                synonyms[title].append(str(abbrev.lower()))
    self._vardict['sdict'] = AutoComplete(words=words, synonyms=synonyms)
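# A minimal lookup sketch (an assumption, not code from the original project):
# the per-word context dict stored above can be recovered after a search via
# get_word_context, e.g. to map a matched title back to its journal index.
def _example_lookup_index(sdict, prefix):
    # sdict is the AutoComplete built in _build_search_dictionary;
    # prefix is a hypothetical user-typed query.
    for group in sdict.search(word=prefix, max_cost=2, size=5):
        context = sdict.get_word_context(group[-1])  # e.g. {'Title': ..., 'Index': ...}
        print(context['Index'], context['Title'])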
def getAutocompleteEntries() -> dict:
    words = {}
    logger.info("Starting autocomplete words initialisation...")
    with gzip.open('wikidump/enwiki-20210820-abstract.xml.gz', 'rb') as f:
        doc_id = 1
        for _, element in etree.iterparse(f, events=('end',), tag='doc'):
            if doc_id % 78 == 1:
                title = element.findtext('./title')
                logger.debug("Creating autocomplete word entry for [id={0}] [title={1}]".format(doc_id, title))
                # Abstract-dump titles are prefixed with "Wikipedia: ", so drop the first 11 characters.
                words[title[11:]] = {}
            doc_id += 1
            element.clear()
    logger.info("Finished autocomplete words initialisation")
    return words


autocomplete = AutoComplete(words=getAutocompleteEntries())


class ErrorHandlerWrapper:

    def __call__(self, func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            self.client_addr = request.remote_addr
            try:
                with self:
                    return func(*args, **kwargs)
            except SSLError as e:
                logger.error("An issue with SSL occurred", e)
                response = {
                    "status_code": e.status_code,
                    "status": "An issue with SSL occurred: {}".format(e)
        model = line['model']
        count = line['count']
        if make != model:
            # local_words = [model, '{}{}'.format(make, model)]
            # print(local_words)
            # while local_words:
            #     word = local_words.pop()
            #     if word not in words:
            words['{}{}'.format(make, model)] = {}
            # if make not in words:
            #     words[make] = {}
    return words


synonyms = {
    "alfa romeo 4c coupe": ["the alfa", "hello"],
    "bmw": ["beemer", "bimmer"]
}

words = get_words("autocomp.csv")
autocomplete = AutoComplete(words=words, synonyms=synonyms)
autocomplete.search(word='the ', max_cost=3, size=5)

# %%
autocomplete.update_count_of_word(word='toyota aygo', count=10000)
autocomplete.get_count_of_word('toyota aygo')
# %%
# %%
import os
import tkinter as tk
from fast_autocomplete import AutoComplete
from scanners.smmx import scan
from pathlib import Path
import subprocess
import platform
from pynput import keyboard
import threading
import copy

rootdir = 'C:/Users/xxx/Dropbox/SimpleMind'
words, paths = scan(rootdir)
words_copy = copy.deepcopy(words)
autocomplete = AutoComplete(words=words)
contexts = []
selected = None


def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def on_keyrelease(event):
    # get text from entry
    value = event.widget.get()
    value = value.strip().lower()
def _init_autocompleter(self, df: Card_DF):
    card_names = self._get_words(df)
    card_names_to_dicts = {word: {} for word in card_names}
    completer = AutoComplete(words=card_names_to_dicts)
    return completer
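# A minimal, self-contained sketch with made-up card names (not from the
# original code), showing the shape of the words mapping _init_autocompleter
# builds and how the returned completer is queried; it assumes
# `from fast_autocomplete import AutoComplete` as elsewhere in this collection.
_example_completer = AutoComplete(words={'lightning bolt': {}, 'llanowar elves': {}})
print(_example_completer.search(word='lightning bo', max_cost=2, size=5))  # e.g. [['lightning bolt']]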
parser.add_argument('--disallowed-rels',
                    help='List of semicolon-separated relations that are disallowed',
                    default='')
parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
args = parser.parse_args()

if args.verbose:
    logging.basicConfig(level=logging.DEBUG)
logging.debug('Args: %s', args)

logging.info('Loading autocomplete vocabulary...')
words_en = {}
vocab_filepath = '/data/zeste_cache/vocab.txt'
if os.path.exists(vocab_filepath):
    with open(vocab_filepath, 'r') as vocab_file:
        for line in vocab_file:
            words_en[line.strip()] = {}
autocomplete_en = AutoComplete(words=words_en)

words_fr = {}
vocab_filepath = '/data/zeste_cache/vocab_fr.txt'
if os.path.exists(vocab_filepath):
    with open(vocab_filepath, 'r') as vocab_file:
        for line in vocab_file:
            words_fr[line.strip()] = {}
autocomplete_fr = AutoComplete(words=words_fr)

logging.info('Starting web server...')
app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_port=1)
api = Api(app, doc='/doc')
cors = CORS(app)
def execute_algorithm(self, data: dict, current_string: str):
    print(data)
    autocomplete = AutoComplete(words=data)
    print(autocomplete)
    # word -> what to search by, max_cost -> maximum fuzzy-match cost allowed,
    # size -> number of results to propagate back
    return autocomplete.search(word=current_string, max_cost=3, size=3)
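# A hypothetical call of the method above (made-up data, not from the original
# code): `data` maps each suggestible word to its (possibly empty) context dict,
# and the nested-list result groups the matched words per suggestion.
# self.execute_algorithm({'toyota corolla': {}, 'toyota camry': {}}, 'toyo')
# -> e.g. [['toyota camry'], ['toyota corolla']]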
@api_view(['GET'])
# @permission_classes([IsAuthenticated])
def remove_authors(request, *args, **kwargs):
    spp = kwargs.get('taxon').split(" ")
    not_words = []
    for i, j in enumerate(spp, start=0):
        word = Words.objects.filter(word=spp[i])
        if not word:
            not_words.append(spp[i])
    spp = [x for x in spp if x not in not_words]
    return Response({'taxon': ' '.join(spp)}, status=status.HTTP_200_OK)


with open('data.pickle', 'rb') as f:
    names = pickle.load(f)
autocomplete_names = AutoComplete(words=names)


def autocomplete(request):
    if request.GET.get('term'):
        q = request.GET['term']
        data = autocomplete_names.search(word=q, max_cost=3, size=3)
        data = [s[0] for s in data if len(s[0]) >= len(q)]
        return HttpResponse(json.dumps([x.capitalize() for x in data]), 'application/json')
    else:
        return render(request, 'api/autocomplete.html')


@api_view(['GET'])
# @permission_classes([IsAuthenticated])
def synonyms(request, *args, **kwargs):
    spp = Details.objects.filter(search_str=kwargs.get('search_str').capitalize()).first()
    if spp == None:
class RelatedArticles():

    @staticmethod
    def get_filtered_by_date(articles, distances, days=0, months=0, years=1):
        print(len(articles), len(distances))
        filter_date = (datetime.now() - relativedelta(days=days, months=months, years=years)).date()
        filtered_articles = []
        filtered_distances = []
        for i, article in enumerate(articles):
            if article.publish_date > filter_date:
                filtered_articles.append(article)
                filtered_distances.append(distances[i])
        return filtered_articles, np.array(filtered_distances)

    @staticmethod
    def article2text(article):
        title = CromaGNI.preprocess_aws_data(article['title'])
        text = CromaGNI.preprocess_aws_data(article['text'])
        text = title + '\n' + text
        return text

    @staticmethod
    def doc2tokens(doc):
        tokens = []
        i = 0
        while i < len(doc):
            t = doc[i]
            tx = t.text
            # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
            if t.ent_iob_ == 'O':
                ent_tex = tx
                i += 1
                if (not t.is_space and '@' not in t.text) or '\n' in t.text:  # and t.text != '\n'):
                    if t.is_digit:
                        tokens.append('__DIGIT__')
                    elif '$' in tx:
                        tokens.append('__CURRENCY__')
                    else:
                        tokens.append(ent_tex)
            else:
                ent_tex = ''
                while t.ent_iob_ != 'O':
                    if t.pos_ == 'DET' and t.ent_iob_ == 'B':
                        # It is an article
                        tokens.append(tx)
                    else:
                        ent_tex = ent_tex + ' ' + tx
                    i += 1
                    if i < len(doc):
                        t = doc[i]
                        tx = t.text
                    else:
                        break
                ent_tex = ent_tex.strip().replace(' - ', '-')
                tokens.append(ent_tex)
        return tokens

    def __init__(self, spacy_model_path=None, gensim_model_path=None,
                 faiss_indexes_path=None, faiss_indexes_tfidf_path=None,
                 token2tfidf_path=None):
        self.faiss_indexes_path = faiss_indexes_path
        self.faiss_indexes_tfidf_path = faiss_indexes_tfidf_path
        if spacy_model_path is not None:
            self.nlp = spacy.load(spacy_model_path)
        if gensim_model_path is not None:
            self.w2v_model = KeyedVectors.load(gensim_model_path, mmap='r')
            model_words, self.model_synonyms = self.prepare_autocomplete()
            self.autocomplete_model = AutoComplete(words=model_words)
        if faiss_indexes_path is not None and os.path.exists(faiss_indexes_path):
            self.faiss_indexes = faiss.read_index(faiss_indexes_path)
        else:
            self.faiss_indexes = None
        if faiss_indexes_tfidf_path is not None and os.path.exists(faiss_indexes_tfidf_path):
            self.faiss_indexes_tfidf = faiss.read_index(faiss_indexes_tfidf_path)
        else:
            self.faiss_indexes_tfidf = None
        if token2tfidf_path is not None:
            self.token2tfidf = np.load(token2tfidf_path, allow_pickle=True).item()
        else:
            self.token2tfidf = None

    # def save_training_tokens(self, publication_name, chunk_size=50_000):
    #     dst_folder = f'training_data_{publication_name}_{chunk_size}/'
    #     if not os.path.exists(dst_folder):
    #         os.makedirs(dst_folder)
    #     articles = Article.objects(publication=Publication.objects(name=publication_name).get()).order_by('-publish_date')
    #     N = articles.count()
    #     N_chunks = np.ceil(N/chunk_size)
    #     sentences = []
    #     ids = []
    #     chunk = 0
    #     for i, article in enumerate(articles):
    #         if i % chunk_size == 0 and i != 0:
    #             chunk += 1
    #             file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #             np.save(file_name, sentences)
    #             sentences = []
    #             print()
    #             print(f'{file_name} saved!')
    #             file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #             np.save(file_name_ids, ids)
    #             ids = []
    #         text = RelatedArticles.article2text(article)
    #         print(f'\r{i}/{N}', end=' ')
    #         doc = self.nlp(text)
    #         sentences.append(RelatedArticles.doc2tokens(doc))
    #         ids.append(str(article['id']))
    #     chunk += 1
    #     file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #     np.save(file_name, sentences)
    #     sentences = []
    #     print()
    #     print(f'{file_name} saved!')
    #     file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #     np.save(file_name_ids, ids)
    #     ids = []

    def get_autocomplete_words_list(self, text):
        autocomplets = self.autocomplete_model.search(text, size=10)
        near_words = []
        for word in autocomplets:
            near_words = near_words + self.model_synonyms[word[0]]
        return near_words

    def get_similar(self, word, topn=10):
        words = []
        distances = []
        for word, distance in self.w2v_model.wv.most_similar(word, topn=topn):
            words.append(word)
            distances.append(distance)
        return words, distances

    def get_related_articles(self, article, years=1, months=0, days=0, radius=0.89):
        id_form_article_id = article['faiss_index']
        if id_form_article_id is None:
            # Not in faiss db already
            vector = self.article2vect(article)  # np.array([article_to_faiss_vect(article, nlp_custom, w2v_model)])
        else:
            vector = np.array([self.faiss_indexes.index.reconstruct(id_form_article_id)])
        articles, distances = self.get_related_articles_from_vector(
            vector, years=years, months=months, days=days, radius=radius)
        if id_form_article_id is None:
            articles = list(articles)
            articles.insert(0, article)
            distances = list(distances)
            distances.insert(0, 1.0)
        return articles, distances

    def tokens2vect(self, art_arry, tfidf=True):
        if self.token2tfidf is None:
            tfidf = False
        word_vect_dim = self.w2v_model.wv.vector_size
        v = np.zeros(word_vect_dim)
        if tfidf:
            v_tfidf = np.zeros(word_vect_dim)
        for word in art_arry:
            if word in self.w2v_model.wv.vocab:
                if tfidf:
                    wordtfidf = self.token2tfidf.get(word, 0)
                    v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(word) * wordtfidf
                v = v + self.w2v_model.wv.get_vector(word)
            else:
                words = word.split(' ')
                if len(words) > 1:
                    for word in words:
                        if word in self.w2v_model.wv.vocab:
                            v = v + self.w2v_model.wv.get_vector(word)
                            if tfidf:
                                wordtfidf = self.token2tfidf.get(word, 0)
                                v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(word) * wordtfidf
        norm = np.linalg.norm(v)
        if norm == 0:
            v = np.zeros(word_vect_dim)
        else:
            v = v / norm
        if tfidf:
            norm_tfidf = np.linalg.norm(v_tfidf)
            if norm_tfidf == 0:
                v_tfidf = np.zeros(word_vect_dim)
            else:
                v_tfidf = v_tfidf / norm_tfidf
            return v.astype('float32'), v_tfidf.astype('float32')
        else:
            return v.astype('float32')

    def doc2vect(self, doc):
        tokens = RelatedArticles.doc2tokens(doc)
        return self.tokens2vect(tokens)

    def text2doc(self, text):
        return self.nlp(text)

    def text2vect(self, text):
        doc = self.text2doc(text)
        return self.doc2vect(doc)

    def article2vect(self, article):
        text = RelatedArticles.article2text(article)
        return self.text2vect(text)

    def get_related_articles_from_vector(self, vector, radius=0.89, k=None, fr=0,
                                         filter_by_date=True, years=1, months=0, days=0):
        indexes = []
        distances = []
        if type(vector) == tuple:
            # A fix for tfidf is still needed here
            vector = vector[0]
        if len(vector.shape) == 1:
            vector = np.array([vector])
        if self.faiss_indexes is not None:
            if k is None:
                """ returns by radius """
                lims, D, I = self.faiss_indexes.range_search(vector, radius)
                j = 0
                distances = D[lims[j]:lims[j + 1]][fr:]
                sorted_idx = np.argsort(distances)[::-1]
                distances = distances[sorted_idx]
                indexes = I[lims[j]:lims[j + 1]][fr:][sorted_idx]
            else:
                """ returns k related """
                D, I = self.faiss_indexes.search(vector, k)
                distances = D[0][fr:]
                indexes = I[0][fr:]
        articles = []
        for idx in indexes:
            art_ = Article.objects(faiss_index=idx).first()
            if art_ is not None:
                articles.append(art_)
        if len(articles) > 0 and filter_by_date:
            articles, distances = RelatedArticles.get_filtered_by_date(
                articles, distances, years=years, months=months, days=days)
        return articles, distances
    # def add_faiss_vectors(self, articles, old_faiss_ids_f, old_faiss_indexes_f, old_faiss_indexes_tfidf_f,
    #                       new_faiss_ids_f, new_faiss_indexes_f, new_faiss_indexes_tfidf_f):
    #     if new_faiss_indexes_tfidf_f is not None:
    #         tfidf = True
    #     else:
    #         tfidf = False
    #     # Read faiss indexes and mongoids
    #     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
    #         faiss_articles_ids = []
    #         faiss_index2 = None
    #         faiss_index2_tfidf = None
    #     else:
    #         faiss_articles_ids = np.load(old_faiss_ids_f)
    #         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
    #         faiss_index2_tfidf = faiss.read_index(old_faiss_indexes_tfidf_f)
    #     N_vects = len(articles)
    #     # Get wordvectors
    #     word_vect_dim = self.w2v_model.wv.vector_size
    #     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     if tfidf:
    #         xb_tfidf = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     new_article_ids = []
    #     i = 0
    #     j = 0
    #     while j < N_vects:
    #         article = articles[i]
    #         if str(article.id) not in faiss_articles_ids:
    #             new_article_ids.append(str(article.id))
    #             if tfidf:
    #                 xb[j, :], xb_tfidf[j, :] = self.article2vect(article)
    #             else:
    #                 xb[j, :] = self.article2vect(article)
    #             j += 1
    #         i += 1
    #         print(f'\r{i}, {j} / {N_vects}', end='')
    #     # Update articles ids
    #     all_articles_ids = list(faiss_articles_ids) + new_article_ids
    #     np.save(new_faiss_ids_f, all_articles_ids)
    #     if len(faiss_articles_ids) == 0:
    #         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
    #     else:
    #         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal
    #     if len(faiss_articles_ids) == 0:
    #         index = faiss.IndexFlatIP(word_vect_dim)
    #         faiss_index2 = faiss.IndexIDMap(index)
    #         if tfidf:
    #             index_tfidf = faiss.IndexFlatIP(word_vect_dim)
    #             faiss_index2_tfidf = faiss.IndexIDMap(index_tfidf)
    #     faiss_index2.add_with_ids(xb, ids)
    #     faiss.write_index(faiss_index2, new_faiss_indexes_f)
    #     if tfidf:
    #         faiss_index2_tfidf.add_with_ids(xb_tfidf, ids)
    #         faiss.write_index(faiss_index2_tfidf, new_faiss_indexes_tfidf_f)

    def prepare_autocomplete(self):
        words = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower in words:
                if g.count > words[lower]['count']:
                    words[lower] = {'count': g.count}
            else:
                words[lower] = {'count': g.count}
        synonyms = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower not in synonyms:
                synonyms[lower] = []
            synonyms[lower].append(word)
        return words, synonyms

    def add_faiss_vectors(self, articles, tfidf=True):
        total_vectors = 0
        vector_size = self.w2v_model.wv.vector_size
        if self.faiss_indexes is None:
            index = faiss.IndexFlatIP(vector_size)
            self.faiss_indexes = faiss.IndexIDMap(index)
            if tfidf:
                index_tfidf = faiss.IndexFlatIP(vector_size)
                self.faiss_indexes_tfidf = faiss.IndexIDMap(index_tfidf)
        total_vectors = self.faiss_indexes.ntotal
        total_vectors_tfidf = self.faiss_indexes_tfidf.ntotal
        xb = []
        xb_tfidf = []
        faiss_count = total_vectors
        faiss_count_tfidf = total_vectors_tfidf
        ids = []
        ids_tfidf = []
        for article in articles:
            if article['faiss_index'] is None:
                vect, vect_tfidf = self.article2vect(article)
                xb.append(vect)
                xb_tfidf.append(vect_tfidf)
                article['faiss_index'] = faiss_count
                article['faiss_index_tfidf'] = faiss_count_tfidf
                ids.append(faiss_count)
                ids_tfidf.append(faiss_count_tfidf)
                article.save()
                faiss_count = faiss_count + 1
                faiss_count_tfidf = faiss_count_tfidf + 1
        if len(ids) == 0:
            # Nothing was added because it was already there
            return total_vectors, len(ids)
        xb = np.array(xb, dtype='float32')
        xb_tfidf = np.array(xb_tfidf, dtype='float32')
        ids = np.array(ids, dtype='int64')
        ids_tfidf = np.array(ids_tfidf, dtype='int64')
        self.faiss_indexes.add_with_ids(xb, ids)
        faiss.write_index(self.faiss_indexes, self.faiss_indexes_path)
        self.faiss_indexes_tfidf.add_with_ids(xb_tfidf, ids_tfidf)
        faiss.write_index(self.faiss_indexes_tfidf, self.faiss_indexes_tfidf_path)
        return total_vectors, len(ids)

    # def article_to_faiss_vect(article, nlp, w2v_model):
    #     # article2vect
    #     title = CromaGNI.preprocess_aws_data(article['title'])
    #     text = CromaGNI.preprocess_aws_data(article['text'])
    #     text = title + '\n' + text
    #     doc = nlp(text)
    #     return get_sentence_vect(doc, w2v_model)

    # def get_related_aticles(vector, faiss_indexes, faiss_article_ids, Article, radius=0.89, k=None, fr=0,
    #                         filter_by_date=True, years=1, months=0, days=0):
    #     if k is None:
    #         lims, D, I = faiss_indexes.range_search(vector, radius)
    #         j = 0
    #         distances = D[lims[j]:lims[j+1]][fr:]
    #         sorted_idx = np.argsort(distances)[::-1]
    #         distances = distances[sorted_idx]
    #         indexes = I[lims[j]:lims[j+1]][fr:][sorted_idx]
    #     else:
    #         D, I = faiss_indexes.search(vector, k)
    #         distances = D[0][fr:]
    #         indexes = I[0][fr:]
    #     articles = []
    #     for idx in indexes:
    #         articles.append(Article.objects(id=faiss_article_ids[idx]).first())
    #     if filter_by_date:
    #         articles, distances = get_filtered_by_date(articles, distances, years=years, months=months, days=days)
    #     return articles, distances

    # def add_faiss_vectors(old_faiss_ids_f, old_faiss_indexes_f, new_faiss_ids_f, new_faiss_indexes_f,
    #                       articles, w2v_model, nlp_ner, N_vects=10000):
    #     # Read faiss indexes and mongoids
    #     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
    #         faiss_articles_ids = []
    #         faiss_index2 = None
    #     else:
    #         faiss_articles_ids = np.load(old_faiss_ids_f)
    #         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
    #     # Get wordvectors
    #     word_vect_dim = w2v_model.wv.vector_size
    #     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     new_article_ids = []
    #     i = 0
    #     j = 0
    #     while j < N_vects:
    #         article = articles[i]
    #         if str(article.id) not in faiss_articles_ids:
    #             new_article_ids.append(str(article.id))
    #             xb[j, :] = article_to_faiss_vect(article, nlp_ner, w2v_model)
    #             j += 1
    #         i += 1
    #         print(f'\r{i}, {j}', end='')
    #     # Update articles ids
    #     all_articles_ids = list(faiss_articles_ids) + new_article_ids
    #     np.save(new_faiss_ids_f, all_articles_ids)
    #     if len(faiss_articles_ids) == 0:
    #         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
    #     else:
    #         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal
    #     if len(faiss_articles_ids) == 0:
    #         index = faiss.IndexFlatIP(word_vect_dim)
    #         faiss_index2 = faiss.IndexIDMap(index)
    #     faiss_index2.add_with_ids(xb, ids)
    #     faiss.write_index(faiss_index2, new_faiss_indexes_f)

    # def array_to_sentence_vect(art_arry, w2v_model):
    #     word_vect_dim = w2v_model.wv.vector_size
    #     v = np.zeros(word_vect_dim)
    #     for word in art_arry:
    #         if word in w2v_model.wv.vocab:
    #             v = v + w2v_model.wv.get_vector(word)
    #         else:
    #             words = word.split(' ')
    #             if len(words) > 1:
    #                 for word in words:
    #                     if word in w2v_model.wv.vocab:
    #                         v = v + w2v_model.wv.get_vector(word)
    #     norm = np.linalg.norm(v)
    #     if norm == 0:
    #         return np.zeros(word_vect_dim)
    #     else:
    #         return v/np.linalg.norm(v)

    # def word2vect_encode(doc):
    #     tokens = []
    #     i = 0
    #     while i < len(doc):
    #         t = doc[i]
    #         tx = t.text
    #         # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
    #         if t.ent_iob_ == 'O':
    #             ent_tex = tx
    #             i += 1
    #             if (not t.is_space and '@' not in t.text):
    #                 if t.is_digit:
    #                     tokens.append('__DIGIT__')
    #                 elif '$' in tx:
    #                     tokens.append('__CURRENCY__')
    #                 else:
    #                     tokens.append(ent_tex)
    #         else:
    #             ent_tex = ''
    #             while t.ent_iob_ != 'O':
    #                 if t.pos_ == 'DET' and t.ent_iob_ == 'B':
    #                     # It is an article
    #                     tokens.append(tx)
    #                 else:
    #                     ent_tex = ent_tex + ' ' + tx
    #                 i += 1
    #                 if i < len(doc):
    #                     t = doc[i]
    #                     tx = t.text
    #                 else:
    #                     break
    #             ent_tex = ent_tex.strip().replace(' - ', '-')
    #             tokens.append(ent_tex)
    #     return tokens

    # def get_sentence_vect(doc, w2v_model):
    #     tokens = word2vect_encode(doc)
    #     return array_to_sentence_vect(tokens, w2v_model).astype('float32')
def test_autocomplete_synonym_part_of_another_word(self):
    words = {'cartoon': {}, 'vehicle': {}}
    synonyms = {'vehicle': ['car']}
    autocomplete = AutoComplete(words=words, synonyms=synonyms)
    result = autocomplete.search(word='ca')
    assert [['vehicle'], ['cartoon']] == result
from sanic import Blueprint
from sanic import response
from fast_autocomplete import AutoComplete
import json

autocomplete = Blueprint('autocomplete', url_prefix='/autocomplete')

models_directory = './autocomplete'
autocomplete_models = {
    'en': AutoComplete(words=json.load(open(f'{models_directory}/en.json', 'r'))),
    'es': AutoComplete(words=json.load(open(f'{models_directory}/es.json', 'r'))),
    'fr': AutoComplete(words=json.load(open(f'{models_directory}/fr.json', 'r'))),
    'pt': AutoComplete(words=json.load(open(f'{models_directory}/pt.json', 'r'))),
}


@autocomplete.post('/')
async def complete(request):
    lang = request.args.get('language', 'en')
    query = request.args.get('text')
    limit = request.args.get('limit', 7)
    model = autocomplete_models.get(lang, autocomplete_models['en'])
    if any(stop in query for stop in ['.', '?', '!']):
        return response.json([])
    elif query.count(' ') >= int(limit):
        return response.json([])