Example #1
    def __init__(self,
                 spacy_model_path=None,
                 gensim_model_path=None,
                 faiss_indexes_path=None,
                 faiss_indexes_tfidf_path=None,
                 token2tfidf_path=None):
        self.faiss_indexes_path = faiss_indexes_path
        self.faiss_indexes_tfidf_path = faiss_indexes_tfidf_path
        if spacy_model_path is not None:
            self.nlp = spacy.load(spacy_model_path)
        if gensim_model_path is not None:
            self.w2v_model = KeyedVectors.load(gensim_model_path, mmap='r')
            model_words, self.model_synonyms = self.prepare_autocomplete()
            self.autocomplete_model = AutoComplete(words=model_words)
        if faiss_indexes_path is not None and os.path.exists(
                faiss_indexes_path):
            self.faiss_indexes = faiss.read_index(faiss_indexes_path)
        else:
            self.faiss_indexes = None
        if faiss_indexes_tfidf_path is not None and os.path.exists(
                faiss_indexes_tfidf_path):
            self.faiss_indexes_tfidf = faiss.read_index(
                faiss_indexes_tfidf_path)
        else:
            self.faiss_indexes_tfidf = None

        if token2tfidf_path is not None:
            self.token2tfidf = np.load(token2tfidf_path,
                                       allow_pickle=True).item()
        else:
            self.token2tfidf = None
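
A minimal, self-contained sketch of the construction pattern this __init__ relies on: prepare_autocomplete() (shown in full in Example #28) hands AutoComplete a plain dict of lowercased words, and queries go through search(). The tiny vocabulary below is hypothetical and stands in for the gensim model.

from fast_autocomplete import AutoComplete

# Hypothetical stand-in for the word2vec vocabulary; 'count' mirrors the
# per-word metadata that prepare_autocomplete() attaches in Example #28.
model_words = {
    'tesla': {'count': 120},
    'toyota': {'count': 90},
    'toyota aygo': {'count': 30},
}
autocomplete_model = AutoComplete(words=model_words)

# Same call shape that get_autocomplete_words_list() uses later on.
print(autocomplete_model.search('toy', size=10))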
Example #2
 def test_get_word_context(self, word, expected_results):
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS,
                                  synonyms=SYNONYMS,
                                  full_stop_words=['bmw', 'alfa romeo'])
     results = auto_complete.get_word_context(word)
     print_results(locals())
     assert expected_results == results
Example #3
 def test_search_without_synonyms(self, word, max_cost, size,
                                  expected_results):
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS)
     results, find_steps = auto_complete._find(word, max_cost, size)
     results = dict(results)
     print_results(locals())
     assert expected_results == results
Example #4
    def test_get_all_descendent_words_for_condition(self, word,
                                                    expected_results):
        auto_complete = AutoComplete(words=WIKIPEDIA_WORDS,
                                     synonyms=SYNONYMS,
                                     full_stop_words=['bmw', 'alfa romeo'])

        results = auto_complete.get_tokens_flat_list(word, max_cost=0, size=3)
        print_results(locals())
        assert expected_results == results
Example #5
 def test_find(self, word, max_cost, size, expected_find_results,
               expected_steps, expected_find_and_sort_results):
     expected_results = expected_find_results
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
     results, find_steps = auto_complete._find(word, max_cost, size)
     results = dict(results)
     print_results(locals())
     assert expected_results == results
     assert expected_steps == find_steps
Example #6
 def test_search_unicode_without_synonyms(self, word, max_cost, size,
                                          expected_results):
     auto_complete = AutoComplete(
         words=SHORT_WORDS_UNICODE,
         valid_chars_for_string='اآبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی')
     results, find_steps = auto_complete._find(word, max_cost, size)
     results = dict(results)
     print_results(locals())
     assert expected_results == results
Example #7
    def test_special_characters(self):
        words = {'abcd(efgh)ijk': {}, 'u (2 off)': {}}
        autocomplete = AutoComplete(
            words=words,
            valid_chars_for_string=string.ascii_letters + string.punctuation)
        # result = autocomplete.search(word='abcd(efgh)')
        # assert [['abcd(efgh)ijk']] == result

        result2 = autocomplete.search(word='u (2 o')
        assert [['u (2 off)']] == result2
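
The same valid_chars_for_string pattern as a compact standalone sketch; the word set here is hypothetical, and the assumption is that other punctuation characters behave like the parentheses in the test above.

import string

from fast_autocomplete import AutoComplete

# Hypothetical words containing punctuation; valid_chars_for_string is
# passed exactly as in the special-characters test above.
words = {'c++ compiler': {}, 'c# parser': {}}
autocomplete = AutoComplete(
    words=words,
    valid_chars_for_string=string.ascii_letters + string.punctuation)
print(autocomplete.search(word='c+', max_cost=2, size=3))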
Example #8
 def test_get_descendants_nodes(self, word, expected_results):
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
     matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(
         word)
     size = 2
     found_words_gen = node.get_descendants_nodes(size=size)
     found_words = [_node.word for _node in found_words_gen][:size + 1]
     print(f'word: {word}')
     print(f'expected_results: {expected_results}')
     print(f'found_words: {found_words}')
     assert expected_results == list(found_words)
Example #9
    def test_get_all_descendent_words_for_condition1(self, word,
                                                     expected_results):
        auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)

        def condition(word_info):
            return 'model' in word_info

        size = 10
        results = auto_complete.get_all_descendent_words_for_condition(
            word=word, size=size, condition=condition)
        print_results(locals())
Example #10
    def video_loop(self):
        ok, frame = self.vs.read()
        if ok:
            cv2image = cv2.flip(frame, 1)
            x1 = int(0.5 * frame.shape[1])
            y1 = 10
            x2 = frame.shape[1] - 10
            y2 = int(0.5 * frame.shape[1])
            cv2.rectangle(cv2image, (x1 - 1, y1 - 1), (x2 + 1, y2 + 1),
                          (255, 0, 0), 1)
            cv2image = cv2.cvtColor(cv2image, cv2.COLOR_BGR2RGBA)
            self.current_image = Image.fromarray(cv2image)
            imgtk = ImageTk.PhotoImage(image=self.current_image)
            self.panel.imgtk = imgtk
            self.panel.config(image=imgtk)
            cv2image = cv2image[y1:y2, x1:x2]
            gray = cv2.cvtColor(cv2image, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 2)
            th3 = cv2.adaptiveThreshold(blur, 255,
                                        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                        cv2.THRESH_BINARY_INV, 11, 2)
            ret, res = cv2.threshold(th3, 70, 255,
                                     cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            self.predict(res)
            # self.current_image2 = Image.fromarray(res)
            # imgtk = ImageTk.PhotoImage(image=self.current_image2)
            # self.panel2.imgtk = imgtk
            # self.panel2.config(image=imgtk)

            self.panel3.config(text=self.current_symbol, font=("Courier", 10))
            self.panel4.config(text=self.word, font=("Courier", 10))
            self.panel5.config(text=self.str, font=("Courier", 10))

            predicts = self.word
            autocomplete = AutoComplete(words=self.words)
            self.a = autocomplete.search(word=predicts, max_cost=2, size=2)
            print("Initial {0}: ,Suggest : {1}".format(predicts, self.a))
            if (len(self.a) > 0):
                self.bt1.config(text=self.a[0][0], font=("Courier", 10))
            else:
                self.bt1.config(text="None")

            if (len(self.a) > 1):
                self.bt2.config(text=self.a[1][0], font=("Courier", 10))
            else:
                self.bt2.config(text="None")

            if (len(self.a) > 2):
                self.bt3.config(text=self.a[2][0], font=("Courier", 10))
            else:
                self.bt3.config(text="None")

        self.root.after(60, self.video_loop)
Example #11
    def __init__(self):
        # with stream(__name__, 'shaker_dictionary.txt') as f:
        with open("shaker_dictionary.txt", "r", encoding="utf8") as f:
            words = f.readlines()
        # Strip newlines and give every word its own (empty) info dict;
        # dict(zip(words, [dict()] * len(words))) would keep the newlines
        # and make all keys share a single dict.
        words = {word.strip(): {} for word in words if word.strip()}
        self.autocomplete = AutoComplete(words=words)

        # with stream(__name__, 'authors.txt') as f:
        with open("authors.txt", "r", encoding="utf8") as f:
            names = f.readlines()
        names = {name.strip(): {} for name in names if name.strip()}
        self.authors = AutoComplete(words=names)
Example #12
 def test_update_count_of_word(self, word, update_dict, expected_results,
                               expected_new_count):
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS,
                                  synonyms=SYNONYMS,
                                  full_stop_words=['bmw', 'alfa romeo'])
     if update_dict:
         new_count = auto_complete.update_count_of_word(**update_dict)
         assert expected_new_count == new_count
         assert expected_new_count == auto_complete.get_count_of_word(
             update_dict['word'])
     results = auto_complete.search(word, max_cost=2, size=4)
     print_results(locals())
     assert expected_results == results
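
A runnable sketch of the count round-trip this test exercises, using a hypothetical two-word vocabulary (Example #22 below performs the same pair of calls on its car data):

from fast_autocomplete import AutoComplete

words = {'toyota aygo': {'count': 3}, 'toyota corolla': {'count': 8}}
auto_complete = AutoComplete(words=words)

# update_count_of_word returns the new count, which should match what
# get_count_of_word reads back afterwards.
new_count = auto_complete.update_count_of_word(word='toyota aygo', count=10000)
assert new_count == auto_complete.get_count_of_word('toyota aygo')

print(auto_complete.search('toyota', max_cost=2, size=4))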
Example #13
 def test__find_and_sort(self, word, max_cost, size, expected_find_results,
                         expected_steps, expected_find_and_sort_results):
     expected_results = expected_find_and_sort_results
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
     results = auto_complete._find_and_sort(word, max_cost, size)
     results = list(results)
     search_results = auto_complete.search(word, max_cost, size)
     print_results(locals())
     assert expected_results == results
     if word.strip():
         assert expected_results == search_results
     else:
         assert [] == search_results
Example #14
def autocomplete():
    search = request.args.get('q')
    print(request.args.get('term'))
    print('search is ---------')
    print(str(search))
    autocomplete = AutoComplete(words=autocmplete_label_dict)
    print(autocomplete.search(word=str(search), max_cost=3, size=3))
    t = autocomplete.search(word=str(search), max_cost=3, size=6)

    flatten = [item for sublist in t for item in sublist]
    print(flatten)

    # results = autocomplete.search(word=str(search), max_cost=3, size=3) #class_labels  # ['Beer', 'Wine', 'Soda', 'Juice', 'Water']
    results = flatten
    print(results)

    return jsonify(matching_results=results)
Example #15
class SM_Autocomplete:

    def __init__(self):
        # with stream(__name__, 'shaker_dictionary.txt') as f:
        with open("shaker_dictionary.txt", "r", encoding="utf8") as f:
            words = f.readlines()
        # Strip newlines and give every word its own (empty) info dict;
        # dict(zip(words, [dict()] * len(words))) would keep the newlines
        # and make all keys share a single dict.
        words = {word.strip(): {} for word in words if word.strip()}
        self.autocomplete = AutoComplete(words=words)

        # with stream(__name__, 'authors.txt') as f:
        with open("authors.txt", "r", encoding="utf8") as f:
            names = f.readlines()
        names = {name.strip(): {} for name in names if name.strip()}
        self.authors = AutoComplete(words=names)

    def general(self, s):
        return sorted(self.autocomplete.search(word=s, max_cost=3, size=10))

    def author(self, s):
        return sorted(self.authors.search(word=s, max_cost=3, size=10))
Example #16
 def test_prefix_autofill(self, word, expected_matched_prefix_of_last_word,
                          expected_rest_of_word, expected_matched_words,
                          expected_node_path):
     auto_complete = AutoComplete(words=WIKIPEDIA_WORDS, synonyms=SYNONYMS)
     matched_prefix_of_last_word, rest_of_word, node, matched_words = auto_complete._prefix_autofill(
         word)
     print(f'word: {word}')
     print(
         f'expected_matched_prefix_of_last_word: {expected_matched_prefix_of_last_word}'
     )
     print(f'matched_prefix_of_last_word: {matched_prefix_of_last_word}')
     print(f'expected_rest_of_word: {expected_rest_of_word}')
     print(f'rest_of_word: {rest_of_word}')
     print(f'node: {node}')
     print(f'expected_matched_words: {expected_matched_words}')
     print(f'matched_words: {matched_words}')
     expected_node = auto_complete._dwg
     for k in expected_node_path.split(','):
         expected_node = expected_node[k]
     assert expected_node is node
     assert expected_matched_prefix_of_last_word == matched_prefix_of_last_word
     assert expected_rest_of_word == rest_of_word
     assert expected_matched_words == matched_words
Example #17
 def test_immutable_info(self, word):
     auto_complete = AutoComplete(words=SHORT_WORDS, synonyms=SYNONYMS)
     auto_complete_immutable = AutoComplete(
         words=SHORT_WORDS_IMMUTABLE_INFO, synonyms=SYNONYMS)
     search_results = auto_complete._find(word, max_cost=3, size=3)
     search_results_immutable = auto_complete_immutable._find(word,
                                                              max_cost=3,
                                                              size=3)
     print_results(locals())
     assert search_results_immutable == search_results
Example #18
    def load_autocomplete_trie(self):
        directory = glob('bigData/*')

        words = set()

        for file_name in directory:
            if file_name == 'bigData/bigtext.txt':
                continue

            with open(file_name, 'r') as file:
                for line in file:
                    line = line.strip()
                    words.add(line)

        words = {word: {} for word in words}

        self.Trie = AutoComplete(words=words)
Example #19
    def _build_search_dictionary(self):
        '''Build an AutoComplete object from the JournalFile entries and
        store it in self._vardict['sdict'].
        '''

        words = {}
        synonyms = {}
        del self._vardict['sdict']

        for count, entry in enumerate(self._vardict['journal_file']):

            title = entry['Title']
            sub_title = entry['subTitle']
            subsub_title = entry['subsubTitle']

            words[title.lower()] = {'Title': title, 'Index': count}

            synonyms[title] = [sub_title.lower(), subsub_title.lower()]
            for abbrev in entry['Abbreviations']:
                if abbrev:
                    synonyms[title].append(str(abbrev.lower()))

        self._vardict['sdict'] = AutoComplete(words=words, synonyms=synonyms)
Example #20
def getAutocompleteEntries() -> dict:
    words = {}
    logger.info("Starting autocomplete words intialisation...")
    with gzip.open('wikidump/enwiki-20210820-abstract.xml.gz', 'rb') as f:
        doc_id = 1
        for _, element in etree.iterparse(f, events=('end',), tag='doc'):
            if (doc_id % 78 == 1):
                title = element.findtext('./title')
                logger.debug("Creating autocomplete word entry for [id={0}] [title={1}]".format(doc_id, title))
                words[title[11:]] = {}
            doc_id += 1
            element.clear()
    logger.info("Finished autocomplete words intialisation")
    return words

autocomplete = AutoComplete(words=getAutocompleteEntries())

class ErrorHandlerWrapper:

    def __call__(self, func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            self.client_addr = request.remote_addr
            try:
                with self:
                    return func(*args, **kwargs)
            except SSLError as e:
                logger.error("An issue with SSL occured", e)
                response  = {
                    "status_code": e.status_code,
                    "status": "An issue with SSL occured: {}".format(e)
Example #21
from sanic import Blueprint
from sanic import response

from fast_autocomplete import AutoComplete
import json

autocomplete = Blueprint('autocomplete', url_prefix='/autocomplete')

models_directory = './autocomplete'
autocomplete_models = {
    'en':
    AutoComplete(words=json.load(open(f'{models_directory}/en.json', 'r'))),
    'es':
    AutoComplete(words=json.load(open(f'{models_directory}/es.json', 'r'))),
    'fr':
    AutoComplete(words=json.load(open(f'{models_directory}/fr.json', 'r'))),
    'pt':
    AutoComplete(words=json.load(open(f'{models_directory}/pt.json', 'r')))
}


@autocomplete.post('/')
async def complete(request):
    lang = request.raw_args.get('language', 'en')
    query = request.raw_args.get('text')
    limit = request.raw_args.get('limit', 7)
    model = autocomplete_models.get(lang, autocomplete_models['en'])

    if any(stop in query for stop in ['.', '?', '!']):
        return response.json([])
    elif query.count(' ') >= int(limit):
        return response.json([])
Example #22
        model = line['model']
        count = line['count']
        if make != model:
            #local_words = [model, '{}{}'.format(make,model)]
            # print(local_words)
            # while local_words:
            #    word = local_words.pop()
            #    if word not in words:
            words['{}{}'.format(make, model)] = {}
        # if make not in words:
        #words[make] = {}
    return words


synonyms = {
    "alfa romeo 4c coupe": ["the alfa", "hello"],
    "bmw": ["beemer", "bimmer"]
}
words = get_words("autocomp.csv")
autocomplete = AutoComplete(words=words, synonyms=synonyms)

autocomplete.search(word='the ', max_cost=3, size=5)

# %%
autocomplete.update_count_of_word(word='toyota aygo', count=10000)
autocomplete.get_count_of_word('toyota aygo')

# %%

# %%
Example #23
import os
import tkinter as tk
from fast_autocomplete import AutoComplete
from scanners.smmx import scan
from pathlib import Path
import subprocess
import platform
from pynput import keyboard
import threading
import copy

rootdir = 'C:/Users/xxx/Dropbox/SimpleMind'
words, paths = scan(rootdir)
words_copy = copy.deepcopy(words)

autocomplete = AutoComplete(words=words)
contexts = []
selected = None


def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def on_keyrelease(event):

    # get text from entry
    value = event.widget.get()
    value = value.strip().lower()
Example #24
 def _init_autocompleter(self, df: Card_DF):
     card_names = self._get_words(df)
     card_names_to_dicts = {word: {} for word in card_names}
     completer = AutoComplete(words=card_names_to_dicts)
     return completer
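
The {word: {}} comprehension above is the safe way to turn a flat list of names into AutoComplete input; unlike dict(zip(names, [dict()] * len(names))) it gives every key its own dict. A sketch with a hypothetical card list in place of the DataFrame:

from fast_autocomplete import AutoComplete

card_names = ['lightning bolt', 'lightning helix', 'llanowar elves']  # hypothetical
card_names_to_dicts = {name: {} for name in card_names}
completer = AutoComplete(words=card_names_to_dicts)
print(completer.search(word='light', max_cost=2, size=3))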
Example #25
parser.add_argument('--disallowed-rels', help='List of semicolon-separated relations that are disallowed', default='')
parser.add_argument('-v', '--verbose', help='increase output verbosity', action='store_true')
args = parser.parse_args()
if args.verbose:
    logging.basicConfig(level=logging.DEBUG)
logging.debug('Args: %s', args)

logging.info('Loading autocomplete vocabulary...')
words_en = {}
vocab_filepath = '/data/zeste_cache/vocab.txt'
if os.path.exists(vocab_filepath):
    with open(vocab_filepath, 'r') as vocab_file:
        for line in vocab_file:
            words_en[line.strip()] = {}
autocomplete_en = AutoComplete(words=words_en)

words_fr = {}
vocab_filepath = '/data/zeste_cache/vocab_fr.txt'
if os.path.exists(vocab_filepath):
    with open(vocab_filepath, 'r') as vocab_file:
        for line in vocab_file:
            words_fr[line.strip()] = {}
autocomplete_fr = AutoComplete(words=words_fr)

logging.info('Starting web server...')
app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app, x_for=1, x_proto=1, x_host=1, x_port=1)
api = Api(app, doc='/doc')
cors = CORS(app)
Example #26
 def execute_algorithm(self, data: dict, current_string: str):
     print(data)
     autocomplete = AutoComplete(words=data)
     print(autocomplete)
     # word -> what to search by, max_cost -> how much fuzziness (edit cost) to allow, size -> number of results to propagate back
     return autocomplete.search(word=current_string, max_cost=3, size=3)
Example #27
@api_view(['GET'])
#@permission_classes([IsAuthenticated])
def remove_authors(request, *args, **kwargs):
    spp = kwargs.get('taxon').split(" ")
    not_words = []
    for token in spp:
        word = Words.objects.filter(word=token)
        if not word:
            not_words.append(token)
    spp = [x for x in spp if x not in not_words]
    return Response({'taxon': ' '.join(spp)}, status=status.HTTP_200_OK)

with open('data.pickle', 'rb') as f:  
    names = pickle.load(f)

autocomplete_names = AutoComplete(words=names)

def autocomplete(request):
    if request.GET.get('term'):
        q = request.GET['term']
        data = autocomplete_names.search(word = q, max_cost = 3, size = 3)
        data = [s[0] for s in data if len(s[0]) >= len(q)]
        return HttpResponse(json.dumps([x.capitalize() for x in data]), 'application/json')
    else:
        return render(request, 'api/autocomplete.html')

@api_view(['GET'])
#@permission_classes([IsAuthenticated])
def synonyms(request, *args, **kwargs):
    spp = Details.objects.filter(search_str = kwargs.get('search_str').capitalize()).first()
    if spp == None:
Example #28
class RelatedArticles():
    @staticmethod
    def get_filtered_by_date(articles, distances, days=0, months=0, years=1):
        print(len(articles), len(distances))
        filter_date = (
            datetime.now() -
            relativedelta(days=days, months=months, years=years)).date()
        filtered_articles = []
        filtered_distances = []
        for i, article in enumerate(articles):
            if article.publish_date > filter_date:
                filtered_articles.append(article)
                filtered_distances.append(distances[i])
        return filtered_articles, np.array(filtered_distances)

    @staticmethod
    def article2text(article):
        title = CromaGNI.preprocess_aws_data(article['title'])
        text = CromaGNI.preprocess_aws_data(article['text'])
        text = title + '\n' + text
        return text

    @staticmethod
    def doc2tokens(doc):
        tokens = []
        i = 0
        while i < len(doc):
            t = doc[i]
            tx = t.text
            #             print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
            if t.ent_iob_ == 'O':
                ent_tex = tx
                i += 1
                if (not t.is_space and '@' not in t.text
                    ) or '\n' in t.text:  # and t.text != '\n'):
                    if t.is_digit:
                        tokens.append('__DIGIT__')
                    elif '$' in tx:
                        tokens.append('__CURRENCY__')
                    else:
                        tokens.append(ent_tex)
            else:
                ent_tex = ''
                while t.ent_iob_ != 'O':
                    if t.pos_ == 'DET' and t.ent_iob_ == 'B':
                        # It is an article
                        tokens.append(tx)
                    else:
                        ent_tex = ent_tex + ' ' + tx
                    i += 1
                    if i < len(doc):
                        t = doc[i]
                        tx = t.text
                    else:
                        break
                ent_tex = ent_tex.strip().replace(' - ', '-')
                tokens.append(ent_tex)
        return tokens

    def __init__(self,
                 spacy_model_path=None,
                 gensim_model_path=None,
                 faiss_indexes_path=None,
                 faiss_indexes_tfidf_path=None,
                 token2tfidf_path=None):
        self.faiss_indexes_path = faiss_indexes_path
        self.faiss_indexes_tfidf_path = faiss_indexes_tfidf_path
        if spacy_model_path is not None:
            self.nlp = spacy.load(spacy_model_path)
        if gensim_model_path is not None:
            self.w2v_model = KeyedVectors.load(gensim_model_path, mmap='r')
            model_words, self.model_synonyms = self.prepare_autocomplete()
            self.autocomplete_model = AutoComplete(words=model_words)
        if faiss_indexes_path is not None and os.path.exists(
                faiss_indexes_path):
            self.faiss_indexes = faiss.read_index(faiss_indexes_path)
        else:
            self.faiss_indexes = None
        if faiss_indexes_tfidf_path is not None and os.path.exists(
                faiss_indexes_tfidf_path):
            self.faiss_indexes_tfidf = faiss.read_index(
                faiss_indexes_tfidf_path)
        else:
            self.faiss_indexes_tfidf = None

        if token2tfidf_path is not None:
            self.token2tfidf = np.load(token2tfidf_path,
                                       allow_pickle=True).item()
        else:
            self.token2tfidf = None

    # def save_training_tokens(self, publication_name, chunk_size = 50_000):
    #     dst_folder = f'training_data_{publication_name}_{chunk_size}/'
    #     if not os.path.exists(dst_folder):
    #         os.makedirs(dst_folder)
    #     articles = Article.objects(publication=Publication.objects(name=publication_name).get()).order_by('-publish_date')
    #     N = articles.count()
    #     N_chunks = np.ceil(N/chunk_size)
    #     sentences = []
    #     ids = []
    #     chunk = 0
    #     for i, article in enumerate(articles):
    #         if i%chunk_size == 0 and i!=0:
    #             chunk+=1
    #             file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #             np.save(file_name, sentences)
    #             sentences = []
    #             print()
    #             print(f'{file_name} saved!')
    #             file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #             np.save(file_name_ids, ids)
    #             ids = []

    #         text = RelatedArticles.article2text(article)
    #         print(f'\r{i}/{N}', end=' ')
    #         doc = self.nlp(text)
    #         sentences.append(RelatedArticles.doc2tokens(doc))
    #         ids.append(str(article['id']))
    #     chunk+=1
    #     file_name = f'{dst_folder}tokens_{publication_name}_{chunk}.npy'
    #     np.save(file_name, sentences)
    #     sentences = []
    #     print()
    #     print(f'{file_name} saved!')
    #     file_name_ids = f'{dst_folder}tokens_{publication_name}_{chunk}_ids.npy'
    #     np.save(file_name_ids, ids)
    #     ids = []

    def get_autocomplete_words_list(self, text):
        autocomplets = self.autocomplete_model.search(text, size=10)
        near_words = []
        for word in autocomplets:
            near_words = near_words + self.model_synonyms[word[0]]

        return near_words

    def get_similar(self, word, topn=10):
        words = []
        distances = []
        for word, distance in self.w2v_model.wv.most_similar(word, topn=topn):
            words.append(word)
            distances.append(distance)
        return words, distances

    def get_related_articles(self,
                             article,
                             years=1,
                             months=0,
                             days=0,
                             radius=0.89):
        id_form_article_id = article['faiss_index']
        if id_form_article_id is None:
            # Not in faiss db already
            vector = self.article2vect(
                article
            )  # np.array([article_to_faiss_vect(article, nlp_custom, w2v_model)])
        else:
            vector = np.array(
                [self.faiss_indexes.index.reconstruct(id_form_article_id)])

        articles, distances = self.get_related_articles_from_vector(
            vector, years=years, months=months, days=days, radius=radius)

        if id_form_article_id is None:
            articles = list(articles)
            articles.insert(0, article)
            distances = list(distances)
            distances.insert(0, 1.0)
        return articles, distances

    def tokens2vect(self, art_arry, tfidf=True):
        if self.token2tfidf is None:
            tfidf = False
        word_vect_dim = self.w2v_model.wv.vector_size
        v = np.zeros(word_vect_dim)
        if tfidf:
            v_tfidf = np.zeros(word_vect_dim)
        for word in art_arry:
            if word in self.w2v_model.wv.vocab:
                if tfidf:
                    wordtfidf = self.token2tfidf.get(word, 0)
                    v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(
                        word) * wordtfidf
                v = v + self.w2v_model.wv.get_vector(word)
            else:
                words = word.split(' ')
                if len(words) > 1:
                    for word in words:
                        if word in self.w2v_model.wv.vocab:
                            v = v + self.w2v_model.wv.get_vector(word)
                            if tfidf:
                                wordtfidf = self.token2tfidf.get(word, 0)
                                v_tfidf = v_tfidf + self.w2v_model.wv.get_vector(
                                    word) * wordtfidf
        norm = np.linalg.norm(v)

        if norm == 0:
            v = np.zeros(word_vect_dim)
        else:
            v = v / norm

        if tfidf:
            norm_tfidf = np.linalg.norm(v_tfidf)
            if norm_tfidf == 0:
                v_tfidf = np.zeros(word_vect_dim)
            else:
                v_tfidf = v_tfidf / norm_tfidf

            return v.astype('float32'), v_tfidf.astype('float32')
        else:
            return v.astype('float32')

    def doc2vect(self, doc):
        tokens = RelatedArticles.doc2tokens(doc)
        return self.tokens2vect(tokens)

    def text2doc(self, text):
        return self.nlp(text)

    def text2vect(self, text):
        doc = self.text2doc(text)
        return self.doc2vect(doc)

    def article2vect(self, article):
        text = RelatedArticles.article2text(article)
        return self.text2vect(text)

    def get_related_articles_from_vector(self,
                                         vector,
                                         radius=0.89,
                                         k=None,
                                         fr=0,
                                         filter_by_date=True,
                                         years=1,
                                         months=0,
                                         days=0):
        indexes = []
        distances = []
        if isinstance(vector, tuple):
            # TODO: the tfidf vector still needs a proper fix here
            vector = vector[0]
        if len(vector.shape) == 1:
            vector = np.array([vector])
        if self.faiss_indexes is not None:
            if k is None:
                """
                    returns by radius
                """
                lims, D, I = self.faiss_indexes.range_search(vector, radius)
                j = 0
                distances = D[lims[j]:lims[j + 1]][fr:]
                sorted_idx = np.argsort(distances)[::-1]
                distances = distances[sorted_idx]
                indexes = I[lims[j]:lims[j + 1]][fr:][sorted_idx]
            else:
                """
                    returns k related
                """
                D, I = self.faiss_indexes.search(vector, k)
                distances = D[0][fr:]
                indexes = I[0][fr:]

        articles = []
        for idx in indexes:
            art_ = Article.objects(faiss_index=idx).first()
            if art_ is not None:
                articles.append(art_)

        if len(articles) > 0 and filter_by_date:
            articles, distances = RelatedArticles.get_filtered_by_date(
                articles, distances, years=years, months=months, days=days)

        return articles, distances

    # def add_faiss_vectors(self, articles, old_faiss_ids_f, old_faiss_indexes_f, old_faiss_indexes_tfidf_f, new_faiss_ids_f, new_faiss_indexes_f, new_faiss_indexes_tfidf_f):
    #     if new_faiss_indexes_tfidf_f is not None:
    #         tfidf=True
    #     else:
    #         tfidf=False
    #     # Read faiss indexes and mongoids
    #     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
    #         faiss_articles_ids = []
    #         faiss_index2 = None
    #         faiss_index2_tfidf = None
    #     else:
    #         faiss_articles_ids = np.load(old_faiss_ids_f)
    #         faiss_index2 = faiss.read_index(old_faiss_indexes_f)
    #         faiss_index2_tfidf = faiss.read_index(old_faiss_indexes_tfidf_f)

    #     N_vects = len(articles)
    #     # Get wordvectors
    #     word_vect_dim = self.w2v_model.wv.vector_size
    #     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     if tfidf:
    #         xb_tfidf = np.zeros((N_vects, word_vect_dim), dtype='float32')
    #     new_article_ids = []
    #     i=0
    #     j=0
    #     while j<N_vects:
    #         article = articles[i]
    #         if str(article.id) not in faiss_articles_ids:
    #             new_article_ids.append(str(article.id))
    #             if tfidf:
    #                 xb[j, :], xb_tfidf[j, :] = self.article2vect(article)
    #             else:
    #                 xb[j, :] = self.article2vect(article)
    #             j+=1
    #         i+=1
    #         print(f'\r{i}, {j} / {N_vects}', end='')

    #     # Update articles ids
    #     all_articles_ids = list(faiss_articles_ids) + new_article_ids
    #     np.save(new_faiss_ids_f, all_articles_ids)
    #     if len(faiss_articles_ids) == 0:
    #         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
    #     else:
    #         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal

    #     if len(faiss_articles_ids) == 0:
    #         index = faiss.IndexFlatIP(word_vect_dim)
    #         faiss_index2 = faiss.IndexIDMap(index)
    #         if tfidf:
    #             index_tfidf = faiss.IndexFlatIP(word_vect_dim)
    #             faiss_index2_tfidf = faiss.IndexIDMap(index_tfidf)

    #     faiss_index2.add_with_ids(xb, ids)
    #     faiss.write_index(faiss_index2, new_faiss_indexes_f)

    #     if tfidf:
    #         faiss_index2_tfidf.add_with_ids(xb_tfidf, ids)
    #         faiss.write_index(faiss_index2_tfidf, new_faiss_indexes_tfidf_f)

    def prepare_autocomplete(self):
        words = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower in words:
                if g.count > words[lower]['count']:
                    words[lower] = {'count': g.count}
            else:
                words[lower] = {'count': g.count}

        synonyms = {}
        for word, g in self.w2v_model.wv.vocab.items():
            lower = word.lower()
            if lower not in synonyms:
                synonyms[lower] = []
            synonyms[lower].append(word)
        return words, synonyms

    def add_faiss_vectors(self, articles, tfidf=True):
        total_vectors = 0
        vector_size = self.w2v_model.wv.vector_size
        if self.faiss_indexes is None:
            index = faiss.IndexFlatIP(vector_size)
            self.faiss_indexes = faiss.IndexIDMap(index)
            if tfidf:
                index_tfidf = faiss.IndexFlatIP(vector_size)
                self.faiss_indexes_tfidf = faiss.IndexIDMap(index_tfidf)

        total_vectors = self.faiss_indexes.ntotal
        total_vectors_tfidf = self.faiss_indexes_tfidf.ntotal

        xb = []
        xb_tfidf = []
        faiss_count = total_vectors
        faiss_count_tfidf = total_vectors_tfidf
        ids = []
        ids_tfidf = []

        for article in articles:
            if article['faiss_index'] is None:
                vect, vect_tfidf = self.article2vect(article)
                xb.append(vect)
                xb_tfidf.append(vect_tfidf)
                article['faiss_index'] = faiss_count
                article['faiss_index_tfidf'] = faiss_count_tfidf
                ids.append(faiss_count)
                ids_tfidf.append(faiss_count_tfidf)
                article.save()
                faiss_count = faiss_count + 1
                faiss_count_tfidf = faiss_count_tfidf + 1

        if len(ids) == 0:
            # Nothing was added because everything was already indexed
            return total_vectors, len(ids)

        xb = np.array(xb, dtype='float32')
        xb_tfidf = np.array(xb_tfidf, dtype='float32')

        ids = np.array(ids, dtype='int64')
        ids_tfidf = np.array(ids_tfidf, dtype='int64')

        self.faiss_indexes.add_with_ids(xb, ids)
        faiss.write_index(self.faiss_indexes, self.faiss_indexes_path)

        self.faiss_indexes_tfidf.add_with_ids(xb_tfidf, ids_tfidf)
        faiss.write_index(self.faiss_indexes_tfidf,
                          self.faiss_indexes_tfidf_path)

        return total_vectors, len(ids)


# def article_to_faiss_vect(article, nlp, w2v_model):
#     # article2vect
#     title = CromaGNI.preprocess_aws_data(article['title'])
#     text = CromaGNI.preprocess_aws_data(article['text'])
#     text = title + '\n' + text
#     doc = nlp(text)
#     return get_sentence_vect(doc, w2v_model)

# def get_related_aticles(vector, faiss_indexes, faiss_article_ids, Article, radius=0.89, k=None, fr = 0, filter_by_date=True, years=1, months=0, days=0):
#     if k is None:
#         lims, D, I = faiss_indexes.range_search(vector, radius)
#         j = 0
#         distances = D[lims[j]:lims[j+1]][fr:]
#         sorted_idx = np.argsort(distances)[::-1]
#         distances = distances[sorted_idx]
#         indexes = I[lims[j]:lims[j+1]][fr:][sorted_idx]
#     else:
#         D, I = faiss_indexes.search(vector, k)
#         distances = D[0][fr:]
#         indexes = I[0][fr:]

#     articles = []

#     for idx in indexes:
#         articles.append(Article.objects(id=faiss_article_ids[idx]).first())

#     if filter_by_date:
#         articles, distances = get_filtered_by_date(articles, distances, years=years, months=months, days=days)

#     return articles, distances

# def add_faiss_vectors(old_faiss_ids_f, old_faiss_indexes_f, new_faiss_ids_f, new_faiss_indexes_f, articles, w2v_model, nlp_ner, N_vects=10000):
#     # Read faiss indexes and mongoids
#     if old_faiss_ids_f is None or old_faiss_indexes_f is None:
#         faiss_articles_ids = []
#         faiss_index2 = None
#     else:
#         faiss_articles_ids = np.load(old_faiss_ids_f)
#         faiss_index2 = faiss.read_index(old_faiss_indexes_f)

#     # Get wordvectors
#     word_vect_dim = w2v_model.wv.vector_size
#     xb = np.zeros((N_vects, word_vect_dim), dtype='float32')
#     new_article_ids = []
#     i=0
#     j=0
#     while j<N_vects:
#         article = articles[i]
#         if str(article.id) not in faiss_articles_ids:
#             new_article_ids.append(str(article.id))
#             xb[j, :] = article_to_faiss_vect(article, nlp_ner, w2v_model)
#             j+=1
#         i+=1
#         print(f'\r{i}, {j}', end='')

#     # Update articles ids
#     all_articles_ids = list(faiss_articles_ids) + new_article_ids
#     np.save(new_faiss_ids_f, all_articles_ids)
#     if len(faiss_articles_ids) == 0:
#         ids = np.arange(N_vects).astype('int64')  # + faiss_index2.ntotal
#     else:
#         ids = np.arange(N_vects).astype('int64') + faiss_index2.ntotal

#     if len(faiss_articles_ids) == 0:
#         index = faiss.IndexFlatIP(word_vect_dim)
#         faiss_index2 = faiss.IndexIDMap(index)
#     faiss_index2.add_with_ids(xb, ids)
#     faiss.write_index(faiss_index2, new_faiss_indexes_f)

# def array_to_sentence_vect(art_arry, w2v_model):
#     word_vect_dim=w2v_model.wv.vector_size
#     v = np.zeros(word_vect_dim)
#     for word in art_arry:
#         if word in w2v_model.wv.vocab:
#             v = v + w2v_model.wv.get_vector(word)
#         else:
#             words = word.split(' ')
#             if len(words)>1:
#                 for word in words:
#                     if word in w2v_model.wv.vocab:
#                         v = v + w2v_model.wv.get_vector(word)
#     norm = np.linalg.norm(v)
#     if norm==0:
#         return np.zeros(word_vect_dim)
#     else:
#         return v/np.linalg.norm(v)

# def word2vect_encode(doc):
#     tokens = []
#     i = 0
#     while i<len(doc):
#         t = doc[i]
#         tx = t.text
#         # print(tx, t.ent_type_, t.ent_iob_, t.pos_, t.ent_kb_id_)
#         if t.ent_iob_=='O':
#             ent_tex = tx
#             i+=1
#             if (not t.is_space and '@' not in t.text):
#                 if t.is_digit:
#                     tokens.append('__DIGIT__')
#                 elif '$' in tx:
#                     tokens.append('__CURRENCY__')
#                 else:
#                     tokens.append(ent_tex)
#         else:
#             ent_tex = ''
#             while t.ent_iob_!='O':
#                 if t.pos_ == 'DET' and t.ent_iob_=='B':
#                     # It is an article
#                     tokens.append(tx)
#                 else:
#                     ent_tex = ent_tex + ' ' + tx
#                 i+=1
#                 if i<len(doc):
#                     t = doc[i]
#                     tx = t.text
#                 else:
#                     break

#             ent_tex = ent_tex.strip().replace(' - ', '-')
#             tokens.append(ent_tex)

#     return tokens

# def get_sentence_vect(doc, w2v_model):
#     tokens = word2vect_encode(doc)
#     return array_to_sentence_vect(tokens, w2v_model).astype('float32')
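
prepare_autocomplete() above folds a word2vec vocabulary into lowercased words plus a synonym map, and get_autocomplete_words_list() then expands search hits through that map. A minimal sketch of the same flow, with a hypothetical namedtuple vocabulary standing in for w2v_model.wv.vocab:

from collections import namedtuple

from fast_autocomplete import AutoComplete

Vocab = namedtuple('Vocab', 'count')  # hypothetical stand-in for gensim vocab entries
vocab = {'Tesla': Vocab(120), 'tesla': Vocab(40), 'Toyota': Vocab(90)}

words, synonyms = {}, {}
for word, g in vocab.items():
    lower = word.lower()
    # Keep the highest count seen for each lowercased form.
    if lower not in words or g.count > words[lower]['count']:
        words[lower] = {'count': g.count}
    synonyms.setdefault(lower, []).append(word)

autocomplete_model = AutoComplete(words=words)

# Mirror get_autocomplete_words_list(): expand each hit through the synonym map.
near_words = []
for match in autocomplete_model.search('tes', size=10):
    near_words = near_words + synonyms[match[0]]
print(near_words)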
Example #29
 def test_autocomplete_synonym_part_of_another_word(self):
     words = {'cartoon': {}, 'vehicle': {}}
     synonyms = {'vehicle': ['car']}
     autocomplete = AutoComplete(words=words, synonyms=synonyms)
     result = autocomplete.search(word='ca')
     assert [['vehicle'], ['cartoon']] == result
Example #30
from sanic import Blueprint
from sanic import response

from fast_autocomplete import AutoComplete
import json


autocomplete = Blueprint('autocomplete', url_prefix='/autocomplete')


models_directory = './autocomplete'
autocomplete_models = {
    'en': AutoComplete(words=json.load(open(f'{models_directory}/en.json', 'r'))),
    'es': AutoComplete(words=json.load(open(f'{models_directory}/es.json', 'r'))),
    'fr': AutoComplete(words=json.load(open(f'{models_directory}/fr.json', 'r'))),
    'pt': AutoComplete(words=json.load(open(f'{models_directory}/pt.json', 'r')))
}


@autocomplete.post('/')
async def complete(request):
    lang = request.args.get('language', 'en')
    query = request.args.get('text')
    limit = request.args.get('limit', 7)
    model = autocomplete_models.get(lang, autocomplete_models['en'])

    if any(stop in query for stop in ['.', '?', '!']):
        return response.json([])
    elif query.count(' ') >= int(limit):
        return response.json([])