Example #1
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)

        # thesaurus expansion: for each query term, add one noun synonym
        # (the simN.lsp section, index [1][1]) that already appears in the index
        for word in query_as_dict.copy().keys():
            if len(thes.synonyms(word)[1][1]):
                syn = list(thes.synonyms(word)[1][1])[:30]
                for s in syn:
                    if s not in query_as_dict and s in self._indexer.inverted_idx:
                        query_as_dict[s] = 1
                        break

        relevant_docs = self._relevant_docs_from_posting(query_as_dict)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        # print("SE4 top5:")
        # print(ranked_doc_ids[:5])
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
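For context, a minimal sketch of the return structure the snippet above indexes into; this assumes the lin_thesaurus corpus has been downloaded (nltk.download('lin_thesaurus')):

from nltk.corpus import lin_thesaurus as thes

# synonyms(word) with no fileid returns (fileid, synonym_set) pairs, ordered
# simA.lsp (adjectives), simN.lsp (nouns), simV.lsp (verbs), so the [1][1]
# used above is the noun synonym set
for fileid, syns in thes.synonyms("business"):
    print(fileid, len(syns))
print(list(thes.synonyms("business")[1][1])[:5])  # a few noun synonyms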
Example #2
    def teasarous_(self, query):
        new_query = []
        new_query.extend(query)
        try:
            for word in query:
                new_word = list(thesaurus.synonyms(word, fileid="simN.lsp"))
                new_word_1 = list(thesaurus.synonyms(word, fileid="simV.lsp"))
                if len(new_word) > 0:
                    new_query.append(new_word[0])
                if len(new_word_1) > 0:
                    new_query.append(new_word_1[0])
        except Exception:
            pass
            # print("no word in thesaurus")
            # print(len(new_word))
        return new_query
    def expand_query_theasaurus(self, query):
        expand_set = set()
        for term in query:
            sys_list = list(thesaurus.synonyms(term, fileid="simN.lsp"))
            if len(sys_list) > 0 and sys_list[0] not in query:
                expand_set.add(sys_list[0])

        # [query.append(term) for term in expand_set]
        return list(expand_set)
def demo():
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print(f"Similarity score for {word1} and {word2}:")
    print(thes.similarity(word1, word2))
Example #5
def demo():
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
def do_thesaurus(query):

    lowered = []
    toAdd = set()
    # lower-case every word in the query
    for word in query:
        lowered.append(word.lower())

    # go over every word in the query
    for word in lowered:
        counterNoMoreThen4 = 0

        dictionary = thes.synonyms(word)[1][1]

        # find similar expressions and their scores (noun section, simN.lsp)
        listOfScores = thes.scored_synonyms(word)[1][1]
        dictOfScored = dict(listOfScores)
        # print("\n word: ", word)
        # print(dictOfScored)
        # print(dictionary)

        # Go over the thesaurus words
        # for idx, syn in enumerate(dictionary):
        #     related.append(syn)

        # go over the scored dictionary
        for key in dictOfScored:

            # check if similar enough, taking no more than 4 per word
            if dictOfScored[key] > 0.21 and key not in lowered and counterNoMoreThen4 < 4:
                counterNoMoreThen4 += 1

                # if the similar term contains a space
                if ' ' in key:
                    splited = key.split()

                    # add only the relevant terms
                    for term in splited:
                        if term not in lowered:
                            toAdd.add(term)
                else:
                    toAdd.add(key)
            elif counterNoMoreThen4 == 4:
                # too many terms for this word
                continue
        # print("word: ", word, " similar:", list(toAdd))

    # lower-case every term in listToAdd
    listToAdd = list(toAdd)
    for i, term in enumerate(listToAdd):
        listToAdd[i] = term.lower()

    # print("list: ", listToAdd)
    # print("how much: ", len(listToAdd))
    return listToAdd
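A minimal sketch of the scored structure the function above filters, using the same 0.21 cutoff (again assuming the lin_thesaurus corpus is installed):

from nltk.corpus import lin_thesaurus as thes

# scored_synonyms(word)[1][1] yields (synonym, score) pairs from the noun
# section; the function above keeps at most 4 with a score above 0.21 per word
scored = dict(thes.scored_synonyms("business")[1][1])
close = [w for w, s in scored.items() if s > 0.21]
print(close[:4])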
Example #7
def thesaurus(terms):
    extended_terms = set()
    for query_word in terms:
        if query_word == "trump":
            continue
        synonyms = linthesaurus.synonyms(query_word)  # [(fileid, synonym_set), ...]
        for sim, keys in synonyms:
            if len(keys) > 1:
                keys_list = list(keys)
                if len(keys_list) > 2:
                    keys_list = keys_list[:2]  # add only 2 synonyms per section
                extended_terms.update(keys_list)
    return list(extended_terms)
def add_synonyms_to_list(tokens_list):
    """
    Apply thesaurus synonym addition of one synonym per token in the list
    (performance and result relevance are the motivation to limit it to
    one synonym per token).
    Returns the new tokens list including the original tokens and the synonyms.
    """
    out_list = []
    for token in tokens_list:
        out_list.append(token)
        for syn in thes.synonyms(token, fileid="simN.lsp"):
            out_list.append(syn)
            break
    return out_list
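A hedged usage sketch, assuming the function above is defined, thes is the usual lin_thesaurus alias, and the corpus is installed; the input tokens are made up:

from nltk.corpus import lin_thesaurus as thes

tokens = ["economy", "market"]  # hypothetical query tokens
# each original token is followed by at most one simN.lsp synonym, if any
print(add_synonyms_to_list(tokens))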
Example #9
    def extend_query(self):
        word_to_add = []
        if len(self.query) == 0:
            return []
        for word in self.query:
            list_of_thes = thesaurus.synonyms(word)  # [(fileid, synonym_set), ...]
            #print(list_of_thes)
            for i in range(len(list_of_thes)):
                if len(list_of_thes[i][1]) > 1:
                    word_to_add.append(list(list_of_thes[i][1])[0])

        self.query.extend(word_to_add)
        return self.query
    def thesaurus_method(self, query_list):
        """
        This function use thesaurus synonym addition of one synonym per token in the query list.
        There is limitation to one synonym per token.
        Returns the new tokens list including the originn and the synonyms
        """
        new_query_list = []
        for token in query_list:
            for synonym_term in thes.synonyms(token, fileid="simN.lsp"):
                new_query_list.append(synonym_term)
                break
        query_list.extend(new_query_list)

        return query_list
Example #11
    def query_expansion(self, query):
        """
        for each word in query.query_text apply Part Of Speach tagging.
        then, apply thesaurus for finding synonyms of each word in the query.
        expand the query with these synonyms/
        :param query:
        :return:
        """
        query_dict = query.query_dict
        query_length = query.query_length

        thes_dict = {}

        for word in query_dict.keys():
            thes_dict[word] = query_dict[word]
            text = [word]
            word_pos = nltk.pos_tag(text)
            word_pos = self.tag(word_pos[0][1])

            word_list_thesaurus = thesaurus.synonyms(word)

            if word_list_thesaurus:

                word_to_switch_list = []
                max_counter = 10
                chosen_words = []

                if word_pos == "ADJ":
                    word_to_switch_list = word_list_thesaurus[0][1]
                elif word_pos == "NOUN" or word_pos == "PROPN":
                    word_to_switch_list = word_list_thesaurus[1][1]
                elif word_pos == "VERB":
                    word_to_switch_list = word_list_thesaurus[2][1]

                for token in word_to_switch_list:
                    if len(chosen_words) == max_counter:
                        break
                    split_token = token.split(" ")
                    if len(split_token) > 1:
                        continue

                    if token in self._indexer.inverted_idx and token not in query_dict.keys():
                        chosen_words.append(token)

                for words in chosen_words:
                    thes_dict[str(words)] = query_dict[word]

        query.query_length = len(thes_dict)
        query.query_dict = thes_dict
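The self.tag helper is not shown above; a sketch of the Penn-Treebank-to-Universal mapping it presumably performs, using NLTK's built-in tagset mapping (requires the averaged_perceptron_tagger and universal_tagset resources):

import nltk
from nltk.tag import map_tag

tok, ptb_tag = nltk.pos_tag(["economy"])[0]     # e.g. ('economy', 'NN')
print(map_tag("en-ptb", "universal", ptb_tag))  # -> 'NOUN'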
Example #12
def get_synonym(word):
    """
    Finds a term's synonym using the Lin thesaurus.
    :param word: a (term, POS-tag) tuple
    :return: the first synonym from the matching section, or None
    """
    synonyms_types = lt.synonyms(word[0])
    pos_tag = word[1]
    if pos_tag.startswith('J'):
        synonyms_list = list(synonyms_types[0][1])  # adjectives, simA.lsp
    elif pos_tag.startswith('V'):
        synonyms_list = list(synonyms_types[2][1])  # verbs, simV.lsp
    else:
        synonyms_list = list(synonyms_types[1][1])  # nouns, simN.lsp
    if len(synonyms_list) > 0:
        return synonyms_list[0]
    return None
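A hedged usage sketch of get_synonym, assuming lt is the lin_thesaurus alias used above and the POS tagger resources are installed:

import nltk
from nltk.corpus import lin_thesaurus as lt

tagged = nltk.pos_tag(["acquire"])  # e.g. [('acquire', 'VB')]
print(get_synonym(tagged[0]))       # a 'V' tag selects the verb section, simV.lsp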
Example #13
File: lin.py Project: sp00/nltk
def demo():
    from nltk.corpus import lin_thesaurus as thes

    word1 = "business"
    word2 = "enterprise"
    print("Getting synonyms for " + word1)
    print(thes.synonyms(word1))

    print("Getting scored synonyms for " + word1)
    print(thes.scored_synonyms(word1))

    print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.synonyms(word1, fileid="simN.lsp"))

    print("Getting scored synonyms from simN.lsp (noun subsection) for " + word1)
    print(thes.scored_synonyms(word1, fileid="simN.lsp"))

    print("Similarity score for %s and %s:" % (word1, word2))
    print(thes.similarity(word1, word2))
Example #14
    def get_term_synonym(self, tagged_term):
        chosen_syn = None
        try:
            syn_types = thesaurus.synonyms(tagged_term[0])
            part_of_speech = tagged_term[1]
            if part_of_speech.startswith('V'):
                syn_type = syn_types[2]  # verbs, simV.lsp
            elif part_of_speech.startswith('J'):
                syn_type = syn_types[0]  # adjectives, simA.lsp
            else:
                syn_type = syn_types[1]  # nouns, simN.lsp

            if len(syn_type[1]) > 0:
                chosen_syn = list(syn_type[1])[0]

        except Exception:
            return chosen_syn
        return chosen_syn
Example #15
    def search(self, query, k=None, methods=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        # spell corrections
        if 1 in methods:
            spell = SpellChecker()
            query = ' '.join(
                [spell.correction(word) for word in query.split()])

        query_terms = self._parser.Tokenize(query).keys()
        extenders = set()

        # wordNet
        if 2 in methods:
            for word in query_terms:
                for ex_word in self.wordNet(word.text):
                    extenders.add(self._parser.add_to_dict(ex_word))

        # lin_thesaurus
        if 3 in methods:
            for word in query_terms:
                for ex_word in list(thes.synonyms(
                        word.text)[1][1])[:self._the_count]:
                    extenders.add(self._parser.add_to_dict(ex_word))

        extenders = {extender for extender in extenders if extender}
        w_of_term_in_query = self.CalculateW(query_terms, extenders)

        relevant_docs = self._relevant_docs_from_posting(
            w_of_term_in_query.keys())
        ranked_doc_ids = self._ranker.rank_relevant_docs(
            relevant_docs, k, w_of_term_in_query)

        return len(ranked_doc_ids), ranked_doc_ids
Example #16
    def synonyms(words_to_check):
        """
        The method will receive a query after parse, expand it by adding a noun synonym (if exists)
        to each word in the query using the lin_thesaurus module  and return the updated query dictionary
        :param words_to_check: a parsed query {term: tf in dictionary}
        :return: updated query dictionary with added terms
        """
        query_terms = []

        for word in words_to_check:
            synonym_words = thesaurus.synonyms(word)
            if not synonym_words:
                continue
            # take noun words only if exist
            noun_synonyms = synonym_words[1]
            if len(noun_synonyms[1]) > 0:
                # take highest fit noun word
                noun_word = list(noun_synonyms[1])[0]
                query_terms.append(noun_word)
        Thesaurus.add_to_dict(words_to_check, query_terms)
        return words_to_check
Example #17
    def __init__(self, indexer):
        self._indexer = indexer
        w = thesaurus.synonyms("")  # dummy call, presumably to pre-load the thesaurus corpus
Example #18
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        # parse query according to the same parsing rules of the corpus
        entities = {}
        term_dict = {}
        parsed_query = self._parser.parse_sentence(query,
                                                   entities,
                                                   stemming=self.stemming)
        self._parser.parse_capital_letters(parsed_query, term_dict)
        processed_query = [*term_dict.keys()] + [*entities.keys()]

        # perform spell correction
        if self.spell_correction:

            from spellchecker import SpellChecker
            spell_checker = SpellChecker()
            corrected_terms = []

            # list all misspelled terms in the query
            misspelled_terms = spell_checker.unknown([*term_dict.keys()])
            for term in misspelled_terms:

                # only correct terms that aren't in the inverted dictionary
                # terms in the dictionary are considered correct for retrieval
                if term not in self._indexer.inverted_idx:
                    # candidates() returns a set, so convert before slicing
                    candidates = list(spell_checker.candidates(term))
                    max_to_return = min(Searcher.TOP_N, len(candidates))
                    candidates = candidates[:max_to_return]  # return only the top results
                    if term in candidates:  # remove duplicate originally correct terms
                        candidates.remove(term)

                    # remove corrections already in the query (iterate over a copy,
                    # since removing while iterating skips elements)
                    for candidate in list(candidates):
                        if candidate in parsed_query:
                            candidates.remove(candidate)

                    corrected_terms.extend(candidates)

            processed_query += corrected_terms  # extend query with corrected words

        if self.thesaurus:

            from nltk.corpus import lin_thesaurus as thes

            candidates = []
            for term in processed_query:

                synsets = thes.synonyms(term)
                for synset in synsets:
                    synonyms = [*synset[1]]
                    if len(synonyms) > 0:
                        max_to_return = min(Searcher.TOP_N, len(synonyms))
                        best_synonyms = synonyms[:max_to_return]
                        for synonym in best_synonyms:
                            if synonym != term and synonym not in processed_query and synonym in self._indexer.inverted_idx:
                                candidates.append(synonym)  # extend the query
                        break

            processed_query += candidates

        if self.wordnet:

            from nltk.corpus import wordnet

            print("wordenting")
            candidates = []
            for term in processed_query:
                print(f"term {term}:")
                synsets = wordnet.synsets(term)  # retrieve best synsets
                max_to_return = min(Searcher.TOP_N, len(synsets))
                synsets = synsets[0:max_to_return]
                print("returned synsets")
                skip = False
                for synset in synsets:
                    for lemma in synset.lemmas()[:max_to_return]:  # possible synonyms
                        name = lemma.name()
                        print(f"possible lemma: {name}")
                        if name and name != term and name not in processed_query:
                            # accept the lemma if any casing of it is in the index
                            if (name in self._indexer.inverted_idx
                                    or name.lower() in self._indexer.inverted_idx
                                    or name.upper() in self._indexer.inverted_idx):
                                candidates.append(name)
                                print(f"appended {name}")
                                skip = True
                                break

                    if skip:
                        break

            processed_query += candidates

        # dictionary for holding all relevant documents (at least one query term appeared in the document)
        # format: {document_id: score}
        relevant_docs = {}
        for term in processed_query:

            # check if term exists in inverted dictionary in either lower or upper form
            if term in self._indexer.inverted_idx:
                self.calculate_doc_scores(term, relevant_docs)
            elif term.islower() and term.upper() in self._indexer.inverted_idx:
                self.calculate_doc_scores(term.upper(), relevant_docs)
            elif term.isupper() and term.lower() in self._indexer.inverted_idx:
                self.calculate_doc_scores(term.lower(), relevant_docs)

        n_relevant = len(relevant_docs)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)

        return n_relevant, ranked_doc_ids
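A minimal sketch of the pyspellchecker calls used in the spell-correction branch above; note that candidates() returns a set, which is why the code converts it to a list before slicing:

from spellchecker import SpellChecker

spell = SpellChecker()
for word in spell.unknown(["vacine", "covid"]):  # likely {'vacine'}
    print(word, "->", spell.correction(word))
    print(sorted(spell.candidates(word))[:3])    # candidates() is a set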
Example #19
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # all_dicts = self._indexer.load_index('inverted_idx.pkl')
        inverted_index = self._indexer.inverted_idx
        posting = self._indexer.postingDict
        documents = self._indexer.documents
        dict_of_methods = self._indexer.dict_of_method

        if dict_of_methods['wordnet']:
            #wordnet method
            doc_query_app = self.finished_dict(query, inverted_index) #  first parse query words
            list_of_query = doc_query_app.keys()
            words_to_add = {}
            # get each query word its synsets and add to query the ones that in inverted index
            for word in list_of_query:
                opt = wordnet.synsets(word)
                for i in range(len(opt)):
                    check_word = opt[i].lemmas()[0].name()
                    if check_word in doc_query_app.keys() or check_word in words_to_add.keys():
                        continue
                    tested = self._indexer.check_upper_lower(inverted_index, check_word)
                    if tested[1] is False or tested[0] in doc_query_app.keys() or tested[0] in words_to_add.keys():
                        continue
                    if tested[1] is True:
                        words_to_add[tested[0]] = 0.0001
                    elif tested[1] == 'replace':
                        words_to_add[tested[0].upper()] = 0.0001
            doc_query_app.update(words_to_add)

        elif dict_of_methods['spell_correction']:
            spell = SpellChecker(case_sensitive=True)
            query_as_list = query.split()
            for index in range(len(query_as_list)):
                is_upper = False
                word = query_as_list[index]
                # if word from query not in inverted index look for correction- take the first one that is in inverted index
                if self._indexer.check_upper_lower(inverted_index, word)[1] is False:  # word not in inverted index
                    if word[0].isupper() is True:
                        is_upper = True
                    options = spell.candidates(word)
                    is_found = False
                    i = 0
                    options = list(options)
                    while i < len(options):
                        if self._indexer.check_upper_lower(inverted_index, options[i])[1] is True:
                            corrected = options[i]
                            is_found = True
                            break
                        i += 1
                    # corrected = spell.correction(word)
                    if is_found is not False and corrected != query_as_list[index]:
                        if is_upper is True:
                            corrected = corrected.capitalize()
                        query_as_list[index] = corrected
            doc_query_app = self.finished_dict(" ".join(query_as_list), inverted_index)

        elif dict_of_methods['word2vec']:
            words_to_add = {}
            doc_query_app = self.finished_dict(query, inverted_index)
            query_as_list = query.split()
            insert_new_words = []
            for word in query_as_list:
                if word in self._model.wv.wv.vocab:
                    lst_sim_word_model = self._model.most_similar(word.lower())
                    for similar_word in lst_sim_word_model:
                        if similar_word[1] > 0.33:
                            insert_new_words.append(similar_word[0])

            # if len(insert_new_words) == 0:
            #     continue
            idx = 0
            while idx < len(insert_new_words):
                if insert_new_words[idx] in doc_query_app.keys() or insert_new_words[idx] in words_to_add.keys():
                    idx += 1
                    continue
                tested = self._indexer.check_upper_lower(inverted_index, insert_new_words[idx])
                if tested[1] is False or tested[0] in doc_query_app.keys() or tested[0] in words_to_add.keys():
                    idx += 1
                    continue
                if tested[1] is True:
                    words_to_add[tested[0]] = 0.6
                    break
                elif tested[1] == 'replace':
                    words_to_add[tested[0].upper()] = 0.6
                    break
                idx += 1
            doc_query_app.update(words_to_add)

        elif dict_of_methods['thesaurus']:
            doc_query_app = self.finished_dict(query, inverted_index) #  first parse query words
            list_of_query = list(doc_query_app.keys())
            words_to_add = {}
            # get each query word its synonyms and add to query the first that is in inverted index
            stop = set(stopwords.words('english'))
            results = [thes.synonyms(i, fileid="simN.lsp") for i in list_of_query if i not in stop]
            results_as_list = list(results)
            for words in results_as_list:
                inside_list = list(words)
                if len(inside_list) == 0:
                    continue
                idx = 0
                while idx < len(inside_list):
                    if inside_list[idx] in doc_query_app.keys() or inside_list[idx] in words_to_add.keys():
                        idx += 1
                        continue
                    tested = self._indexer.check_upper_lower(inverted_index, inside_list[idx])
                    if tested[1] is False or tested[0] in doc_query_app.keys() or tested[0] in words_to_add.keys():
                        idx += 1
                        continue
                    if tested[1] is True:
                        words_to_add[tested[0]] = 0.0001
                        break
                    elif tested[1] == 'replace':
                        words_to_add[tested[0].upper()] = 0.0001
                        break
                    idx += 1
            doc_query_app.update(words_to_add)

        else:  # dict_of_methods['parser'] = True
            doc_query_app = self.finished_dict(query, inverted_index)

        if len(doc_query_app) == 0:
            return 0, []  # keep the (n_relevant, ranked_ids) contract

        dict_relevant_docs = self._relevant_docs_from_posting(doc_query_app, posting)
        ranked_doc_ids = Ranker.rank_relevant_docs(dict_relevant_docs, posting, documents, doc_query_app)
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
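Finally, a note on the fileid behavior the thesaurus branch above relies on: with an explicit fileid, synonyms() returns just that section's synonyms rather than (fileid, set) pairs. A minimal check, assuming the corpus is installed:

from nltk.corpus import lin_thesaurus as thes

nouns_only = thes.synonyms("business", fileid="simN.lsp")
print(list(nouns_only)[:3])  # plain synonyms, no (fileid, set) nesting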