def nameCompare(name1, name2):
    """Decide whether two first names refer to the same person's name.

    Names are considered equal when they match exactly, or when a typo
    (edit distance 1) combined with the nicknames database links them.
    Returns True when the names are considered equal, False otherwise.
    """
    if name1 == name2:
        return True

    checker = SpellChecker(distance=1)
    variants1 = checker.edit_distance_1(name1)
    variants2 = checker.edit_distance_1(name2)

    matches1 = set()
    matches2 = set()
    with open('nicknames.csv') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            # Columns 1 and 2 hold a name/nickname pair; the first
            # character of each cell is skipped, as in the original format.
            first = row[1][1:].lower()
            second = row[2][1:].lower()

            if any(v.lower() in (first, second) for v in variants1):
                matches1.update((first, second))
            if any(v.lower() in (first, second) for v in variants2):
                matches2.update((first, second))

            # Early exit as soon as both names share a nickname candidate.
            if matches1 & matches2:
                return True
        return False
# Ejemplo n.º 2
# 0
class Spell_Searcher:
    """Query-time spell correction backed by an inverted index."""

    def __init__(self, indexer):
        # The indexer exposes .inverted_idx, mapping a term to its
        # corpus frequency.
        self._indexer = indexer
        self.spell = None

    def query_expansion(self, query):
        """
        Replace misspelled query terms with their best corrections.

        For every term absent from the inverted index (in either case),
        candidate corrections (edit distance 1 plus the spell checker's own
        candidates) are ranked by their frequency in the inverted index,
        and the most frequent one replaces the original term.
        :param query: query object exposing a ``query_dict`` mapping
        :return: the (mutated) query dictionary with corrected terms
        """
        try:
            self.spell = SpellChecker(local_dictionary='spell_dict.json',
                                      distance=1)
        except Exception:
            # Best effort: keep whatever checker (if any) is already set.
            pass

        query_dict = query.query_dict
        # Iterate over a snapshot of the keys: the loop body pops and
        # inserts entries, and mutating a dict while iterating it directly
        # is undefined behavior (original code did exactly that).
        for term in list(query_dict):
            if (term.lower() in self._indexer.inverted_idx
                    or term.upper() in self._indexer.inverted_idx):
                continue

            if not self.spell.unknown([term]):
                continue

            candidates = list(self.spell.edit_distance_1(term))
            candidates.extend(self.spell.candidates(term))

            max_freq_in_corpus = 0
            max_freq_name = ''
            for candidate in candidates:
                if candidate in self._indexer.inverted_idx:
                    curr_freq = self._indexer.inverted_idx[candidate]
                elif candidate.upper() in self._indexer.inverted_idx:
                    curr_freq = self._indexer.inverted_idx[candidate.upper()]
                else:
                    continue
                if curr_freq > max_freq_in_corpus:
                    max_freq_in_corpus = curr_freq
                    max_freq_name = candidate

            if max_freq_name != '':
                # Re-key the entry under the chosen correction.
                query_dict[max_freq_name] = query_dict.pop(term)

        return query_dict

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        # spell checker
        query_as_list = self._parser.parse_sentence(query)
        # BUG FIX: __init__ stores the indexer as self._indexer;
        # the original self.indexer raised AttributeError.
        inverted_idx = self._indexer.inverted_idx
        spell = SpellChecker()
        misspelled = spell.unknown(query_as_list)
        assist = [x.lower()
                  for x in query_as_list]  # all the query terms in lower case

        for word in misspelled:
            # Already indexed (either case) or multi-word: no correction needed.
            if (word.upper() in inverted_idx or word.lower() in inverted_idx
                    or ' ' in word):
                continue

            word_idx = assist.index(word)
            # All suggested corrections with distance value 1.
            corrections = spell.edit_distance_1(word)
            corrections_dict = {}
            # Keep only corrections present in the inverted index, with the
            # corpus frequency of each one.
            for correction in corrections:
                if correction.upper() in inverted_idx:
                    corrections_dict[correction] = inverted_idx[
                        correction.upper()]
                if correction.lower() in inverted_idx:
                    corrections_dict[correction] = inverted_idx[
                        correction.lower()]

            if corrections_dict:
                # Choose the most common correction in the corpus.
                query_as_list[word_idx] = max(corrections_dict,
                                              key=corrections_dict.get)
            else:
                # Fall back to the spell checker's single best suggestion.
                query_as_list[word_idx] = spell.correction(word)

        new_query = ' '.join(query_as_list)
        relevant_docs = searcher.search(new_query)
        return relevant_docs
def SurNameCompare(name1, name2):
    """Decide whether two surnames refer to the same name.

    Surnames are considered equal when they match exactly, or when a typo
    (edit distance 1) links both to the same entry of the surnames
    database. Returns True when equal, False otherwise.
    """
    if name1 == name2:
        return True

    checker = SpellChecker(distance=1)
    variants1 = checker.edit_distance_1(name1)
    variants2 = checker.edit_distance_1(name2)

    hits1 = set()
    hits2 = set()
    with open('surnames.csv') as csv_file:
        for row in csv.reader(csv_file, delimiter=' '):
            surname = row[0].lower()

            if any(v.lower() == surname for v in variants1):
                hits1.add(surname)
            if any(v.lower() == surname for v in variants2):
                hits2.add(surname)

            # Early exit as soon as both names share a surname candidate.
            if hits1 & hits2:
                return True
        return False
# Ejemplo n.º 5
# 0
 def test_checking_odd_word(self):
     """Verify that a purely numeric token yields itself as the only
     edit-distance-1 result."""
     checker = SpellChecker()
     variants = checker.edit_distance_1('12345')
     self.assertEqual(variants, {'12345'})
# Ejemplo n.º 6
# 0
class Searcher:
    """Loads posting lists for a query and builds per-document tf-idf vectors."""

    def __init__(self, inverted_index, number_of_documents, load_path):
        """
        :param inverted_index: dictionary of inverted index; values are
            indexable — position 0 holds a frequency, position 1 the posting
            file name, position 2 the document frequency
        :param number_of_documents: number of documents in the corpus
        :param load_path: directory that contains the pickled posting files
        """
        self.parser = Parse()
        self.ranker = Ranker()
        self.inverted_index = inverted_index
        self.current_file_name = ""   # name of the posting file currently cached
        self.current_posting = None   # contents of that posting file

        self.term_posting_dict = {}   # query term -> its posting list
        self.sorted_query_dict = {}   # query terms, alphabetically sorted
        self.number_of_documents = number_of_documents
        self.docs_dict = {}           # doc id -> tf-idf vector (one slot per query term)
        self.spell = SpellChecker(local_dictionary='spell_dict.json',
                                  distance=1)

        # NOTE(review): Windows-style separator; os.path.join would be portable.
        self.load_path = load_path + "\\{}"

    def relevant_docs_from_posting(self, query):
        """
        This function loads the posting list and counts the amount of relevant
        documents per term.
        :param query: query object exposing ``query_dict`` and ``query_length``
        :return: dictionary of relevant documents (doc id -> tf-idf vector).
        """
        query_dict = query.query_dict
        query_dict = self.spell_correction(query_dict)

        # Iterate over a snapshot of the keys: the body pops and re-inserts
        # entries, and mutating a dict while iterating it directly is
        # undefined behavior (the original code did exactly that).
        for term in list(query_dict):
            if term in self.inverted_index:
                continue
            # Swap the term's case when only the other-cased form is indexed.
            if term.isupper() and term.lower() in self.inverted_index:
                query_dict[term.lower()] = query_dict.pop(term)
            elif term.islower() and term.upper() in self.inverted_index:
                query_dict[term.upper()] = query_dict.pop(term)

        self.sorted_query_dict = {k: query_dict[k] for k in sorted(query_dict)}
        for term in self.sorted_query_dict:
            if term not in self.inverted_index:
                continue
            posting_file_to_load = self.inverted_index[term][1]

            # Cache the posting file: consecutive sorted terms often share one.
            if posting_file_to_load != self.current_file_name:
                self.current_file_name = posting_file_to_load
                self.current_posting = self.read_posting(posting_file_to_load)

            if term in self.current_posting:
                self.term_posting_dict[term] = self.current_posting[term]

        self.document_dict_init(self.term_posting_dict, query.query_length)

        return self.docs_dict

    def spell_correction(self, query_dict):
        """
        This function finds a misspelled word and finds its closest similarity.
        first by tracking all of its candidates. the candidate with the most
        appearances in the inverted index will be the "replacer"
        :param query_dict: query dictionary (term -> frequency)
        :return: query dictionary with replaced correct words.
        """
        # Snapshot the keys: entries are popped/inserted inside the loop.
        for term in list(query_dict):
            if (term.lower() in self.inverted_index
                    or term.upper() in self.inverted_index):
                continue

            if not self.spell.unknown([term]):
                continue

            candidates = list(self.spell.edit_distance_1(term))
            candidates.extend(self.spell.candidates(term))

            max_freq_in_corpus = 0
            max_freq_name = ''
            for candidate in candidates:
                if candidate in self.inverted_index:
                    curr_freq = self.inverted_index[candidate][0]
                elif candidate.upper() in self.inverted_index:
                    curr_freq = self.inverted_index[candidate.upper()][0]
                else:
                    continue
                if curr_freq > max_freq_in_corpus:
                    max_freq_in_corpus = curr_freq
                    max_freq_name = candidate

            if max_freq_name != '':
                # Re-key the entry under the chosen correction.
                query_dict[max_freq_name] = query_dict.pop(term)

        return query_dict

    def read_posting(self, posting_name):
        """
        This function seeks for the file name and reads it from the disk.
        :param posting_name: file name
        :return: posting file
        """
        # with-statement guarantees the handle is closed even if
        # pickle.load raises (the original leaked it on error).
        with open(self.load_path.format(posting_name), "rb") as pickle_in:
            return pickle.load(pickle_in)

    def document_dict_init(self, postings_dict, query_length):
        """
        This function initiates the sorted dictionary that will contain each
        term of the query and its corresponding posting list.
        :param postings_dict: a dictionary of term (key) and a posting list (value)
        :param query_length: query length
        :return:
        """
        sorted_posting_dict = {
            k: postings_dict[k]
            for k in sorted(postings_dict)
        }

        for idx, (term, doc_list) in enumerate(sorted_posting_dict.items()):
            for doc_tuple in doc_list:
                if doc_tuple[0] not in self.docs_dict:
                    # Fresh list per document: sharing one list object
                    # across documents would alias their score vectors.
                    self.docs_dict[doc_tuple[0]] = [0] * query_length

                try:
                    dfi = self.inverted_index[term][2]
                except KeyError:
                    # Fall back to the lower-cased form of the term.
                    dfi = self.inverted_index[term.lower()][2]

                idf = math.log(self.number_of_documents / dfi, 10)
                # doc_tuple layout: (doc id, ?, term frequency) — position 2
                # is used as tf here, as in the original code.
                tf_idf = idf * doc_tuple[2]

                self.docs_dict[doc_tuple[0]][idx] = tf_idf

    def normalized_query(self, query):
        """
        This function normalizes each term in the query by the max term freq
        in the SORTED query dict.
        :param query: a query object exposing ``max_freq_term``
        :return: list of normalized term-frequency values (sorted-term order)
        """
        max_freq_term = query.max_freq_term
        return [self.sorted_query_dict[key] / max_freq_term
                for key in self.sorted_query_dict]