Ejemplo n.º 1
0
 def test_edit_distance_two(self):
     ''' test a case where edit_distance_2 is called '''
     here = os.path.dirname(__file__)
     filepath = '{}/resources/small_dictionary.json'.format(here)
     spell = SpellChecker(language=None, local_dictionary=filepath)
     self.assertEqual(spell.edit_distance_2('hie'), ['bike'])
def query_expansion_thesaurus(query_str):
    """
    Params:
        - query_str: Original input query string

    Returns:
        - new_query: List of expanded query terms and weights [ (term, weight), ...]
    """

    query_str = query_str.replace('AND', '').replace('"', '')
    query_terms = set([query_term.strip() for query_term in query_str.split(' ')])

    term_weights = defaultdict(lambda: 0.0)
    new_query_terms = list(query_terms)

    ORIGINAL_TERM_WEIGHT = 1
    SYNONYM_TERM_WEIGHT = 0.5
    MISSPELLING_TERM_WEIGHT = 0.6
    CONTEXT_TERM_WEIGHT = 0.7


    for term in query_terms:
        term_weights[term] = ORIGINAL_TERM_WEIGHT


    # Add synonyms to query
    FRACTION_SYNONYMS = 0.35

    synonyms = []
    for term in query_terms:
        for syn in wordnet.synsets(term):
            num_synonyms = math.ceil(len(syn.lemmas()) * FRACTION_SYNONYMS)

            for l in syn.lemmas()[:num_synonyms]:
                for t in l.name().split("_"):
                    synonyms.append(t)
                    term_weights[t] = max(SYNONYM_TERM_WEIGHT, term_weights[t])

        # Use original query context to find suitable synonyms
        context_syn = lesk(query_str, term)
        if context_syn is not None:
            for l in context_syn.lemmas():
                for t in l.name().split("_"):
                    synonyms.append(t)
                    term_weights[t] = max(CONTEXT_TERM_WEIGHT, term_weights[t])

    new_query_terms.extend(synonyms)


    # Add spelling corrections
    spell = SpellChecker()

    unknown_misspelled_terms = spell.unknown(query_terms)
    for term in unknown_misspelled_terms:
        new_query_terms.append(spell.correction(term))  # Add the most likely correct word

    known_misspelled_terms = spell.known(query_terms)
    for mispelled_term in known_misspelled_terms:
        corrected_terms = spell.known(spell.edit_distance_2(mispelled_term))

        probs = []
        for term in corrected_terms:
            probs.append((term, spell.word_probability(term)))

        probs = sorted(probs, key=lambda x: x[1], reverse=True)
        new_corrected_terms = [term[0] for term in probs]

        new_query_terms.extend(new_corrected_terms[:3])  # Add top 3 corrections
        for t in new_corrected_terms[:3]:
            term_weights[t] = max(MISSPELLING_TERM_WEIGHT, term_weights[t])


    # Preprocess terms and assign weights to the expanded query terms
    new_query = defaultdict(lambda: 0.0)
    for term in set(new_query_terms):
        for new_term in set(util.preprocess_content(term)):
            new_query[new_term] = max(term_weights[term], new_query[new_term])

    new_query = [(term, weight) for term, weight in new_query.items()]

    return new_query