Example #1
    def generate_chunks(self, keywords):
        """
        params: a tokenized list of keywords (e.g. ["a b c", 'a', 'b'])
        returns: a list of fields matching a combination of nearby keywords

        .. doctest::

            {
                '[result_type]':
                    [ matched_field, ...]
            }
        """

        if not get_setting('SERVICE_RESULT_FIELDS'):
            return {}

        matches = self.get_phrase_matches(keywords)
        self.append_subquery_matches(keywords, matches)
        # enrich, sort, and prune the matches (per result type)
        for entity, m_list in matches.items():
            for match in m_list:
                last_token = match['tokens_required'][-1]
                tokens_used = match['tokens_required']
                match['predicate'] = get_operator_and_param(last_token)
                match['field_name'] = match['field']['name']
                match['tokens_required_non_stopw'] = \
                    filter_stopwords(tokens_used)
                match['tokens_required_set'] = set(tokens_used)

            m_list.sort(key=lambda f: f['score'], reverse=True)

            # IR-based matching is still fairly naive, so prune redundant
            # matches: a match that needs a superset of another match's
            # tokens for the same field, yet scores no better, adds nothing
            purge = []
            for m1 in m_list:
                for m2 in m_list:
                    tokens1 = m1['tokens_required_set']
                    tokens2 = m2['tokens_required_set']
                    if (m2 != m1 and m1['field_name'] == m2['field_name']
                            and tokens1.issubset(tokens2)
                            and m1['score'] + 0.01 >= m2['score']):
                        # mark a useless match for deletion
                        purge.append(m2)
            matches[entity] = [match for match in m_list if match not in purge]

        normalize_scores(matches)

        # if enabled, prune low scoring chunks
        if get_setting('RESULT_FIELD_CHUNKER_PRUNE_LOW_TERMS'):
            cutoff = get_setting('RESULT_FIELD_CHUNKER_PRUNE_LOW_TERMS')
            for key in matches:
                matches[key] = [
                    match for match in matches[key] if match['score'] > cutoff
                ]

        print_debug(matches)
        return matches
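
To make the return shape concrete, here is a minimal usage sketch. The `FieldChunker` class name and the keyword list are illustrative assumptions; only `generate_chunks()` itself and the per-match keys it sets (`field_name`, `score`, `tokens_required`) come from the example above.

    # Minimal usage sketch (hypothetical wrapper class and inputs).
    chunker = FieldChunker()  # assumed class exposing generate_chunks()
    matches = chunker.generate_chunks(["dataset location", "dataset", "location"])

    for result_type, fields in matches.items():
        for field in fields:
            # each surviving match carries the keys derived in generate_chunks()
            print(result_type, field['field_name'], field['score'],
                  field['tokens_required'])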
Example #2
def string_distance(keyword, match_to, allow_low_scores=False):
    """
    Basic string-edit distance metrics do not perform well,
    they either introduce too many false positives (file as site), or do not
    recognize fairly similar words, such as 'config' vs 'configuration'.

    Therefore, to minimize the false positives (which have direct effect
    to ranking), we use a combination of more trustful metrics
    listed in the order decreasing score:
    * full match
    * lemma match (e.g. only the word number differs)
    * stem match
    * stem match within a small edit distance (returning a low usable score)
        e.g. 1-2 characters differing, maximum 1 mutation
    """

    if keyword == match_to:
        return 1.0

    lemma = lemmatize(keyword)
    lemma2 = lemmatize(match_to)
    if lemma == lemma2:
        return 0.9

    if get_setting('STRING_DIST_ENABLE_NLTK_STEM'):
        kwd_stem = getstem(keyword)
        match_stem = getstem(match_to)

        if kwd_stem == match_stem:
            return 0.7
    else:
        kwd_stem = keyword
        match_stem = match_to

    score = 0.7 * levenshtein_norm(kwd_stem, match_stem, subcost=2, maxcost=3)

    if allow_low_scores:
        return score if score > 0.1 else 0.0
    else:
        return score if score > 0.35 else 0.0
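
The helpers `lemmatize`, `getstem`, and `levenshtein_norm` are defined elsewhere in the project and are not shown on this page. As a rough illustration only, the sketch below shows one way a normalized, capped edit distance consistent with the `subcost=2, maxcost=3` call could look; the project's real helper may behave differently.

    def levenshtein_norm(a, b, subcost=2, maxcost=3):
        """Illustrative sketch only, not the project's implementation.

        Normalized edit similarity in [0, 1]: insertions/deletions cost 1,
        substitutions cost `subcost`, and any pair whose total edit cost
        exceeds `maxcost` scores 0.0, mirroring the "1-2 characters
        differing, at most 1 mutation" rule in the docstring above.
        """
        rows, cols = len(a) + 1, len(b) + 1
        dist = [[0] * cols for _ in range(rows)]
        for i in range(rows):
            dist[i][0] = i                  # i deletions from `a`
        for j in range(cols):
            dist[0][j] = j                  # j insertions into `a`
        for i in range(1, rows):
            for j in range(1, cols):
                sub = 0 if a[i - 1] == b[j - 1] else subcost
                dist[i][j] = min(dist[i - 1][j] + 1,        # deletion
                                 dist[i][j - 1] + 1,        # insertion
                                 dist[i - 1][j - 1] + sub)  # substitution
        cost = dist[rows - 1][cols - 1]
        if cost > maxcost:
            return 0.0
        # cost 0 -> 1.0, cost == maxcost -> a low but non-zero score
        return 1.0 - cost / (maxcost + 1.0)

Under this sketch, stems one deletion apart still clear the default cutoff (0.7 * 0.75 = 0.525 > 0.35), while a single substitution (cost 2, giving 0.7 * 0.5 = 0.35) only survives when allow_low_scores is set.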