Ejemplo n.º 1
0
def attributive_of_noun(sense, main_sense_of_entry):
    """
    Classify an attributive sense of a noun as the adjective ('JJ')
    equivalent of a noun sense.

    The noun sense used is the one targeted by an internal cross-reference
    (same entry, no lemma) if the sense has one; otherwise it is the main
    sense of the entry.

    Arguments:
        sense -- the attributive sense being classified; its
            cross_references, entry_id and lemma are read.
        main_sense_of_entry -- fallback noun sense, or None.

    Returns the result of tdb.equivalent_class(), tagged with reason_text
    and reason_code 'attb', or None if no classification can be derived
    (including the case where tdb.equivalent_class() itself returns None).
    """
    # Only internal cross-references to a main sense count: same entry,
    #  and no lemma on the cross-reference.
    xrefs = [xr for xr in sense.cross_references
             if xr.refentry == sense.entry_id and xr.lemma is None]
    if xrefs:
        target_sense = tdb.highest_ranked(lemma=sense.lemma,
                                          wordclass='NN',
                                          refentry=xrefs[0].refentry,
                                          refid=xrefs[0].refid)
        if target_sense is not None:
            # A cross-referenced sense was found, so it takes precedence
            #  over the main sense even if it yields nothing.
            if target_sense.thesclass is None:
                return None
            equiv = tdb.equivalent_class(target_sense.thesclass, 'JJ')
            # Guard against there being no adjective equivalent, as the
            #  other callers of equivalent_class() do.
            if equiv is not None:
                equiv.reason_text = 'Adjective equivalent of cross-referenced noun sense'
                equiv.reason_code = 'attb'
            return equiv

    # ... otherwise, default to the main sense of the entry
    if main_sense_of_entry is None or main_sense_of_entry.thesclass is None:
        return None
    equiv = tdb.equivalent_class(main_sense_of_entry.thesclass, 'JJ')
    if equiv is not None:
        equiv.reason_text = 'Adjective equivalent of main noun sense'
        equiv.reason_code = 'attb'
    return equiv
Ejemplo n.º 2
0
def infer_derivative(sense, main_sense_of_entry):
    """
    Infer a thesaurus class for a derivative sense.

    First choice is the equivalent (in the sense's own wordclass) of the
    class of the parent entry's main sense, provided that main sense is
    not an affix and is classified. Failing that, the class is taken from
    the superclass most common among the entry's other classified,
    single-word derivatives.

    Returns the chosen class tagged with reason_text and reason_code
    'driv', or None if neither route yields anything.
    """
    main = main_sense_of_entry
    if (main is not None and
            not main.is_affix() and
            main.thesclass is not None):
        # Take precedent from the main sense of the parent entry
        parallel = tdb.equivalent_class(main.thesclass, sense.wordclass)
        if parallel is not None:
            parallel.reason_text = 'Parallel to "%s"' % main.lemma
            parallel.reason_code = 'driv'
            return parallel

    # Take precedent from other sibling derivatives
    siblings = [
        s for s in tdb.ranked_search(refentry=sense.entry_id,
                                     thes_linked=True,
                                     currentOnly=True)
        if s.is_derivative() and
        ' ' not in s.lemma and
        s.superclass() is not None
    ]
    if not siblings:
        return None

    # Count how many siblings sit under each superclass
    tally = defaultdict(int)
    for sibling in siblings:
        tally[sibling.superclass()] += 1
    top_count = max(tally.values())

    # If several superclasses tie for the highest count, prefer the one
    #  with the largest branch
    parents = sorted(
        (parent for parent, count in tally.items() if count == top_count),
        key=lambda parent: parent.branch_size,
        reverse=True)
    winner = parents[0]

    result = tdb.child_wordclass_branch(winner, sense.wordclass)
    if result is None:
        result = winner
    if result is not None:
        quoted = ['"%s"' % (s.lemma,) for s in siblings if s.lemma]
        result.reason_text = 'Parallel to %s' % ', '.join(quoted)
        result.reason_code = 'driv'
    return result
def _append_derivational_classification(best_guesses, sense):
    """
    Add a guess based on the root form when the compound is a derivative
    of another compound.

    compound_derivative(sense) supplies the base compound's class and an
    insert position. The equivalent class in the sense's wordclass is
    added to best_guesses (in place) at the front for 'first' or at the
    end for 'last', unless an existing guess's target is already a
    descendant of it.

    Returns the best_guesses list that was passed in.
    """
    base_class, position = compound_derivative(sense)
    if base_class is None:
        return best_guesses

    equiv = tdb.equivalent_class(base_class, sense.wordclass)
    if equiv is None or equiv.wordclass is None:
        return best_guesses

    # Nothing to add if an existing guess already falls under this class
    already_covered = [guess.target.is_descendant_of(equiv)
                       for guess in best_guesses]
    if any(already_covered):
        return best_guesses

    # Dummy score, calculated to sit just above the current top guess or
    #  just below the current bottom guess
    if not best_guesses:
        score = 1
    elif position == 'first':
        score = best_guesses[0].target_score * 1.1
    else:
        score = best_guesses[-1].target_score * 0.9
    derivative_guess = BestGuess(None, equiv, score, 'derivative')

    # Any position other than 'first' or 'last' leaves the list unchanged
    if position == 'first':
        best_guesses.insert(0, derivative_guess)
    elif position == 'last':
        best_guesses.append(derivative_guess)
    return best_guesses
Ejemplo n.º 4
0
def infer_from_etyma(sense, subjectFilter=False):
    """
    Infer a thesaurus class by analogy with the sense's etymon.

    An etymon is usable when the sense has a single etymon identical to
    its own lemma, or has exactly two etyma consisting of an alphabetic
    base followed by a suffix that is either neutral (according to
    deriv_tester) or one of '-ist', '-ian', '-ful' on an adjective.

    Arguments:
        sense -- the sense being classified.
        subjectFilter -- if true, the fallback lookup is restricted by
            the sense's subjects and to entries with entry_size < 100;
            if false, the etymon entry's main sense is used instead.

    Returns the matched class tagged with reason_code 'etym', or None.
    """
    etyma = sense.etyma
    etymon = None
    if len(etyma) == 1 and etyma[0][0] == sense.lemma:
        etymon = etyma[0]
    elif (len(etyma) == 2 and
            re.search(r'^[a-zA-Z]+$', etyma[0][0]) and
            re.search(r'^-[a-z]+$', etyma[1][0])):
        suffix = etyma[1][0]
        adjectival = (suffix in ('-ist', '-ian', '-ful') and
                      sense.wordclass == 'JJ')
        if deriv_tester.is_neutral_suffix(suffix) or adjectival:
            etymon = etyma[0]

    if etymon is None:
        return None
    base_lemma, base_entry, base_node = etymon[0], etymon[1], etymon[2]

    # First try to find the exact sense, in case the etymon points to a
    #  specific sense - see e.g. lam n./3
    target = tdb.highest_ranked(lemma=base_lemma,
                                refentry=base_entry,
                                refid=base_node,
                                exact_sense=True)
    if target is None:
        # ...but if the etymon just points to an entry in general, fall
        #  back to a sense chosen for that entry as a whole
        if not subjectFilter:
            target = main_sense_finder.main_sense(lemma=base_lemma,
                                                  refentry=base_entry)
        else:
            by_subject = tdb.highest_ranked(lemma=base_lemma,
                                            refentry=base_entry,
                                            subjects=sense.subjects)
            if by_subject is not None and by_subject.entry_size < 100:
                target = by_subject

    if target is None:
        return None

    # Check if the target is also referenced in the sense's
    #  cross-references (in case a particular sense is pointed to,
    #  as in 'nocturning'). Only the first matching cross-reference is
    #  considered.
    for xr in sense.cross_references:
        if xr.lemma != target.lemma or xr.refentry != target.refentry:
            continue
        pinpointed = tdb.highest_ranked(lemma=xr.lemma,
                                        refentry=xr.refentry,
                                        refid=xr.refid,
                                        exact_sense=True)
        if pinpointed is not None:
            target = pinpointed
        break

    if target.thesclass is None:
        return None

    if target.wordclass == sense.wordclass:
        match = target.thesclass.wordclass_parent()
    else:
        match = tdb.equivalent_class(target.thesclass, sense.wordclass)
    if match is not None:
        match.reason_code = 'etym'
        match.reason_text = 'Analogy with "%s" in etymology' % base_lemma
    return match
Ejemplo n.º 5
0
 def superordinate_adjective_state(self, sense):
     """
     Classify a sense whose superordinate has the form
     'state of being X-JJ' or 'being X-JJ state'.

     The adjective X is looked up via MAIN_SENSE_FINDER; if its main
     sense is classified, the 'NN' equivalent of that class (as given by
     tdb.equivalent_class) is returned. Returns None in all other cases.
     """
     superordinate = sense.superordinate
     if superordinate is None:
         return None

     # Try the two accepted phrasings in turn
     found = (re.search(r'^state of being ([a-z-]+)-JJ$', superordinate) or
              re.search(r'^being ([a-z-]+)-JJ state$', superordinate))
     if found is None:
         return None

     adjective_sense = MAIN_SENSE_FINDER.main_sense(lemma=found.group(1),
                                                    wordclass='JJ')
     if adjective_sense is None or adjective_sense.thesclass is None:
         return None
     return tdb.equivalent_class(adjective_sense.thesclass, 'NN')
Ejemplo n.º 6
0
def superordinate_lookup(sense, **kwargs):
    """
    Classify a sense from its superordinate, tagging the result with a
    reason.

    The lookup itself is delegated to
    superordinate_manager.superordinate_lookup(sense, **kwargs). For an
    adjective ('JJ') sense the match is converted to its 'JJ' equivalent
    and tagged 'adeq'; for any other wordclass the match is returned as
    found and tagged 'supe'.

    Returns the tagged class, or None if the lookup finds nothing, the
    match has no wordclass, or (for adjectives) there is no 'JJ'
    equivalent.
    """
    match = superordinate_manager.superordinate_lookup(sense, **kwargs)
    if match is None or match.wordclass is None:
        return None

    if sense.wordclass == 'JJ':
        match = tdb.equivalent_class(match, 'JJ')
        # equivalent_class() may find no adjective branch; report that as
        #  "no classification" rather than failing on the attribute set.
        if match is None:
            return None
        match.reason_code = 'adeq'
        match.reason_text = 'Adjective equivalent of "%s"' % sense.superordinate
    else:
        match.reason_code = 'supe'
        match.reason_text = 'Classification of superordinate "%s" ("%s")'\
            % (sense.superordinate, sense.superordinate_full)
    return match
Ejemplo n.º 7
0
def infer_from_neighbouring_wordclass(sense):
    """
    Infer a class from the same lemma in a neighbouring wordclass of the
    same entry.

    Adjectives and verbs look to the noun ('NN'); nouns and adverbs look
    to the adjective ('JJ'). The neighbour's class is converted to the
    sense's own wordclass with tdb.equivalent_class().

    Returns the class tagged with reason_code 'nbor', or None if the
    wordclass has no neighbour defined, no classified neighbour is found,
    or there is no equivalent class.
    """
    neighbours = {'JJ': 'NN', 'VB': 'NN', 'NN': 'JJ', 'RB': 'JJ'}
    opposite_class = neighbours.get(sense.wordclass)
    if opposite_class is None:
        return None

    neighbour = tdb.highest_ranked(lemma=sense.lemma,
                                   refentry=sense.entry_id,
                                   wordclass=opposite_class)
    if neighbour is None or neighbour.thesclass is None:
        return None

    match = tdb.equivalent_class(neighbour.thesclass, sense.wordclass)
    if match is not None:
        match.reason_code = 'nbor'
        match.reason_text = 'Inferred from neighbouring %s branch' % opposite_class
    return match
Ejemplo n.º 8
0
def cf_cross_reference(sense):
    """
    Classify a sense by analogy with the target of its 'cf'-type
    cross-reference.

    If the target sense and the current sense are in the same wordclass
    and their lemmas share the same last two characters, the current
    sense goes in the target sense's own class (similar to how
    'equals'-type xrefs are treated).

    If they share a wordclass but not the ending, the target's wordclass
    parent is used. Otherwise the equivalent class in the current sense's
    wordclass is used.

    Returns the class tagged with reason_code 'cfxr', or None if the
    target is missing, too ambiguous (more than 3 senses), unclassified,
    or has no usable match.
    """
    xr = sense.cf_crossreference()
    # Nb don't specify a wordclass here, since the target wordclass may well
    #  be different from the current sense's wordclass
    target_senses, sense_count = tdb.cross_reference_target(
        lemma=xr.lemma, refentry=xr.refentry, refid=xr.refid)

    # Don't attempt if the target is too ambiguous (too many possible senses)
    if not target_senses or sense_count > 3:
        return None

    # Among the senses rated equal to the first one, take the one with
    #  the largest node
    top_rating = target_senses[0].rating()
    top_rated = [t for t in target_senses if t.rating() == top_rating]
    target = sorted(top_rated, key=lambda t: t.node_size(), reverse=True)[0]
    if target.thesclass is None:
        return None

    same_wordclass = target.wordclass == sense.wordclass
    if same_wordclass and target.lemma[-2:] == sense.lemma[-2:]:
        match = target.thesclass
    elif same_wordclass:
        match = target.wordclass_parent()
    else:
        match = tdb.equivalent_class(target.thesclass, sense.wordclass)

    if match is not None:
        match.reason_text = 'Analogy with target of cf-type xref ("%s")' % target.lemma
        match.reason_code = 'cfxr'
    return match
Ejemplo n.º 9
0
    def find_branch_from_superordinate(self, sense):
        """Classify by finding the main or only sense of the superordinate.

        Only noun ('NN') and adjective ('JJ') senses are handled, and only
        when the superordinate is at least 3 characters long and not in
        GENERICS. A noun sense of the superordinate is chosen by a series
        of progressively weaker strategies; the first to produce a sense
        wins.

        Returns that sense's class (converted to its 'JJ' equivalent for
        an adjective sense), or None if no classified sense is found.
        """
        if sense.wordclass not in ('NN', 'JJ'):
            return None
        word = sense.superordinate
        if not word or len(word) < 3 or word in GENERICS:
            return None

        def best_within(branches):
            # Highest-ranked current noun sense of the superordinate,
            #  restricted to the given branches
            return tdb.highest_ranked(lemma=word,
                                      wordclass='NN',
                                      branches=branches,
                                      current_only=True)

        chosen = None

        # If the superordinate is (more or less) single-sense, we assume
        #  that sense to be the correct one
        candidates = tdb.ranked_search(lemma=word,
                                       wordclass='NN',
                                       current_only=True)
        if candidates and tdb.distinct_senses(candidates) <= 2:
            chosen = candidates[0]

        # Otherwise, narrow by Bayes classification (confidence >= 8 only)
        if chosen is None and sense.bayes.confidence() >= 8:
            chosen = best_within(sense.bayes_based_classifications)

        # Otherwise, narrow by branches based on subject labels
        if chosen is None and sense.label_based_classifications:
            chosen = best_within(sense.label_based_classifications)

        # Otherwise, narrow by branches based on cross-references
        if chosen is None and sense.xref_branches:
            chosen = best_within(sense.xref_branches)

        # Last gasp: if the gloss consists more or less *only* of the
        #   superordinate (e.g. 'an abbey'), then it should be adequate to
        #   just use the main sense of the superordinate, even if it's
        #   multi-sense.
        # But don't risk this if there are cross-references or subject
        #   labels which might suggest a more specific use
        if (chosen is None and
                not sense.subjects and
                not sense.xref_branches and
                sense.gloss is not None):
            bare_gloss = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
            if bare_gloss == word:
                chosen = MAIN_SENSE_FINDER.main_sense(lemma=word,
                                                      wordclass='NN')

        # Otherwise, retry the Bayes classification whenever it is usable
        #  at all (a looser test than the confidence check above)
        if chosen is None and sense.bayes.is_usable():
            chosen = best_within(sense.bayes_based_classifications)

        if chosen is None or chosen.thesclass is None:
            return None
        match = chosen.thesclass
        if sense.wordclass == 'JJ':
            match = tdb.equivalent_class(match, 'JJ')
        return match
def formal_compound_analysis(sense, entry_main_sense):
    """
    Figure out a likely thesaurus class based on the form of a
    two-part compound lemma.

    This is the main way of classifying undefined compounds, and
    can be used to support other methods for defined compounds.

    Arguments:
        sense -- the compound sense being analysed.
        entry_main_sense -- main sense of the entry, used as the main
            sense of the compound's first element (may be None).

    Returns a CompoundAnalysisResult object. Its forced_result is set
    when one of the special-case handlers applies; otherwise its
    best_guesses is filled in by compute_best_guesses(). If the compound
    is not workable, the result of formal_compound_null_result() is
    returned instead.
    """
    # Bug out if this is not a workable compound
    if (sense.last_element() is None or
            sense.last_element() in STOPWORDS or
            sense.first_element() in STOPWORDS or
            sense.wordclass not in WORDCLASSES):
        return formal_compound_null_result()

    # Initialize the CompoundAnalysisResult object which will be returned
    output = CompoundAnalysisResult(lemma=sense.lemma,
                                    refentry=sense.entry_id,
                                    refid=sense.node_id)
    output.is_undefined = sense.is_undefined()

    #=====================================
    # Build the core tables for the result object
    #=====================================

    # Insert consensus of different Bayes evaluations
    if sense.is_possibly_parasynthetic():
        bayes_modes = ('main', 'bias_high',)
    else:
        bayes_modes = ('main', 'bias_low',)
    output.bayes_consensus = compute_bayes_consensus(sense, bayes_modes)[0:10]

    # Insert ranked senses for the second word
    output.ranked_senses = rank_senses_for_last_element(sense)[0:10]

    # Likely thesaurus branches for the first and last elements,
    #  derived from the index of compound elements
    word1_index = retrieve_from_compound_index(sense.first_element(),
                                               'first')
    word2_index = retrieve_from_compound_index(sense.last_element(),
                                               sense.wordclass)

    # An index is only used if it has a count of at least 5; the
    #  first-element index is only used alongside the last-element index.
    if (word1_index is not None and
            word1_index.count >= 5 and
            word2_index is not None and
            word2_index.count >= 5):
        p = word2_index.combined_probabilities(word1_index)
    elif word2_index is not None and word2_index.count >= 5:
        p = word2_index.combined_probabilities(None)
    else:
        p = []
    output.index_consensus = p[0:10]
    if word2_index is not None:
        output.index_count = word2_index.count

    # Establish the main sense of the first and last words
    word1_main_sense = entry_main_sense
    word2_main_sense = MAIN_SENSE_FINDER.main_sense(
        lemma=sense.last_element(),
        wordclass=sense.wordclass
    )

    #=====================================
    # Handlers for special cases
    #=====================================

    # Special handling of 'doctor-like', 'cat-wise', etc. (and nouns in
    #   '-piece', '-part') - we just return the wordclass branch
    #   equivalent to the first element
    if ((sense.last_element() in SIMILATIVE and
            sense.wordclass in ('JJ', 'RB')) or
            (sense.last_element() in ('piece', 'part') and
            sense.wordclass == 'NN')):
        if (word1_main_sense is not None and
                word1_main_sense.thesclass is not None):
            t = tdb.equivalent_class(word1_main_sense.thesclass,
                                     sense.wordclass)
            if t is not None:
                # Prefer the wordclass-level parent; fall back to the
                #  class itself if there is none. (This value must not be
                #  overwritten afterwards - same as the prefix handler
                #  below.)
                output.forced_result = t.wordclass_parent() or t
        return output

    # Special handling of 'pro-...', 'anti-...' and 'un-...' - we just
    #   return the wordclass branch equivalent to the last element
    if (sense.first_element() in ('pro', 'anti', 'un') and
            sense.wordclass in ('JJ', 'NN')):
        if (word2_main_sense is not None and
                word2_main_sense.thesclass is not None):
            t = tdb.equivalent_class(word2_main_sense.thesclass,
                                     sense.wordclass)
            if t is not None:
                output.forced_result = t.wordclass_parent() or t
        return output

    # Special handling of '...-shaped', '...-coloured', etc.
    if sense.last_element() in PARASYN_ENDINGS:
        output.forced_result = PARASYN_ENDINGS[sense.last_element()]
        return output

    # Special handling of cases where the second element is single-sense
    #   (but don't risk this with upper-case forms or possible plurals)
    if (sense.wordclass == 'NN' and
            len(output.ranked_senses) == 1 and
            len(output.ranked_senses[0].classes) == 1 and
            output.ranked_senses[0].classes[0] is not None and
            sense.last_element().islower() and
            not re.search(r'[^s]s$', sense.last_element())):
        output.forced_result = output.ranked_senses[0].classes[0]
        return output

    #=====================================
    # Handler for regular cases (compute best guesses)
    #=====================================
    output.best_guesses = compute_best_guesses(sense, output)

    return output