Example #1
0
def lemma_as_superordinate(sense):
    """
    Check if the lemma is used anywhere as a superordinate.

    Only senses with wordclass 'NN' and a lemma longer than 8 characters
    are considered, and only when one of the following holds:

    * the sense is a subentry whose lemma differs (ignoring case) from
      the entry lemma -- no database check is made in this case; or
    * the sense is the only sense in its entry and ``tdb.search`` returns
      exactly one instance for the lemma, whose ``refid`` equals
      ``sense.node_id``.

    The lemma is then lower-cased, stripped of hyphens and spaces, and
    looked up with ``tdb.get_superordinate_record``.  If the sense has
    Bayes branches, the first record branch whose level-2 ancestor is also
    one of the sense's level-2 Bayes ancestors is chosen; otherwise the
    first branch of the record is chosen.

    Returns the chosen branch's ``thesclass`` or None.  Note that the
    returned object is modified in place: ``reason_text`` and
    ``reason_code`` are set on it before it is returned.
    """
    if sense.wordclass != 'NN' or len(sense.lemma) <= 8:
        return None

    # If this is a subentry, then we can risk it without checking
    #   the database
    if (sense.is_subentry and
            sense.lemma.lower() != sense.entry_lemma.lower()):
        riskable = True
    # ...But if it's a main sense, we check that (a) it's the only sense
    #  in its entry (or entry block), and then confirm that it's the only
    #  sense recorded in the database
    elif sense.senses_in_entry == 1:
        instances = tdb.search(lemma=sense.lemma)
        riskable = (len(instances) == 1 and
                    instances[0].refid == sense.node_id)
    else:
        riskable = False

    if not riskable:
        return None

    qterm = sense.lemma.lower().replace('-', '').replace(' ', '')
    record = tdb.get_superordinate_record(qterm)
    # A record without branches has nothing to offer; bail out rather than
    #  indexing into an empty sequence below.
    if record is None or not record.branches:
        return None

    if sense.bayes.branches():
        # Look for commonalities with Bayes branches: take the first record
        #  branch whose level-2 ancestor is also a level-2 Bayes ancestor.
        bayes_ancestors = set(sense.bayes.ancestors(level=2))
        match = next((b.thesclass for b in record.branches
                      if b.thesclass.ancestor(level=2) in bayes_ancestors),
                     None)
    else:
        match = record.branches[0].thesclass

    if match is None:
        return None

    match.reason_text = 'Lemma appears elsewhere as a superordinate'
    match.reason_code = 'lass'
    return match
    def superordinate_lookup(self, sense, panic=False):
        """
        Classify a sense from the branches recorded against its
        superordinate(s).

        The full superordinate and, when different, the short superordinate
        are each stripped of hyphens and spaces and looked up with
        ``tdb.get_superordinate_record``; the branches of every record found
        are pooled, keeping one branch per thesaurus class id.

        The pool is then narrowed:

        * with ``panic`` set, to branches whose probability exceeds 0.4;
        * otherwise, to branches whose class ancestor ids overlap the
          sense's cross-reference branches; failing that (and only when the
          Bayes confidence is at least 4), to those overlapping the Bayes
          ids, and failing that, the level-3 ancestors of the Bayes
          branches.

        From what remains, the highest-probability branch is taken,
        preferring branches whose class has a wordclass set.  If the sense
        has a last element, ``tdb.highest_ranked`` is asked for a more
        specific class within the winner.

        Returns the winning thesaurus class, or None if nothing survives.
        """
        # Long superordinate first, then the short one if it differs.
        lookups = [sense.superordinate_full]
        if sense.superordinate != sense.superordinate_full:
            lookups.append(sense.superordinate)

        # Pool the branches of every matching record, one per class id.
        pool = []
        known_ids = set()
        for raw in lookups:
            if raw is None:
                continue
            key = raw.replace('-', '').replace(' ', '')
            record = tdb.get_superordinate_record(key)
            if record is None:
                continue
            for branch in record.branches:
                class_id = branch.thesclass.id
                if class_id not in known_ids:
                    known_ids.add(class_id)
                    pool.append(branch)

        if not pool:
            return None

        def overlapping(id_set):
            # Pooled branches sharing at least one ancestor id with id_set.
            return [b for b in pool
                    if not b.thesclass.ancestor_ids().isdisjoint(id_set)]

        if panic:
            shortlist = [b for b in pool if b.probability > 0.4]
        else:
            shortlist = overlapping(set(sense.xref_branches))

            if not shortlist and sense.bayes.confidence() >= 4:
                shortlist = overlapping(set(sense.bayes.ids()))

            if not shortlist and sense.bayes.confidence() >= 4:
                # Last resort: compare against the level-3 parents of the
                #  Bayes classifications rather than the classifications
                #  themselves.
                level3_ids = set()
                for bayes_class in sense.bayes.branches():
                    parent = bayes_class.ancestor(level=3)
                    if parent is not None:
                        level3_ids.add(parent.id)
                shortlist = overlapping(level3_ids)

        if not shortlist:
            return None

        # Prefer branches below wordclass level; fall back to the whole
        #  shortlist if there are none.
        below_wordclass = [b for b in shortlist
                           if b.thesclass.wordclass is not None]
        ranked = sorted(below_wordclass or shortlist,
                        key=lambda b: b.probability,
                        reverse=True)
        winner = ranked[0].thesclass

        # For a compound, try to narrow the result by locating the last
        #  element inside the winning branch.  (Rarely fruitful: the
        #  compound classifiers should already have caught most of these.)
        tail = sense.last_element()
        if tail is not None:
            refined = tdb.highest_ranked(lemma=tail,
                                         wordclass=sense.wordclass,
                                         branches=[winner.id])
            if refined is not None and refined.thesclass is not None:
                winner = refined.thesclass

        return winner