def is_valid_bigram(bigram, wordclass):
    """
    Return True if a search of the thesaurus database for the bigram
    yields a truthy result, False otherwise.

    For wordclass 'VB' the lemma is searched as 'to ' + bigram with no
    wordclass filter; for any other wordclass the bigram is searched
    as-is, restricted to that wordclass.
    """
    if wordclass == 'VB':
        # Verb bigrams will only be phrasal verbs, so they need to have
        # 'to' prepended
        query = {'lemma': 'to ' + bigram}
    else:
        query = {'lemma': bigram, 'wordclass': wordclass}
    return bool(tdb.search(**query))
Example #2
0
def lemma_as_superordinate(sense):
    """
    Look for a thesaurus class in which this sense's lemma is recorded
    as a superordinate.

    Only nouns ('NN') whose lemma is longer than 8 characters are tried,
    and only when the lookup is judged low-risk:
     - a subentry whose lemma differs (case-insensitively) from the
       entry lemma is tried without consulting the database;
     - otherwise the sense must be the only sense in its entry, and the
       database must hold exactly one instance of the lemma, whose
       refid equals this sense's node_id.

    Returns the matching thesaurus class, with reason_text and
    reason_code set on it, or None if nothing is found.
    """
    if not (sense.wordclass == 'NN' and len(sense.lemma) > 8):
        return None

    # Decide whether the lookup is worth risking. The lemma length has
    #  already been checked above, so it is not tested again here.
    if (sense.is_subentry and
            sense.lemma.lower() != sense.entry_lemma.lower()):
        low_risk = True
    elif sense.senses_in_entry == 1:
        hits = tdb.search(lemma=sense.lemma)
        low_risk = len(hits) == 1 and hits[0].refid == sense.node_id
    else:
        low_risk = False
    if not low_risk:
        return None

    # The superordinate record is keyed on the lower-cased lemma with
    #  hyphens and spaces stripped out
    key = sense.lemma.lower().replace('-', '').replace(' ', '')
    record = tdb.get_superordinate_record(key)
    if record is None:
        return None

    if sense.bayes.branches():
        # Pick the first branch of the record whose level-2 ancestor is
        #  also a level-2 ancestor of the Bayes branches
        bayes_level2 = sense.bayes.ancestors(level=2)
        record_level2 = {branch.thesclass.ancestor(level=2)
                         for branch in record.branches}
        shared = set.intersection(bayes_level2, record_level2)
        chosen = next((branch.thesclass for branch in record.branches
                       if branch.thesclass.ancestor(level=2) in shared),
                      None)
    else:
        # Nothing to compare against, so take the record's first branch
        chosen = record.branches[0].thesclass

    if chosen is None:
        return None
    chosen.reason_text = 'Lemma appears elsewhere as a superordinate'
    chosen.reason_code = 'lass'
    return chosen
Example #3
0
 def update(self):
     """
     Link database instances to the class_id of their pickled senses.

     For each letter in `letters`, senses are loaded with PickleLoader
     from self.input_dir. Each sense with a definition is looked up in
     the database by entry_id and node_id; if at least one instance is
     returned, the first one is paired with the sense's class_id. Pairs
     are passed to tdb.add_links() whenever more than 1000 have
     accumulated, and once more at the end of each letter.
     """
     for letter in letters:
         pending = []
         loader = PickleLoader(self.input_dir, letters=letter)
         for sense in loader.iterate():
             # don't bother with undefined lemmas
             if sense.definition is None:
                 continue

             hits = tdb.search(refentry=sense.entry_id,
                               refid=sense.node_id)
             try:
                 first_hit = hits[0]
             except IndexError:
                 # search came back empty, so there is nothing to link
                 continue

             pending.append((first_hit, sense.class_id))
             if len(pending) > 1000:
                 tdb.add_links(pending)
                 pending = []

         # Flush whatever is left over for this letter
         tdb.add_links(pending)
def match_single_synonym(sense):
    """
    Try to classify a sense from one of its synonyms.

    Synonyms for which tdb.distinct_senses() reports 20 or more are
    discarded first. The remaining synonyms are then run through a
    series of strategies, each attempted only while no match has been
    found:
     1. ranked search restricted by the sense's subjects (if it has any);
     2. ranked search for interjections (wordclass 'UH');
     3. synonyms with exactly one distinct sense (or up to three, when
        only one synonym is left);
     4. ranked search restricted by the Bayes branches (if usable).

    Returns a (thesaurus class, synonym lemma) tuple; both are None if
    no synonym survives the filter or no strategy finds a match.
    """
    # Drop out any highly polysemous synonyms
    synonyms = [
        syn for syn in sense.synonyms
        if tdb.distinct_senses(tdb.search(lemma=syn,
                                          wordclass=sense.wordclass,
                                          current_only=True)) < 20
    ]
    if not synonyms:
        return None, None

    def top_ranked(**filters):
        # Pool the ranked results for every synonym (in synonym order) and
        #  accept only the very first result, provided it has a thesaurus
        #  class. Returns (thesclass, lemma), or None if there is no hit.
        pool = []
        for syn in synonyms:
            pool.extend(tdb.ranked_search(lemma=syn,
                                          current_only=True,
                                          **filters))
        if pool and pool[0].thesclass is not None:
            return pool[0].thesclass, pool[0].lemma
        return None

    match = None
    matching_synonym = None

    # If the sense can be restricted by subject area, try to find a match
    #  for *any* synonym (even if there's only one)
    if sense.subjects:
        hit = top_ranked(wordclass=sense.wordclass,
                         subjects=sense.subjects)
        if hit is not None:
            match, matching_synonym = hit

    # Interjection synonyms are more reliable and less ambiguous, so for
    #  an interjection any synonym will do (even if there's only one)
    if not match and sense.wordclass == 'UH':
        hit = top_ranked(wordclass='UH')
        if hit is not None:
            match, matching_synonym = hit

    # If any of the synonyms are single-sense (or nearly single-sense),
    #  then we assume that that is the correct sense
    if not match:
        near_unique = []
        for syn in synonyms:
            ranked = tdb.ranked_search(lemma=syn,
                                       wordclass=sense.wordclass,
                                       current_only=True)
            if not ranked:
                continue
            sense_count = tdb.distinct_senses(ranked)
            if (sense_count == 1 or
                    (sense_count <= 3 and len(synonyms) == 1)):
                near_unique.append(ranked[0])
        chosen = next((c for c in near_unique if c.thesclass is not None),
                      None)
        if chosen is not None:
            match = chosen.thesclass
            matching_synonym = chosen.lemma

    # If the sense can be restricted by Bayes classification(s), try to
    #  find a match for *any* synonym (even if there's only one)
    if not match and sense.bayes.is_usable():
        hit = top_ranked(wordclass=sense.wordclass,
                         branches=sense.bayes.ids())
        if hit is not None:
            match, matching_synonym = hit

    return match, matching_synonym