def is_valid_bigram(bigram, wordclass):
    """
    Report whether a database search for the bigram returns anything.

    Verb bigrams ('VB') will only be phrasal verbs, so they are looked up
    with 'to ' prepended and without a wordclass filter; every other
    wordclass is looked up as given, filtered by that wordclass.

    Returns True if the search result is truthy, False otherwise.
    """
    if wordclass == 'VB':
        hits = tdb.search(lemma='to ' + bigram)
    else:
        hits = tdb.search(lemma=bigram, wordclass=wordclass)
    return bool(hits)
def lemma_as_superordinate(sense):
    """
    Check if the lemma is used anywhere as a superordinate.

    Only nouns ('NN') whose lemma is longer than 8 characters are
    considered. Returns the matching thesaurus class (with its
    ``reason_text`` and ``reason_code`` attributes set), or None when the
    sense is not eligible or nothing matches.
    """
    if sense.wordclass != 'NN' or len(sense.lemma) <= 8:
        return None

    if sense.is_subentry and sense.lemma.lower() != sense.entry_lemma.lower():
        # A subentry whose lemma differs from its entry's lemma is risked
        # without checking the database.
        safe_to_try = True
    elif sense.senses_in_entry == 1:
        # A main sense must be the only sense in its entry (or entry
        # block), and also the only sense recorded in the database.
        hits = tdb.search(lemma=sense.lemma)
        safe_to_try = len(hits) == 1 and hits[0].refid == sense.node_id
    else:
        safe_to_try = False
    if not safe_to_try:
        return None

    # The superordinate lookup key is the lower-cased lemma with hyphens
    # and spaces stripped out.
    lookup_key = sense.lemma.lower().replace('-', '').replace(' ', '')
    record = tdb.get_superordinate_record(lookup_key)
    if record is None:
        return None

    if not sense.bayes.branches():
        # No Bayes branches to compare against: take the record's first
        # branch.
        chosen = record.branches[0].thesclass
    else:
        # Look for commonalities with the Bayes branches: pick the first
        # record branch whose level-2 ancestor is shared with them.
        bayes_ancestors = sense.bayes.ancestors(level=2)
        record_ancestors = {branch.thesclass.ancestor(level=2)
                            for branch in record.branches}
        shared = bayes_ancestors & record_ancestors
        chosen = next(
            (branch.thesclass for branch in record.branches
             if branch.thesclass.ancestor(level=2) in shared),
            None,
        )

    if chosen is None:
        return None
    chosen.reason_text = 'Lemma appears elsewhere as a superordinate'
    chosen.reason_code = 'lass'
    return chosen
def update(self):
    """
    Send (instance, class_id) links to the database, letter by letter.

    For each letter, senses are read from the pickles in
    ``self.input_dir``. Each defined sense is searched for in the database
    by entry ID and node ID; the first hit is paired with the sense's
    ``class_id``. Pairs are passed to ``tdb.add_links`` in batches.
    """
    for letter in letters:
        loader = PickleLoader(self.input_dir, letters=letter)
        pending = []
        for sense in loader.iterate():
            if sense.definition is None:
                # Don't bother with undefined lemmas.
                continue
            hits = tdb.search(refentry=sense.entry_id, refid=sense.node_id)
            try:
                first_hit = hits[0]
            except IndexError:
                # Nothing in the database for this sense.
                continue
            pending.append((first_hit, sense.class_id))
            # Flush once the batch grows past 1000 pairs.
            if len(pending) > 1000:
                tdb.add_links(pending)
                pending = []
        # Flush whatever is left over for this letter.
        tdb.add_links(pending)
def match_single_synonym(sense):
    """
    Try to pick a thesaurus class for the sense from one of its synonyms.

    Several strategies are tried in turn (subject-restricted search,
    interjection search, single-sense synonyms, Bayes-restricted search);
    each one only runs if no match has been found so far.

    Returns a ``(thesclass, lemma)`` pair taken from the matching database
    result, or ``(None, None)`` if nothing matched.
    """
    # Drop out any highly polysemous synonyms (20 or more distinct senses).
    usable = [
        syn for syn in sense.synonyms
        if tdb.distinct_senses(
            tdb.search(lemma=syn,
                       wordclass=sense.wordclass,
                       current_only=True)) < 20
    ]
    if not usable:
        return None, None

    found = None
    found_via = None

    # If the sense can be restricted by subject area, try to find a match
    # for *any* synonym (even if there's only one).
    if sense.subjects:
        ranked = [
            hit
            for syn in usable
            for hit in tdb.ranked_search(lemma=syn,
                                         wordclass=sense.wordclass,
                                         subjects=sense.subjects,
                                         current_only=True)
        ]
        if ranked and ranked[0].thesclass is not None:
            found, found_via = ranked[0].thesclass, ranked[0].lemma

    # If the sense is an interjection, try to find a match for *any*
    # synonym (even if there's only one), since interjection synonyms are
    # more reliable and less ambiguous.
    if not found and sense.wordclass == 'UH':
        ranked = [
            hit
            for syn in usable
            for hit in tdb.ranked_search(lemma=syn,
                                         wordclass='UH',
                                         current_only=True)
        ]
        if ranked and ranked[0].thesclass is not None:
            found, found_via = ranked[0].thesclass, ranked[0].lemma

    # If any of the synonyms are single-sense (or nearly single-sense),
    # assume that that is the correct sense.
    if not found:
        lone_synonym = len(usable) == 1
        shortlist = []
        for syn in usable:
            syn_senses = tdb.ranked_search(lemma=syn,
                                           wordclass=sense.wordclass,
                                           current_only=True)
            if not syn_senses:
                continue
            # Accept exactly one distinct sense, or up to three when this
            # is the only synonym available.
            if (tdb.distinct_senses(syn_senses) == 1 or
                    (tdb.distinct_senses(syn_senses) <= 3 and lone_synonym)):
                shortlist.append(syn_senses[0])
        best = next((c for c in shortlist if c.thesclass is not None), None)
        if best is not None:
            found, found_via = best.thesclass, best.lemma

    # If the sense can be restricted by Bayes classification(s), try to
    # find a match for *any* synonym (even if there's only one).
    if not found and sense.bayes.is_usable():
        ranked = [
            hit
            for syn in usable
            for hit in tdb.ranked_search(lemma=syn,
                                         wordclass=sense.wordclass,
                                         branches=sense.bayes.ids(),
                                         current_only=True)
        ]
        if ranked and ranked[0].thesclass is not None:
            found, found_via = ranked[0].thesclass, ranked[0].lemma

    return found, found_via