def compound_derivative(sense):
    """Try to classify a compound as a derivative of an already-classified base.

    Each ``(ending, wordclasses, replacement)`` entry in ``derivation_forms``
    is tried in order. When the sense's wordclass is listed and its last
    element carries the ending (with more than two characters to spare), the
    ending is swapped for the replacement on the lemma and the resulting
    base form is looked up in ``tdb``.

    Returns:
        ``(thesclass, position)`` for the first ending whose base form has
        exactly one distinct sense with a thesaurus class; ``position`` is
        ``'first'`` or ``'last'``. ``(None, None)`` when the sense has no
        last element or no ending produces such a base.
    """
    if sense.last_element() is None:
        return None, None

    for ending, wordclasses, replacement in derivation_forms:
        applicable = (
            sense.wordclass in wordclasses
            and sense.last_element().endswith(ending)
            and len(sense.last_element()) > len(ending) + 2
        )
        if not applicable:
            continue

        # Build what the base would look like if the lemma really is a
        # derivative: cut the ending off the lemma and append the
        # replacement (usually an empty string).
        stem_length = len(sense.lemma) - len(ending)
        hypothetical_base = sense.lemma[0:stem_length] + replacement

        # Only trust the base form if it exists, is unambiguous, and has
        # actually been classified.
        base_hits = tdb.ranked_search(lemma=hypothetical_base,
                                      current_only=True)
        if tdb.distinct_senses(base_hits) != 1:
            continue
        if base_hits[0].thesclass is None:
            continue

        # Forms like 'yellow-bellied' are more likely transparent than
        # derived from e.g. 'yellow-belly', so colour-initial -ed/-ied
        # compounds are ranked after any existing guesses.
        if (ending in ('ed', 'ied') and
                sense.first_element().lower() in colours):
            position = 'last'
        else:
            position = 'first'
        return base_hits[0].thesclass, position

    return None, None
def _parse_instances(instances):
    """Return the level-3 branches suggested by a list of instances.

    The instances are only considered when the list is non-empty and
    ``tdb.distinct_senses`` reports at most two senses for it. In that case
    the instances sharing the first instance's ``refid`` and carrying a
    thesaurus class each contribute their level-3 ancestor (when they have
    one).

    Returns:
        A set of branches. It is empty when the instances are not usable,
        when more than three branches emerge, or when every branch is
        rejected by ``_is_useless``.
    """
    branches = set()

    if instances and tdb.distinct_senses(instances) <= 2:
        # Keep only the classified instances belonging to the first sense.
        first_refid = instances[0].refid
        classified = [
            inst for inst in instances
            if inst.refid == first_refid and inst.thesclass is not None
        ]
        # Collect the level-3 ancestors covering those instances.
        ancestors = [inst.thesclass.ancestor(level=3) for inst in classified]
        branches = {node for node in ancestors if node is not None}

    # More than three branches suggests the underlying sense is too vague
    # to be relied on, so discard the lot.
    if len(branches) > 3:
        return set()

    # Abstract-properties and relative-properties branches are not very
    # useful; _is_useless filters them out.
    return {node for node in branches if not _is_useless(node)}
def match_single_synonym(sense):
    """Classify a sense from the thesaurus class of one of its synonyms.

    Synonyms with 20 or more distinct senses (same wordclass, current only)
    are discarded first. The remaining synonyms are then tried through a
    cascade of strategies, each one attempted only while no match has been
    found:

    1. restrict by the sense's subjects, if it has any;
    2. for interjections (wordclass ``'UH'``), accept any synonym;
    3. accept a synonym that is single-sense, or has at most three senses
       when it is the only synonym left;
    4. restrict by the sense's Bayes branches, if the Bayes result is usable.

    Returns:
        ``(thesclass, lemma_of_matching_synonym)``, or ``(None, None)`` when
        no synonym survives the polysemy filter. If synonyms survive but no
        strategy succeeds, both values remain ``None``.
    """
    # Drop out any highly polysemous synonyms.
    synonyms = [
        syn for syn in sense.synonyms
        if tdb.distinct_senses(
            tdb.search(lemma=syn,
                       wordclass=sense.wordclass,
                       current_only=True)) < 20
    ]
    if not synonyms:
        return None, None

    match = None
    matching_synonym = None

    # Strategy 1: the subject area narrows things enough that a match for
    # *any* synonym is acceptable, even if there is only one.
    if sense.subjects:
        hits = [
            hit for syn in synonyms
            for hit in tdb.ranked_search(lemma=syn,
                                         wordclass=sense.wordclass,
                                         subjects=sense.subjects,
                                         current_only=True)
        ]
        if hits and hits[0].thesclass is not None:
            match = hits[0].thesclass
            matching_synonym = hits[0].lemma

    # Strategy 2: interjection synonyms are more reliable and less
    # ambiguous, so again any synonym will do.
    if not match and sense.wordclass == 'UH':
        hits = [
            hit for syn in synonyms
            for hit in tdb.ranked_search(lemma=syn,
                                         wordclass='UH',
                                         current_only=True)
        ]
        if hits and hits[0].thesclass is not None:
            match = hits[0].thesclass
            matching_synonym = hits[0].lemma

    # Strategy 3: a synonym that is single-sense (or nearly so, when it is
    # the only synonym) is assumed to be used in that sense.
    if not match:
        shortlisted = []
        for syn in synonyms:
            ranked = tdb.ranked_search(lemma=syn,
                                       wordclass=sense.wordclass,
                                       current_only=True)
            if not ranked:
                continue
            if (tdb.distinct_senses(ranked) == 1 or
                    (tdb.distinct_senses(ranked) <= 3 and
                     len(synonyms) == 1)):
                shortlisted.append(ranked[0])

        # Take the first shortlisted sense that has been classified.
        classified = next(
            (cand for cand in shortlisted if cand.thesclass is not None),
            None)
        if classified is not None:
            match = classified.thesclass
            matching_synonym = classified.lemma

    # Strategy 4: restrict by the Bayes classification(s); any synonym is
    # acceptable here too.
    if not match and sense.bayes.is_usable():
        hits = [
            hit for syn in synonyms
            for hit in tdb.ranked_search(lemma=syn,
                                         wordclass=sense.wordclass,
                                         branches=sense.bayes.ids(),
                                         current_only=True)
        ]
        if hits and hits[0].thesclass is not None:
            match = hits[0].thesclass
            matching_synonym = hits[0].lemma

    return match, matching_synonym
def find_branch_from_superordinate(self, sense):
    """Classify a sense via the main or only sense of its superordinate.

    Only nouns ('NN') and adjectives ('JJ') are handled, and only when the
    superordinate is present, at least three characters long and not listed
    in ``GENERICS``. The superordinate is always looked up as a noun; for an
    adjective sense the resulting class is mapped through
    ``tdb.equivalent_class(match, 'JJ')``.

    Returns:
        The thesaurus class found (or its 'JJ' equivalent), or ``None`` if
        the sense is not eligible, no target sense is found, or the target
        sense has no thesaurus class.
    """
    if sense.wordclass not in ('NN', 'JJ'):
        return None
    if not sense.superordinate or len(sense.superordinate) < 3:
        return None
    if sense.superordinate in GENERICS:
        return None

    superordinate = sense.superordinate

    def narrowed_by(branches):
        # Highest-ranked current noun sense of the superordinate within
        # the given branches.
        return tdb.highest_ranked(lemma=superordinate,
                                  wordclass='NN',
                                  branches=branches,
                                  current_only=True)

    target = None

    # A superordinate that is (more or less) single-sense is taken at face
    # value.
    ranked = tdb.ranked_search(lemma=superordinate,
                               wordclass='NN',
                               current_only=True)
    if ranked and tdb.distinct_senses(ranked) <= 2:
        target = ranked[0]

    # Otherwise narrow by a confident Bayes classification...
    if target is None and sense.bayes.confidence() >= 8:
        target = narrowed_by(sense.bayes_based_classifications)

    # ...then by branches derived from subject labels...
    if target is None and sense.label_based_classifications:
        target = narrowed_by(sense.label_based_classifications)

    # ...then by branches derived from cross-references.
    if target is None and sense.xref_branches:
        target = narrowed_by(sense.xref_branches)

    # Last gasp: when the gloss is more or less *only* the superordinate
    # (e.g. 'an abbey'), the superordinate's main sense should be adequate
    # even if it is multi-sense. Don't risk this if there are subject labels
    # or cross-references that might point to a more specific use.
    if (target is None and
            not sense.subjects and
            not sense.xref_branches and
            sense.gloss is not None):
        bare_gloss = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
        if bare_gloss == superordinate:
            target = MAIN_SENSE_FINDER.main_sense(lemma=superordinate,
                                                  wordclass='NN')

    # Finally, fall back to any usable Bayes classification.
    if target is None and sense.bayes.is_usable():
        target = narrowed_by(sense.bayes_based_classifications)

    if target is None or target.thesclass is None:
        return None

    match = target.thesclass
    if sense.wordclass == 'JJ':
        match = tdb.equivalent_class(match, 'JJ')
    return match