def infer_derivative(sense, main_sense_of_entry):
    """
    Infer a thesaurus class for a derivative sense from its neighbours.

    Two precedents are tried in order:

     1. the class of the main sense of the parent entry, converted to the
        derivative's wordclass with tdb.equivalent_class();
     2. failing that, the superclass shared by the largest number of
        thesaurus-linked sibling derivatives in the same entry (ties are
        broken in favour of the parent with the largest branch_size),
        narrowed to the derivative's wordclass where
        tdb.child_wordclass_branch() finds a matching child branch.

    Arguments:
     * sense: the derivative being classified (its wordclass and entry_id
       are read);
     * main_sense_of_entry: the main sense of the parent entry, or None.

    Returns the inferred class, with reason_text and reason_code ('driv')
    set on it, or None if neither precedent yields a class.
    """
    equiv = None
    if (main_sense_of_entry is not None and
            not main_sense_of_entry.is_affix() and
            main_sense_of_entry.thesclass is not None):
        # Take precedent from the main sense of the parent entry
        equiv = tdb.equivalent_class(main_sense_of_entry.thesclass,
                                     sense.wordclass)
        if equiv is not None:
            equiv.reason_text = 'Parallel to "%s"' % main_sense_of_entry.lemma
            equiv.reason_code = 'driv'
    if equiv is not None:
        return equiv

    # Take precedent from other sibling derivatives.
    # NOTE: the keyword is spelled 'current_only', as in every other search
    # call in this code; the earlier 'currentOnly' spelling did not match.
    candidates = tdb.ranked_search(refentry=sense.entry_id,
                                   thes_linked=True,
                                   current_only=True)
    candidates = [c for c in candidates
                  if c.is_derivative() and
                  ' ' not in c.lemma and
                  c.superclass() is not None]
    if not candidates:
        return None

    # Count how many siblings sit under each superclass
    tally = defaultdict(int)
    for c in candidates:
        tally[c.superclass()] += 1
    top_count = max(tally.values())
    parents = [p for p, num in tally.items() if num == top_count]

    # If there's more than one possible parent, pick the one with
    # the largest branch
    parents.sort(key=lambda p: p.branch_size, reverse=True)

    # Prefer the child branch for this wordclass; otherwise fall back to
    # the parent itself (never None, given the filter above)
    equiv = tdb.child_wordclass_branch(parents[0], sense.wordclass)
    if equiv is None:
        equiv = parents[0]
    equiv.reason_text = 'Parallel to %s' % ', '.join(
        ['"%s"' % (c.lemma,) for c in candidates if c.lemma])
    equiv.reason_code = 'driv'
    return equiv
def branches_from_xrefs(self, sense):
    """
    Return the IDs of the branches reached through a sense's
    cross-references.

    Each cross-reference target is looked up with tdb.ranked_search(); the
    results are passed to _parse_instances() and the sets it returns are
    unioned together. The IDs come back in set-iteration order, so the
    ordering of the list carries no meaning.
    """
    collected = set()
    for xref in sense.cross_references:
        targets = tdb.ranked_search(refentry=xref.refentry,
                                    refid=xref.refid)
        collected = collected | _parse_instances(targets)
    return [branch.id for branch in collected]
def finalize():
    """
    Use the 'manual' file to override where necessary

    For each wordclass in WORDCLASSES, the lemma -> class ID mappings in
    '<wordclass>_compounds.csv' and '<wordclass>_manual.csv' (both in
    DIRECTORY) are merged, with rows from the manual file overriding the
    automatically-generated ones. One sense is then chosen per lemma and
    written, sorted by lemma, to '<wordclass>.csv' as rows of
    (lemma, refentry, refid, entry_size, breadcrumb).

    Column 0 of each input row is the lemma and column 1 an integer
    class ID; completely blank rows are skipped.
    """
    for wordclass in WORDCLASSES:
        lemmas = {}
        # Do the manual file second, so that it overrides the
        # automatically-generated file
        for suffix in ('compounds', 'manual'):
            infile = os.path.join(DIRECTORY,
                                  '%s_%s.csv' % (wordclass, suffix))
            # newline='' lets the csv module do its own newline handling
            with open(infile, 'r', newline='') as filehandle:
                for row in csv.reader(filehandle):
                    if not row:
                        # Blank line (e.g. trailing newline in a
                        # hand-edited file)
                        continue
                    lemmas[row[0]] = int(row[1])

        output = []
        for lemma, class_id in lemmas.items():
            # Retrieve the branch that the majority of compounds are on
            compound_branch = tdb.get_thesclass(class_id)

            # Get the highest-rated senses for the lemma: those rated
            # above 30% of the top rating (none at all if the top rating
            # is not positive)
            ranked_senses = tdb.ranked_search(lemma=lemma,
                                              wordclass=wordclass)
            if ranked_senses:
                max_rating = ranked_senses[0].rating()
                if max_rating > 0:
                    threshold = max_rating * 0.3
                    ranked_senses = [s for s in ranked_senses
                                     if s.rating() > threshold]
                else:
                    ranked_senses = []

            # Try filtering to just those senses that are on
            # the same branch as the compounds (the top-ranked sense is
            # also kept if it has no classification)
            ranked_filtered = [
                s for i, s in enumerate(ranked_senses)
                if (i == 0 and s.thesclass is None) or
                s.is_descendant_of(compound_branch)]
            # ... or else stick with original ranking
            if not ranked_filtered:
                ranked_filtered = ranked_senses
            if ranked_filtered:
                output.append(ranked_filtered[0])

        outfile = os.path.join(DIRECTORY, '%s.csv' % wordclass)
        output.sort(key=lambda s: s.lemma)
        # newline='' stops the platform's newline translation from adding
        # an extra '\r' to the row terminators the csv writer emits
        with open(outfile, 'w', newline='') as filehandle:
            csvwriter = csv.writer(filehandle)
            csvwriter.writerows(
                (s.lemma, s.refentry, s.refid, s.entry_size, s.breadcrumb())
                for s in output)
def compound_derivative(sense):
    """
    Classify a compound by treating it as a derivative of a base lemma.

    For each (ending, wordclasses, replacement) in derivation_forms that
    fits the sense's wordclass and last element, a hypothetical base form
    is built from the lemma and looked up. The first base that has exactly
    one distinct sense, and whose top-ranked result is classified, supplies
    the class.

    Returns a (thesclass, position) tuple, where position is 'first' or
    'last', or (None, None) if the sense has no last element or no
    suitable base is found.
    """
    tail = sense.last_element()
    if tail is None:
        return None, None

    for ending, wordclasses, replacement in derivation_forms:
        if sense.wordclass not in wordclasses:
            continue
        # The ending must leave more than two characters of stem
        if not tail.endswith(ending) or len(tail) <= len(ending) + 2:
            continue

        # Figure out what the base form would look like, if the lemma
        # *is* a derivative: strip off the ending, then add the
        # replacement ending (which is usually a null string).
        stem = sense.lemma[:len(sense.lemma) - len(ending)]
        hypothetical_base = stem + replacement

        # Test if the hypothetical base form exists, and if so
        # find out how it is classified
        base_hits = tdb.ranked_search(lemma=hypothetical_base,
                                      current_only=True)
        if tdb.distinct_senses(base_hits) != 1:
            continue
        if base_hits[0].thesclass is None:
            continue

        # Don't risk things like 'yellow-bellied' - these are likely to
        # be transparent, not a derivative of e.g. 'yellow-belly', so
        # should go after existing guesses
        if (ending in ('ed', 'ied') and
                sense.first_element().lower() in colours):
            return base_hits[0].thesclass, 'last'
        return base_hits[0].thesclass, 'first'

    return None, None
def ranked_sense_summary(**kwargs):
    """
    Return a summary of the ranked_search() results, reduced to a given
    thesaurus level (defaults to level=3)

    Returns a ranked list of result objects (ResultRow), one per distinct
    ancestor class at the requested level, sorted by descending
    probability. Senses with no classification, or with no ancestor at
    that level, are grouped together in a single row whose parent is None.

    Each row is built from:
     * the ancestor class at the requested level (passed to ResultRow);
     * the senses mapping to that ancestor (appended in ranked order);
     * a probability, set by row.set_probability(total), where total is
       the sum of rating() over all candidates (1 if that sum is not
       positive).

    Keyword arguments:
     * level (int): the parent thesaurus level which will be returned
     * omit_null (True/False): if True, any senses which don't have
            a classification will be ignored

    ... plus all the usual optional keyword arguments passed on to
    tdb.ranked_search() (lemma, wordclass, refentry, refid, etc.). Note
    that 'level' and 'omit_null' are forwarded along with the rest.

    If a lemma is given, its main sense (as found by main_sense_finder)
    is passed to the search as the 'promote' argument.
    """
    level = kwargs.get('level', 3)
    lemma = kwargs.get('lemma')
    wordclass = kwargs.get('wordclass')
    omit_null = kwargs.get('omit_null', False)

    if lemma is not None:
        main_sense = main_sense_finder.main_sense(lemma=lemma,
                                                  wordclass=wordclass,
                                                  listed_only=True)
        if main_sense is not None:
            kwargs['promote'] = main_sense.refid

    candidates = tdb.ranked_search(**kwargs)
    if omit_null:
        candidates = [c for c in candidates if c.thesclass is not None]

    # Give each thesclass a probability value (as a ratio of the sum
    # of all senses' ratings); guard against a zero or negative sum
    total = sum(c.rating() for c in candidates)
    if total <= 0:
        total = 1

    # Group the candidates by their ancestor at the requested level;
    # identifier 0 collects everything without such an ancestor
    rows = {}
    for c in candidates:
        ancestor = None
        if c.thesclass is not None:
            ancestor = c.thesclass.ancestor(level=level)
        identifier = 0 if ancestor is None else ancestor.id
        if identifier not in rows:
            rows[identifier] = ResultRow(ancestor)
        rows[identifier].append(c)

    # Convert to a list and add a probability score to each row
    summary = list(rows.values())
    for row in summary:
        row.set_probability(total)

    # Sort by probability
    summary.sort(key=lambda r: r.probability, reverse=True)
    return summary
def match_single_synonym(sense):
    """
    Try to classify a sense from the classification of one of its synonyms.

    Synonyms with 20 or more distinct senses are discarded first. The
    remaining synonyms are then tried with four strategies, in order,
    stopping at the first that yields a class:

     1. search restricted by the sense's subjects;
     2. for interjections ('UH'), an unrestricted search;
     3. synonyms that are single-sense (or, if there is only one synonym,
        have at most three distinct senses);
     4. search restricted to the sense's Bayes branches, if usable.

    Returns a (thesclass, lemma) tuple taken from the matching synonym
    sense, or (None, None) if nothing matched.
    """
    # Drop out any highly polysemous synonyms
    synonyms = [
        syn for syn in sense.synonyms
        if tdb.distinct_senses(tdb.search(lemma=syn,
                                          wordclass=sense.wordclass,
                                          current_only=True)) < 20]
    if not synonyms:
        return None, None

    match = None
    matching_synonym = None

    # If the sense can be restricted by subject area, try to find a match
    # for *any* synonym (even if there's only one)
    if sense.subjects:
        pooled = [hit for syn in synonyms
                  for hit in tdb.ranked_search(lemma=syn,
                                               wordclass=sense.wordclass,
                                               subjects=sense.subjects,
                                               current_only=True)]
        if pooled and pooled[0].thesclass is not None:
            match = pooled[0].thesclass
            matching_synonym = pooled[0].lemma

    # If the sense is an interjection, try to find a match for *any*
    # synonym (even if there's only one) - since interjection synonyms
    # are more reliable and less ambiguous
    if not match and sense.wordclass == 'UH':
        pooled = [hit for syn in synonyms
                  for hit in tdb.ranked_search(lemma=syn,
                                               wordclass='UH',
                                               current_only=True)]
        if pooled and pooled[0].thesclass is not None:
            match = pooled[0].thesclass
            matching_synonym = pooled[0].lemma

    # If any of the synonyms are single-sense (or nearly single-sense),
    # then we assume that that is the correct sense
    if not match:
        lone_synonym = len(synonyms) == 1
        top_senses = []
        for syn in synonyms:
            syn_senses = tdb.ranked_search(lemma=syn,
                                           wordclass=sense.wordclass,
                                           current_only=True)
            if not syn_senses:
                continue
            distinct = tdb.distinct_senses(syn_senses)
            if distinct == 1 or (distinct <= 3 and lone_synonym):
                top_senses.append(syn_senses[0])
        chosen = next((c for c in top_senses if c.thesclass is not None),
                      None)
        if chosen is not None:
            match = chosen.thesclass
            matching_synonym = chosen.lemma

    # If the sense can be restricted by Bayes classification(s), try to
    # find a match for *any* synonym (even if there's only one)
    if not match and sense.bayes.is_usable():
        pooled = [hit for syn in synonyms
                  for hit in tdb.ranked_search(lemma=syn,
                                               wordclass=sense.wordclass,
                                               branches=sense.bayes.ids(),
                                               current_only=True)]
        if pooled and pooled[0].thesclass is not None:
            match = pooled[0].thesclass
            matching_synonym = pooled[0].lemma

    return match, matching_synonym
def find_branch_from_superordinate(self, sense):
    """Classify by finding the main or only sense of the superordinate

    Only applies to nouns ('NN') and adjectives ('JJ') whose superordinate
    is at least three characters long and not listed in GENERICS. The
    superordinate is always looked up as a noun; for an adjective the
    resulting class is converted with tdb.equivalent_class().

    Returns the matching thesaurus class, or None if the sense does not
    qualify or no classified target sense can be found.
    """
    if sense.wordclass not in ('NN', 'JJ'):
        return None
    superordinate = sense.superordinate
    if (not superordinate or
            len(superordinate) < 3 or
            superordinate in GENERICS):
        return None

    def narrowed_by(branches):
        # Highest-ranked noun sense of the superordinate within branches
        return tdb.highest_ranked(lemma=superordinate,
                                  wordclass='NN',
                                  branches=branches,
                                  current_only=True)

    target_sense = None

    # If the superordinate is (more or less) single-sense, we assume that
    # sense to be the correct one
    candidates = tdb.ranked_search(lemma=superordinate,
                                   wordclass='NN',
                                   current_only=True)
    if candidates and tdb.distinct_senses(candidates) <= 2:
        target_sense = candidates[0]

    # Otherwise, narrow by Bayes classification (high confidence only)
    if target_sense is None and sense.bayes.confidence() >= 8:
        target_sense = narrowed_by(sense.bayes_based_classifications)

    # Otherwise, narrow by branches based on subject labels
    if target_sense is None and sense.label_based_classifications:
        target_sense = narrowed_by(sense.label_based_classifications)

    # Otherwise, narrow by branches based on cross-references
    if target_sense is None and sense.xref_branches:
        target_sense = narrowed_by(sense.xref_branches)

    # Last gasp: If the gloss consists more or less *only* of the
    # superordinate (e.g. 'an abbey'), then it should be adequate to
    # just use the main sense of the superordinate, even if it's
    # multi-sense.
    # But don't risk this if there are cross-references or subject
    # labels which might suggest a more specific use
    if (target_sense is None and
            not sense.subjects and
            not sense.xref_branches and
            sense.gloss is not None):
        g = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
        if g == superordinate:
            target_sense = MAIN_SENSE_FINDER.main_sense(
                lemma=superordinate, wordclass='NN')

    # Otherwise, narrow by Bayes classification again, this time
    # accepting any usable classification
    if target_sense is None and sense.bayes.is_usable():
        target_sense = narrowed_by(sense.bayes_based_classifications)

    if target_sense is None or target_sense.thesclass is None:
        return None
    match = target_sense.thesclass
    if sense.wordclass == 'JJ':
        match = tdb.equivalent_class(match, 'JJ')
    return match