def attributive_of_noun(sense, main_sense_of_entry):
    """
    Classify an attributive sense of a noun.

    An attrib. sense of a noun is treated as the adj. equivalent of
    the main sense of the entry (or of a particular sense, if one is
    cross-referenced).

    Arguments:
        sense -- the sense being classified; its cross_references,
            entry_id and lemma attributes are read.
        main_sense_of_entry -- fallback sense (may be None); only its
            thesclass attribute is read.

    Returns the result of tdb.equivalent_class(<noun class>, 'JJ'),
    tagged with reason_text and reason_code 'attb', or None if no
    classified noun sense is available or no equivalent class is found.
    """
    # Use a particular sense if it's cross-referenced.
    # Check that it's an internal cross-reference to a main sense:
    # it points back into this sense's own entry and has no lemma.
    xrefs = [xr for xr in sense.cross_references
             if xr.refentry == sense.entry_id and xr.lemma is None]

    source_class = None
    reason_text = None
    if xrefs:
        target_sense = tdb.highest_ranked(lemma=sense.lemma,
                                          wordclass='NN',
                                          refentry=xrefs[0].refentry,
                                          refid=xrefs[0].refid)
        if target_sense is not None:
            if target_sense.thesclass is None:
                # The cross-referenced sense exists but is unclassified:
                # give up rather than falling back to the main sense.
                return None
            source_class = target_sense.thesclass
            reason_text = 'Adjective equivalent of cross-referenced noun sense'

    # ... otherwise, default to the main sense of the entry
    if source_class is None:
        if (main_sense_of_entry is None or
                main_sense_of_entry.thesclass is None):
            return None
        source_class = main_sense_of_entry.thesclass
        reason_text = 'Adjective equivalent of main noun sense'

    equiv = tdb.equivalent_class(source_class, 'JJ')
    # Other callers of tdb.equivalent_class() in this file check its
    # result for None, so do the same here instead of raising
    # AttributeError when tagging the result.
    if equiv is not None:
        equiv.reason_text = reason_text
        equiv.reason_code = 'attb'
    return equiv
def infer_derivative(sense, main_sense_of_entry):
    """
    Classify a derivative sense by analogy with its parent entry.

    First tries the class equivalent (in the sense's own wordclass) to
    the main sense of the parent entry. Failing that, looks at sibling
    derivatives in the same entry and uses the superclass shared by
    most of them.

    Arguments:
        sense -- the derivative sense; its wordclass and entry_id
            attributes are read.
        main_sense_of_entry -- main sense of the parent entry, or None.

    Returns a class tagged with reason_text and reason_code 'driv',
    or None if neither approach yields anything.
    """
    equiv = None
    if (main_sense_of_entry is not None and
            not main_sense_of_entry.is_affix() and
            main_sense_of_entry.thesclass is not None):
        # Take precedent from the main sense of the parent entry
        equiv = tdb.equivalent_class(main_sense_of_entry.thesclass,
                                     sense.wordclass)
        if equiv is not None:
            equiv.reason_text = 'Parallel to "%s"' % main_sense_of_entry.lemma
            equiv.reason_code = 'driv'
    if equiv is not None:
        return equiv

    # Take precedent from other sibling derivatives
    # NOTE(review): other tdb.ranked_search() calls in this file pass
    # 'current_only' rather than 'currentOnly' - confirm which keyword
    # the method actually accepts.
    candidates = tdb.ranked_search(refentry=sense.entry_id,
                                   thes_linked=True,
                                   currentOnly=True)
    candidates = [c for c in candidates
                  if c.is_derivative() and
                  ' ' not in c.lemma and
                  c.superclass() is not None]
    if not candidates:
        return None

    # Count how many sibling derivatives fall under each superclass
    tally = defaultdict(int)
    for c in candidates:
        tally[c.superclass()] += 1
    # Computed once here, rather than once per item inside the
    # comprehension below.
    top_count = max(tally.values())
    parents = [p for p, num in tally.items() if num == top_count]
    # If there's more than one possible parent, pick the one with
    # the largest branch
    parents.sort(key=lambda p: p.branch_size, reverse=True)

    equiv = tdb.child_wordclass_branch(parents[0], sense.wordclass)
    if equiv is None:
        equiv = parents[0]
    if equiv is not None:
        equiv.reason_text = 'Parallel to %s' % ', '.join(
            ['"%s"' % (c.lemma,) for c in candidates if c.lemma])
        equiv.reason_code = 'driv'
    return equiv
def _append_derivational_classification(best_guesses, sense):
    """
    If the compound is a derivative of another compound, prepend or
    append a classification based on the root form.

    best_guesses is modified in place and also returned. Nothing is
    added when compound_derivative() yields no base class, when no
    equivalent class with a wordclass is found, or when an existing
    guess's target is already a descendant of that class.
    """
    root_class, position = compound_derivative(sense)
    if root_class is None:
        return best_guesses

    equiv = tdb.equivalent_class(root_class, sense.wordclass)
    if equiv is None or equiv.wordclass is None:
        return best_guesses

    # Skip if one of the existing guesses already sits under this class
    already_covered = [guess.target.is_descendant_of(equiv)
                       for guess in best_guesses]
    if any(already_covered):
        return best_guesses

    # Give the new guess a dummy score, calculated to be higher or
    # lower than the set of existing guesses
    if not best_guesses:
        dummy_score = 1
    elif position == 'first':
        dummy_score = best_guesses[0].target_score * 1.1
    else:
        dummy_score = best_guesses[-1].target_score * 0.9
    derived_guess = BestGuess(None, equiv, dummy_score, 'derivative')

    # ... and prepend/append it to the best_guesses list; any other
    # position value leaves the list untouched
    if position == 'first':
        best_guesses.insert(0, derived_guess)
    elif position == 'last':
        best_guesses.append(derived_guess)
    return best_guesses
def infer_from_etyma(sense, subjectFilter=False):
    """
    Classify a sense by analogy with the word given in its etymology.

    An etymon is only used when either (a) the sense has a single
    etymon whose lemma equals the sense's own lemma, or (b) it has two
    etyma, a purely alphabetic base followed by a '-suffix', and the
    suffix is neutral according to deriv_tester (or is one of '-ist',
    '-ian', '-ful' on an adjective).

    Arguments:
        sense -- the sense being classified. Each item of sense.etyma
            is indexed as [0] lemma, [1] refentry, [2] refid.
        subjectFilter -- if true, the fallback lookup is restricted by
            the sense's subjects and to targets whose entry_size is
            below 100; if false, the entry's main sense is used.

    Returns a class tagged with reason_code 'etym' and a reason_text,
    or None.
    """
    etymon = None
    if len(sense.etyma) == 1 and sense.etyma[0][0] == sense.lemma:
        etymon = sense.etyma[0]
    elif len(sense.etyma) == 2:
        base, affix = sense.etyma
        if (re.search(r'^[a-zA-Z]+$', base[0]) and
                re.search(r'^-[a-z]+$', affix[0])):
            suffix = affix[0]
            if (deriv_tester.is_neutral_suffix(suffix) or
                    (suffix in ('-ist', '-ian', '-ful') and
                     sense.wordclass == 'JJ')):
                etymon = base
    if etymon is None:
        return None

    word, entry_ref = etymon[0], etymon[1]

    # First try to find the exact sense, in case the etymon points to a
    # specific sense - see e.g. lam n./3
    target = tdb.highest_ranked(lemma=word,
                                refentry=entry_ref,
                                refid=etymon[2],
                                exact_sense=True)
    if target is None:
        # ...but if the etymon just points to an entry in general, find
        # that entry's main sense
        if subjectFilter:
            fallback = tdb.highest_ranked(lemma=word,
                                          refentry=entry_ref,
                                          subjects=sense.subjects)
            if fallback is not None and fallback.entry_size < 100:
                target = fallback
        else:
            # NOTE(review): lower-case 'main_sense_finder' here, whereas
            # other functions in this file use MAIN_SENSE_FINDER -
            # confirm both names exist.
            target = main_sense_finder.main_sense(lemma=word,
                                                  refentry=entry_ref)
    if target is None:
        return None

    # Check if the target is also referenced in the sense's
    # cross-references (in case a particular sense is pointed to,
    # as in 'nocturning').
    for xr in sense.cross_references:
        if (xr.lemma == target.lemma and
                xr.refentry == target.refentry):
            pinpointed = tdb.highest_ranked(lemma=xr.lemma,
                                            refentry=xr.refentry,
                                            refid=xr.refid,
                                            exact_sense=True)
            if pinpointed is not None:
                target = pinpointed
                break

    if target.thesclass is None:
        return None
    if target.wordclass == sense.wordclass:
        match = target.thesclass.wordclass_parent()
    else:
        match = tdb.equivalent_class(target.thesclass, sense.wordclass)
    if match is not None:
        match.reason_code = 'etym'
        match.reason_text = 'Analogy with "%s" in etymology' % word
    return match
def superordinate_adjective_state(self, sense):
    """
    Classify a noun sense whose superordinate has the form
    'state of being X-JJ' or 'being X-JJ state'.

    The main adjective sense of X is looked up and, if it is
    classified, the result of tdb.equivalent_class(<its class>, 'NN')
    is returned. Returns None in every other case.
    """
    superordinate = sense.superordinate
    if superordinate is None:
        return None

    # Try the two accepted phrasings in turn
    hit = (re.search(r'^state of being ([a-z-]+)-JJ$', superordinate) or
           re.search(r'^being ([a-z-]+)-JJ state$', superordinate))
    if hit is None:
        return None

    adjective_sense = MAIN_SENSE_FINDER.main_sense(lemma=hit.group(1),
                                                   wordclass='JJ')
    if adjective_sense is None or adjective_sense.thesclass is None:
        return None
    return tdb.equivalent_class(adjective_sense.thesclass, 'NN')
def superordinate_lookup(sense, **kwargs):
    """
    Classify a sense from its superordinate.

    Delegates to superordinate_manager.superordinate_lookup(), passing
    kwargs through unchanged. For an adjective ('JJ') sense the match
    is converted with tdb.equivalent_class(match, 'JJ') and tagged
    with reason_code 'adeq'; for any other wordclass the match is
    returned as found, tagged with reason_code 'supe'.

    Returns the tagged class, or None if there is no match, the match
    has no wordclass, or no adjective equivalent is found.
    """
    match = superordinate_manager.superordinate_lookup(sense, **kwargs)
    if match is None or match.wordclass is None:
        return None

    if sense.wordclass == 'JJ':
        match = tdb.equivalent_class(match, 'JJ')
        # Other callers of tdb.equivalent_class() in this file check its
        # result for None, so do the same here instead of raising
        # AttributeError when tagging the result.
        if match is None:
            return None
        match.reason_code = 'adeq'
        match.reason_text = ('Adjective equivalent of "%s"'
                             % sense.superordinate)
    else:
        match.reason_code = 'supe'
        match.reason_text = ('Classification of superordinate "%s" ("%s")'
                             % (sense.superordinate,
                                sense.superordinate_full))
    return match
def infer_from_neighbouring_wordclass(sense):
    """
    Classify a sense from the same lemma in a neighbouring wordclass
    within the same entry.

    Adjectives and verbs are compared against the noun; nouns and
    adverbs against the adjective. Any other wordclass returns None.

    Returns the equivalent class in the sense's own wordclass, tagged
    with reason_code 'nbor', or None.
    """
    # Which wordclass to consult for each wordclass handled
    neighbours = {'JJ': 'NN', 'VB': 'NN', 'NN': 'JJ', 'RB': 'JJ'}
    opposite_class = neighbours.get(sense.wordclass)
    if opposite_class is None:
        return None

    opposite = tdb.highest_ranked(lemma=sense.lemma,
                                  refentry=sense.entry_id,
                                  wordclass=opposite_class)
    if opposite is None or opposite.thesclass is None:
        return None

    match = tdb.equivalent_class(opposite.thesclass, sense.wordclass)
    if match is not None:
        match.reason_code = 'nbor'
        match.reason_text = ('Inferred from neighbouring %s branch'
                             % opposite_class)
    return match
def cf_cross_reference(sense):
    """
    Classify a sense by analogy with the target of its 'cf'-type
    cross-reference.

    If the current sense has a 'cf'-type cross-reference, the
    classification of the target sense is taken as a good guide to the
    classification of the current sense.

    If the target sense and the current sense are in the same wordclass
    and have the same word-ending (last two characters of the lemma),
    we assume that the current sense should go in the target sense's
    class (similar to how 'equals'-type xrefs are treated).
    Otherwise, we put the current sense at the top of the equivalent
    wordclass-level branch.

    Returns a class tagged with reason_code 'cfxr', or None if there
    is no cf cross-reference, the target is too ambiguous, the chosen
    target is unclassified, or no class can be derived from it.
    """
    xr = sense.cf_crossreference()
    if xr is None:
        # No cf-type cross-reference to work from
        return None

    # Nb don't specify a wordclass here, since the target wordclass may
    # well be different from the current sense's wordclass
    target_senses, sense_count = tdb.cross_reference_target(
        lemma=xr.lemma, refentry=xr.refentry, refid=xr.refid)

    # Don't attempt if the target is too ambiguous (too many possible
    # senses)
    if not target_senses or sense_count > 3:
        return None

    # Of the senses rated the same as the first one, take the one with
    # the largest node size
    top_rating = target_senses[0].rating()
    high_scores = [t for t in target_senses if t.rating() == top_rating]
    high_scores.sort(key=lambda i: i.node_size(), reverse=True)
    target = high_scores[0]
    if target.thesclass is None:
        return None

    if (target.wordclass == sense.wordclass and
            target.lemma[-2:] == sense.lemma[-2:]):
        match = target.thesclass
    elif target.wordclass == sense.wordclass:
        # NOTE(review): called on the target sense itself, whereas
        # other code in this file calls wordclass_parent() on a
        # thesaurus class - confirm the sense object provides it.
        match = target.wordclass_parent()
    else:
        match = tdb.equivalent_class(target.thesclass, sense.wordclass)

    if match is not None:
        match.reason_text = ('Analogy with target of cf-type xref ("%s")'
                             % target.lemma)
        match.reason_code = 'cfxr'
    return match
def find_branch_from_superordinate(self, sense):
    """
    Classify by finding the main or only sense of the superordinate.

    Only noun and adjective senses are handled, and only when the
    superordinate is at least three characters long and not listed in
    GENERICS. A series of strategies is tried in order until one
    yields a noun sense of the superordinate.

    Returns that sense's thesaurus class (converted with
    tdb.equivalent_class(..., 'JJ') for an adjective sense), or None.
    """
    if sense.wordclass not in ('NN', 'JJ'):
        return None
    headword = sense.superordinate
    if not headword or len(headword) < 3 or headword in GENERICS:
        return None

    def narrowed_to(branches):
        # Best noun sense of the superordinate within the given branches
        return tdb.highest_ranked(lemma=headword,
                                  wordclass='NN',
                                  branches=branches,
                                  current_only=True)

    chosen = None

    # If the superordinate is (more or less) single-sense, we assume
    # that sense to be the correct one
    ranked = tdb.ranked_search(lemma=headword,
                               wordclass='NN',
                               current_only=True)
    if ranked and tdb.distinct_senses(ranked) <= 2:
        chosen = ranked[0]

    # Otherwise, narrow by Bayes classification (high confidence only)
    if chosen is None and sense.bayes.confidence() >= 8:
        chosen = narrowed_to(sense.bayes_based_classifications)

    # Otherwise, narrow by branches based on subject labels
    if chosen is None and sense.label_based_classifications:
        chosen = narrowed_to(sense.label_based_classifications)

    # Otherwise, narrow by branches based on cross-references
    if chosen is None and sense.xref_branches:
        chosen = narrowed_to(sense.xref_branches)

    # Last gasp: if the gloss consists more or less *only* of the
    # superordinate (e.g. 'an abbey'), then it should be adequate to
    # just use the main sense of the superordinate, even if it's
    # multi-sense.
    # But don't risk this if there are cross-references or subject
    # labels which might suggest a more specific use
    if (chosen is None and
            not sense.subjects and
            not sense.xref_branches and
            sense.gloss is not None):
        bare_gloss = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
        if bare_gloss == headword:
            chosen = MAIN_SENSE_FINDER.main_sense(lemma=headword,
                                                  wordclass='NN')

    # Finally, retry the Bayes classification whenever it is usable
    # at all
    if chosen is None and sense.bayes.is_usable():
        chosen = narrowed_to(sense.bayes_based_classifications)

    if chosen is None or chosen.thesclass is None:
        return None
    branch = chosen.thesclass
    if sense.wordclass == 'JJ':
        branch = tdb.equivalent_class(branch, 'JJ')
    return branch
def formal_compound_analysis(sense, entry_main_sense):
    """
    Figure out a likely thesaurus class based on the form of a two-part
    compound lemma.

    This is the main way of classifying undefined compounds, and can be
    used to support other methods for defined compounds.

    Arguments:
        sense -- the compound sense being analysed.
        entry_main_sense -- used as the main sense of the compound's
            first element (may be None).

    Returns a CompoundAnalysisResult object. When one of the
    special-case handlers applies, its forced_result attribute is set;
    otherwise its best_guesses attribute is filled in. An unworkable
    compound gets the result of formal_compound_null_result().
    """
    # Bail out if this is not a workable compound
    if (sense.last_element() is None or
            sense.last_element() in STOPWORDS or
            sense.first_element() in STOPWORDS or
            sense.wordclass not in WORDCLASSES):
        return formal_compound_null_result()

    first = sense.first_element()
    last = sense.last_element()

    # Initialize the CompoundAnalysisResult object which will be returned
    output = CompoundAnalysisResult(lemma=sense.lemma,
                                    refentry=sense.entry_id,
                                    refid=sense.node_id)
    output.is_undefined = sense.is_undefined()

    #=====================================
    # Build the core tables for the result object
    #=====================================

    # Insert consensus of different Bayes evaluations
    if sense.is_possibly_parasynthetic():
        bayes_modes = ('main', 'bias_high',)
    else:
        bayes_modes = ('main', 'bias_low',)
    output.bayes_consensus = compute_bayes_consensus(sense, bayes_modes)[0:10]

    # Insert ranked senses for the second word
    output.ranked_senses = rank_senses_for_last_element(sense)[0:10]

    # Likely thesaurus branches for the first and last elements,
    # derived from the index of compound elements. An index is only
    # used if its count is at least 5.
    word1_index = retrieve_from_compound_index(first, 'first')
    word2_index = retrieve_from_compound_index(last, sense.wordclass)
    word1_usable = word1_index is not None and word1_index.count >= 5
    word2_usable = word2_index is not None and word2_index.count >= 5
    if word1_usable and word2_usable:
        p = word2_index.combined_probabilities(word1_index)
    elif word2_usable:
        p = word2_index.combined_probabilities(None)
    else:
        p = []
    output.index_consensus = p[0:10]
    if word2_index is not None:
        output.index_count = word2_index.count

    # Establish the main sense of the first and last words
    word1_main_sense = entry_main_sense
    word2_main_sense = MAIN_SENSE_FINDER.main_sense(
        lemma=last, wordclass=sense.wordclass)

    #=====================================
    # Handlers for special cases
    #=====================================

    # Special handling of 'doctor-like', 'cat-wise', etc. (and of
    # noun compounds ending in 'piece' or 'part') - we just return the
    # class equivalent to the first element
    if ((last in SIMILATIVE and sense.wordclass in ('JJ', 'RB')) or
            (last in ('piece', 'part') and sense.wordclass == 'NN')):
        if (word1_main_sense is not None and
                word1_main_sense.thesclass is not None):
            t = tdb.equivalent_class(word1_main_sense.thesclass,
                                     sense.wordclass)
            if t is not None:
                # NOTE(review): this handler used to assign
                # 't.wordclass_parent() or t' and then immediately
                # overwrite it with 't'. The dead first assignment has
                # been dropped and the effective value kept; confirm
                # that the exact equivalent class (not its
                # wordclass-level parent) is what is wanted here.
                output.forced_result = t
                return output

    # Special handling of 'pro-...', 'anti-...' and 'un-...' - we just
    # return the wordclass branch equivalent to the last element
    if first in ('pro', 'anti', 'un') and sense.wordclass in ('JJ', 'NN'):
        if (word2_main_sense is not None and
                word2_main_sense.thesclass is not None):
            t = tdb.equivalent_class(word2_main_sense.thesclass,
                                     sense.wordclass)
            if t is not None:
                output.forced_result = t.wordclass_parent() or t
                return output

    # Special handling of '...-shaped', '...-coloured', etc.
    if last in PARASYN_ENDINGS:
        output.forced_result = PARASYN_ENDINGS[last]
        return output

    # Special handling of cases where the second element is single-sense
    # (but don't risk this with upper-case forms or possible plurals)
    if (sense.wordclass == 'NN' and
            len(output.ranked_senses) == 1 and
            len(output.ranked_senses[0].classes) == 1 and
            output.ranked_senses[0].classes[0] is not None and
            last.islower() and
            not re.search(r'[^s]s$', last)):
        output.forced_result = output.ranked_senses[0].classes[0]
        return output

    #=====================================
    # Handler for regular cases (compute best guesses)
    #=====================================
    output.best_guesses = compute_best_guesses(sense, output)
    return output