def refine_by_binomial(candidate_classifications, qbinomials):
    """
    Refine classification using binomials found in quotations.

    For each candidate classification, if any thesaurus class derived
    from a quotation binomial is a strict descendant of it, the candidate
    is replaced by that narrower class (carrying over the original
    reason code/text). Returns the (possibly refined) list.
    """
    def test_descent(current_class, possible_refinements):
        # Adopt the first binomial-derived class that sits strictly
        # below the current classification; otherwise keep it as-is.
        for binomial_class in possible_refinements:
            if (binomial_class != current_class and
                    binomial_class.is_descendant_of(current_class)):
                binomial_class.reason_code = current_class.reason_code
                binomial_class.reason_text = current_class.reason_text
                return binomial_class
        return current_class

    # Map each quotation binomial to a thesaurus class ID; the set drops
    # duplicates, and the comprehension below drops failed lookups (None).
    binomial_ids = {binomial_checker.find_class(b) for b in qbinomials}
    binomials = [tdb.get_thesclass(class_id) for class_id in binomial_ids
                 if class_id is not None]
    if binomials:
        candidate_classifications = [test_descent(t, binomials)
                                     for t in candidate_classifications]
    return candidate_classifications
def make_raw_index(self):
    """
    Build the raw genus/binomial indexes from the pickled senses.

    Scans every noun sense that carries binomials or genus terms, and
    for each of its thesaurus leaf nodes inside one of the life branches
    (plant/animal/micro-organism), records the leaf against each genus
    and binomial. Results are written out as CSV (term, leaf-id, ...).
    """
    store = {v: defaultdict(list) for v in ('genera', 'binomials')}
    loader = PickleLoader(self.input_dir)
    for s in loader.iterate():
        if s.wordclass == 'NN' and (s.binomials or s.genera):
            for leaf in s.thesaurus_nodes:
                thesclass = tdb.get_thesclass(leaf)
                # Only index senses somewhere under the life branches
                if any(thesclass.is_descendant_of(branch_id)
                       for branch_id in life_branches):
                    for g in s.genera:
                        store['genera'][g].append(leaf)
                    for b in s.binomials:
                        store['binomials'][b].append(leaf)
                        # The genus implied by the binomial also counts
                        # as a genus occurrence, unless already listed
                        genus = b.split(' ')[0]
                        if genus not in s.genera:
                            store['genera'][genus].append(leaf)
    # Dump each index as rows of the form: term, leaf-id, leaf-id, ...
    for k in ('genera', 'binomials'):
        with open(self.raw_files[k], 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for t, vals in store[k].items():
                row = [t]
                row.extend(vals)
                csvwriter.writerow(row)
def winnow(superordinate, values):
    """
    Reduce raw (thesaurus-ID, count) pairs to a ranked list of
    (thesaurus-class, share-of-total) pairs, dropping minor classes and
    classes that are merely parents of other surviving classes.
    """
    # Convert stored IDs to classes, keeping only the relevant
    # Penn wordclass (verbs for *VB superordinates, else nouns)
    target_pos = 'VB' if superordinate.endswith('VB') else 'NN'
    pairs = [(tdb.get_thesclass(v[0]), v[1]) for v in values]
    pairs = [p for p in pairs if p[0].penn_wordclass() == target_pos]

    # With zero or one sense there is nothing to winnow
    if len(pairs) <= 1:
        return [(p[0], 1) for p in pairs]

    # Winnow out classes containing under 25% of the total senses
    total = sum(count for _, count in pairs)
    winnowed = _hopper(pairs, total, 0.25)
    winnowed.sort(key=lambda row: row[1], reverse=True)
    winnowed.sort(key=lambda row: row[2], reverse=True)

    # Tally, per parent class, how many senses its children account for
    parent_tally = {row[0].parent.id: 0 for row in winnowed}
    for row in winnowed:
        parent_tally[row[0].parent.id] += row[1]

    # Drop a class when it's a parent of other classes and its own tally
    # barely exceeds the sum of its children's (the 1.3 margin allows
    # for wastage through skipped child classes)
    survivors = [row for row in winnowed
                 if row[0].id not in parent_tally
                 or row[1] > parent_tally[row[0].id] * 1.3]
    return [(row[0], row[1] / total) for row in survivors]
def sample_to_csv(self, name, size, function):
    """
    Collect a sample of senses and dump it to <out_dir>/<name>.csv,
    one row per sense.
    """
    self.collect_sample(name, size, function)
    out_file = os.path.join(self.out_dir, name + '.csv')
    with open(out_file, 'wb') as fh:
        csvwriter = csv.writer(fh)
        csvwriter.writerow(columns)
        for sense in self.sample:
            if sense.definition is None:
                definition = '[undefined]'
            else:
                definition = sense.definition[0:200]
                if definition.startswith('='):
                    # Prefix a dot — presumably so spreadsheet apps
                    # don't treat the cell as a formula (confirm)
                    definition = '.' + definition
            thesclass = tdb.get_thesclass(sense.class_id)
            # Flag whether the class has reached wordclass level
            wordclass_level = 'n' if thesclass.wordclass is None else 'y'
            csvwriter.writerow((
                sense.lemma.encode('utf8'),
                sense.wordclass,
                definition.encode('utf8'),
                thesclass.id,
                thesclass.breadcrumb().encode('utf8'),
                wordclass_level,
                '',
                sense.oed_url(),
                thesclass.oed_url(),
                sense.reason_code,
            ))
def refine_index():
    """
    For the lists of thesaurus nodes generated by make_raw_index(), try to
    pin down to branches where there are particular clusters (throwing away
    outliers, etc.).
    """
    for wordclass in WORDCLASSES:
        lemmas = []
        filepath = os.path.join(DIRECTORY, wordclass + '_raw.csv')
        with open(filepath, 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                # Row format: lemma, then alternating (class-id, score)
                # pairs - hence the [::2]/[1::2] slices
                lemma = row[0]
                values = row[1:]
                ids = [int(id) for id in values[::2]]
                scores = [float(s) for s in values[1::2]]
                # Skip lemmas with too little evidence overall
                if sum(scores) >= 4:
                    # Aggregate scores per thesaurus class ID
                    idmap = defaultdict(int)
                    for id, score in zip(ids, scores):
                        idmap[id] += score
                    lemmas.append((lemma, Counter(idmap).most_common()))
        store = []
        for lemma, idcounter in lemmas:
            classes = [(tdb.get_thesclass(id), score)
                       for id, score in idcounter]
            total_score = sum([c[1] for c in classes])
            # Tally scores by level-3 ancestor branch
            ancestors = defaultdict(int)
            for thesclass, score in classes:
                a = thesclass.ancestor(level=3)
                if a:
                    ancestors[a] += score
            l3_ancestors = Counter(ancestors).most_common()
            # Accept the leading level-3 branch only if it holds over 30%
            # of the total score AND clearly beats the runner-up
            # (by a 1.3 margin)
            if (l3_ancestors and l3_ancestors[0][1] > total_score * 0.3 and
                    (len(l3_ancestors) == 1 or
                     l3_ancestors[0][1] > l3_ancestors[1][1] * 1.3)):
                parent_branch = l3_ancestors[0][0]
                # Try to narrow further to a level-4 branch lying within
                # the chosen level-3 branch
                ancestors = defaultdict(int)
                for thesclass, score in classes:
                    a = thesclass.ancestor(level=4)
                    if a and a.is_descendant_of(parent_branch):
                        ancestors[a] += score
                l4_ancestors = Counter(ancestors).most_common()
                if l4_ancestors and l4_ancestors[0][1] > total_score * 0.3:
                    target = l4_ancestors[0][0]
                else:
                    # Not enough level-4 consensus; stay at level 3
                    target = parent_branch
                store.append((lemma, target))
        # Write out one row per lemma: lemma, class ID, breadcrumb
        outfile = os.path.join(DIRECTORY, '%s_compounds.csv' % wordclass)
        with open(outfile, 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for lemma, thesclass in store:
                row = (lemma, thesclass.id, thesclass.breadcrumb())
                csvwriter.writerow(row)
def count_training():
    """
    Print a level histogram (levels 0-16, tab-separated) of all the
    thesaurus nodes attached to senses in the training directory.
    """
    counts = {i: 0 for i in range(17)}
    pl = PickleLoader(training_dir)
    for sense in pl.iterate():
        for n in sense.thesaurus_nodes:
            thesclass = tdb.get_thesclass(n)
            counts[thesclass.level] += 1
    for i in range(17):
        # Parenthesized single-argument print works under both
        # Python 2 and Python 3 (original used the py2-only statement)
        print('%d\t%d' % (i, counts[i]))
def compare_binomials(sense):
    """
    Return the thesaurus class indicated by the sense's taxonomic names
    (binomials first, falling back to genus terms), or None for
    non-noun senses or when no name resolves to a class.

    The returned class is annotated with a reason text/code recording
    the taxonomic names that produced it.
    """
    if sense.wordclass != 'NN':
        return None

    match = None
    # Try the binomials; if none of them resolves, try the genera.
    # (Each later hit overwrites an earlier one, as in the original.)
    for term in sense.binomials:
        found = binomial_checker.find_class(term)
        if found is not None:
            match = tdb.get_thesclass(found)
    if match is None:
        for term in sense.genera:
            found = binomial_checker.find_class(term)
            if found is not None:
                match = tdb.get_thesclass(found)

    if match is None:
        return None
    match.reason_text = 'Taxonomic name: %s' % ', '.join(
        sense.binomials.union(sense.genera))
    match.reason_code = 'txny'
    return match
def finalize():
    """
    Use the 'manual' file to override where necessary

    For each wordclass, merges the auto-generated compounds file with the
    manual file (manual entries win), then for every lemma picks the
    best-rated sense consistent with the compound branch, and writes the
    result to <DIRECTORY>/<wordclass>.csv.
    """
    for wordclass in WORDCLASSES:
        lemmas = {}
        infile1 = os.path.join(DIRECTORY, '%s_compounds.csv' % wordclass)
        infile2 = os.path.join(DIRECTORY, '%s_manual.csv' % wordclass)
        # Rows are (lemma, class-id); later reads overwrite earlier ones
        with open(infile1, 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                lemmas[row[0]] = int(row[1])
        # Do the manual file second, so that it overrides the
        # automatically-generated file
        with open(infile2, 'r') as filehandle:
            csvreader = csv.reader(filehandle)
            for row in csvreader:
                lemmas[row[0]] = int(row[1])
        output = []
        for lemma, class_id in lemmas.items():
            # Retrieve the branch that the majority of compounds are on
            compound_branch = tdb.get_thesclass(class_id)
            # Get the highest-rated senses for the lemma
            ranked_senses = tdb.ranked_search(lemma=lemma,
                                             wordclass=wordclass)
            if ranked_senses:
                # Keep only senses rated within 30% of the top rating
                max_rating = ranked_senses[0].rating()
                ranked_senses = [s for s in ranked_senses if max_rating > 0
                                 and s.rating() > max_rating * 0.3]
                # Try filtering to just those senses that are on
                # the same branch as the compounds (an unclassified
                # top-ranked sense is allowed through too)
                ranked_filtered = [s for i, s in enumerate(ranked_senses)
                                   if (i == 0 and s.thesclass is None) or
                                   s.is_descendant_of(compound_branch)]
                # ... or else stick with original ranking
                if not ranked_filtered:
                    ranked_filtered = ranked_senses
                if ranked_filtered:
                    output.append(ranked_filtered[0])
        outfile = os.path.join(DIRECTORY, '%s.csv' % wordclass)
        output.sort(key=lambda s: s.lemma)
        with open(outfile, 'w') as filehandle:
            csvwriter = csv.writer(filehandle)
            for s in output:
                row = (s.lemma, s.refentry, s.refid, s.entry_size,
                       s.breadcrumb())
                csvwriter.writerow(row)
def _bayes_mismatch(sense):
    """
    Report whether the Bayes classification confidently disagrees with
    the selected classification, i.e. the selected class does not sit
    under the Bayes class's branch (compared at level 3).
    """
    # Both attributes must exist before any comparison is possible
    try:
        sense.bayes_classification
        sense.class_id
    except AttributeError:
        return False
    # No verdict when either side is unset or Bayes confidence is low
    if (sense.class_id is None
            or sense.bayes_classification is None
            or sense.bayes_confidence <= 3):
        return False
    chosen = tdb.get_thesclass(sense.class_id)
    bayes = tdb.get_thesclass(sense.bayes_classification)
    # Normalize the Bayes class to level-3 granularity before comparing
    if bayes.level > 3:
        bayes = bayes.ancestor(level=3)
    return not chosen.is_descendant_of(bayes)
def count_classified():
    """
    Print a level histogram (levels 0-16, tab-separated) of the classes
    assigned to every classified sense under the parent directories.
    """
    counts = {i: 0 for i in range(17)}
    for p in parent_directories:
        subdir = os.path.join(p, 'classified')
        pl = PickleLoader(subdir)
        for sense in pl.iterate():
            try:
                sense.class_id
            except AttributeError:
                # Sense was never classified; skip it
                pass
            else:
                thesclass = tdb.get_thesclass(sense.class_id)
                counts[thesclass.level] += 1
    for i in range(17):
        # Parenthesized single-argument print works under both
        # Python 2 and Python 3 (original used the py2-only statement)
        print('%d\t%d' % (i, counts[i]))
def inspect_classification(self, sense):
    """
    Fold one classified sense into the running statistics: per-level
    counts, wordclass-level count, and reason-code tallies.

    Class level and wordclass-ness are memoized per class ID in
    Statistics.cache to avoid repeated thesaurus lookups.
    """
    cache = Statistics.cache
    if sense.class_id not in cache:
        thesclass = tdb.get_thesclass(sense.class_id)
        cache[sense.class_id] = {
            'l': thesclass.level,
            'w': thesclass.wordclass is not None,
        }
    entry = cache[sense.class_id]
    self.levels[entry['l']] += 1
    if entry['w']:
        self.wordclass += 1
    # reason_code is optional on a sense; skip when absent
    try:
        reason = sense.reason_code
    except AttributeError:
        pass
    else:
        self.reasons[reason] += 1
def drilldown(vals):
    """
    Starting from the living-world node, walk down levels 4-9 picking the
    single dominant branch among the noun classes at each level. Stops
    when the vote is tied, when a wordclass-level node is reached, or
    when no candidates remain inside the current branch. Returns the
    deepest branch reached.
    """
    thesclasses = [tdb.get_thesclass(v) for v in vals]
    # Keep noun classes only ('NN' vs 'noun' presumably covers two
    # labelling schemes in use - confirm)
    thesclasses = [t for t in thesclasses if t.wordclass == 'NN'
                   or t.wordclass == 'noun']
    branch = living_world_node
    for lev in (4, 5, 6, 7, 8, 9):
        # Candidate ancestors at this level, restricted to the branch
        # selected so far
        level_ancestors = [t.ancestor(level=lev) for t in thesclasses]
        level_ancestors = [a for a in level_ancestors
                           if a is not None and a.is_descendant_of(branch)]
        if level_ancestors:
            histogram = Counter(level_ancestors).most_common()
            # All the ancestors tied for the top frequency
            most_common = [t[0] for t in histogram
                           if t[1] == histogram[0][1]]
            if len(most_common) > 1:
                # Ambiguous vote: stop at the current branch
                break
            branch = most_common[0]
            if branch.wordclass is not None:
                # Reached a wordclass-level node; stop drilling
                break
        else:
            break
    return branch
def refine_binomial_index(self):
    """
    Clean the raw binomials index: for each binomial, pick a single
    thesaurus class, constrained (where possible) to the branch already
    established for the binomial's genus, and write the result to the
    clean binomials file as (binomial, class-id, breadcrumb) rows.
    """
    # Load the genus terms and their branch
    genera = {}
    with open(self.clean_files['genera'], 'r') as filehandle:
        csvreader = csv.reader(filehandle)
        for row in csvreader:
            genera[row[0]] = int(row[1])
    # load the raw binomials data
    binomials = []
    with open(self.raw_files['binomials'], 'r') as filehandle:
        csvreader = csv.reader(filehandle)
        for row in csvreader:
            # Row format: binomial followed by thesaurus class IDs
            b = row.pop(0)
            ids = [int(id) for id in row]
            binomials.append((b, ids))
    # Trim down to just those thesaurus classes that are inside the genus
    # term's branch
    binomials2 = []
    for b in binomials:
        binomial = b[0]
        # The genus is the first word of the binomial
        genus = binomial.split(' ')[0]
        thesclasses = [tdb.get_thesclass(v) for v in b[1]]
        if genus in genera:
            thesclasses = [t for t in thesclasses
                           if t.is_descendant_of(genera[genus])]
        # Of the remainder, pick the largest branch
        if thesclasses:
            histogram = Counter(thesclasses).most_common()
            # Candidates tied for the highest frequency...
            most_common = [t[0] for t in histogram
                           if t[1] == histogram[0][1]]
            # ...with ties broken by branch size (largest wins)
            most_common.sort(key=lambda t: t.branch_size, reverse=True)
            binomials2.append((binomial, most_common[0]))
    with open(self.clean_files['binomials'], 'w') as filehandle:
        csvwriter = csv.writer(filehandle)
        for t, v in binomials2:
            row = [t, v.id, v.breadcrumb()]
            csvwriter.writerow(row)
import lex.oed.thesaurus.thesaurusdb as tdb from resources.mainsense.mainsense import MainSense from ..indexer.compoundindexretriever import retrieve_from_compound_index from ..bayes.computebayesconsensus import compute_bayes_consensus from .computebestguesses import compute_best_guesses from classifyengine.rankedsensesummary import ranked_sense_summary WORDCLASSES = ('NN', 'JJ', 'RB', 'first') MAIN_SENSE_FINDER = MainSense() # Living world, abstract properties, relative properties - dangerous # classes since very vague and miscellaneous DANGER_BRANCHES = {8835, 82596, 111290} PARASYN_ENDINGS = {word: tdb.get_thesclass(class_id) for word, class_id in (('shaped', 98385), ('colour', 81487), ('coloured', 81487))} SIMILATIVE = {'like', 'wise', 'based', 'containing', 'form', 'formed', 'free'} # Don't attempt compounds where either word is one of these: STOPWORDS = {'of', 'a', 'an', 'in', 'to', 'the', 'by', 'for', 'less'} def formal_compound_analysis(sense, entry_main_sense): """ Figure out a likely thesaurus class based on the form of a two-part compound lemma.
def node(self):
    """
    The thesaurus class for this object's `id`, fetched lazily and
    cached on the instance as `_node`.
    """
    if not hasattr(self, '_node'):
        self._node = tdb.get_thesclass(self.id)
    return self._node
def exact_node(self):
    """
    The thesaurus class for this object's `exact_id`, fetched lazily
    and cached on the instance as `_exact_node`.
    """
    if not hasattr(self, '_exact_node'):
        self._exact_node = tdb.get_thesclass(self.exact_id)
    return self._exact_node
def trace_sense(sense):
    """
    Build a human-readable debug trace of a sense and its (eventual)
    classification, returned as a single string with lines joined by
    newline + tab.

    Attributes that may be absent on a given sense (superordinate,
    synonyms, noun_phrases, bayes, class_id, reason_code, reason_text)
    are probed with try/except and silently skipped when missing.
    """
    lines = ['--------------------------------------------------',]
    lines.append('"%s" %s (%d#eid%d)' % (sense.lemma, sense.wordclass,
                                         sense.entry_id, sense.node_id))
    lines.append('"%s"' % sense.definition)
    lines.append('"%s"' % sense.gloss)
    if sense.subjects:
        lines.append('subjects: ' + ', '.join(['"%s"' % s
                                               for s in sense.subjects]))
    if sense.etyma:
        lines.append('etyma: ' + ', '.join(['"%s"' % e[0]
                                            for e in sense.etyma]))
    try:
        sense.superordinate
    except AttributeError:
        pass
    else:
        if sense.superordinate:
            lines.append('superordinate: %s (%s)' % (
                sense.superordinate, sense.superordinate_full))
    try:
        sense.synonyms
    except AttributeError:
        pass
    else:
        if sense.synonyms:
            lines.append('synonyms:' + ', '.join(['"%s"' % s
                                                  for s in sense.synonyms]))
    try:
        sense.noun_phrases
    except AttributeError:
        pass
    else:
        if sense.noun_phrases:
            lines.append('NPs:' + ', '.join(['"%s"' % np
                                             for np in sense.noun_phrases]))
    try:
        sense.bayes
    except AttributeError:
        pass
    else:
        # Report every Bayes branch within 0.3 of the best candidate
        for thesclass in sense.bayes.branches(max_delta=0.3):
            lines.append('Bayes: %s' % thesclass.breadcrumb())
    try:
        sense.class_id
    except AttributeError:
        pass
    else:
        # The assigned class gets a full trace via trace_class()
        thesclass = tdb.get_thesclass(sense.class_id)
        lines.append('>>>')
        lines.append(trace_class(thesclass))
    try:
        sense.reason_code
    except AttributeError:
        pass
    else:
        if sense.reason_code is not None:
            lines.append('Reason code: %s' % sense.reason_code)
    try:
        sense.reason_text
    except AttributeError:
        pass
    else:
        if sense.reason_text is not None:
            lines.append('Reason: %s' % sense.reason_text)
    # 'simples' appears to be a module-level compiled regex; its matches
    # are masked with '?' - confirm its pattern at the definition site
    lines = [simples.sub('?', l) for l in lines]
    return '\n\t'.join(lines)
def _sense_to_row(sense, status):
    """
    Flatten a sense object into a list of column values (matching the
    database table's column order - do not reorder).

    Optional sense attributes (reason_code, reason_text, class_id,
    runners_up, bayes_classification) default to None/0 when absent.
    """
    if sense.definition is None:
        undefined = True
        definition = None
    else:
        undefined = False
        definition = sense.definition[:200]
    if sense.definition_supplement:
        definition_supplement = sense.definition_supplement[:150]
    else:
        definition_supplement = None
    try:
        reasoncode = sense.reason_code
    except AttributeError:
        reasoncode = None
    # TypeError also caught: reason_text may be present but None
    try:
        reasontext = sense.reason_text[:200]
    except (AttributeError, TypeError):
        reasontext = None
    try:
        thesclass1_id = sense.class_id
    except AttributeError:
        thesclass1_id = None
    # Second and third choices come from the runners_up list, which may
    # be missing or shorter than two elements
    try:
        thesclass2_id = sense.runners_up[0]
    except (AttributeError, IndexError):
        thesclass2_id = None
    try:
        thesclass3_id = sense.runners_up[1]
    except (AttributeError, IndexError):
        thesclass3_id = None
    if thesclass1_id is not None:
        thesclass = tdb.get_thesclass(thesclass1_id)
        level2branch = thesclass.ancestor(level=2)
        # 'u' presumably means unset/unchecked, matching the checkbox
        # codes used below - confirm against the consumer
        checkstatus = 'u'
    else:
        level2branch = None
        checkstatus = 'n'
    if level2branch is not None:
        level2branch_id = level2branch.id
    else:
        level2branch_id = None
    try:
        bayes = sense.bayes_classification
        bayes_confidence = sense.bayes_confidence
    except AttributeError:
        bayes = None
        bayes_confidence = 0
    row = [
        status,
        sense.lemma[:100],
        lexical_sort(sense.lemma)[:100],
        sense.wordclass or 'NN',
        definition,
        definition_supplement,
        sense.entry_id,
        sense.node_id,
        sense.entry_lemma[:50],
        lexical_sort(sense.entry_lemma)[:50],
        sense.subentry_type or 'main sense',
        undefined,
        random.randint(0, 10000),  # sample order
        bayes,
        bayes_confidence,
        _bayes_mismatch(sense),
        thesclass1_id,
        thesclass2_id,
        thesclass3_id,
        'u',  # checkbox for thesclass1 (unset)
        'i',  # checkbox for thesclass2 (incorrect)
        'i',  # checkbox for thesclass3 (incorrect)
        checkstatus,
        level2branch_id,
        reasontext,
        reasoncode,
        sense.clone_num,  # Gets changed to True/False before committing to DB
    ]
    return row
def winnow(class_ids, wordclass):
    """
    Winnow a word's compound-derived thesaurus classes down to a small
    set of significant branches.

    Returns (total, groups) where total is the pre-winnowing number of
    class instances and groups is a list of
    (level-3 parent, count, [(wordclass parent, count, best child), ...])
    tuples, most frequent first.
    """
    # Convert thesaurus IDs stored in the raw files to actual thesaurus classes
    thesclasses = [tdb.get_thesclass(id) for id in class_ids]
    if wordclass == 'NN':
        thesclasses = [t for t in thesclasses if t.wordclass == 'noun']
    elif wordclass == 'JJ':
        thesclasses = [t for t in thesclasses if t.wordclass == 'adjective']
    elif wordclass == 'RB':
        thesclasses = [t for t in thesclasses if t.wordclass == 'adverb']

    # Keep a note of the total number of instances of this word in compounds
    # (before we start winnowing out stuff)
    total = len(thesclasses)

    # Group into wordclass-level parent classes
    wordclass_groups = {}
    for t in thesclasses:
        p = t.wordclass_parent() or t
        if not p.id in wordclass_groups:
            wordclass_groups[p.id] = (p, [])
        wordclass_groups[p.id][1].append(t)
    # Reduce to a list of (parent_node, child_nodes) tuples
    wordclass_groups = list(wordclass_groups.values())
    # Sort so that the most common is first (level sort is a secondary
    # key, applied first so the count sort is the dominant ordering)
    wordclass_groups.sort(key=lambda row: row[0].level)
    wordclass_groups.sort(key=lambda row: len(row[1]), reverse=True)

    # For each wordclass group, find the best child node to use
    # (which may often be the wordclass node itself)
    wordclass_groups2 = []
    for parent_node, child_nodes in wordclass_groups:
        # If there's only one child node, or if any of the child nodes
        # are at wordclass level, then we'll just use the wordclass level
        if (len(child_nodes) == 1 or
                any([t.id == parent_node.id for t in child_nodes])):
            best_child = parent_node
        # If all the children are on the same node, then we'll use that node
        elif len(set([t.id for t in child_nodes])) == 1:
            best_child = child_nodes[0]
        # ... Otherwise, poll to find the leading one out of the classes
        # below wordclass level
        else:
            best_child = None
            for depth in (2, 1):
                # Find the level immediately below the parent wordclass
                # level, and count how many children are on each branch
                # at this level
                sub_parent_level = parent_node.level + depth
                counts = Counter([t.ancestor(level=sub_parent_level)
                                  for t in child_nodes]).most_common()
                max_count = counts[0][1]
                # Accept the leading branch if it covers at least 80%
                # of the children
                if max_count >= len(child_nodes) * 0.8:
                    best_child = counts[0][0]
                elif depth == 1:
                    best = [c[0] for c in counts if c[1] == max_count]
                    # If there's a clear winner, we use that; otherwise, we
                    # revert to using the parent node as a fallback
                    if len(best) == 1:
                        best_child = best[0]
                    else:
                        best_child = parent_node
                if best_child is not None:
                    break
        wordclass_groups2.append((parent_node, len(child_nodes), best_child))

    # Group into level-3 classes
    level3_groups = {}
    for g in wordclass_groups2:
        wordclass_parent = g[0]
        p = wordclass_parent.ancestor(level=3) or wordclass_parent
        if not p.id in level3_groups:
            level3_groups[p.id] = (p, [])
        level3_groups[p.id][1].append(g)
    # Reduce to a list of (parent_node, count, child_groups) tuples
    level3_groups = level3_groups.values()
    level3_groups = [(row[0], sum([g[1] for g in row[1]]), row[1],)
                     for row in level3_groups]
    # Sort so that the most common is first
    level3_groups.sort(key=lambda row: row[1], reverse=True)

    # Drop the long tail of comparatively low-frequency branches
    # (anything under 10% of the leader's count, at both levels)
    level3_groups2 = []
    if level3_groups:
        max_count = level3_groups[0][1]
        level3_groups = [row for row in level3_groups
                         if row[1] > max_count * 0.1]
        for parent, count, child_nodes in level3_groups:
            max_count = child_nodes[0][1]
            child_nodes = [g for g in child_nodes if g[1] > max_count * 0.1]
            level3_groups2.append((parent, count, child_nodes,))
    return total, level3_groups2
""" Binomials - Manages indexing and lookup of binomial terms (animals and plants) """ import os import csv from collections import defaultdict, Counter import lex.oed.thesaurus.thesaurusdb as tdb from pickler.sensemanager import PickleLoader #from utils.tracer import trace_class living_world_id = 8835 living_world_node = tdb.get_thesclass(living_world_id) life_branches = (22501, 29205, 17709) # plant, animal, microorganism class Binomials(object): index = {'binomials': {}, 'genera': {}, } def __init__(self, **kwargs): for k, v in kwargs.items(): self.__dict__[k] = v try: self.resources_dir except AttributeError: pass else: dir = os.path.join(self.resources_dir, 'taxonomy') self.raw_files = { 'binomials': os.path.join(dir, 'binomials_raw.csv'),