Ejemplo n.º 1
0
def infer_derivative(sense, main_sense_of_entry):
    """
    Infer a thesaurus class for a derivative sense by analogy with
    related senses.

    Two strategies are tried in order:
     1. if the parent entry has a classified, non-affix main sense, ask
        tdb for the class equivalent to it in sense.wordclass;
     2. otherwise, collect the single-word derivative senses of the same
        entry that have a superclass, and use the superclass shared by
        most of them (ties go to the one with the largest branch_size),
        narrowed to sense.wordclass where tdb can do so.

    Arguments:
     * sense: the sense being classified (its wordclass and entry_id
            are read);
     * main_sense_of_entry: main sense of the parent entry, or None.

    Returns the chosen class object, or None if neither strategy
    produced one. The returned object has its reason_text and
    reason_code ('driv') attributes set by this function.
    """
    equiv = None
    if (main_sense_of_entry is not None and
            not main_sense_of_entry.is_affix() and
            main_sense_of_entry.thesclass is not None):
        # Take precedent from the main sense of the parent entry
        equiv = tdb.equivalent_class(main_sense_of_entry.thesclass,
                                     sense.wordclass)
        if equiv is not None:
            equiv.reason_text = 'Parallel to "%s"' % main_sense_of_entry.lemma
            equiv.reason_code = 'driv'
    if equiv is not None:
        return equiv

    # Take precedent from other sibling derivatives. Each sibling is kept
    #  alongside its superclass so that superclass() is only looked up once.
    siblings = []
    for c in tdb.ranked_search(refentry=sense.entry_id,
                               thes_linked=True,
                               current_only=True):
        if not c.is_derivative() or ' ' in c.lemma:
            continue
        parent = c.superclass()
        if parent is not None:
            siblings.append((c, parent))
    if not siblings:
        return None

    # Count how many siblings sit under each superclass
    tally = defaultdict(int)
    for _, parent in siblings:
        tally[parent] += 1
    top_count = max(tally.values())
    parents = [p for p, num in tally.items() if num == top_count]

    # If there's more than one possible parent, pick the one with
    #  the largest branch (the first such parent wins any further tie)
    best_parent = max(parents, key=lambda p: p.branch_size)

    equiv = tdb.child_wordclass_branch(best_parent, sense.wordclass)
    if equiv is None:
        # No wordclass-specific child branch: fall back to the parent itself
        equiv = best_parent
    equiv.reason_text = 'Parallel to %s' % ', '.join(
        '"%s"' % (c.lemma,) for c, _ in siblings if c.lemma)
    equiv.reason_code = 'driv'
    return equiv
Ejemplo n.º 2
0
 def branches_from_xrefs(self, sense):
     """
     Return the ids of the branches reached through the sense's
     cross-references.

     Each cross-reference is looked up with tdb.ranked_search() (by
     refentry and refid); the results are converted to branches by
     _parse_instances() and pooled into one set, so each branch
     appears once. The order of the returned list is not defined.
     """
     found = set()
     for xref in sense.cross_references:
         hits = tdb.ranked_search(refentry=xref.refentry, refid=xref.refid)
         found |= _parse_instances(hits)
     return [branch.id for branch in found]
Ejemplo n.º 3
0
def _read_class_ids(path):
    """
    Read a CSV file whose first two columns are (lemma, class id) and
    return it as a {lemma: int(class id)} dict. Blank rows are skipped.
    """
    mapping = {}
    # newline='' is what the csv module expects of files handed to it
    with open(path, 'r', newline='') as filehandle:
        for row in csv.reader(filehandle):
            if row:
                mapping[row[0]] = int(row[1])
    return mapping


def finalize():
    """
    Use the 'manual' file to override where necessary

    For each wordclass in WORDCLASSES:
     * read '<wordclass>_compounds.csv', then '<wordclass>_manual.csv'
       from DIRECTORY, each mapping a lemma to a thesaurus class id
       (manual entries replace automatically-generated ones);
     * for each lemma, pick one sense from tdb.ranked_search(),
       preferring senses on the same branch as the mapped class;
     * write the chosen senses, sorted by lemma, to '<wordclass>.csv'
       as rows of (lemma, refentry, refid, entry_size, breadcrumb).
    """
    for wordclass in WORDCLASSES:
        infile1 = os.path.join(DIRECTORY, '%s_compounds.csv' % wordclass)
        infile2 = os.path.join(DIRECTORY, '%s_manual.csv' % wordclass)
        lemmas = _read_class_ids(infile1)
        # Do the manual file second, so that it overrides the
        #  automatically-generated file
        lemmas.update(_read_class_ids(infile2))

        output = []
        for lemma, class_id in lemmas.items():
            # Retrieve the branch that the majority of compounds are on
            compound_branch = tdb.get_thesclass(class_id)

            # Get the highest-rated senses for the lemma: those rated
            #  above 30% of the top rating (none if the top rating is
            #  not positive)
            ranked_senses = tdb.ranked_search(lemma=lemma, wordclass=wordclass)
            if ranked_senses:
                max_rating = ranked_senses[0].rating()
                if max_rating > 0:
                    threshold = max_rating * 0.3
                    ranked_senses = [s for s in ranked_senses
                                     if s.rating() > threshold]
                else:
                    ranked_senses = []

            # Try filtering to just those senses that are on
            #   the same branch as the compounds (the top-ranked sense
            #   is also kept if it is unclassified)
            ranked_filtered = [s for i, s in enumerate(ranked_senses) if
                               (i == 0 and s.thesclass is None) or
                               s.is_descendant_of(compound_branch)]
            # ... or else stick with original ranking
            if not ranked_filtered:
                ranked_filtered = ranked_senses

            if ranked_filtered:
                output.append(ranked_filtered[0])

        outfile = os.path.join(DIRECTORY, '%s.csv' % wordclass)
        output.sort(key=lambda s: s.lemma)
        with open(outfile, 'w', newline='') as filehandle:
            csvwriter = csv.writer(filehandle)
            for s in output:
                row = (s.lemma, s.refentry, s.refid,
                       s.entry_size, s.breadcrumb())
                csvwriter.writerow(row)
def compound_derivative(sense):
    """
    Try to classify a compound by treating it as a derivative of
    another lemma.

    Each (ending, wordclasses, replacement) entry in derivation_forms is
    tried in turn. An entry applies when the sense's wordclass is listed
    and its last element ends with the ending and is more than two
    characters longer than it. The hypothetical base form is the lemma
    with the ending swapped for the replacement; the first base form
    that has exactly one distinct sense in tdb, with a classification,
    supplies the result.

    Returns a (thesclass, position) tuple, where position is 'first' or
    'last', or (None, None) if no base form was found.
    """
    tail = sense.last_element()
    if tail is None:
        return None, None

    for ending, wordclasses, replacement in derivation_forms:
        if sense.wordclass not in wordclasses:
            continue
        if not tail.endswith(ending) or len(tail) <= len(ending) + 2:
            continue

        # Strip off the ending, then add the replacement ending
        #  (which is usually a null string) to get the base form the
        #  lemma would have if it *is* a derivative
        stem = sense.lemma[0:len(sense.lemma) - len(ending)]
        hypothetical_base = stem + replacement

        # Only accept a base form that exists, is unambiguous, and
        #  is classified
        base_senses = tdb.ranked_search(lemma=hypothetical_base,
                                        current_only=True)
        if tdb.distinct_senses(base_senses) != 1:
            continue
        thesclass = base_senses[0].thesclass
        if thesclass is None:
            continue

        # Don't risk things like 'yellow-bellied' - these are
        #  likely to be transparent, not a derivative of e.g.
        #  'yellow-belly', so should go after existing guesses
        if ending in ('ed', 'ied') and sense.first_element().lower() in colours:
            return thesclass, 'last'
        return thesclass, 'first'

    return None, None
def ranked_sense_summary(**kwargs):
    """
    Return a summary of the ranked_search() results, reduced to a
    given thesaurus level (defaults to level=3)

    Returns a ranked list of result objects (ResultRow), one per
    distinct ancestor class at the requested level. Senses with no
    classification, or with no ancestor at that level, are grouped
    together in a single row whose parent is None. Rows are sorted by
    their probability attribute, highest first.

    Keyword arguments:
     * level (int): the parent thesaurus level which will be returned
     * omit_null (True/False): if True, any senses which don't have
            a classification will be ignored
     ... plus all the usual optional keyword arguments passed on to
     tdb.ranked_search() (lemma, wordclass, refentry, refid, etc.).

    Note that the full set of keyword arguments, including level and
    omit_null, is forwarded to tdb.ranked_search(). If a lemma is given
    and a main sense is found for it, that sense's refid is also
    forwarded as 'promote'.
    """
    level = kwargs.get('level', 3)
    lemma = kwargs.get('lemma')
    wordclass = kwargs.get('wordclass')
    omit_null = kwargs.get('omit_null', False)

    if lemma is not None:
        main_sense = main_sense_finder.main_sense(lemma=lemma,
                                                  wordclass=wordclass,
                                                  listed_only=True)
        if main_sense is not None:
            kwargs['promote'] = main_sense.refid

    candidates = tdb.ranked_search(**kwargs)
    if omit_null:
        candidates = [c for c in candidates if c.thesclass is not None]

    # Each row's probability is set from the sum of all senses' ratings;
    #  a non-positive sum is replaced by 1
    total = sum(c.rating() for c in candidates)
    if total <= 0:
        total = 1

    # Group candidates by their ancestor at the requested level;
    #  identifier 0 collects those with no such ancestor
    summary = {}
    for c in candidates:
        ancestor = None
        if c.thesclass is not None:
            ancestor = c.thesclass.ancestor(level=level)
        identifier = 0 if ancestor is None else ancestor.id
        if identifier not in summary:
            summary[identifier] = ResultRow(ancestor)
        summary[identifier].append(c)

    # Convert to a list
    rows = list(summary.values())
    # Add a probability score to each row
    for row in rows:
        row.set_probability(total)
    # Sort by probability
    rows.sort(key=lambda r: r.probability, reverse=True)

    return rows
Ejemplo n.º 6
0
def match_single_synonym(sense):
    """
    Try to classify a sense from the classification of one of its
    synonyms.

    Synonyms with 20 or more distinct senses (in the sense's own
    wordclass) are discarded first. The remaining synonyms are then
    tried with four strategies in turn, stopping at the first that
    yields a classification:
     1. search restricted by the sense's subjects (if it has any);
     2. for interjections (wordclass 'UH'), an unrestricted search;
     3. synonyms that are single-sense, or that have at most three
        distinct senses when there is only one synonym;
     4. search restricted by the sense's Bayes branches (if usable).

    Returns a (thesclass, synonym lemma) tuple, or (None, None) if
    nothing matched.
    """
    # Drop out any highly polysemous synonyms
    synonyms = [
        syn for syn in sense.synonyms
        if tdb.distinct_senses(tdb.search(lemma=syn,
                                          wordclass=sense.wordclass,
                                          current_only=True)) < 20
    ]
    if not synonyms:
        return None, None

    match = None
    matching_synonym = None

    # If the sense can be restricted by subject area, try to find a match for
    #  *any* synonym (even if there's only one)
    if sense.subjects:
        hits = [hit for syn in synonyms
                for hit in tdb.ranked_search(lemma=syn,
                                             wordclass=sense.wordclass,
                                             subjects=sense.subjects,
                                             current_only=True)]
        if hits and hits[0].thesclass is not None:
            match = hits[0].thesclass
            matching_synonym = hits[0].lemma

    # If the sense is an interjection, try to find a match for
    #  *any* synonym (even if there's only one) - since interjection
    #  synonyms are more reliable and less ambiguous
    if not match and sense.wordclass == 'UH':
        hits = [hit for syn in synonyms
                for hit in tdb.ranked_search(lemma=syn,
                                             wordclass='UH',
                                             current_only=True)]
        if hits and hits[0].thesclass is not None:
            match = hits[0].thesclass
            matching_synonym = hits[0].lemma

    # If any of the synonyms are single-sense (or nearly single-sense),
    #  then we assume that that is the correct sense
    if not match:
        only_one_synonym = len(synonyms) == 1
        top_senses = []
        for syn in synonyms:
            syn_senses = tdb.ranked_search(lemma=syn,
                                           wordclass=sense.wordclass,
                                           current_only=True)
            if not syn_senses:
                continue
            num_senses = tdb.distinct_senses(syn_senses)
            if num_senses == 1 or (num_senses <= 3 and only_one_synonym):
                top_senses.append(syn_senses[0])
        # Use the first of these that actually has a classification
        classified = next(
            (s for s in top_senses if s.thesclass is not None), None)
        if classified is not None:
            match = classified.thesclass
            matching_synonym = classified.lemma

    # If the sense can be restricted by Bayes classification(s), try to
    #   find a match for *any* synonym (even if there's only one)
    if not match and sense.bayes.is_usable():
        hits = [hit for syn in synonyms
                for hit in tdb.ranked_search(lemma=syn,
                                             wordclass=sense.wordclass,
                                             branches=sense.bayes.ids(),
                                             current_only=True)]
        if hits and hits[0].thesclass is not None:
            match = hits[0].thesclass
            matching_synonym = hits[0].lemma

    return match, matching_synonym
Ejemplo n.º 7
0
    def find_branch_from_superordinate(self, sense):
        """Classify by finding the main or only sense of the superordinate

        Only applies to 'NN' and 'JJ' senses whose superordinate is at
        least three characters long and is not listed in GENERICS.
        The superordinate is always looked up as a noun ('NN'); for a
        'JJ' sense the class found is converted with
        tdb.equivalent_class().

        Returns the matching thesaurus class, or None if no classified
        sense of the superordinate could be settled on.
        """
        if sense.wordclass not in ('NN', 'JJ'):
            return None
        head = sense.superordinate
        if not head or len(head) < 3 or head in GENERICS:
            return None

        def narrowed_to(branches):
            # Highest-ranked current noun sense of the superordinate
            #  within the given branches
            return tdb.highest_ranked(
                lemma=head,
                wordclass='NN',
                branches=branches,
                current_only=True)

        target = None

        # If the superordinate is (more or less) single-sense, we assume that
        #  sense to be the correct one
        noun_senses = tdb.ranked_search(
            lemma=head,
            wordclass='NN',
            current_only=True)
        if noun_senses and tdb.distinct_senses(noun_senses) <= 2:
            target = noun_senses[0]

        # Otherwise, narrow by Bayes classification (confident cases only)
        if target is None and sense.bayes.confidence() >= 8:
            target = narrowed_to(sense.bayes_based_classifications)

        # Otherwise, narrow by branches based on subject labels
        if target is None and sense.label_based_classifications:
            target = narrowed_to(sense.label_based_classifications)

        # Otherwise, narrow by branches based on cross-references
        if target is None and sense.xref_branches:
            target = narrowed_to(sense.xref_branches)

        # Last gasp: If the gloss consists more or less *only* of the
        #   superordinate (e.g. 'an abbey'), then it should be adequate to
        #   just use the main sense of the superordinate, even if it's
        #   multi-sense.
        # But don't risk this if there are cross-references or subject
        #   labels which might suggest a more specific use
        bare_gloss_allowed = (target is None and
                              not sense.subjects and
                              not sense.xref_branches and
                              sense.gloss is not None)
        if bare_gloss_allowed:
            stripped = re.sub(r'^(a|an|the) ', '', sense.gloss.lower())
            if stripped == head:
                target = MAIN_SENSE_FINDER.main_sense(
                    lemma=head, wordclass='NN')

        # Otherwise, narrow by Bayes classification (any usable result)
        if target is None and sense.bayes.is_usable():
            target = narrowed_to(sense.bayes_based_classifications)

        if target is None or target.thesclass is None:
            return None

        match = target.thesclass
        if sense.wordclass == 'JJ':
            match = tdb.equivalent_class(match, 'JJ')
        return match