Example #1
def fill_gaps(**kwargs):
    """
    Try to find ngrams for missing lemmas by scraping data from
    the Google Ngram Viewer site.
    """
    letters = kwargs.get('letters', string.ascii_lowercase)

    # Load list of gaps from file
    infile = os.path.join(GBN_DIR, '4', 'tmp', filename(GBN_DIR))
    with open(infile, 'r') as filehandle:
        gaps = [l.strip() for l in filehandle.readlines()]
    gaps = [g for g in gaps if lexical_sort(g)
            and lexical_sort(g)[0] in letters]

    results = {letter: [] for letter in letters}
    gba = GoogleBooksApi(start=1750, end=2008)

    # We cluster ngrams into sets of five, which will be dealt with in
    #  a single request - cutting down the number of requests
    clusters = _cluster(gaps, 5)

    for ngram_set in clusters:
        print(ngram_set[0])
        for result in gba.get_ngram_data(queries=ngram_set):
            results[result.initial()].append(result)
        sleep(SLEEPTIME)

    for letter in results:
        subdir = os.path.join(GBN_DIR, '4', letter)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        with open(os.path.join(subdir, filename(GBN_DIR)), 'w') as filehandle:
            for r in results[letter]:
                filehandle.write(r.tostring() + '\n')
Example #2
 def matches_headword(self, lemma, exact=False):
     """
     Return True if the lemma matches the entry headword (which would
     indicate that this is a regular sense and not a subentry)
     """
     if exact:
         return lemma == self.headword
     return lexical_sort(lemma) == lexical_sort(self.headword)
Example #3
def _compound_reverse_match(lemma, tokens, ngrams):
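    # Look for the two words of a two-word lemma in reverse order
    #  ('house boat' -> 'boat house'), optionally with a plural
    #  ending on the first element of the reversed form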
    match = None
    words = lemma.split()
    if len(words) == 2:
        reverse1 = words[1] + " " + words[0]
        reverse2 = words[1] + "s " + words[0]
        reverse1 = stringtools.lexical_sort(reverse1)
        reverse2 = stringtools.lexical_sort(reverse2)
        for token_full, token_flat in tokens + ngrams:
            if token_flat == reverse1 or token_flat == reverse2:
                match = token_full
                break
    return match
Example #4
    def parse_link_file(self):
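        # Build mappings from OED IDs to ODE lexids, resolving
        #  competing links by exact, then fuzzy, headword match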
        def parse_hw(node):
            text = etree.tostring(node, method='text', encoding='unicode')
            text = text.split('|')[0]
            return text.split(',')[0].strip()

        # Create mappings from OED to ODE
        multilinks = defaultdict(list)
        for filepath in self.link_files:
            tree = etree.parse(filepath)
            for entry in tree.findall('./e'):
                lexid = entry.get('lexid')
                linknode = entry.find('./linkSet/link')
                if linknode is not None:
                    oed_id = linknode.get('refentry')
                    sub_id = linknode.get('refid')
                    if sub_id is not None and sub_id != '0':
                        oed_id = oed_id + '#' + sub_id
                    oed_hw = parse_hw(linknode)
                    ode_hw = parse_hw(entry.find('label'))
                    multilinks[oed_id].append(CandidateLink(lexid, oed_hw, ode_hw))

        for oed_id, linklist in multilinks.items():
            # If there's only one possible ODO link for this OED ID, we accept that.
            #  But if there's more than one competing link, we look for the one where
            #  the headwords match; or failing that, the one where the headwords
            #  fuzzily match.
            if len(linklist) == 1:
                winner = linklist[0]
            else:
                # Exact match
                z = [l for l in linklist if l.oed_headword == l.odo_headword]
                try:
                    winner = z[0]
                except IndexError:
                    # Fuzzy match
                    z = [l for l in linklist if
                         lexical_sort(l.oed_headword) == lexical_sort(l.odo_headword)]
                    try:
                        winner = z[0]
                    except IndexError:
                        # Give up
                        winner = linklist[0]
            self.links[oed_id] = winner.lexid

        # Create the inverse mapping (from ODE to OED)
        for oed_id, lexid in self.links.items():
            self.links_reversed[lexid] = oed_id

        self.parse_oed_file()
Example #5
def _tokenize_text(text):
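    # Produce (token, flattened-form) pairs, plus 2-, 3-, and 4-grams
    #  compiled from the raw tokens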
    naive_tokens = text.split()
    tokens = [t.strip(',:;()[]."?! ') for t in naive_tokens]
    tokens = [re.sub(r"'s$", "", t) for t in tokens]
    tokens = [(t, stringtools.lexical_sort(t)) for t in tokens]
    ngrams = (_compile_ngrams(naive_tokens, 2) +
              _compile_ngrams(naive_tokens, 3) +
              _compile_ngrams(naive_tokens, 4))
    return tokens, ngrams
Example #6
def _parse_line(line, gram_count):
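    # A line consists of tab-separated fields (optional sortcode,
    #  lemma, optional wordclass) followed by decade:score pairs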
    line = line.strip()
    parts = line.split("\t")

    decades = {}
    while parts and DECADE_PATTERN.search(parts[-1]):
        p = parts.pop()
        decade, score = p.split(":")
        decades[int(decade)] = int(score)

    if len(parts) == 3:
        sortcode = parts[0]
        source_lemma = parts[1]
        wordclass = parts[2]
    elif len(parts) == 1:
        sortcode = None
        source_lemma = parts[0]
        wordclass = "ALL"
    elif len(parts) == 2 and gram_count != 3:
        sortcode = None
        source_lemma = parts[0]
        wordclass = parts[1]
    elif len(parts) == 2:
        sortcode = parts[0]
        source_lemma = parts[1]
        wordclass = "ALL"
    else:
        # Guard against malformed lines; without this, the names
        #  below would be unbound and raise NameError instead
        raise ValueError("Unparseable line: %r" % line)
    if gram_count >= 3:
        source_lemma = source_lemma.replace(" - ", "-")

    if not sortcode:
        sortcode = lexical_sort(source_lemma)

    return [line, source_lemma, source_lemma, sortcode, decades, gram_count, wordclass, None]
Example #7
    def collect_sample(self, name, size, function):
        total = 0
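        # First pass: count how many senses pass the filter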
        for parent_dir in self.directories:
            subdir = os.path.join(parent_dir, 'classified')
            for letter in letters:
                pl = PickleLoader(subdir, letters=letter)
                for sense in pl.iterate():
                    if is_valid(sense, name, function):
                        total += 1

        # Pick a random set of sense indices. Note that randint's upper
        #  bound is inclusive, so we use total - 1; min() guards against
        #  an infinite loop when size exceeds the population.
        sense_index = set()
        while len(sense_index) < min(size, total):
            sense_index.add(random.randint(0, total - 1))

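        # Second pass: collect the senses at the sampled indices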
        self.sample = []
        count = 0
        for parent_dir in self.directories:
            subdir = os.path.join(parent_dir, 'classified')
            for letter in letters:
                pl = PickleLoader(subdir, letters=letter)
                for sense in pl.iterate():
                    if is_valid(sense, name, function):
                        if count in sense_index:
                            self.sample.append(sense)
                        count += 1

        self.sample.sort(key=lambda s: lexical_sort(s.lemma))
Example #8
    def index_proper_names(self):
        allnames = set()
        for name_type in ('firstname', 'surname', 'placename'):
            for name in propernames.names_list(name_type):
                if ' ' in name:
                    continue
                allnames.add(name)

        for letter in string.ascii_lowercase:
            print('Indexing proper names in %s...' % letter)
            for entry in entry_iterator(letters=letter):
                if entry.primary_wordclass() not in ('NP', 'NPS'):
                    continue
                for typeunit in entry.types():
                    if (' ' in typeunit.form or
                            typeunit.lemma_manager().capitalization_type() != 'capitalized'):
                        continue
                    allnames.add(typeunit.form)

        out_file = os.path.join(FORM_INDEX_DIR, 'proper_names', 'all.txt')
        with open(out_file, 'w') as filehandle:
            for name in allnames:
                sortable = stringtools.lexical_sort(name)
                if (not sortable or
                        len(sortable) > MAX_WORDLENGTH or
                        len(name) > MAX_WORDLENGTH):
                    continue
                filehandle.write('%s\t%s\t%s\n' % (sortable,
                                                   name,
                                                   str(propernames.is_common(name))))
Example #9
    def _load_cache(self):
        for letter in LETTERS:
            fname = os.path.join(self.dir, letter + ".xml")
            doc = etree.parse(fname, PARSER)
            for entry in doc.findall("e"):
                blocks = _parse_entry(entry, self.with_definitions, self.max_senses)
                for block in blocks:
                    address = (block.entry_id, block.block_id)
                    MainSensesCache.blocks[address] = block

        # Index all the blocks by entry ID
        for block in MainSensesCache.blocks.values():
            MainSensesCache.entries.setdefault(block.entry_id, []).append(block)

        # Identify minor homographs
        homographs = defaultdict(list)
        for block in MainSensesCache.blocks.values():
            address = (lexical_sort(block.headword), block.wordclass)
            homographs[address].append(block)
        for homograph_set in homographs.values():
            if len(homograph_set) > 1:
                homograph_set.sort(key=lambda b: b.quotations, reverse=True)
                for h in homograph_set[1:]:
                    MainSensesCache.minor_homographs[h.entry_id].add(h.block_id)
                    MainSensesCache.minor_homographs[h.entry_id].add(h.wordclass)
Example #10
def _store_forms(block, entry, block_type, letter):
    us_variant = entry.us_variant()
    standardtypes = set()
    varianttypes = set()
    alientypes = set()
    for morphset in block.morphsets():
        if (morphset.form in (entry.lemma, us_variant, block.lemma) or
                morphset.is_oed_headword()):
            _add_types(morphset, standardtypes, letter)
        elif (block_type == 'entry' and
                morphset.date().end > VARIANT_MINIMUM_END_DATE and
                not morphset.is_nonstandard()):
            # Don't store variants for subentries; don't store
            #  very old or non-standard variants
            _add_types(morphset, varianttypes, letter)
            _add_alien_variants(morphset, alientypes, letter)
    varianttypes = varianttypes - standardtypes
    alientypes = alientypes - standardtypes

    refentry, refid = block.link(target='oed', asTuple=True)

    if block.has_frequency_table():
        ftable = block.frequency_table()
        f2000 = ftable.frequency(period='1990-2007')
        f1950 = ftable.frequency(period='1940-1959')
        f1900 = ftable.frequency(period='1890-1909')
        f1850 = ftable.frequency(period='1840-1859')
        f1800 = ftable.frequency(period='1790-1809')
        f1750 = ftable.frequency(period='1750-1769')
    else:
        f2000 = f1950 = f1900 = f1850 = f1800 = f1750 = 0

    definition = block.definition(src='oed') or None

    return BlockData(refentry,
                     refid,
                     block_type,
                     stringtools.lexical_sort(block.lemma),
                     block.lemma,
                     block.wordclass(),
                     definition,
                     block.date().exact('start'),
                     block.date().exact('end'),
                     None,
                     None,
                     standardtypes,
                     varianttypes,
                     alientypes,
                     _round_number(f2000),
                     _round_number(f1950),
                     _round_number(f1900),
                     _round_number(f1850),
                     _round_number(f1800),
                     _round_number(f1750),)
Example #11
def _find_secondary_lemmas(sense):
    # Any other lemmas (<lm> or <vl> elements) in the sense
    secondary_lemmas = set()
    for tag in ("lm", "vl", "vf"):
        for node in sense.node.findall(".//" + tag):
            text = etree.tounicode(node, method="text", with_tail=False)
            # Skip truncated forms ('-ness', 'un-', etc.)
            if not text.startswith("-") and not text.endswith("-"):
                secondary_lemmas.add(text)
    secondary_lemmas.discard(sense.lemma)
    return [(l, stringtools.lexical_sort(l)) for l in secondary_lemmas]
Example #12
def _hyphen_match(inflections, tokens):
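    # Check the parts of hyphenated tokens individually against
    #  the set of candidate inflections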
    match = None
    for token_full, token_flat in tokens:
        parts = token_full.split("-")
        if len(parts) == 1:
            continue
        parts = [(p, stringtools.lexical_sort(p)) for p in parts]
        for p_full, p_flat in parts:
            if p_flat in inflections:
                match = p_full
        if match:
            break
    return match
Example #13
def populate_entries():
    vsc = VitalStatisticsCache()
    entries = []
    for entry in vsc.entries:
        row = Entry(id=entry.id,
                    label=entry.label[:LABEL_LENGTH],
                    alphasort=lexical_sort(entry.headword)[:ALPHASORT_LENGTH])
        entries.append(row)

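        # Write accumulated rows to the database in batches of ~1000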
        if len(entries) > 1000:
            Entry.objects.bulk_create(entries)
            entries = []

    Entry.objects.bulk_create(entries)
Example #14
    def ingest_sense(self, sense):
        # Clear the decks
        self.null_sense_level_attributes()

        # If the sense is a main sense defining a compound or similar,
        #   we adopt the entry headword as a fallback in case the
        #   full compound can't be found (useful for absol. uses, etc.).
        if sense.is_subentry() or sense.is_subentry_like():
            fallback = None
        elif sense.lemma.lower() == self.entry_lemma.lower():
            fallback = None
        elif len(sense.lemma.split()) <= 2:
            fallback = stringtools.lexical_sort(self.entry_lemma)
        else:
            fallback = None

        self.lemma = sense.lemma
        self.lemma_flat = stringtools.lexical_sort(sense.lemma)
        self.secondary_lemmas = _find_secondary_lemmas(sense)
        self.fallback_lemma = fallback
        self.wordclass = sense.primary_wordclass().penn
        self.inflections = _inflection_set(sense.lemma, self.wordclass)

        if sense.is_subentry() or sense.is_subentry_like():
            variants = {}
        elif sense.lemma.lower() == self.entry_lemma.lower():
            variants = dict(self.formslist)
        else:
            variants = {}
        lemma_flat = stringtools.lexical_sort(sense.lemma)
        variants[lemma_flat] = 2050
        # Allow "whippin'" for "whipping"
        if lemma_flat.endswith("ing"):
            variants[lemma_flat.rstrip("g")] = 2050

        self.local_variants = variants
        self.local_variants_inflected = _inflect_variants(variants, self.wordclass)
Example #15
    def find_lemma(self, lemma, **kwargs):
        wordclass = kwargs.get('wordclass')
        locale = kwargs.get('locale')
        candidates = self.find_sortcode(lexical_sort(lemma))
        candidates = [c for c in candidates if (c.lemma == lemma and
                      (wordclass is None or c.wordclass == wordclass))]
        if locale == 'uk':
            candidates = [c for c in candidates if c.variant_type != 'us']
        if locale == 'us':
            candidates = [c for c in candidates if c.variant_type != 'uk']

        # Sort so that the longest and highest-scoring morphsets are at the top
        candidates.sort(key=lambda c: c.score, reverse=True)
        candidates.sort(key=len, reverse=True)
        return candidates
Example #16
def _compile_ngrams(tokens, length):
    ngrams = []
    # List slicing never raises IndexError, so bound the range
    #  explicitly to skip truncated windows at the end of the list
    for i in range(0, len(tokens) - length + 1):
        window = tokens[i:i+length]
        ngram = ' '.join(window)
        ngram = ngram.strip(',:;().!?- ')
        ngram_flat = re.sub(r'<[^<>]+>', '', ngram)
        ngram_flat = stringtools.lexical_sort(ngram_flat)
        ngrams.append((ngram, ngram_flat))
    return ngrams
Example #17
def _phrase_match(lemma, tokens, bigrams):
    words = lemma.split()
    if len(words) < 3:
        return None

    match = None
    if words[0] == "to":
        phrase_words = words[1:]
    else:
        phrase_words = words[:]

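    # Each entry becomes [full form, flat form, 3-letter stem]; the
    #  stem lets inflected variants match by prefix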
    phrase_words = [[w, stringtools.lexical_sort(w)] for w in phrase_words]
    for w in phrase_words:
        w_flat = w[1]
        if len(w_flat) > 3:
            w.append(w_flat[0:3])
        else:
            w.append(w_flat)

    phrase_flat = "".join([w[1] for w in phrase_words])
    for token_full, token_flat in bigrams:
        if token_flat == phrase_flat:
            match = token_full
            break

    if not match:
        phrase_length = len(phrase_words)
        # Slicing never raises IndexError; bound the range so that
        #  truncated ngrams at the end can't produce spurious matches
        for i in range(0, len(tokens) - phrase_length + 1):
            ngram = tokens[i : i + phrase_length]
            match_failed = False
            for p_token, q_token in zip(phrase_words, ngram):
                if q_token[1].startswith(p_token[2]):
                    pass
                elif p_token[0] in ("one", "one's") and q_token[0] in POSS_PRONOUNS:
                    pass
                elif p_token[0] == "oneself" and q_token[0] in REFL_PRONOUNS:
                    pass
                else:
                    match_failed = True
                    break
            if not match_failed:
                match = " ".join([t[0] for t in ngram])
                break
    return match
Example #18
def _sanitize_lemma(instance, wordclass):
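    # Strip parenthesized segments, excess whitespace, and leading
    #  'to'/articles from the lemma, then store the cleaned form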
    new_lemma = instance.lemma()
    new_lemma = re.sub(r'\([^()]+\)', '', new_lemma)
    new_lemma = re.sub(r'\([a-z]+$', '', new_lemma)
    new_lemma = re.sub(r'  +', ' ', new_lemma)
    new_lemma = new_lemma.strip()
    if wordclass == 'VB':
        new_lemma = re.sub(r'^to ', '', new_lemma)
    if wordclass == 'NN':
        new_lemma = re.sub(r'^(the|a|an) ', '', new_lemma)

    new_lemma = new_lemma[0:LEMMA_LENGTH]

    if new_lemma != instance.lemma():
        instance.node.find('./lemma').text = new_lemma
        instance.node.set('sortAlpha', lexical_sort(new_lemma))
Example #19
def _compile_ngrams(tokens, length):
    ngrams = []
    # List slicing never raises IndexError, so bound the range
    #  explicitly to skip truncated windows at the end of the list
    for i in range(0, len(tokens) - length + 1):
        window = tokens[i : i + length]
        ngram = " ".join(window)
        ngram = ngram.strip(",:;().!?- ")
        ngram = re.sub(r"'s$", "", ngram)
        # Skip ngrams containing internal punctuation
        if not any(p in ngram for p in PUNCTUATION):
            ngrams.append((ngram, stringtools.lexical_sort(ngram)))
    return ngrams
Example #20
def _inflection_set(lemma, wordclass):
    lemma_flat = stringtools.lexical_sort(lemma)
    if wordclass == "NN":
        infs = {
            INFLECTOR.compute_inflection(lemma_flat, "NNS"),
            lemma_flat + "s",
            re.sub(r"(...)um$", r"\1a", lemma_flat),
            re.sub(r"(...)us$", r"\1i", lemma_flat),
            re.sub(r"(...)sis$", r"\1ses", lemma_flat),
        }
    elif wordclass == "VB":
        infs = {
            INFLECTOR.compute_inflection(lemma_flat, "VBZ"),
            INFLECTOR.compute_inflection(lemma_flat, "VBD"),
            INFLECTOR.compute_inflection(lemma_flat, "VBG"),
            INFLECTOR.compute_inflection(lemma_flat, "VBD", region="us"),
            INFLECTOR.compute_inflection(lemma_flat, "VBG", region="us"),
            lemma_flat + "in",
            lemma_flat + "eth",
            lemma_flat + "ethe",
            lemma_flat + "est",
            lemma_flat + "d",
            lemma_flat + "id",
            lemma_flat + "it",
            lemma_flat + "de",
            lemma_flat + "yng",
            lemma_flat + "ynge",
        }
    elif wordclass in ("JJ", "RB"):
        infs = {
            INFLECTOR.compute_inflection(lemma_flat, "JJR"),
            INFLECTOR.compute_inflection(lemma_flat, "JJS"),
            INFLECTOR.compute_inflection(lemma_flat, "JJR", region="us"),
            INFLECTOR.compute_inflection(lemma_flat, "JJS", region="us"),
            # We may as well throw in plural, since adj. and n. quotes
            #  are often mixed together ('Zyrian', etc.)
            INFLECTOR.compute_inflection(lemma_flat, "NNS"),
        }
    else:
        infs = set()
    infs.add(lemma_flat)
    return infs
Example #21
def _store_forms(block, entry, block_type, letter):
    us_variant = entry.us_variant()
    standardtypes = set()
    varianttypes = set()
    alientypes = set()
    for morphset in block.morphsets():
        if morphset.form in (entry.lemma, us_variant, block.lemma):
            _add_types(morphset, standardtypes, letter)
        elif (block_type == 'entry' and
                morphset.date().end > VARIANT_MINIMUM_END_DATE and
                not morphset.is_nonstandard()):
            # Don't store variants for subentries; don't store
            #  very old or non-standard variants
            _add_types(morphset, varianttypes, letter)
            _add_alien_variants(morphset, alientypes, letter)
    varianttypes = varianttypes - standardtypes
    alientypes = alientypes - standardtypes

    refentry, refid = block.link(target='oed', asTuple=True)

    frequency = block.frequency()
    if frequency is not None:
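        # Round to two significant figures; integerize above 1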
        frequency = float('%.2g' % frequency)
        if frequency > 1:
            frequency = int(frequency)

    definition = block.definition(src='oed') or None

    return BlockData(refentry,
                     refid,
                     block_type,
                     stringtools.lexical_sort(block.lemma),
                     block.lemma,
                     block.wordclass(),
                     definition,
                     frequency,
                     block.date().exact('start'),
                     block.date().exact('end'),
                     None,
                     standardtypes,
                     varianttypes,
                     alientypes,)
Example #22
def _deduplicate_instances(thesclass):
    thesclass.reload_instances()
    groups = defaultdict(list)
    for instance in thesclass.instances():
        groups[lexical_sort(instance.lemma())].append(instance)

    deletions = []
    for group in groups.values():
        if len(group) > 1:
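            # Prefer non-obsolete instances; the sorts leave the
            #  earliest-dated, best-quoted instance at z[0]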
            z = [i for i in group if not i.is_obsolete()] or group[:]
            z.sort(key=lambda i: i.num_quotations(), reverse=True)
            z.sort(key=lambda i: i.start_date())
            for instance in z[1:]:
                deletions.append(instance)

    if deletions:
        for instance in deletions:
            instance.selfdestruct()
        thesclass.reload_instances()
        thesclass.reset_size(len(thesclass.instances()))
Example #23
    def __init__(self, **kwargs):
        node = kwargs.get('node', None)
        morphunits = kwargs.get('morphunits', None)
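        # Initialize either from an XML node or from a ready-made
        #  list of morphunits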
        if node is not None:
            self.sortcode = node.get('sort')
            self.variant_type = node.get('variantType')
            self.id = node.get('id')
            self.morphunits = [MorphUnit(n.findtext('./wordForm'), n.get('pos'))
                               for n in node.findall('./morphUnit')]
            self.score = int(node.get('score') or 0) * 2
        elif morphunits is not None:
            self.morphunits = morphunits
            self.sortcode = lexical_sort(self.lemma)
            self.variant_type = kwargs.get('variant_type', 'default')
            self.score = kwargs.get('score', 0)
            self.id = kwargs.get('id', 0)

        self.source = self.lemma  # this should remain unchanged
        if self.variant_type != 'us':
            self.score += 1
        self.computed = False
Example #24
def find_htlink(block, main_sense_data, confidence):
    if confidence is not None and confidence <= 2:
        return None
    if block.lemma in BANNED_LEMMAS:
        return None
    if block.wordclass not in ('NN', 'VB', 'JJ', 'RB', 'UH'):
        return None
    if (block.wordclass == 'RB' and
            not block.lemma.endswith(('ly', 'wise', 'ways')) and
            block.lemma not in ALLOWED_ADVERBS):
        return None

    if block.type == 'entry' and main_sense_data:
        main_sense = (block.refentry, main_sense_data.sense_id)
    elif block.type != 'entry':
        main_sense = (block.refentry, block.refid)
    else:
        main_sense = None

    if main_sense:
        qset = ThesaurusInstance.objects.filter(refentry=main_sense[0],
                                                refid=main_sense[1])
        # Double-check that these are the right p.o.s...
        records = [r for r in qset if r.wordclass() == block.wordclass]
        # ...and roughly the right lemma
        records = ([r for r in records if r.lemma == block.lemma] or
                   [r for r in records if stringtools.lexical_sort(r.lemma) == block.sort])
        # sort so the record from the largest set is top
        records.sort(key=lambda r: r.thesclass.branch_size, reverse=True)
        if records and records[0].thesclass.node_size >= 3:
            return int(records[0].thesclass.id)

    return None
Example #25
    def _check_for_omissions(self):
        # If there's a secondary/alternative headword, check that this has
        #   ended up included in the list of variants
        if self.lemma_manager.alt is not None:
            self.lemma_manager.refresh_variants_set()
            if not self.lemma_manager.in_variants_list(self.lemma_manager.alt.dictionary_sort):
                variant_form = VariantForm(self.lemma_manager.alt.lemma,
                                           self.date.start,
                                           self.date.projected_end())
                self.lemma_manager.variants.append(variant_form)

        variant_forms = _filter_varsets(self.primary_sets(),
                                        self.wordclass,
                                        self.date)
        if variant_forms:
            self.lemma_manager.refresh_variants_set()
            for variant_form in variant_forms:
                if not self.lemma_manager.in_variants_list(lexical_sort(variant_form.form)):
                    self.lemma_manager.variants.append(variant_form)

        # Check that the entry headword(s) is represented; given that the ODE
        #  lemma form may be substituted for the original OED lemma form, it's
        #  possible that it's not.
        if self.date.end > 1750:
            for headword in self.headwords:
                matches = [vf for vf in self.lemma_manager.variants if
                           vf.form.replace('~', '') == headword.replace('~', '')]
                if not matches:
                    variant_form = VariantForm(headword,
                                               self.date.start,
                                               self.date.projected_end())
                    self.lemma_manager.variants.append(variant_form)
Example #26
 def lexical_sort(self):
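     # Compute the flattened form lazily, then cache it on the instance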
     try:
         return self._lexical_sort
     except AttributeError:
         self._lexical_sort = stringtools.lexical_sort(self.lemma)
         return self._lexical_sort
Example #27
 def test_lexical_sort(self):
     """
     Test stringtools.lexical_sort()
     """
     for source, _, result in self.test_texts:
         self.assertEqual(stringtools.lexical_sort(source), result)
Example #28
def _sense_to_row(sense, status):
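    # Flatten a classified sense into a list of row values
    #  (definitions and reasons truncated to fit field lengths)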
    if sense.definition is None:
        undefined = True
        definition = None
    else:
        undefined = False
        definition = sense.definition[:200]

    if sense.definition_supplement:
        definition_supplement = sense.definition_supplement[:150]
    else:
        definition_supplement = None

    try:
        reasoncode = sense.reason_code
    except AttributeError:
        reasoncode = None
    try:
        reasontext = sense.reason_text[:200]
    except (AttributeError, TypeError):
        reasontext = None

    try:
        thesclass1_id = sense.class_id
    except AttributeError:
        thesclass1_id = None
    try:
        thesclass2_id = sense.runners_up[0]
    except (AttributeError, IndexError):
        thesclass2_id = None
    try:
        thesclass3_id = sense.runners_up[1]
    except (AttributeError, IndexError):
        thesclass3_id = None

    if thesclass1_id is not None:
        thesclass = tdb.get_thesclass(thesclass1_id)
        level2branch = thesclass.ancestor(level=2)
        checkstatus = 'u'
    else:
        level2branch = None
        checkstatus = 'n'

    if level2branch is not None:
        level2branch_id = level2branch.id
    else:
        level2branch_id = None

    try:
        bayes = sense.bayes_classification
        bayes_confidence = sense.bayes_confidence
    except AttributeError:
        bayes = None
        bayes_confidence = 0

    row = [
        status,
        sense.lemma[:100],
        lexical_sort(sense.lemma)[:100],
        sense.wordclass or 'NN',
        definition,
        definition_supplement,
        sense.entry_id,
        sense.node_id,
        sense.entry_lemma[:50],
        lexical_sort(sense.entry_lemma)[:50],
        sense.subentry_type or 'main sense',
        undefined,
        random.randint(0, 10000),  # sample order
        bayes,
        bayes_confidence,
        _bayes_mismatch(sense),
        thesclass1_id,
        thesclass2_id,
        thesclass3_id,
        'u',  # checkbox for thesclass1 (unset)
        'i',  # checkbox for thesclass2 (incorrect)
        'i',  # checkbox for thesclass3 (incorrect)
        checkstatus,
        level2branch_id,
        reasontext,
        reasoncode,
        sense.clone_num,  # Gets changed to True/False before committing to DB
    ]
    return row
Example #29
def tag_keyword(quotation, keyword):
    """
    Having identified the keyword within the quotation text (using
    KeywordFinder), mark the keyword by adding <kw> tags around it.
    """
    if keyword:
        serialized = etree.tounicode(quotation.text.node)
        qt_splitter = re.search(r'^(<qt(>| [^<>]*>))(.*)(</qt>)$', serialized)
        opentag = qt_splitter.group(1)
        text = ' ' + qt_splitter.group(3) + ' '
        closetag = qt_splitter.group(4)
        text_tagged = None

        keyword = _clean_brackets(keyword)
        keyword = keyword.replace('*', '.').replace('+', '.')
        keyword_flat = stringtools.lexical_sort(keyword)

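        # Try progressively looser regex contexts around the keyword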
        matches = None
        for m in ('([ (>])(' + keyword + ')([,;:!?)<. ])',
                  '(.)(' + keyword + ')([,;:!?)<. -])',
                  '([^a-zA-Z])(' + keyword + ')([^a-zA-Z])',
                  '([ (>-])(' + keyword + ')(.)',
                  '(.)(' + keyword + ')(.)'):
            matches = re.findall(m, text)
            if matches:
                break
        if matches:
            prec, match, following = matches[0]
            before = prec + match + following
            after = prec + '<kw>' + match + '</kw>' + following
            text_tagged = text.replace(before, after)

        if not text_tagged:
            text2 = re.sub(r'<([a-z]+) [^<>]*/>', r'<\1/>', text)
            text2 = re.sub(r'<([a-z]+) [^<>]*>', r'<\1>', text2)
            tokens = text2.split()
            for token in tokens:
                token2 = re.sub(r'<[^<>]+>', '', token)
                token2 = token2.strip(',:;!?.()')
                if token2 == keyword:
                    target = token.strip(',:;!?.()')
                    text_tagged = text2.replace(target, '<kw>' + target + '</kw>')
                    break

        if not text_tagged:
            for round_num in (1, 2):
                text2 = re.sub(r'<([a-z]+) [^<>]*/>', r'<\1/>', text)
                text2 = re.sub(r'<([a-z]+) [^<>]*>', r'<\1>', text2)

                # text_true is the version we'll actually be tagging
                #  - with ellipses, etc., still in place
                text_true = text2

                if round_num == 2:
                    # Replace ellipses and m-dashes with spaces, so that
                    # adjacent words get tokenized
                    for char in ('\u2025', '\u2026', '\u2014'):
                        text2 = text2.replace(char, ' ')

                # Tokenize and make into ngrams
                tokens = text2.split()
                ngrams = (_compile_ngrams(tokens, 1) +
                          _compile_ngrams(tokens, 2) +
                          _compile_ngrams(tokens, 3) +
                          _compile_ngrams(tokens, 4) +
                          _compile_ngrams(tokens, 5) +
                          _compile_ngrams(tokens, 6))

                target = None
                for ngram_full, ngram_flat in ngrams:
                    if keyword_flat == ngram_flat:
                        target = ngram_full
                        break
                if target:
                    # Strip ellipses and dashes
                    target = target.strip('\u2025\u2026\u2014')
                    text_tagged = text_true.replace(target, '<kw>' + target + '</kw>')
                    break

        if not text_tagged:
            keyword_tokens = keyword.split()
            if len(keyword_tokens) >= 2:
                first = re.findall(keyword_tokens[0], text)
                last = re.findall(keyword_tokens[-1], text)
                if len(first) == 1 and len(last) == 1:
                    pattern = ('(' + keyword_tokens[0] + '.*?' +
                               keyword_tokens[-1] + ')')
                    text_tagged = re.sub(pattern, r'<kw>\1</kw>', text)

        if text_tagged and '<kw>' in text_tagged:
            serialized_tagged = opentag + text_tagged.strip() + closetag
            try:
                node_tagged = etree.fromstring(serialized_tagged)
            except etree.XMLSyntaxError:
                pass
            else:
                parent = quotation.text.node.getparent()
                parent.replace(quotation.text.node, node_tagged)
Example #30
def _replace_lemma(entry, headword):
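    # Return a copy of the entry namedtuple with its lemma and
    #  sortcode fields updated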
    return entry._replace(lemma=headword, sort=stringtools.lexical_sort(headword))