Example #1
    def text_logprob(self, text):
        """
        Get the log probability of this text, along with its most likely
        spacing, gluing it together with multiple "segments" if necessary.
        """
        slug = slugify(text)
        n = len(slug)
        best_partial_results = ['']
        best_logprobs = [0.]
        for right_edge in range(1, n + 1):
            found = self.segment_logprob(slug[:right_edge])
            if found:
                rprob, rtext = found
                best_partial_results.append(rtext)
                best_logprobs.append(rprob)
            else:
                best_logprobs.append(-1000.)
                best_partial_results.append(slug[:right_edge])
            for left_edge in range(1, right_edge):
                lprob = best_logprobs[left_edge]
                found2 = self.segment_logprob(slug[left_edge:right_edge])
                if found2:
                    rprob, rtext = found2
                    totalprob = lprob + rprob - log(10)
                    if totalprob > best_logprobs[right_edge]:
                        best_logprobs[right_edge] = totalprob
                        ltext = best_partial_results[left_edge]
                        best_partial_results[right_edge] = ltext + ' ' + rtext
        return best_logprobs[-1], best_partial_results[-1]
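
To see this dynamic program run in isolation, here is a minimal self-contained sketch with a hypothetical toy segment_logprob standing in for the wordlist's; the log(10) term charges a penalty for each extra word boundary:

from math import log

# Hypothetical toy model: log probabilities for the few segments we know.
TOY_SEGMENTS = {'be': log(0.01), 'noisy': log(0.001), 'benoisy': log(1e-9)}

def toy_segment_logprob(slug):
    if slug in TOY_SEGMENTS:
        return TOY_SEGMENTS[slug], slug.upper()
    return None

def toy_text_logprob(slug):
    n = len(slug)
    best_texts = ['']
    best_logprobs = [0.]
    for right in range(1, n + 1):
        found = toy_segment_logprob(slug[:right])
        if found:
            rprob, rtext = found
            best_logprobs.append(rprob)
            best_texts.append(rtext)
        else:
            best_logprobs.append(-1000.)
            best_texts.append(slug[:right])
        for left in range(1, right):
            found = toy_segment_logprob(slug[left:right])
            if found:
                rprob, rtext = found
                total = best_logprobs[left] + rprob - log(10)
                if total > best_logprobs[right]:
                    best_logprobs[right] = total
                    best_texts[right] = best_texts[left] + ' ' + rtext
    return best_logprobs[-1], best_texts[-1]

print(toy_text_logprob('benoisy'))  # 'BE NOISY' beats the one-word parse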
Example #2
    def test_cromulence(self):
        """
        This test runs a corpus of past Mystery Hunt answers through the cromulence
        function, so we can tune it to return positive numbers for real answers.

        It does this by generating fake answers with the lengths of real answers, but
        with the letters drawn randomly from a unigram distribution.

        Sometimes this comes up with neat fake answers such as:

            7.4  ON FRODO
            6.8  ENIAC
            6.6  AS I CAN
            6.4  IBM STOP
            5.4  AIR LOL
            3.8  USE MIT
            3.7  VON POOPIN
            2.1  NA BEER
            0.2  DNA ARRRGH AH
        """
        real_answers = []
        years = ['1994', '1997'] + [str(year) for year in range(1999, 2021)]
        for year in years:
            with open(corpus_path('mh_answers/mystery%s.txt' % year)) as file:
                for line in file:
                    line = line.strip()
                    if line:
                        answer, _typ = line.rsplit(',', 1)
                        if slugify(answer):
                            real_answers.append(answer)
        fake_answers = [random_letters(len(real)) for real in real_answers]
        results = []
        for ans in real_answers:
            cromulence, spaced = self.cromulence(ans)
            logprob, _ = self.text_logprob(ans)
            if cromulence > 0:
                results.append((cromulence, logprob, spaced, 'true positive'))
            else:
                results.append((cromulence, logprob, spaced, 'false negative'))
        for ans in fake_answers:
            cromulence, spaced = self.cromulence(ans)
            logprob, _ = self.text_logprob(ans)
            if cromulence > 0:
                results.append((cromulence, logprob, spaced, 'false positive'))
            else:
                results.append((cromulence, logprob, spaced, 'true negative'))

        results.sort(reverse=True)
        counts = Counter([item[-1] for item in results])
        precision = counts['true positive'] / (counts['true positive'] +
                                               counts['false positive'])
        recall = counts['true positive'] / (counts['true positive'] +
                                            counts['false negative'])
        f_score = 2 / (1 / precision + 1 / recall)
        for cromulence, logprob, spaced, category in results:
            print("%1.1f\t%2.2f\t%s\t%s" %
                  (cromulence, logprob, category, spaced))
        print("Precision: %2.2f%%" % (precision * 100))
        print("Recall: %2.2f%%" % (recall * 100))
        return f_score
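
The random_letters helper isn't shown in this excerpt. A sketch of how it might draw fake answers from a unigram distribution, using approximate English letter frequencies as stand-in weights:

import random

LETTERS = 'abcdefghijklmnopqrstuvwxyz'
# Approximate English letter frequencies, in percent (stand-in weights;
# the real test presumably derives these from the wordlist itself).
WEIGHTS = [8.2, 1.5, 2.8, 4.3, 12.7, 2.2, 2.0, 6.1, 7.0, 0.15, 0.77, 4.0,
           2.4, 6.7, 7.5, 1.9, 0.095, 6.0, 6.3, 9.1, 2.8, 0.98, 2.4, 0.15,
           2.0, 0.074]

def random_letters(length):
    """Generate a fake answer: `length` letters drawn from the unigrams."""
    return ''.join(random.choices(LETTERS, weights=WEIGHTS, k=length))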
Example #3
def db_rank(clue):
    scores = defaultdict(float)
    for match, score in db_search(clue).items():
        scores[slugify(match)] += score * 1000
        parts = tokenize(match)
        for part in parts:
            scores[slugify(part)] += score * 1000 / len(parts)

    for word in tokenize(clue):
        logprob_result = WORDS.segment_logprob(slugify(word))
        if logprob_result is not None:
            logprob, _ = logprob_result
        else:
            logprob = -1000.
        rare_boost = min(25., -logprob)
        for match, score in db_search(word).items():
            scores[slugify(match)] += rare_boost * score * 10
            parts = tokenize(match)
            for part in parts:
                scores[slugify(part)] += rare_boost * score * 10 / len(parts)

        query = query_expand(word)
        for match, score in db_search(query).items():
            scores[slugify(match)] += rare_boost * score
            parts = tokenize(match)
            for part in parts:
                scores[slugify(part)] += rare_boost * score / len(parts)

    return scores
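
The rare_boost logic above weights matches by how surprising the clue word is: the boost is the word's negative log probability, capped at 25. A quick illustration:

# Common words contribute little; rare or unknown words dominate.
for logprob in (-3.0, -12.0, -1000.):   # common, uncommon, not in wordlist
    print(min(25., -logprob))           # -> 3.0, 12.0, 25.0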
Example #4
def anagram_single(text, wildcards=0, wordlist=WORDS, count=10, quiet=True):
    """
    Search for anagrams that appear directly in the wordlist.
    """
    return eval_anagrams(_anagram_single(alphagram(slugify(text)), wildcards,
                                         wordlist),
                         wordlist,
                         count,
                         quiet=quiet)
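
alphagram isn't defined in this excerpt; by anagramming convention it is the word's letters in sorted order, so two anagrams share one key. A minimal sketch:

def alphagram(slug):
    # Sorted letters of a slug; anagrams collapse to the same key.
    return ''.join(sorted(slug))

assert alphagram('carthorse') == alphagram('orchestra') == 'acehorrst'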
Example #5
    def __contains__(self, word):
        """
        `word in wordlist` is a quick, idiomatic way to tell if the given word
        (or phrase) appears in the wordlist.

        The word can be entered in natural form, possibly with capital letters
        and spaces. It will be converted to a lowercase, unspaced 'slug' here.
        """
        slug = slugify(word)
        return self.lookup_slug(slug) is not None
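
Hypothetical usage, assuming a wordlist instance named wordlist:

if 'Mystery Hunt' in wordlist:   # looked up as the slug 'mysteryhunt'
    print('found')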
Example #6
def anagram_double(text, wildcards=0, wordlist=WORDS, count=100, quiet=False):
    """
    Search for anagrams that can be made of two words or phrases from the
    wordlist.
    """
    return eval_anagrams(_anagram_double(alphagram(slugify(text)), wildcards,
                                         wordlist),
                         wordlist,
                         count,
                         quiet=quiet)
Example #7
    def search(self, pattern, length=None, count=10, use_cromulence=False):
        """
        Find results matching a given pattern, returning the cromulence
        and the text of each.

        If the length is known, it can be specified as an additional argument.
        """
        pattern = unspaced_lower(pattern)
        if is_exact(pattern):
            if use_cromulence:
                return [self.cromulence(pattern)]
            else:
                return [self.text_logprob(pattern)]

        minlen, maxlen = regex_len(pattern)
        if minlen != maxlen:
            # If there are variable-length matches, the dynamic programming
            # strategy won't work, so fall back on grepping for complete
            # matches in the wordlist.
            items = list(self.grep(pattern, length=length))
            items.sort(reverse=True)
            found = items[:count]
        else:
            if length is not None and not (minlen <= length <= maxlen):
                # This length is impossible, so there are no results.
                return []

            best_partial_results = [[]]
            for right_edge in range(1, maxlen + 1):
                segment = regex_slice(pattern, 0, right_edge)
                results_this_step = list(islice(self.grep(segment), count))

                for left_edge in range(1, right_edge):
                    if best_partial_results[left_edge]:
                        segment = regex_slice(pattern, left_edge, right_edge)
                        found = list(islice(self.grep(segment), count))
                        for lprob, ltext in best_partial_results[left_edge]:
                            for rprob, rtext in found:
                                results_this_step.append(
                                    (lprob + rprob - log(10),
                                     ltext + ' ' + rtext))
                results_this_step.sort(reverse=True)
                best_partial_results.append(results_this_step[:count])
            found = best_partial_results[-1]

        if not use_cromulence:
            return found
        else:
            results = []
            for (logprob, text) in found:
                cromulence = self.logprob_to_cromulence(
                    logprob, len(slugify(text)))
                results.append((cromulence, text))
            results.sort(reverse=True)
            return results
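
The inner loop combines a best left-hand parse with a right-hand match the same way text_logprob does, charging log(10) per added word. For example, with illustrative numbers:

from math import log

lprob, ltext = -6.9, 'BE'      # best parse of pattern[:left_edge]
rprob, rtext = -9.2, 'NOISY'   # a grep match for pattern[left_edge:right_edge]
combined = (lprob + rprob - log(10), ltext + ' ' + rtext)
print(combined)                # approximately (-18.4, 'BE NOISY')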
Example #8
def brute_force_diagonalize(answers, wordlist=WORDS, quiet=False):
    """
    Find the most cromulent diagonalization for a set of answers, trying all
    possible orders. See README.md for a cool example of this with 10 answers.

    As a somewhat artificial example, let's suppose we have these seven
    answers from the 2000 metas, but don't remember their order:

        >>> metas = ['benjamins', 'billgates', 'donors', 'luxor', 'mansion', 'miserly', 'realty']
        >>> brute_force_diagonalize(metas)[0]   # doctest: +NORMALIZE_WHITESPACE
        Cromulence  Text    Info
        15.4        BE NOISY
        15.1        RUN EAST
        14.7        MAX LAST
        14.6        MIX LAST
        14.5        MENOROT
        14.3        BOX STAY
        14.0        LINE TO I
        14.0        DELLROY
        13.9        LAS LAST
        13.8        RUN SALT
        13.4        BUS LIST
        13.3        MALORY I
        13.1        LES LIST
        12.9        ME NOT AN
        12.5        DEAL ROY
        12.3        LIN LAST
        12.2        RULE IS I
        12.2        MENOGYN
        12.2        LENORA I
        12.1        RUNS RAY
        (15.4, 'BE NOISY', None)

    The best answer, of course, is "BE NOISY". And if that doesn't work,
    you can try to solve the hunt with other strategies such as RUN EAST.
    """
    results = []
    seen = set()
    answers = [parse_cell(word) for word in answers]
    for i, permutation in enumerate(permutations(answers)):
        if not quiet and i > 0 and i % 10000 == 0:
            print("Tried %d permutations" % i)
        try:
            diag = diagonalize(permutation)
        except IndexError:
            continue
        found = wordlist.search(diag, count=1, use_cromulence=True)
        if found:
            logprob, text = found[0]
            slug = slugify(text)
            if slug not in seen:
                results.append((logprob, text, None))
                seen.add(slug)
    return wordlist.show_best_results(results)
Example #9
    def cromulence(self, text):
        """
        Estimate how likely this text is to be an answer. The "cromulence"
        scale is defined at the top of this module.
        """
        slug = slugify(text)
        if len(slug) == 0:
            return (0, '')
        logprob, found_text = self.text_logprob(slug)
        entropy = logprob / (len(slug) + 1)
        cromulence = round((entropy - NULL_HYPOTHESIS_ENTROPY) * DECIBEL_SCALE,
                           1)
        return cromulence, found_text
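
A worked example with assumed constants (the real NULL_HYPOTHESIS_ENTROPY and DECIBEL_SCALE are defined at the top of the module this method comes from):

from math import log

NULL_HYPOTHESIS_ENTROPY = -3.5   # assumption: nats per letter of random text
DECIBEL_SCALE = 10 / log(10)     # assumption: converts nats to decibels

logprob, slug = -20.0, 'benoisy'        # as returned by text_logprob
entropy = logprob / (len(slug) + 1)     # -2.5 nats per letter
print(round((entropy - NULL_HYPOTHESIS_ENTROPY) * DECIBEL_SCALE, 1))  # 4.3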
Example #10
    def freq(self, word):
        """
        Get the log of the relative frequency of a single item in the
        wordlist. Always returns just a number, which is 0. if the item
        is not found.
        """
        if self.logtotal is None:
            totalfreq, _ = self.lookup_slug('')
            self.logtotal = log(totalfreq)
        found = self.lookup_slug(slugify(word))
        if found is None:
            return 0.
        else:
            return log(found[0]) - self.logtotal
Example #11
def read_wordlist(name):
    """
    Read a wordlist from a comma-separated plain-text file, and iterate
    its entries in order.
    """
    filepath = wordlist_path_from_name(name)
    with open(filepath, encoding='utf-8') as wordfile:
        for i, line in enumerate(wordfile):
            if ',' not in line:
                continue
            line = line.rstrip()
            text, freq = line.split(',', 1)
            freq = int(freq)
            slug = slugify(text)
            if slug:
                yield (i, slug, freq, text)
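
For a hypothetical input line, the parsing step works like this:

line = 'New York,163949\n'
text, freq = line.rstrip().split(',', 1)
print(text, int(freq))   # New York 163949
# yielded as (line_number, 'newyork', 163949, 'New York'),
# assuming slugify lowercases the text and drops the space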
Example #12
def anagrams(text,
             wildcards=0,
             wordlist=WORDS,
             count=100,
             quiet=False,
             time_limit=None):
    """
    Search for anagrams that are made of an arbitrary number of pieces from the
    wordlist.
    """
    return eval_anagrams(_anagram_recursive(alphagram(slugify(text)),
                                            wildcards, wordlist),
                         wordlist,
                         count,
                         quiet=quiet,
                         time_limit=time_limit)
Example #13
def brute_force_diagonalize(answers, wordlist=WORDS, quiet=False):
    """
    Find the most cromulent diagonalization for a set of answers, trying all
    possible orders. See README.md for a cool example of this with 10 answers.

    As a somewhat artificial example, let's suppose we have these seven
    answers from the 2000 metas, but don't remember their order:

    >>> metas = ['benjamins', 'billgates', 'donors', 'luxor', 'mansion', 'miserly', 'realty']
    >>> brute_force_diagonalize(metas)[0]   # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    Cromulence Text    Info
    9.5        RUN EAST
    9.2        MIX LAST
    9.1        MAX LAST
    9.1        BE NOISY
    8.8        LINE TO I
    ...
    (9.5, 'RUN EAST', None)

    Of course we were looking for the famous red herring "BE NOISY", but
    "RUN EAST" sounds like a good way to find the coin also.
    """
    results = []
    seen = set()
    answers = [parse_cell(word) for word in answers]
    for i, permutation in enumerate(permutations(answers)):
        if not quiet and i > 0 and i % 10000 == 0:
            print("Tried %d permutations" % i)
        try:
            diag = diagonalize(permutation)
        except IndexError:
            continue
        found = wordlist.search(diag, count=1, use_cromulence=True)
        if found:
            logprob, text = found[0]
            slug = slugify(text)
            if slug not in seen:
                results.append((logprob, text, None))
                seen.add(slug)
    return wordlist.show_best_results(results)
Example #14
def search(pattern=None, clue=None, length=None, count=20):
    """
    Find words and phrases that match various criteria: a regex pattern,
    a clue phrase, and/or a length.

    >>> search('.a.b.c..')[0][1]
    'BARBECUE'
    >>> search('.a....e.', clue='US President')[0][1]
    'VAN BUREN'
    >>> search(clue='lincoln assassin', length=15)[0][1]
    'JOHN WILKES BOOTH'
    """
    global INDEX, QUERY_PARSER, NUMBERBATCH
    if clue is None:
        if pattern is None:
            return []
        else:
            return WORDS.search(pattern, count=count, use_cromulence=True)

    if pattern is not None:
        pattern = pattern.lstrip('^').rstrip('$').lower()
        pattern = re.compile('^' + pattern + '$')

    if INDEX is None:
        INDEX = open_dir(data_path('search'))
        QUERY_PARSER = simple_parser(fieldname="definition",
                                     schema=INDEX.schema,
                                     group=qparser.OrGroup.factory(0.9))
        QUERY_PARSER.add_plugin(qparser.GroupPlugin())
        QUERY_PARSER.add_plugin(qparser.BoostPlugin())

    if NUMBERBATCH is None:
        NUMBERBATCH = load_numberbatch()

    matches = {}
    with INDEX.searcher() as searcher:
        clue_parts = tokenize(clue)
        expanded, similar = query_expand(NUMBERBATCH, clue_parts)
        clue_slugs = [slugify(part) for part in clue_parts]
        new_clue = '%s, %s' % (sanitize(clue), expanded)
        results = searcher.search(QUERY_PARSER.parse(new_clue), limit=None)
        for word, weight in similar:
            slug = slugify(word)
            if slug not in clue_slugs:
                if length is None or length == len(slug):
                    if pattern is None or pattern.match(slug):
                        matches[word.upper()] = weight * 1000
        for i, result in enumerate(results):
            text = result['text']
            if any(c.isdigit() for c in text):
                continue
            slug = slugify(text)
            if length is None or length == len(slug):
                if pattern is None or pattern.match(slug):
                    score = results.score(i)
                    if text in matches:
                        matches[text] += score
                    else:
                        matches[text] = score
                    if len(matches) >= count:
                        break
        return sorted([(score, text) for (text, score) in matches.items()],
                      reverse=True)
Example #15
def find_by_consonantcy(text):
    return WORDS.find_by_consonantcy(consonantcy(slugify(text)))
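
consonantcy isn't defined in this excerpt; a plausible sketch is the slug with its vowels removed, so words that differ only in vowels collide:

import re

def consonantcy(slug):
    # Drop the vowels, keeping consonant order.
    return re.sub(r'[aeiou]', '', slug)

print(consonantcy('minnesota'))   # 'mnnst'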
Example #16
def find_by_alphagram(text):
    return WORDS.find_by_alphagram(alphagram(slugify(text)))
Example #17
def init_search_index():
    nltk.download('wordnet')
    from nltk.corpus import wordnet
    get_synset = wordnet._synset_from_pos_and_offset

    def get_adjacent(synset):
        return [
            name for pointer_tuples in synset._pointers.values()
            for pos, offset in pointer_tuples
            for name in get_synset(pos, offset).lemma_names()
        ]

    os.makedirs(data_path('search'), exist_ok=True)
    ix = create_in(data_path('search'), schema)
    writer = ix.writer(procs=4)

    # Add Wikipedia links
    for line in tqdm(open(data_path('corpora/wikipedia.txt')),
                     desc='wikipedia'):
        title, summary = line.split('\t', 1)
        summary = summary.rstrip()
        if title and summary:
            slug = slugify(title)
            writer.add_document(slug=slug,
                                text=title,
                                definition=summary,
                                length=len(slug))

    # Add lookups from a phrase to a word in that phrase
    for slug, freq, text in tqdm(WORDS.iter_all_by_freq(), desc='phrases'):
        words = text.split()
        if freq < 10000:
            break
        if len(words) > 1:
            for word in words:
                if WORDS.logprob(word) < -7:
                    writer.add_document(slug=slug,
                                        text=word,
                                        definition=text,
                                        length=len(slug))

    # Add crossword clues
    for corpus in ('crossword_clues.txt', 'more_crossword_clues.txt'):
        for line in tqdm(open(corpus_path(corpus), encoding='utf-8'),
                         desc=corpus):
            text, defn = line.rstrip().split('\t')
            slug = slugify(text)
            writer.add_document(slug=slug,
                                text=text,
                                definition=defn,
                                length=len(slug))

    # Add WordNet glosses and links
    synsets = wordnet.all_synsets()
    for syn in tqdm(synsets, desc='wordnet'):
        lemmas = [lem.replace('_', ' ') for lem in syn.lemma_names()]
        related = [lem.replace('_', ' ') for lem in get_adjacent(syn)]
        related2 = lemmas + related
        links = ', '.join(related2).upper()
        defn_parts = [syn.definition()]
        for example in syn.examples():
            defn_parts.append('"%s"' % example)
        defn_parts.append(links)
        defn = '; '.join(defn_parts)
        for name in lemmas:
            this_slug = slugify(name)

            writer.add_document(slug=this_slug,
                                text=name.upper(),
                                definition=defn,
                                length=len(this_slug))

    print("Committing.")
    writer.commit(optimize=True)
    return ix
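
The schema passed to create_in() isn't shown. A Whoosh schema consistent with the fields written above (slug, text, definition, length) might look like:

from whoosh.fields import ID, NUMERIC, STORED, TEXT, Schema

# A sketch, not the project's actual schema definition.
schema = Schema(slug=ID(stored=True),
                text=STORED,
                definition=TEXT(stored=True),
                length=NUMERIC(stored=True))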
Example #18
def phonespell(text):
    "Convert letters to the digits 2-9 on a phone keypad."
    return ''.join(PHONESPELL_MAP[ch] for ch in slugify(text))
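
PHONESPELL_MAP isn't defined in this excerpt; it presumably maps each letter to its digit on a standard phone keypad, which can be built like this:

PHONESPELL_MAP = {}
for digit, letters in [('2', 'abc'), ('3', 'def'), ('4', 'ghi'),
                       ('5', 'jkl'), ('6', 'mno'), ('7', 'pqrs'),
                       ('8', 'tuv'), ('9', 'wxyz')]:
    for ch in letters:
        PHONESPELL_MAP[ch] = digit

# phonespell('solve') would then return '76583'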