def text_logprob(self, text):
    """
    Score a text by its best segmentation into wordlist entries.

    The text is slugified, then a dynamic program considers every prefix
    of the slug. Each prefix is scored either as a single segment (via
    `self.segment_logprob`) or as the best scoring of a shorter prefix
    followed by one more segment, with a penalty of `log(10)` for the
    space that joins them.

    Returns a `(logprob, spaced_text)` pair for the whole slug. An empty
    slug yields `(0., '')`.
    """
    slug = slugify(text)
    space_cost = log(10)

    # Index k holds the best result found for the prefix slug[:k].
    prefix_logprobs = [0.]
    prefix_texts = ['']

    for end in range(1, len(slug) + 1):
        # Start with the prefix taken as one unbroken segment; if it is
        # unknown, use a very low placeholder score and the raw letters.
        whole = self.segment_logprob(slug[:end])
        if whole:
            best_prob, best_text = whole
        else:
            best_prob, best_text = -1000., slug[:end]

        # Then try every split point, reusing the best answer for the
        # part to the left of the split.
        for start in range(1, end):
            piece = self.segment_logprob(slug[start:end])
            if not piece:
                continue
            piece_prob, piece_text = piece
            candidate = prefix_logprobs[start] + piece_prob - space_cost
            if candidate > best_prob:
                best_prob = candidate
                best_text = prefix_texts[start] + ' ' + piece_text

        prefix_logprobs.append(best_prob)
        prefix_texts.append(best_text)

    return prefix_logprobs[-1], prefix_texts[-1]
def test_cromulence(self):
    """
    Evaluate the cromulence function against past Mystery Hunt answers.

    Real answers are read from the yearly `mh_answers/mystery<year>.txt`
    corpus files. For each real answer, a fake answer of the same length
    is produced with `random_letters`. An answer counts as "positive"
    when its cromulence is greater than 0, so real answers should come
    out positive and fake ones should not.

    Every scored answer is printed (highest cromulence first), followed
    by the precision and recall. The F-score is returned so the scale can
    be tuned.

    The random fakes are occasionally entertaining, for example:

        7.4     ON FRODO
        6.8     ENIAC
        6.6     AS I CAN
        6.4     IBM STOP
        5.4     AIR LOL
        3.8     USE MIT
        3.7     VON POOPIN
        2.1     NA BEER
        0.2     DNA ARRRGH AH
    """
    years = ['1994', '1997'] + [str(year) for year in range(1999, 2021)]

    real_answers = []
    for year in years:
        with open(corpus_path('mh_answers/mystery%s.txt' % year)) as file:
            for raw_line in file:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                # Each line is "<answer>,<type>"; only the answer is used.
                answer, _typ = stripped.rsplit(',', 1)
                if slugify(answer):
                    real_answers.append(answer)

    fake_answers = [random_letters(len(real)) for real in real_answers]

    # (answers, label when cromulence > 0, label otherwise)
    labelled_sets = (
        (real_answers, 'true positive', 'false negative'),
        (fake_answers, 'false positive', 'true negative'),
    )
    results = []
    for answers, positive_label, negative_label in labelled_sets:
        for ans in answers:
            cromulence, spaced = self.cromulence(ans)
            logprob, _ = self.text_logprob(ans)
            label = positive_label if cromulence > 0 else negative_label
            results.append((cromulence, logprob, spaced, label))

    results.sort(reverse=True)

    counts = Counter(item[-1] for item in results)
    true_pos = counts['true positive']
    precision = true_pos / (true_pos + counts['false positive'])
    recall = true_pos / (true_pos + counts['false negative'])
    f_score = 2 / (1 / precision + 1 / recall)

    for cromulence, logprob, spaced, category in results:
        print("%1.1f\t%2.2f\t%s\t%s" % (cromulence, logprob, category, spaced))
    print("Precision: %2.2f%%" % (precision * 100))
    print("Recall: %2.2f%%" % (recall * 100))
    return f_score
def db_rank(clue):
    """
    Accumulate scores for possible answers to a clue from database searches.

    Three kinds of search contribute to the result:

    - the whole clue, weighted by 1000;
    - each token of the clue, weighted by 10 times a "rare boost";
    - the expansion of each token (`query_expand`), weighted by the
      rare boost alone.

    The rare boost of a token is its negated log probability in `WORDS`,
    capped at 25; tokens that are not found use a log probability of
    -1000. Every match credits its own slug with the full weighted score
    and each of its tokens with an equal share of it.

    Returns a `defaultdict(float)` mapping slugs to accumulated scores.
    """
    scores = defaultdict(float)

    # Search for the clue as a whole.
    for match, score in db_search(clue).items():
        scores[slugify(match)] += score * 1000
        pieces = tokenize(match)
        for piece in pieces:
            scores[slugify(piece)] += score * 1000 / len(pieces)

    for word in tokenize(clue):
        lookup = WORDS.segment_logprob(slugify(word))
        if lookup is None:
            logprob = -1000.
        else:
            logprob, _ = lookup
        rare_boost = min(25., -logprob)

        # Search for the individual word.
        for match, score in db_search(word).items():
            scores[slugify(match)] += rare_boost * score * 10
            pieces = tokenize(match)
            for piece in pieces:
                scores[slugify(piece)] += rare_boost * score * 10 / len(pieces)

        # Search for the expanded form of the word.
        expanded = query_expand(word)
        for match, score in db_search(expanded).items():
            scores[slugify(match)] += rare_boost * score
            pieces = tokenize(match)
            for piece in pieces:
                scores[slugify(piece)] += rare_boost * score / len(pieces)

    return scores
def anagram_single(text, wildcards=0, wordlist=WORDS, count=10, quiet=True):
    """
    Search for anagrams that appear directly in the wordlist.

    The text is slugified and reduced to its alphagram, the candidates
    come from `_anagram_single`, and `eval_anagrams` produces the final
    result using `wordlist`, `count` and `quiet`.
    """
    letters = alphagram(slugify(text))
    candidates = _anagram_single(letters, wildcards, wordlist)
    return eval_anagrams(candidates, wordlist, count, quiet=quiet)
def __contains__(self, word):
    """
    Support `word in wordlist` as a membership test.

    The word (or phrase) may be given in its natural form; it is turned
    into a slug with `slugify` before being looked up. The result is True
    when `lookup_slug` finds an entry for that slug.
    """
    return self.lookup_slug(slugify(word)) is not None
def anagram_double(text, wildcards=0, wordlist=WORDS, count=100, quiet=False):
    """
    Search for anagrams that can be made of two words or phrases from the
    wordlist.

    The text is slugified and reduced to its alphagram, the candidates
    come from `_anagram_double`, and `eval_anagrams` produces the final
    result using `wordlist`, `count` and `quiet`.
    """
    letters = alphagram(slugify(text))
    candidates = _anagram_double(letters, wildcards, wordlist)
    return eval_anagrams(candidates, wordlist, count, quiet=quiet)
def search(self, pattern, length=None, count=10, use_cromulence=False):
    """
    Find up to `count` results matching a pattern.

    Returns a list of `(score, text)` pairs, best first. The score is a
    log probability by default, or a cromulence value when
    `use_cromulence` is true. If the length of the answer is known, it
    can be passed as `length`.

    - An exact pattern (per `is_exact`) is scored directly and returned
      as a single-item list.
    - A pattern whose matches vary in length is answered by grepping the
      wordlist for complete matches.
    - A fixed-length pattern is answered by dynamic programming over
      slices of the pattern, so that multi-word results can be glued
      together, with a penalty of `log(10)` for each space.
    """
    pattern = unspaced_lower(pattern)

    if is_exact(pattern):
        scorer = self.cromulence if use_cromulence else self.text_logprob
        return [scorer(pattern)]

    minlen, maxlen = regex_len(pattern)
    if minlen != maxlen:
        # Variable-length matches rule out the dynamic programming
        # strategy, so only complete wordlist matches are considered.
        found = sorted(self.grep(pattern, length=length), reverse=True)[:count]
    else:
        if length is not None and (length < minlen or length > maxlen):
            # The requested length can never match this pattern.
            return []

        space_penalty = log(10)
        # partials[k] holds the best results for the first k positions
        # of the pattern; index 0 is a placeholder that is never read.
        partials = [[]]
        for end in range(1, maxlen + 1):
            prefix = regex_slice(pattern, 0, end)
            candidates = list(islice(self.grep(prefix), count))

            for start in range(1, end):
                left_results = partials[start]
                if not left_results:
                    continue
                tail = regex_slice(pattern, start, end)
                right_results = list(islice(self.grep(tail), count))
                candidates.extend(
                    (lprob + rprob - space_penalty, ltext + ' ' + rtext)
                    for lprob, ltext in left_results
                    for rprob, rtext in right_results
                )

            candidates.sort(reverse=True)
            partials.append(candidates[:count])
        found = partials[-1]

    if not use_cromulence:
        return found

    rescored = [
        (self.logprob_to_cromulence(logprob, len(slugify(text))), text)
        for logprob, text in found
    ]
    rescored.sort(reverse=True)
    return rescored
def brute_force_diagonalize(answers, wordlist=WORDS, quiet=False):
    """
    Find the most cromulent diagonalization for a set of answers, trying
    all possible orders. See README.md for a cool example of this with
    10 answers.

    Each answer is parsed with `parse_cell`, every permutation is
    diagonalized, and the best search result for each diagonal is kept
    (once per distinct slug). Unless `quiet` is set, progress is printed
    every 10000 permutations. The collected results are handed to
    `wordlist.show_best_results`.

    As a somewhat artificial example, let's suppose we have these seven
    answers from the 2000 metas, but don't remember their order:

    >>> metas = ['benjamins', 'billgates', 'donors', 'luxor', 'mansion',
    ...          'miserly', 'realty']
    >>> brute_force_diagonalize(metas)[0]   # doctest: +NORMALIZE_WHITESPACE
    Cromulence  Text    Info
    15.4        BE NOISY
    15.1        RUN EAST
    14.7        MAX LAST
    14.6        MIX LAST
    14.5        MENOROT
    14.3        BOX STAY
    14.0        LINE TO I
    14.0        DELLROY
    13.9        LAS LAST
    13.8        RUN SALT
    13.4        BUS LIST
    13.3        MALORY I
    13.1        LES LIST
    12.9        ME NOT AN
    12.5        DEAL ROY
    12.3        LIN LAST
    12.2        RULE IS I
    12.2        MENOGYN
    12.2        LENORA I
    12.1        RUNS RAY
    (15.4, 'BE NOISY', None)

    The best answer, of course, is "BE NOISY". And if that doesn't work,
    you can try to solve the hunt with other strategies such as RUN EAST.
    """
    cells = [parse_cell(word) for word in answers]
    seen_slugs = set()
    best_per_slug = []

    for attempt, ordering in enumerate(permutations(cells)):
        if attempt > 0 and attempt % 10000 == 0 and not quiet:
            print("Tried %d permutations" % attempt)

        try:
            diagonal = diagonalize(ordering)
        except IndexError:
            # This ordering cannot be diagonalized; move on to the next.
            continue

        matches = wordlist.search(diagonal, count=1, use_cromulence=True)
        if not matches:
            continue

        score, text = matches[0]
        slug = slugify(text)
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        best_per_slug.append((score, text, None))

    return wordlist.show_best_results(best_per_slug)
def cromulence(self, text):
    """
    Estimate how likely this text is to be an answer.

    The "cromulence" scale is defined at the top of this module. The text
    is slugified and scored with `text_logprob`. The log probability is
    divided by the slug length plus one, offset by
    `NULL_HYPOTHESIS_ENTROPY`, scaled by `DECIBEL_SCALE`, and rounded to
    one decimal place.

    Returns `(cromulence, spaced_text)`. Text with an empty slug returns
    `(0, '')`.
    """
    slug = slugify(text)
    if not slug:
        return (0, '')

    logprob, spaced = self.text_logprob(slug)
    per_position = logprob / (len(slug) + 1)
    score = round((per_position - NULL_HYPOTHESIS_ENTROPY) * DECIBEL_SCALE, 1)
    return score, spaced
def freq(self, word):
    """
    Get the frequency of a single item in the wordlist.

    The word is slugified before lookup. Always returns just a number:
    the first element of the entry returned by `lookup_slug`, or 0 if the
    word is not found.

    NOTE(review): this previously returned `log(frequency) - logtotal`
    (a normalised log probability) for found words while still returning
    0 for missing ones, which contradicted the name and docstring and
    made a missing word look like probability 1 in log space. It also
    lazily initialised `self.logtotal` as a side effect, which is no
    longer needed here.
    """
    found = self.lookup_slug(slugify(word))
    if found is None:
        return 0.
    return found[0]
def read_wordlist(name):
    """
    Read a wordlist from a comma-separated plain-text file, and iterate
    its entries in order.

    Each useful line has the form `text,freq`, where `freq` is an
    integer. Lines without a comma are skipped, as are entries whose text
    slugifies to an empty string.

    Yields `(i, slug, freq, text)` tuples, where `i` is the zero-based
    index of the line in the file (skipped lines still count).

    Raises `ValueError` if the frequency field is not an integer.
    """
    filepath = wordlist_path_from_name(name)
    with open(filepath, encoding='utf-8') as wordfile:
        for i, line in enumerate(wordfile):
            if ',' not in line:
                continue
            # The frequency is the last field, so split on the final
            # comma: the text itself is then free to contain commas.
            text, freq = line.rstrip().rsplit(',', 1)
            freq = int(freq)
            slug = slugify(text)
            if slug:
                yield (i, slug, freq, text)
def anagrams(text, wildcards=0, wordlist=WORDS, count=100, quiet=False, time_limit=None):
    """
    Search for anagrams that are made of an arbitrary number of pieces
    from the wordlist.

    The text is slugified and reduced to its alphagram, the candidates
    come from `_anagram_recursive`, and `eval_anagrams` produces the
    final result using `wordlist`, `count`, `quiet` and `time_limit`.
    """
    letters = alphagram(slugify(text))
    candidates = _anagram_recursive(letters, wildcards, wordlist)
    return eval_anagrams(
        candidates, wordlist, count, quiet=quiet, time_limit=time_limit
    )
def brute_force_diagonalize(answers, wordlist=WORDS, quiet=False):
    """
    Find the most cromulent diagonalization for a set of answers, trying
    all possible orders. See README.md for a cool example of this with
    10 answers.

    Every permutation of the parsed answers is diagonalized and searched
    for its single best result; results are de-duplicated by slug and
    passed to `wordlist.show_best_results`. Unless `quiet` is set, a
    progress message is printed every 10000 permutations.

    As a somewhat artificial example, let's suppose we have these seven
    answers from the 2000 metas, but don't remember their order:

    >>> metas = ['benjamins', 'billgates', 'donors', 'luxor', 'mansion',
    ...          'miserly', 'realty']
    >>> brute_force_diagonalize(metas)[0]   # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    Cromulence  Text    Info
    9.5         RUN EAST
    9.2         MIX LAST
    9.1         MAX LAST
    9.1         BE NOISY
    8.8         LINE TO I
    ...
    (9.5, 'RUN EAST', None)

    Of course we were looking for the famous red herring "BE NOISY", but
    "RUN EAST" sounds like a good way to find the coin also.
    """
    parsed = [parse_cell(word) for word in answers]
    collected = []
    known_slugs = set()

    for index, order in enumerate(permutations(parsed)):
        report_progress = index > 0 and index % 10000 == 0
        if report_progress and not quiet:
            print("Tried %d permutations" % index)

        try:
            diag = diagonalize(order)
        except IndexError:
            # Not every ordering has a diagonal; skip the ones that fail.
            continue

        top = wordlist.search(diag, count=1, use_cromulence=True)
        if top:
            cromulence, text = top[0]
            slug = slugify(text)
            if slug not in known_slugs:
                known_slugs.add(slug)
                collected.append((cromulence, text, None))

    return wordlist.show_best_results(collected)
def search(pattern=None, clue=None, length=None, count=20):
    """
    Find words and phrases that match various criteria: a regex pattern,
    a clue phrase, and/or a length.

    >>> search('.a.b.c..')[0][1]
    'BARBECUE'

    >>> search('.a....e.', clue='US President')[0][1]
    'VAN BUREN'

    >>> search(clue='lincoln assassin', length=15)[0][1]
    'JOHN WILKES BOOTH'

    With no clue, the pattern alone is handed to `WORDS.search` (and an
    empty list is returned if there is no pattern either). With a clue,
    the search index and Numberbatch data are loaded on first use and
    cached in the module globals `INDEX`, `QUERY_PARSER` and
    `NUMBERBATCH`. Candidates come from two places: terms that
    `query_expand` reports as similar to the clue, and index results for
    the expanded clue. Both are filtered by `length` and `pattern`.

    Returns a list of `(score, text)` pairs sorted from highest to
    lowest score.
    """
    global INDEX, QUERY_PARSER, NUMBERBATCH

    if clue is None:
        if pattern is None:
            return []
        return WORDS.search(pattern, count=count, use_cromulence=True)

    if pattern is not None:
        pattern = pattern.lstrip('^').rstrip('$').lower()
        # Group the pattern before anchoring it. '|' binds more loosely
        # than the anchors, so '^a|b$' would accept anything starting
        # with 'a' or ending with 'b' instead of requiring a full match.
        pattern = re.compile('^(?:' + pattern + ')$')

    if INDEX is None:
        INDEX = open_dir(data_path('search'))
        QUERY_PARSER = simple_parser(
            fieldname="definition",
            schema=INDEX.schema,
            group=qparser.OrGroup.factory(0.9),
        )
        QUERY_PARSER.add_plugin(qparser.GroupPlugin())
        QUERY_PARSER.add_plugin(qparser.BoostPlugin())

    if NUMBERBATCH is None:
        NUMBERBATCH = load_numberbatch()

    def fits(slug):
        # A slug is acceptable when it satisfies both optional filters.
        if length is not None and length != len(slug):
            return False
        return pattern is None or bool(pattern.match(slug))

    matches = {}
    with INDEX.searcher() as searcher:
        clue_parts = tokenize(clue)
        expanded, similar = query_expand(NUMBERBATCH, clue_parts)
        clue_slugs = [slugify(part) for part in clue_parts]
        new_clue = '%s, %s' % (sanitize(clue), expanded)
        results = searcher.search(QUERY_PARSER.parse(new_clue), limit=None)

        # Similar terms are candidates too, unless they merely repeat a
        # word of the clue.
        for word, weight in similar:
            slug = slugify(word)
            if slug not in clue_slugs and fits(slug):
                matches[word.upper()] = weight * 1000

        for i, result in enumerate(results):
            text = result['text']
            if any(c.isdigit() for c in text):
                continue
            if not fits(slugify(text)):
                continue
            score = results.score(i)
            if text in matches:
                matches[text] += score
            else:
                matches[text] = score
            # Stop collecting once enough distinct candidates exist.
            if len(matches) >= count:
                break

    return sorted(
        [(score, text) for (text, score) in matches.items()], reverse=True
    )
def find_by_consonantcy(text):
    """
    Look up entries in `WORDS` by consonantcy.

    The text is slugified, passed through `consonantcy`, and the result
    is used as the key for `WORDS.find_by_consonantcy`.
    """
    key = consonantcy(slugify(text))
    return WORDS.find_by_consonantcy(key)
def find_by_alphagram(text):
    """
    Look up entries in `WORDS` by alphagram.

    The text is slugified, passed through `alphagram`, and the result is
    used as the key for `WORDS.find_by_alphagram`.
    """
    key = alphagram(slugify(text))
    return WORDS.find_by_alphagram(key)
def init_search_index():
    """
    Build the search index under `data_path('search')` and return it.

    Documents are added from four sources, in this order:

    1. Wikipedia titles and summaries (`corpora/wikipedia.txt`, one
       tab-separated pair per line).
    2. Multi-word wordlist phrases with a frequency of at least 10000,
       indexed under each of their words whose log probability is
       below -7.
    3. Crossword clues (tab-separated answer and clue per line).
    4. WordNet glosses, examples, and the lemma names of each synset and
       of the synsets it points to.

    Downloads the WordNet data through `nltk` first, creates the index
    directory if needed, and commits the index with `optimize=True`.
    """
    nltk.download('wordnet')
    from nltk.corpus import wordnet

    # NOTE(review): this and `synset._pointers` below are underscore-
    # prefixed (non-public) nltk attributes and may change between
    # nltk versions.
    get_synset = wordnet._synset_from_pos_and_offset

    def get_adjacent(synset):
        # Lemma names of every synset that this synset points to.
        return [
            name
            for pointer_tuples in synset._pointers.values()
            for pos, offset in pointer_tuples
            for name in get_synset(pos, offset).lemma_names()
        ]

    os.makedirs(data_path('search'), exist_ok=True)
    ix = create_in(data_path('search'), schema)
    writer = ix.writer(procs=4)

    # Add Wikipedia links
    # NOTE(review): unlike the crossword corpora, this file is opened
    # with the platform default encoding -- confirm whether it should be
    # read as UTF-8 as well.
    with open(data_path('corpora/wikipedia.txt')) as wiki_file:
        for line in tqdm(wiki_file, desc='wikipedia'):
            title, summary = line.split('\t', 1)
            summary = summary.rstrip()
            if title and summary:
                slug = slugify(title)
                writer.add_document(
                    slug=slug, text=title, definition=summary, length=len(slug)
                )

    # Add lookups from a phrase to a word in that phrase
    for slug, freq, text in tqdm(WORDS.iter_all_by_freq(), desc='phrases'):
        words = text.split()
        if freq < 10000:
            # The first entry below the threshold ends the scan.
            break
        if len(words) > 1:
            for word in words:
                if WORDS.logprob(word) < -7:
                    writer.add_document(
                        slug=slug, text=word, definition=text, length=len(slug)
                    )

    # Add crossword clues
    for corpus in ('crossword_clues.txt', 'more_crossword_clues.txt'):
        with open(corpus_path(corpus), encoding='utf-8') as clue_file:
            for line in tqdm(clue_file, desc=corpus):
                text, defn = line.rstrip().split('\t')
                slug = slugify(text)
                writer.add_document(
                    slug=slug, text=text, definition=defn, length=len(slug)
                )

    # Add WordNet glosses and links
    synsets = wordnet.all_synsets()
    for syn in tqdm(synsets, desc='wordnet'):
        lemmas = [lem.replace('_', ' ') for lem in syn.lemma_names()]
        related = [lem.replace('_', ' ') for lem in get_adjacent(syn)]
        related2 = lemmas + related
        links = ', '.join(related2).upper()

        # The definition text is the gloss, then each quoted example,
        # then the upper-cased list of lemmas and related lemmas.
        defn_parts = [syn.definition()]
        for example in syn.examples():
            defn_parts.append('"%s"' % example)
        defn_parts.append(links)
        defn = '; '.join(defn_parts)

        for name in lemmas:
            this_slug = slugify(name)
            writer.add_document(
                slug=this_slug, text=name.upper(), definition=defn,
                length=len(this_slug)
            )

    print("Committing.")
    writer.commit(optimize=True)
    return ix
def phonespell(text):
    """
    Convert letters to the digits 2-9 on a phone keypad.

    The text is slugified, and each character of the slug is replaced by
    its entry in `PHONESPELL_MAP`.
    """
    digits = [PHONESPELL_MAP[letter] for letter in slugify(text)]
    return ''.join(digits)