def get_year_counts_for_phrase(corpus, phrase):
    """Return the per-year counts of ``phrase`` within ``corpus``.

    The phrase is lower-cased, passed through ``text.rewrite`` and
    ``text.tokenize``, and the resulting tokens are joined with single
    spaces to form the n-gram that is looked up in
    ``<FLAGS.ngram_dir>/ngrams_yearly.txt``.

    A line of that file is counted when it starts with the tab-separated
    fields ``ngram``, ``corpus``, ``year`` (digits), ``count`` (digits),
    followed by a tab and the rest of the line (which is ignored here).

    Args:
        corpus: Corpus identifier compared against the second field.
        phrase: Raw phrase to look up.

    Returns:
        A dict mapping year (int) to count (int). If several lines match
        the same year, the last matching line wins.
    """
    tokens = text.tokenize(text.rewrite(phrase.lower()))
    query = ' '.join(tokens)

    # The n-gram and corpus must match literally: escape them so that regex
    # metacharacters in a phrase (e.g. "c++", "a.b") can neither break
    # compilation nor match unrelated rows.
    prefix = re.escape('%s\t%s' % (query, corpus))
    pattern = re.compile(prefix + '\t([0-9]+)\t([0-9]+)\t(.*)')

    ngram_path = os.path.join(FLAGS.ngram_dir, 'ngrams_yearly.txt')
    counts = {}
    # NOTE(review): the file is read in binary mode but matched against a
    # str pattern; that combination only works on Python 2 -- confirm the
    # target interpreter before porting.
    with open(ngram_path, 'rb') as ngrams:
        for line in ngrams:
            match = pattern.match(line)
            if match:
                counts[int(match.group(1))] = int(match.group(2))
    return counts
def get_year_counts_for_phrases(corpus, phrases):
    """Fetch relative yearly n-gram frequencies for a collection of phrases.

    Every phrase is lower-cased, run through ``text.rewrite`` and
    ``text.tokenize`` and re-joined with single spaces; the resulting
    n-grams are queried from ``NGramCount`` for the given ``corpus``
    (stored in the ``site`` field).

    Args:
        corpus: Value matched against the ``site`` field of ``NGramCount``.
        phrases: Phrases to look up.

    Returns:
        A dict ``{phrase: {year: (frequency, postids)}}`` keyed by the
        phrases as passed in. ``frequency`` is the stored count divided by
        ``get_ngram_total_for_year`` for that n-gram length, corpus and
        year. Phrases without any stored rows map to an empty dict.
    """
    # Maps each normalised n-gram back to the phrase the caller supplied.
    # When two phrases normalise to the same n-gram, the later one wins.
    phrase_by_ngram = {}
    for original in phrases:
        normalised = text.rewrite(original.lower())
        phrase_by_ngram[" ".join(text.tokenize(normalised))] = original

    # Every requested phrase gets an entry, even if no rows come back.
    year_counts = {original: {} for original in phrases}

    ngram_query = NGramCount.all()
    ngram_query.filter("site =", corpus)
    ngram_query.filter("ngram IN", phrase_by_ngram.keys())

    for row in ngram_query.run(batch_size=10000):
        ngram_size = len(row.ngram.split())
        year_total = get_ngram_total_for_year(ngram_size, corpus, row.year)
        frequency = float(row.count) / year_total
        year_counts[phrase_by_ngram[row.ngram]][row.year] = (
            frequency, row.postids)
    return year_counts
def test_rewrite(self):
    # Exercises text.rewrite on a negative contraction and on "'s"
    # suffixes; cases are (input, expected output) and run in order.
    cases = (
        ("isn't it", 'is not it'),
        # NOTE(review): same input as the case above -- possibly meant to
        # cover a different apostrophe character; confirm intent.
        ("isn't it", 'is not it'),
        ("john's", 'john'),
        ("'s's's's", ''),
    )
    for raw, expected in cases:
        self.assertEqual(text.rewrite(raw), expected)