Beispiel #1
0
def get_year_counts_for_phrase(corpus, phrase):
  """Return the per-year counts of a phrase read from the yearly n-gram file.

  The phrase is lower-cased, passed through text.rewrite and text.tokenize,
  and the resulting tokens are joined with single spaces to form the n-gram
  that is looked up. Lines of `ngrams_yearly.txt` (inside FLAGS.ngram_dir)
  are matched from their start against the tab-separated layout
  `ngram <TAB> corpus <TAB> year <TAB> count <TAB> rest`.

  Args:
    corpus: Corpus identifier; compared literally with the second field.
    phrase: Phrase to look up.

  Returns:
    A dict mapping year (int) to count (int). If several matching lines
    carry the same year, the last one read wins. Empty when nothing matches.
  """
  tokens = text.tokenize(text.rewrite(phrase.lower()))
  query = ' '.join(tokens)
  # Escape both interpolated values: tokens or corpus names containing regex
  # metacharacters (e.g. "c++", ".", "(") must be matched literally instead
  # of being interpreted as pattern syntax.
  line_pattern = re.compile(
      r'%s\t%s\t([0-9]+)\t([0-9]+)\t(.*)' % (re.escape(query),
                                              re.escape(corpus)))
  ngram_path = os.path.join(FLAGS.ngram_dir, 'ngrams_yearly.txt')
  counts = {}
  # NOTE(review): the file is read in binary mode while the pattern is built
  # from text; that combination only works on Python 2 -- confirm the target
  # interpreter before porting.
  with open(ngram_path, 'rb') as ngrams:
    for line in ngrams:
      match = line_pattern.match(line)
      if match:
        counts[int(match.group(1))] = int(match.group(2))
  return counts
Beispiel #2
0
def get_year_counts_for_phrases(corpus, phrases):
    """Fetch per-year relative frequencies for several phrases in one query.

    Each phrase is lower-cased, run through text.rewrite and text.tokenize,
    and its tokens are joined with single spaces; NGramCount is then queried
    for rows whose site equals `corpus` and whose ngram is one of those
    normalized strings.

    Args:
        corpus: Value matched against the "site" property of NGramCount.
        phrases: Iterable of phrases; it is traversed twice.

    Returns:
        A dict of the form {phrase: {year: (count / total, postids)}}, where
        total comes from get_ngram_total_for_year for the n-gram's token
        count, the corpus and the year. Every input phrase has an entry
        (possibly empty). When two phrases normalize to the same n-gram,
        results are attributed to the one that appears later in `phrases`.
    """
    # Remember which original phrase each normalized n-gram string came from.
    phrase_by_ngram = {}
    for original in phrases:
        normalized = text.rewrite(original.lower())
        phrase_by_ngram[" ".join(text.tokenize(normalized))] = original

    results_by_phrase = {original: {} for original in phrases}

    ngram_query = NGramCount.all()
    ngram_query.filter("site =", corpus)
    ngram_query.filter("ngram IN", phrase_by_ngram.keys())
    for row in ngram_query.run(batch_size=10000):
        ngram_size = len(row.ngram.split())
        year_total = get_ngram_total_for_year(ngram_size, corpus, row.year)
        entry = (float(row.count) / year_total, row.postids)
        results_by_phrase[phrase_by_ngram[row.ngram]][row.year] = entry
    return results_by_phrase
Beispiel #3
0
 def test_rewrite(self):
   """Check text.rewrite against expected outputs for apostrophe inputs."""
   # NOTE(review): the first two cases are identical; one of them may have
   # been meant to use a different apostrophe character -- confirm.
   cases = (
     ("isn't it", 'is not it'),
     ("isn't it", 'is not it'),
     ("john's", 'john'),
     ("'s's's's", ''),
   )
   for raw, expected in cases:
     self.assertEqual(text.rewrite(raw), expected)