Example #1
0
 def test_tokenize(self):
   """Tokenizing a unicode string splits on whitespace and dots; bytes raise."""
   cases = [
       (u'one', ['one']),
       (u'one two', ['one', 'two']),
       (u'one  two', ['one', 'two']),
       (u'    one  two  ', ['one', 'two']),
       (u'one.two', ['one', 'two']),
       (u'one...two', ['one', 'two']),
       (u'..one...two', ['one', 'two']),
   ]
   for given, expected in cases:
     self.assertEqual(text.tokenize(given), expected)
   # Non-unicode input is rejected outright.
   self.assertRaises(TypeError, text.tokenize, 'one two')
Example #2
0
def get_year_counts_for_phrase(corpus, phrase):
  """Return a {year: count} dict for *phrase* within *corpus*.

  Normalizes *phrase* (lowercase, text.rewrite, text.tokenize), then scans the
  yearly ngram file under FLAGS.ngram_dir for tab-separated lines of the form
  '<ngram>\\t<corpus>\\t<year>\\t<count>\\t<rest>' and collects matching counts.
  """
  phrase = phrase.lower()
  phrase = text.rewrite(phrase)
  tokens = text.tokenize(phrase)
  query = ' '.join(tokens)
  # Escape the interpolated values: regex metacharacters in the phrase or the
  # corpus name must match literally, not corrupt the pattern.
  pattern = '%s\t%s\t([0-9]+)\t([0-9]+)\t(.*)' % (
      re.escape(query), re.escape(corpus))
  regex = re.compile(pattern)
  ngram_path = os.path.join(FLAGS.ngram_dir, 'ngrams_yearly.txt')
  counts = {}
  # Open in text mode: the compiled pattern is a text regex, and matching it
  # against bytes lines (mode 'rb') raises TypeError on Python 3.
  with open(ngram_path, 'r') as ngrams:
    for line in ngrams:
      match = regex.match(line)
      if match:
        year = int(match.group(1))
        count = int(match.group(2))
        counts[year] = count
  return counts
Example #3
0
def get_year_counts_for_phrases(corpus, phrases):
    """Fetch normalized yearly counts for every phrase in *phrases*.

    Returns a dict mapping phrase -> {year: (count / yearly_total, postids)}.
    """
    # One (initially empty) per-year map for every requested phrase.
    counts = {original: {} for original in phrases}
    # Map each phrase's canonical form (lowercased, rewritten, tokenized,
    # space-joined) back to the original phrase so datastore rows can be
    # attributed. NOTE(review): two phrases with the same canonical form
    # collide here and only the last one is kept — confirm that is intended.
    canonical_to_phrase = {}
    for original in phrases:
        cooked = text.rewrite(original.lower())
        canonical = " ".join(text.tokenize(cooked))
        canonical_to_phrase[canonical] = original
    query = NGramCount.all()
    query.filter("site =", corpus)
    query.filter("ngram IN", canonical_to_phrase.keys())
    for row in query.run(batch_size=10000):
        total = get_ngram_total_for_year(len(row.ngram.split()), corpus, row.year)
        phrase = canonical_to_phrase[row.ngram]
        counts[phrase][row.year] = (float(row.count) / total, row.postids)
    return counts