def token_link_text(text, distance=10):
    '''Combine token frequency and token links together into a single
    JSON format.'''
    text = remove_punctuation(text)
    token_list = stop_word_placeheld(text)
    return link_op(token_list, distance=distance)
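# A minimal sketch of the linking step, assuming link_op pairs each token
# with the tokens that follow it within `distance` positions and tallies the
# co-occurrences. The name _link_op_sketch is hypothetical; the real link_op
# is defined elsewhere in this package and may differ.
from collections import defaultdict

def _link_op_sketch(token_list, distance=10):
    links = defaultdict(int)
    for i, token in enumerate(token_list):
        # Look ahead at most `distance` tokens for link partners.
        for other in token_list[i + 1:i + 1 + distance]:
            links[(token, other)] += 1
    return links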
def test_token_index_full(self):
    '''Index tokens from a full raw document and dump the result.'''
    with open('{}{}'.format(base_resources, '2011-1-19raw.txt'), 'r') as f:
        text = f.read().decode('utf-8')
    text = remove_punctuation(text)
    stopped = stop_words(text)
    ti = token_index(stopped)
    #print(pformat(ti), file=stderr)
    with open('{}{}'.format(target_out, '2011-1-19token_index'),
              'w') as out_file:
        out_file.write(pformat(ti))
def test_freq_dist_dict_full(self):
    '''Count token frequencies before and after punctuation removal.'''
    with open('{}{}'.format(base_resources, '2011-1-19raw.txt'), 'r') as f:
        text = f.read().decode('utf-8')
    # On the still-punctuated text, variants such as u'year,' and u'year'
    # count as separate tokens, so the count for u'year' stays low.
    stopped = stop_words(text)
    freq_dist = freq_dist_dict(stopped.split())
    #print(pformat(freq_dist), file=stderr)
    self.assertGreater(freq_dist[u'year'], 8)
    self.assertLess(freq_dist[u'year'], 12)
    # After punctuation removal those variants collapse into one token,
    # so the count for u'year' rises.
    text = remove_punctuation(text)
    stopped = stop_words(text)
    freq_dist = freq_dist_dict(stopped.split())
    #print(pformat(freq_dist), file=stderr)
    self.assertGreater(freq_dist[u'year'], 16)
    with open('{}{}'.format(target_out, '2011-1-19freq_dist_dict'),
              'w') as out_file:
        out_file.write(pformat(freq_dist))
def create_tokens(text):
    '''Build a token frequency distribution from raw text.'''
    text = remove_punctuation(text)
    token_list = stop_word_placeheld(text)
    return freq_dist_dict(token_list)
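# A minimal sketch of the frequency-distribution step, assuming freq_dist_dict
# maps each token to its occurrence count -- the tests above index the result
# like a dict, e.g. freq_dist[u'year']. The name _freq_dist_dict_sketch is
# hypothetical; the real freq_dist_dict lives elsewhere in this package.
from collections import Counter

def _freq_dist_dict_sketch(token_list):
    # Counter behaves like a dict of token -> count, matching how the
    # tests read freq_dist[u'year'] as an integer.
    return Counter(token_list)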
def test_punctuation_removal_unicode(self):
    '''remove_punctuation should accept unicode input and return unicode.'''
    x = unicode(self.ick_str)
    out = remove_punctuation(x, punct=self.punct)
    self.assertEqual(out, unicode(self.good_str))
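# A minimal sketch of remove_punctuation, assuming it drops every character
# found in `punct` from the input text. The test above only fixes the
# signature remove_punctuation(text, punct=...), so this body is an
# assumption; _remove_punctuation_sketch is a hypothetical name.
import string

def _remove_punctuation_sketch(text, punct=string.punctuation):
    # Works for str and unicode alike, since both iterate by character.
    return ''.join(ch for ch in text if ch not in punct)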