コード例 #1
0
def token_link_text(text, distance=10):
    '''Run ``text`` through the cleaning helpers and hand it to ``link_op``.

    The text is passed to ``remove_punctuation``, the result to
    ``stop_word_placeheld``, and that result to ``link_op``.  ``distance``
    is forwarded unchanged to ``link_op`` as a keyword argument.

    Returns whatever ``link_op`` returns.
    NOTE(review): presumably a JSON-style structure combining token
    frequencies and token links -- confirm against ``link_op``.
    '''
    return link_op(
        stop_word_placeheld(remove_punctuation(text)),
        distance=distance,
    )
コード例 #2
0
 def test_freq_dist_dict_full(self):
     '''Build a token index from the 2011-1-19 raw text and dump it to disk.

     Reads the fixture located under ``base_resources``, decodes it as
     UTF-8, passes it through ``remove_punctuation`` and ``stop_words``,
     and writes the ``pformat``-ed result of ``token_index`` to a file
     under ``target_out``.  No assertions are made here; the test fails
     only if one of these steps raises.
     '''
     source_path = '{}{}'.format(base_resources, '2011-1-19raw.txt')
     with open(source_path, 'r') as raw_file:
         decoded = raw_file.read().decode('utf-8')
         index = token_index(stop_words(remove_punctuation(decoded)))

         # The dump is written while the input file is still open.
         dump_path = '{}{}'.format(target_out, '2011-1-19token_index')
         with open(dump_path, 'w') as out_file:
             out_file.write(pformat(index))
コード例 #3
0
    def test_freq_dist_dict_full(self):
        '''Check the count for ``u'year'`` before and after punctuation removal.

        Reads the 2011-1-19 fixture under ``base_resources`` and decodes it
        as UTF-8.  The frequency distribution is computed twice from the
        whitespace-split output of ``stop_words``:

        * on the decoded text as-is, where the count for ``u'year'`` must be
          strictly between 8 and 12;
        * on the text after ``remove_punctuation``, where it must exceed 16.

        The second distribution is then ``pformat``-ed and written to a file
        under ``target_out``.
        '''
        source_path = '{}{}'.format(base_resources, '2011-1-19raw.txt')
        with open(source_path, 'r') as raw_file:
            decoded = raw_file.read().decode('utf-8')

            # First pass: punctuation is still present in the text.
            raw_counts = freq_dist_dict(stop_words(decoded).split())
            self.assertGreater(raw_counts[u'year'], 8)
            self.assertLess(raw_counts[u'year'], 12)

            # Second pass: same pipeline after punctuation has been removed.
            cleaned = remove_punctuation(decoded)
            clean_counts = freq_dist_dict(stop_words(cleaned).split())
            self.assertGreater(clean_counts[u'year'], 16)

            dump_path = '{}{}'.format(target_out, '2011-1-19freq_dist_dict')
            with open(dump_path, 'w') as out_file:
                out_file.write(pformat(clean_counts))
コード例 #4
0
def create_tokens(text):
    '''Return ``freq_dist_dict`` applied to the cleaned form of ``text``.

    ``text`` is passed through ``remove_punctuation`` and then
    ``stop_word_placeheld``; the outcome is given to ``freq_dist_dict``
    and its result is returned as-is.
    '''
    without_punctuation = remove_punctuation(text)
    placeheld = stop_word_placeheld(without_punctuation)
    return freq_dist_dict(placeheld)
コード例 #5
0
 def test_punctuation_removal_unicode(self):
     '''``remove_punctuation`` on a unicode input yields the expected text.

     ``self.ick_str`` is converted with ``unicode`` and passed to
     ``remove_punctuation`` together with ``punct=self.punct``; the result
     must equal ``unicode(self.good_str)``.
     '''
     cleaned = remove_punctuation(unicode(self.ick_str), punct=self.punct)
     expected = unicode(self.good_str)
     self.assertEqual(cleaned, expected)