Example 1
 def test_tokenize_upper_case_times(self):
     # A list of date strings in mixed upper and lower case
     text = ['2010-10-25 Jan 01 JUN 1st Its Fri jan 1st JAn 3rd 1995-01-13']
     self.assertEqual(pipe_token.tokenize_words(text, keep_date=True), [
         '2010-10-25', 'Jan 01', 'JUN 1st', 'Its', 'Fri', 'jan 1st',
         'JAn 3rd', '1995-01-13'
     ])
Example 2
 def test_tokenize_times(self):
     # The list of times and other information
     text = ['2010-10-25 Jan 01 Jun 1st Its Fri Jan 1st Jan 3rd 1995-01-13']
     self.assertEqual(pipe_token.tokenize_words(text, keep_date=True), [
         '2010-10-25', 'Jan 01', 'Jun 1st', 'Its', 'Fri', 'Jan 1st',
         'Jan 3rd', '1995-01-13'
     ])
Example 3
from collections import Counter

import pipe_token  # project tokenization helpers (assumed importable)
import statistic   # project statistics helpers (assumed importable)


def count_word_frequency(text, file_name='word_frequency', static=False):
    """
    Count the word frequency and return a dictionary mapping each token to
    its frequency.
    :param text: the list of raw strings to tokenize
    :param file_name: base name of the output file (default 'word_frequency')
    :param static: if True, compute the frequencies through the statistics
        module instead of a plain Counter
    :return: a dictionary mapping each token to its frequency
    """
    # Tokenize every entry and collect the tokens into one flat list
    tokens = []
    for data in text:
        tokens.extend(pipe_token.tokenize_words([data]))
    # Remove the punctuation tokens
    tokens = pipe_token.remove_punct(tokens)
    if static:
        word_freq = statistic.do_statistic_single(tokens, file_name)
    else:
        word_freq = dict(Counter(tokens))
    return word_freq
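A minimal usage sketch, assuming the modules above are importable and that pipe_token.tokenize_words and pipe_token.remove_punct behave as the tests in this listing show:

text = ['I have a dream.', 'I have a dream.']
word_freq = count_word_frequency(text)
# With the tokenization shown in Example 8 below and punctuation removed,
# this yields {'I': 2, 'have': 2, 'a': 2, 'dream': 2}
print(word_freq)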
Example 4
 def test_None_tokenize(self):
     # None
     with self.assertRaises(ValueError):
         pipe_token.tokenize_words(None)
Example 5
 def test_type_wrong_tokenize(self):
     # wrong type
     with self.assertRaises(ValueError):
         pipe_token.tokenize_words(123)
Example 6
 def test_empty_tokenize(self):
     with self.assertRaises(ValueError):
         pipe_token.tokenize_words('')
Example 7
 def test_tokenize_time(self):
     # A list containing date and time tokens
     text = ['2010-10-25 Jan 01 Jun 1st Its Fri']
     self.assertEqual(pipe_token.tokenize_words(text, keep_date=True),
                      ['2010-10-25', 'Jan 01', 'Jun 1st', 'Its', 'Fri'])
Example 8
 def test_tokenize(self):
     # raw data
     text = ['I have a dream.']
     self.assertEqual(pipe_token.tokenize_words(text),
                      ['I', 'have', 'a', 'dream', '.'])
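None of these examples show how pipe_token.tokenize_words is implemented. Below is a self-contained regex sketch that reproduces the behavior exercised by the tests above; the function name, the validation, and the date pattern are assumptions for illustration, not the project's actual code:

import re

# Date-aware token pattern: ISO dates, month-name dates, words, punctuation
_TOKEN_WITH_DATES = re.compile(
    r"\d{4}-\d{2}-\d{2}"                      # ISO dates such as 2010-10-25
    r"|[A-Za-z]{3} \d{1,2}(?:st|nd|rd|th)?"   # month-name dates such as Jan 1st
    r"|\w+"                                   # ordinary words
    r"|[^\w\s]"                               # punctuation marks as single tokens
)
_TOKEN_PLAIN = re.compile(r"\w+|[^\w\s]")


def tokenize_words_sketch(text, keep_date=False):
    # Mirror the error handling in Examples 4-6: reject None, non-list
    # inputs, and empty inputs with a ValueError
    if not text or not isinstance(text, list):
        raise ValueError('text must be a non-empty list of strings')
    pattern = _TOKEN_WITH_DATES if keep_date else _TOKEN_PLAIN
    tokens = []
    for entry in text:
        tokens.extend(pattern.findall(entry))
    return tokens

# tokenize_words_sketch(['I have a dream.'])
# -> ['I', 'have', 'a', 'dream', '.']
# tokenize_words_sketch(['2010-10-25 Jan 01 Jun 1st Its Fri'], keep_date=True)
# -> ['2010-10-25', 'Jan 01', 'Jun 1st', 'Its', 'Fri']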