コード例 #1
0
 def test_tokenise_words(self):
     """util.tokenize_words should split text into word and punctuation tokens.

     Covers: empty string, whitespace-only string, contractions (split at the
     apostrophe), and the longer ``test_text`` fixture with hyphenated names.
     """
     test_sentences = [
         ('', []),
         (' ', []),
         ("I don't like it. You'll agree?", ["I", "don", "'", "t", "like", "it", ".", "You", "'", "ll", "agree", "?"]),
         (test_text,
          ['I', 'am', 'Sam', '.', 'I', 'am', 'Sam', '.', 'Sam', '-', 'I', '-', 'Am', '.',
           'That', 'Sam', '-', 'I', '-', 'Am', '!', 'That', 'Sam', '-', 'I', '-', 'Am', '!',
           'I', 'do', 'not', 'like', 'that', 'Sam', '-', 'I', '-', 'Am', '!',
           'Do', 'you', 'like', 'green', 'eggs', 'and', 'ham', '?',
           'I', 'do', 'not', 'like', 'them', ',', 'Sam', '-', 'I', '-', 'Am', '.',
           'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '.'])
     ]
     for test_sentence, result in test_sentences:
         tokens = util.tokenize_words(test_sentence)
         # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
         self.assertEqual(tokens, result, msg=(test_sentence, tokens, '!=', result))
コード例 #2
0
 def test_pos_distribution(self):
     """Relative POS-tag frequencies from compute_fingerprint.get_pos_counts.

     Each expected value is tag_count / total_token_count for the tags in
     ``tag_list``; tags that never occur must map to 0.
     """
     # TODO: more sentences
     tag_list = ["NN", "DT", "VBD", "PRP", "VBP", "RB", "IN", "NNS", "CC", "NNP", "JJ", "VB"]
     test_sentences = [
         ("dog",
          # BUG FIX: was `key is not "NN"` -- string identity comparison is
          # implementation-defined (and a SyntaxWarning since 3.8); use !=.
          {key+"_pos_relative_frequency": 0 if key != "NN" else 1.0 for key in tag_list}),
         ('', {key+"_pos_relative_frequency": 0 for key in tag_list}),
         ('the quick brown fox jumped.',
          {"PRP_pos_relative_frequency": 0,
           "NN_pos_relative_frequency": 2.0/5,
           "DT_pos_relative_frequency": 1.0/5,
           "VBP_pos_relative_frequency": 0,
           "RB_pos_relative_frequency": 0,
           "IN_pos_relative_frequency": 0,
           "NNS_pos_relative_frequency": 0,
           "CC_pos_relative_frequency": 0,
           "NNP_pos_relative_frequency": 0,
           "JJ_pos_relative_frequency": 1.0/5,
           "VB_pos_relative_frequency": 0,
           "VBD_pos_relative_frequency": 1.0/5}),
         (test_text,
          {"PRP_pos_relative_frequency": 12.0/48,
           "NN_pos_relative_frequency": 5.0/48,
           "DT_pos_relative_frequency": 3.0/48,
           "VBP_pos_relative_frequency": 8.0/48,
           "RB_pos_relative_frequency": 3.0/48,
           "IN_pos_relative_frequency": 3.0/48,
           "NNS_pos_relative_frequency": 2.0/48,
           "CC_pos_relative_frequency": 2.0/48,
           "NNP_pos_relative_frequency": 7.0/48,
           "JJ_pos_relative_frequency": 2.0/48,
           "VB_pos_relative_frequency": 1.0/48,
           "VBD_pos_relative_frequency": 0})
     ]
     for test_sentence, result in test_sentences:
         pos_freq_dis = compute_fingerprint.get_pos_counts(nltk.pos_tag(util.tokenize_words(test_sentence)), tag_list)
         # (removed a stray bare `print` debug leftover that was a no-op here)
         # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
         self.assertEqual(pos_freq_dis, result, msg=(test_sentence, pos_freq_dis, '!=', result))
コード例 #3
0
 def test_function_word_distribution(self):
     """Relative frequencies of selected function words in tokenized text.

     Expected values are word_count / total_token_count for each word in
     ``tag_list``; absent words must map to 0 (including for empty input).
     """
     # TODO: more sentences
     tag_list = ["into", "i", "that", "like", "you", "and", "them"]
     test_sentences = [
         ("into the dog",
          {"into_relative_frequency": 1.0/3,
           "i_relative_frequency": 0,
           "that_relative_frequency": 0,
           "like_relative_frequency": 0,
           "you_relative_frequency": 0,
           "and_relative_frequency": 0,
           "them_relative_frequency": 0}),
         ('', {"into_relative_frequency": 0,
               "i_relative_frequency": 0,
               "that_relative_frequency": 0,
               "like_relative_frequency": 0,
               "you_relative_frequency": 0,
               "and_relative_frequency": 0,
               "them_relative_frequency": 0}),
         (test_text,
          {"into_relative_frequency": 0,
           "i_relative_frequency": 10.0/48,
           "that_relative_frequency": 3.0/48,
           "like_relative_frequency": 3.0/48,
           "you_relative_frequency": 1.0/48,
           "and_relative_frequency": 2.0/48,
           "them_relative_frequency": 1.0/48})
     ]
     for test_sentence, result in test_sentences:
         function_word_dist = compute_fingerprint.get_function_word_distribution(nltk.pos_tag(util.tokenize_words(test_sentence)), tag_list)
         # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
         self.assertEqual(function_word_dist, result, msg=(test_sentence, function_word_dist, '!=', result))