def test_tokenise_words(self):
    """tokenize_words splits raw text into word and punctuation tokens.

    Covers: empty string, whitespace-only input, contractions (split at
    the apostrophe, e.g. "don't" -> "don", "'", "t"), and the Sam-I-Am
    test text where hyphens in compounds become separate tokens.
    """
    test_sentences = [
        ('', []),
        (' ', []),
        ("I don't like it. You'll agree?",
         ["I", "don", "'", "t", "like", "it", ".",
          "You", "'", "ll", "agree", "?"]),
        (test_text,
         ['I', 'am', 'Sam', '.', 'I', 'am', 'Sam', '.',
          'Sam', '-', 'I', '-', 'Am', '.',
          'That', 'Sam', '-', 'I', '-', 'Am', '!',
          'That', 'Sam', '-', 'I', '-', 'Am', '!',
          'I', 'do', 'not', 'like', 'that', 'Sam', '-', 'I', '-', 'Am', '!',
          'Do', 'you', 'like', 'green', 'eggs', 'and', 'ham', '?',
          'I', 'do', 'not', 'like', 'them', ',',
          'Sam', '-', 'I', '-', 'Am', '.',
          'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'ham', '.']),
    ]
    for test_sentence, expected in test_sentences:
        tokens = util.tokenize_words(test_sentence)
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(tokens, expected,
                         msg=(test_sentence, tokens, '!=', expected))
def test_pos_distribution(self):
    """get_pos_counts yields per-tag relative frequencies over the tokens.

    Each expected dict maps "<TAG>_pos_relative_frequency" to
    count(tag) / total_tokens for the tags in tag_list (0 for absent tags).
    """
    # TODO: more sentences
    tag_list = ["NN", "DT", "VBD", "PRP", "VBP", "RB",
                "IN", "NNS", "CC", "NNP", "JJ", "VB"]
    test_sentences = [
        # Single word: all mass on NN.
        # NOTE: the original used `key is not "NN"` — an identity
        # comparison on a string literal that only happened to work via
        # CPython interning; `!=` is the correct equality test.
        ("dog", {key + "_pos_relative_frequency": 0 if key != "NN" else 1.0
                 for key in tag_list}),
        ('', {key + "_pos_relative_frequency": 0 for key in tag_list}),
        ('the quick brown fox jumped.',
         {"PRP_pos_relative_frequency": 0,
          "NN_pos_relative_frequency": 2.0 / 5,
          "DT_pos_relative_frequency": 1.0 / 5,
          "VBP_pos_relative_frequency": 0,
          "RB_pos_relative_frequency": 0,
          "IN_pos_relative_frequency": 0,
          "NNS_pos_relative_frequency": 0,
          "CC_pos_relative_frequency": 0,
          "NNP_pos_relative_frequency": 0,
          "JJ_pos_relative_frequency": 1.0 / 5,
          "VB_pos_relative_frequency": 0,
          "VBD_pos_relative_frequency": 1.0 / 5}),
        (test_text,
         {"PRP_pos_relative_frequency": 12.0 / 48,
          "NN_pos_relative_frequency": 5.0 / 48,
          "DT_pos_relative_frequency": 3.0 / 48,
          "VBP_pos_relative_frequency": 8.0 / 48,
          "RB_pos_relative_frequency": 3.0 / 48,
          "IN_pos_relative_frequency": 3.0 / 48,
          "NNS_pos_relative_frequency": 2.0 / 48,
          "CC_pos_relative_frequency": 2.0 / 48,
          "NNP_pos_relative_frequency": 7.0 / 48,
          "JJ_pos_relative_frequency": 2.0 / 48,
          "VB_pos_relative_frequency": 1.0 / 48,
          "VBD_pos_relative_frequency": 0}),
    ]
    for test_sentence, expected in test_sentences:
        pos_freq_dis = compute_fingerprint.get_pos_counts(
            nltk.pos_tag(util.tokenize_words(test_sentence)), tag_list)
        # Removed a stray debug `print` statement left in the loop.
        self.assertEqual(pos_freq_dis, expected,
                         msg=(test_sentence, pos_freq_dis, '!=', expected))
def test_function_word_distribution(self):
    """get_function_word_distribution yields per-word relative frequencies.

    Each expected dict maps "<word>_relative_frequency" to
    count(word) / total_tokens for the function words in tag_list
    (0 for words that do not appear). Matching is case-insensitive
    for the test_text case ("I" counts toward "i").
    """
    # TODO: more sentences
    tag_list = ["into", "i", "that", "like", "you", "and", "them"]
    test_sentences = [
        ("into the dog",
         {"into_relative_frequency": 1.0 / 3,
          "i_relative_frequency": 0,
          "that_relative_frequency": 0,
          "like_relative_frequency": 0,
          "you_relative_frequency": 0,
          "and_relative_frequency": 0,
          "them_relative_frequency": 0}),
        ('',
         {"into_relative_frequency": 0,
          "i_relative_frequency": 0,
          "that_relative_frequency": 0,
          "like_relative_frequency": 0,
          "you_relative_frequency": 0,
          "and_relative_frequency": 0,
          "them_relative_frequency": 0}),
        (test_text,
         {"into_relative_frequency": 0,
          "i_relative_frequency": 10.0 / 48,
          "that_relative_frequency": 3.0 / 48,
          "like_relative_frequency": 3.0 / 48,
          "you_relative_frequency": 1.0 / 48,
          "and_relative_frequency": 2.0 / 48,
          "them_relative_frequency": 1.0 / 48}),
    ]
    for test_sentence, expected in test_sentences:
        function_word_dist = compute_fingerprint.get_function_word_distribution(
            nltk.pos_tag(util.tokenize_words(test_sentence)), tag_list)
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(function_word_dist, expected,
                         msg=(test_sentence, function_word_dist, '!=', expected))