/
sentence_parser.py
127 lines (106 loc) · 4.45 KB
/
sentence_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from analogy_strings import analogy_string_list
from nltk.parse import stanford
import nltk
from nltk.tree import ParentedTree
from nltk.stem.wordnet import WordNetLemmatizer
from personal import root
import functions
# Labels that is_noun() accepts: the noun part-of-speech tags plus the "NP"
# (noun phrase) constituent label.
noun_labels = {"NN", "NNP", "NNPS", "NNS", "NP"}
# Labels that is_adj() accepts: the adjective part-of-speech tags.
adj_labels = {"JJ", "JJR", "JJS"}
def is_noun(label):
    """Tell whether `label` is one of the noun labels.

    Args:
        label: a tag or constituent label string.

    Returns:
        True when `label` is a member of the module-level `noun_labels`
        set, False otherwise.
    """
    belongs_to_nouns = label in noun_labels
    return belongs_to_nouns
# Labels that is_verb() accepts. Kept at module level, like noun_labels and
# adj_labels, so the set is built once rather than on every call.
verb_labels = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}


def is_verb(label):
    """Tell whether `label` is one of the verb part-of-speech tags.

    Args:
        label: a tag string.

    Returns:
        True when `label` is one of VB, VBD, VBG, VBN, VBP or VBZ,
        False otherwise.
    """
    return label in verb_labels
def is_adj(label):
    """Tell whether `label` is one of the adjective labels.

    Args:
        label: a tag string.

    Returns:
        True when `label` is a member of the module-level `adj_labels`
        set, False otherwise.
    """
    belongs_to_adjectives = label in adj_labels
    return belongs_to_adjectives
def prepositions_set(filename):
    """Load a set of prepositions from a text file with one entry per line.

    Args:
        filename: path of the file to read.

    Returns:
        A set holding each line of the file with its trailing newline
        removed. A blank line contributes the empty string.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, "r") as prepositions_file:
        # Strip only the newline. Slicing off the last character
        # unconditionally would truncate a final line that has no trailing
        # newline.
        return {line.rstrip("\n") for line in prepositions_file}
# prepositions = prepositions_set("prepositions.txt")
def is_conventional_preposition(word):
    """Tell whether `word` is in the set of known prepositions.

    The comparison is case-insensitive on `word` (it is lower-cased before
    the lookup).

    The set is the module-level `prepositions`. When that name has not been
    defined, it is loaded once through prepositions_set() and cached in the
    module-level name, so later calls do not re-read the file.

    Args:
        word: the word (or phrase) to look up.

    Returns:
        True when the lower-cased `word` is in the preposition set.

    Raises:
        OSError: if the set has to be loaded and the file cannot be opened.
    """
    global prepositions
    try:
        known = prepositions
    except NameError:
        # Nothing in the module assigns `prepositions`, so a direct lookup
        # would fail on every call.
        # NOTE(review): the file name comes from the commented-out
        # module-level assignment and is resolved against the current
        # working directory -- confirm this is the intended location.
        known = prepositions = prepositions_set("prepositions.txt")
    return word.lower() in known
def convert_to_base_form(word, type):
    """Reduce a verb or an adjective to its base form.

    Args:
        word: the word to lemmatize.
        type: part of speech handed to the WordNet lemmatizer; should be
            'v' for a verb or 'a' for an adjective.

    Returns:
        Whatever WordNetLemmatizer.lemmatize returns for `word` and `type`.
    """
    return WordNetLemmatizer().lemmatize(word, type)
def get_analogy_sentence(para, pattern_list):
    """Find the first sentence of a paragraph that contains an analogy pattern.

    Sentences and patterns are both joined with single spaces and compared
    by plain substring search, so a pattern can also match across word
    boundaries (e.g. "is like" is found inside "this likes").

    Args:
        para: the paragraph as a sequence of sentences, each sentence being
            a sequence of word strings.
        pattern_list: the patterns to look for, each a sequence of word
            strings.

    Returns:
        A two-element list [sentence, position], where `sentence` is the
        space-joined matching sentence and `position` is its 1-based
        position in `para`. Returns ['', -1] when no sentence matches.
    """
    # Join every pattern once instead of once per sentence.
    patterns = [" ".join(item) for item in pattern_list]
    for position, words in enumerate(para, start=1):
        whole_sentence = " ".join(words)
        if any(pattern in whole_sentence for pattern in patterns):
            return [whole_sentence, position]
    return ['', -1]
def get_speech_tags(sentence):
    """Return the part-of-speech tags of a sentence, without the last one.

    The sentence is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag (see http://www.nltk.org/book/ch05.html).

    Args:
        sentence: the sentence as a string.

    Returns:
        A list with the tag of every token except the final one.
    """
    tokens = nltk.word_tokenize(sentence)
    tags = [tag for _word, tag in nltk.pos_tag(tokens)]
    # The last tag is dropped -- presumably the sentence-final punctuation;
    # verify against callers.
    return tags[:-1]
def get_subtree(sentence, tag):
    """Collect the subtrees of a parsed sentence that carry a given label.

    The sentence is split on whitespace and parsed with a freshly created
    Stanford parser.

    Args:
        sentence: the sentence as a string.
        tag: the label to look for.

    Returns:
        A list of every subtree, across all parse trees, whose label
        equals `tag`.
    """
    stanford_parser = stanford.StanfordParser()
    matches = []
    for parse_tree in stanford_parser.parse(sentence.split()):
        matches.extend(
            node for node in parse_tree.subtrees() if node.label() == tag
        )
    return matches
def get_pp_old(text):
    """Collect preposition phrases from the PP subtrees of a parsed text.

    Older variant of get_pp(). The text is tokenized with
    nltk.word_tokenize and parsed with the Stanford parser. For every
    subtree labelled "PP" that has a left sibling, the first leaf of the
    subtree (the preposition) is recorded according to the sibling's label:

    * noun label (is_noun): the preposition alone, the noun is not included;
    * verb label (is_verb): the sibling's text lemmatized as a verb,
      followed by the preposition;
    * adjective label (is_adj): the sibling's text lemmatized as an
      adjective, followed by the preposition.

    A PP with no left sibling, or whose sibling carries any other label,
    contributes nothing.

    Args:
        text: the text to parse, as a string.

    Returns:
        A dict used as a set: each collected phrase maps to True.
    """
    # Create the parser locally, as get_pp() and get_subtree() do; the
    # module defines no global `parser` to fall back on.
    parser = stanford.StanfordParser()
    phrases = {}
    for structure in parser.parse(nltk.word_tokenize(text)):
        # Convert so that subtrees know their siblings.
        tree = ParentedTree.convert(structure)
        for subtree in tree.subtrees():
            if subtree.label() != "PP":
                continue
            preposition = subtree.leaves()[0]
            left_sibling = subtree.left_sibling()
            if left_sibling is None:
                continue
            left_sibling_label = left_sibling.label()
            if is_noun(left_sibling_label):
                phrases[preposition] = True
            elif is_verb(left_sibling_label):
                verb = convert_to_base_form(" ".join(left_sibling.leaves()), 'v')
                phrases[verb + " " + preposition] = True
            elif is_adj(left_sibling_label):
                adj = convert_to_base_form(" ".join(left_sibling.leaves()), 'a')
                phrases[adj + " " + preposition] = True
    return phrases
def get_pp(text):
    """Collect the non-conventional prepositions found in a parsed text.

    The text is tokenized with nltk.word_tokenize and parsed with the
    Stanford parser. For every subtree labelled "PP", the candidate phrase
    is the first leaf of the subtree. When the PP's left sibling carries an
    adjective label (is_adj), the sibling's words are prepended to it. A
    noun or verb sibling is not included. The candidate is kept only if
    is_conventional_preposition() rejects it.

    Args:
        text: the text to parse, as a string.

    Returns:
        A dict used as a set: each kept phrase maps to True.
    """
    parser = stanford.StanfordParser()
    phrases = {}
    for structure in parser.parse(nltk.word_tokenize(text)):
        # Convert so that subtrees know their siblings.
        tree = ParentedTree.convert(structure)
        for subtree in tree.subtrees():
            if subtree.label() != "PP":
                continue
            preposition = subtree.leaves()[0]  # first word of the prep phrase
            left_sibling = subtree.left_sibling()
            if left_sibling is not None and is_adj(left_sibling.label()):
                adj = " ".join(left_sibling.leaves())
                preposition = adj + " " + preposition
            # NOTE(review): the filter is applied to every PP, including
            # those with no left sibling -- confirm this nesting matches
            # the intended behaviour.
            if not is_conventional_preposition(preposition):
                phrases[preposition] = True
    return phrases