-
Notifications
You must be signed in to change notification settings - Fork 0
/
api.py
112 lines (102 loc) · 3.18 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from corenlp import StanfordCoreNLP
import simp
from nltk.corpus import wordnet as wn
CORENLPDIR = 'stanford-corenlp-full-2013-11-12'
corenlp = None
def showtags(sentences):
for s in sentences:
string = ""
for tok in s:
string += "("+tok['word']+","+tok['POS']+","+tok['NER']+")"
print string
def getcorenlp():
global corenlp
corenlp = StanfordCoreNLP(CORENLPDIR)
return corenlp
def parseS(s):
s = s
parse = corenlp.raw_parse(s)
sentences = []
for l in parse['sentences']:
sentence = []
for w in l['words']:
token = {}
token['word'] = w[0]
token['NER'] = w[1]['NamedEntityTag']
token['POS'] = w[1]['PartOfSpeech']
token['lemma'] = w[1]['Lemma']
if token['POS'] == 'FW':
token['lemma']=token['word']
if token['word'] == '-LRB-':
token['word'] = '('
if token['word'] == '-RRB-':
token['word'] = ')'
token['WS'] = simp.getSynset(toString2(l['words']), token['word'])
sentence.append(token)
sentences.append(sentence)
if 'coref' in parse:
coref(parse['coref'],sentences)
return sentences
header_len = 30
def parsefile(fileloc):
sentences=[]
if corenlp==None:
print "CoreNLP Not loaded"
return None
info=""
result = []
with open(fileloc) as file:
for line in file:
if len(line)>header_len:
sentences = parseS(line)
result.extend(sentences)
return result
#define a max reference length
#if the reference pointed to exceeds this number of tokens then we can assume coref failed
MAX = 4
def coref(cf, s):
for group in cf:
for entities in group:
a = entities[0]
b = entities[1]
sena = a[1]
senb = b[1]
heada = a[2]
enda = a[3]
headb = b[2]
if 'PRP' in s[sena][heada]['POS'] or 'PRP' in s[sena][a[3]]:
#replace a
if 'PRP$' in s[sena][heada]['POS'] or\
'PRP$' in s[senb][headb]['POS']:
continue
sub = s[senb][b[3]:b[4]]
if toString([sub[-1]]) == "'s":
sub = sub[:-1]
if len(sub)>MAX:
continue
p1 = s[sena][:a[3]]
p2 = s[sena][a[4]:]
s[sena] = p1+sub+p2
elif 'PRP' in s[senb][headb]['POS'] or 'PRP' in s[senb][b[3]]:
#replace b
if 'PRP$' in s[senb][headb]['POS'] or\
'PRP$' in s[sena][heada]['POS']:
continue
sub = s[sena][a[3]:a[4]]
if toString([sub[-1]]) == "'s":
sub = sub[:-1]
if len(sub)>MAX:
continue
p1 = s[senb][:b[3]]
p2 = s[senb][b[4]:]
s[senb] = p1+sub+p2
def toString(sentence):
s = ""
for t in sentence:
s+=t['word']+" "
return s[:-1]
def toString2(sentence):
s=""
for t in sentence:
s+=t[0]+" "
return s[:-1]