-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsing.py
50 lines (41 loc) · 1.7 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
import pickle
from linker import Linker
class Parsing:
    """Parse a text dump of predicates and entity counts into vectors.

    ``parse`` reads the file line by line. A line containing ``predicate``
    starts a new vector; each following line containing ``': '`` (other than
    the ``num preds`` line) contributes one ``(entity index, count)`` pair to
    the current vector. Reading stops at the first line containing
    ``inv idx``.
    """

    # Patterns are compiled once and written as raw strings so that the
    # backslash in the digit class is a regex escape, not an (invalid)
    # Python string escape.
    _ENTITY_RE = re.compile(r'(.*): \d')
    _PREDICATE_RE = re.compile(r'predicate: (.*)#location.*#location.*')
    # The greedy prefix makes the capture start after the LAST ': ' on the
    # line and run to the end of that line.
    _COUNT_RE = re.compile(r'.*: (.*)')

    def extractEntities(self, string):
        """Return the text that precedes the last ``': <digit>'`` in *string*.

        Raises:
            ValueError: if *string* contains no ``': '`` followed by a digit.
        """
        match = self._ENTITY_RE.search(string)
        if match is None:
            raise ValueError("no '<entities>: <count>' found in %r" % (string,))
        return match.group(1)

    def extractPredicate(self, string):
        """Return the predicate name from a ``predicate: ...`` line.

        The name is the text between ``'predicate: '`` and the first of the
        last two ``#location`` markers on the line.

        Raises:
            ValueError: if *string* does not have that shape.
        """
        match = self._PREDICATE_RE.search(string)
        if match is None:
            raise ValueError(
                "no 'predicate: <name>#location...#location' found in %r"
                % (string,)
            )
        return match.group(1)

    def extractCount(self, string):
        """Return the number after the last ``': '`` in *string* as an int.

        The text is parsed as a float first, so ``'3.0'`` yields ``3`` and
        fractional parts are truncated toward zero.

        Raises:
            ValueError: if *string* has no ``': '`` or the text after it is
                not a number.
        """
        match = self._COUNT_RE.search(string)
        if match is None:
            raise ValueError("no ': <count>' found in %r" % (string,))
        return int(float(match.group(1)))

    def parse(self, fileName, entitySet, vectorMap,
              entityDictionaryPath="entityDictionary.dat"):
        """Fill *vectorMap* with one vector per predicate found in *fileName*.

        Args:
            fileName: path of the text file to read.
            entitySet: object providing ``getIndex(entities)``; called once
                per entity line with the linked entity string.
            vectorMap: object providing ``put(predicate, vector)``; called
                once per predicate line with the (initially empty) list that
                is subsequently filled in place.
            entityDictionaryPath: path of the pickled dictionary handed to
                ``Linker.cheapLink``. Defaults to the previously hard-coded
                ``"entityDictionary.dat"`` in the working directory.

        Returns:
            The tuple ``(vectorMap, entitySet)`` (the same objects passed in).

        Raises:
            ValueError: if a line cannot be parsed, or if an entity line
                appears before any predicate line.
        """
        # Unpickle the dictionary used for cheap linking.
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only point this at a dictionary file you trust.
        with open(entityDictionaryPath, "rb") as dictFile:
            entDict = pickle.load(dictFile)

        # One linker for the whole file instead of one per line.
        # NOTE(review): assumes cheapLink keeps no per-instance state that
        # relied on a fresh Linker for every line - confirm against linker.py.
        linker = Linker()

        # None until the first predicate line has been seen.
        currentVector = None

        with open(fileName) as f:
            for line in f:
                if 'inv idx' in line:
                    # Everything from this marker on is ignored.
                    break
                if 'predicate' in line:
                    predicate = self.extractPredicate(line)
                    currentVector = []
                    vectorMap.put(predicate, currentVector)
                elif ': ' in line and 'num preds' not in line:
                    if currentVector is None:
                        raise ValueError(
                            "entity line found before any predicate line: %r"
                            % (line,)
                        )
                    # Original author's notes on linking: extract the single
                    # words, link each of them, then concatenate the links.
                    origEntities = self.extractEntities(line)
                    entities = linker.cheapLink(origEntities, entDict)
                    count = self.extractCount(line)
                    index = entitySet.getIndex(entities)
                    currentVector.append((index, count))

        print("done parsing")
        return vectorMap, entitySet