# Dependencies: the standard-library `re` module and NLTK for tokenization and
# POS tagging. `GR`, the random-walk graph class used in load(), is assumed to
# be imported from elsewhere in this project.
import re

import nltk


class graphFactory():
    def __init__(self):
        # Maps a token to its graph node id; token nodes start at 999 because
        # ids 0-998 are used for the test queries added in load().
        self.nameDict = {}
        self.node_num = 999
        #english_stopwords = stopwords.words('english')
        self.stopwords = set()
        sFile = open("../data/stopword.txt", "r").read().decode("utf8").split("\n")
        for s in sFile:
            if len(s) < 1: continue
            self.stopwords.add(s)
        #self.stopwords = set(english_stopwords)

    def analysis(self, s, flag=0):
        """Tokenize and POS-tag a sentence, returning (noun_ids, verb_ids).

        With flag=1 no new token nodes are created; unknown tokens are skipped.
        """
        s = s.replace("/", " ")
        s = re.sub(r" {2,}", " ", s)   # collapse runs of spaces to a single space
        verb = []
        noun = []
        tokens = nltk.word_tokenize(s)
        pos = nltk.pos_tag(tokens)
        for p in pos:
            # Skip pure punctuation tokens and stopwords.
            regRes = re.match(r"[\(\)|\{\}]+", p[0])
            if regRes is not None: continue
            if p[0] in self.stopwords: continue
            if p[1].find("VB") == 0 or p[1].find("JJ") == 0 or p[1].find("RB") == 0:
                # Verbs, adjectives and adverbs share the "verb" bucket.
                if p[0] not in self.nameDict:
                    if flag == 1: continue
                    self.nameDict[p[0]] = self.node_num
                    self.node_num += 1
                verb.append(self.nameDict[p[0]])
            elif p[1].find("NN") == 0:
                if p[0] not in self.nameDict:
                    if flag == 1: continue
                    self.nameDict[p[0]] = self.node_num
                    self.node_num += 1
                noun.append(self.nameDict[p[0]])
        return noun, verb

    def load(self):
        # --- Pass 1: parse the dev set, assign token nodes, group questions by KB id.
        qfile = open("../data/DevSetL1All.integrated.txt").read().decode("utf8")
        qs = qfile.split('\n')
        pro_res = []                       # per-question [noun_ids, verb_ids]
        kb_idPattern = re.compile(r"\d+")
        kb_list = {}                       # KB id -> list of question indices
        cnt = 0
        for q in qs:
            attr = q.split("\t")
            if len(attr) < 3: continue
            noun, verb = self.analysis(attr[1])
            pro_res.append([noun, verb])
            l = len(attr)
            for i in xrange(2, l):
                regRes = kb_idPattern.match(attr[i])
                if regRes is None:
                    print "KB ID ERROR!"
                    continue
                kb_id = int(regRes.group())
                if kb_id not in kb_list:
                    kb_list[kb_id] = []
                kb_list[kb_id].append(cnt)
            cnt += 1

        # --- Assign one graph node per KB id; ids kb_start..kb_end-1 are KB nodes.
        kb_dict = {}
        kb_r_dict = {}
        kb_start = self.node_num
        for i in kb_list.keys():
            kb_dict[i] = self.node_num
            kb_r_dict[self.node_num] = i
            self.node_num += 1
        print "total node:", self.node_num
        kb_end = self.node_num

        self.graph = GR(self.node_num)

        # --- Pass 2: connect each question's tokens to its KB nodes (weight 2).
        cnt = 0
        for q in qs:
            attr = q.split("\t")
            if len(attr) < 3: continue
            l = len(attr)
            for i in xrange(2, l):
                regRes = kb_idPattern.match(attr[i])
                if regRes is None: continue
                kb_id = kb_dict[int(regRes.group())]
                for n in pro_res[cnt][0]:
                    self.graph.add_edge(n, kb_id, 2)
                for v in pro_res[cnt][1]:
                    self.graph.add_edge(v, kb_id, 2)
            cnt += 1

        # --- Connect tokens that co-occur within the same question (weight 1, both directions).
        for i in xrange(cnt):
            noun, verb = pro_res[i]
            l = len(noun)
            for j in xrange(l):
                for k in xrange(j + 1, l):
                    self.graph.add_edge(noun[j], noun[k], 1)
                    self.graph.add_edge(noun[k], noun[j], 1)
            l = len(verb)
            for j in xrange(l):
                for k in xrange(j + 1, l):
                    self.graph.add_edge(verb[j], verb[k], 1)
                    self.graph.add_edge(verb[k], verb[j], 1)

        # --- Connect tokens of different questions that share a KB id (weight 1).
        for i in kb_list.keys():
            li = kb_list[i]
            l = len(li)
            for j in xrange(l):
                for k in xrange(j + 1, l):
                    n1, v1 = pro_res[li[j]]
                    n2, v2 = pro_res[li[k]]
                    for i1 in xrange(len(n1)):
                        for i2 in xrange(len(n2)):
                            self.graph.add_edge(n1[i1], n2[i2], 1)
                            self.graph.add_edge(n2[i2], n1[i1], 1)
                    for i1 in xrange(len(v1)):
                        for i2 in xrange(len(v2)):
                            self.graph.add_edge(v1[i1], v2[i2], 1)
                            self.graph.add_edge(v2[i2], v1[i1], 1)

        # --- Add the test queries as nodes 0..998, link them to their tokens and
        #     to their top candidate KB ids with geometrically decaying weights.
        testFile = open("../data/MSS.Test.L1.txt", "r").read().decode("utf8").split("\n")
        cnt = 0
        for t in testFile:
            attr = t.split("\t")
            if len(attr) != 2: continue
            clist = attr[1].split(";")
            now = 7.0
            alph = 0.68
            noun, verb = self.analysis(attr[0], 1)
            for n in noun:
                self.graph.add_edge(cnt, n, 1)
            for v in verb:
                self.graph.add_edge(cnt, v, 1)
            for c in clist[:15]:
                if len(c) < 1: continue
                kb_id = int(c)
                if kb_id not in kb_dict: continue
                if now < 0.1: continue   # drop candidates whose weight has decayed below 0.1
                self.graph.add_edge(cnt, kb_dict[kb_id], now)
                now *= alph
            cnt += 1

        # --- Normalize edge weights and run a random walk from every test query.
        #     Each output line is: a constant 1, the query id offset by 600000,
        #     the KB id, and the walk score.
        oFile = open("../data/basic_randomWalk_2.txt", "w")
        for i in xrange(self.node_num):
            self.graph.node[i].normalize()
        cnt = 0
        for i in xrange(999):
            ans = self.graph.walk(i, kb_start, kb_end)
            for a in ans:
                print >> oFile, ("1\t" + str(600000 + cnt) + "\t" + str(kb_r_dict[a[0]]) + "\t" + str(a[1])).encode("utf8")
            cnt += 1
            if cnt == 264: cnt += 1    # skip output id 264
        oFile.close()
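

# NOTE (assumption, not taken from this repo): the real `GR` class used above is
# imported from the project's random-walk module and its implementation is not
# shown here. The classes below are only a minimal sketch of the interface that
# graphFactory relies on: GR(n), add_edge(u, v, w), node[i].normalize(), and
# walk(start, kb_start, kb_end) returning (kb_node, score) pairs. They score KB
# nodes by visit counts from short random walks; the walk count and length are
# illustrative parameters, not values from the original code.
import random


class _NodeSketch(object):
    def __init__(self):
        self.edges = {}                      # neighbour id -> accumulated weight

    def normalize(self):
        # Turn the outgoing weights into a probability distribution.
        total = float(sum(self.edges.values()))
        if total > 0:
            for k in self.edges:
                self.edges[k] /= total


class _GRSketch(object):
    def __init__(self, n):
        self.node = [_NodeSketch() for _ in xrange(n)]

    def add_edge(self, u, v, w):
        self.node[u].edges[v] = self.node[u].edges.get(v, 0.0) + w

    def walk(self, start, kb_start, kb_end, walks=1000, length=5):
        # Score each KB node (ids in [kb_start, kb_end)) by how often short
        # random walks started at `start` visit it.
        visits = {}
        for _ in xrange(walks):
            cur = start
            for _ in xrange(length):
                edges = self.node[cur].edges
                if not edges:
                    break
                r = random.random()
                acc = 0.0
                nxt = cur
                for nbr, p in edges.items():
                    acc += p
                    nxt = nbr
                    if r <= acc:
                        break
                cur = nxt
                if kb_start <= cur < kb_end:
                    visits[cur] = visits.get(cur, 0) + 1
        total = float(sum(visits.values())) or 1.0
        return sorted([(k, v / total) for k, v in visits.items()],
                      key=lambda x: -x[1])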
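

# Minimal usage sketch (an assumption, not part of the original file): with the
# data files under ../data in place and the project's GR class importable, the
# whole build-and-walk pipeline is driven by a single load() call.
if __name__ == "__main__":
    factory = graphFactory()
    factory.load()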