コード例 #1
0
class graphFactory():
	"""Build a graph of word and knowledge-base (KB) nodes and score KB entries by random walk.

	Attributes:
		nameDict: maps each registered token to its integer node id.
		node_num: next unused node id.
		stopwords: set of tokens that ``analysis`` ignores.
		graph: the ``GR`` instance; only exists after ``load`` has run.
	"""

	# Node ids 0 .. QUERY_NODE_COUNT-1 are used for test queries (see
	# _attach_queries / _write_walks); word and KB nodes are numbered after them.
	QUERY_NODE_COUNT = 999

	def __init__(self):
		"""Start with an empty vocabulary and read the stop-word list.

		Stop words come from ``../data/stopword.txt`` (UTF-8, one per line);
		empty lines are ignored.
		"""
		self.nameDict = {}
		self.node_num = self.QUERY_NODE_COUNT
		self.stopwords = set()
		for s in self._read_lines("../data/stopword.txt"):
			if len(s) < 1:
				continue
			self.stopwords.add(s)

	@staticmethod
	def _read_lines(path):
		"""Return the UTF-8 decoded content of *path* split on newlines."""
		# Function-scope import: io.open decodes to unicode text on both
		# Python 2 and Python 3, and the ``with`` block closes the handle.
		import io
		with io.open(path, "r", encoding="utf8") as handle:
			return handle.read().split("\n")

	@staticmethod
	def _normalize_text(s):
		"""Replace slashes by spaces and squeeze runs of spaces into one."""
		s = s.replace("/", " ")
		# Collapse to a single space (not to nothing), otherwise the words on
		# either side of " / " would be glued into one token.
		return re.sub(r" {2,}", " ", s)

	def _group_tagged(self, tagged, flag):
		"""Turn (token, POS tag) pairs into two lists of node ids.

		Args:
			tagged: iterable of ``(token, tag)`` pairs.
			flag: when 1, tokens missing from ``nameDict`` are skipped;
				otherwise they are registered under a fresh node id.

		Returns:
			``(noun, verb)``: ids of tokens whose tag starts with ``NN``, and
			ids of tokens whose tag starts with ``VB``, ``JJ`` or ``RB``.
		"""
		noun = []
		verb = []
		for word, tag in tagged:
			# Skip tokens that begin with brackets, braces or a pipe.
			if re.match(r"[\(\)|\{\}]+", word) is not None:
				continue
			if word in self.stopwords:
				continue
			if tag.startswith(("VB", "JJ", "RB")):
				bucket = verb
			elif tag.startswith("NN"):
				bucket = noun
			else:
				continue
			if word not in self.nameDict:
				if flag == 1:
					continue
				self.nameDict[word] = self.node_num
				self.node_num += 1
			bucket.append(self.nameDict[word])
		return noun, verb

	def analysis(self, s, flag = 0):
		"""Tokenize and POS-tag *s* with nltk and map its tokens to node ids.

		Args:
			s: text to analyse.
			flag: 0 registers unseen tokens as new nodes; 1 only looks up
				tokens that are already in ``nameDict``.

		Returns:
			``(noun, verb)`` lists of node ids, see ``_group_tagged``.
		"""
		tokens = nltk.word_tokenize(self._normalize_text(s))
		return self._group_tagged(nltk.pos_tag(tokens), flag)

	def _parse_questions(self, qs):
		"""Analyse the training lines and collect their KB ids.

		Lines with fewer than three tab-separated fields are ignored. Field 1
		is analysed as text; every later field must start with a decimal KB id.

		Returns:
			``(pro_res, q_kbs, kb_list)``: per-question ``[noun, verb]`` id
			lists, per-question KB ids in field order, and a dict mapping a
			KB id to the indices of the questions that cite it.
		"""
		kb_idPattern = re.compile(r"\d+")
		pro_res = []
		q_kbs = []
		kb_list = {}
		for q in qs:
			attr = q.split("\t")
			if len(attr) < 3:
				continue
			cnt = len(pro_res)
			noun, verb = self.analysis(attr[1])
			pro_res.append([noun, verb])
			ids = []
			for field in attr[2:]:
				regRes = kb_idPattern.match(field)
				if regRes is None:
					# Report and skip the field; there is no id to read.
					print("KB ID ERROR!")
					continue
				kb_id = int(regRes.group())
				ids.append(kb_id)
				kb_list.setdefault(kb_id, []).append(cnt)
			q_kbs.append(ids)
		return pro_res, q_kbs, kb_list

	def _link_pair(self, a, b):
		"""Add weight-1 edges between *a* and *b* in both directions."""
		self.graph.add_edge(a, b, 1)
		self.graph.add_edge(b, a, 1)

	def _link_within(self, ids):
		"""Link every pair of positions in *ids* with weight-1 edges."""
		for j, first in enumerate(ids):
			for second in ids[j + 1:]:
				self._link_pair(first, second)

	def _link_across(self, left, right):
		"""Link every id of *left* with every id of *right* (weight 1)."""
		for a in left:
			for b in right:
				self._link_pair(a, b)

	def _build_training_edges(self, pro_res, q_kbs, kb_list, kb_dict):
		"""Add all edges derived from the training questions to ``self.graph``."""
		# Word -> KB node edges (weight 2), one batch per cited KB field.
		for (noun, verb), ids in zip(pro_res, q_kbs):
			for raw_id in ids:
				kb_node = kb_dict[raw_id]
				for n in noun:
					self.graph.add_edge(n, kb_node, 2)
				for v in verb:
					self.graph.add_edge(v, kb_node, 2)

		# Words of the same question are linked with each other.
		for noun, verb in pro_res:
			self._link_within(noun)
			self._link_within(verb)

		# Words of two questions that cite the same KB id are linked as well.
		for raw_id in kb_list:
			li = kb_list[raw_id]
			for j, first in enumerate(li):
				for second in li[j + 1:]:
					n1, v1 = pro_res[first]
					n2, v2 = pro_res[second]
					self._link_across(n1, n2)
					self._link_across(v1, v2)

	def _attach_queries(self, kb_dict):
		"""Connect each test query (node id = its index) to its words and candidate KB nodes.

		A test line has exactly two tab-separated fields: the query text and a
		``;``-separated list of integer KB ids, of which the first 15 are read.
		"""
		# NOTE(review): query indices are used directly as node ids, so more than
		# QUERY_NODE_COUNT valid test lines would collide with word nodes.
		cnt = 0
		for t in self._read_lines("../data/MSS.Test.L1.txt"):
			attr = t.split("\t")
			if len(attr) != 2:
				continue
			now = 7.0
			alph = 0.68
			# flag=1: test queries must not add new words to the vocabulary.
			noun, verb = self.analysis(attr[0], 1)
			for n in noun:
				self.graph.add_edge(cnt, n, 1)
			for v in verb:
				self.graph.add_edge(cnt, v, 1)

			for c in attr[1].split(";")[:15]:
				if len(c) < 1:
					continue
				kb_id = int(c)
				if kb_id not in kb_dict:
					continue
				# Numeric threshold: once the decayed weight drops below 0.1
				# no further candidate edges are added for this query.
				if now < 0.1:
					continue
				self.graph.add_edge(cnt, kb_dict[kb_id], now)
				now *= alph
			cnt += 1

	def _write_walks(self, kb_start, kb_end, kb_r_dict):
		"""Walk from every query node and write the results to ``basic_randomWalk_2.txt``.

		Each output line is ``1<TAB>600000+n<TAB>KB id<TAB>value``, where the KB
		id is looked up from the first item of a walk result and the value is
		its second item (presumably a score; confirm against ``GR.walk``).
		"""
		for i in range(self.node_num):
			self.graph.node[i].normalize()

		with open("../data/basic_randomWalk_2.txt", "w") as oFile:
			cnt = 0
			for i in range(self.QUERY_NODE_COUNT):
				ans = self.graph.walk(i, kb_start, kb_end)
				for a in ans:
					oFile.write("1\t" + str(600000 + cnt) + "\t" + str(kb_r_dict[a[0]]) + "\t" + str(a[1]) + "\n")
				cnt += 1
				# Output number 264 is never used; the counter jumps to 265.
				# NOTE(review): reason not visible here, presumably a gap in the id list.
				if cnt == 264:
					cnt += 1

	def load(self):
		"""Build the graph from the data files and write the random-walk output.

		Reads ``../data/DevSetL1All.integrated.txt`` (training questions) and
		``../data/MSS.Test.L1.txt`` (test queries), stores the graph in
		``self.graph`` and writes ``../data/basic_randomWalk_2.txt``.
		"""
		qs = self._read_lines("../data/DevSetL1All.integrated.txt")
		pro_res, q_kbs, kb_list = self._parse_questions(qs)

		# KB entries get the node ids following the word nodes.
		kb_dict = {}
		kb_r_dict = {}
		kb_start = self.node_num
		for raw_id in kb_list:
			kb_dict[raw_id] = self.node_num
			kb_r_dict[self.node_num] = raw_id
			self.node_num += 1

		print("total node: %s" % self.node_num)
		kb_end = self.node_num
		self.graph = GR(self.node_num)

		self._build_training_edges(pro_res, q_kbs, kb_list, kb_dict)
		self._attach_queries(kb_dict)
		self._write_walks(kb_start, kb_end, kb_r_dict)
コード例 #2
0
	def load(self):
		"""Build the graph from the data files and write the random-walk output.

		Steps:
		  1. Read ``../data/DevSetL1All.integrated.txt``. Lines with at least
		     three tab-separated fields are used: field 1 is analysed as text,
		     every later field must start with a decimal KB id.
		  2. Give every KB id a node id after the word nodes and create
		     ``self.graph`` with ``GR``.
		  3. Add word->KB edges (weight 2) and word<->word edges (weight 1) for
		     words of the same question and of questions sharing a KB id.
		  4. Read ``../data/MSS.Test.L1.txt`` (query text, TAB, ``;``-separated
		     KB ids) and connect query node ``n`` (the n-th valid line) to its
		     known words and to its first 15 candidate KB ids.
		  5. Normalize all nodes, walk from nodes 0..998 and write one line per
		     result to ``../data/basic_randomWalk_2.txt``.
		"""
		# Function-scope import: io.open yields decoded text on Python 2 and 3.
		import io

		def read_lines(path):
			# The ``with`` block closes the handle even if decoding fails.
			with io.open(path, "r", encoding="utf8") as handle:
				return handle.read().split("\n")

		def link_pair(a, b):
			# Symmetric weight-1 connection between two word nodes.
			self.graph.add_edge(a, b, 1)
			self.graph.add_edge(b, a, 1)

		kb_idPattern = re.compile(r"\d+")
		pro_res = []    # per question: [noun ids, verb ids]
		q_kbs = []      # per question: KB ids in field order
		kb_list = {}    # KB id -> indices of the questions citing it
		for q in read_lines("../data/DevSetL1All.integrated.txt"):
			attr = q.split("\t")
			if len(attr) < 3:
				continue
			cnt = len(pro_res)
			noun, verb = self.analysis(attr[1])
			pro_res.append([noun, verb])
			ids = []
			for field in attr[2:]:
				regRes = kb_idPattern.match(field)
				if regRes is None:
					# Report and skip the field; there is no id to read.
					print("KB ID ERROR!")
					continue
				kb_id = int(regRes.group())
				ids.append(kb_id)
				if kb_id not in kb_list:
					kb_list[kb_id] = []
				kb_list[kb_id].append(cnt)
			q_kbs.append(ids)

		# KB entries get the node ids following the word nodes.
		kb_dict = {}
		kb_r_dict = {}
		kb_start = self.node_num
		for i in kb_list:
			kb_dict[i] = self.node_num
			kb_r_dict[self.node_num] = i
			self.node_num += 1

		print("total node: %s" % self.node_num)
		kb_end = self.node_num
		self.graph = GR(self.node_num)

		# Word -> KB node edges, one batch per cited KB field.
		for (noun, verb), ids in zip(pro_res, q_kbs):
			for raw_id in ids:
				kb_id = kb_dict[raw_id]
				for n in noun:
					self.graph.add_edge(n, kb_id, 2)
				for v in verb:
					self.graph.add_edge(v, kb_id, 2)

		# Words of the same question are linked pairwise.
		for noun, verb in pro_res:
			for group in (noun, verb):
				for j, first in enumerate(group):
					for second in group[j + 1:]:
						link_pair(first, second)

		# Words of two questions citing the same KB id are linked as well.
		for i in kb_list:
			li = kb_list[i]
			for j, q1 in enumerate(li):
				for q2 in li[j + 1:]:
					n1, v1 = pro_res[q1]
					n2, v2 = pro_res[q2]
					for a in n1:
						for b in n2:
							link_pair(a, b)
					for a in v1:
						for b in v2:
							link_pair(a, b)

		# Test queries: the index of the query is used as its node id.
		cnt = 0
		for t in read_lines("../data/MSS.Test.L1.txt"):
			attr = t.split("\t")
			if len(attr) != 2:
				continue
			clist = attr[1].split(";")

			now = 7.0
			alph = 0.68
			# flag=1: only words already in the vocabulary are used.
			noun, verb = self.analysis(attr[0], 1)
			for n in noun:
				self.graph.add_edge(cnt, n, 1)
			for v in verb:
				self.graph.add_edge(cnt, v, 1)

			for c in clist[:15]:
				if len(c) < 1:
					continue
				kb_id = int(c)
				if kb_id not in kb_dict:
					continue
				# Numeric threshold: once the decayed weight drops below 0.1
				# no further candidate edges are added for this query.
				if now < 0.1:
					continue
				self.graph.add_edge(cnt, kb_dict[kb_id], now)
				now *= alph
			cnt += 1

		for i in range(self.node_num):
			self.graph.node[i].normalize()

		with open("../data/basic_randomWalk_2.txt", "w") as oFile:
			cnt = 0
			# Nodes 0..998 are the query nodes (word ids start after them).
			for i in range(999):
				ans = self.graph.walk(i, kb_start, kb_end)
				for a in ans:
					# a[0] is a KB node id; a[1] is written as-is
					# (presumably the walk score; confirm against GR.walk).
					oFile.write("1\t" + str(600000 + cnt) + "\t" + str(kb_r_dict[a[0]]) + "\t" + str(a[1]) + "\n")
				cnt += 1
				# Output number 264 is never used; the counter jumps to 265.
				if cnt == 264:
					cnt += 1