def calculateOverlay():
    """Print how strongly the pattern sets of the YAGO relations overlap.

    The pattern table is loaded through
    ``projizz.readPrefixTreeModelWithTable``.  For every ordered pair of
    relations the function counts the patterns that list both of them.  Only
    patterns flagged ``"used"`` are counted, and a pattern that carries an
    ``"eval"`` flag is counted only when that flag is true.

    For each relation A one line per relation B is printed, ordered by
    descending number of shared patterns::

        A(nA) -> B(nB) shared/nA ratio (shared/nB,ratio)

    where ``nX`` is the number of counted patterns listing relation X.
    Nothing is returned.
    """
    # The prefix-tree model comes back together with the table but is not
    # needed for counting.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    relations = projizz.getYagoRelation()

    # overlay[x][y]: number of counted patterns that list both x and y.
    # overlay[x][x] is therefore the number of counted patterns listing x.
    overlay = dict((relation, dict.fromkeys(relations, 0))
                   for relation in relations)

    # Build table
    for pid in table:
        entry = table[pid]
        if not entry["used"]:
            continue
        if "eval" in entry and not entry["eval"]:
            continue
        for relation in entry["relations"]:
            for rela in entry["relations"]:
                overlay[relation][rela] += 1

    def ratio(shared, total):
        # A relation without any counted pattern has total == 0; its shared
        # counts are then 0 as well, so report 0.0 instead of raising
        # ZeroDivisionError.
        if not total:
            return 0.0
        return float(shared) / total

    for relaA in relations:
        a = overlay[relaA][relaA]
        # sorted() is stable, so ties keep the order of `relations`.
        ranked = sorted(((relaB, overlay[relaA][relaB]) for relaB in relations),
                        key=lambda pair: pair[1], reverse=True)
        for relaB, overC in ranked:
            b = overlay[relaB][relaB]
            print("%s(%d) -> %s(%d) %d/%d %.5f (%d/%d,%.5f)" % (
                relaA, a, relaB, b,
                overC, a, ratio(overC, a),
                overC, b, ratio(overC, b)))
def calculateOverlay():
    """Print how strongly the pattern sets of the YAGO relations overlap.

    The pattern table is loaded through
    ``projizz.readPrefixTreeModelWithTable``.  For every ordered pair of
    relations the function counts the patterns that list both of them.  Only
    patterns flagged ``"used"`` are counted, and a pattern that carries an
    ``"eval"`` flag is counted only when that flag is true.

    For each relation A one line per relation B is printed, ordered by
    descending number of shared patterns::

        A(nA) -> B(nB) shared/nA ratio (shared/nB,ratio)

    where ``nX`` is the number of counted patterns listing relation X.
    Nothing is returned.
    """
    # The prefix-tree model comes back together with the table but is not
    # needed for counting.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    relations = projizz.getYagoRelation()

    # overlay[x][y]: number of counted patterns that list both x and y.
    # overlay[x][x] is therefore the number of counted patterns listing x.
    overlay = dict((relation, dict.fromkeys(relations, 0))
                   for relation in relations)

    # Build table
    for pid in table:
        entry = table[pid]
        if not entry["used"]:
            continue
        if "eval" in entry and not entry["eval"]:
            continue
        for relation in entry["relations"]:
            for rela in entry["relations"]:
                overlay[relation][rela] += 1

    def ratio(shared, total):
        # A relation without any counted pattern has total == 0; its shared
        # counts are then 0 as well, so report 0.0 instead of raising
        # ZeroDivisionError.
        if not total:
            return 0.0
        return float(shared) / total

    for relaA in relations:
        a = overlay[relaA][relaA]
        # sorted() is stable, so ties keep the order of `relations`.
        ranked = sorted(((relaB, overlay[relaA][relaB]) for relaB in relations),
                        key=lambda pair: pair[1], reverse=True)
        for relaB, overC in ranked:
            b = overlay[relaB][relaB]
            print("%s(%d) -> %s(%d) %d/%d %.5f (%d/%d,%.5f)" % (
                relaA, a, relaB, b,
                overC, a, ratio(overC, a),
                overC, b, ratio(overC, b)))
def calculateOverlay():
    """Print a table of pattern-overlap ratios between YAGO relations.

    The pattern table is loaded through
    ``projizz.readPrefixTreeModelWithTable``.  For every ordered pair of
    relations the function counts the patterns that list both of them.  Only
    patterns flagged ``"used"`` are counted, and a pattern that carries an
    ``"eval"`` flag is counted only when that flag is true.

    The relations are sorted by name, ``"produced"`` is left out, and each
    remaining relation gets a 1-based number.  A header line naming columns
    13 to 24 is printed, followed by one line per relation A containing, for
    every relation B numbered 13 or higher, the share of A's patterns that
    also list B.  Cells are separated by ``&`` and every line ends with a
    double backslash.  Nothing is returned.
    """
    # The prefix-tree model comes back together with the table but is not
    # needed for counting.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    allRelations = projizz.getYagoRelation()

    # overlay[x][y]: number of counted patterns that list both x and y.
    # overlay[x][x] is therefore the number of counted patterns listing x.
    overlay = dict((relation, dict.fromkeys(allRelations, 0))
                   for relation in allRelations)

    # Build table
    for pid in table:
        entry = table[pid]
        if not entry["used"]:
            continue
        if "eval" in entry and not entry["eval"]:
            continue
        for relation in entry["relations"]:
            for rela in entry["relations"]:
                overlay[relation][rela] += 1

    # Work on a sorted copy so the list handed out by projizz is neither
    # reordered nor shortened as a side effect of printing.
    yagoRela = [relation for relation in sorted(projizz.getYagoRelation())
                if relation != "produced"]

    def ratio(shared, total):
        # A relation without any counted pattern has total == 0; its shared
        # counts are then 0 as well, so report 0.0 instead of raising
        # ZeroDivisionError.
        if not total:
            return 0.0
        return float(shared) / total

    # Only columns numbered firstColumn and above are printed.
    # NOTE(review): the header always names columns 13..24, while the rows
    # emit every column from 13 up to the number of relations -- confirm the
    # two agree for the current relation list.
    firstColumn = 13

    # Each line is assembled first and printed once; joining with a single
    # space yields the same text as a chain of Python 2 ``print x,``
    # statements.
    header = [" "]
    header.extend("& (%d)" % (i) for i in range(firstColumn, 25))
    header.append("\\\\")
    print(" ".join(header))

    for rowId, relaA in enumerate(yagoRela, 1):
        a = overlay[relaA][relaA]
        cells = ["(%d) %s" % (rowId, relaA)]
        for colId, relaB in enumerate(yagoRela, 1):
            if colId < firstColumn:
                continue
            cells.append(" & %2.2f" % (ratio(overlay[relaA][relaB], a)))
        cells.append("\\\\")
        print(" ".join(cells))
def calculateOverlay():
    """Print a table of pattern-overlap ratios between YAGO relations.

    The pattern table is loaded through
    ``projizz.readPrefixTreeModelWithTable``.  For every ordered pair of
    relations the function counts the patterns that list both of them.  Only
    patterns flagged ``"used"`` are counted, and a pattern that carries an
    ``"eval"`` flag is counted only when that flag is true.

    The relations are sorted by name, ``"produced"`` is left out, and each
    remaining relation gets a 1-based number.  A header line naming columns
    13 to 24 is printed, followed by one line per relation A containing, for
    every relation B numbered 13 or higher, the share of A's patterns that
    also list B.  Cells are separated by ``&`` and every line ends with a
    double backslash.  Nothing is returned.
    """
    # The prefix-tree model comes back together with the table but is not
    # needed for counting.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    allRelations = projizz.getYagoRelation()

    # overlay[x][y]: number of counted patterns that list both x and y.
    # overlay[x][x] is therefore the number of counted patterns listing x.
    overlay = dict((relation, dict.fromkeys(allRelations, 0))
                   for relation in allRelations)

    # Build table
    for pid in table:
        entry = table[pid]
        if not entry["used"]:
            continue
        if "eval" in entry and not entry["eval"]:
            continue
        for relation in entry["relations"]:
            for rela in entry["relations"]:
                overlay[relation][rela] += 1

    # Work on a sorted copy so the list handed out by projizz is neither
    # reordered nor shortened as a side effect of printing.
    yagoRela = [relation for relation in sorted(projizz.getYagoRelation())
                if relation != "produced"]

    def ratio(shared, total):
        # A relation without any counted pattern has total == 0; its shared
        # counts are then 0 as well, so report 0.0 instead of raising
        # ZeroDivisionError.
        if not total:
            return 0.0
        return float(shared) / total

    # Only columns numbered firstColumn and above are printed.
    # NOTE(review): the header always names columns 13..24, while the rows
    # emit every column from 13 up to the number of relations -- confirm the
    # two agree for the current relation list.
    firstColumn = 13

    # Each line is assembled first and printed once; joining with a single
    # space yields the same text as a chain of Python 2 ``print x,``
    # statements.
    header = [" "]
    header.extend("& (%d)" % (i) for i in range(firstColumn, 25))
    header.append("\\\\")
    print(" ".join(header))

    for rowId, relaA in enumerate(yagoRela, 1):
        a = overlay[relaA][relaA]
        cells = ["(%d) %s" % (rowId, relaA)]
        for colId, relaB in enumerate(yagoRela, 1):
            if colId < firstColumn:
                continue
            cells.append(" & %2.2f" % (ratio(overlay[relaA][relaB], a)))
        cells.append("\\\\")
        print(" ".join(cells))
def buildModels(inputpath, outputPath):
    """Train one classifier per YAGO relation in a pool of worker processes.

    ``outputPath`` is first handed to ``projizz.checkPath``.  Then
    ``trainModel(t, relation, inputpath, outputPath)`` is scheduled for every
    relation returned by ``projizz.getYagoRelation()`` except ``"produced"``,
    where ``t`` is a running job index starting at 0.  The call blocks until
    all jobs have finished.

    Args:
        inputpath: forwarded unchanged to ``trainModel``.
        outputPath: checked with ``projizz.checkPath`` and forwarded
            unchanged to ``trainModel``.

    A job that raised in its worker process is reported on stdout; the
    remaining jobs are unaffected.  Nothing is returned.
    """
    projizz.checkPath(outputPath)

    # Use one worker per CPU, but never more than 8.
    workerCount = min(multiprocessing.cpu_count(), 8)
    pool = multiprocessing.Pool(processes=workerCount)

    pending = []
    t = 0
    for relation in projizz.getYagoRelation():
        if relation == "produced":
            continue
        job = pool.apply_async(trainModel,
                               (t, relation, inputpath, outputPath))
        pending.append((relation, job))
        t += 1

    pool.close()
    pool.join()

    # apply_async keeps a worker's exception inside its AsyncResult; it only
    # becomes visible through get().  Without this loop a crashed training
    # job would go completely unnoticed.
    for relation, job in pending:
        try:
            job.get()
        except Exception as error:
            print("Training failed for %s: %r" % (relation, error))

    print("Done training all classifiers")
def buildModels(inputpath,outputPath):
    """Train one classifier per YAGO relation in a pool of worker processes.

    ``outputPath`` is first handed to ``projizz.checkPath``.  Then
    ``trainModel(t, relation, inputpath, outputPath)`` is scheduled for every
    relation returned by ``projizz.getYagoRelation()`` except ``"produced"``,
    where ``t`` is a running job index starting at 0.  The call blocks until
    all jobs have finished.

    Args:
        inputpath: forwarded unchanged to ``trainModel``.
        outputPath: checked with ``projizz.checkPath`` and forwarded
            unchanged to ``trainModel``.

    A job that raised in its worker process is reported on stdout; the
    remaining jobs are unaffected.  Nothing is returned.
    """
    projizz.checkPath(outputPath)

    # Use one worker per CPU, but never more than 8.
    workerCount = min(multiprocessing.cpu_count(), 8)
    pool = multiprocessing.Pool(processes=workerCount)

    pending = []
    t = 0
    for relation in projizz.getYagoRelation():
        if relation == "produced":
            continue
        job = pool.apply_async(trainModel, (t,relation,inputpath,outputPath))
        pending.append((relation, job))
        t += 1

    pool.close()
    pool.join()

    # apply_async keeps a worker's exception inside its AsyncResult; it only
    # becomes visible through get().  Without this loop a crashed training
    # job would go completely unnoticed.
    for relation, job in pending:
        try:
            job.get()
        except Exception as error:
            print("Training failed for %s: %r" % (relation, error))

    print("Done training all classifiers")
def parseYagoData():
    """Attach PATTY metadata to the YAGO pattern table, or summarise it.

    The mode is selected by the local ``phase``, which is hard-coded to
    ``"used"``.

    * ``phase == "build"``: for every relation the first line of
      ``./yagoRela/<relation>.txt`` is split into pattern-synset records.
      Each pattern text is looked up with ``projizz.naiveMatchPattern``;
      for a match the table entry gets the relation appended to
      ``"relations"``, ``"confidence"``/``"support"`` (first match only),
      ``"used" = True`` and, when the record has a ninth field, ``"eval"``.
      Entries never matched get ``"used" = False``, relations not confirmed
      by the files are removed, and the table is written with
      ``projizz.jsonWrite``.
    * otherwise: the ids of used patterns are grouped by relation, skipping
      patterns whose ``"eval"`` flag is false.

    Afterwards, per relation, the patterns are sorted by descending
    confidence and written to ``./yagoSortedRela/<relation>.txt``; the number
    of patterns and how many exceed 0.75 / 0.5 confidence are printed.  The
    last line printed names the relation whose best confidence is lowest.
    Nothing is returned.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation had been lost -- verify against the original layout.
    """
    phase = "used"
    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable(
            "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable(
            "../yago/yagoPatternTree.model",
            "./yagoPatternTreeWithConfidence.table")

    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model)
    #print i

    # `a` starts as every pattern id; ids are removed as they get matched, so
    # what is left at the end are the patterns never seen in the files.
    # (Relies on Python 2, where keys() returns a list.)
    a = table.keys()
    originL = len(a)  # not used afterwards
    ptnByRelation = {}
    for relation in projizz.getYagoRelation():
        # The file parsing only happens in the build phase.
        if not phase == "build":
            break
        f = open("./yagoRela/%s.txt" % (relation))
        print relation
        text = f.readline()
        # Records are separated by `"," `; the chunk before the first
        # separator is dropped and 7 trailing characters are cut off the
        # last record.
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ptnSynsetTxt[-1][:-7]]
        ptnByRelation[relation] = []
        evC = 0  # counts records with more than 8 fields; never read
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # ptns[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : some has, I guess it is eval result.

            if len(ptns) > 8:
                evC += 1
            # One synset holds several pattern texts separated by ";%"; the
            # last one loses its final character.
            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]
            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern, model)
                if pid < 0:
                    # Negative result: the pattern is skipped.
                    pass
                    #print relation,pattern
                else:
                    # Table keys are strings.
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                        #print relation,pid,pattern
                    ptnS = table[pid]
                    # Confidence/support come from the first record that
                    # matches this pattern; later records do not overwrite.
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                    table[pid]["used"] = True
                    if len(ptns) > 8:
                        if ptns[8] == "false":
                            table[pid]["eval"] = False
                            #print pid,table[pid]["relations"],pattern,ptns[8]
                        else:
                            table[pid]["eval"] = True
        # NOTE(review): f is not closed if anything above raises.
        f.close()

    if phase == "build":
        # Everything still in `a` was never matched by any relation file.
        for pid in a:
            table[pid]["used"] = False
        for pid in table:
            if table[pid]["used"]:
                # Drop relations the table lists for this pattern but the
                # relation files did not confirm.
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid, table[pid]["pattern"], relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid, table[pid]["pattern"], "!!!"

            else:
                pass
        projizz.jsonWrite(table, "./yagoPatternTreeWithConfidence.table")
    else:
        c = 0  # number of unused patterns; never read
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # If an eval flag (true or false) is present, keep only the
                # patterns whose flag is True.
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1

    # Some small calculations
    #for relation in ptnByRelation:
    #    print relation,len(ptnByRelation[relation])

    # Find the highest confidence value (i.e. it cannot go any higher):
    # the smallest one among the per-relation maxima.
    minC = 1.0
    minCR = ""
    for relation in ptnByRelation:
        c75 = 0
        c50 = 0
        ptns = []
        for pid in ptnByRelation[relation]:
            ptns.append(table[pid])
            # Stores the id inside the table entry itself (the entry is not
            # copied), so it is available when writing the file below.
            ptns[-1]["pid"] = pid
        ptns.sort(key=lambda x: x["confidence"], reverse=True)
        # NOTE(review): ptns[0] fails for a relation without patterns, which
        # can only happen in the build phase -- confirm.
        if ptns[0]["confidence"] < minC:
            minC = ptns[0]["confidence"]
            minCR = relation
            #print relation,ptns[0]

        f = open("./yagoSortedRela/%s.txt" % (relation), "w")
        for ptn in ptns:
            if ptn["confidence"] > .75:
                c75 += 1
            if ptn["confidence"] > .5:
                c50 += 1
            f.write("%s\t%s\t%.3f\t%d\t%s\n" %
                    (ptn["pid"], ptn["pattern"], ptn["confidence"],
                     ptn["support"], ptn["relations"]))
        f.close()
        print relation, len(ptns), c75, c50
    # NOTE(review): `used` is assigned only in the non-build branch, so this
    # line would raise NameError with phase == "build" -- confirm.
    print minCR, minC, "pattern used:", used
def parseYagoData():
    """Attach PATTY metadata to the YAGO pattern table, or summarise it.

    The mode is selected by the local ``phase``, which is hard-coded to
    ``"used"``.

    * ``phase == "build"``: for every relation the first line of
      ``./yagoRela/<relation>.txt`` is split into pattern-synset records.
      Each pattern text is looked up with ``projizz.naiveMatchPattern``;
      for a match the table entry gets the relation appended to
      ``"relations"``, ``"confidence"``/``"support"`` (first match only),
      ``"used" = True`` and, when the record has a ninth field, ``"eval"``.
      Entries never matched get ``"used" = False``, relations not confirmed
      by the files are removed, and the table is written with
      ``projizz.jsonWrite``.
    * otherwise: the ids of used patterns are grouped by relation, skipping
      patterns whose ``"eval"`` flag is false.

    Afterwards, per relation, the patterns are sorted by descending
    confidence and written to ``./yagoSortedRela/<relation>.txt``; the number
    of patterns and how many exceed 0.75 / 0.5 confidence are printed.  The
    last line printed names the relation whose best confidence is lowest.
    Nothing is returned.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation had been lost -- verify against the original layout.
    """
    phase = "used"
    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "./yagoPatternTreeWithConfidence.table")

    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model)
    #print i

    # `a` starts as every pattern id; ids are removed as they get matched, so
    # what is left at the end are the patterns never seen in the files.
    # (Relies on Python 2, where keys() returns a list.)
    a = table.keys()
    originL = len(a)  # not used afterwards
    ptnByRelation = {}
    for relation in projizz.getYagoRelation():
        # The file parsing only happens in the build phase.
        if not phase == "build":
            break
        f = open("./yagoRela/%s.txt" % (relation))
        print relation
        text = f.readline()
        # Records are separated by `"," `; the chunk before the first
        # separator is dropped and 7 trailing characters are cut off the
        # last record.
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ ptnSynsetTxt[-1][:-7] ]
        ptnByRelation[relation] = []
        evC = 0  # counts records with more than 8 fields; never read
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # ptns[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : some has, I guess it is eval result.

            if len(ptns) > 8:
                evC += 1
            # One synset holds several pattern texts separated by ";%"; the
            # last one loses its final character.
            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]
            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern,model)
                if pid < 0:
                    # Negative result: the pattern is skipped.
                    pass
                    #print relation,pattern
                else:
                    # Table keys are strings.
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                        #print relation,pid,pattern
                    ptnS = table[pid]
                    # Confidence/support come from the first record that
                    # matches this pattern; later records do not overwrite.
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                    table[pid]["used"] = True
                    if len(ptns) > 8:
                        if ptns[8] == "false":
                            table[pid]["eval"] = False
                            #print pid,table[pid]["relations"],pattern,ptns[8]
                        else:
                            table[pid]["eval"] = True
        # NOTE(review): f is not closed if anything above raises.
        f.close()

    if phase == "build":
        # Everything still in `a` was never matched by any relation file.
        for pid in a:
            table[pid]["used"] = False
        for pid in table:
            if table[pid]["used"]:
                # Drop relations the table lists for this pattern but the
                # relation files did not confirm.
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid,table[pid]["pattern"],relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid,table[pid]["pattern"],"!!!"

            else:
                pass
        projizz.jsonWrite(table,"./yagoPatternTreeWithConfidence.table")
    else:
        c = 0  # number of unused patterns; never read
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # If an eval flag (true or false) is present, keep only the
                # patterns whose flag is True.
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1

    # Some small calculations
    #for relation in ptnByRelation:
    #    print relation,len(ptnByRelation[relation])

    # Find the highest confidence value (i.e. it cannot go any higher):
    # the smallest one among the per-relation maxima.
    minC = 1.0
    minCR = ""
    for relation in ptnByRelation:
        c75 = 0
        c50 = 0
        ptns = []
        for pid in ptnByRelation[relation]:
            ptns.append(table[pid])
            # Stores the id inside the table entry itself (the entry is not
            # copied), so it is available when writing the file below.
            ptns[-1]["pid"] = pid
        ptns.sort(key=lambda x:x["confidence"],reverse=True)
        # NOTE(review): ptns[0] fails for a relation without patterns, which
        # can only happen in the build phase -- confirm.
        if ptns[0]["confidence"] < minC:
            minC = ptns[0]["confidence"]
            minCR = relation
            #print relation,ptns[0]

        f = open("./yagoSortedRela/%s.txt" % (relation),"w")
        for ptn in ptns:
            if ptn["confidence"] > .75:
                c75 += 1
            if ptn["confidence"] > .5:
                c50 += 1
            f.write("%s\t%s\t%.3f\t%d\t%s\n" % (ptn["pid"],ptn["pattern"],ptn["confidence"],ptn["support"],ptn["relations"]))
        f.close()
        print relation,len(ptns),c75,c50
    # NOTE(review): `used` is assigned only in the non-build branch, so this
    # line would raise NameError with phase == "build" -- confirm.
    print minCR,minC,"pattern used:",used
# Earlier experiments, kept for reference:
#request = urllib2.Request(url,data,headers)
#response = urlopener.open(request)
#result = response.read()
#print result

#f = open("./patty.dbpedia.Relations.json")
#for line in f:
#    if "," in line:
#        relaName = line[2:-3]
#        print relaName
#        request = urllib2.Request(url,getData(relaName),headers)
#        g = open("./relationships/%s.txt" % (relaName), "w")
#        response = urlopener.open(request)
#        result = response.read()
#        g.write(result)
#        g.close()
#f.close()

# Fetch the data of every YAGO relation and store each response body in
# ./yagoRela/<relation>.txt.
for relaName in projizz.getYagoRelation():
    print(relaName)
    request = urllib2.Request(url, getYAGOData(relaName), headers)
    response = urlopener.open(request)
    try:
        result = response.read()
    finally:
        response.close()
    # The output file is opened only after the download succeeded, so a
    # failed request cannot leave an empty or truncated file behind; `with`
    # closes the file even if the write raises.
    with open("./yagoRela/%s.txt" % (relaName), "w") as g:
        g.write(result)