def calculateOverlay():
    """Print pairwise relation-overlay statistics for the YAGO pattern table.

    For every pair of relations (A, B), counts how many used (and not
    eval-rejected) patterns are tagged with both A and B, then prints,
    per relation A, the overlap with every other relation sorted by count.
    Output goes to stdout; nothing is returned.

    NOTE(review): float(overC)/a raises ZeroDivisionError if a relation
    has no self-overlay (a == 0) — confirm every relation owns >= 1 pattern.
    """
    # model is loaded but only table is used below.
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    # overlay[A][B] = number of patterns carrying both relation A and B.
    overlay = {}
    for relation in projizz.getYagoRelation():
        overlay[relation] = {}
        for rela in projizz.getYagoRelation():
            overlay[relation][rela] = 0
    # Build table
    for pid in table:
        if table[pid]["used"]:
            # Skip patterns explicitly marked eval == False.
            if "eval" in table[pid] and not table[pid]["eval"]:
                continue
            # Cross-product of the pattern's relations; the diagonal
            # overlay[R][R] counts patterns owned by R itself.
            for relation in table[pid]["relations"]:
                for rela in table[pid]["relations"]:
                    overlay[relation][rela] += 1
        else:
            pass
    for relaA in projizz.getYagoRelation():
        j = []
        for relaB in projizz.getYagoRelation():
            overC = overlay[relaA][relaB]
            j.append((relaB,overC))
        # Largest overlaps first.
        j.sort(key=lambda x:x[1],reverse=True)
        a = overlay[relaA][relaA]
        for relaB,overC in j:
            b = overlay[relaB][relaB]
            # overC/a = fraction of A's patterns shared with B;
            # overC/b = fraction of B's patterns shared with A.
            print "%s(%d) -> %s(%d) %d/%d %.5f (%d/%d,%.5f)" % (relaA,a,relaB,b,overC,a,float(overC)/a,overC,b,float(overC)/b)
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Run filterFunction over every .json file in inputPtnPath in parallel
    and merge the per-file tp/tn/fp/fn lists into one properties dict.

    Args:
        inputPtnPath: directory of per-article pattern .json files.
        outputPath:   path of the merged JSON result written at the end.
        pspath:       JSON file of pattern statistics (fed to
                      projizz.getSortedPatternStatistic).
        inputPath:    forwarded to filterFunction (raw-article directory,
                      presumably — confirm against filterFunction).

    Side effects: spawns a multiprocessing pool sized to cpu_count(),
    writes outputPath, prints progress/timing to stdout.
    NOTE(review): near-duplicate of the sibling main() that loads the
    "WithConfidence" table — only the table path differs.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    # Template answer dict: one tp/tn/fp/fn record per relation.
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()
    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            # Each worker gets its own copy so workers never share state.
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
            t += 1
    pool.close()
    pool.join()
    # Reduce: concatenate each worker's per-relation lists.
    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]
    print "start write out to %s" % (outputPath)
    json.dump(properties,open(outputPath,"w"))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath,pspath,inputPath):
    """Parallel tp/tn/fp/fn evaluation — variant using the PATTY
    confidence-annotated pattern table.

    Identical in structure to the sibling main() that loads
    ./yagoPatternTree.table; only the table source differs. See that
    function for the parameter meanings.

    Side effects: multiprocessing pool, writes outputPath, prints to stdout.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()
    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            # Private copy per worker task.
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath )))
            t += 1
    pool.close()
    pool.join()
    # Merge per-worker answers back into the master dict.
    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]
    print "start write out to %s" % (outputPath)
    json.dump(properties,open(outputPath,"w"))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def ambiguityDegree():
    """Group used patterns by their ambiguity degree and print a report.

    Degree = number of relations attached to a pattern. Buckets every
    used, non-eval-rejected pattern by degree, then prints one tab-separated
    line per pattern (degree, id, pattern text, relations, confidence,
    support) for each degree from 1 to the maximum seen. Output to stdout.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    maxDegree = 0
    # degree[d] = list of pattern ids whose relation count is d.
    degree = {}
    for pid in table:
        if table[pid]["used"]:
            # Skip patterns explicitly marked eval == False.
            if "eval" in table[pid] and not table[pid]["eval"]:
                continue
            d = len(table[pid]["relations"])
            if not d in degree:
                degree[d] = []
            if d > maxDegree:
                maxDegree = d
            degree[d].append(pid)
    for d in range(1,maxDegree+1):
        if not d in degree:
            # Empty bucket: report zero and move on.
            print d,"0"
            continue
        #print d,len(degree[d])
        for pid in degree[d]:
            print "%d\t%s\t%s\t%s\t%.5f\t%d" % (d,pid,table[pid]["pattern"],table[pid]["relations"],table[pid]["confidence"],table[pid]["support"])
def main(inputPtnPath,outputPath):
    """Aggregate per-pattern total/support counts across all .json files.

    Fans out filterFunction over inputPtnPath via a multiprocessing pool,
    then sums each worker's {relation: {ptnId: {total, support}}} result
    into one dict and writes it to outputPath as JSON.

    Side effects: pool of cpu_count() workers, writes outputPath,
    prints timing to stdout.
    """
    start_time = datetime.now()
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    # Empty per-relation dicts; pattern counters are added lazily below.
    properties = projizz.buildYagoProperties({})
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,copy.deepcopy(properties) )))
            t += 1
    pool.close()
    pool.join()
    # Reduce: sum total/support per (relation, pattern).
    for res in result:
        r = res.get()
        for rela in r:
            for ptnId in r[rela]:
                if not ptnId in properties[rela]:
                    properties[rela][ptnId] = {"total":0,"support":0}
                properties[rela][ptnId]["total"] += r[rela][ptnId]["total"]
                properties[rela][ptnId]["support"] += r[rela][ptnId]["support"]
    json.dump(properties,open(outputPath,"w"))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename,nbcPath):
    """Parallel evaluation variant that also passes a Naive-Bayes-classifier
    path (nbcPath) to filterFunction and keeps tp/fp/fn plus three error
    types (et1/et2/et3) per experiment key.

    Workers return {keyname: {relation: counters}}; results are merged per
    keyname and each experiment is written to
    outputPath/<keyname>/<outputFilename>.

    Side effects: pool capped at 8 processes, creates output directories,
    writes result files, prints to stdout.
    """
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()
    start_time = datetime.now()
    # Cap the pool at 8 workers (memory-heavy tasks, presumably — confirm).
    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8
    pool = multiprocessing.Pool(processes=cpuCount)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            #result.append(filterFunction(t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,classifiers ))
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath )))
            t += 1
    pool.close()
    pool.join()
    # Reduce: one accumulator per experiment key returned by the workers.
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                # "produced" is a bookkeeping entry, not a relation.
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    # One sub-directory per experiment key.
    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath,keyname)):
            os.mkdir(os.path.join(outputPath,keyname))
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keyname))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath):
    """Evaluation restricted to a validated pattern whitelist: for each
    relation, take up to the top 200 single-relation patterns (by the
    sorted statistic) and pass that whitelist to filterFunction.

    Merges the workers' per-relation tp/tn/fp/fn lists and writes the
    result to outputPath as JSON. Prints progress/timing to stdout.
    """
    model, table = projizz.readPrefixTreeModelWithTable( "./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({ "tp": [], "tn": [], "fp": [], "fn": [] })
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))
    validate = []
    # Get Top 200 Relation
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            # Only unambiguous patterns (exactly one relation) qualify.
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 200:
                break
    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append( pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, validate)))
            t += 1
    pool.close()
    pool.join()
    # Reduce the per-worker answers.
    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]
    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main():
    """For every pattern in the YAGO table, count how many PATTY wiki
    pattern documents match it as a regex prefix, and print the counts.

    Escapes '[' and ']' in the pattern text (doubly escaped for the
    MongoDB $regex string), appends ';.*' so the pattern must be followed
    by a semicolon, queries the patty.wiki.pattern collection, and prints
    each query with its match count. Requires a local MongoDB.
    """
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.patty.wiki.pattern
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    for ptnId in table:
        ptnText = table[ptnId]["pattern"]
        # Escape regex metacharacters [ and ]; match up to the ';' delimiter.
        queryRegexp = ptnText.replace("[","\\\\[").replace("]","\\\\]") + ";.*"
        itr = collection.find({"patterntext":{"$regex":queryRegexp}})
        print queryRegexp,itr.count()
def preprocess(inputPath,inputPtnPath,outputPath):
    """Map-reduce extraction of pattern instances: run mapper over every
    pattern .json file, regroup the results by pattern id, and write one
    <ptnId>.json file per pattern into outputPath.

    Side effects: creates outputPath if needed (projizz.checkPath), runs a
    pool of at most 20 workers, writes one JSON file per pattern id,
    prints timing to stdout.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()
    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table)) )
            t += 1
    pool.close()
    pool.join()
    # patternInstances[ptnId][rela][key] = list of instances.
    patternInstances = {}
    # Reducer
    for r in result:
        # sibf: {key: {ptnId: {rela: [instances]}}} from the mapper.
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                if not ptnId in patternInstances:
                    patternInstances[ptnId] = {}
                # Re-pivot from key-major to pattern-major layout.
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        if not rela in patternInstances[ptnId]:
                            patternInstances[ptnId][rela] = {}
                        if not key in patternInstances[ptnId][rela]:
                            patternInstances[ptnId][rela][key] = []
                        patternInstances[ptnId][rela][key].append(inst)
    # Write to files
    # NOTE
    # Output Format:
    #   ptnId.json (json)
    #       rela: keys
    #           key: line text
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],os.path.join(outputPath,"%s.json" % (ptnId)))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath,pspath,inputPath,confidence,outputFilename):
    """Parallel evaluation with a confidence threshold: workers return
    {keyname: {relation: tp/fp/fn/et1/et2/et3 counters}} per experiment
    key; results are merged and written to outputPath/<keyname>/<outputFilename>.

    Same shape as the nbcPath variant but without the classifier path and
    without the 8-process cap. Side effects: multiprocessing pool, creates
    directories, writes result files, prints to stdout.
    """
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago//yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[],"et1":[],"et2":[],"et3":[]})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()
    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence )))
            t += 1
    pool.close()
    pool.join()
    # Reduce per experiment key.
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                # "produced" is bookkeeping, not a relation entry.
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath,keyname)):
            os.mkdir(os.path.join(outputPath,keyname))
        projizz.jsonWrite(p,os.path.join(outputPath,keyname,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keyname))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    """VSM-based evaluation: load the vector-space models (idf, docs,
    lens), fan mapper out over the pattern files, and merge the
    per-experiment tp/fp/fn counters.

    Each experiment key is written to outputPath/vsm-<key>/<outputFilename>;
    the "vsm-%d" directory name implies keyname is an integer —
    confirm against mapper. Side effects: multiprocessing pool,
    directory creation, file writes, stdout prints.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp":[],"fp":[],"fn":[]})
    domainRange = projizz.getYagoRelationDomainRange()
    idf,docs,lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic( projizz.jsonRead(psfile) )
    # Bundle the VSM components so each worker gets them as one argument.
    vsmData = (idf, docs, lens)
    projizz.checkPath(outputPath)
    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(mapper, ( t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData )))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData ))
            t += 1
    pool.close()
    pool.join()
    # Reduce per experiment key.
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                # "produced" is bookkeeping, not a relation entry.
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath( os.path.join(outputPath,keydirName))
        projizz.jsonWrite(p,os.path.join(outputPath,keydirName,outputFilename))
        print "start write out to %s" % (os.path.join(outputPath,keydirName))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def calculateOverlay():
    """Print the relation-overlay matrix as rows of a LaTeX table.

    Builds the same pairwise overlay counts as the sibling
    calculateOverlay(), then prints, for each relation (row), the ratio
    overC/a against relations whose 1-based sorted index is >= 13
    (columns 13..24), formatted with ' & ' separators and '\\\\' row
    terminators. The relation "produced" is excluded. Output to stdout.

    NOTE(review): float(overC)/a raises ZeroDivisionError when a
    relation owns no patterns (a == 0) — confirm the table guarantees
    a >= 1 for every remaining relation.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    # overlay[A][B] = number of patterns carrying both A and B.
    overlay = {}
    for relation in projizz.getYagoRelation():
        overlay[relation] = {}
        for rela in projizz.getYagoRelation():
            overlay[relation][rela] = 0
    # Build table
    for pid in table:
        if table[pid]["used"]:
            # Skip patterns explicitly marked eval == False.
            if "eval" in table[pid] and not table[pid]["eval"]:
                continue
            for relation in table[pid]["relations"]:
                for rela in table[pid]["relations"]:
                    overlay[relation][rela] += 1
        else:
            pass
    # Stable, alphabetical column/row order; drop the "produced" relation.
    yagoRela = projizz.getYagoRelation()
    yagoRela.sort()
    yagoRela.remove("produced")
    #print yagoRela
    # Header row: column indices 13..24.
    print " ",
    for i in range(13,25):
        print "& (%d)" % (i),
    print "\\\\"
    for relaA in yagoRela:
        j = []
        for relaB in yagoRela:
            overC = overlay[relaA][relaB]
            j.append((relaB,overC))
        #j.sort(key=lambda x:x[1],reverse=True)
        a = overlay[relaA][relaA]
        _id = yagoRela.index(relaA) + 1
        print "(%d) %s" % (_id,relaA),
        for relaB,overC in j:
            b = overlay[relaB][relaB]
            _tid = yagoRela.index(relaB) + 1
            # Only emit the right half of the matrix (columns >= 13).
            if _tid < 13:
                continue
            #print "%s(%d) -> %s(%d) %d/%d %.5f (%d/%d,%.5f)" % (relaA,_id,relaB,b,overC,a,float(overC)/a,overC,b,float(overC)/b)
            print " & %2.2f" % (float(overC)/a),
        print "\\\\"
def main(inputPtnPath,outputPath,pspath):
    """Evaluation restricted to the top-100 single-relation patterns per
    relation (same scheme as the top-200 sibling; only the cutoff differs).

    Builds the validate whitelist, fans filterFunction out over the
    pattern files, merges tp/tn/fp/fn lists, and writes the merged dict
    to outputPath as JSON. Prints progress/timing to stdout.
    """
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp":[],"tn":[],"fp":[],"fn":[]})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))
    validate = []
    # Get Top 100 Relation
    for relation in sp:
        count = 0
        for ptnId,ptnS in sp[relation]:
            ptnData = table[ptnId]
            # Only unambiguous patterns (exactly one relation) qualify.
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 100:
                break
    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,partAns,validate )))
            t += 1
    pool.close()
    pool.join()
    # Reduce the per-worker answers.
    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]
    print "start write out to %s" % (outputPath)
    json.dump(properties,open(outputPath,"w"))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(dataInputPath,ptnOutputPath):
    """Run tryToFindRela over every .json article file in parallel.

    Workers write their own output into ptnOutputPath (this function
    collects no return values). Creates ptnOutputPath if missing.
    Prints core count and timing to stdout.
    """
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")
    if not os.path.isdir(ptnOutputPath):
        os.mkdir(ptnOutputPath)
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    print "Number of core: %d" % (multiprocessing.cpu_count())
    start_time = datetime.now()
    jobN = 0
    for filename in os.listdir(dataInputPath):
        if ".json" in filename:
            # Fire-and-forget: results are not collected here.
            pool.apply_async(tryToFindRela, (jobN, filename, dataInputPath, ptnOutputPath, model, table))
            jobN+=1
    pool.close()
    pool.join()
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds,diff.microseconds)
def main(inputPtnPath, outputPath, confidence): start_time = datetime.now() #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table") # using new table (2014.06.16) model, table = projizz.readPrefixTreeModelWithTable( "../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table") properties = projizz.buildYagoProperties({}) pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()) t = 0 result = [] for filename in os.listdir(inputPtnPath): if ".json" in filename: result.append( pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, copy.deepcopy(properties), confidence))) t += 1 pool.close() pool.join() for res in result: r = res.get() for rela in r: for ptnId in r[rela]: if not ptnId in properties[rela]: properties[rela][ptnId] = {"total": 0, "support": 0} properties[rela][ptnId]["total"] += r[rela][ptnId]["total"] properties[rela][ptnId]["support"] += r[rela][ptnId]["support"] json.dump(properties, open(outputPath, "w")) diff = datetime.now() - start_time print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(part, revid):
    """Debug/inspection tool: for one article (identified by revid within
    data part `part`), print which patterns fire on which sentences and
    whether the relation they suggest matches the gold answer from MongoDB.

    Looks the article file up by grepping the revid under the part's input
    directory, loads its pattern hits and sentence list, then for each gold
    answer walks the sentences containing a token of the target entity name
    and prints matching pattern statistics, marking hits as (O) and misses
    as (X). Requires local MongoDB and workstation-specific /tmp2 paths.
    Output to stdout only.
    """
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)
    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})
    # find filename
    a = os.popen("grep -nr \"%s\" %s" % (revid, inputPath)).readline()
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)
    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter( projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    model, table = projizz.readPrefixTreeModelWithTable( "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    print "Part %s, RevID=%s, in %s" % (part, revid, targetFilename)
    for ans in itr:
        targetName = ans["_id"].replace("(", "").replace(")", "").split("_")  # get entity name's part
        types = ans["type"]
        answers = ans["properties"]
        print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"), targetName)
        print "Type=%s" % (types)
        print "Answer=%s" % (answers)
        for line in pattern:
            # line[0] = sentence index, line[1] = pattern hits on it.
            lineText = article[line[0]]
            named = False
            for namedToken in targetName:
                if namedToken in lineText:
                    named = True
                    break
            if not named:  # No target name in line text
                continue  # go to next line.
            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                #rfp = table[ptnId]["relations"]
                if not ptnId in st:
                    continue
                for ps in st[ptnId]:
                    # ps = (relation, {"support": n, "total": m}).
                    if float(ps[1]["support"]) / float(ps[1]["total"]) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print "#%d" % (line[0]), lineText.encode("utf-8")
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print "%s %s/%s/{%d,%d}/ %s" % ( isIn, ptnId, table[ptnId]["pattern"], ps[1]["support"], ps[1]["total"], ps[0])
                            pass
                    # select top 1
                    break  # prevent second ans
                # NOTE(review): nesting of this break reconstructed from
                # collapsed source — appears to stop after the first
                # statistically-ranked pattern per sentence; confirm.
                break
def preprocess(inputPath,inputPtnPath,outputPath,confidence):
    """Map-reduce that builds per-relation training corpora: mapper
    returns four dicts (matching lines, non-matching lines, positive and
    negative article sets), which are concatenated per relation and
    written to outputPath as <rela>.json / .other / .pos / .neg.

    Side effects: creates outputPath if needed, pool capped at 20
    workers, writes four files per relation, prints to stdout.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()
    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,inputPtnPath, model, table, confidence)) )
            #result.append( mapper(t,filename,inputPath,inputPtnPath, model, table, confidence))
            t += 1
    pool.close()
    pool.join()
    modelArticles = {}
    negAritcles = {}
    POSArticles = {}
    NEGArticles = {}
    # Reducer
    for r in result:
        # sibr/osibr: matching vs. other lines; p/n: pos vs. neg articles.
        sibr, osibr, p, n = r.get()
        for rela in sibr:
            if not rela in modelArticles:
                modelArticles[rela] = []
            modelArticles[rela] += sibr[rela]
        for rela in osibr:
            if not rela in negAritcles:
                negAritcles[rela] = []
            negAritcles[rela] += osibr[rela]
        for rela in p:
            if not rela in POSArticles:
                POSArticles[rela] = []
            POSArticles[rela] += p[rela]
        for rela in n:
            if not rela in NEGArticles:
                NEGArticles[rela] = []
            NEGArticles[rela] += n[rela]
    #
    # relation.json: [line, line, line, ....]
    #
    for rela in modelArticles:
        print rela
        projizz.jsonWrite(modelArticles[rela],os.path.join(outputPath,"%s.json" % (rela)))
    for rela in negAritcles:
        print rela
        projizz.jsonWrite(negAritcles[rela],os.path.join(outputPath,"%s.other" % (rela)))
    for rela in POSArticles:
        print rela
        projizz.jsonWrite(POSArticles[rela],os.path.join(outputPath,"%s.pos" % (rela)))
    for rela in NEGArticles:
        print rela
        projizz.jsonWrite(NEGArticles[rela],os.path.join(outputPath,"%s.neg" % (rela)))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(part,revid):
    """Debug/inspection tool, v2 of the sibling main(part, revid):
    differs in the data paths (y-part/y-ptn-part, yagoPSv2) and in
    comparing against ans["observed"] instead of ans["properties"].

    Prints, for one article identified by revid, every sentence whose
    text contains a token of the target entity name together with the
    top pattern statistic matching the entity's type domain, marking
    gold hits (O) / misses (X). Requires local MongoDB and
    workstation-specific /tmp2 paths. Output to stdout only.
    """
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)
    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid":revid})
    # find filename
    a = os.popen("grep -nr \"%s\" %s" % (revid,inputPath)).readline()
    targetFilename = a.split(":")[0].split("/")[-1]
    key = "%s.txt" % (revid)
    pattern = projizz.jsonRead(inputPtnPath+targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath+targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange();
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../yago/yagoPatternTree.table")
    print "Part %s, RevID=%s, in %s" % (part,revid,targetFilename)
    for ans in itr:
        targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        answers = ans["observed"]
        print "Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),targetName)
        print "Type=%s" % (types)
        print "Answer=%s" % (answers)
        for line in pattern:
            # line[0] = sentence index, line[1] = pattern hits on it.
            lineText = article[line[0]]
            named = False
            for namedToken in targetName:
                if namedToken in lineText:
                    named = True
                    break
            if not named: # No target name in line text
                continue # go to next line.
            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                #rfp = table[ptnId]["relations"]
                if not ptnId in st:
                    continue
                for ps in st[ptnId]:
                    # ps = (relation, {"support": n, "total": m}).
                    if float(ps[1]["support"])/float(ps[1]["total"]) > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print "#%d" % (line[0]),lineText.encode("utf-8")
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print "%s %s/%s/{%d,%d}/ %s" % (isIn,ptnId,table[ptnId]["pattern"],ps[1]["support"],ps[1]["total"],ps[0])
                            pass
                    # select top 1
                    break # prevent second ans
                # NOTE(review): nesting of this break reconstructed from
                # collapsed source — appears to stop after the first
                # statistically-ranked pattern per sentence; confirm.
                break
def preprocess(inputPath,outputPath):
    """Build a tf-idf model over the article corpus.

    Map: each worker (mapper) returns (filename, token-frequency dict) for
    one .json file. Reduce: accumulate document frequencies, write
    documentFreq.df; compute idf = log10(N/df), write idf.idf; then write
    one per-file JSON of tf*idf weights into outputPath.

    Side effects: creates outputPath if needed, pool capped at 20
    workers, multiple file writes, prints progress/timing to stdout.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    # NOTE(review): model/table are loaded but not referenced below.
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()
    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputPath,outputPath, model, table)) )
            t += 1
    pool.close()
    pool.join()
    words = {}   # document frequency per token
    idf = {}     # inverse document frequency per token
    tfs = {}     # per-file token frequencies
    # Reducer - DF
    types = 0    # number of documents seen
    for r in result:
        fn,tks = r.get()
        tfs[fn] = tks
        types += 1
        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1
    print "Doc#",types,"words#",len(words)
    projizz.jsonWrite(words,os.path.join(outputPath,"documentFreq.df"))
    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue
        idf[w] = math.log(float(types)/float(words[w]),10)
    projizz.jsonWrite(idf,os.path.join(outputPath,"idf.idf"))
    print "Write out idf file"
    # Calculate td-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue
            weight[t] = tf * idf[t]
        projizz.jsonWrite(weight,os.path.join(outputPath,fn))
        print "build",fn,"tf-idf weight"
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath,outputPath):
    """Collect per-degree pattern-occurrence statistics across the corpus
    and print a degree-by-degree summary table.

    Workers (filterFunction) each return (r, s): r maps degree -> ptnId ->
    occurrence/support info, s maps article id -> set/list of degrees seen.
    The reduce step only records which (degree, ptnId) pairs exist — the
    detailed occ/sup merging is deliberately skipped via `continue` for
    speed. After writing properties to outputPath, prints, for degrees
    1..17, the pattern count at that degree, patterns observed, and two
    article-occurrence counters, then the number of articles with no
    pattern at all.

    NOTE(review): relies on a module-level c_confidence constant and may
    KeyError on ptnDeg[degree] if a degree in 1..17 has no patterns above
    the confidence threshold — confirm with real data.
    """
    start_time = datetime.now()
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model","../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({})
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(pool.apply_async(filterFunction, (t,filename,inputPtnPath,model,table,copy.deepcopy(properties) )))
            t += 1
    pool.close()
    pool.join()
    statistics = {}
    counter = 0
    for res in result:
        r,s = res.get()
        counter += 1
        print "%d / %d ..." % (counter,len(result))
        for degree in r:
            if not degree in properties:
                properties[degree] = {}
            for ptnId in r[degree]:
                if not ptnId in properties[degree]:
                    properties[degree][ptnId] = {"occ":[],"sup":{}}
                # give up the code below for speed up
                continue
                # --- intentionally dead code below (kept for reference) ---
                for occId in r[degree][ptnId]["occ"]:
                    if not occId in properties[degree][ptnId]["occ"]:
                        properties[degree][ptnId]["occ"].append(occId)
                for rela in r[degree][ptnId]["sup"]:
                    if not rela in properties[degree][ptnId]["sup"]:
                        properties[degree][ptnId]["sup"][rela] = []
                    for supId in r[degree][ptnId]["sup"][rela]:
                        if not supId in properties[degree][ptnId]["sup"][rela]:
                            properties[degree][ptnId]["sup"][rela].append(supId)
        for aid in s:
            statistics[aid] = s[aid]
    json.dump(properties,open(outputPath,"w"))
    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
    ptnNum = 0
    occDocs = []
    # ptnDeg[d] = number of confident patterns with exactly d relations.
    ptnDeg = {}
    for ptnId in table:
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue
            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                if not deg in ptnDeg:
                    ptnDeg[deg] = 0
                ptnDeg[deg] += 1
    for degree in range(1,18):
        if not degree in properties:
            print "%d\t%d\t%d\t%d\t%d" % (degree,0,0,0,0)
        else:
            occ = 0   # articles containing this exact degree
            occs = 0  # articles containing any degree <= this one
            for a in statistics:
                ds = statistics[a]
                if degree in ds:
                    occ += 1
                for i in ds:
                    if i <= degree:
                        occs += 1
                        break
            print "%d\t%d\t%d\t%d\t%d" % (degree,ptnDeg[degree],len(properties[degree]),occ,occs)
    # Calculate the number of articles which has no pattern.
    noPtnCount = 0
    for a in statistics:
        ds = statistics[a]
        if len(ds) == 0:
            noPtnCount += 1
    print noPtnCount,len(statistics)
# -*- coding: utf-8 -*- # qcl # import sys import projizz if len(sys.argv) <= 1: print "$ python ./simpleSortedViewer.py [ps json]" else: filename = sys.argv[1] ps = projizz.jsonRead(filename) sortedp = projizz.getSortedStatistic(ps) model,table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table") for relation in sortedp: print relation for ptnId,ptnS in sortedp[relation]: print "%s\t%s %s %s" % (relation,table[ptnId]["pattern"],ptnId,ptnS)
def parseYagoData():
    """Two-phase tool over the YAGO/PATTY pattern table, selected by the
    hard-coded `phase` variable.

    phase == "build": parse the raw ./yagoRela/<relation>.txt dumps,
    attach relations/confidence/support/eval flags to each matched
    pattern, mark unmatched patterns unused, prune inconsistent relation
    tags, and write ./yagoPatternTreeWithConfidence.table.

    phase == "used" (current setting): load the already-built table,
    rebuild the relation->patterns index from used (eval-passing)
    patterns, and write per-relation sorted pattern lists to
    ./yagoSortedRela/<relation>.txt with counts of patterns above 0.75
    and 0.5 confidence; also reports the smallest per-relation maximum
    confidence. Output via files and stdout.
    """
    phase = "used"
    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "./yagoPatternTreeWithConfidence.table")
    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model)
    #print i
    # `a` starts as all pattern ids; matched ids are removed, so what
    # remains at the end of the build phase are the unused patterns.
    a = table.keys()
    originL = len(a)
    ptnByRelation = {}
    for relation in projizz.getYagoRelation():
        # Raw parsing is only needed when building the table.
        if not phase == "build":
            break
        f = open("./yagoRela/%s.txt" % (relation))
        print relation
        text = f.readline()
        # Split the raw dump into pattern-synset chunks; strip the
        # leading header element and the trailing 7 characters.
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ ptnSynsetTxt[-1][:-7] ]
        ptnByRelation[relation] = []
        evC = 0
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # pnts[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : some has, I guess it is eval result.
            if len(ptns) > 8:
                evC += 1
            # Pattern texts are ';%'-separated; drop the final character
            # of the last entry.
            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]
            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern,model)
                if pid < 0:
                    # Pattern text not found in the prefix-tree model.
                    pass
                    #print relation,pattern
                else:
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                    #print relation,pid,pattern
                    ptnS = table[pid]
                    # Only take confidence/support the first time we see
                    # this pattern.
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                    table[pid]["used"] = True
                    if len(ptns) > 8:
                        if ptns[8] == "false":
                            table[pid]["eval"] = False
                            #print pid,table[pid]["relations"],pattern,ptns[8]
                        else:
                            table[pid]["eval"] = True
        f.close()
    if phase == "build":
        # Ids never matched by any relation dump are marked unused.
        for pid in a:
            table[pid]["used"] = False
        # Prune relation tags that are inconsistent with ptnByRelation.
        for pid in table:
            if table[pid]["used"]:
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid,table[pid]["pattern"],relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid,table[pid]["pattern"],"!!!"
            else:
                pass
        projizz.jsonWrite(table,"./yagoPatternTreeWithConfidence.table")
    else:
        c = 0
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # If an eval flag (true/false) exists, keep only
                # patterns whose eval is True.
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1
        # Some small calculations
        #for relation in ptnByRelation:
        #    print relation,len(ptnByRelation[relation])
        # Find the highest (i.e. cannot be higher) confidence value:
        # the smallest among each relation's per-relation maximum.
        minC = 1.0
        minCR = ""
        for relation in ptnByRelation:
            c75 = 0
            c50 = 0
            ptns = []
            for pid in ptnByRelation[relation]:
                ptns.append(table[pid])
                ptns[-1]["pid"] = pid
            ptns.sort(key=lambda x:x["confidence"],reverse=True)
            if ptns[0]["confidence"] < minC:
                minC = ptns[0]["confidence"]
                minCR = relation
            #print relation,ptns[0]
            f = open("./yagoSortedRela/%s.txt" % (relation),"w")
            for ptn in ptns:
                if ptn["confidence"] > .75:
                    c75 += 1
                if ptn["confidence"] > .5:
                    c50 += 1
                f.write("%s\t%s\t%.3f\t%d\t%s\n" % (ptn["pid"],ptn["pattern"],ptn["confidence"],ptn["support"],ptn["relations"]))
            f.close()
            print relation,len(ptns),c75,c50
        print minCR,minC,"pattern used:",used
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence):
    """Build per-relation VSM source texts from selected pattern-instance
    files, excluding keys that belong to the test fold (5-fold CV).

    Map phase: mapper collects the article keys present in inputTestPath
    (these become notUsedKeys). Build phase: for every pattern-instance
    file whose pattern passes projizz.isPatternValidate at the given
    confidence, append its instance lines (minus test keys) to the
    owning relation, then write one <relation>.txt JSON per relation
    into outputVSMpath.

    Side effects: creates outputVSMpath if needed, pool capped at 20
    workers, writes per-relation files, prints progress to stdout.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)
    # Collect not used keys
    # because using 5-fold CV
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append( pool.apply_async( mapper, (t,filename,inputTestPath) ) )
            t += 1
    pool.close()
    pool.join()
    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks
    ### Build Model
    # Paatern Selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            # Strip the ".json" suffix to recover the pattern id.
            ptnId = filename[:-5]
            # ignore invalidate pattern
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue
            count += 1
            print count,ptnId
            ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) )
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore in testing data's key
                    if key in notUsedKeys:
                        continue
                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
            if count%100 == 0:
                print "Read",count,"files"
    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
def main(inputPtnPath, outputPath):
    """Aggregate per-file pattern results by degree and print a summary table.

    Fans filterFunction out over every .json file in inputPtnPath via a
    process pool, merges each worker's result into `properties` (keyed by
    pattern degree, then pattern id), dumps the merged mapping to
    outputPath as JSON, and finally prints one tab-separated summary row
    per degree plus the count of articles that matched no pattern.
    """
    start_time = datetime.now()
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable( "../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({})
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    # One async task per pattern file; each worker receives its own deep
    # copy of `properties` to accumulate into.
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append( pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, copy.deepcopy(properties))))
            t += 1
    pool.close()
    pool.join()

    # Merge worker results. `r` maps degree -> ptnId -> {"occ": [...], "sup": {...}};
    # `s` maps article id -> per-article data (presumably the set of degrees
    # seen in that article — TODO confirm against filterFunction).
    statistics = {}
    counter = 0
    for res in result:
        r, s = res.get()
        counter += 1
        print "%d / %d ..." % (counter, len(result))
        for degree in r:
            if not degree in properties:
                properties[degree] = {}
            for ptnId in r[degree]:
                if not ptnId in properties[degree]:
                    properties[degree][ptnId] = {"occ": [], "sup": {}}
                # give up the code below for speed up
                continue
                # NOTE(review): everything below is dead code, skipped by the
                # unconditional `continue` above; kept for reference only.
                for occId in r[degree][ptnId]["occ"]:
                    if not occId in properties[degree][ptnId]["occ"]:
                        properties[degree][ptnId]["occ"].append(occId)
                for rela in r[degree][ptnId]["sup"]:
                    if not rela in properties[degree][ptnId]["sup"]:
                        properties[degree][ptnId]["sup"][rela] = []
                    for supId in r[degree][ptnId]["sup"][rela]:
                        if not supId in properties[degree][ptnId]["sup"][rela]:
                            properties[degree][ptnId]["sup"][rela].append( supId)
        for aid in s:
            statistics[aid] = s[aid]

    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)

    # Count, per degree (= number of relations on the pattern), how many
    # used/validated patterns exceed the module-level c_confidence threshold.
    ptnNum = 0    # NOTE(review): unused
    occDocs = []  # NOTE(review): unused
    ptnDeg = {}
    for ptnId in table:
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue
            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                if not deg in ptnDeg:
                    ptnDeg[deg] = 0
                ptnDeg[deg] += 1

    # Per-degree summary row: table-pattern count, matched-pattern count,
    # number of articles containing that degree, and number of articles
    # covered by any degree <= the current one.
    for degree in range(1, 18):
        if not degree in properties:
            print "%d\t%d\t%d\t%d\t%d" % (degree, 0, 0, 0, 0)
        else:
            occ = 0
            occs = 0
            for a in statistics:
                ds = statistics[a]
                if degree in ds:
                    occ += 1
                for i in ds:
                    if i <= degree:
                        occs += 1
                        break
            # NOTE(review): ptnDeg[degree] raises KeyError if a degree appears
            # in `properties` but no table pattern has it — confirm unreachable.
            print "%d\t%d\t%d\t%d\t%d" % (degree, ptnDeg[degree], len(properties[degree]), occ, occs)

    # Calculate the number of articles which has no pattern.
    noPtnCount = 0
    for a in statistics:
        ds = statistics[a]
        if len(ds) == 0:
            noPtnCount += 1
    print noPtnCount, len(statistics)
# -*- coding: utf-8 -*- # qcl # import sys import projizz if len(sys.argv) <= 1: print "$ python ./simpleSortedViewer.py [ps json]" else: filename = sys.argv[1] ps = projizz.jsonRead(filename) sortedp = projizz.getSortedStatistic(ps) model, table = projizz.readPrefixTreeModelWithTable( "./yagoPatternTree.model", "./yagoPatternTree.table") for relation in sortedp: print relation for ptnId, ptnS in sortedp[relation]: print "%s\t%s %s %s" % (relation, table[ptnId]["pattern"], ptnId, ptnS)
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence): # Checking output path projizz.checkPath(outputVSMpath) model, table = projizz.readPrefixTreeModelWithTable( "../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table") # Processes pool proceessorNumber = multiprocessing.cpu_count() if proceessorNumber > 20: proceessorNumber = 20 pool = multiprocessing.Pool(processes=proceessorNumber) # Collect not used keys # because using 5-fold CV t = 0 result = [] for filename in os.listdir(inputTestPath): if ".json" in filename: result.append( pool.apply_async(mapper, (t, filename, inputTestPath))) t += 1 pool.close() pool.join() notUsedKeys = [] for r in result: ks = r.get() notUsedKeys += ks ### Build Model # Paatern Selection modelArticles = projizz.buildYagoProperties([]) words = [] count = 0 for filename in os.listdir(inputSPIpath): if ".json" in filename: ptnId = filename[:-5] # ignore invalidate pattern if not projizz.isPatternValidate( ptnId, table, confidence=confidence): continue count += 1 print count, ptnId ptnInstance = projizz.jsonRead(os.path.join( inputSPIpath, filename)) for rela in ptnInstance: for key in ptnInstance[rela]: # ignore in testing data's key if key in notUsedKeys: continue for line in ptnInstance[rela][key]: modelArticles[rela].append(line) if count % 100 == 0: print "Read", count, "files" for relation in modelArticles: print relation projizz.jsonWrite(modelArticles[relation], os.path.join(outputVSMpath, "%s.txt" % (relation)))