def main(inputPtnPath, outputPath, pspath, inputPath):
    """Evaluate every pattern file in parallel and dump the merged counters.

    One ``filterFunction`` job is submitted to a process pool for each entry
    of ``inputPtnPath`` whose name contains ".json". Each job result is read
    as ``r[relation][counter]`` and its "tp"/"tn"/"fp"/"fn" values are
    appended to the matching entry of the overall ``properties`` dict, which
    is finally written to ``outputPath`` as JSON.

    inputPtnPath -- directory that is listed for pattern files
    outputPath   -- path of the JSON file to write
    pspath       -- JSON file handed to projizz.getSortedPatternStatistic
    inputPath    -- passed through to filterFunction unchanged
    """
    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0  # running job index handed to filterFunction
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            # Every job gets its own empty copy of the counter structure.
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(
                filterFunction,
                (t, filename, inputPtnPath, model, table, partAns, st,
                 domainRange, inputPath)))
            t += 1
    pool.close()
    pool.join()

    # Merge the per-file answers; get() re-raises any worker exception.
    for res in result:
        r = res.get()
        for m in r:
            for counter in ("tp", "tn", "fp", "fn"):
                properties[m][counter] += r[m][counter]

    print("start write out to %s" % (outputPath))
    # The context manager flushes and closes the file even if dump() fails.
    with open(outputPath, "w") as out:
        json.dump(properties, out)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (otherwise 3s + 5000us reads "3.5000") and
    # include whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath, outputPath, pspath, inputPath):
    """Evaluate every pattern file in parallel and dump the merged counters.

    Uses the confidence-annotated pattern table
    ("../patty/yagoPatternTreeWithConfidence.table"). One ``filterFunction``
    job is submitted per entry of ``inputPtnPath`` whose name contains
    ".json"; each result is read as ``r[relation][counter]`` and its
    "tp"/"tn"/"fp"/"fn" values are appended to ``properties``, which is then
    written to ``outputPath`` as JSON.

    inputPtnPath -- directory that is listed for pattern files
    outputPath   -- path of the JSON file to write
    pspath       -- JSON file handed to projizz.getSortedPatternStatistic
    inputPath    -- passed through to filterFunction unchanged
    """
    model, table = projizz.readPrefixTreeModelWithTable(
        "./yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    pending = []
    t = 0  # running job index handed to filterFunction
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        # Every job gets its own empty copy of the counter structure.
        partAns = copy.deepcopy(properties)
        job_args = (t, filename, inputPtnPath, model, table, partAns, st,
                    domainRange, inputPath)
        pending.append(pool.apply_async(filterFunction, job_args))
        t += 1
    pool.close()
    pool.join()

    # Merge the per-file answers; get() re-raises any worker exception.
    for res in pending:
        r = res.get()
        for m in r:
            for counter in ("tp", "tn", "fp", "fn"):
                properties[m][counter] += r[m][counter]

    print("start write out to %s" % (outputPath))
    # The context manager flushes and closes the file even if dump() fails.
    with open(outputPath, "w") as out:
        json.dump(properties, out)

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (otherwise 3s + 5000us reads "3.5000") and
    # include whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename, nbcPath):
    """Run ``filterFunction`` over all pattern files and write one result per key.

    Jobs are submitted to a process pool of at most 8 workers, one per entry
    of ``inputPtnPath`` whose name contains ".json". Each job result is read
    as ``r[keyname][relation][counter]``; results are merged per ``keyname``
    and written to ``<outputPath>/<keyname>/<outputFilename>`` through
    projizz.jsonWrite.

    inputPtnPath   -- directory that is listed for pattern files
    outputPath     -- root output directory (created when missing)
    pspath         -- JSON file handed to projizz.getSortedPatternStatistic
    inputPath      -- passed through to filterFunction unchanged
    confidence     -- passed through to filterFunction unchanged
    outputFilename -- file name used inside every per-key directory
    nbcPath        -- passed through to filterFunction unchanged
    """
    counters = ("tp", "fp", "fn", "et1", "et2", "et3")

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago//yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties(
        {"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    # Never start more than 8 worker processes.
    cpuCount = min(multiprocessing.cpu_count(), 8)
    pool = multiprocessing.Pool(processes=cpuCount)
    t = 0  # running job index handed to filterFunction
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            # Every job gets its own empty copy of the counter structure.
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(
                filterFunction,
                (t, filename, inputPtnPath, model, table, partAns, st,
                 domainRange, inputPath, confidence, nbcPath)))
            t += 1
    pool.close()
    pool.join()

    # Merge the per-file answers, one accumulator per result key.
    expResult = {}
    for res in result:
        r = res.get()  # re-raises any exception from the worker
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                for counter in counters:
                    expResult[keyname][m][counter] += r[keyname][m][counter]

    # makedirs also creates missing parents, so a long run is not lost at the
    # very end just because the parent of outputPath does not exist yet.
    if not os.path.isdir(outputPath):
        os.makedirs(outputPath)

    for keyname in expResult:
        keyDir = os.path.join(outputPath, keyname)
        if not os.path.isdir(keyDir):
            os.makedirs(keyDir)
        print("start write out to %s" % (keyDir))
        projizz.jsonWrite(expResult[keyname], os.path.join(keyDir, outputFilename))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (otherwise 3s + 5000us reads "3.5000") and
    # include whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    """Run ``mapper`` over all pattern files and write one result per key.

    One ``mapper`` job is submitted to a process pool for each entry of
    ``inputPtnPath`` whose name contains ".json". Each job result is read as
    ``r[keyname][relation][counter]``; the "tp"/"fp"/"fn" values are merged
    per ``keyname`` and written to
    ``<outputPath>/vsm-<keyname>/<outputFilename>`` through projizz.jsonWrite.
    ``keyname`` is formatted with "%d", so it has to be numeric.

    inputPath      -- passed through to mapper unchanged
    inputPtnPath   -- directory that is listed for pattern files
    vsmPath        -- argument of projizz.getVSMmodels
    confidence     -- passed through to mapper unchanged
    psfile         -- JSON file handed to projizz.getSortedPatternStatistic
    outputPath     -- root output directory (handed to projizz.checkPath)
    outputFilename -- file name used inside every per-key directory
    """
    # Only the table is needed here; the prefix-tree model is not used.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0  # running job index handed to mapper
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            # Every job gets its own empty copy of the counter structure.
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(
                mapper,
                (t, filename, inputPath, inputPtnPath, table, st, partAns,
                 domainRange, confidence, vsmData)))
            t += 1
    pool.close()
    pool.join()

    # Merge the per-file answers, one accumulator per result key.
    expResult = {}
    for res in result:
        r = res.get()  # re-raises any exception from the worker
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                for counter in ("tp", "fp", "fn"):
                    expResult[keyname][m][counter] += r[keyname][m][counter]

    for keyname in expResult:
        keydirName = "vsm-%d" % (keyname)
        keyDir = os.path.join(outputPath, keydirName)
        projizz.checkPath(keyDir)
        print("start write out to %s" % (keyDir))
        projizz.jsonWrite(expResult[keyname], os.path.join(keyDir, outputFilename))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (otherwise 3s + 5000us reads "3.5000") and
    # include whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename):
    """Run ``filterFunction`` over all pattern files and write one result per key.

    One job is submitted to a process pool for each entry of ``inputPtnPath``
    whose name contains ".json". Each job result is read as
    ``r[keyname][relation][counter]``; results are merged per ``keyname`` and
    written to ``<outputPath>/<keyname>/<outputFilename>`` through
    projizz.jsonWrite.

    inputPtnPath   -- directory that is listed for pattern files
    outputPath     -- root output directory (created when missing)
    pspath         -- JSON file handed to projizz.getSortedPatternStatistic
    inputPath      -- passed through to filterFunction unchanged
    confidence     -- passed through to filterFunction unchanged
    outputFilename -- file name used inside every per-key directory
    """
    counters = ("tp", "fp", "fn", "et1", "et2", "et3")

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago//yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties(
        {"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    pending = []
    t = 0  # running job index handed to filterFunction
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        # Every job gets its own empty copy of the counter structure.
        partAns = copy.deepcopy(properties)
        job_args = (t, filename, inputPtnPath, model, table, partAns, st,
                    domainRange, inputPath, confidence)
        pending.append(pool.apply_async(filterFunction, job_args))
        t += 1
    pool.close()
    pool.join()

    # Merge the per-file answers, one accumulator per result key.
    expResult = {}
    for res in pending:
        r = res.get()  # re-raises any exception from the worker
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            merged = expResult[keyname]
            for m in r[keyname]:
                if m == "produced":
                    continue
                for counter in counters:
                    merged[m][counter] += r[keyname][m][counter]

    # makedirs also creates missing parents, so a long run is not lost at the
    # very end just because the parent of outputPath does not exist yet.
    if not os.path.isdir(outputPath):
        os.makedirs(outputPath)

    for keyname in expResult:
        keyDir = os.path.join(outputPath, keyname)
        if not os.path.isdir(keyDir):
            os.makedirs(keyDir)
        print("start write out to %s" % (keyDir))
        projizz.jsonWrite(expResult[keyname], os.path.join(keyDir, outputFilename))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (otherwise 3s + 5000us reads "3.5000") and
    # include whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    """Run ``mapper`` over all pattern files and write one result per key.

    One ``mapper`` job is submitted to a process pool for each entry of
    ``inputPtnPath`` whose name contains ".json". Each job result is read as
    ``r[keyname][relation][counter]``; the "tp"/"fp"/"fn" values are merged
    per ``keyname`` and written to
    ``<outputPath>/vsm-<keyname>/<outputFilename>`` through projizz.jsonWrite.
    ``keyname`` is formatted with "%d", so it has to be numeric.

    inputPath      -- passed through to mapper unchanged
    inputPtnPath   -- directory that is listed for pattern files
    vsmPath        -- argument of projizz.getVSMmodels
    confidence     -- passed through to mapper unchanged
    psfile         -- JSON file handed to projizz.getSortedPatternStatistic
    outputPath     -- root output directory (handed to projizz.checkPath)
    outputFilename -- file name used inside every per-key directory
    """
    # Only the table is needed here; the prefix-tree model is not used.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    pending = []
    t = 0  # running job index handed to mapper
    for filename in os.listdir(inputPtnPath):
        if ".json" not in filename:
            continue
        # Every job gets its own empty copy of the counter structure.
        partAns = copy.deepcopy(properties)
        job_args = (t, filename, inputPath, inputPtnPath, table, st, partAns,
                    domainRange, confidence, vsmData)
        pending.append(pool.apply_async(mapper, job_args))
        t += 1
    pool.close()
    pool.join()

    # Merge the per-file answers, one accumulator per result key.
    expResult = {}
    for res in pending:
        r = res.get()  # re-raises any exception from the worker
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            merged = expResult[keyname]
            for m in r[keyname]:
                if m == "produced":
                    continue
                for counter in ("tp", "fp", "fn"):
                    merged[m][counter] += r[keyname][m][counter]

    for keyname in expResult:
        keydirName = "vsm-%d" % (keyname)
        keyDir = os.path.join(outputPath, keydirName)
        projizz.checkPath(keyDir)
        print("start write out to %s" % (keyDir))
        projizz.jsonWrite(expResult[keyname], os.path.join(keyDir, outputFilename))

    diff = datetime.now() - start_time
    # Zero-pad the microseconds (otherwise 3s + 5000us reads "3.5000") and
    # include whole days, which timedelta.seconds alone leaves out.
    print("Spend %d.%06d seconds" % (diff.days * 86400 + diff.seconds, diff.microseconds))
def main(part, revid):
    """Print the candidate relations found for one article revision.

    The article file is located by grepping ``revid`` under the part's input
    directory. For the first answer document returned by the database query,
    every sentence that contains a token of the target's name is scanned and,
    for each pattern in it, the first pattern statistic is printed when its
    support ratio is positive and the relation's domain is one of the
    target's types. "(O)" marks a relation listed in ans["observed"], "(X)"
    one that is not.

    part  -- part identifier substituted into the data paths
    revid -- revision id used for the database query and the grep
    """
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/y-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/y-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv2/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find filename: the first grep hit has the form "<path>:<lineno>:<text>"
    pipe = os.popen("grep -nr \"%s\" %s" % (revid, inputPath))
    try:
        a = pipe.readline()
    finally:
        pipe.close()  # do not leak the pipe to the grep process
    targetFilename = a.split(":")[0].split("/")[-1]
    if not targetFilename:
        # Without a hit the paths below would point at the directory itself.
        print("RevID=%s not found in %s" % (revid, inputPath))
        return

    key = "%s.txt" % (revid)
    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    # Only the table is needed here; the prefix-tree model is not used.
    _, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part, revid, targetFilename))

    # NOTE(review): the nesting of the two trailing breaks was reconstructed
    # from their comments ("select top 1" / "prevent second ans") -- confirm.
    for ans in itr:
        # get entity name's part
        targetName = ans["_id"].replace("(", "").replace(")", "").split("_")
        types = ans["type"]
        answers = ans["observed"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"), targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text: go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if ptnId not in st:
                    continue

                for ps in st[ptnId]:
                    total = float(ps[1]["total"])
                    # A zero total would otherwise raise ZeroDivisionError.
                    if total != 0 and float(ps[1]["support"]) / total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0], lineText.encode("utf-8")))
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print("%s %s/%s/{%d,%d}/ %s" % (isIn, ptnId, table[ptnId]["pattern"], ps[1]["support"], ps[1]["total"], ps[0]))
                    # select top 1
                    break

        # prevent second ans
        break
def main(part, revid):
    """Print the candidate relations found for one article revision.

    The article file is located by grepping ``revid`` under the part's input
    directory. For the first answer document returned by the database query,
    every sentence that contains a token of the target's name is scanned and,
    for each pattern in it, the first pattern statistic is printed when its
    support ratio is positive and the relation's domain is one of the
    target's types. "(O)" marks a relation listed in ans["properties"], "(X)"
    one that is not.

    part  -- part identifier substituted into the data paths
    revid -- revision id used for the database query and the grep
    """
    # Paths (on NLG workstation)
    inputPath = "/tmp2/ccli/yago-part-%s/" % (part)
    inputPtnPath = "/tmp2/ccli/yago-ptn-part-%s/" % (part)
    spPath = "../yago/yagoPSv1/ps.%s.json" % (part)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    itr = collection.find({"revid": revid})

    # find filename: the first grep hit has the form "<path>:<lineno>:<text>"
    pipe = os.popen("grep -nr \"%s\" %s" % (revid, inputPath))
    try:
        a = pipe.readline()
    finally:
        pipe.close()  # do not leak the pipe to the grep process
    targetFilename = a.split(":")[0].split("/")[-1]
    if not targetFilename:
        # Without a hit the paths below would point at the directory itself.
        print("RevID=%s not found in %s" % (revid, inputPath))
        return

    key = "%s.txt" % (revid)
    pattern = projizz.jsonRead(inputPtnPath + targetFilename)[key]
    article = projizz.articleSimpleSentenceFileter(
        projizz.jsonRead(inputPath + targetFilename)[key])
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(spPath))
    domainRange = projizz.getYagoRelationDomainRange()
    # Only the table is needed here; the prefix-tree model is not used.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")

    print("Part %s, RevID=%s, in %s" % (part, revid, targetFilename))

    # NOTE(review): the nesting of the two trailing breaks was reconstructed
    # from their comments ("select top 1" / "prevent second ans") -- confirm.
    for ans in itr:
        # get entity name's part
        targetName = ans["_id"].replace("(", "").replace(")", "").split("_")
        types = ans["type"]
        answers = ans["properties"]

        print("Target=%s\nTarget token=%s" % (ans["_id"].encode("utf-8"),
                                              targetName))
        print("Type=%s" % (types))
        print("Answer=%s" % (answers))

        for line in pattern:
            lineText = article[line[0]]

            # No target name in line text: go to next line.
            if not any(namedToken in lineText for namedToken in targetName):
                continue

            for ptn in line[1]:
                ptnId = "%d" % (ptn[0])
                if ptnId not in st:
                    continue

                for ps in st[ptnId]:
                    total = float(ps[1]["total"])
                    # A zero total would otherwise raise ZeroDivisionError.
                    if total != 0 and float(ps[1]["support"]) / total > 0:
                        if domainRange[ps[0]]["domain"] in types:
                            print("#%d %s" % (line[0], lineText.encode("utf-8")))
                            isIn = "(X)"
                            if ps[0] in answers:
                                isIn = "(O)"
                            print("%s %s/%s/{%d,%d}/ %s" % (
                                isIn, ptnId, table[ptnId]["pattern"],
                                ps[1]["support"], ps[1]["total"], ps[0]))
                    # select top 1
                    break

        # prevent second ans
        break
# -*- coding:utf-8 -*-
# qcl
import projizz

dr = projizz.getYagoRelationDomainRange()

# result maps relation name -> (domain, range, number of lines in the
# relation's file under ../patty/yagoSortedRela/).
result = {}
for r in dr:
    if r == "produced":
        continue

    # Count the lines of the relation's file; "with" closes the handle even
    # if reading fails part-way.
    with open("../patty/yagoSortedRela/%s.txt" % (r), "r") as f:
        l = sum(1 for _ in f)

    domain = dr[r]["domain"]
    rang = dr[r]["range"]

    # Any range containing "wl:" is reported as "Thing".
    if "wl:" in rang:
        rang = "Thing"

    # For wordnet names keep only the first two "_"-separated parts and join
    # them with a literal backslash-underscore ("\\_" is the same string the
    # old "\_" produced, without the invalid-escape warning).
    if "wordnet_" in domain:
        domain = domain.split("_")
        domain = "%s\\_%s" % (domain[0], domain[1])

    if "wordnet_" in rang:
        rang = rang.split("_")
        rang = "%s\\_%s" % (rang[0], rang[1])

    result[r] = (domain, rang, l)