# Module-level imports assumed by the scripts collected below; projizz is the
# project's own helper module, and rdflib is only needed by the schema-parsing
# main(inputFile) further down.
import copy
import json
import multiprocessing
import os
from datetime import datetime

import rdflib

import projizz


# Merge the per-split pattern statistics into leave-one-out models: the model
# written for each split aggregates the "total" and "support" counts of every
# other split.
def buildModel(inputPtnPath, outputPath):
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    statisticFiles = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            statisticFiles.append(filename)
    statisticFiles.sort()

    for filename in statisticFiles:
        properties = projizz.buildYagoProperties({})
        for anotherFilename in statisticFiles:
            if filename == anotherFilename:
                continue
            p = json.load(open(os.path.join(inputPtnPath, anotherFilename), "r"))
            for rela in p:
                for ptnId in p[rela]:
                    if not ptnId in properties[rela]:
                        properties[rela][ptnId] = {"total": 0, "support": 0}
                    properties[rela][ptnId]["total"] += p[rela][ptnId]["total"]
                    properties[rela][ptnId]["support"] += p[rela][ptnId]["support"]
        #for relation in properties:
        #    ptns = properties[relation]
        #    print ptns.items()
        print filename
        json.dump(properties, open(os.path.join(outputPath, filename), "w"))
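# Hedged illustration (not from the original source): each statistic file read
# by buildModel() above is expected to map relations to pattern ids and their
# counters, which is exactly what the merging loop accesses. The relation name
# and pattern id below are placeholders.
exampleSplitStatistic = {
    "hasCapital": {
        "1234": {"total": 3, "support": 1},
    },
}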
def main(inputPtnPath, outputPath, pspath, inputPath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath):
    start_time = datetime.now()
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, copy.deepcopy(properties))))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for rela in r:
            for ptnId in r[rela]:
                if not ptnId in properties[rela]:
                    properties[rela][ptnId] = {"total": 0, "support": 0}
                properties[rela][ptnId]["total"] += r[rela][ptnId]["total"]
                properties[rela][ptnId]["support"] += r[rela][ptnId]["support"]

    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
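# filterFunction() is defined elsewhere in the project and is not shown here.
# Judging from the apply_async call and the merge loop in the statistic-building
# main() above, each worker receives a deep copy of the empty properties dict
# and returns it filled with per-relation pattern counters. The stub below is a
# hedged sketch of that contract only, not the real implementation.
def filterFunctionStub(jobId, filename, inputPtnPath, model, table, partAns):
    # Hypothetical result: one pattern seen three times, supported once.
    partAns["hasCapital"] = {"1234": {"total": 3, "support": 1}}
    return partAns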
def main(inputPtnPath, outputPath, pspath, inputPath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename, nbcPath):
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8
    pool = multiprocessing.Pool(processes=cpuCount)
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            #result.append(filterFunction(t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,classifiers))
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence, nbcPath)))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath, keyname)):
            os.mkdir(os.path.join(outputPath, keyname))
        projizz.jsonWrite(p, os.path.join(outputPath, keyname, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(mapper, (t, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData)))
            #result.append( mapper( t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData ))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p, os.path.join(outputPath, keydirName, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
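# mapper() is defined elsewhere in the project. From the aggregation loop in
# the VSM evaluation main() above, each worker appears to return a dict keyed
# by an integer experiment id (later written to a "vsm-<id>" directory),
# mapping relations to lists of true positives, false positives, and false
# negatives, plus a "produced" entry that the merge skips. The shape below is a
# hedged illustration only; the ids and list contents are placeholders.
exampleMapperResult = {
    100: {
        "hasCapital": {"tp": ["article-1"], "fp": [], "fn": ["article-2"]},
        "produced": [],  # present in the worker output but skipped by the merge
    },
}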
def main(inputPtnPath, outputPath, pspath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))

    # Keep, for each relation, up to the top 200 patterns that map to exactly
    # one relation.
    validate = []
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 200:
                break

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, validate)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename):
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence)))
            t += 1
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath, keyname)):
            os.mkdir(os.path.join(outputPath, keyname))
        projizz.jsonWrite(p, os.path.join(outputPath, keyname, outputFilename))
        print "start write out to %s" % (os.path.join(outputPath, keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
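# Hedged invocation sketch for the confidence-sweep main() above; every path,
# the 0.5 threshold, and the output filename are placeholders rather than
# values taken from the original project.
if __name__ == "__main__":
    main(inputPtnPath="./test-ptn/",
         outputPath="./exp-results/",
         pspath="./ptn-statistic.json",
         inputPath="./articles/",
         confidence=0.5,
         outputFilename="baseline.json")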
def main(inputFile):
    fp = open(inputFile, 'r')
    yagoRelations = projizz.buildYagoProperties({})
    c = []
    j = 0
    for line in fp:
        if line[0] == "#" or line[0] == "@" or line[0] == "\n":
            continue
        #if line[0:29] != "<http://dbpedia.org/resource/":
        #    continue
        prefix = "@prefix dbp: <http://dbpedia.org/ontology/> .\n@prefix owl: <http://www.w3.org/2002/07/owl#> .\n@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@prefix skos: <http://www.w3.org/2004/02/skos/core#> .\n@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."
        g = rdflib.Graph()
        r = g.parse(data=prefix + line, format="n3")
        for instance in r:
            if not "rdf-schema#" in instance[1] or not instance[0][42:] in yagoRelations:
                continue
            j += 1
            yagoRelation = "%s" % (instance[0][42:])
            relaProperty = "%s" % (instance[1][37:])
            if "wl#Thin" in "%s" % (instance[2]):
                content = "wl:Thin"
            else:
                content = "%s" % (instance[2][42:])
            if not "domain" in relaProperty and not "range" in relaProperty:
                continue
            yagoRelations[yagoRelation][relaProperty] = content

    print "Read", j, "lines. Finished."

    yagoRelations["produced"]["domain"] = ""
    yagoRelations["produced"]["range"] = ""
    for prop in yagoRelations:
        print prop, yagoRelations[prop]["domain"], yagoRelations[prop]["range"]

    f = open("../projizz/yagoRelation.py", "w")
    f.write("# -*- coding: utf-8 -*-\n# qcl, automatically generated by ../yago/parse.yago.schema.py\n# the domain and range of each yago relation used in projizz\n")
    f.write("yagoDomainRange = %s\n" % (yagoRelations))
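# Hedged illustration of what the generated ../projizz/yagoRelation.py looks
# like, based only on the f.write() calls above: the fixed header comment
# followed by a single yagoDomainRange dict mapping each relation to the domain
# and range strings extracted from the schema. The relation and class names in
# the dict are placeholders.
#
#   # -*- coding: utf-8 -*-
#   # qcl, automatically generated by ../yago/parse.yago.schema.py
#   # the domain and range of each yago relation used in projizz
#   yagoDomainRange = {'hasCapital': {'domain': 'Region', 'range': 'City'},
#                      'produced': {'domain': '', 'range': ''}}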
def main(inputPtnPath, outputPath, pspath):
    model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model", "./yagoPatternTree.table")
    properties = projizz.buildYagoProperties({"tp": [], "tn": [], "fp": [], "fn": []})
    sp = projizz.getSortedStatistic(projizz.jsonRead(pspath))

    # Keep, for each relation, up to the top 100 patterns that map to exactly
    # one relation.
    validate = []
    for relation in sp:
        count = 0
        for ptnId, ptnS in sp[relation]:
            ptnData = table[ptnId]
            if len(ptnData["relations"]) == 1:
                count += 1
                validate.append(ptnId)
            if count >= 100:
                break

    start_time = datetime.now()
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, partAns, validate)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for m in r:
            properties[m]["tp"] += r[m]["tp"]
            properties[m]["tn"] += r[m]["tn"]
            properties[m]["fp"] += r[m]["fp"]
            properties[m]["fn"] += r[m]["fn"]

    print "start write out to %s" % (outputPath)
    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, confidence):
    start_time = datetime.now()
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    # using new table (2014.06.16)
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, copy.deepcopy(properties), confidence)))
            t += 1
    pool.close()
    pool.join()

    for res in result:
        r = res.get()
        for rela in r:
            for ptnId in r[rela]:
                if not ptnId in properties[rela]:
                    properties[rela][ptnId] = {"total": 0, "support": 0}
                properties[rela][ptnId]["total"] += r[rela][ptnId]["total"]
                properties[rela][ptnId]["support"] += r[rela][ptnId]["support"]

    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    proceessorNumber = multiprocessing.cpu_count()
    if proceessorNumber > 20:
        proceessorNumber = 20
    pool = multiprocessing.Pool(processes=proceessorNumber)

    # Collect the keys that must not be used,
    # because of 5-fold cross-validation.
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build Model
    # Pattern selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]
            # Skip patterns that do not pass the confidence check.
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue
            count += 1
            print count, ptnId
            ptnInstance = projizz.jsonRead(os.path.join(inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # Skip keys that belong to the testing data.
                    if key in notUsedKeys:
                        continue
                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)
            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation], os.path.join(outputVSMpath, "%s.txt" % (relation)))
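# mapper() used by generate() is defined elsewhere in the project; from the way
# its results are consumed above, each worker returns the list of article keys
# found in one testing-split file, which generate() then excludes from the VSM
# training corpus. The stub below is a hedged sketch of that contract only.
def generateMapperStub(jobId, filename, inputTestPath):
    testData = projizz.jsonRead(os.path.join(inputTestPath, filename))
    return list(testData.keys())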
def main(inputPtnPath, outputPath):
    start_time = datetime.now()
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({})

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(pool.apply_async(filterFunction, (t, filename, inputPtnPath, model, table, copy.deepcopy(properties))))
            t += 1
    pool.close()
    pool.join()

    statistics = {}
    counter = 0
    for res in result:
        r, s = res.get()
        counter += 1
        print "%d / %d ..." % (counter, len(result))
        for degree in r:
            if not degree in properties:
                properties[degree] = {}
            for ptnId in r[degree]:
                if not ptnId in properties[degree]:
                    properties[degree][ptnId] = {"occ": [], "sup": {}}
                # give up the code below for speed up
                continue
                for occId in r[degree][ptnId]["occ"]:
                    if not occId in properties[degree][ptnId]["occ"]:
                        properties[degree][ptnId]["occ"].append(occId)
                for rela in r[degree][ptnId]["sup"]:
                    if not rela in properties[degree][ptnId]["sup"]:
                        properties[degree][ptnId]["sup"][rela] = []
                    for supId in r[degree][ptnId]["sup"][rela]:
                        if not supId in properties[degree][ptnId]["sup"][rela]:
                            properties[degree][ptnId]["sup"][rela].append(supId)
        for aid in s:
            statistics[aid] = s[aid]

    json.dump(properties, open(outputPath, "w"))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)

    ptnNum = 0
    occDocs = []
    ptnDeg = {}
    # c_confidence is a module-level confidence threshold defined elsewhere in
    # the original script.
    for ptnId in table:
        if table[ptnId]["used"]:
            if "eval" in table[ptnId] and not table[ptnId]["eval"]:
                continue
            if table[ptnId]["confidence"] > c_confidence:
                deg = len(table[ptnId]["relations"])
                if not deg in ptnDeg:
                    ptnDeg[deg] = 0
                ptnDeg[deg] += 1

    for degree in range(1, 18):
        if not degree in properties:
            print "%d\t%d\t%d\t%d\t%d" % (degree, 0, 0, 0, 0)
        else:
            occ = 0
            occs = 0
            for a in statistics:
                ds = statistics[a]
                if degree in ds:
                    occ += 1
                for i in ds:
                    if i <= degree:
                        occs += 1
                        break
            print "%d\t%d\t%d\t%d\t%d" % (degree, ptnDeg[degree], len(properties[degree]), occ, occs)

    # Calculate the number of articles which have no pattern.
    noPtnCount = 0
    for a in statistics:
        ds = statistics[a]
        if len(ds) == 0:
            noPtnCount += 1
    print noPtnCount, len(statistics)
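# Reading guide for the tab-separated table printed by the degree-statistics
# main() above (a descriptive note inferred from the print format, not taken
# from the original source):
#   column 1 - pattern degree (number of relations a pattern maps to), 1..17
#   column 2 - patterns in the table with that degree that are used, pass the
#              "eval" flag, and exceed the c_confidence threshold
#   column 3 - distinct pattern ids of that degree actually observed by the
#              workers
#   column 4 - articles whose per-article statistics contain that degree
#   column 5 - articles containing at least one pattern of degree <= the
#              current degree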