def preprocess(inputPath, inputPtnPath, outputPath):
    """Regroup mapper output by pattern id and write one JSON file per pattern.

    Every ``*.json`` entry of ``inputPtnPath`` is dispatched to ``mapper`` on
    a process pool capped at 20 workers.  Each mapper result is read as a
    nested mapping ``key -> ptnId -> rela -> [inst, ...]`` and regrouped into
    ``ptnId -> rela -> key -> [inst, ...]``.

    Output format: ``outputPath/PTNID.json`` holds ``rela -> key -> [inst]``
    and is written through ``projizz.jsonWrite``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory whose ``*.json`` entries become mapper jobs.
        outputPath: output directory; handed to ``projizz.checkPath`` first.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()

    # Processes pool
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), 20))
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPtnPath) if f.endswith(".json")]
    result = [
        pool.apply_async(
            mapper, (t, filename, inputPath, inputPtnPath, model, table))
        for t, filename in enumerate(jobFiles)]
    pool.close()
    pool.join()

    # Reducer
    patternInstances = {}
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                byRelation = patternInstances.setdefault(ptnId, {})
                for rela in sibf[key][ptnId]:
                    # A relation entry is only created once it has an instance.
                    for inst in sibf[key][ptnId][rela]:
                        byRelation.setdefault(rela, {}).setdefault(
                            key, []).append(inst)

    # Write to files
    for ptnId in patternInstances:
        projizz.jsonWrite(
            patternInstances[ptnId],
            os.path.join(outputPath, "%s.json" % (ptnId)))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):
    """Run ``mapper`` over every pattern file and merge the tp/fp/fn results.

    Each ``*.json`` entry of ``inputPtnPath`` becomes one ``mapper`` job on a
    pool with one worker per CPU.  Every job receives its own deep copy of the
    ``projizz.buildYagoProperties`` template.  Mapper results are read as
    ``keyname -> relation -> {"tp", "fp", "fn"}`` and accumulated per keyname,
    skipping the relation ``"produced"``.  Each keyname's merged result is
    written to ``outputPath/vsm-KEYNAME/outputFilename``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory whose ``*.json`` entries become mapper jobs.
        vsmPath: passed to ``projizz.getVSMmodels``.
        confidence: forwarded unchanged to ``mapper``.
        psfile: JSON file read for ``projizz.getSortedPatternStatistic``.
        outputPath: root output directory.
        outputFilename: file name written inside each ``vsm-KEYNAME`` folder.
    """
    # Only the table is used below; the prefix-tree model itself is not needed.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)
    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPtnPath) if f.endswith(".json")]
    result = []
    for t, filename in enumerate(jobFiles):
        partAns = copy.deepcopy(properties)
        result.append(pool.apply_async(
            mapper,
            (t, filename, inputPath, inputPtnPath, table, st, partAns,
             domainRange, confidence, vsmData)))
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            merged = expResult[keyname]
            for m in r[keyname]:
                if m == "produced":
                    continue
                for field in ("tp", "fp", "fn"):
                    merged[m][field] += r[keyname][m][field]

    for keyname in expResult:
        # "%d" means keyname must be numeric.
        keydir = os.path.join(outputPath, "vsm-%d" % (keyname))
        projizz.checkPath(keydir)
        projizz.jsonWrite(
            expResult[keyname], os.path.join(keydir, outputFilename))
        print("start write out to %s" % (keydir))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def preprocess(inputPath, inputPtnPath, outputPath):
    """Regroup mapper output by pattern id and write one JSON file per pattern.

    Every ``*.json`` entry of ``inputPtnPath`` is dispatched to ``mapper`` on
    a process pool capped at 20 workers.  Each mapper result is read as a
    nested mapping ``key -> ptnId -> rela -> [inst, ...]`` and regrouped into
    ``ptnId -> rela -> key -> [inst, ...]``.

    Output format: ``outputPath/PTNID.json`` holds ``rela -> key -> [inst]``
    and is written through ``projizz.jsonWrite``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory whose ``*.json`` entries become mapper jobs.
        outputPath: output directory; handed to ``projizz.checkPath`` first.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()

    # Processes pool
    workerCount = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=workerCount)
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPtnPath) if f.endswith(".json")]
    result = [
        pool.apply_async(
            mapper, (t, filename, inputPath, inputPtnPath, model, table))
        for t, filename in enumerate(jobFiles)]
    pool.close()
    pool.join()

    # Reducer
    patternInstances = {}
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                byRelation = patternInstances.setdefault(ptnId, {})
                for rela in sibf[key][ptnId]:
                    # A relation entry is only created once it has an instance.
                    for inst in sibf[key][ptnId][rela]:
                        byRelation.setdefault(rela, {}).setdefault(
                            key, []).append(inst)

    # Write to files
    for ptnId in patternInstances:
        projizz.jsonWrite(
            patternInstances[ptnId],
            os.path.join(outputPath, "%s.json" % (ptnId)))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath,
         outputFilename):
    """Run ``mapper`` over every pattern file and merge the tp/fp/fn results.

    Each ``*.json`` entry of ``inputPtnPath`` becomes one ``mapper`` job on a
    pool with one worker per CPU.  Every job receives its own deep copy of the
    ``projizz.buildYagoProperties`` template.  Mapper results are read as
    ``keyname -> relation -> {"tp", "fp", "fn"}`` and accumulated per keyname,
    skipping the relation ``"produced"``.  Each keyname's merged result is
    written to ``outputPath/vsm-KEYNAME/outputFilename``.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory whose ``*.json`` entries become mapper jobs.
        vsmPath: passed to ``projizz.getVSMmodels``.
        confidence: forwarded unchanged to ``mapper``.
        psfile: JSON file read for ``projizz.getSortedPatternStatistic``.
        outputPath: root output directory.
        outputFilename: file name written inside each ``vsm-KEYNAME`` folder.
    """
    # Only the table is used below; the prefix-tree model itself is not needed.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)
    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPtnPath) if f.endswith(".json")]
    result = []
    for t, filename in enumerate(jobFiles):
        partAns = copy.deepcopy(properties)
        jobArgs = (t, filename, inputPath, inputPtnPath, table, st, partAns,
                   domainRange, confidence, vsmData)
        result.append(pool.apply_async(mapper, jobArgs))
    pool.close()
    pool.join()

    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if keyname not in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            merged = expResult[keyname]
            for m in r[keyname]:
                if m == "produced":
                    continue
                for field in ("tp", "fp", "fn"):
                    merged[m][field] += r[keyname][m][field]

    for keyname in expResult:
        # "%d" means keyname must be numeric.
        keydir = os.path.join(outputPath, "vsm-%d" % (keyname))
        projizz.checkPath(keydir)
        projizz.jsonWrite(
            expResult[keyname], os.path.join(keydir, outputFilename))
        print("start write out to %s" % (keydir))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def buildModels(inputpath, outputPath):
    """Run ``trainModel`` once per YAGO relation on a process pool.

    Relations come from ``projizz.getYagoRelation()``; the relation
    ``"produced"`` is skipped.  The pool is capped at 8 workers.  Jobs are
    numbered consecutively from 0 in submission order.

    A worker exception does not abort the run: it is reported on stdout with
    the relation it belongs to, and the remaining jobs are still collected.

    Args:
        inputpath: forwarded unchanged to ``trainModel``.
        outputPath: forwarded to ``trainModel``; handed to
            ``projizz.checkPath`` first.
    """
    projizz.checkPath(outputPath)
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), 8))

    pending = []
    t = 0
    for relation in projizz.getYagoRelation():
        if relation == "produced":
            continue
        job = pool.apply_async(trainModel, (t, relation, inputpath, outputPath))
        pending.append((relation, job))
        t += 1
    pool.close()
    pool.join()

    # apply_async parks a worker's exception inside its AsyncResult; unless
    # get() is called the failure disappears without a trace.
    for relation, job in pending:
        try:
            job.get()
        except Exception as err:
            print("Training failed for relation %s: %r" % (relation, err))

    print("Done training all classifiers")
def buildModels(inputpath, outputPath):
    """Run ``trainModel`` once per YAGO relation on a process pool.

    Relations come from ``projizz.getYagoRelation()``; the relation
    ``"produced"`` is skipped.  The pool is capped at 8 workers.  Jobs are
    numbered consecutively from 0 in submission order.

    A worker exception does not abort the run: it is reported on stdout with
    the relation it belongs to, and the remaining jobs are still collected.

    Args:
        inputpath: forwarded unchanged to ``trainModel``.
        outputPath: forwarded to ``trainModel``; handed to
            ``projizz.checkPath`` first.
    """
    projizz.checkPath(outputPath)
    workerCount = min(multiprocessing.cpu_count(), 8)
    pool = multiprocessing.Pool(processes=workerCount)

    pending = []
    t = 0
    for relation in projizz.getYagoRelation():
        if relation == "produced":
            continue
        job = pool.apply_async(trainModel, (t, relation, inputpath, outputPath))
        pending.append((relation, job))
        t += 1
    pool.close()
    pool.join()

    # apply_async parks a worker's exception inside its AsyncResult; unless
    # get() is called the failure disappears without a trace.
    for relation, job in pending:
        try:
            job.get()
        except Exception as err:
            print("Training failed for relation %s: %r" % (relation, err))

    print("Done training all classifiers")
def preprocess(inputPath, topN, outputPath):
    """Build document-frequency, idf and per-document tf-idf files.

    Every ``*.json`` entry of ``inputPath`` is dispatched to ``mapper`` on a
    process pool capped at 20 workers.  Each mapper result is unpacked as
    ``(fn, tks)``, where ``tks`` is used as a ``term -> tf`` mapping.

    Files written through ``projizz.jsonWrite`` into ``outputPath``:
        * ``documentFreq.df`` -- term -> number of results containing it
        * ``idf.idf``         -- term -> log10(result count / document freq)
        * one file named ``fn`` per result -- term -> tf * idf

    Args:
        inputPath: directory whose ``*.json`` entries become mapper jobs.
        topN: forwarded unchanged to ``mapper``.
        outputPath: output directory; handed to ``projizz.checkPath`` first.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()

    # Processes pool
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), 20))
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPath) if f.endswith(".json")]
    result = [
        pool.apply_async(
            mapper, (t, filename, inputPath, topN, outputPath, model, table))
        for t, filename in enumerate(jobFiles)]
    pool.close()
    pool.join()

    # Reducer - DF
    words = {}
    tfs = {}
    types = 0  # number of mapper results, used as the document count N
    for r in result:
        fn, tks = r.get()
        tfs[fn] = tks
        types += 1
        for term in tks:
            words[term] = words.get(term, 0) + 1
    print("Doc# %d words# %d" % (types, len(words)))
    projizz.jsonWrite(words, os.path.join(outputPath, "documentFreq.df"))

    # Calculate idf.  Every counted term has df >= 1, so no zero guard is
    # needed.  log10 is exact on powers of ten where log(x, 10) is not.
    idf = {}
    for w in words:
        idf[w] = math.log10(float(types) / float(words[w]))
    projizz.jsonWrite(idf, os.path.join(outputPath, "idf.idf"))
    print("Write out idf file")

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for term in tks:
            if term in idf:
                weight[term] = tks[term] * idf[term]
        projizz.jsonWrite(weight, os.path.join(outputPath, fn))
        print("build %s tf-idf weight" % (fn,))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def preprocess(inputPath, inputPtnPath, outputPath, confidence):
    """Merge per-relation article lists from all mapper jobs and write them out.

    Every ``*.json`` entry of ``inputPtnPath`` is dispatched to ``mapper`` on
    a process pool capped at 20 workers.  Each mapper result is unpacked as
    four ``relation -> list`` mappings.  They are concatenated per relation
    across all jobs and written to ``outputPath`` as, in this order:

        * ``RELATION.json``  -- first mapping:  [line, line, line, ...]
        * ``RELATION.other`` -- second mapping
        * ``RELATION.pos``   -- third mapping
        * ``RELATION.neg``   -- fourth mapping

    Each relation name is printed just before its file is written.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory whose ``*.json`` entries become mapper jobs.
        outputPath: output directory; handed to ``projizz.checkPath`` first.
        confidence: forwarded unchanged to ``mapper``.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()

    # Processes pool
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), 20))
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPtnPath) if f.endswith(".json")]
    result = [
        pool.apply_async(
            mapper,
            (t, filename, inputPath, inputPtnPath, model, table, confidence))
        for t, filename in enumerate(jobFiles)]
    pool.close()
    pool.join()

    # One accumulator per mapper output, paired with its file suffix.
    modelArticles = {}
    negAritcles = {}
    POSArticles = {}
    NEGArticles = {}
    buckets = (
        (modelArticles, "%s.json"),
        (negAritcles, "%s.other"),
        (POSArticles, "%s.pos"),
        (NEGArticles, "%s.neg"),
    )

    # Reducer
    for r in result:
        # Explicit 4-way unpack so a malformed mapper result fails loudly.
        sibr, osibr, p, n = r.get()
        for (target, _), part in zip(buckets, (sibr, osibr, p, n)):
            for rela in part:
                if rela not in target:
                    target[rela] = []
                target[rela] += part[rela]

    for target, pattern in buckets:
        for rela in target:
            print(rela)
            projizz.jsonWrite(
                target[rela], os.path.join(outputPath, pattern % (rela)))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    """Collect, per relation, the pattern-instance lines not used for testing.

    Step 1: every ``*.json`` entry of ``inputTestPath`` is dispatched to
    ``mapper`` on a process pool capped at 20 workers.  The key lists the
    jobs return are pooled into the set of keys to exclude (test-fold keys,
    because 5-fold CV is used).

    Step 2: every ``PTNID.json`` file in ``inputSPIpath`` whose pattern passes
    ``projizz.isPatternValidate`` is read as ``rela -> key -> [line, ...]``.
    Lines whose key is not excluded are appended to that relation's list.

    Step 3: each relation's list is written to ``outputVSMpath/RELATION.txt``
    through ``projizz.jsonWrite``.

    Args:
        inputSPIpath: directory of per-pattern instance files.
        inputTestPath: directory whose ``*.json`` entries become mapper jobs.
        outputVSMpath: output directory; handed to ``projizz.checkPath`` first.
        confidence: passed to ``projizz.isPatternValidate``.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)
    # Only the table is used below; the prefix-tree model itself is not needed.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), 20))

    # Collect not used keys
    # because using 5-fold CV
    testFiles = [f for f in os.listdir(inputTestPath) if f.endswith(".json")]
    result = [
        pool.apply_async(mapper, (t, filename, inputTestPath))
        for t, filename in enumerate(testFiles)]
    pool.close()
    pool.join()

    # A set: membership is tested once per key of every pattern file below.
    # Assumes the returned keys are hashable (they are compared to JSON keys).
    notUsedKeys = set()
    for r in result:
        notUsedKeys.update(r.get())

    ### Build Model
    # Pattern Selection
    # presumably relation -> [] for every YAGO relation; verify in projizz
    modelArticles = projizz.buildYagoProperties([])
    count = 0
    for filename in os.listdir(inputSPIpath):
        # endswith guarantees the [:-5] slice strips exactly ".json".
        if not filename.endswith(".json"):
            continue
        ptnId = filename[:-5]

        # ignore invalidate pattern
        if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
            continue

        count += 1
        print("%s %s" % (count, ptnId))
        ptnInstance = projizz.jsonRead(os.path.join(inputSPIpath, filename))
        for rela in ptnInstance:
            for key in ptnInstance[rela]:
                # ignore in testing data's key
                if key in notUsedKeys:
                    continue
                for line in ptnInstance[rela][key]:
                    modelArticles[rela].append(line)

        if count % 100 == 0:
            print("Read %s files" % (count,))

    for relation in modelArticles:
        print(relation)
        projizz.jsonWrite(
            modelArticles[relation],
            os.path.join(outputVSMpath, "%s.txt" % (relation)))
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    """Collect, per relation, the pattern-instance lines not used for testing.

    Step 1: every ``*.json`` entry of ``inputTestPath`` is dispatched to
    ``mapper`` on a process pool capped at 20 workers.  The key lists the
    jobs return are pooled into the set of keys to exclude (test-fold keys,
    because 5-fold CV is used).

    Step 2: every ``PTNID.json`` file in ``inputSPIpath`` whose pattern passes
    ``projizz.isPatternValidate`` is read as ``rela -> key -> [line, ...]``.
    Lines whose key is not excluded are appended to that relation's list.

    Step 3: each relation's list is written to ``outputVSMpath/RELATION.txt``
    through ``projizz.jsonWrite``.

    Args:
        inputSPIpath: directory of per-pattern instance files.
        inputTestPath: directory whose ``*.json`` entries become mapper jobs.
        outputVSMpath: output directory; handed to ``projizz.checkPath`` first.
        confidence: passed to ``projizz.isPatternValidate``.
    """
    # Checking output path
    projizz.checkPath(outputVSMpath)
    # Only the table is used below; the prefix-tree model itself is not needed.
    _, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool
    workerCount = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=workerCount)

    # Collect not used keys
    # because using 5-fold CV
    testFiles = [f for f in os.listdir(inputTestPath) if f.endswith(".json")]
    result = [
        pool.apply_async(mapper, (t, filename, inputTestPath))
        for t, filename in enumerate(testFiles)]
    pool.close()
    pool.join()

    # A set: membership is tested once per key of every pattern file below.
    # Assumes the returned keys are hashable (they are compared to JSON keys).
    notUsedKeys = set()
    for r in result:
        notUsedKeys.update(r.get())

    ### Build Model
    # Pattern Selection
    # presumably relation -> [] for every YAGO relation; verify in projizz
    modelArticles = projizz.buildYagoProperties([])
    count = 0
    for filename in os.listdir(inputSPIpath):
        # endswith guarantees the [:-5] slice strips exactly ".json".
        if not filename.endswith(".json"):
            continue
        ptnId = filename[:-5]

        # ignore invalidate pattern
        if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
            continue

        count += 1
        print("%s %s" % (count, ptnId))
        ptnInstance = projizz.jsonRead(os.path.join(inputSPIpath, filename))
        for rela in ptnInstance:
            for key in ptnInstance[rela]:
                # ignore in testing data's key
                if key in notUsedKeys:
                    continue
                for line in ptnInstance[rela][key]:
                    modelArticles[rela].append(line)

        if count % 100 == 0:
            print("Read %s files" % (count,))

    for relation in modelArticles:
        print(relation)
        projizz.jsonWrite(
            modelArticles[relation],
            os.path.join(outputVSMpath, "%s.txt" % (relation)))
def preprocess(inputPath, outputPath):
    """Build document-frequency, idf and per-document tf-idf files.

    Every ``*.json`` entry of ``inputPath`` is dispatched to ``mapper`` on a
    process pool capped at 20 workers.  Each mapper result is unpacked as
    ``(fn, tks)``, where ``tks`` is used as a ``term -> tf`` mapping.

    Files written through ``projizz.jsonWrite`` into ``outputPath``:
        * ``documentFreq.df`` -- term -> number of results containing it
        * ``idf.idf``         -- term -> log10(result count / document freq)
        * one file named ``fn`` per result -- term -> tf * idf

    Args:
        inputPath: directory whose ``*.json`` entries become mapper jobs.
        outputPath: output directory; handed to ``projizz.checkPath`` first
            and also forwarded to ``mapper``.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()

    # Processes pool
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), 20))
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPath) if f.endswith(".json")]
    result = [
        pool.apply_async(
            mapper, (t, filename, inputPath, outputPath, model, table))
        for t, filename in enumerate(jobFiles)]
    pool.close()
    pool.join()

    # Reducer - DF
    words = {}
    tfs = {}
    types = 0  # number of mapper results, used as the document count N
    for r in result:
        fn, tks = r.get()
        tfs[fn] = tks
        types += 1
        for term in tks:
            words[term] = words.get(term, 0) + 1
    print("Doc# %d words# %d" % (types, len(words)))
    projizz.jsonWrite(words, os.path.join(outputPath, "documentFreq.df"))

    # Calculate idf.  Every counted term has df >= 1, so no zero guard is
    # needed.  log10 is exact on powers of ten where log(x, 10) is not.
    idf = {}
    for w in words:
        idf[w] = math.log10(float(types) / float(words[w]))
    projizz.jsonWrite(idf, os.path.join(outputPath, "idf.idf"))
    print("Write out idf file")

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for term in tks:
            if term in idf:
                weight[term] = tks[term] * idf[term]
        projizz.jsonWrite(weight, os.path.join(outputPath, fn))
        print("build %s tf-idf weight" % (fn,))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())
def preprocess(inputPath, inputPtnPath, outputPath, confidence):
    """Merge per-relation article lists from all mapper jobs and write them out.

    Every ``*.json`` entry of ``inputPtnPath`` is dispatched to ``mapper`` on
    a process pool capped at 20 workers.  Each mapper result is unpacked as
    four ``relation -> list`` mappings.  They are concatenated per relation
    across all jobs and written to ``outputPath`` as, in this order:

        * ``RELATION.json``  -- first mapping:  [line, line, line, ...]
        * ``RELATION.other`` -- second mapping
        * ``RELATION.pos``   -- third mapping
        * ``RELATION.neg``   -- fourth mapping

    Each relation name is printed just before its file is written.

    Args:
        inputPath: forwarded unchanged to ``mapper``.
        inputPtnPath: directory whose ``*.json`` entries become mapper jobs.
        outputPath: output directory; handed to ``projizz.checkPath`` first.
        confidence: forwarded unchanged to ``mapper``.
    """
    # Checking output path
    projizz.checkPath(outputPath)
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    start_time = datetime.now()

    # Processes pool
    workerCount = min(multiprocessing.cpu_count(), 20)
    pool = multiprocessing.Pool(processes=workerCount)
    # endswith (not a substring test) so names such as "x.json.bak" are skipped.
    jobFiles = [f for f in os.listdir(inputPtnPath) if f.endswith(".json")]
    result = [
        pool.apply_async(
            mapper,
            (t, filename, inputPath, inputPtnPath, model, table, confidence))
        for t, filename in enumerate(jobFiles)]
    pool.close()
    pool.join()

    # One accumulator per mapper output, paired with its file suffix.
    modelArticles = {}
    negAritcles = {}
    POSArticles = {}
    NEGArticles = {}
    buckets = (
        (modelArticles, "%s.json"),
        (negAritcles, "%s.other"),
        (POSArticles, "%s.pos"),
        (NEGArticles, "%s.neg"),
    )

    # Reducer
    for r in result:
        # Explicit 4-way unpack so a malformed mapper result fails loudly.
        sibr, osibr, p, n = r.get()
        for (target, _), part in zip(buckets, (sibr, osibr, p, n)):
            for rela in part:
                if rela not in target:
                    target[rela] = []
                target[rela] += part[rela]

    for target, pattern in buckets:
        for rela in target:
            print(rela)
            projizz.jsonWrite(
                target[rela], os.path.join(outputPath, pattern % (rela)))

    # total_seconds() keeps the fractional part zero-padded and includes days;
    # the former "%d.%d" of (seconds, microseconds) rendered 2.005 s as "2.5000".
    diff = datetime.now() - start_time
    print("Spend %.6f seconds" % diff.total_seconds())