def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename, nbcPath):
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    # Processes pool (capped at 8 workers)
    cpuCount = multiprocessing.cpu_count()
    if cpuCount > 8:
        cpuCount = 8
    pool = multiprocessing.Pool(processes=cpuCount)

    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            #result.append(filterFunction(t,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,classifiers))
            result.append(
                pool.apply_async(
                    filterFunction, (t, filename, inputPtnPath, model, table,
                                     partAns, st, domainRange, inputPath,
                                     confidence, nbcPath)))
            t += 1
    pool.close()
    pool.join()

    # Reducer: merge per-file counts into one result per key
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath, keyname)):
            os.mkdir(os.path.join(outputPath, keyname))
        projizz.jsonWrite(p, os.path.join(outputPath, keyname, outputFilename))
        print "Start writing out to %s" % (os.path.join(outputPath, keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
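# A hypothetical invocation of the driver above. This is a sketch only:
# every path, the confidence threshold, and the output filename are
# placeholders, not values taken from this project.
if __name__ == "__main__":
    main("../data/test-ptn",     # inputPtnPath: per-article pattern-match files
         "../results",           # outputPath: one directory per result key
         "../data/ps.json",      # pspath: pattern statistics file
         "../data/test",         # inputPath: raw article json files
         0.5,                    # confidence: pattern confidence threshold
         "result.json",          # outputFilename
         "../models/nbc")        # nbcPath: classifier models ("nbc" presumably naive Bayes)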
def preprocess(inputPath, inputPtnPath, outputPath):
    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool (capped at 20 workers)
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, inputPtnPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    patternInstances = {}
    # Reducer
    for r in result:
        sibf = r.get()
        for key in sibf:
            for ptnId in sibf[key]:
                if not ptnId in patternInstances:
                    patternInstances[ptnId] = {}
                for rela in sibf[key][ptnId]:
                    for inst in sibf[key][ptnId][rela]:
                        if not rela in patternInstances[ptnId]:
                            patternInstances[ptnId][rela] = {}
                        if not key in patternInstances[ptnId][rela]:
                            patternInstances[ptnId][rela][key] = []
                        patternInstances[ptnId][rela][key].append(inst)

    # Write to files
    # NOTE - output format:
    #   ptnId.json (json)
    #       rela: keys
    #           key: line text
    for ptnId in patternInstances:
        projizz.jsonWrite(patternInstances[ptnId],
                          os.path.join(outputPath, "%s.json" % (ptnId)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPath, inputPtnPath, vsmPath, confidence, psfile, outputPath, outputFilename):
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": []})
    domainRange = projizz.getYagoRelationDomainRange()
    idf, docs, lens = projizz.getVSMmodels(vsmPath)
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(psfile))
    vsmData = (idf, docs, lens)

    projizz.checkPath(outputPath)

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, inputPtnPath, table, st,
                             partAns, domainRange, confidence, vsmData)))
            #result.append(mapper(t, filename, inputPath, inputPtnPath, table, partAns, domainRange, confidence, vsmData))
            t += 1
    pool.close()
    pool.join()

    # Reducer: merge per-file tp/fp/fn counts per key
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]

    for keyname in expResult:
        p = expResult[keyname]
        keydirName = "vsm-%d" % (keyname)
        projizz.checkPath(os.path.join(outputPath, keydirName))
        projizz.jsonWrite(p, os.path.join(outputPath, keydirName, outputFilename))
        print "Start writing out to %s" % (os.path.join(outputPath, keydirName))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def main(inputPtnPath, outputPath, pspath, inputPath, confidence, outputFilename):
    #model, table = projizz.readPrefixTreeModelWithTable("./yagoPatternTree.model","./yagoPatternTree.table")
    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")
    properties = projizz.buildYagoProperties({"tp": [], "fp": [], "fn": [], "et1": [], "et2": [], "et3": []})
    st = projizz.getSortedPatternStatistic(projizz.jsonRead(pspath))
    domainRange = projizz.getYagoRelationDomainRange()

    start_time = datetime.now()

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            partAns = copy.deepcopy(properties)
            result.append(
                pool.apply_async(
                    filterFunction, (t, filename, inputPtnPath, model, table,
                                     partAns, st, domainRange, inputPath, confidence)))
            t += 1
    pool.close()
    pool.join()

    # Reducer: merge per-file counts into one result per key
    expResult = {}
    for res in result:
        r = res.get()
        for keyname in r:
            if not keyname in expResult:
                expResult[keyname] = copy.deepcopy(properties)
            for m in r[keyname]:
                if m == "produced":
                    continue
                expResult[keyname][m]["tp"] += r[keyname][m]["tp"]
                expResult[keyname][m]["fp"] += r[keyname][m]["fp"]
                expResult[keyname][m]["fn"] += r[keyname][m]["fn"]
                expResult[keyname][m]["et1"] += r[keyname][m]["et1"]
                expResult[keyname][m]["et2"] += r[keyname][m]["et2"]
                expResult[keyname][m]["et3"] += r[keyname][m]["et3"]

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for keyname in expResult:
        p = expResult[keyname]
        if not os.path.isdir(os.path.join(outputPath, keyname)):
            os.mkdir(os.path.join(outputPath, keyname))
        projizz.jsonWrite(p, os.path.join(outputPath, keyname, outputFilename))
        print "Start writing out to %s" % (os.path.join(outputPath, keyname))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def parseYagoData():
    phase = "used"
    if phase == "build":
        model, table = projizz.readPrefixTreeModelWithTable(
            "../yago/yagoPatternTree.model", "../yago/yagoPatternTree.table")
    else:
        model, table = projizz.readPrefixTreeModelWithTable(
            "../yago/yagoPatternTree.model", "./yagoPatternTreeWithConfidence.table")

    # function testing.
    #test = "has appeared like [[num]]"
    ##test = "has appeared like [[num]"
    #i = projizz.naiveMatchPattern(test,model)
    #print i

    a = table.keys()
    originL = len(a)
    ptnByRelation = {}

    for relation in projizz.getYagoRelation():
        if not phase == "build":
            break
        f = open("./yagoRela/%s.txt" % (relation))
        print relation
        text = f.readline()
        ptnSynsetTxt = text.split("\",\" ")[1:]
        ptnSynsetTxt = ptnSynsetTxt[:-1] + [ptnSynsetTxt[-1][:-7]]
        ptnByRelation[relation] = []

        evC = 0
        for text in ptnSynsetTxt:
            ptns = text.split("#")
            # ptns[1] : pattern synset id in patty
            # ptns[3] : pattern domain
            # ptns[4] : pattern plain text
            # ptns[5] : pattern range
            # ptns[6] : confidence
            # ptns[7] : support co-occurrence
            # ptns[8] : only present in some entries; presumably the eval result.
            if len(ptns) > 8:
                evC += 1

            patterns = ptns[4].split(";%")
            patterns = patterns[:-1] + [patterns[-1][:-1]]
            for pattern in patterns:
                pid = projizz.naiveMatchPattern(pattern, model)
                if pid < 0:
                    pass
                    #print relation,pattern
                else:
                    pid = str(pid)
                    if pid in a:
                        a.remove(pid)
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
                    if not relation in table[pid]["relations"]:
                        table[pid]["relations"].append(relation)
                        #print relation,pid,pattern
                    ptnS = table[pid]
                    if not "confidence" in ptnS:
                        table[pid]["confidence"] = float(ptns[6])
                        table[pid]["support"] = int(ptns[7])
                        table[pid]["used"] = True
                        if len(ptns) > 8:
                            if ptns[8] == "false":
                                table[pid]["eval"] = False
                                #print pid,table[pid]["relations"],pattern,ptns[8]
                            else:
                                table[pid]["eval"] = True
        f.close()

    if phase == "build":
        for pid in a:
            table[pid]["used"] = False
        for pid in table:
            if table[pid]["used"]:
                needRemove = []
                for relation in table[pid]["relations"]:
                    if not pid in ptnByRelation[relation]:
                        print pid, table[pid]["pattern"], relation
                        needRemove.append(relation)
                for p in needRemove:
                    table[pid]["relations"].remove(p)
                if len(table[pid]["relations"]) == 0:
                    print pid, table[pid]["pattern"], "!!!"
            else:
                pass
        projizz.jsonWrite(table, "./yagoPatternTreeWithConfidence.table")
    else:
        c = 0
        used = 0
        for pid in table:
            if table[pid]["used"]:
                # If an eval flag (true/false) exists, keep only patterns marked True
                if "eval" in table[pid]:
                    if not table[pid]["eval"]:
                        continue
                used += 1
                for relation in table[pid]["relations"]:
                    if not relation in ptnByRelation:
                        ptnByRelation[relation] = []
                    if not pid in ptnByRelation[relation]:
                        ptnByRelation[relation].append(pid)
            else:
                c += 1

        # Some quick statistics
        #for relation in ptnByRelation:
        #    print relation,len(ptnByRelation[relation])

        # Find the highest usable confidence value (it cannot go any higher):
        # the minimum over all relations of each relation's best confidence
        minC = 1.0
        minCR = ""
        for relation in ptnByRelation:
            c75 = 0
            c50 = 0
            ptns = []
            for pid in ptnByRelation[relation]:
                ptns.append(table[pid])
                ptns[-1]["pid"] = pid
            ptns.sort(key=lambda x: x["confidence"], reverse=True)
            if ptns[0]["confidence"] < minC:
                minC = ptns[0]["confidence"]
                minCR = relation
                #print relation,ptns[0]
            f = open("./yagoSortedRela/%s.txt" % (relation), "w")
            for ptn in ptns:
                if ptn["confidence"] > .75:
                    c75 += 1
                if ptn["confidence"] > .5:
                    c50 += 1
                f.write("%s\t%s\t%.3f\t%d\t%s\n" % (ptn["pid"], ptn["pattern"],
                                                    ptn["confidence"], ptn["support"],
                                                    ptn["relations"]))
            f.close()
            print relation, len(ptns), c75, c50
        print minCR, minC, "pattern used:", used
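# For reference, a toy illustration of unpacking one "#"-delimited PATTY
# synset record, shaped after the field comments in parseYagoData above.
# The sample string is invented for illustration only.
record = "#42##person#was born in;%[[det]] birthplace of;#place#0.83#120#true"
ptns = record.split("#")
patterns = ptns[4].split(";%")
patterns = patterns[:-1] + [patterns[-1][:-1]]  # strip the trailing ";"
print patterns, float(ptns[6]), int(ptns[7]), ptns[8]
# ['was born in', '[[det]] birthplace of'] 0.83 120 true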
def mapper(jobid, filename, inputPath, topN, outputPath, model, table):
    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))
    stemmer = PorterStemmer()

    tks = {}
    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)
        for token in tokens:
            t = stemmer.stem(token)
            if t not in tks:
                tks[t] = 0
            tks[t] += 1
            total += 1
        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 1
    for t in tks:
        # ignore words that appear only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue
        # ignore tokens that contain a digit
        if any(ch in "0123456789" for ch in t):
            needRemove.append(t)
            total -= tks[t]
            continue
        #if tks[t] > maxTF:
        #    maxTF = tks[t]
    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    ### Select top N words
    # sort by term count
    sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
    tks = {}
    maxTF = sortedTks[0][1]

    # Calculate tf (normalized by the maximum term count)
    top = 0
    for t, c in sortedTks:
        top += 1
        tks[t] = float(c) / float(maxTF)
        if top == topN:
            break

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)
    return (filename, tks)
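# A quick, self-contained illustration of the max-normalized TF and the
# top-N cut performed at the end of mapper; the token counts are invented.
tks = {"citi": 8, "mayor": 4, "elect": 2}
sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
maxTF = sortedTks[0][1]                     # highest raw count (8)
topN = 2
top = dict((t, float(c) / float(maxTF)) for t, c in sortedTks[:topN])
print top                                   # {'citi': 1.0, 'mayor': 0.5}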
def preprocess(inputPath, topN, outputPath):
    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool (capped at 20 workers)
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, topN, outputPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - document frequency (DF)
    types = 0
    for r in result:
        fn, tks = r.get()
        tfs[fn] = tks
        types += 1
        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1
    print "Doc#", types, "words#", len(words)
    projizz.jsonWrite(words, os.path.join(outputPath, "documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue
        idf[w] = math.log(float(types) / float(words[w]), 10)
    projizz.jsonWrite(idf, os.path.join(outputPath, "idf.idf"))
    print "Write out idf file"

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue
            weight[t] = tf * idf[t]
        projizz.jsonWrite(weight, os.path.join(outputPath, fn))
        print "build", fn, "tf-idf weight"

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
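# The idf above is the standard base-10 formulation,
# idf(w) = log10(N / df(w)). A toy check with invented document frequencies:
import math

types = 1000                          # N: number of documents
words = {"citi": 10, "the": 1000}     # document frequency per term
idf = dict((w, math.log(float(types) / float(df), 10)) for w, df in words.items())
print idf                             # {'citi': 2.0, 'the': 0.0}
# weight = tf * idf, so a term appearing in every document contributes nothing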
def main(inputPath, inputPtnPath, outputPath, outputPtnPath):
    debug = False

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(outputPtnPath):
        os.mkdir(outputPtnPath)

    result = []
    count = 0

    # Update answer
    cpuN = multiprocessing.cpu_count()
    print "CoreNumber = %d" % (cpuN)
    pool = multiprocessing.Pool(processes=12)
    t = 0
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            t += 1
            if debug:
                result.append(updateAnswer(t, inputPath, filename))
            else:
                result.append(pool.apply_async(updateAnswer, (t, inputPath, filename)))
    pool.close()
    pool.join()

    # Rebuild articles and patterns, batching 1000 keys per output file
    tmpArticle = {}
    tmpPtn = {}
    dataSize = 0
    for res in result:
        if debug:
            filename, articles = res
        else:
            filename, articles = res.get()
        print filename, len(articles)

        a = projizz.jsonRead(os.path.join(inputPath, filename))
        p = projizz.jsonRead(os.path.join(inputPtnPath, filename))

        for key in articles:
            dataSize += 1
            tmpArticle[key] = a[key]
            tmpPtn[key] = p[key]
            if len(tmpPtn) == 1000:
                print "write to %05d.json" % (count)
                projizz.jsonWrite(tmpArticle, os.path.join(outputPath, "%05d.json" % (count)))
                projizz.jsonWrite(tmpPtn, os.path.join(outputPtnPath, "%05d.json" % (count)))
                tmpArticle = {}
                tmpPtn = {}
                count += 1

    if len(tmpPtn) > 0:
        print "write to %05d.json" % (count)
        projizz.jsonWrite(tmpArticle, os.path.join(outputPath, "%05d.json" % (count)))
        projizz.jsonWrite(tmpPtn, os.path.join(outputPtnPath, "%05d.json" % (count)))
        tmpArticle = {}
        tmpPtn = {}
        count += 1

    # Split into 5 folds
    splitTo5part("/tmp2/r01922024", "y-all", "/tmp2/r01922024", "y")
    splitTo5part("/tmp2/r01922024", "y-ptn-all", "/tmp2/r01922024", "y-ptn")

    print "write %d files. (%d)" % (count, dataSize)
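# splitTo5part is not defined in this section. Below is only a plausible
# sketch, assuming it deals the batched %05d.json files round-robin into
# five fold directories for 5-fold cross-validation; the real
# implementation may differ.
import shutil

def splitTo5part(inBase, inDir, outBase, outPrefix):
    srcDir = os.path.join(inBase, inDir)
    files = sorted(f for f in os.listdir(srcDir) if f.endswith(".json"))
    for i, fn in enumerate(files):
        # hypothetical fold directory naming: <outPrefix>-0 .. <outPrefix>-4
        foldDir = os.path.join(outBase, "%s-%d" % (outPrefix, i % 5))
        if not os.path.isdir(foldDir):
            os.mkdir(foldDir)
        shutil.copy(os.path.join(srcDir, fn), foldDir)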
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence):
    # Checking output path
    projizz.checkPath(outputVSMpath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    # Processes pool (capped at 20 workers)
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    # Collect the keys excluded from training,
    # because of 5-fold cross-validation
    t = 0
    result = []
    for filename in os.listdir(inputTestPath):
        if ".json" in filename:
            result.append(pool.apply_async(mapper, (t, filename, inputTestPath)))
            t += 1
    pool.close()
    pool.join()

    notUsedKeys = []
    for r in result:
        ks = r.get()
        notUsedKeys += ks

    ### Build model
    # Pattern selection
    modelArticles = projizz.buildYagoProperties([])
    words = []
    count = 0
    for filename in os.listdir(inputSPIpath):
        if ".json" in filename:
            ptnId = filename[:-5]

            # skip patterns that do not pass the confidence validation
            if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                continue

            count += 1
            print count, ptnId

            ptnInstance = projizz.jsonRead(os.path.join(inputSPIpath, filename))
            for rela in ptnInstance:
                for key in ptnInstance[rela]:
                    # ignore keys that belong to the testing data
                    if key in notUsedKeys:
                        continue
                    for line in ptnInstance[rela][key]:
                        modelArticles[rela].append(line)

            if count % 100 == 0:
                print "Read", count, "files"

    for relation in modelArticles:
        print relation
        projizz.jsonWrite(modelArticles[relation],
                          os.path.join(outputVSMpath, "%s.txt" % (relation)))
def preprocess(inputPath, outputPath):
    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool (capped at 20 workers)
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    t = 0
    result = []
    for filename in os.listdir(inputPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, outputPath, model, table)))
            t += 1
    pool.close()
    pool.join()

    words = {}
    idf = {}
    tfs = {}

    # Reducer - document frequency (DF)
    types = 0
    for r in result:
        fn, tks = r.get()
        tfs[fn] = tks
        types += 1
        for t in tks:
            if t not in words:
                words[t] = 0
            words[t] += 1
    print "Doc#", types, "words#", len(words)
    projizz.jsonWrite(words, os.path.join(outputPath, "documentFreq.df"))

    # Calculate idf
    for w in words:
        if words[w] == 0:
            continue
        idf[w] = math.log(float(types) / float(words[w]), 10)
    projizz.jsonWrite(idf, os.path.join(outputPath, "idf.idf"))
    print "Write out idf file"

    # Calculate tf-idf weight
    for fn in tfs:
        tks = tfs[fn]
        weight = {}
        for t in tks:
            tf = tks[t]
            if t not in idf:
                continue
            weight[t] = tf * idf[t]
        projizz.jsonWrite(weight, os.path.join(outputPath, fn))
        print "build", fn, "tf-idf weight"

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)
def mapper(jobid, filename, inputPath, outputPath, model, table):
    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))
    stemmer = PorterStemmer()

    tks = {}
    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)
        for token in tokens:
            t = stemmer.stem(token)
            if t not in tks:
                tks[t] = 0
            tks[t] += 1
            total += 1
        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 0
    for t in tks:
        # ignore words that appear only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue
        # ignore tokens that contain a digit
        if any(ch in "0123456789" for ch in t):
            needRemove.append(t)
            total -= tks[t]
            continue
        if tks[t] > maxTF:
            maxTF = tks[t]
    for rm in needRemove:
        tks.pop(rm)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    # Calculate tf (normalized by the maximum term count)
    for t in tks:
        tc = tks[t]
        tks[t] = float(tc) / float(maxTF)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)
    return (filename, tks)
def preprocess(inputPath, inputPtnPath, outputPath, confidence):
    # Checking output path
    projizz.checkPath(outputPath)

    model, table = projizz.readPrefixTreeModelWithTable(
        "../yago/yagoPatternTree.model",
        "../patty/yagoPatternTreeWithConfidence.table")

    start_time = datetime.now()

    # Processes pool (capped at 20 workers)
    processorNumber = multiprocessing.cpu_count()
    if processorNumber > 20:
        processorNumber = 20
    pool = multiprocessing.Pool(processes=processorNumber)

    t = 0
    result = []
    for filename in os.listdir(inputPtnPath):
        if ".json" in filename:
            result.append(
                pool.apply_async(
                    mapper, (t, filename, inputPath, inputPtnPath, model, table, confidence)))
            #result.append(mapper(t, filename, inputPath, inputPtnPath, model, table, confidence))
            t += 1
    pool.close()
    pool.join()

    modelArticles = {}
    negArticles = {}
    POSArticles = {}
    NEGArticles = {}

    # Reducer
    for r in result:
        sibr, osibr, p, n = r.get()
        for rela in sibr:
            if not rela in modelArticles:
                modelArticles[rela] = []
            modelArticles[rela] += sibr[rela]
        for rela in osibr:
            if not rela in negArticles:
                negArticles[rela] = []
            negArticles[rela] += osibr[rela]
        for rela in p:
            if not rela in POSArticles:
                POSArticles[rela] = []
            POSArticles[rela] += p[rela]
        for rela in n:
            if not rela in NEGArticles:
                NEGArticles[rela] = []
            NEGArticles[rela] += n[rela]

    #
    # relation.json: [line, line, line, ....]
    #
    for rela in modelArticles:
        print rela
        projizz.jsonWrite(modelArticles[rela], os.path.join(outputPath, "%s.json" % (rela)))
    for rela in negArticles:
        print rela
        projizz.jsonWrite(negArticles[rela], os.path.join(outputPath, "%s.other" % (rela)))
    for rela in POSArticles:
        print rela
        projizz.jsonWrite(POSArticles[rela], os.path.join(outputPath, "%s.pos" % (rela)))
    for rela in NEGArticles:
        print rela
        projizz.jsonWrite(NEGArticles[rela], os.path.join(outputPath, "%s.neg" % (rela)))

    diff = datetime.now() - start_time
    print "Spend %d.%d seconds" % (diff.seconds, diff.microseconds)