def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    """Worker: VSM-based relation extraction over one article file.

    Each validated pattern match is scored with cosine similarity against the
    per-relation vector-space model; a relation is extracted when its cosine
    value exceeds the experiment threshold. Evaluated at thresholds
    0.00 .. 0.50 in steps of 0.05.

    jobid       -- worker index, used only in log messages
    filename    -- article/pattern JSON file name
    inputPath / inputPtnPath -- directories holding article and pattern JSON
    table       -- pattern table: id -> {"pattern", "relations", ...}
    st          -- per-pattern stats: id -> [(relation, {"support","total"}), ...]
    partAns     -- answer-sheet template, deep-copied per threshold
    domainRange -- relation -> {"domain": type, ...}
    confidence  -- pattern confidence threshold
    vsmData     -- vector-space model data passed to projizz.vsmSimilarity

    Returns expResult: {threshold(int, 0..50): {attribute: {"tp","fn","fp",...}}}.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)  # strip ".txt" -> revid
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    expResult = {}
    relaEx = {}
    # set thresholds
    for th in range(0, 51, 5):
        expResult[th] = copy.deepcopy(partAns)
        relaEx[th] = []
    print "worker %d build expResult" % (jobid)

    # NOTE(review): relaEx[th] is initialized once and never reset inside the
    # answer loop, so relations extracted from earlier articles carry over to
    # later ones. The filterFunction variant resets per article — confirm
    # whether this accumulation is intended.
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory there should be more of these than observed
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # TODO
        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue
                rfp = table[ptnId]["relations"]
                # check degree
                if len(rfp) > 5:
                    continue
                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue
                # TODO - Modlify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity(lineText, vsmData, relas=rfp, ptntext=ptntks)
                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname) / 100.0
                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        #### Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx[keyname]:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath): # read articles and patterns contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) classifiers = projizz.getNBClassifiers(nbcPath) print "Worker %d : Read %s" % (jobid, filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 expResult = partAns relaEx = [] print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) # Relation extraction for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] if lineText[ 0] == "^": # It's a wikipeida reference comments, ignore it! 
continue for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) ptntks = table[ptnId]["pattern"] if not projizz.isPatternValidate( ptnId, table, confidence=confidence, st=st): continue rfp = table[ptnId]["relations"] # check degree if len(rfp) > 5: continue # if no support, ignore this pattern if st[ptnId][0][1]["support"] <= 0: continue for ptnst in st[ptnId]: # ptnst[0] = relation # ptnst[1] = {"support":,"total": } if domainRange[ptnst[0]] not in types: continue if classifiers[ptnst[0]] == None: continue if classifiers[ptnst[0]].classify(lineText) == "pos": if not ptnst[0] in relaEx: relaEx.append(ptnst[0]) #### Evaluation for attribute in expResult: # special case, ignore. if attribute == "produced": continue postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: expResult[attribute]["tp"].append(ans["revid"]) else: expResult[attribute]["fn"].append(ans["revid"]) else: if postive: expResult[attribute]["fp"].append(ans["revid"]) else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return expResult
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence,nbcPath):
    """Worker: run the multi-configuration relation-extraction experiment on
    one article file, gating every candidate relation through a per-relation
    Naive Bayes classifier, and tally tp/fn/fp plus false-positive error
    types (et1/et2/et3).

    Experiment keys are "<degree>-<ambigu>-<typ>" where
      degree -- max pattern ambiguity accepted (1..5)
      ambigu -- "one" (first candidate), "50"/"75" (support-ratio threshold),
                "all" (every candidate); degree 1 uses the fixed key "1"
      typ    -- "t" (use entity-type info) or "n" (ignore it)

    model is unused here; kept for interface parity with callers.
    Returns expResult: {keyname: {attribute: {"tp","fn","fp","et1","et2","et3"}}}.
    """
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath,filename),"r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s into filter" % (jobid,filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)  # strip ".txt" -> revid
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0
    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1,6):
        for typ in ["n","t"]:
            if not deg == 1:
                for amb in ["one","50","75","all"]:
                    keyname = "%d-%s-%s" % (deg,amb,typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                # degree-1 patterns are unambiguous; only one selection mode
                keyname = "%d-1-%s" % (deg,typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory there should be more of these than observed
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns  (which patterns raised each relation)

            def recordPtnMakeRela(ptnId,rela,record):
                # remember that pattern `ptnId` raised relation `rela`
                if not rela in record:
                    record[rela] = []
                if not ptnId in record[rela]:
                    record[rela].append(ptnId)

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    # validate the pattern
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    # get all possible relation of this pattern
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    #
                    # Decide to choice relation
                    #
                    if len(rfp) == 1: # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                # with type info: relation domain must match entity types
                                if domainRange[rfp[0]]["domain"] in types:
                                    pr = rfp[0]
                                    if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                        relaEx.append(rfp[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                            else:
                                pr = rfp[0]
                                if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                    else:
                        if ambigu == "one":
                            # select only the first acceptable candidate
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx and not classifiers[rfp[0]] == None and classifiers[rfp[0]].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                        elif ambigu == "all":
                            # take every candidate relation of the pattern
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                else:
                                    if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                        relaEx.append(ptnst[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                        else:
                            # ambigu "50"/"75": keep relations whose support ratio
                            # against the top-supported relation reaches th
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"])/float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        else:
                                            if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        # False Positive
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                        # TODO - analyze the cause of the error
                        if attribute in ptnExRela:
                            if attribute in originRela:
                                # type 2 error: relation is a real property,
                                # just not observed in this article
                                expResult[keyname][attribute]["et2"].append(ans["revid"])
                            else:
                                found = False
                                ptns = ptnExRela[attribute] # get the patterns raise the Relation
                                for pid in ptns:
                                    for psbR in table[pid]["relations"]:
                                        if psbR == attribute:
                                            continue
                                        # here means that the pattern can raise a `correct' relation in answer, may it choice or not
                                        if domainRange[psbR]["domain"] in types and psbR in relation:
                                            found = True
                                            break
                                if found:
                                    # type 1 error: the pattern could have
                                    # produced a correct relation instead
                                    expResult[keyname][attribute]["et1"].append(ans["revid"])
                                else:
                                    # type 3 error: no correct alternative exists
                                    expResult[keyname][attribute]["et3"].append(ans["revid"])
                        else:
                            # what case is this? should be basically impossible
                            pass
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)
    return expResult
def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    """Worker: collect per-relation training text from one article file.

    For every validated pattern match (degree <= 5), the matched sentence is a
    positive example of a candidate relation when that relation is in the
    article's observed answer set, a negative example otherwise.

    model is unused here; kept for interface parity with callers.

    Returns a 4-tuple:
      linesByRelations       -- rela -> [lowercased sentence with pattern tokens removed, ...]
      linesNoRelaByRelations -- same, for relations NOT observed in the article
      POS -- rela -> [{"text": raw sentence, "label": "pos"}, ...]  (binary classifier data)
      NEG -- rela -> [{"text": raw sentence, "label": "neg"}, ...]
    """
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)  # strip ".txt" -> revid
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    # NOTE(review): supportInstanceByFile is filled with empty dicts below but
    # never populated or returned — looks like leftover from another variant.
    supportInstanceByFile = {}
    linesByRelations = {}
    linesNoRelaByRelations = {}
    POS = {}
    NEG = {}
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])
        supportInstanceByFile[key] = {}
        # per-article accumulators, merged into the global dicts below
        linesByRela = {}
        linesByNoRela = {}
        pos = {}
        neg = {}
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue
                # give up degree > 5 's pattern
                if len(table[ptnId]["relations"]) > 5:
                    continue
                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)
                        # For binary classifier ("^" lines are wiki reference comments)
                        if not rela in pos:
                            pos[rela] = []
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])
                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        # Merge: tokenize each supporting sentence, strip the pattern's own
        # tokens, and append the remaining text to the global collection.
        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)
        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier: keep the raw (unstripped) sentence text
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})
        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return linesByRelations, linesNoRelaByRelations, POS, NEG
def mapper(jobid, filename, inputPath, inputPtnPath, model, table): # Read article contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) # Read ptn contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) print "Worker %d : Read %s into filter" % (jobid, filename) ### Connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 supportInstanceByFile = {} for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) relation = ans["observed"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) supportInstanceByFile[key] = {} for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) if not projizz.isPatternValidate(ptnId, table): continue for rela in table[ptnId]["relations"]: # it's a support instance if rela in relation: if not ptnId in supportInstanceByFile[key]: supportInstanceByFile[key][ptnId] = {} if not rela in supportInstanceByFile[key][ptnId]: supportInstanceByFile[key][ptnId][rela] = [] if not line[0] in supportInstanceByFile[key][ptnId][ rela]: supportInstanceByFile[key][ptnId][rela].append( line[0]) for ptnId in supportInstanceByFile[key]: for rela in supportInstanceByFile[key][ptnId]: lines = supportInstanceByFile[key][ptnId][rela] supportInstanceByFile[key][ptnId][rela] = [] for lineN in lines: supportInstanceByFile[key][ptnId][rela].append( article[lineN]) if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return supportInstanceByFile
def generate(inputSPIpath, inputTestPath, outputVSMpath, confidence): # Checking output path projizz.checkPath(outputVSMpath) model, table = projizz.readPrefixTreeModelWithTable( "../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table") # Processes pool proceessorNumber = multiprocessing.cpu_count() if proceessorNumber > 20: proceessorNumber = 20 pool = multiprocessing.Pool(processes=proceessorNumber) # Collect not used keys # because using 5-fold CV t = 0 result = [] for filename in os.listdir(inputTestPath): if ".json" in filename: result.append( pool.apply_async(mapper, (t, filename, inputTestPath))) t += 1 pool.close() pool.join() notUsedKeys = [] for r in result: ks = r.get() notUsedKeys += ks ### Build Model # Paatern Selection modelArticles = projizz.buildYagoProperties([]) words = [] count = 0 for filename in os.listdir(inputSPIpath): if ".json" in filename: ptnId = filename[:-5] # ignore invalidate pattern if not projizz.isPatternValidate( ptnId, table, confidence=confidence): continue count += 1 print count, ptnId ptnInstance = projizz.jsonRead(os.path.join( inputSPIpath, filename)) for rela in ptnInstance: for key in ptnInstance[rela]: # ignore in testing data's key if key in notUsedKeys: continue for line in ptnInstance[rela][key]: modelArticles[rela].append(line) if count % 100 == 0: print "Read", count, "files" for relation in modelArticles: print relation projizz.jsonWrite(modelArticles[relation], os.path.join(outputVSMpath, "%s.txt" % (relation)))
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence, nbcPath):
    """Worker: run the multi-configuration relation-extraction experiment on
    one article file, gating every candidate relation through a per-relation
    Naive Bayes classifier, and tally tp/fn/fp plus false-positive error
    types (et1/et2/et3).

    Experiment keys are "<degree>-<ambigu>-<typ>" where
      degree -- max pattern ambiguity accepted (1..5)
      ambigu -- "one" (first candidate), "50"/"75" (support-ratio threshold),
                "all" (every candidate); degree 1 uses the fixed key "1"
      typ    -- "t" (use entity-type info) or "n" (ignore it)

    model is unused here; kept for interface parity with callers.
    Returns expResult: {keyname: {attribute: {"tp","fn","fp","et1","et2","et3"}}}.
    """
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    classifiers = projizz.getNBClassifiers(nbcPath)
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)  # strip ".txt" -> revid
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                # degree-1 patterns are unambiguous; only one selection mode
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory there should be more of these than observed
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            ptnExRela = {}  # rela: ptns  (which patterns raised each relation)

            def recordPtnMakeRela(ptnId, rela, record):
                # remember that pattern `ptnId` raised relation `rela`
                if not rela in record:
                    record[rela] = []
                if not ptnId in record[rela]:
                    record[rela].append(ptnId)

            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    # validate the pattern
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    # get all possible relation of this pattern
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    #
                    # Decide to choice relation
                    #
                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                # with type info: relation domain must match entity types
                                if domainRange[rfp[0]]["domain"] in types:
                                    pr = rfp[0]
                                    if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                        relaEx.append(rfp[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                            else:
                                pr = rfp[0]
                                if not classifiers[pr] == None and classifiers[pr].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                    else:
                        if ambigu == "one":
                            # select only the first acceptable candidate
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx and not classifiers[rfp[0]] == None and classifiers[rfp[0]].classify(lineText) == "pos":
                                    relaEx.append(rfp[0])
                                    # FIXME For error checking
                                    recordPtnMakeRela(ptnId, rfp[0], ptnExRela)
                        elif ambigu == "all":
                            # take every candidate relation of the pattern
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                            relaEx.append(ptnst[0])
                                            # FIXME For error checking
                                            recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                else:
                                    if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                        relaEx.append(ptnst[0])
                                        # FIXME For error checking
                                        recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                        else:
                            # ambigu "50"/"75": keep relations whose support ratio
                            # against the top-supported relation reaches th
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)
                                        else:
                                            if not ptnst[0] in relaEx and not classifiers[ptnst[0]] == None and classifiers[ptnst[0]].classify(lineText) == "pos":
                                                relaEx.append(ptnst[0])
                                                # FIXME For error checking
                                                recordPtnMakeRela(ptnId, ptnst[0], ptnExRela)

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        # False Positive
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                        # TODO - analyze the cause of the error
                        if attribute in ptnExRela:
                            if attribute in originRela:
                                # type 2 error: relation is a real property,
                                # just not observed in this article
                                expResult[keyname][attribute]["et2"].append(ans["revid"])
                            else:
                                found = False
                                ptns = ptnExRela[attribute]  # get the patterns raise the Relation
                                for pid in ptns:
                                    for psbR in table[pid]["relations"]:
                                        if psbR == attribute:
                                            continue
                                        # here means that the pattern can raise a `correct' relation in answer, may it choice or not
                                        if domainRange[psbR]["domain"] in types and psbR in relation:
                                            found = True
                                            break
                                if found:
                                    # type 1 error: the pattern could have
                                    # produced a correct relation instead
                                    expResult[keyname][attribute]["et1"].append(ans["revid"])
                                else:
                                    # type 3 error: no correct alternative exists
                                    expResult[keyname][attribute]["et3"].append(ans["revid"])
                        else:
                            # what case is this? should be basically impossible
                            pass
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)
    return expResult
def filterFunction(jobid,filename,inputPtnPath,model,table,partAns,st,domainRange,inputPath,confidence):
    """Worker: run the multi-configuration relation-extraction experiment on
    one article file (pattern statistics only, no classifier gating) and
    tally tp/fn/fp against the observed answers.

    Experiment keys are "<degree>-<ambigu>-<typ>" where
      degree -- max pattern ambiguity accepted (1..5)
      ambigu -- "one" (first candidate), "50"/"75" (support-ratio threshold),
                "all" (every candidate); degree 1 uses the fixed key "1"
      typ    -- "t" (use entity-type info) or "n" (ignore it)

    model is unused here; kept for interface parity with callers.
    Returns expResult: {keyname: {attribute: {"tp","fn","fp", ...}}}.
    """
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath,filename),"r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    print "Worker %d : Read %s into filter" % (jobid,filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)  # strip ".txt" -> revid
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0
    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold:.5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1,6):
        for typ in ["n","t"]:
            if not deg == 1:
                for amb in ["one","50","75","all"]:
                    keyname = "%d-%s-%s" % (deg,amb,typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                # degree-1 patterns are unambiguous; only one selection mode
                keyname = "%d-1-%s" % (deg,typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory there should be more of these than observed
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":  # It's a wikipeida reference comments, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    if len(rfp) == 1: # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                # with type info: relation domain must match entity types
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])
                    else:
                        if ambigu == "one":
                            # select only the first acceptable candidate
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                        elif ambigu == "all":
                            # take every candidate relation of the pattern
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            # ambigu "50"/"75": keep relations whose support ratio
                            # against the top-supported relation reaches th
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"])/float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)
    return expResult
def generate(inputSPIpath,inputTestPath,outputVSMpath,confidence): # Checking output path projizz.checkPath(outputVSMpath) model, table = projizz.readPrefixTreeModelWithTable("../yago/yagoPatternTree.model", "../patty/yagoPatternTreeWithConfidence.table") # Processes pool proceessorNumber = multiprocessing.cpu_count() if proceessorNumber > 20: proceessorNumber = 20 pool = multiprocessing.Pool(processes=proceessorNumber) # Collect not used keys # because using 5-fold CV t = 0 result = [] for filename in os.listdir(inputTestPath): if ".json" in filename: result.append( pool.apply_async( mapper, (t,filename,inputTestPath) ) ) t += 1 pool.close() pool.join() notUsedKeys = [] for r in result: ks = r.get() notUsedKeys += ks ### Build Model # Paatern Selection modelArticles = projizz.buildYagoProperties([]) words = [] count = 0 for filename in os.listdir(inputSPIpath): if ".json" in filename: ptnId = filename[:-5] # ignore invalidate pattern if not projizz.isPatternValidate(ptnId, table, confidence=confidence): continue count += 1 print count,ptnId ptnInstance = projizz.jsonRead( os.path.join(inputSPIpath,filename) ) for rela in ptnInstance: for key in ptnInstance[rela]: # ignore in testing data's key if key in notUsedKeys: continue for line in ptnInstance[rela][key]: modelArticles[rela].append(line) if count%100 == 0: print "Read",count,"files" for relation in modelArticles: print relation projizz.jsonWrite(modelArticles[relation],os.path.join(outputVSMpath,"%s.txt" % (relation)))
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, vsmData):
    """Worker: VSM-based relation extraction over one article file.

    Each validated pattern match is scored with cosine similarity against the
    per-relation vector-space model; a relation is extracted when its cosine
    value exceeds the experiment threshold. Evaluated at thresholds
    0.00 .. 0.50 in steps of 0.05.

    Returns expResult: {threshold(int, 0..50): {attribute: {"tp","fn","fp",...}}}.
    """
    # read articles and patterns
    contentJson = projizz.jsonRead(os.path.join(inputPath,filename))
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath,filename))
    print "Worker %d : Read %s" % (jobid,filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)  # strip ".txt" -> revid
    itr = collection.find({"revid":{"$in":queries}})
    print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count())

    count = 0
    expResult = {}
    relaEx = {}
    # set thresholds
    for th in range(0,51,5):
        expResult[th] = copy.deepcopy(partAns)
        relaEx[th] = []
    print "worker %d build expResult" % (jobid)

    # NOTE(review): relaEx[th] is initialized once and never reset inside the
    # answer loop, so relations extracted from earlier articles carry over to
    # later ones. The filterFunction variant resets per article — confirm
    # whether this accumulation is intended.
    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # origin properties; in theory there should be more of these than observed
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        # TODO
        # Relation extraction
        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])
                ptntks = table[ptnId]["pattern"]
                lineText = article[line[0]]
                if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                    continue
                rfp = table[ptnId]["relations"]
                # check degree
                if len(rfp) > 5:
                    continue
                # if no support, ignore this pattern
                if st[ptnId][0][1]["support"] <= 0:
                    continue
                # TODO - Modlify string, remove pattern text in string?
                cosRlt = projizz.vsmSimilarity( lineText, vsmData, relas=rfp, ptntext=ptntks )
                # NOTE - if cosine value > threshold then there is a relation (?)
                for keyname in expResult:
                    threshold = float(keyname)/100.0
                    for pr in cosRlt:
                        # Check type
                        if domainRange[pr]["domain"] in types:
                            if cosRlt[pr] > threshold:
                                if pr not in relaEx[keyname]:
                                    relaEx[keyname].append(pr)

        #### Evaluation
        for keyname in expResult:
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx[keyname]:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass
        if count % 100 == 0:
            print "worker #%d done %d." % (jobid,count)
    return expResult
def filterFunction(jobid, filename, inputPtnPath, model, table, partAns, st, domainRange, inputPath, confidence):
    """Run the multi-configuration relation-extraction experiment on one file.

    One experiment key per combination of:
      - degree: 1..5 (max number of relations a pattern may map to)
      - ambiguity policy: "one" (first match), "50"/"75" (support-ratio
        threshold), "all" (every relation), or fixed "1" when degree == 1
      - type filtering: "t" (require relation domain in entity types) or "n"
    Returns {keyname: deep copy of partAns with tp/fn/fp revid lists filled}.
    """
    # read patterns in articles
    contentPtnJson = json.load(open(os.path.join(inputPtnPath, filename), "r"))
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    # connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    # keys look like "<revid>.txt"; strip the extension to get revids
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())
    count = 0

    # prepare keys for multiple-exp
    # degree: 1 ~ 5
    # ambigu: select 1, select n (threshold: .5, .75), select all
    # type or not: no type info, type info
    expResult = {}
    for deg in range(1, 6):
        for typ in ["n", "t"]:
            if not deg == 1:
                for amb in ["one", "50", "75", "all"]:
                    keyname = "%d-%s-%s" % (deg, amb, typ)
                    expResult[keyname] = copy.deepcopy(partAns)
            else:
                # degree 1 patterns are unambiguous; single policy "1"
                keyname = "%d-1-%s" % (deg, typ)
                expResult[keyname] = copy.deepcopy(partAns)
    print "worker %d build expResult" % (jobid)

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])  # get revid
        #targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part
        types = ans["type"]
        # Now only consider properties, no references.
        relation = ans["observed"]
        # original properties; in theory contains more than `observed`
        originRela = ans["properties"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        for keyname in expResult:
            # decode the experiment configuration from the key
            args = keyname.split("-")
            degree = int(args[0])
            ambigu = args[1]
            typ = args[2]

            # Relation extraction (reset per article and configuration)
            relaEx = []
            for line in ptnEx:
                # line[0]: line number
                # line[1]: array of patterns
                lineText = article[line[0]]
                if lineText[0] == "^":
                    # It's a wikipedia reference comment, ignore it!
                    continue
                for ptn in line[1]:
                    # ptn[0]: pattern ID
                    # ptn[1]: start position in line
                    # ptn[2]: end position in line
                    ptnId = "%d" % (ptn[0])
                    if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st):
                        continue
                    rfp = table[ptnId]["relations"]
                    # check degree
                    if len(rfp) > degree:
                        continue
                    if len(rfp) == 1:  # or degree == 1
                        if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                            if typ == "t":
                                if domainRange[rfp[0]]["domain"] in types:
                                    relaEx.append(rfp[0])
                            else:
                                relaEx.append(rfp[0])
                    else:
                        if ambigu == "one":
                            if typ == "t":
                                for ptnst in st[ptnId]:
                                    # ptnst[0] = relation
                                    # ptnst[1] = {"support": , "total": }
                                    if ptnst[1]["support"] > 0 and domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                        # stop at the first qualifying relation
                                        # ("select one"); NOTE(review): break
                                        # placement inferred from the policy
                                        # name — confirm against original layout
                                        break
                            else:
                                if st[ptnId][0][1]["support"] > 0 and not rfp[0] in relaEx:
                                    relaEx.append(rfp[0])
                        elif ambigu == "all":
                            for ptnst in st[ptnId]:
                                if typ == "t":
                                    if domainRange[ptnst[0]]["domain"] in types:
                                        if not ptnst[0] in relaEx:
                                            relaEx.append(ptnst[0])
                                else:
                                    if not ptnst[0] in relaEx:
                                        relaEx.append(ptnst[0])
                        else:
                            # "50" / "75": keep relations whose support is at
                            # least th of the top relation's support
                            th = 0.75
                            if ambigu == "50":
                                th = 0.5
                            b = st[ptnId][0][1]["support"]
                            if b > 0:
                                for ptnst in st[ptnId]:
                                    if float(ptnst[1]["support"]) / float(b) >= th:
                                        if typ == "t":
                                            if domainRange[ptnst[0]]["domain"] in types and not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])
                                        else:
                                            if not ptnst[0] in relaEx:
                                                relaEx.append(ptnst[0])

            # Evaluation
            for attribute in expResult[keyname]:
                # special case, ignore.
                if attribute == "produced":
                    continue
                postive = False
                true = False
                if attribute in relaEx:
                    postive = True
                if attribute in relation:
                    true = True
                if true:
                    if postive:
                        expResult[keyname][attribute]["tp"].append(ans["revid"])
                    else:
                        expResult[keyname][attribute]["fn"].append(ans["revid"])
                else:
                    if postive:
                        expResult[keyname][attribute]["fp"].append(ans["revid"])
                    else:
                        # ignore true-negative
                        pass

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return expResult
def mapper(jobid,filename,inputPath,inputPtnPath,model,table,confidence): # Read article contentJson = projizz.jsonRead( os.path.join(inputPath,filename) ) # Read ptn contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) ) print "Worker %d : Read %s into filter" % (jobid,filename) ### Connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid":{"$in":queries}}) print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()) count = 0 supportInstanceByFile = {} linesByRelations = {} linesNoRelaByRelations = {} POS = {} NEG = {} for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) relation = ans["observed"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) supportInstanceByFile[key] = {} linesByRela = {} linesByNoRela = {} pos = {} neg = {} for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) if not projizz.isPatternValidate(ptnId, table, confidence=confidence): continue # give up degree > 5 's pattern if len(table[ptnId]["relations"]) > 5: continue for rela in table[ptnId]["relations"]: # it's a support instance if rela in relation: # NOTE - remove pattern text. 
if not rela in linesByRela: linesByRela[rela] = {} if not line[0] in linesByRela[rela]: linesByRela[rela][line[0]] = [] if not ptnId in linesByRela[rela][line[0]]: linesByRela[rela][line[0]].append(ptnId) # For binary classifier if not rela in pos: pos[rela] = [] if not lineText[0] == "^" and line[0] not in pos[rela]: pos[rela].append(line[0]) else: if not rela in linesByNoRela: linesByNoRela[rela] = {} if not line[0] in linesByNoRela[rela]: linesByNoRela[rela][line[0]] = [] if not ptnId in linesByNoRela[rela][line[0]]: linesByNoRela[rela][line[0]].append(ptnId) # For binary classifier if not rela in neg: neg[rela] = [] if not lineText[0] == "^" and line[0] not in neg[rela]: neg[rela].append(line[0]) for rela in linesByRela: if not rela in linesByRelations: linesByRelations[rela] = [] for lineN in linesByRela[rela]: text = projizz.getTokens( article[lineN].lower() ) for ptnId in linesByRela[rela][lineN]: ptntext = table[ptnId]["pattern"].split() for ptntk in ptntext: if ptntk in text: text.remove(ptntk) l = ' '.join(text) linesByRelations[rela].append(l) for rela in linesByNoRela: if not rela in linesNoRelaByRelations: linesNoRelaByRelations[rela] = [] for lineN in linesByNoRela[rela]: text = projizz.getTokens( article[lineN].lower() ) for ptnId in linesByNoRela[rela][lineN]: ptntext = table[ptnId]["pattern"].split() for ptntk in ptntext: if ptntk in text: text.remove(ptntk) l = ' '.join(text) linesNoRelaByRelations[rela].append(l) # For binary classifier for rela in pos: if not rela in POS: POS[rela] = [] for lineN in pos[rela]: POS[rela].append( {"text":article[lineN],"label":"pos"} ) for rela in neg: if not rela in NEG: NEG[rela] = [] for lineN in neg[rela]: NEG[rela].append( {"text":article[lineN],"label":"neg"} ) if count % 100 == 0: print "worker #%d done %d." % (jobid,count) return linesByRelations,linesNoRelaByRelations,POS,NEG
def mapper(jobid,filename,inputPath,inputPtnPath,model,table): # Read article contentJson = projizz.jsonRead( os.path.join(inputPath,filename) ) # Read ptn contentPtnJson = projizz.jsonRead( os.path.join(inputPtnPath,filename) ) print "Worker %d : Read %s into filter" % (jobid,filename) ### Connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid":{"$in":queries}}) print "worker %d query=%d, result=%d" % (jobid,len(queries),itr.count()) count = 0 supportInstanceByFile = {} for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) relation = ans["observed"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) supportInstanceByFile[key] = {} for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) if not projizz.isPatternValidate(ptnId, table): continue for rela in table[ptnId]["relations"]: # it's a support instance if rela in relation: if not ptnId in supportInstanceByFile[key]: supportInstanceByFile[key][ptnId] = {} if not rela in supportInstanceByFile[key][ptnId]: supportInstanceByFile[key][ptnId][rela] = [] if not line[0] in supportInstanceByFile[key][ptnId][rela]: supportInstanceByFile[key][ptnId][rela].append(line[0]) for ptnId in supportInstanceByFile[key]: for rela in supportInstanceByFile[key][ptnId]: lines = supportInstanceByFile[key][ptnId][rela] supportInstanceByFile[key][ptnId][rela] = [] for lineN in lines: supportInstanceByFile[key][ptnId][rela].append(article[lineN]) if count % 100 == 0: print "worker #%d done %d." % (jobid,count) return supportInstanceByFile
def mapper(jobid, filename, inputPath, inputPtnPath, table, st, partAns, domainRange, confidence, nbcPath): # read articles and patterns contentJson = projizz.jsonRead(os.path.join(inputPath, filename)) contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename)) classifiers = projizz.getNBClassifiers(nbcPath) print "Worker %d : Read %s" % (jobid, filename) # connect to database connect = pymongo.Connection() db = connect.projizz collection = db.result.yago.answer queries = map(lambda x: x[:-4], contentPtnJson) itr = collection.find({"revid": {"$in": queries}}) print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count()) count = 0 expResult = partAns relaEx = [] print "worker %d build expResult" % (jobid) for ans in itr: count += 1 key = "%s.txt" % (ans["revid"]) # get revid # targetName = ans["_id"].replace("(","").replace(")","").split("_") # get entity name's part types = ans["type"] # Now only consider properties, no references. relation = ans["observed"] # origin properties, 理論上應該會比 observed 還要多 originRela = ans["properties"] ptnEx = contentPtnJson[key] article = projizz.articleSimpleSentenceFileter(contentJson[key]) # Relation extraction for line in ptnEx: # line[0]: line number # line[1]: array of patterns lineText = article[line[0]] if lineText[0] == "^": # It's a wikipeida reference comments, ignore it! 
continue for ptn in line[1]: # ptn[0]: pattern ID # ptn[1]: start position in line # ptn[2]: end position in line ptnId = "%d" % (ptn[0]) ptntks = table[ptnId]["pattern"] if not projizz.isPatternValidate(ptnId, table, confidence=confidence, st=st): continue rfp = table[ptnId]["relations"] # check degree if len(rfp) > 5: continue # if no support, ignore this pattern if st[ptnId][0][1]["support"] <= 0: continue for ptnst in st[ptnId]: # ptnst[0] = relation # ptnst[1] = {"support":,"total": } if domainRange[ptnst[0]] not in types: continue if classifiers[ptnst[0]] == None: continue if classifiers[ptnst[0]].classify(lineText) == "pos": if not ptnst[0] in relaEx: relaEx.append(ptnst[0]) #### Evaluation for attribute in expResult: # special case, ignore. if attribute == "produced": continue postive = False true = False if attribute in relaEx: postive = True if attribute in relation: true = True if true: if postive: expResult[attribute]["tp"].append(ans["revid"]) else: expResult[attribute]["fn"].append(ans["revid"]) else: if postive: expResult[attribute]["fp"].append(ans["revid"]) else: # ignore true-negative pass if count % 100 == 0: print "worker #%d done %d." % (jobid, count) return expResult