import os

import projizz
from nltk.stem.porter import PorterStemmer  # PorterStemmer assumed to come from NLTK


def mapper(jobid, filename, inputPath, topN, outputPath, model, table):
    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))
    stemmer = PorterStemmer()
    tks = {}
    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)
        for token in tokens:
            t = stemmer.stem(token)
            if t not in tks:
                tks[t] = 0
            tks[t] += 1
            total += 1
        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    for t in tks:
        # ignore words that appear only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue
        # ignore tokens that contain a digit
        if any(d in t for d in "0123456789"):
            needRemove.append(t)
            total -= tks[t]
            continue
    for rm in needRemove:
        tks.pop(rm)

    # Write raw term counts
    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    ### select top N words
    # sort by term count
    sortedTks = sorted(tks.items(), key=lambda x: x[1], reverse=True)
    tks = {}
    maxTF = sortedTks[0][1]

    # Calculate tf (count normalized by the largest count)
    top = 0
    for t, c in sortedTks:
        top += 1
        tks[t] = float(c) / float(maxTF)
        if top == topN:
            break

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)
    return (filename, tks)
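# Driver sketch (not part of the original source): fans the TF mapper above
# out over every .json article file with multiprocessing.Pool. The directory
# paths, topN value, worker count, and passing model/table as None are
# assumptions made purely for illustration.
import os
import multiprocessing


def run_tf_mapping(inputPath="./articles", outputPath="./tf",
                   topN=100, model=None, table=None, workers=4):
    pool = multiprocessing.Pool(processes=workers)
    results = []
    for jobid, filename in enumerate(sorted(os.listdir(inputPath))):
        if not filename.endswith(".json"):
            continue
        results.append(pool.apply_async(
            mapper, (jobid, filename, inputPath, topN, outputPath, model, table)))
    pool.close()
    pool.join()
    # each worker returns a (filename, tf-dict) pair
    return [r.get() for r in results]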
import os

import pymongo
import projizz


def mapper(jobid, filename, inputPath, inputPtnPath, model, table, confidence):
    # Read article
    contentJson = projizz.jsonRead(os.path.join(inputPath, filename))
    # Read ptn
    contentPtnJson = projizz.jsonRead(os.path.join(inputPtnPath, filename))
    print "Worker %d : Read %s into filter" % (jobid, filename)

    ### Connect to database
    connect = pymongo.Connection()
    db = connect.projizz
    collection = db.result.yago.answer
    queries = map(lambda x: x[:-4], contentPtnJson)
    itr = collection.find({"revid": {"$in": queries}})
    print "worker %d query=%d, result=%d" % (jobid, len(queries), itr.count())

    count = 0
    supportInstanceByFile = {}
    linesByRelations = {}
    linesNoRelaByRelations = {}
    POS = {}
    NEG = {}

    for ans in itr:
        count += 1
        key = "%s.txt" % (ans["revid"])
        relation = ans["observed"]
        ptnEx = contentPtnJson[key]
        article = projizz.articleSimpleSentenceFileter(contentJson[key])

        supportInstanceByFile[key] = {}
        linesByRela = {}
        linesByNoRela = {}
        pos = {}
        neg = {}

        for line in ptnEx:
            # line[0]: line number
            # line[1]: array of patterns
            lineText = article[line[0]]
            for ptn in line[1]:
                # ptn[0]: pattern ID
                # ptn[1]: start position in line
                # ptn[2]: end position in line
                ptnId = "%d" % (ptn[0])

                if not projizz.isPatternValidate(ptnId, table, confidence=confidence):
                    continue
                # give up patterns with degree > 5
                if len(table[ptnId]["relations"]) > 5:
                    continue

                for rela in table[ptnId]["relations"]:
                    # it's a support instance
                    if rela in relation:
                        # NOTE - remove pattern text.
                        if not rela in linesByRela:
                            linesByRela[rela] = {}
                        if not line[0] in linesByRela[rela]:
                            linesByRela[rela][line[0]] = []
                        if not ptnId in linesByRela[rela][line[0]]:
                            linesByRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in pos:
                            pos[rela] = []
                        if not lineText[0] == "^" and line[0] not in pos[rela]:
                            pos[rela].append(line[0])
                    else:
                        if not rela in linesByNoRela:
                            linesByNoRela[rela] = {}
                        if not line[0] in linesByNoRela[rela]:
                            linesByNoRela[rela][line[0]] = []
                        if not ptnId in linesByNoRela[rela][line[0]]:
                            linesByNoRela[rela][line[0]].append(ptnId)
                        # For binary classifier
                        if not rela in neg:
                            neg[rela] = []
                        if not lineText[0] == "^" and line[0] not in neg[rela]:
                            neg[rela].append(line[0])

        for rela in linesByRela:
            if not rela in linesByRelations:
                linesByRelations[rela] = []
            for lineN in linesByRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesByRelations[rela].append(l)

        for rela in linesByNoRela:
            if not rela in linesNoRelaByRelations:
                linesNoRelaByRelations[rela] = []
            for lineN in linesByNoRela[rela]:
                text = projizz.getTokens(article[lineN].lower())
                for ptnId in linesByNoRela[rela][lineN]:
                    ptntext = table[ptnId]["pattern"].split()
                    for ptntk in ptntext:
                        if ptntk in text:
                            text.remove(ptntk)
                l = ' '.join(text)
                linesNoRelaByRelations[rela].append(l)

        # For binary classifier
        for rela in pos:
            if not rela in POS:
                POS[rela] = []
            for lineN in pos[rela]:
                POS[rela].append({"text": article[lineN], "label": "pos"})
        for rela in neg:
            if not rela in NEG:
                NEG[rela] = []
            for lineN in neg[rela]:
                NEG[rela].append({"text": article[lineN], "label": "neg"})

        if count % 100 == 0:
            print "worker #%d done %d." % (jobid, count)

    return linesByRelations, linesNoRelaByRelations, POS, NEG
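# Reducer sketch (not in the original source): merges the four per-worker
# dictionaries returned by the mapper above into global training material,
# one entry list per relation name. partial_results is assumed to be a list
# of (linesByRelations, linesNoRelaByRelations, POS, NEG) tuples.
def reducer(partial_results):
    linesByRelations = {}
    linesNoRelaByRelations = {}
    POS = {}
    NEG = {}
    for lines, linesNo, pos, neg in partial_results:
        for target, part in ((linesByRelations, lines),
                             (linesNoRelaByRelations, linesNo),
                             (POS, pos),
                             (NEG, neg)):
            for rela in part:
                if rela not in target:
                    target[rela] = []
                target[rela].extend(part[rela])
    return linesByRelations, linesNoRelaByRelations, POS, NEG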
def tokenize(words):
    if isinstance(words, basestring):
        return projizz.getTokens(words)
    else:
        return (w for w in words)
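# Usage sketch for tokenize() above (example inputs are illustrative only):
# a raw string is delegated to projizz.getTokens, while an already-tokenized
# sequence is passed through unchanged as a generator.
print tokenize("Alan Turing was born in London")    # token list from projizz.getTokens
print list(tokenize(["alan", "turing", "london"]))  # ['alan', 'turing', 'london']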
def mapper(jobid, filename, inputPath, outputPath, model, table):
    # Read article
    article = projizz.jsonRead(os.path.join(inputPath, filename))
    stemmer = PorterStemmer()
    tks = {}
    print "Worker %d : Read %s into filter" % (jobid, filename)

    count = 0
    total = 0
    for line in article:
        count += 1
        tokens = projizz.getTokens(line)
        for token in tokens:
            t = stemmer.stem(token)
            if t not in tks:
                tks[t] = 0
            tks[t] += 1
            total += 1
        if count % 1000 == 0:
            print "worker %d done %d lines" % (jobid, count)

    # Remove stopwords
    for sw in projizz._stopwords:
        _sw = stemmer.stem(sw)
        if _sw in tks:
            total -= tks[_sw]
            tks.pop(_sw)

    needRemove = []
    maxTF = 0
    for t in tks:
        # ignore words that appear only once
        if tks[t] <= 1:
            needRemove.append(t)
            total -= tks[t]
            continue
        # ignore tokens that contain a digit
        if any(d in t for d in "0123456789"):
            needRemove.append(t)
            total -= tks[t]
            continue
        if tks[t] > maxTF:
            maxTF = tks[t]
    for rm in needRemove:
        tks.pop(rm)

    # Write raw term counts
    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tfc")))

    # Calculate tf (count normalized by the largest count)
    for t in tks:
        tc = tks[t]
        tks[t] = float(tc) / float(maxTF)

    projizz.jsonWrite(tks, os.path.join(outputPath, filename.replace(".json", ".tf")))
    print "worker %d write out." % (jobid)
    return (filename, tks)
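# Worked example of the tf normalization above, using illustrative counts:
# the largest count maps to 1.0 and every other count is scaled against it.
counts = {"london": 8, "born": 4, "mathematician": 2}
maxTF = max(counts.values())
tf = dict((t, float(c) / float(maxTF)) for t, c in counts.items())
# tf["london"] == 1.0, tf["born"] == 0.5, tf["mathematician"] == 0.25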