class BoeLmWeighterC(BoeLmC):
    '''
    Score an object against a document knowledge graph (DocKg).

    The score mixes three signals linearly:
        tf       - the object's entry in DocKg.vNodeWeight
        idf      - self.CtfCenter.GetLogIdf(ObjId)
        text sim - log cosine between the object's description and the
                   document text loaded from 'doctextdir'
    inference() uses the mix stored in self.lInferenceWeight.
    '''

    def __init__(self, ConfIn=""):
        '''Set defaults and, when ConfIn is not empty, load that conf.'''
        self.Init()
        if "" != ConfIn:
            self.SetConf(ConfIn)

    def Init(self):
        '''Reset every attribute to its default value.'''
        BoeLmC.Init(self)
        self.DocTextDir = ""
        self.ObjCenter = FbObjCacheCenterC()
        self.CtfCenter = TermCtfC()
        # [tf weight, idf weight, text similarity weight] used by inference()
        self.lInferenceWeight = [1, 0, 0]
        # DocNo -> document text, filled by LoadDocText()
        self.hDocText = {}

    def SetConf(self, ConfIn):
        '''
        Read 'doctextdir' and 'objctf' from ConfIn, load the document
        texts and the ctf table, and forward ConfIn to the object cache.
        '''
        conf = cxConfC(ConfIn)
        self.DocTextDir = conf.GetConf('doctextdir')
        self.LoadDocText()
        self.ObjCenter.SetConf(ConfIn)
        CtfInName = conf.GetConf('objctf')
        self.CtfCenter.Load(CtfInName)

    @staticmethod
    def ShowConf():
        '''Print the conf keys this class and its object cache read.'''
        # This used to be a @classmethod without a cls parameter, so any
        # call raised TypeError. It needs no class state, hence staticmethod.
        print('doctextdir\nobjctf')
        FbObjCacheCenterC.ShowConf()

    def LoadDocText(self):
        '''
        Fill self.hDocText from every file under self.DocTextDir.

        Each line must be "DocNo<TAB>text". A line that does not split
        into exactly two fields after stripping raises ValueError.
        '''
        for fname in WalkDir(self.DocTextDir):
            # "with" closes each file even when a malformed line raises.
            with open(fname) as DocFile:
                for line in DocFile:
                    DocNo, text = line.strip().split('\t')
                    self.hDocText[DocNo] = text
        logging.info('doc text loaded')

    @staticmethod
    def _ObjIdsInNodeOrder(DocKg):
        '''Return the keys of DocKg.hNodeId sorted by their mapped value.'''
        lItem = sorted(DocKg.hNodeId.items(), key=lambda item: item[1])
        return [ObjId for ObjId, _ in lItem]

    def GetAllIdf(self, DocKg):
        '''Return the log idf of every object of DocKg, in node order.'''
        return [self.CtfCenter.GetLogIdf(ObjId)
                for ObjId in self._ObjIdsInNodeOrder(DocKg)]

    def GetAllTf(self, DocKg):
        '''Return a list copy of DocKg.vNodeWeight.'''
        return list(DocKg.vNodeWeight)

    def GetAllTextCosine(self, DocKg):
        '''
        Return, in node order, the cosine between each object's
        description and the document text.

        When no text is known for DocKg.DocNo the result is a list of
        len(DocKg) zeros.
        '''
        DocText = self.hDocText.get(DocKg.DocNo, "")
        if "" == DocText:
            return [0] * len(DocKg)

        DocLm = LmBaseC(DocText)
        lCos = []
        for ObjId in self._ObjIdsInNodeOrder(DocKg):
            desp = self.ObjCenter.FetchObjDesp(ObjId)
            lCos.append(LmBaseC.Cosine(LmBaseC(desp), DocLm))
        return lCos

    def GetTextCosine(self, ObjId, DocKg):
        '''
        Return the log of the cosine between ObjId's description and the
        document text, or self.MinLogProb when that cosine is 0.
        '''
        DocText = self.hDocText.get(DocKg.DocNo, "")
        if "" == DocText:
            # No document text: GetAllTextCosine treats this case as
            # cosine 0, which maps to MinLogProb here. Returning early
            # also avoids a description fetch whose result cannot matter.
            return self.MinLogProb

        DocLm = LmBaseC(DocText)
        desp = self.ObjCenter.FetchObjDesp(ObjId)
        score = LmBaseC.Cosine(LmBaseC(desp), DocLm)
        if 0 == score:
            return self.MinLogProb
        return math.log(score)

    def LinearWeightTfIdfTextSim(self, ObjId, DocKg, TfScore=1, IdfScore=0,
                                 TextSimScore=0):
        '''
        Return the linear mix of tf, idf and text similarity for ObjId.

        ObjId: object to score.
        DocKg: document graph; must support "in", len(), hNodeId,
            vNodeWeight and DocNo.
        TfScore, IdfScore, TextSimScore: mixing weights. A weight of 0
            skips the computation of that signal. The weights must not
            sum to zero (see the note on normalisation below).

        Returns self.MinLogProb when ObjId is not in DocKg.
        '''
        if ObjId not in DocKg:
            return self.MinLogProb

        lTf = np.zeros(len(DocKg))
        lIdf = np.zeros(len(DocKg))
        if TfScore != 0:
            lTf = np.array(self.GetAllTf(DocKg))
        if IdfScore != 0:
            lIdf = np.array(self.GetAllIdf(DocKg))

        TextSim = 0
        if TextSimScore != 0:
            TextSim = self.GetTextCosine(ObjId, DocKg)

        W = np.array([TfScore, IdfScore, TextSimScore])
        # NOTE(review): weights summing to zero divide by zero here and
        # the returned score becomes NaN - confirm how callers want that
        # case handled before changing it.
        W = W / float(sum(W))
        lScore = lTf * W[0] + lIdf * W[1]

        # ObjId is known to be in DocKg (guard clause above), so the old
        # fallback value and the second membership test were dead code.
        p = DocKg.hNodeId[ObjId]
        res = lScore[p]
        # NOTE(review): tf and idf use the normalised weights W[0] and
        # W[1], but the text similarity uses the raw TextSimScore rather
        # than W[2]. This looks unintended; kept as is so that existing
        # scores do not change - confirm.
        res = res + TextSim * TextSimScore
        return res

    def inference(self, ObjId, DocKg):
        '''Score ObjId in DocKg with the weights in self.lInferenceWeight.'''
        return self.LinearWeightTfIdfTextSim(ObjId, DocKg,
                                             self.lInferenceWeight[0],
                                             self.lInferenceWeight[1],
                                             self.lInferenceWeight[2])
class EntityCorrelationFromTextSimC(cxBaseC):
    '''
    For each object, find other objects whose text is close to its
    description.

    Candidates come from querying the search index with the object's
    name; each candidate is scored with the negated KL value between the
    term vectors of the object and the candidate.
    '''

    def Init(self):
        '''Reset every attribute to its default value.'''
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        # how many neighbors Process() writes per object
        self.NeighborNum = 50

    def SetConf(self, ConfIn):
        '''Load ConfIn into this object, the searcher and the object cache.'''
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        # The value is used as a slice bound in Process(), so it has to be
        # an int; int() leaves the int default untouched and converts a
        # textual conf value.
        self.NeighborNum = int(
            self.conf.GetConf('neighbornum', self.NeighborNum))

    @staticmethod
    def ShowConf():
        '''Print the conf keys read by this class and its components.'''
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print('neighbornum')

    def ProcessOneObj(self, ObjId, name):
        '''
        Return [[neighbor id, -KL], ...] for ObjId, best neighbor first.

        All scored neighbors are returned; cutting the list down to
        self.NeighborNum is done by Process().
        Returns [] when the cleaned name is empty or when the object's
        description has no terms.
        '''
        query = TextBaseC.RawClean(name)
        if "" == query:
            return []

        # Check the description before searching: without terms nothing
        # can be scored, so the index query would be wasted work.
        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        if len(ThisLm.hTermTF) == 0:
            return []
        ThisVec = VectorC(ThisLm.hTermTF)

        lObjDoc = self.Searcher.RunQuery(query)
        lObjNeighbor = []
        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                # the object itself is not its own neighbor
                continue
            if not Id.startswith('/m/'):
                print("[%s %s] neighbor id [%s] format error"
                      % (ObjId, name, Id))
                continue
            # NOTE(review): the document object itself is handed to
            # LmBaseC; a disabled line in the earlier code used
            # ObjDoc.GetContent() instead - confirm LmBaseC accepts a
            # document object.
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            score = VectorC.KL(ThisVec, NeighborVec)
            # negate so that sorting in descending order puts the
            # smallest KL value first
            lObjNeighbor.append([Id, -score])

        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print("[%s:%s] neighbor id score get" % (ObjId, name))
        return lObjNeighbor

    def Process(self, ObjInName, OutName):
        '''
        Read "ObjId<TAB>name" lines from ObjInName and write the top
        self.NeighborNum neighbors of each object to OutName.

        Output columns, tab separated: object id, neighbor id, score,
        object name, neighbor name. Input lines with fewer than two
        columns are skipped.
        '''
        # The input is opened first so that a missing input file does not
        # truncate an existing output file; "with" closes both files even
        # when processing raises.
        with open(ObjInName) as ObjIn, open(OutName, 'w') as out:
            for line in ObjIn:
                vCol = line.strip().split('\t')
                if len(vCol) < 2:
                    continue
                ObjId, name = vCol[0], vCol[1]
                lObjNeighbor = self.ProcessOneObj(ObjId, name)
                for NeighborId, score in lObjNeighbor[:self.NeighborNum]:
                    out.write('%s\t%s\t%f\t%s\t%s\n' % (
                        ObjId, NeighborId, score, name,
                        self.ObjCenter.FetchObjName(NeighborId)))
                print("[%s:%s] done" % (ObjId, name))
        print("finished")