# Example 1
# 0
class BoeLmWeighterC(BoeLmC):
    """BoeLmC variant that scores an object against a document graph.

    The score returned by inference() is a weighted sum of up to three
    signals for the object: its node weight in the document graph ("tf"),
    its log idf from the ctf center, and the log cosine between the object
    description and the document text. The weights come from
    self.lInferenceWeight as [tf, idf, text similarity].
    """

    def __init__(self, ConfIn=""):
        """Set default attributes; load the configuration if ConfIn is non-empty."""
        self.Init()
        if "" != ConfIn:
            self.SetConf(ConfIn)

    def Init(self):
        """Reset every attribute to its default value."""
        BoeLmC.Init(self)
        self.DocTextDir = ""
        self.ObjCenter = FbObjCacheCenterC()
        self.CtfCenter = TermCtfC()
        # [tf weight, idf weight, text-similarity weight] used by inference()
        self.lInferenceWeight = [1, 0, 0]
        # DocNo -> document text, filled by LoadDocText()
        self.hDocText = {}

    def SetConf(self, ConfIn):
        """Read 'doctextdir' and 'objctf' from ConfIn and load their data.

        ConfIn is also forwarded to the object cache center.
        """
        conf = cxConfC(ConfIn)

        self.DocTextDir = conf.GetConf('doctextdir')
        self.LoadDocText()

        self.ObjCenter.SetConf(ConfIn)

        CtfInName = conf.GetConf('objctf')
        self.CtfCenter.Load(CtfInName)

    @classmethod
    def ShowConf(cls):
        """Print the configuration keys read by this class and its object center."""
        # The original definition had no `cls` parameter, so every call
        # raised TypeError.
        print('doctextdir\nobjctf')
        FbObjCacheCenterC.ShowConf()

    def LoadDocText(self):
        """Load 'DocNo<TAB>text' lines from every file under self.DocTextDir.

        Lines without a tab separator (including blank lines) are skipped
        with a warning; tabs after the first one are kept as part of the
        text.
        """
        for fname in WalkDir(self.DocTextDir):
            # `with` guarantees the handle is closed even if a read fails.
            with open(fname) as DocIn:
                for line in DocIn:
                    vCol = line.strip().split('\t', 1)
                    if len(vCol) != 2:
                        logging.warning('skip malformed doc text line in [%s]', fname)
                        continue
                    DocNo, text = vCol
                    self.hDocText[DocNo] = text
        logging.info('doc text loaded')

    @staticmethod
    def _SortedObjIds(DocKg):
        """Return the keys of DocKg.hNodeId ordered by their mapped value."""
        lItem = sorted(DocKg.hNodeId.items(), key=lambda item: item[1])
        return [item[0] for item in lItem]

    def GetAllIdf(self, DocKg):
        """Return the log idf of every object in DocKg, in node-id order."""
        return [self.CtfCenter.GetLogIdf(ObjId)
                for ObjId in self._SortedObjIds(DocKg)]

    def GetAllTf(self, DocKg):
        """Return a list copy of the node weights of DocKg."""
        return list(DocKg.vNodeWeight)

    def GetAllTextCosine(self, DocKg):
        """Return the raw cosine between each object description and the doc text.

        Values are in node-id order. When no text is stored for
        DocKg.DocNo, a list of zeros of length len(DocKg) is returned.
        Unlike GetTextCosine(), the values are not log-transformed.
        """
        DocText = self.hDocText.get(DocKg.DocNo, "")
        if "" == DocText:
            return [0] * len(DocKg)

        DocLm = LmBaseC(DocText)
        lCos = []
        for ObjId in self._SortedObjIds(DocKg):
            desp = self.ObjCenter.FetchObjDesp(ObjId)
            lCos.append(LmBaseC.Cosine(LmBaseC(desp), DocLm))
        return lCos

    def GetTextCosine(self, ObjId, DocKg):
        """Return log(cosine) between the description of ObjId and the doc text.

        Returns self.MinLogProb when the cosine is not positive, which is
        also the path taken when no text is stored for DocKg.DocNo and the
        resulting cosine is zero.
        """
        DocText = self.hDocText.get(DocKg.DocNo, "")
        DocLm = LmBaseC(DocText)
        desp = self.ObjCenter.FetchObjDesp(ObjId)
        score = LmBaseC.Cosine(LmBaseC(desp), DocLm)
        # math.log is undefined for non-positive input; fall back instead.
        if score <= 0:
            return self.MinLogProb
        return math.log(score)

    def LinearWeightTfIdfTextSim(self, ObjId, DocKg, TfScore=1, IdfScore=0, TextSimScore=0):
        """Score ObjId in DocKg as a normalised weighted sum of tf, idf and text sim.

        The three weights are divided by their sum before use. A signal
        whose weight is zero is not computed at all. Returns
        self.MinLogProb when ObjId is not in DocKg.
        """
        if ObjId not in DocKg:
            return self.MinLogProb

        W = np.array([TfScore, IdfScore, TextSimScore])
        W = W / float(sum(W))

        # Only this object's entries are needed, so read them directly
        # rather than building tf/idf vectors for every node. This relies on
        # hNodeId mapping each object to its index in vNodeWeight, which is
        # how the vector-based helpers above already interpret it.
        p = DocKg.hNodeId[ObjId]
        tf = DocKg.vNodeWeight[p] if TfScore != 0 else 0.0
        idf = self.CtfCenter.GetLogIdf(ObjId) if IdfScore != 0 else 0.0
        TextSim = self.GetTextCosine(ObjId, DocKg) if TextSimScore != 0 else 0

        # The text term uses the normalised weight W[2], like the other two
        # terms; previously the raw TextSimScore was applied here.
        return tf * W[0] + idf * W[1] + TextSim * W[2]

    def inference(self, ObjId, DocKg):
        """Score ObjId in DocKg using the weights in self.lInferenceWeight."""
        return self.LinearWeightTfIdfTextSim(
            ObjId, DocKg,
            self.lInferenceWeight[0],
            self.lInferenceWeight[1],
            self.lInferenceWeight[2])
# Example 2
# 0
class EntityCorrelationFromTextSimC(cxBaseC):
    """Find neighbor objects of an object by text similarity.

    Candidates come from running the object's name as a query through the
    searcher; each candidate is scored by the negated KL value between the
    term-frequency vector of the object's description and that of the
    candidate.
    """

    def Init(self):
        """Reset every attribute to its default value."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        # number of neighbors written per object by Process()
        self.NeighborNum = 50

    def SetConf(self, ConfIn):
        """Configure the searcher and object center, and read 'neighbornum'."""
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        # NeighborNum is used as a slice bound, so it must be an integer.
        # NOTE(review): GetConf presumably returns a string when the key is
        # present in the conf file; int() is a no-op for the int default.
        self.NeighborNum = int(
            self.conf.GetConf('neighbornum', self.NeighborNum))

    @staticmethod
    def ShowConf():
        """Print the configuration keys read by this class and its components."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print('neighbornum')

    def ProcessOneObj(self, ObjId, name):
        '''
        Return every scored neighbor of ObjId as a list of [objid, -KL],
        sorted by score in descending order.

        The list is NOT truncated here; Process() keeps the first
        self.NeighborNum entries. An empty list is returned when the
        cleaned name is empty or the object's description has no terms.
        '''
        query = TextBaseC.RawClean(name)
        if "" == query:
            return []

        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        # Nothing to compare against: bail out before paying for the search.
        if len(ThisLm.hTermTF) == 0:
            return []
        ThisVec = VectorC(ThisLm.hTermTF)

        lObjDoc = self.Searcher.RunQuery(query)

        lObjNeighbor = []
        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                continue
            if not Id.startswith('/m/'):
                print("[%s %s] neighbor id [%s] format error" % (ObjId, name,
                                                                 Id))
                continue
            # NOTE(review): the result object itself (not its text content)
            # is handed to LmBaseC; confirm LmBaseC accepts this type.
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            score = VectorC.KL(ThisVec, NeighborVec)
            # Negate so that a smaller KL value ranks higher.
            lObjNeighbor.append([Id, -score])

        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print("[%s:%s] neighbor id score get" % (ObjId, name))
        return lObjNeighbor

    def Process(self, ObjInName, OutName):
        """Write the top self.NeighborNum neighbors of every object to OutName.

        ObjInName is read line by line as tab-separated columns: column 0
        is the object id, column 1 its name; lines with fewer than two
        columns are skipped. Each output line is
        'objid<TAB>neighborid<TAB>score<TAB>name<TAB>neighbor name'.
        """
        # `with` closes both files even if processing an object raises.
        with open(OutName, 'w') as out:
            with open(ObjInName) as ObjIn:
                for line in ObjIn:
                    vCol = line.strip().split('\t')
                    if len(vCol) < 2:
                        continue
                    lObjNeighbor = self.ProcessOneObj(vCol[0], vCol[1])
                    for NeighborId, score in lObjNeighbor[:self.NeighborNum]:
                        out.write('%s\t%s\t%f\t%s\t%s\n' % (
                            vCol[0], NeighborId, score, vCol[1],
                            self.ObjCenter.FetchObjName(NeighborId)))
                    print("[%s:%s] done" % (vCol[0], vCol[1]))

        print("finished")