site.addsitedir('/bos/usr0/cx/PyCode/cxPyLib')
site.addsitedir('/bos/usr0/cx/PyCode/GoogleAPI')

from ObjCenter.FbObjCacheCenter import FbObjCacheCenterC
from cxBase.Conf import cxConfC
import sys

# Command-line driver: for every tab-separated row of the input file, look up
# the names of the object ids in the first two columns and write the original
# row followed by those names to the output file.
#
# Takes exactly one argument, a configuration file path. That configuration
# is read for the keys `in` (input path) and `out` (output path) and is also
# handed to FbObjCacheCenterC.
if 2 != len(sys.argv):
    # Wrong usage: list the configuration keys that are expected, then quit.
    FbObjCacheCenterC.ShowConf()
    print("in\nout")
    sys.exit()

ConfPath = sys.argv[1]
ObjCenter = FbObjCacheCenterC(ConfPath)
conf = cxConfC(ConfPath)
InName = conf.GetConf('in')
OutName = conf.GetConf('out')

# The output is opened first (as before) and both handles are released by the
# `with` block even when a lookup raises part-way through the input.
with open(OutName, 'w') as out, open(InName) as InFile:
    for cnt, line in enumerate(InFile, 1):
        line = line.strip()
        vCol = line.split('\t')
        # Only the first two columns are treated as object ids.
        lName = [ObjCenter.FetchObjName(ObjId) for ObjId in vCol[:2]]
        out.write(line + '\t' + '\t'.join(lName) + '\n')
        if 0 == (cnt % 100):
            print("processed [%d] line" % cnt)
# Example #2
# 0
class EntityCorrelationFromTextSimC(cxBaseC):
    """Score candidate neighbor objects of an object by text similarity.

    The object's name is run as a query through an ``IndriSearchCenterC``;
    each returned document is compared with the object's description (fetched
    from an ``FbObjCacheCenterC``) using ``VectorC.KL`` on vectors built from
    the ``hTermTF`` of ``LmBaseC`` instances.

    Configuration key read by this class itself: ``neighbornum`` (default 50),
    the maximum number of neighbors written per object by ``Process``.
    """

    def Init(self):
        """Create the searcher and object cache and set the default limit."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NeighborNum = 50

    def SetConf(self, ConfIn):
        """Forward ``ConfIn`` to the base class, searcher and object cache,
        then read ``neighbornum`` (keeping the current value as default)."""
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.NeighborNum = self.conf.GetConf('neighbornum', self.NeighborNum)

    @staticmethod
    def ShowConf():
        """Print the configuration keys of this class and its components."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print('neighbornum')

    def ProcessOneObj(self, ObjId, name):
        """Score every search result for ``name`` against ``ObjId``.

        Returns a list of ``[neighbor id, -KL]`` pairs sorted by the second
        element in descending order. The list is NOT truncated here; callers
        (see ``Process``) apply ``self.NeighborNum``. An empty list is
        returned when the cleaned query is empty or when the object's own
        language model has no terms.
        """
        query = TextBaseC.RawClean(name)
        if "" == query:
            return []
        lObjDoc = self.Searcher.RunQuery(query)

        lObjNeighbor = []

        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        ThisVec = VectorC(ThisLm.hTermTF)
        if len(ThisLm.hTermTF) == 0:
            # Nothing to compare against without terms for this object.
            return []

        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                # The object itself is not its own neighbor.
                continue
            if not Id.startswith('/m/'):
                print("[%s %s] neighbor id [%s] format error" % (ObjId, name,
                                                                 Id))
                continue
            # NOTE(review): LmBaseC is given a description string above but
            # the search-result object here -- confirm it accepts both.
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            score = VectorC.KL(ThisVec, NeighborVec)
            # Store the negated value so the descending sort below puts the
            # smallest KL value first.
            lObjNeighbor.append([Id, -score])

        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print("[%s:%s] neighbor id score get" % (ObjId, name))
        return lObjNeighbor

    def Process(self, ObjInName, OutName):
        """Write the top neighbors of every object listed in ``ObjInName``.

        Each input line is tab-separated with the object id in column 0 and
        its name in column 1; lines with fewer than two columns are skipped.
        For each object at most ``self.NeighborNum`` rows are written to
        ``OutName`` as: object id, neighbor id, score, object name, neighbor
        name (tab-separated).
        """
        # Both files are closed by the `with` block even if scoring raises.
        with open(OutName, 'w') as out, open(ObjInName) as ObjIn:
            for line in ObjIn:
                vCol = line.strip().split('\t')
                if len(vCol) < 2:
                    continue
                lObjNeighbor = self.ProcessOneObj(vCol[0], vCol[1])
                for NeighborId, score in lObjNeighbor[:self.NeighborNum]:
                    out.write('%s\t%s\t%f\t%s\t%s\n' % (
                        vCol[0], NeighborId, score, vCol[1],
                        self.ObjCenter.FetchObjName(NeighborId)))
                print("[%s:%s] done" % (vCol[0], vCol[1]))

        print("finished")