Example #1
0
 def Init(self):
     """Reset every member to its unconfigured default.

     The ``BoeLmC`` part of the object is initialised first; afterwards the
     plain defaults are installed and fresh helper objects are created.
     """
     BoeLmC.Init(self)

     # plain defaults
     self.DocTextDir = ''
     self.hDocText = dict()
     self.lInferenceWeight = [1, 0, 0]

     # helper objects
     self.ObjCenter = FbObjCacheCenterC()
     self.CtfCenter = TermCtfC()
Example #2
0
 def Init(self):
     """Install the default value of every member.

     Base-class initialisation runs first, then the plain defaults are set
     and the helper objects are created.
     """
     cxBaseC.Init(self)

     # plain defaults
     self.DocKgDir = ''
     self.hQObj = dict()
     self.OrigQWeight = 0.5

     # helper objects
     self.ObjCenter = FbObjCacheCenterC()
     self.Inferener = LESInferencerC()
Example #3
0
    def Init(self):
        """Install the default value of every member.

        Base-class initialisation runs first, then the plain defaults are
        set and the helper objects are created.
        """
        cxBaseC.Init(self)

        # plain defaults
        self.QDocNodeDataDir = ''
        self.OrigQWeight = 0.5
        self.UseQObjOnly = True

        # helper objects
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.Evaluator = AdhocEvaC()
        self.Inferener = LESInferencerC()
Example #4
0
    def Init(self):
        """Install defaults: an empty node directory plus all helper objects.

        Base-class initialisation runs first. The helpers are one searcher,
        one object cache and four feature extractors.
        """
        cxBaseC.Init(self)
        self.NodeDir = ''

        # retrieval and object lookup
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()

        # one extractor per edge family
        self.QDocFeatureExtractor = LeToRFeatureExtractCenterC()
        self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC()
        self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC()
        self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC()
Example #5
0
    def Init(self):
        """Install defaults: empty feature-group lists, helpers, empty node dir.

        Base-class initialisation runs first.
        """
        cxBaseC.Init(self)

        # plain defaults
        self.lQObjFeatureGroup = list()
        self.lObjObjFeatureGroup = list()
        self.lDocObjFeatureGroup = list()
        self.NodeDir = ''

        # one extractor per feature group
        self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC()
        self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC()
        self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC()
        self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC()
        self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC()

        # retrieval and object lookup
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
Example #6
0
    def ShowConf(cls):
        """Print the option names of the helper classes, then this class's own."""
        for Component in (cxBaseC, IndriSearchCenterC, FbObjCacheCenterC,
                          AdhocEvaC):
            Component.ShowConf()

        # option names (with defaults) printed for this class itself
        print('qdocnodedatadir\norigqweight 0.5\nqobjonly 1')
class RawGraphPerEdgeFeatureConstructorC(SearchResDocGraphConstructorC):
    """Build one document graph per edge feature and dump each of them.

    For every document retrieved for a query, the object-object features are
    extracted once; then, for each feature dimension, the graph's edge matrix
    is replaced by that feature's scores and the graph is written under
    ``<OutDir>/<feature name>/<qid>/<DocNo>``.
    """

    def Init(self):
        """Run the base initialisation and create the two helper centers."""
        SearchResDocGraphConstructorC.Init(self)
        self.EdgeFeatureCenter = ObjObjFeatureExtractCenterC()
        self.ObjCenter = FbObjCacheCenterC()

    def SetConf(self, ConfIn):
        """Forward ``ConfIn`` to the base class and to both helper centers."""
        SearchResDocGraphConstructorC.SetConf(self, ConfIn)
        for Center in (self.EdgeFeatureCenter, self.ObjCenter):
            Center.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        """Print the options of the base class and of both helper centers."""
        for Component in (SearchResDocGraphConstructorC,
                          ObjObjFeatureExtractCenterC, FbObjCacheCenterC):
            Component.ShowConf()

    def FormForOneQ(self, qid, query):
        """Form and dump the per-feature document graphs of one query.

        Always returns True.
        """
        lDoc = self.Searcher.RunQuery(query, qid)

        # fill all graphs first, exactly one per retrieved document
        lDocKg = []
        for doc in lDoc:
            lDocKg.append(self.GraphFormer.FillDocGraph(doc.DocNo))

        for DocKg in lDocKg:
            logging.info('forming edge mtx for [%s] [%d] obj', DocKg.DocNo,
                         len(DocKg.hNodeId))

            lObj = []
            for ObjId in DocKg.hNodeId.keys():
                lObj.append(self.ObjCenter.FetchObj(ObjId))

            mhFeature = self.EdgeFeatureCenter.ExtractObjObjFeature(
                lObj, query)

            for FeatureName in self.EdgeFeatureCenter.FeatureDims():
                OutDir = '/'.join([self.OutDir, FeatureName, qid])
                if not os.path.exists(OutDir):
                    os.makedirs(OutDir)

                # project the feature dicts onto this single dimension
                llEdgeFeatureScore = []
                for lhFeature in mhFeature:
                    lRow = []
                    for hFeature in lhFeature:
                        lRow.append(hFeature[FeatureName])
                    llEdgeFeatureScore.append(lRow)

                DocKg.mEdgeMatrix = np.array(llEdgeFeatureScore)
                DocKg.dump(OutDir + '/' + DocKg.DocNo)
                logging.debug('[%s] feature for doc [%s] dummped', FeatureName,
                              DocKg.DocNo)

            logging.info('[%s] dummped [%d] node', DocKg.DocNo, len(DocKg))

        logging.info('[%s-%s] doc kg formed', qid, query)
        return True
Example #8
0
 def ShowConf(cls):
     """Print the class name, the object cache options, then all extractors'."""
     print(cls.__name__)
     FbObjCacheCenterC.ShowConf()

     print('objobjfeaturegroup')

     for Extractor in (ObjObjEdgeFeatureKGExtractorC,
                       ObjObjEdgeFeaturePreCalcSimExtractorC,
                       ObjObjEdgeFeatureTextSimExtractorC,
                       ObjObjEdgeFeatureTypeExtractorC,
                       ObjObjEdgeFeatureEmbSimExtractorC):
         Extractor.ShowConf()
Example #9
0
    def ShowConf():
        """Print helper options, this class's option names, then the extractors'."""
        for Component in (cxBaseC, IndriSearchCenterC, FbObjCacheCenterC):
            Component.ShowConf()

        print('nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup')

        for Extractor in (QueryObjEdgeFeatureAnaExtractorC,
                          DocObjEdgeFeatureFaccExtractorC,
                          ObjObjEdgeFeatureKGExtractorC,
                          ObjObjEdgeFeaturePreCalcSimExtractorC,
                          ObjObjEdgeFeatureTextSimExtractorC):
            Extractor.ShowConf()
Example #10
0
    def ShowConf(cls):
        """Print base options, the class name and 'nodedir', then all helpers'."""
        cxBaseC.ShowConf()
        print(cls.__name__)
        print('nodedir')

        for Component in (IndriSearchCenterC, FbObjCacheCenterC,
                          LeToRFeatureExtractCenterC,
                          FbQObjFeatureExtractCenterC,
                          FbObjDocFeatureExtractCenterC,
                          ObjObjFeatureExtractCenterC):
            Component.ShowConf()
Example #11
0
class LESRanker(cxBaseC):
    """Re-rank retrieved documents with a mix of retrieval and LES scores.

    The final score of a document is
    ``OrigQWeight * exp(doc.score) + (1 - OrigQWeight) * LESScore``, where
    the LES score comes from ``self.Inferener.inference``.
    """

    def Init(self):
        """Install defaults and create the helper objects."""
        cxBaseC.Init(self)

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.Evaluator = AdhocEvaC()

        self.Inferener = LESInferencerC()

        self.QDocNodeDataDir = ""
        self.OrigQWeight = 0.5
        self.UseQObjOnly = True

    @classmethod
    def ShowConf(cls):
        """Print the option names of the helpers, then this class's own."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        AdhocEvaC.ShowConf()

        print('qdocnodedatadir\norigqweight 0.5\nqobjonly 1')

    def SetConf(self, ConfIn):
        """Configure the helpers and read this class's own options."""
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)
        self.Evaluator.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/'
        self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight)
        self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1))

    def LoadQDocObj(self, query):
        """Load the key -> object-id lists stored for ``query``.

        The file lives in ``QDocNodeDataDir`` under the name produced by
        ``IndriSearchCenterC.GenerateQueryTargetName``; every line is
        ``key \\t object id``.

        Returns:
            dict mapping each key to the list of object ids, in file order.
        """
        InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        hQDocObj = {}
        # 'with' guarantees the handle is released even if a line is malformed
        with open(InName) as InFile:
            for line in InFile:
                key, ObjId = line.strip().split('\t')
                hQDocObj.setdefault(key, []).append(ObjId)
        logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
        return hQDocObj

    def RankingForOneQ(self, qid, query):
        """Return the document numbers for one query, best first.

        When the query has no object entry (key ``q_<qid>``) the retrieval
        order is returned unchanged.
        """
        logging.info('Start LES ranking for [%s-%s]', qid, query)

        lDoc = self.Searcher.RunQuery(query, qid)
        logging.info('doc fetched')

        hQDocObj = self.LoadQDocObj(query)

        QKey = 'q_%s' % (qid)
        if QKey not in hQDocObj:
            # do nothing
            logging.info('query [%s] has no object, return raw raning', qid)
            return [doc.DocNo for doc in lDoc]

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]]

        lDocLESScore = []
        LesCnt = 0
        for doc in lDoc:
            if self.UseQObjOnly:
                lDocObj = lQObj
            else:
                if doc.DocNo not in hQDocObj:
                    lDocLESScore.append(0)
                    continue
                lDocObj = [
                    self.ObjCenter.FetchObj(ObjId)
                    for ObjId in hQDocObj[doc.DocNo]
                ]

            score = self.Inferener.inference(query, doc, lQObj, lDocObj)
            if score != 0:
                # if 0, means the obj has no desp (or very short one),
                # doesn't count as valid score
                LesCnt += 1
            lDocLESScore.append(score)

        # add average score to doc without annotation
        # using zero is not very proper
        if LesCnt:
            AvgScore = sum(lDocLESScore) / float(LesCnt)
        else:
            # no document got a valid LES score (or no document at all):
            # there is nothing to average, so keep the zeros instead of
            # dividing by zero; the ranking then follows the retrieval score
            AvgScore = 0

        lDocLESScore = [
            item if item != 0 else AvgScore for item in lDocLESScore
        ]

        lScore = [
            self.OrigQWeight * math.exp(doc.score) +
            (1 - self.OrigQWeight) * LESScore
            for doc, LESScore in zip(lDoc, lDocLESScore)
        ]

        # sorted() is stable like list.sort() and also accepts an iterator
        lDocNoScore = sorted(zip([doc.DocNo for doc in lDoc], lScore),
                             key=lambda item: item[1],
                             reverse=True)
        lRankedDocNo = [item[0] for item in lDocNoScore]

        logging.info('query [%s] ranked', qid)

        return lRankedDocNo

    def Process(self, QIn, OutName):
        """Rank every query of ``QIn``, evaluate, and write results to ``OutName``.

        ``QIn`` holds one ``qid \\t query`` pair per line. One line
        ``qid \\t <dumped evaluation>`` is written per evaluation entry.
        Always returns True.
        """
        with open(QIn) as QFile:
            lQidQuery = [line.split('\t') for line in QFile.read().splitlines()]

        llDocNo = [self.RankingForOneQ(qid, query) for qid, query in lQidQuery]

        logging.info('start evaluation')

        lQid = [item[0] for item in lQidQuery]
        lQuery = [item[1] for item in lQidQuery]
        lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo)

        with open(OutName, 'w') as out:
            for qid, EvaRes in lPerQEvaRes:
                out.write(qid + '\t' + EvaRes.dumps() + '\n')

        # log the last entry, if there is one
        if lPerQEvaRes:
            logging.info('%s %s', lPerQEvaRes[-1][0],
                         lPerQEvaRes[-1][1].dumps())

        return True
what's my output:
    add "\t name a \t name b" at the end of each line

'''

import site

site.addsitedir('/bos/usr0/cx/PyCode/cxPyLib')
site.addsitedir('/bos/usr0/cx/PyCode/GoogleAPI')

from ObjCenter.FbObjCacheCenter import FbObjCacheCenterC
from cxBase.Conf import cxConfC
import sys

# Exactly one command-line argument (the configuration file) is expected;
# otherwise print the available option names and stop.
if 2 != len(sys.argv):
    FbObjCacheCenterC.ShowConf()
    print "in\nout"
    sys.exit()

# The same configuration file feeds the object cache and this script's options.
ObjCenter = FbObjCacheCenterC(sys.argv[1])
conf = cxConfC(sys.argv[1])
InName = conf.GetConf('in')
OutName = conf.GetConf('out')

out = open(OutName, 'w')

cnt = 0
for line in open(InName):
    line = line.strip()
    # input lines are tab separated
    vCol = line.split('\t')
    # look up a name for the ids found in the first two columns
    lName = [ObjCenter.FetchObjName(ObjId) for ObjId in vCol[:2]]
Example #13
0
 def ShowConf(cls):
     """Print the options of the base class and object cache, then 'origqweight'."""
     for Component in (cxBaseC, FbObjCacheCenterC):
         Component.ShowConf()
     print('origqweight 0.5')
Example #14
0
class LESRanker(cxBaseC):
    """Rank documents by the LES score between query objects and doc-graph objects.

    Query objects come from an annotation file (option ``qanain``); document
    objects are the nodes of the document graph stored under ``dockgdir``.
    """

    def Init(self):
        """Install defaults and create the helper objects."""
        cxBaseC.Init(self)

        self.ObjCenter = FbObjCacheCenterC()
        self.Inferener = LESInferencerC()
        self.DocKgDir = ""
        self.hQObj = {}
        self.OrigQWeight = 0.5

    @classmethod
    def ShowConf(cls):
        """Print the option names of the helpers, then 'origqweight'."""
        cxBaseC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print('origqweight 0.5')

    def SetConf(self, ConfIn):
        """Read the options, load the query annotations, configure the cache."""
        cxBaseC.SetConf(self, ConfIn)
        self.DocKgDir = self.conf.GetConf('dockgdir')
        QAnaInName = self.conf.GetConf('qanain')
        self.LoadQObj(QAnaInName)
        self.ObjCenter.SetConf(ConfIn)
        self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight)

    def LoadQObj(self, QAnaInName):
        """Fill ``self.hQObj`` from a tab-separated annotation file.

        Column 0 is the query id, column 2 the object id and the last column
        the score (kept as the string read from the file). Lines with fewer
        than three columns, such as blank lines, are skipped.

        Always returns True.
        """
        with open(QAnaInName) as QAnaFile:
            for line in QAnaFile.read().splitlines():
                vCol = line.strip().split('\t')
                if len(vCol) < 3:
                    # no object-id column: nothing to record
                    continue
                qid, ObjId, score = vCol[0], vCol[2], vCol[-1]
                self.hQObj.setdefault(qid, []).append([ObjId, score])

        logging.info('qobj loaded from [%s]', QAnaInName)
        return True

    def RankScoreForDoc(self, qid, query, doc):
        """Return the inference score of ``doc`` for the given query.

        ``qid`` must be present in ``self.hQObj``.
        """
        DocKg = SearchResDocGraphConstructorC.LoadDocGraph(
            self.DocKgDir, qid, doc.DocNo)

        lQObjId = [item[0] for item in self.hQObj[qid]]
        lDocObjId = DocKg.hNodeId.keys()

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId]
        lDocObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId]

        return self.Inferener.inference(query, doc, lQObj, lDocObj)

    def Rank(self, qid, query, lDoc):
        """Return the document numbers of ``lDoc`` sorted by score, best first.

        If the query has no annotated object the given order is kept.
        """
        if qid not in self.hQObj:
            logging.warning('qid [%s] no ana obj, withdraw to given score', qid)
            return [doc.DocNo for doc in lDoc]

        lScore = [self.RankScoreForDoc(qid, query, doc) for doc in lDoc]
        # sorted() is stable like list.sort() and also accepts an iterator
        lDocNoScore = sorted(zip([doc.DocNo for doc in lDoc], lScore),
                             key=lambda item: item[1],
                             reverse=True)
        return [item[0] for item in lDocNoScore]
 def ShowConf():
     """Print the options of the base constructor and of both helper centers."""
     for Component in (SearchResDocGraphConstructorC,
                       ObjObjFeatureExtractCenterC, FbObjCacheCenterC):
         Component.ShowConf()
Example #16
0
class BoeLmWeighterC(BoeLmC):
    """Weight an object in a document graph from tf, idf and text similarity.

    The three weights are taken from ``self.lInferenceWeight`` in the order
    (tf, idf, text similarity); the default ``[1, 0, 0]`` uses tf only.
    """

    def __init__(self, ConfIn=""):
        """Initialise defaults and, if ``ConfIn`` is non-empty, load it."""
        self.Init()
        if "" != ConfIn:
            self.SetConf(ConfIn)

    def Init(self):
        """Install defaults and create the helper objects."""
        BoeLmC.Init(self)
        self.DocTextDir = ""
        self.ObjCenter = FbObjCacheCenterC()
        self.CtfCenter = TermCtfC()
        self.lInferenceWeight = [1, 0, 0]
        self.hDocText = {}

    def SetConf(self, ConfIn):
        """Read the options, load the document texts and the ctf data."""
        conf = cxConfC(ConfIn)

        self.DocTextDir = conf.GetConf('doctextdir')
        self.LoadDocText()

        self.ObjCenter.SetConf(ConfIn)

        CtfInName = conf.GetConf('objctf')
        self.CtfCenter.Load(CtfInName)

    @classmethod
    def ShowConf(cls):
        """Print this class's option names, then the object cache's.

        A classmethod always receives the class as first argument; without
        the ``cls`` parameter every call raised TypeError.
        """
        print('doctextdir\nobjctf')
        FbObjCacheCenterC.ShowConf()

    def LoadDocText(self):
        """Load ``DocNo \\t text`` lines from every file under ``DocTextDir``."""
        for fname in WalkDir(self.DocTextDir):
            # close each file as soon as it has been read
            with open(fname) as TextFile:
                for line in TextFile:
                    DocNo, text = line.strip().split('\t')
                    self.hDocText[DocNo] = text
        logging.info('doc text loaded')

    @staticmethod
    def _ObjIdsByPosition(DocKg):
        """Return the object ids of ``DocKg`` ordered by their ``hNodeId`` value."""
        lItem = sorted(DocKg.hNodeId.items(), key=lambda item: item[1])
        return [item[0] for item in lItem]

    def GetAllIdf(self, DocKg):
        """Return the log idf of every object, ordered by ``hNodeId`` value."""
        return [
            self.CtfCenter.GetLogIdf(ObjId)
            for ObjId in self._ObjIdsByPosition(DocKg)
        ]

    def GetAllTf(self, DocKg):
        """Return ``DocKg.vNodeWeight`` as a list."""
        return list(DocKg.vNodeWeight)

    def GetAllTextCosine(self, DocKg):
        """Return the cosine between each object description and the doc text.

        Values are ordered by ``hNodeId`` value. When no text is loaded for
        the document, a list of zeros of length ``len(DocKg)`` is returned.
        """
        DocText = self.hDocText.get(DocKg.DocNo, "")
        if "" == DocText:
            return [0] * len(DocKg)

        DocLm = LmBaseC(DocText)
        lCos = []
        for ObjId in self._ObjIdsByPosition(DocKg):
            desp = self.ObjCenter.FetchObjDesp(ObjId)
            lCos.append(LmBaseC.Cosine(LmBaseC(desp), DocLm))
        return lCos

    def GetTextCosine(self, ObjId, DocKg):
        """Return log cosine between the object description and the doc text.

        ``self.MinLogProb`` is returned when the cosine is 0.
        """
        DocText = self.hDocText.get(DocKg.DocNo, "")
        DocLm = LmBaseC(DocText)
        desp = self.ObjCenter.FetchObjDesp(ObjId)
        score = LmBaseC.Cosine(LmBaseC(desp), DocLm)
        if 0 == score:
            return self.MinLogProb
        return math.log(score)

    def LinearWeightTfIdfTextSim(self,
                                 ObjId,
                                 DocKg,
                                 TfScore=1,
                                 IdfScore=0,
                                 TextSimScore=0):
        """Return the linear combination of tf, idf and text similarity.

        The tf and idf parts use the weights normalised to sum 1; the text
        similarity part is multiplied by the raw ``TextSimScore``.
        ``self.MinLogProb`` is returned when ``ObjId`` is not in ``DocKg``.
        """
        if ObjId not in DocKg:
            return self.MinLogProb

        # a part whose weight is 0 is not computed at all
        lTf = np.zeros(len(DocKg))
        lIdf = np.zeros(len(DocKg))
        if TfScore != 0:
            lTf = np.array(self.GetAllTf(DocKg))
        if IdfScore != 0:
            lIdf = np.array(self.GetAllIdf(DocKg))
        TextSim = 0
        if TextSimScore != 0:
            TextSim = self.GetTextCosine(ObjId, DocKg)

        W = np.array([TfScore, IdfScore, TextSimScore])
        W = W / float(sum(W))

        lScore = lTf * W[0] + lIdf * W[1]

        # ObjId is known to be in DocKg here, so its position always exists
        res = lScore[DocKg.hNodeId[ObjId]]
        # NOTE(review): the raw TextSimScore is used here although the
        # normalised W[2] is available - confirm which one is intended.
        res = res + TextSim * TextSimScore
        return res

    def inference(self, ObjId, DocKg):
        """Score ``ObjId`` in ``DocKg`` with the weights of ``lInferenceWeight``."""
        return self.LinearWeightTfIdfTextSim(ObjId, DocKg,
                                             self.lInferenceWeight[0],
                                             self.lInferenceWeight[1],
                                             self.lInferenceWeight[2])
Example #17
0
 def Init(self):
     """Install defaults: 50 neighbours, a searcher and an object cache.

     Base-class initialisation runs first.
     """
     cxBaseC.Init(self)

     # plain default
     self.NeighborNum = 50

     # helper objects
     self.Searcher = IndriSearchCenterC()
     self.ObjCenter = FbObjCacheCenterC()
Example #18
0
class GraphFullFeatureExtractCenterC(cxBaseC):
    """Extract and dump all edge features of each query-document graph.

    Four feature families are produced per (query, document): query-doc,
    query-object, doc-object and object-object. They are written to one file
    per document under ``<OutDir>/<qid>/``.
    """

    def Init(self):
        """Install defaults and create the helper objects."""
        cxBaseC.Init(self)

        self.NodeDir = ""

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()

        self.QDocFeatureExtractor = LeToRFeatureExtractCenterC()
        self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC()
        self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC()
        self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC()

    def SetConf(self, ConfIn):
        """Read 'nodedir' and forward ``ConfIn`` to every helper."""
        cxBaseC.SetConf(self, ConfIn)

        self.NodeDir = self.conf.GetConf('nodedir') + '/'

        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)

        self.QDocFeatureExtractor.SetConf(ConfIn)
        self.QObjFeatureExtractor.SetConf(ConfIn)
        self.DocObjFeatureExtractor.SetConf(ConfIn)
        self.ObjObjFeatureExtractor.SetConf(ConfIn)

        logging.info('graph full feature extractor conf setted')

    @classmethod
    def ShowConf(cls):
        """Print base options, the class name and 'nodedir', then all helpers'."""
        cxBaseC.ShowConf()
        print(cls.__name__)
        print('nodedir')

        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

        LeToRFeatureExtractCenterC.ShowConf()
        FbQObjFeatureExtractCenterC.ShowConf()
        FbObjDocFeatureExtractCenterC.ShowConf()
        ObjObjFeatureExtractCenterC.ShowConf()

    def FormulateNodes(self, qid, query):
        """Collect the documents and objects that form the graphs of a query.

        Documents come from the searcher and are rearranged to the order of
        the stored node result; object ids are turned into objects through
        the object cache.

        Returns:
            (lDoc, lQObj, llDocObj) where ``llDocObj`` holds one object list
            per document, padded with empty lists when it is shorter than
            ``lDoc``.
        """
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)

        lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes(
            query, self.NodeDir)

        # match lDoc dim lDocNo dim
        lDoc = IndriSearchCenterC.RearrangeDocOrder(lDoc, lDocNo)

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId]
        llDocObj = [[self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId]
                    for lDocObjId in llDocObjId]
        while len(llDocObj) < len(lDoc):
            # add empty list for docs have no objects (thus will restrict to
            # EsdRank); if lQObj is also empty, then it is LeToR
            llDocObj.append([])

        logging.info('q[%s] all node fetched, q node %s', qid,
                     json.dumps([Obj.GetId() for Obj in lQObj]))
        return lDoc, lQObj, llDocObj

    def Process(self, qid, query, OutDir):
        """Extract and dump the features of every document of one query.

        The object list of a document graph is the query objects followed by
        that document's own objects. Always returns True.
        """
        lDoc, lQObj, llDocObj = self.FormulateNodes(qid, query)

        for doc, lDocObj in zip(lDoc, llDocObj):
            lObj = lQObj + lDocObj
            hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature = \
                self.ExtractFeatureForOneQDoc(qid, query, doc, lObj)
            self.DumpPerQRes(qid, query, doc, lObj, hQDocFeature,
                             lhQObjFeature, lhDocObjFeature, llhObjObjFeature,
                             OutDir)

        # the qid argument was missing, so the message showed a literal '%s'
        logging.info('q [%s] processed', qid)
        return True

    def PipeRun(self, QInName, OutDir):
        """Run ``Process`` for every ``qid \\t query`` line of ``QInName``.

        Always returns True.
        """
        with open(QInName) as QFile:
            lQidQuery = [line.split('\t') for line in QFile.read().splitlines()]

        for qid, query in lQidQuery:
            self.Process(qid, query, OutDir)

        logging.info('queries in [%s] processed features at [%s]', QInName,
                     OutDir)
        return True

    def ExtractFeatureForOneQDoc(self, qid, query, doc, lObj):
        """Run the four feature extractors for one query-document pair.

        Returns:
            (hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature)
        """
        # if wanna speed up, cache features
        # for clearity, now just extract multiple times

        hQDocFeature = self.QDocFeatureExtractor.Process(qid, query, doc)
        logging.debug('q[%s][%s] ltr feature extracted', query, doc.DocNo)

        lhQObjFeature = self.QObjFeatureExtractor.ProcessOneQuery([qid, query],
                                                                  lObj)
        logging.debug('q[%s][%s]  obj feature extracted', query, doc.DocNo)

        lhDocObjFeature = self.DocObjFeatureExtractor.ProcessOneQueryDocPair(
            [qid, query], doc, lObj)
        logging.debug('q[%s][%s]  doc obj feature extracted', query, doc.DocNo)

        llhObjObjFeature = self.ObjObjFeatureExtractor.Process(
            qid, query, lObj)  # symetric matrix
        logging.debug('q[%s] [%s] obj obj feature extracted', query, doc.DocNo)

        logging.debug('q [%s][%s]  all doc graph feature extracted', query,
                      doc.DocNo)

        return hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature

    def DumpPerQRes(self, qid, query, doc, lObj, hQDocFeature, lhQObjFeature,
                    lhDocObjFeature, llhObjObjFeature, OutDir):
        """Write the edge features of one query-document graph to a file.

        raw:
            a dir for this q
                a file for each doc
                    node a, node b, hFeature.json

        The query node is written as ``q_<qid>``. Always returns True.
        """
        QDir = OutDir + '/' + qid
        if not os.path.exists(QDir):
            os.makedirs(QDir)

        OutName = QDir + '/' + doc.DocNo
        QNode = 'q_%s' % (qid)

        # the file was never closed before; 'with' flushes and closes it
        with open(OutName, 'w') as out:
            # q doc
            out.write(QNode + '\t' + doc.DocNo + '\t' +
                      json.dumps(hQDocFeature) + '\n')

            # obj doc
            for Obj, hDocObjFeature in zip(lObj, lhDocObjFeature):
                out.write(Obj.GetId() + '\t' + doc.DocNo + '\t' +
                          json.dumps(hDocObjFeature) + '\n')

            # q obj, written in both directions to make it symmetric
            for Obj, hQObjFeature in zip(lObj, lhQObjFeature):
                out.write(QNode + '\t' + Obj.GetId() + '\t' +
                          json.dumps(hQObjFeature) + '\n')
                out.write(Obj.GetId() + '\t' + QNode + '\t' +
                          json.dumps(hQObjFeature) + '\n')

            # obj obj, the diagonal is skipped
            for i, ObjA in enumerate(lObj):
                for j, ObjB in enumerate(lObj):
                    if i == j:
                        continue
                    out.write(ObjA.GetId() + '\t' + ObjB.GetId() + '\t' +
                              json.dumps(llhObjObjFeature[i][j]) + '\n')

        logging.info('q[%s] doc [%s] graph dumped to file [%s]', qid,
                     doc.DocNo, OutName)
        return True
Example #19
0
columns are the same as original lines, 
but with an additional column saying which edge it is

'''

import site
site.addsitedir('/bos/usr0/cx/PyCode/cxPyLib')
site.addsitedir('/bos/usr0/cx/PyCode/SemanticSearch')
site.addsitedir('/bos/usr0/cx/PyCode/GoogleAPI')

from ObjCenter.FbObjCacheCenter import FbObjCacheCenterC
from cxBase.Conf import cxConfC
import sys

# Exactly one command-line argument (the configuration file) is expected;
# otherwise print the available option names and stop.
if 2 != len(sys.argv):
    FbObjCacheCenterC.ShowConf()
    print 'in\nout'
    sys.exit()

# The same configuration file feeds this script's options and the object cache.
conf = cxConfC(sys.argv[1])
CacheCenter = FbObjCacheCenterC(sys.argv[1])

InName = conf.GetConf('in')
OutName = conf.GetConf('out')

out = open(OutName, 'w')

for line in open(InName):
    line = line.strip()
    # input lines are tab separated; the third column is read as an object id
    vCol = line.split('\t')
    ObjId = vCol[2]
Example #20
0
 def ShowConf():
     """Print the option names 'doctextdir' and 'objctf', then the cache's."""
     OwnKeys = 'doctextdir\nobjctf'
     print(OwnKeys)
     FbObjCacheCenterC.ShowConf()
Example #21
0
class EdgeFeatureExtractCenterC(cxBaseC):
    """Extract query-object, doc-object and object-object edge features.

    Which extractors run is controlled by three feature-group lists read
    from the configuration ('ana', 'facc', 'kg', 'precalc', 'textsim').
    """

    def Init(self):
        """Install defaults and create the helper objects."""
        cxBaseC.Init(self)

        self.lQObjFeatureGroup = []
        self.lObjObjFeatureGroup = []
        self.lDocObjFeatureGroup = []

        self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC()
        self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC()
        self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC()
        self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC()
        self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC()

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NodeDir = ""

    def SetConf(self, ConfIn):
        """Read the options and configure only the extractors that are enabled."""
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)

        self.NodeDir = self.conf.GetConf('nodedir') + '/'

        self.lQObjFeatureGroup = self.conf.GetConf('qobjfeaturegroup',
                                                   self.lQObjFeatureGroup)
        self.lDocObjFeatureGroup = self.conf.GetConf('docobjfeaturegroup',
                                                     self.lDocObjFeatureGroup)
        self.lObjObjFeatureGroup = self.conf.GetConf('objobjfeaturegroup',
                                                     self.lObjObjFeatureGroup)

        if 'ana' in self.lQObjFeatureGroup:
            self.QObjAnaExtractor.SetConf(ConfIn)
        if 'facc' in self.lDocObjFeatureGroup:
            self.DocObjFaccExtractor.SetConf(ConfIn)

        if 'kg' in self.lObjObjFeatureGroup:
            self.ObjObjKGExtractor.SetConf(ConfIn)
        if 'precalc' in self.lObjObjFeatureGroup:
            self.ObjObjPreCalcExtractor.SetConf(ConfIn)
        if 'textsim' in self.lObjObjFeatureGroup:
            self.ObjObjTextSimExtractor.SetConf(ConfIn)

        logging.info('edge feature center confs setted')

    @staticmethod
    def ShowConf():
        """Print helper options, this class's option names, then the extractors'."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

        print('nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup')

        QueryObjEdgeFeatureAnaExtractorC.ShowConf()
        DocObjEdgeFeatureFaccExtractorC.ShowConf()
        ObjObjEdgeFeatureKGExtractorC.ShowConf()
        ObjObjEdgeFeaturePreCalcSimExtractorC.ShowConf()
        ObjObjEdgeFeatureTextSimExtractorC.ShowConf()

    def FormulateNodes(self, qid, query):
        """Return the documents and objects that form the graph of a query.

        Documents come from the searcher. Object ids are read, one per line,
        from the file in ``NodeDir`` named by
        ``IndriSearchCenterC.GenerateQueryTargetName(query)`` and fetched
        through the object cache.

        Returns:
            (lDoc, lObj)
        """
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)

        NodeName = self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        with open(NodeName) as NodeFile:
            lObjId = NodeFile.read().splitlines()

        lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
        logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
        return lDoc, lObj

    def ExtractPerQObj(self, qid, query, obj):
        """Return the feature dict of one query-object edge."""
        hFeature = {}
        logging.debug('start extracting q[%s]-obj[%s] feature', query,
                      obj.GetId())
        if 'ana' in self.lQObjFeatureGroup:
            hFeature.update(self.QObjAnaExtractor.process(qid, query, obj))
        logging.debug('q[%s]-obj[%s] feature extracted', query, obj.GetId())
        return hFeature

    def ExtractQObjFeature(self, qid, query, lObj):
        """Return one query-object feature dict per object, in ``lObj`` order."""
        logging.info('start extracting [%s][%s] q-obj feature [%d] obj', qid,
                     query, len(lObj))
        lhFeature = [self.ExtractPerQObj(qid, query, obj) for obj in lObj]
        logging.info('q obj feature extracted')
        return lhFeature

    def ExtractPerDocObj(self, doc, obj):
        """Return the feature dict of one document-object edge."""
        hFeature = {}
        logging.debug('start extracting doc[%s]-obj[%s] feature', doc.DocNo,
                      obj.GetId())
        if 'facc' in self.lDocObjFeatureGroup:
            hFeature.update(self.DocObjFaccExtractor.process(doc, obj))

        logging.debug('doc[%s]-obj[%s] feature extracted', doc.DocNo,
                      obj.GetId())
        return hFeature

    def ExtractDocObjFeature(self, lDoc, lObj):
        """Return the doc x obj matrix (list of lists) of feature dicts."""
        logging.info('start extract [%d] doc - [%d] obj feature mtx',
                     len(lDoc), len(lObj))
        llhFeature = [[self.ExtractPerDocObj(doc, obj) for obj in lObj]
                      for doc in lDoc]
        logging.info('doc obj feature extracted')
        return llhFeature

    def ExtractPerObjObj(self, ObjA, ObjB, query):
        """Return the feature dict of the directed edge ObjA -> ObjB."""
        hFeature = {}
        logging.debug('start extracting for obj pair [%s-%s]', ObjA.GetId(),
                      ObjB.GetId())
        if 'kg' in self.lObjObjFeatureGroup:
            hFeature.update(self.ObjObjKGExtractor.process(ObjA, ObjB))
        if 'precalc' in self.lObjObjFeatureGroup:
            hFeature.update(
                self.ObjObjPreCalcExtractor.process(ObjA, ObjB, query))
        if 'textsim' in self.lObjObjFeatureGroup:
            hFeature.update(self.ObjObjTextSimExtractor.process(ObjA, ObjB))
        logging.debug('obj pair [%s-%s] feature extracted', ObjA.GetId(),
                      ObjB.GetId())
        return hFeature

    def ExtractObjObjFeature(self, lObj, query):
        """Return, per object, the feature dicts towards every other object.

        Pairs whose two ids are equal are left out, so a row is shorter than
        ``lObj``; ``DumpRes`` accounts for this when labelling the entries.
        """
        llhFeature = []  # obj -> obj, diagonal is left out
        logging.info('start extract [%d] obj pair feature mtx', len(lObj))
        for ObjA in lObj:
            lhFeature = []
            for ObjB in lObj:
                if ObjA.GetId() == ObjB.GetId():
                    continue
                lhFeature.append(self.ExtractPerObjObj(ObjA, ObjB, query))
            llhFeature.append(lhFeature)

        logging.info('obj obj feature extracted')
        return llhFeature

    def Process(self, qid, query):
        """Extract every edge feature of one query.

        Returns:
            (lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature)
        """
        lDoc, lObj = self.FormulateNodes(qid, query)
        logging.info('nodes fetched')

        lQObjFeature = self.ExtractQObjFeature(qid, query, lObj)

        llDocObjFeature = self.ExtractDocObjFeature(lDoc, lObj)

        llObjObjFeature = self.ExtractObjObjFeature(lObj, query)

        return lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature

    def DumpRes(self, OutName, query, lDoc, lObj, lQObjFeature,
                llDocObjFeature, llObjObjFeature):
        """Write all edge features of one query to ``OutName``.

        Each line is ``query|doc|obj \\t obj \\t json.dumps(hFeature)``.
        """
        with open(OutName, 'w') as out:
            for obj, hFeature in zip(lObj, lQObjFeature):
                out.write(query + '\t' + obj.GetId() + '\t' +
                          json.dumps(hFeature) + '\n')

            for doc, lhFeature in zip(lDoc, llDocObjFeature):
                for obj, hFeature in zip(lObj, lhFeature):
                    out.write(doc.DocNo + '\t' + obj.GetId() + '\t' +
                              json.dumps(hFeature) + '\n')

            for ObjA, lhFeature in zip(lObj, llObjObjFeature):
                if len(lhFeature) == len(lObj):
                    # full row: entries line up with lObj directly
                    lTarget = lObj
                else:
                    # rows from ExtractObjObjFeature omit the same-id pairs;
                    # zipping with the full lObj would attach the wrong id to
                    # every entry from the diagonal onward
                    lTarget = [
                        ObjB for ObjB in lObj
                        if ObjB.GetId() != ObjA.GetId()
                    ]
                for ObjB, hFeature in zip(lTarget, lhFeature):
                    out.write(ObjA.GetId() + '\t' + ObjB.GetId() + '\t' +
                              json.dumps(hFeature) + '\n')

        logging.info('query [%s] feature dumped', query)

    def PipeRun(self, QInName, OutDir):
        """Extract and dump the edge features of every query in ``QInName``.

        for now:
            output raw type
            each file is a query's edge features
                each line is query|doc|obj \\t obj \\t json.dumps(hFeature)
        """
        with open(QInName) as QFile:
            lQidQuery = [line.split('\t') for line in QFile.read().splitlines()]

        for qid, query in lQidQuery:
            logging.info('start extracting for [%s][%s]', qid, query)
            lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = \
                self.Process(qid, query)
            OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
                query)
            logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query,
                         OutName)
            self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                         llDocObjFeature, llObjObjFeature)

        logging.info('all finished')
        return
Example #22
0
 def ShowConf():
     """Print the options of the three helper classes, then 'neighbornum'."""
     for Component in (cxBaseC, IndriSearchCenterC, FbObjCacheCenterC):
         Component.ShowConf()
     print('neighbornum')
Example #23
0
class EntityCorrelationFromTextSimC(cxBaseC):
    """Find, for each object, the neighbours with the most similar text.

    Candidates are retrieved by searching the object's name; each candidate
    is scored with the negated KL value between the two term vectors.
    """

    def Init(self):
        """Install defaults and create the helper objects."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NeighborNum = 50

    def SetConf(self, ConfIn):
        """Configure the helpers and read 'neighbornum'."""
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.NeighborNum = self.conf.GetConf('neighbornum', self.NeighborNum)

    @staticmethod
    def ShowConf():
        """Print the options of the helper classes, then 'neighbornum'."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print('neighbornum')

    def ProcessOneObj(self, ObjId, name):
        """Return all scored neighbours of one object, best first.

        Returns:
            list of ``[neighbour id, -KL score]`` sorted by descending
            score; empty when the cleaned name or the object's description
            terms are empty. The caller truncates to ``self.NeighborNum``.
        """
        # search in index with the cleaned name
        query = TextBaseC.RawClean(name)
        if "" == query:
            return []
        lObjDoc = self.Searcher.RunQuery(query)

        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        if len(ThisLm.hTermTF) == 0:
            return []
        # built only once the description is known to have terms
        ThisVec = VectorC(ThisLm.hTermTF)

        lObjNeighbor = []
        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                continue
            if not Id.startswith('/m/'):
                print("[%s %s] neighbor id [%s] format error" % (ObjId, name,
                                                                 Id))
                continue
            # NOTE(review): LmBaseC receives the document object here, while
            # other call sites pass a text string - confirm this is supported.
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            score = VectorC.KL(ThisVec, NeighborVec)
            # negate so that sorting in descending order puts low KL first
            lObjNeighbor.append([Id, -score])

        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print("[%s:%s] neighbor id score get" % (ObjId, name))
        return lObjNeighbor

    def Process(self, ObjInName, OutName):
        """Write the top neighbours of every object listed in ``ObjInName``.

        Input lines are ``object id \\t name``; lines with fewer than two
        columns are skipped. Output lines are
        ``id \\t neighbour id \\t score \\t name \\t neighbour name``.
        """
        # output is opened first, as before; both files are closed on any exit
        with open(OutName, 'w') as out, open(ObjInName) as ObjIn:
            for line in ObjIn:
                vCol = line.strip().split('\t')
                if len(vCol) < 2:
                    continue
                lObjNeighbor = self.ProcessOneObj(vCol[0], vCol[1])
                for NeighborId, score in lObjNeighbor[:self.NeighborNum]:
                    out.write('%s\t%s\t%f\t%s\t%s' % (
                        vCol[0], NeighborId, score, vCol[1],
                        self.ObjCenter.FetchObjName(NeighborId)) + '\n')
                print("[%s:%s] done" % (vCol[0], vCol[1]))

        print("finished")
 def Init(self):
     """Run the base-class initialisation and create the two helper centers."""
     # base-class defaults first
     SearchResDocGraphConstructorC.Init(self)
     # extractor for object-object edge features
     self.EdgeFeatureCenter = ObjObjFeatureExtractCenterC()
     # object cache
     self.ObjCenter = FbObjCacheCenterC()