class SearchResDocGraphConstructorC(DocGraphConstructorC):
    """Build and dump a document knowledge graph for every doc retrieved
    for each query in an input file (one "qid<TAB>query" per line)."""

    def Init(self):
        DocGraphConstructorC.Init(self)
        self.Searcher = IndriSearchCenterC()

    def SetConf(self, ConfIn):
        DocGraphConstructorC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        DocGraphConstructorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def FormForOneQ(self, qid, query):
        """Retrieve docs for one query, build one graph per doc, and dump
        each graph to OutDir/qid/DocNo. Always returns True."""
        lDoc = self.Searcher.RunQuery(query, qid)

        # one knowledge graph per retrieved document
        lDocKg = []
        for doc in lDoc:
            lDocKg.append(self.GraphFormer.FillDocGraph(doc.DocNo))

        TargetDir = self.OutDir + '/' + qid
        if not os.path.exists(TargetDir):
            os.makedirs(TargetDir)

        for DocKg in lDocKg:
            DocKg.dump(TargetDir + '/' + DocKg.DocNo)
            logging.debug('[%s] dummped [%d] node', DocKg.DocNo, len(DocKg))

        logging.info('[%s-%s] doc kg formed', qid, query)
        return True

    def Process(self, QInName):
        """Run FormForOneQ for every tab-separated qid/query line."""
        for line in open(QInName).read().splitlines():
            qid, query = line.split('\t')
            self.FormForOneQ(qid, query)

        logging.info('[%s] query finished', QInName)
        return True
# Esempio n. 2
# 0
class LESRanker(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.Evaluator = AdhocEvaC()

        self.Inferener = LESInferencerC()

        self.QDocNodeDataDir = ""
        self.OrigQWeight = 0.5
        self.UseQObjOnly = True

    @classmethod
    def ShowConf(cls):
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        AdhocEvaC.ShowConf()

        print 'qdocnodedatadir\norigqweight 0.5\nqobjonly 1'

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)
        self.Evaluator.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/'
        self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight)
        self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1))

    def LoadQDocObj(self, query):
        InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        hQDocObj = {}
        for line in open(InName):
            key, ObjId = line.strip().split('\t')
            if not key in hQDocObj:
                hQDocObj[key] = [ObjId]
            else:
                hQDocObj[key].append(ObjId)
        logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
        return hQDocObj

    def RankingForOneQ(self, qid, query):
        logging.info('Start LES ranking for [%s-%s]', qid, query)

        lDoc = self.Searcher.RunQuery(query, qid)
        logging.info('doc fetched')

        hQDocObj = self.LoadQDocObj(query)

        QKey = 'q_%s' % (qid)
        if not QKey in hQDocObj:
            #do nothing
            logging.info('query [%s] has no object, return raw raning', qid)
            return [doc.DocNo for doc in lDoc]

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]]

        lDocLESScore = []
        LesCnt = 0
        for doc in lDoc:
            if self.UseQObjOnly:
                lDocObj = lQObj
            else:
                if not doc.DocNo in hQDocObj:
                    lDocLESScore.append(0)
                    continue
                lDocObj = [
                    self.ObjCenter.FetchObj(ObjId)
                    for ObjId in hQDocObj[doc.DocNo]
                ]

            score = self.Inferener.inference(query, doc, lQObj, lDocObj)
            if score != 0:
                #if 0, means the obj has no desp (or very short one), doesn't count as valid score
                LesCnt += 1
            lDocLESScore.append(score)

        #add average score to doc without annotation
        #using zero is not very proper
        AvgScore = sum(lDocLESScore) / float(LesCnt)

        lDocLESScore = [
            item if item != 0 else AvgScore for item in lDocLESScore
        ]

        lScore= [self.OrigQWeight * math.exp(doc.score) + (1-self.OrigQWeight) * LESScore \
                     for doc,LESScore in zip(lDoc,lDocLESScore)]

        lDocNoScore = zip([doc.DocNo for doc in lDoc], lScore)
        lDocNoScore.sort(key=lambda item: item[1], reverse=True)
        lRankedDocNo = [item[0] for item in lDocNoScore]

        logging.info('query [%s] ranked', qid)

        return lRankedDocNo

    def Process(self, QIn, OutName):

        lQidQuery = [
            line.split('\t') for line in open(QIn).read().splitlines()
        ]

        llDocNo = [self.RankingForOneQ(qid, query) for qid, query in lQidQuery]

        logging.info('start evaluation')

        lQid = [item[0] for item in lQidQuery]
        lQuery = [item[1] for item in lQidQuery]
        lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo)

        out = open(OutName, 'w')
        for qid, EvaRes in lPerQEvaRes:
            print >> out, qid + '\t' + EvaRes.dumps()

        out.close()
        logging.info('%s %s', lPerQEvaRes[-1][0], lPerQEvaRes[-1][1].dumps())

        return True
class LeToRFeatureExtractCenterC(cxBaseC):
    """
    Central learning-to-rank feature extractor: dispatches per
    (qid, query, doc) extraction to the feature groups enabled in the conf
    ('givenfeature', 'termpairemb', 'emblm') and merges their outputs into
    one feature dict.
    """

    def Init(self):
        cxBaseC.Init(self)
        self.Prepared = False  # lazy-load guard for word2vec + sub-extractors
        
        self.Word2VecInName = ""
        self.Word2VecModel = None
        
        self.lFeatureGroup = []
        self.Searcher = IndriSearchCenterC()
        self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
        self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
        self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
        self.QRelCenter = AdhocQRelC()
        self.QRelIn = ""
        
    
    def SetConf(self, ConfIn):
        """Read conf, load qrels, and configure only the enabled groups."""
        cxBaseC.SetConf(self, ConfIn)
        self.Word2VecInName = self.conf.GetConf('word2vecin')
        
        self.lFeatureGroup = self.conf.GetConf('featuregroup')
        
        self.QRelIn = self.conf.GetConf('qrel')
        self.QRelCenter.Load(self.QRelIn)
        # a single group may come back as a bare string; normalize to list
        if type(self.lFeatureGroup) != list:
            self.lFeatureGroup = [self.lFeatureGroup]
            
        self.Searcher.SetConf(ConfIn)
        
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.SetConf(ConfIn)
            
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.SetConf(ConfIn)
            
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.SetConf(ConfIn)
            
            
        return True
    
    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        print 'word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm'
        LeToRGivenFeatureExtractorC.ShowConf()
        EmbeddingTermPairFeatureExtractorC.ShowConf()
        EmbeddingLmFeatureExtractorC.ShowConf()
        IndriSearchCenterC.ShowConf()
        
    def Prepare(self):
        """One-time load of the word2vec model and the enabled
        sub-extractors; subsequent calls are no-ops."""
        if self.Prepared:
            return
        
        
        
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(self.Word2VecInName)
        logging.info('word2vec loaded')
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.Prepare()
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.Prepare()
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.Prepare()
        
        self.Prepared = True
        return
    
    def Process(self, qid,query,doc):
        '''
        extract all features here
        returns a feature-name -> value dict merged over the enabled groups
        '''
        self.Prepare()
        
        
        hFeature = {}
        logging.debug('extracting for [%s][%s]',qid,doc.DocNo)
        if 'givenfeature' in self.lFeatureGroup:
            hFeature.update(self.GivenFeatureExtractor.Extract(qid, query, doc))
            logging.debug('given feature extracted')
        
        if 'termpairemb' in self.lFeatureGroup:
            hFeature.update(self.EmbTermPairFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('termpairemb feature extracted')
            
        if 'emblm' in self.lFeatureGroup:
            hFeature.update(self.EmbLmFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('emblm feature extracted')
            
        return hFeature
    
    
    def PipeLineRun(self,QInName,OutName):
        '''
        will make a feature hash myself... It should be OK right?
        reads tab-separated qid/query lines, extracts features for every
        retrieved doc, writes LeToR data lines to OutName and the
        feature-id -> name map to OutName_FeatureName
        '''
        hFeatureName = {}  # feature name -> integer id, grown incrementally
        self.Prepare()
        lLines = open(QInName).read().splitlines()
        lQidQuery = [line.split('\t') for line in lLines]
        out = open(OutName,'w')
        
        logging.info('start extracting for file [%s]',QInName)
        for qid,query in lQidQuery:
            lDoc = self.Searcher.RunQuery(query, qid)
            for doc in lDoc:
                hFeature = self.Process(qid, query, doc)
                LTRData = LeToRDataBaseC()
                LTRData.qid = qid
                LTRData.DocNo = doc.DocNo
                LTRData.hFeature = hFeature
                
                # relevance label from qrels
                LTRData.score = self.QRelCenter.GetScore(qid, doc.DocNo)
                hFeatureName = LTRData.HashFeatureName(hFeatureName)
                print >>out,LTRData.dumps()
                
            logging.info('qid [%s] extracted',qid)
            
        out.close()
        
        NameOut = open(OutName + '_FeatureName','w')
        for name,Id in hFeatureName.items():
            print >>NameOut,'%d\t%s' %(Id,name)
        NameOut.close()
        logging.info('finished')
        return
class DocAnaResSERPSplitterC(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.hDocAnaData = {}
        self.hDocText = {}
        self.OutDir = ''
        self.QInName = ""

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        DocAnaIn = self.conf.GetConf('docanain')
        DocTextIn = self.conf.GetConf('doctextin')
        self.ReadDocAna(DocAnaIn, DocTextIn)
        self.OutDir = self.conf.GetConf('outdir')
        self.QInName = self.conf.GetConf('in')

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        print 'docanain\noutdir\nin\ndoctextin'
        IndriSearchCenterC.ShowConf()

    def ReadDocAna(self, DocAnaIn, DocTextIn):
        lLines = open(DocAnaIn).read().splitlines()
        lDict = [[line.split()[0], line] for line in lLines]
        self.hDocAnaData = dict(lDict)

        lLines = open(DocTextIn).read().splitlines()

        lDict = [line.split('#')[0].strip().split('\t') for line in lLines]
        self.hDocText = dict(lDict)
        return True

    def DumpOneQ(self, qid, query):
        lDoc = self.Searcher.RunQuery(query, qid)
        out = open(self.OutDir + '/%s' % (query.replace(' ', '_')), 'w')

        for doc in lDoc:
            if (not doc.DocNo in self.hDocAnaData) | (not doc.DocNo
                                                      in self.hDocText):
                continue
            print >> out, "<doc>"
            line = self.hDocAnaData[doc.DocNo]

            vCol = line.split('\t')
            text = self.hDocText[doc.DocNo]
            print >> out, vCol[0] + '\t' + text

            if len(vCol) > 2:
                vAna = vCol[1:]
                for i in range(len(vAna) / 8):
                    print >> out, '\t'.join(vAna[8 * i:8 * i + 8])

            print >> out, "</doc>\n\n\n"

        out.close()
        logging.info('[%s] data dumped', query)
        return True

    def Process(self):

        lQidQuery = [
            line.split('\t')
            for line in open(self.QInName).read().splitlines()
        ]

        for qid, query in lQidQuery:
            self.DumpOneQ(qid, query)

        logging.info('finished')
class NodeCollectorCenterC(cxBaseC):
    """
    Collect graph nodes (Freebase object ids) for a query and its
    retrieved documents, from the node groups enabled in conf
    (query: 'ana', doc: 'facc'), and dump/load them in json or raw
    per-query-file format.
    """

    def Init(self):
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.QueryNodePreFetchedCollector = QueryPreFetchedNodeCollectorC()
        self.DocNodeFaccAnaCollector = DocNodeFaccAnaCollectorC()

        self.lQueryNodeGroup = []  # enabled query-node sources, e.g. ['ana']
        self.lDocNodeGroup = []    # enabled doc-node sources, e.g. ['facc']

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.lQueryNodeGroup = self.conf.GetConf('querynodegroup',
                                                 self.lQueryNodeGroup)
        self.lDocNodeGroup = self.conf.GetConf('docnodegroup',
                                               self.lDocNodeGroup)
        self.Searcher.SetConf(ConfIn)
        # only configure the collectors that are actually enabled
        if 'ana' in self.lQueryNodeGroup:
            self.QueryNodePreFetchedCollector.SetConf(ConfIn)
        if 'facc' in self.lDocNodeGroup:
            self.DocNodeFaccAnaCollector.SetConf(ConfIn)

        logging.info('node collector center conf set')
        return

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        QueryPreFetchedNodeCollectorC.ShowConf()
        DocNodeFaccAnaCollectorC.ShowConf()
        IndriSearchCenterC.ShowConf()
        print 'querynodegroup ana'
        print 'docnodegroup facc'

    def process(self, qid, query):
        '''
        retrieval lDoc
        call query node generator
        call doc node generator
        returns (lDoc, lQObj, llDocObj) — llDocObj is one obj-id list per doc
        '''

        lDoc = self.Searcher.RunQuery(query, qid)

        lQObj = self.CollectQueryNode(qid, query)

        llDocObj = self.CollectDocNode(lDoc, qid, query)

        logging.info('[%s][%s] node collected', qid, query)
        return lDoc, lQObj, llDocObj

    def CollectQueryNode(self, qid, query):
        """Return the deduplicated object ids collected for the query."""
        lQNodeScore = []

        if 'ana' in self.lQueryNodeGroup:
            lQNodeScore.extend(
                self.QueryNodePreFetchedCollector.process(qid, query))

        # collector returns (obj, score) pairs; keep unique objects only
        lQObj = list(set([item[0] for item in lQNodeScore]))
        return lQObj

    def CollectDocNode(self, lDoc, qid, query):
        """Return one deduplicated object-id list per document (may be
        empty if the 'facc' group is disabled)."""
        llDocObj = []
        if 'facc' in self.lDocNodeGroup:
            llDocNodeScore = self.DocNodeFaccAnaCollector.process(
                lDoc, qid, query)
            llDocObj = [
                list(set([item[0] for item in lDocNodeScore]))
                for lDocNodeScore in llDocNodeScore
            ]

        return llDocObj

    def PipeRun(self, QInName, OutName, OutFormat='json'):
        '''
        read qid,query
        run
        output to out name
        each line a json dumped [qid,query,lDoc,lQObj,lDocObj]
        OutFormat 'dir' instead writes one raw per-query file (see
        DumpRawFormat); any other value produces no output
        '''

        lQidQuery = [
            line.split('\t') for line in open(QInName).read().splitlines()
        ]

        if OutFormat == 'json':
            out = open(OutName, 'w')

        for qid, query in lQidQuery:
            lDoc, lQObj, llDocObj = self.process(qid, query)
            if OutFormat == 'json':
                print >> out, json.dumps([qid, query, lDoc, lQObj, llDocObj])
            if OutFormat == 'dir':

                #print doc id\t obj id (doc id could be query indicating query obj)
                self.DumpRawFormat(qid, query, lDoc, lQObj, llDocObj, OutName)

        if OutFormat == 'json':
            out.close()
        logging.info('query in [%s] node genereated, dumped to [%s]', QInName,
                     OutName)

    def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
        """Write "key\\tObjId" lines for one query to OutName/<query file>;
        query objects use key 'q_<qid>', doc objects use the DocNo.
        Counterpart of LoadRawFormatNodeRes."""

        if not os.path.exists(OutName):
            os.makedirs(OutName)

        out = open(
            OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(query),
            'w')

        logging.info('q[%s] has [%d] q node', qid, len(lQObj))
        for QObj in lQObj:
            print >> out, 'q_' + qid + '\t' + QObj

        if llDocObj == []:
            logging.info('no doc node')
        else:
            for doc, lDocObj in zip(lDoc, llDocObj):
                logging.info('doc [%s] has [%d] node', doc.DocNo, len(lDocObj))
                for DocObj in lDocObj:
                    print >> out, doc.DocNo + '\t' + DocObj

        out.close()
        logging.info('q [%s] raw node res dumpped', qid)
        return

    @staticmethod
    def LoadRawFormatNodeRes(query, InDir):
        '''
        read results from the disk as dumped
        returns (lDocNo, lQObj, llDocObj); doc object lists are grouped by
        consecutive runs of the same DocNo, preserving file order
        '''
        lDocNo = []
        llDocObj = []

        InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        lLines = open(InName).read().splitlines()
        lvCol = [line.split('\t') for line in lLines]

        # 'q_'-prefixed keys are query objects; the rest are doc objects
        lQCol = [vCol for vCol in lvCol if vCol[0].startswith('q_')]
        lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]

        lQObj = [vCol[1] for vCol in lQCol]
        logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

        LastDocNo = ""
        for DocNo, ObjId in lDocCol:
            if not DocNo == LastDocNo:
                # new doc starts: open a fresh object list
                llDocObj.append([])
                lDocNo.append(DocNo)
                LastDocNo = DocNo
            llDocObj[-1].append(ObjId)

        return lDocNo, lQObj, llDocObj
class ContinuousLmRankingEvaluatorC(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)
        self.Evaluator = AdhocEvaC()
        self.Searcher = IndriSearchCenterC()
        self.Word2VecInName = ""
        self.Word2VecModel = None
        self.lLmName = []
        self.LmClass = None
        self.lOutName = []
        self.QueryInName = ""

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)
        self.Evaluator.SetConf(ConfIn)

        self.lLmName = self.conf.GetConf('lmname', self.lLmName)

        self.QueryInName = self.conf.GetConf('in')
        self.lOutName = self.conf.GetConf('out', self.lOutName)

        self.Word2VecInName = self.conf.GetConf('word2vecin',
                                                self.Word2VecInName)
        self.LoadWord2Vec()

    def LoadWord2Vec(self):
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(
            self.Word2VecInName)
        logging.info('word2vec loaded')

    @classmethod
    def ShowConf(cls):
        cxBaseC.ShowConf()
        print cls.__name__
        print 'word2vecin\nkernel\nlmname\nbandwidth\nin\nout'
        IndriSearchCenterC.ShowConf()
        AdhocEvaC.ShowConf()

    def ReRankAndEvaPerQ(self, qid, query, lDoc, lLm):

        lReRankDocNo, lScore = self.FormNewRank(query, lDoc, lLm)
        EvaRes = self.Evaluator.EvaluatePerQ(qid, query, lReRankDocNo)
        logging.info('[%s][%s] result [%s]', qid, query, EvaRes.dumps())
        return EvaRes, lReRankDocNo, lScore

    def FormNewRank(self, query, lDoc, lLm):

        lQTerm = query.split()
        if [] == lQTerm:
            return self.MinLogPdf
        lQX = [
            self.Word2VecModel[term] for term in lQTerm
            if term in self.Word2VecModel
        ]

        lScore = [lm.InferenceQVec(lQX) for lm in lLm]
        lDocScore = zip(lDoc, lScore)
        lDocScore.sort(key=lambda item: item[1], reverse=True)
        lDocNo = [item[0].DocNo for item in lDocScore]
        lScore = [item[1] for item in lDocScore]
        return lDocNo, lScore

    def FormLm(self, doc):
        lTerm = doc.GetContent().split()
        Lm = self.LmClass()
        Lm.SetPara(self.conf)
        Lm.Construct(lTerm, self.Word2VecModel)
        return Lm

    def FormPerQData(self, qid, query):
        lDoc = self.Searcher.RunQuery(query, qid)
        lLm = [self.FormLm(doc) for doc in lDoc]

        return lDoc, lLm

    def SetLmClass(self, cLmName):
        '''
        select proper class name for cLmName
        '''

        if cLmName == 'gaussian':
            logging.info('use gaussian clm')
            self.LmClass = GaussianLmC
            return True

        if cLmName == 'kde':
            logging.info('use kde lm')
            self.LmClass = KernelDensityLmC
            return True

        if cLmName == 'sum':
            logging.info('use raw sum')
            self.LmClass = SummationLmC
            return True

        if cLmName == 'rand':
            logging.info('use rand')
            self.LmClass = RandLmC
            return True

        if cLmName == 'radius':
            logging.info('use radius')
            self.LmClass = RadiusMatchLmC
            return True

        raise NotImplementedError(
            'please choose continuous language model from gaussian|kde')

    def Process(self):

        for OutName, cLmName in zip(self.lOutName, self.lLmName):
            self.RunForOneLm(self.QueryInName, OutName, cLmName)

    def RunForOneLm(self, QueryInName, OutName, cLmName):
        '''
        evaluate cLmName on QueryInName's queries
        evaluation result output to OutName
        '''

        lQidQuery = [
            line.split('\t') for line in open(QueryInName).read().splitlines()
        ]

        self.SetLmClass(cLmName)

        lEvaRes = []

        RankOut = open(OutName + '_rank', 'w')

        logging.info('start evaluating...')
        for qid, query in lQidQuery:
            lDoc, lLm = self.FormPerQData(qid, query)
            EvaRes, lDocNo, lScore = self.ReRankAndEvaPerQ(
                qid, query, lDoc, lLm)
            lEvaRes.append(EvaRes)

            for i in range(len(lDocNo)):
                print >> RankOut, qid + ' Q0 ' + lDocNo[i] + ' %d %f %s' % (
                    i + 1, lScore[i], cLmName)

        RankOut.close()

        lEvaRes.append(AdhocMeasureC.AdhocMeasureMean(lEvaRes))
        lQid = [item[0] for item in lQidQuery] + ['mean']

        out = open(OutName, 'w')

        for qid, EvaRes in zip(lQid, lEvaRes):
            print >> out, qid + '\t' + EvaRes.dumps()

        out.close()
        logging.info('evaluation res %s', lEvaRes[-1].dumps())

        return True
# Esempio n. 7
# 0
class GraphFullFeatureExtractCenterC(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)

        self.NodeDir = ""

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()

        self.QDocFeatureExtractor = LeToRFeatureExtractCenterC()
        self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC()
        self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC()
        self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC()

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)

        self.NodeDir = self.conf.GetConf('nodedir') + '/'

        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)

        self.QDocFeatureExtractor.SetConf(ConfIn)
        self.QObjFeatureExtractor.SetConf(ConfIn)
        self.DocObjFeatureExtractor.SetConf(ConfIn)
        self.ObjObjFeatureExtractor.SetConf(ConfIn)

        logging.info('graph full feature extractor conf setted')

    @classmethod
    def ShowConf(cls):
        cxBaseC.ShowConf()
        print cls.__name__
        print 'nodedir'

        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

        LeToRFeatureExtractCenterC.ShowConf()
        FbQObjFeatureExtractCenterC.ShowConf()
        FbObjDocFeatureExtractCenterC.ShowConf()
        ObjObjFeatureExtractCenterC.ShowConf()

    def FormulateNodes(self, qid, query):
        '''
        get ldoc and read lObjId
        fill lObjId
        '''
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)

        lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes(
            query, self.NodeDir)

        #match lDoc dim lDocNo dim
        lDoc = IndriSearchCenterC.RearrangeDocOrder(lDoc, lDocNo)

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId]
        llDocObj = [[self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId]
                    for lDocObjId in llDocObjId]
        while len(llDocObj) < len(lDoc):
            #add empty list for docs have no objects (thus will restrict to EsdRank)
            #if lQObj is also empty, then it is LeToR
            llDocObj.append([])

        logging.info('q[%s] all node fetched, q node %s', qid,
                     json.dumps([Obj.GetId() for Obj in lQObj]))
        return lDoc, lQObj, llDocObj

    def Process(self, qid, query, OutDir):
        '''
        
        '''
        lDoc, lQObj, llDocObj = self.FormulateNodes(qid, query)

        for doc, lDocObj in zip(lDoc, llDocObj):
            hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature = self.ExtractFeatureForOneQDoc(
                qid, query, doc, lQObj + lDocObj)
            self.DumpPerQRes(qid, query, doc, lQObj + lDocObj, hQDocFeature,
                             lhQObjFeature, lhDocObjFeature, llhObjObjFeature,
                             OutDir)

        logging.info('q [%s] processed')
        return True

    def PipeRun(self, QInName, OutDir):
        lQidQuery = [
            line.split('\t') for line in open(QInName).read().splitlines()
        ]

        for qid, query in lQidQuery:
            self.Process(qid, query, OutDir)

        logging.info('queries in [%s] processed features at [%s]', QInName,
                     OutDir)
        return True

    def ExtractFeatureForOneQDoc(self, qid, query, doc, lObj):
        #if wanna speed up, cache features
        #for clearity, now just extract multiple times

        hQDocFeature = self.QDocFeatureExtractor.Process(qid, query, doc)
        logging.debug('q[%s][%s] ltr feature extracted', query, doc.DocNo)

        lhQObjFeature = self.QObjFeatureExtractor.ProcessOneQuery([qid, query],
                                                                  lObj)
        logging.debug('q[%s][%s]  obj feature extracted', query, doc.DocNo)

        lhDocObjFeature = self.DocObjFeatureExtractor.ProcessOneQueryDocPair(
            [qid, query], doc, lObj)
        logging.debug('q[%s][%s]  doc obj feature extracted', query, doc.DocNo)

        llhObjObjFeature = self.ObjObjFeatureExtractor.Process(
            qid, query, lObj)  #symetric matrix
        logging.debug('q[%s] [%s] obj obj feature extracted', query, doc.DocNo)

        logging.debug('q [%s][%s]  all doc graph feature extracted', query,
                      doc.DocNo)

        return hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature

    def DumpPerQRes(self, qid, query, doc, lObj, hQDocFeature, lhQObjFeature,
                    lhDocObjFeature, llhObjObjFeature, OutDir):
        '''
        raw:
            a dir for this q
                a file for each doc
                    node a, node b, hFeature.json
        '''

        if not os.path.exists(OutDir + '/' + qid):
            os.makedirs(OutDir + '/' + qid)

        OutName = OutDir + '/' + qid + '/' + doc.DocNo
        out = open(OutName, 'w')

        #q doc
        print >> out, 'q_%s' % (qid) + '\t' + doc.DocNo + '\t' + json.dumps(
            hQDocFeature)

        #obj doc
        for Obj, hDocObjFeature in zip(lObj, lhDocObjFeature):
            print >> out, Obj.GetId() + '\t' + doc.DocNo + '\t' + json.dumps(
                hDocObjFeature)

        #q obj
        for Obj, hQObjFeature in zip(lObj, lhQObjFeature):
            print >> out, 'q_%s' % (
                qid) + '\t' + Obj.GetId() + '\t' + json.dumps(hQObjFeature)
            print >> out, Obj.GetId() + '\t' + 'q_%s' % (
                qid) + '\t' + json.dumps(hQObjFeature)
            #make it symmetric

        #obj obj
        for i in range(len(lObj)):
            for j in range(len(lObj)):
                if i == j:
                    continue
                print >> out, lObj[i].GetId() + '\t' + lObj[j].GetId(
                ) + '\t' + json.dumps(llhObjObjFeature[i][j])

        logging.info('q[%s] doc [%s] graph dumped to file [%s]', qid,
                     doc.DocNo, OutName)
        return True
# Esempio n. 8
# 0
class EdgeFeatureExtractCenterC(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)

        self.lQObjFeatureGroup = []
        self.lObjObjFeatureGroup = []
        self.lDocObjFeatureGroup = []

        self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC()
        self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC()
        self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC()
        self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC()
        self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC()

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NodeDir = ""

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)

        self.NodeDir = self.conf.GetConf('nodedir') + '/'

        self.lQObjFeatureGroup = self.conf.GetConf('qobjfeaturegroup',
                                                   self.lQObjFeatureGroup)
        self.lDocObjFeatureGroup = self.conf.GetConf('docobjfeaturegroup',
                                                     self.lDocObjFeatureGroup)
        self.lObjObjFeatureGroup = self.conf.GetConf('objobjfeaturegroup',
                                                     self.lObjObjFeatureGroup)

        if 'ana' in self.lQObjFeatureGroup:
            self.QObjAnaExtractor.SetConf(ConfIn)
        if 'facc' in self.lDocObjFeatureGroup:
            self.DocObjFaccExtractor.SetConf(ConfIn)

        if 'kg' in self.lObjObjFeatureGroup:
            self.ObjObjKGExtractor.SetConf(ConfIn)
        if 'precalc' in self.lObjObjFeatureGroup:
            self.ObjObjPreCalcExtractor.SetConf(ConfIn)
        if 'textsim' in self.lObjObjFeatureGroup:
            self.ObjObjTextSimExtractor.SetConf(ConfIn)

        logging.info('edge feature center confs setted')

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

        print 'nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup'

        QueryObjEdgeFeatureAnaExtractorC.ShowConf()
        DocObjEdgeFeatureFaccExtractorC.ShowConf()
        ObjObjEdgeFeatureKGExtractorC.ShowConf()
        ObjObjEdgeFeaturePreCalcSimExtractorC.ShowConf()
        ObjObjEdgeFeatureTextSimExtractorC.ShowConf()

    def FormulateNodes(self, qid, query):
        '''
        get ldoc and read lObjId
        fill lObjId
        '''
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)

        lObjId = open(self.NodeDir +
                      IndriSearchCenterC.GenerateQueryTargetName(query)).read(
                      ).splitlines()

        lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
        logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
        return lDoc, lObj

    def ExtractPerQObj(self, qid, query, obj):
        hFeature = {}
        logging.debug('start extracting q[%s]-obj[%s] feature', query,
                      obj.GetId())
        if 'ana' in self.lQObjFeatureGroup:
            hFeature.update(self.QObjAnaExtractor.process(qid, query, obj))
        logging.debug('q[%s]-obj[%s] feature extracted', query, obj.GetId())
        return hFeature

    def ExtractQObjFeature(self, qid, query, lObj):
        lhFeature = []
        logging.info('start extracting [%s][%s] q-obj feature [%d] obj', qid,
                     query, len(lObj))
        for obj in lObj:
            hFeature = self.ExtractPerQObj(qid, query, obj)
            lhFeature.append(hFeature)
        logging.info('q obj feature extracted')
        return lhFeature

    def ExtractPerDocObj(self, doc, obj):
        hFeature = {}
        logging.debug('start extracting doc[%s]-obj[%s] feature', doc.DocNo,
                      obj.GetId())
        if 'facc' in self.lDocObjFeatureGroup:
            hFeature.update(self.DocObjFaccExtractor.process(doc, obj))

        logging.debug('doc[%s]-obj[%s] feature extracted', doc.DocNo,
                      obj.GetId())
        return hFeature

    def ExtractDocObjFeature(self, lDoc, lObj):
        llhFeature = []  #doc \times obj
        logging.info('start extract [%d] doc - [%d] obj feature mtx',
                     len(lDoc), len(lObj))
        for doc in lDoc:
            lhFeature = []
            for obj in lObj:
                hFeature = self.ExtractPerDocObj(doc, obj)
                lhFeature.append(hFeature)
            llhFeature.append(lhFeature)
        logging.info('doc obj feature extracted')
        return llhFeature

    def ExtractPerObjObj(self, ObjA, ObjB, query):
        hFeature = {}
        logging.debug('start extracting for obj pair [%s-%s]', ObjA.GetId(),
                      ObjB.GetId())
        if 'kg' in self.lObjObjFeatureGroup:
            hFeature.update(self.ObjObjKGExtractor.process(ObjA, ObjB))
        if 'precalc' in self.lObjObjFeatureGroup:
            hFeature.update(
                self.ObjObjPreCalcExtractor.process(ObjA, ObjB, query))
        if 'textsim' in self.lObjObjFeatureGroup:
            hFeature.update(self.ObjObjTextSimExtractor.process(ObjA, ObjB))
        logging.debug('obj pair [%s-%s] feature extracted', ObjA.GetId(),
                      ObjB.GetId())
        return hFeature

    def ExtractObjObjFeature(self, lObj, query):
        llhFeature = []  #obj -> obj, diagonal is empty
        logging.info('start extract [%d] obj pair feature mtx', len(lObj))
        for ObjA in lObj:
            lhFeature = []
            for ObjB in lObj:
                if ObjA.GetId() == ObjB.GetId():
                    continue
                hFeature = self.ExtractPerObjObj(ObjA, ObjB, query)
                lhFeature.append(hFeature)
            llhFeature.append(lhFeature)

        logging.info('obj obj feature extracted')
        return llhFeature

    def Process(self, qid, query):

        lDoc, lObj = self.FormulateNodes(qid, query)
        logging.info('nodes fetched')

        lQObjFeature = self.ExtractQObjFeature(qid, query, lObj)

        llDocObjFeature = self.ExtractDocObjFeature(lDoc, lObj)

        llObjObjFeature = self.ExtractObjObjFeature(lObj, query)

        return lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature

    def DumpRes(self, OutName, query, lDoc, lObj, lQObjFeature,
                llDocObjFeature, llObjObjFeature):
        out = open(OutName, 'w')

        for obj, hFeature in zip(lObj, lQObjFeature):
            print >> out, query + '\t' + obj.GetId() + '\t' + json.dumps(
                hFeature)

        for doc, lhFeature in zip(lDoc, llDocObjFeature):
            for obj, hFeature in zip(lObj, lhFeature):
                print >> out, doc.DocNo + '\t' + obj.GetId(
                ) + '\t' + json.dumps(hFeature)

        for ObjA, lhFeature in zip(lObj, llObjObjFeature):
            for ObjB, hFeature in zip(lObj, lhFeature):
                print >> out, ObjA.GetId() + '\t' + ObjB.GetId(
                ) + '\t' + json.dumps(hFeature)

        out.close()
        logging.info('query [%s] feature dumped', query)

    def PipeRun(self, QInName, OutDir):
        '''
        for now:
            output raw type
            each file is a query's edge features
                each line is query|doc|obj \t obj \t json.dumps(hFeature)
        '''

        lQidQuery = [
            line.split('\t') for line in open(QInName).read().splitlines()
        ]

        for qid, query in lQidQuery:
            logging.info('start extracting for [%s][%s]', qid, query)
            lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = self.Process(
                qid, query)
            OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
                query)
            logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query,
                         OutName)
            self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                         llDocObjFeature, llObjObjFeature)

        logging.info('all finished')
        return
Esempio n. 9
0
class EntityCorrelationFromTextSimC(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NeighborNum = 50

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.NeighborNum = self.conf.GetConf('neighbornum', self.NeighborNum)

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        print 'neighbornum'

    def ProcessOneObj(self, ObjId, name):
        '''
        return lObjNeighbor=[objid,KL score] top self.NeighborNum
        '''

        #search in index, get top 1000
        query = TextBaseC.RawClean(name)
        if "" == query:
            return []
        lObjDoc = self.Searcher.RunQuery(query)

        lObjNeighbor = []

        ThisDesp = self.ObjCenter.FetchObjDesp(ObjId)
        ThisLm = LmBaseC(ThisDesp)
        ThisVec = VectorC(ThisLm.hTermTF)
        #         print '[%s:%s] desp : [%s]' %(ObjId,name,ThisDesp)
        if len(ThisLm.hTermTF) == 0:
            return []
        for ObjDoc in lObjDoc:
            Id = ObjDoc.DocNo
            if Id == ObjId:
                continue
            if not Id.startswith('/m/'):
                print "[%s %s] neighbor id [%s] format error" % (ObjId, name,
                                                                 Id)
                continue
#             print "get neighbor [%s] [%s]" %(Id,ObjDoc.GetContent())
#             NeighborDesp = ObjDoc.GetContent()
            NeighborLm = LmBaseC(ObjDoc)
            NeighborVec = VectorC(NeighborLm.hTermTF)
            if len(NeighborVec.hDim) == 0:
                continue
            score = VectorC.KL(ThisVec, NeighborVec)
            lObjNeighbor.append([Id, -score])


#             print "[%s %s] KL [%f]" %(ObjId,Id,score)
#             print "%s\n%s" %(json.dumps(ThisVec.hDim),json.dumps(NeighborVec.hDim))

        lObjNeighbor.sort(key=lambda item: item[1], reverse=True)
        print "[%s:%s] neighbor id score get" % (ObjId, name)
        return lObjNeighbor

    def Process(self, ObjInName, OutName):
        out = open(OutName, 'w')

        for line in open(ObjInName):
            vCol = line.strip().split('\t')
            if len(vCol) < 2:
                continue
            lObjNeighbor = self.ProcessOneObj(vCol[0], vCol[1])
            for NeighborId, score in lObjNeighbor[:self.NeighborNum]:
                print >> out, '%s\t%s\t%f\t%s\t%s' % (
                    vCol[0], NeighborId, score, vCol[1],
                    self.ObjCenter.FetchObjName(NeighborId))
            print "[%s:%s] done" % (vCol[0], vCol[1])

        out.close()
        print "finished"
Esempio n. 10
0
class SearchResultWordVecAnalysiserC(cxBaseC):
    def Init(self):
        cxBaseC.Init(self)
        self.QIn = ""
        self.OutDir = ""
        self.Word2VecInName = ""
        self.Word2VecModel = None
        self.Searcher = IndriSearchCenterC()
        self.BinNumber = 100

    def SetConf(self, ConfIn):
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)

        self.Word2VecInName = self.conf.GetConf('word2vecin')
        self.LoadWord2Vec()
        self.QIn = self.conf.GetConf('in')
        self.OutDir = self.conf.GetConf('outdir')
        self.BinNumber = self.conf.GetConf('binnumber', self.BinNumber)

    def LoadWord2Vec(self):
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(
            self.Word2VecInName)
        logging.info('word2vec loaded')

    @staticmethod
    def ShowConf():
        cxBaseC.ShowConf()
        print 'word2vecin\nin\noutdir\nbinnumber'
        IndriSearchCenterC.ShowConf()

    def LoadDocWordVec(self):

        lDoc = []
        lQidQuery = [
            line.split('\t') for line in open(self.QIn).read().splitlines()
        ]

        for qid, query in lQidQuery:
            lDoc.extend(self.Searcher.RunQuery(query, qid))

        lTerm = []
        for doc in lDoc:
            lTerm.extend(doc.GetContent().split())

        lX = np.array([
            self.Word2VecModel[term] for term in lTerm
            if term in self.Word2VecModel
        ])

        logging.info('target doc word vec get')
        return lX

    def BinData(self, lX, OutName):
        '''
        bin all lX's dim
        [[mu,sigma, bins]]
        '''
        logging.info('binning data')
        lBinData = []
        dim = lX.shape[1]
        for i in range(dim):
            x = lX[:, i]
            logging.info('binning dim [%d]', i)
            mu = np.mean(x)
            sigma = np.var(x)
            hist, bins = np.histogram(x, bins=self.BinNumber)
            lBinData.append([mu, sigma, hist, bins])

        out = open(OutName, 'w')

        pickle.dump(lBinData, out)
        out.close()
        logging.info('data binned to [%s]', OutName)
        return

    def CalcPersonCorrelation(self, lX, OutName):

        n, d = lX.shape
        mPValue = np.zeros([d, d])
        mPearson = np.zeros([d, d])

        for i in range(d):
            for j in range(i + 1, d):
                per, p = pearsonr(lX[:, i], lX[:, j])
                mPValue[i, j] = p
                mPValue[j, i] = p
                mPearson[i, j] = per
                mPearson[j, i] = per
                if p < 0.05:
                    logging.info('[%d-%d] correlated p=%f', i, j, p)

        out = open(OutName + '_pearson', 'w')
        pickle.dump(mPearson, out)
        #         print >>out, np.array2string(mPearson)
        out.close()

        out = open(OutName + '_pvalue', 'w')
        pickle.dump(mPValue, out)
        #         print >>out, np.array2string(mPValue)
        out.close()

        logging.info('pearson corr calculated and dumped')

        return True

    def CalcCovarianceMtx(self, lX, OutName):
        logging.info('start calculating covariance matrix')
        #         CovMtx = np.cov(lX.T)   #OOM
        d = lX.shape[1]
        CovMtx = np.zeros([d, d])
        for i in range(d):
            for j in range(i, d):
                MiniCovMtx = np.cov(lX[:, i], lX[:, j])
                CovMtx[i, j] = MiniCovMtx[0, 1]
                CovMtx[i, i] = MiniCovMtx[0, 0]
                CovMtx[j, i] = MiniCovMtx[1, 0]
        out = open(OutName, 'w')
        pickle.dump(CovMtx, out)
        out.close()
        logging.info('covariance dumped to [%s]', OutName)

    def Process(self):

        lX = self.LoadDocWordVec()

        #         self.BinData(lX, self.OutDir + '/MarginalDist')

        self.CalcCovarianceMtx(lX, self.OutDir + '/CovarianceMtx')
        self.CalcPersonCorrelation(lX, self.OutDir + '/PersonCorrelationMtx')

        logging.info('[%s] search result word vec analysis finished', self.QIn)

        return True