Python IndriSearchCenterC 예제들, IndriSearch.IndriSearchCenter.IndriSearchCenterC Python 예제들

예제 #1

0

파일 보기

파일: SplitDocAnaResPerQuery.py 프로젝트: wayne9qiu/GraphRepresentation

 def Init(self):
     """Set the default attribute values, after running cxBaseC.Init."""
     cxBaseC.Init(self)
     # search component, created with its default construction
     self.Searcher = IndriSearchCenterC()
     # start with empty dictionaries; they are filled elsewhere
     self.hDocAnaData = {}
     self.hDocText = {}
     # output directory and query input file name, both empty until set
     self.OutDir = ''
     self.QInName = ""

예제 #2

0

파일 보기

파일: NodeCollectCenter.py 프로젝트: wayne9qiu/GraphRepresentation

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)
        # sub-components, each created with its default construction
        self.Searcher = IndriSearchCenterC()
        self.QueryNodePreFetchedCollector = QueryPreFetchedNodeCollectorC()
        self.DocNodeFaccAnaCollector = DocNodeFaccAnaCollectorC()

        # group-name lists start empty
        self.lQueryNodeGroup = []
        self.lDocNodeGroup = []

예제 #3

0

파일 보기

 def Init(self):
     """Set the default attribute values, after running cxBaseC.Init."""
     cxBaseC.Init(self)
     # input/output locations, empty until set
     self.QIn = ""
     self.OutDir = ""
     # word2vec input name and the model slot (None until a model is assigned)
     self.Word2VecInName = ""
     self.Word2VecModel = None
     self.Searcher = IndriSearchCenterC()
     # default bin count
     self.BinNumber = 100

예제 #4

0

파일 보기

파일: ContinuousLmRankingEva.py 프로젝트: wayne9qiu/EmbeddingForIR

 def Init(self):
     """Set the default attribute values, after running cxBaseC.Init."""
     cxBaseC.Init(self)
     # sub-components, each created with its default construction
     self.Evaluator = AdhocEvaC()
     self.Searcher = IndriSearchCenterC()
     # word2vec input name and the model slot (None until a model is assigned)
     self.Word2VecInName = ""
     self.Word2VecModel = None
     # language-model names, class slot and output names start empty/unset
     self.lLmName = []
     self.LmClass = None
     self.lOutName = []
     self.QueryInName = ""

예제 #5

0

파일 보기

파일: ContinuousLmRankingBase.py 프로젝트: wayne9qiu/EmbeddingForIR

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)
        # sub-components, each created with its default construction
        self.Evaluator = AdhocEvaC()
        self.Searcher = IndriSearchCenterC()
        # word2vec input name and the model slot (None until a model is assigned)
        self.Word2VecInName = ""
        self.Word2VecModel = None
        # default language model: name "kde", class KernelDensityLmC
        self.cLmName = "kde"
        self.LmClass = KernelDensityLmC

        # configuration input for the language model
        self.ParaConf = cxConfC()

예제 #6

0

파일 보기

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)

        # node directory, empty until set
        self.NodeDir = ""

        # search and object-cache components
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()

        # one feature-extraction component per attribute below
        self.QDocFeatureExtractor = LeToRFeatureExtractCenterC()
        self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC()
        self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC()
        self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC()

예제 #7

0

파일 보기

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)

        # sub-components, each created with its default construction
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.Evaluator = AdhocEvaC()

        # NOTE(review): attribute name is spelled "Inferener"; callers depend
        # on this spelling, so it must not be "corrected" in isolation.
        self.Inferener = LESInferencerC()

        # defaults: empty data directory, weight 0.5, query-object-only mode on
        self.QDocNodeDataDir = ""
        self.OrigQWeight = 0.5
        self.UseQObjOnly = True

예제 #8

0

파일 보기

파일: LeToRFeatureExtractCenter.py 프로젝트: wayne9qiu/EmbeddingForIR

 def Init(self):
     """Set the default attribute values, after running cxBaseC.Init."""
     cxBaseC.Init(self)
     # flag starts False
     self.Prepared = False

     # word2vec input name and the model slot (None until a model is assigned)
     self.Word2VecInName = ""
     self.Word2VecModel = None

     # feature-group list starts empty
     self.lFeatureGroup = []
     # sub-components, each created with its default construction
     self.Searcher = IndriSearchCenterC()
     self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
     self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
     self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
     self.QRelCenter = AdhocQRelC()
     # qrel input name, empty until set
     self.QRelIn = ""

예제 #9

0

파일 보기

파일: NodeCollectCenter.py 프로젝트: wayne9qiu/GraphRepresentation

 def ShowConf():
     """
     Call ShowConf on cxBaseC and on the three component classes, then
     print two lines naming 'querynodegroup' and 'docnodegroup'
     (presumably the supported configuration keys with example values).
     """
     cxBaseC.ShowConf()
     QueryPreFetchedNodeCollectorC.ShowConf()
     DocNodeFaccAnaCollectorC.ShowConf()
     IndriSearchCenterC.ShowConf()
     print 'querynodegroup ana'
     print 'docnodegroup facc'

예제 #10

0

파일 보기

파일: NodeCollectCenter.py 프로젝트: wayne9qiu/GraphRepresentation

    def LoadRawFormatNodeRes(query, InDir):
        '''
        Read one query's node results from disk, as previously dumped.

        The file read is
        InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query).
        Each non-blank line holds tab-separated columns: a key and an object
        id. Keys starting with 'q_' mark query objects; every other key is
        treated as a document number.

        Returns a tuple (lDocNo, lQObj, llDocObj):
            lDocNo   -- document numbers, one entry per run of consecutive
                        lines sharing the same document number
            lQObj    -- object ids taken from the 'q_' lines
            llDocObj -- for each entry of lDocNo, the list of its object ids

        Raises ValueError if a document line does not have exactly two
        columns.
        '''
        lDocNo = []
        llDocObj = []

        InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # with-block so the handle is released even if reading fails
        with open(InName) as In:
            lLines = In.read().splitlines()
        # blank lines carry no data and would break the two-column unpacking
        lvCol = [line.split('\t') for line in lLines if line.strip()]

        lQCol = [vCol for vCol in lvCol if vCol[0].startswith('q_')]
        lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]

        lQObj = [vCol[1] for vCol in lQCol]
        logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

        # group consecutive lines of the same document into one sub-list
        LastDocNo = ""
        for DocNo, ObjId in lDocCol:
            if DocNo != LastDocNo:
                llDocObj.append([])
                lDocNo.append(DocNo)
                LastDocNo = DocNo
            llDocObj[-1].append(ObjId)

        return lDocNo, lQObj, llDocObj

예제 #11

0

파일 보기

파일: LeToRFeatureExtractCenter.py 프로젝트: wayne9qiu/EmbeddingForIR

 def ShowConf():
     """
     Call cxBaseC.ShowConf, print this component's own entries
     (presumably the supported configuration keys), then call ShowConf on
     the three feature extractor classes and on IndriSearchCenterC.
     """
     cxBaseC.ShowConf()
     print 'word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm'
     LeToRGivenFeatureExtractorC.ShowConf()
     EmbeddingTermPairFeatureExtractorC.ShowConf()
     EmbeddingLmFeatureExtractorC.ShowConf()
     IndriSearchCenterC.ShowConf()

예제 #12

0

파일 보기

    def ShowConf(cls):
        """
        Call ShowConf on cxBaseC and the three component classes, then print
        this component's own entries (presumably configuration keys with
        their example values). `cls` is not used in the body.
        """
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        AdhocEvaC.ShowConf()

        print 'qdocnodedatadir\norigqweight 0.5\nqobjonly 1'

예제 #13

0

파일 보기

    def PipeRun(self, QInName, OutDir):
        '''
        Run feature extraction for every query listed in QInName.

        QInName holds one tab-separated "qid, query" pair per line; blank
        lines are skipped. For each pair self.Process(qid, query) is called
        and its five results are handed to self.DumpRes, together with an
        output name built as
        OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query).

        for now:
            output raw type
            each file is a query's edge features
                each line is query|doc|obj, obj, json.dumps(hFeature)
                (tab separated)
        '''

        # with-block so the query file handle is not leaked
        with open(QInName) as QIn:
            lQidQuery = [
                line.split('\t') for line in QIn.read().splitlines()
                if line.strip()
            ]

        for qid, query in lQidQuery:
            logging.info('start extracting for [%s][%s]', qid, query)
            lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = self.Process(
                qid, query)
            OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
                query)
            logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query,
                         OutName)
            self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                         llDocObjFeature, llObjObjFeature)

        logging.info('all finished')
        return

예제 #14

0

파일 보기

    def FormulateNodes(self, qid, query):
        '''
        Build the document, query-object and per-document object lists for
        one query.

        Documents come from self.Searcher.RunQuery and are rearranged by
        IndriSearchCenterC.RearrangeDocOrder against the document numbers
        read from self.NodeDir; object ids are resolved through
        self.ObjCenter.FetchObj.

        Returns (lDoc, lQObj, llDocObj).
        '''
        logging.info('formulating node for q [%s][%s]', qid, query)
        lRetrieved = self.Searcher.RunQuery(query, qid)

        lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes(
            query, self.NodeDir)

        # make the document list line up with the stored document numbers
        lDoc = IndriSearchCenterC.RearrangeDocOrder(lRetrieved, lDocNo)

        lQObj = []
        for QObjId in lQObjId:
            lQObj.append(self.ObjCenter.FetchObj(QObjId))

        llDocObj = []
        for lDocObjId in llDocObjId:
            llDocObj.append(
                [self.ObjCenter.FetchObj(DocObjId) for DocObjId in lDocObjId])

        # pad with empty lists for docs that have no objects (thus will
        # restrict to EsdRank); if lQObj is also empty, then it is LeToR
        nMissing = len(lDoc) - len(llDocObj)
        if nMissing > 0:
            llDocObj.extend([] for _ in range(nMissing))

        lQObjIdFetched = [Obj.GetId() for Obj in lQObj]
        logging.info('q[%s] all node fetched, q node %s', qid,
                     json.dumps(lQObjIdFetched))
        return lDoc, lQObj, llDocObj

예제 #15

0

파일 보기

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)

        # feature-group lists start empty
        self.lQObjFeatureGroup = []
        self.lObjObjFeatureGroup = []
        self.lDocObjFeatureGroup = []

        # one edge-feature extraction component per attribute below
        self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC()
        self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC()
        self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC()
        self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC()
        self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC()

        # search and object-cache components, and the node directory (empty until set)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.NodeDir = ""

예제 #16

0

파일 보기

 def LoadQueryQid(self, QIn):
     """
     Fill self.hQueryQid from the query file QIn.

     QIn holds tab-separated lines whose first column is the qid and whose
     second column is the query; blank lines are skipped. The resulting
     dictionary maps IndriSearchCenterC.GenerateQueryTargetName(query) to
     the qid.
     """
     # with-block so the file handle is not leaked
     with open(QIn) as In:
         lQidQuery = [
             line.split('\t') for line in In.read().splitlines()
             if line.strip()
         ]
     self.hQueryQid = dict(
         (IndriSearchCenterC.GenerateQueryTargetName(item[1]), item[0])
         for item in lQidQuery)

예제 #17

0

파일 보기

 def LoadOneQueryObjSim(self,query):
     """
     Load the pickled data of one query from every directory in
     self.lPreCalcDir into self.lhQueryObjPairSim[i][query].

     Returns False as soon as the query's file is missing from one of the
     directories (data already loaded from earlier directories stays in
     place), True when every directory was loaded.
     """
     for i, PreCalcDir in enumerate(self.lPreCalcDir):
         InName = PreCalcDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
         if not os.path.exists(InName):
             return False
         # NOTE(review): pickle.load executes arbitrary code from the file;
         # only safe while these files are produced by trusted code.
         # with-block so the handle is closed after loading
         with open(InName) as In:
             hObjPairSim = pickle.load(In)
         self.lhQueryObjPairSim[i][query] = hObjPairSim
     logging.info('query [%s] obj sim loaded',query)
     return True

예제 #18

0

파일 보기

    def ShowConf():
        """
        Call ShowConf on cxBaseC, IndriSearchCenterC and FbObjCacheCenterC,
        print this component's own entries (presumably the supported
        configuration keys), then call ShowConf on the five edge-feature
        extractor classes.
        """
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

        print 'nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup'

        QueryObjEdgeFeatureAnaExtractorC.ShowConf()
        DocObjEdgeFeatureFaccExtractorC.ShowConf()
        ObjObjEdgeFeatureKGExtractorC.ShowConf()
        ObjObjEdgeFeaturePreCalcSimExtractorC.ShowConf()
        ObjObjEdgeFeatureTextSimExtractorC.ShowConf()

예제 #19

0

파일 보기

    def ShowConf(cls):
        """
        Call cxBaseC.ShowConf, print the class name and 'nodedir'
        (presumably a configuration key), then call ShowConf on the search,
        object-cache and feature-extraction component classes.
        """
        cxBaseC.ShowConf()
        print cls.__name__
        print 'nodedir'

        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

        LeToRFeatureExtractCenterC.ShowConf()
        FbQObjFeatureExtractCenterC.ShowConf()
        FbObjDocFeatureExtractCenterC.ShowConf()
        ObjObjFeatureExtractCenterC.ShowConf()

예제 #20

0

파일 보기

def OutputDocText(hQueryDocText, OutDir):
    """
    Write one text file per query into OutDir.

    hQueryDocText maps a query to a list of (DocNo, text) pairs. The file
    for a query is named IndriSearchCenterC.GenerateQueryTargetName(query)
    and receives one tab-separated "DocNo, text" line per pair.

    Always returns True.
    """
    for query, lDocNoText in hQueryDocText.items():
        OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # with-block so the file is closed even if a write fails
        with open(OutName, 'w') as out:
            for DocNo, text in lDocNoText:
                out.write(DocNo + '\t' + text + '\n')
            logging.info('query [%s] [%d] doc text  outputed', query,
                         len(lDocNoText))
    logging.info('doc text dumped to [%s]', OutDir)
    return True

예제 #21

0

파일 보기

 def LoadQDocObj(self, query):
     """
     Read the key -> object-id lists stored for one query.

     The file read is self.QDocNodeDataDir directly followed by
     IndriSearchCenterC.GenerateQueryTargetName(query); no separator is
     inserted here. Each non-blank line is a tab-separated
     "key, object id" pair.

     Returns a dictionary mapping each key to the list of its object ids,
     in file order. Raises ValueError if a non-blank line does not split
     into exactly two columns.
     """
     InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName(
         query)
     hQDocObj = {}
     # with-block so the handle is closed even if a line is malformed
     with open(InName) as In:
         for line in In:
             line = line.strip()
             if not line:
                 # blank lines carry no pair and would break the unpacking
                 continue
             key, ObjId = line.split('\t')
             hQDocObj.setdefault(key, []).append(ObjId)
     logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
     return hQDocObj

예제 #22

0

파일 보기

파일: ConstructSearchResDocGraph.py 프로젝트: wayne9qiu/GraphRepresentation

class SearchResDocGraphConstructorC(DocGraphConstructorC):
    """
    Build and dump a document graph for every document returned by the
    searcher, one output sub-directory per query id.
    """

    def Init(self):
        """Run the parent initialisation and create the search component."""
        DocGraphConstructorC.Init(self)
        self.Searcher = IndriSearchCenterC()

    def SetConf(self, ConfIn):
        """Pass ConfIn to the parent class and to the searcher."""
        DocGraphConstructorC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        """Call ShowConf on the parent class and on IndriSearchCenterC."""
        DocGraphConstructorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def FormForOneQ(self, qid, query):
        """
        Run the query, build a graph for each returned document with
        self.GraphFormer.FillDocGraph, and dump each graph to
        self.OutDir/<qid>/<DocNo>.

        Always returns True.
        """
        lDoc = self.Searcher.RunQuery(query, qid)

        lDocKg = [self.GraphFormer.FillDocGraph(doc.DocNo) for doc in lDoc]

        QueryOutDir = self.OutDir + '/' + qid
        if not os.path.exists(QueryOutDir):
            os.makedirs(QueryOutDir)

        for DocKg in lDocKg:
            DocKg.dump(QueryOutDir + '/' + DocKg.DocNo)
            logging.debug('[%s] dummped [%d] node', DocKg.DocNo, len(DocKg))

        logging.info('[%s-%s] doc kg formed', qid, query)
        return True

    def Process(self, QInName):
        """
        Call FormForOneQ for every tab-separated "qid, query" line of
        QInName; blank lines are skipped.

        Always returns True.
        """
        # with-block so the query file handle is not leaked
        with open(QInName) as QIn:
            lQidQuery = [
                line.split('\t') for line in QIn.read().splitlines()
                if line.strip()
            ]

        for qid, query in lQidQuery:
            self.FormForOneQ(qid, query)

        logging.info('[%s] query finished', QInName)
        return True

예제 #23

0

파일 보기

    def FormulateNodes(self, qid, query):
        '''
        Get the documents of a query and the objects listed for it.

        Documents come from self.Searcher.RunQuery. Object ids are read, one
        per line, from the file named self.NodeDir directly followed by
        IndriSearchCenterC.GenerateQueryTargetName(query); no separator is
        inserted here. Each id is resolved through self.ObjCenter.FetchObj.

        Returns (lDoc, lObj).
        '''
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)

        InName = self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # with-block so the file handle is not leaked
        with open(InName) as In:
            lObjId = In.read().splitlines()

        lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
        logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
        return lDoc, lObj

예제 #24

0

파일 보기

파일: NodeCollectCenter.py 프로젝트: wayne9qiu/GraphRepresentation

    def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
        '''
        Write the nodes of one query as tab-separated "key, object" lines.

        OutName is a directory (created when missing); the file inside it is
        named IndriSearchCenterC.GenerateQueryTargetName(query). Query
        objects are written first with the key 'q_' + qid, then every
        document's objects with the document number as key. lDoc and
        llDocObj are paired positionally with zip.
        '''

        if not os.path.exists(OutName):
            os.makedirs(OutName)

        TargetName = OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)

        # with-block so the file is closed even if a write fails
        with open(TargetName, 'w') as out:
            logging.info('q[%s] has [%d] q node', qid, len(lQObj))
            for QObj in lQObj:
                out.write('q_' + qid + '\t' + QObj + '\n')

            if not llDocObj:
                logging.info('no doc node')
            else:
                for doc, lDocObj in zip(lDoc, llDocObj):
                    logging.info('doc [%s] has [%d] node', doc.DocNo,
                                 len(lDocObj))
                    for DocObj in lDocObj:
                        out.write(doc.DocNo + '\t' + DocObj + '\n')

        logging.info('q [%s] raw node res dumpped', qid)
        return

예제 #25

0

파일 보기

class LESRanker(cxBaseC):
    """
    Re-rank the documents returned by the searcher by mixing
    exp(retrieval score) with a score from self.Inferener.inference, then
    evaluate the rankings with self.Evaluator.
    """

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)

        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.Evaluator = AdhocEvaC()

        # NOTE(review): attribute name is spelled "Inferener"; kept as-is
        # because it is part of the instance's visible interface.
        self.Inferener = LESInferencerC()

        self.QDocNodeDataDir = ""
        self.OrigQWeight = 0.5
        self.UseQObjOnly = True

    @classmethod
    def ShowConf(cls):
        """Show the configuration entries of this class and its components."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        AdhocEvaC.ShowConf()

        print('qdocnodedatadir\norigqweight 0.5\nqobjonly 1')

    def SetConf(self, ConfIn):
        """
        Configure this ranker and its components from ConfIn.

        Reads 'qdocnodedatadir' (a '/' is appended), 'origqweight' and
        'qobjonly'.
        """
        cxBaseC.SetConf(self, ConfIn)

        self.Searcher.SetConf(ConfIn)
        self.Evaluator.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/'
        self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight)
        # NOTE(review): bool() of a non-empty string such as '0' is True;
        # this assumes GetConf returns a number here -- TODO confirm.
        self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1))

    def LoadQDocObj(self, query):
        """
        Read the key -> object-id lists stored for one query.

        The file is self.QDocNodeDataDir followed by
        IndriSearchCenterC.GenerateQueryTargetName(query); each non-blank
        line is a tab-separated "key, object id" pair.

        Returns a dictionary mapping each key to its list of object ids.
        """
        InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        hQDocObj = {}
        # with-block so the handle is closed even if a line is malformed
        with open(InName) as In:
            for line in In:
                line = line.strip()
                if not line:
                    # blank lines carry no pair and would break the unpacking
                    continue
                key, ObjId = line.split('\t')
                hQDocObj.setdefault(key, []).append(ObjId)
        logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
        return hQDocObj

    def RankingForOneQ(self, qid, query):
        """
        Rank the retrieved documents of one query.

        When the query has no stored objects (no 'q_<qid>' key) the raw
        retrieval order is returned. Otherwise every document gets
            OrigQWeight * exp(doc.score) + (1 - OrigQWeight) * LES score
        and documents are sorted by that value, highest first. A LES score
        of 0 counts as "no valid score" and is replaced by the average.

        Returns the list of document numbers in ranked order.
        """
        logging.info('Start LES ranking for [%s-%s]', qid, query)

        lDoc = self.Searcher.RunQuery(query, qid)
        logging.info('doc fetched')

        hQDocObj = self.LoadQDocObj(query)

        QKey = 'q_%s' % (qid)
        if QKey not in hQDocObj:
            # nothing to infer from: keep the retrieval order
            logging.info('query [%s] has no object, return raw raning', qid)
            return [doc.DocNo for doc in lDoc]

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]]

        lDocLESScore = []
        LesCnt = 0
        for doc in lDoc:
            if self.UseQObjOnly:
                lDocObj = lQObj
            else:
                if doc.DocNo not in hQDocObj:
                    lDocLESScore.append(0)
                    continue
                lDocObj = [
                    self.ObjCenter.FetchObj(ObjId)
                    for ObjId in hQDocObj[doc.DocNo]
                ]

            score = self.Inferener.inference(query, doc, lQObj, lDocObj)
            if score != 0:
                # if 0, means the obj has no desp (or very short one),
                # doesn't count as valid score
                LesCnt += 1
            lDocLESScore.append(score)

        # add average score to doc without annotation
        # using zero is not very proper
        if LesCnt:
            AvgScore = sum(lDocLESScore) / float(LesCnt)
        else:
            # no document received a valid score (or there are no
            # documents): dividing would raise ZeroDivisionError, so fall
            # back to 0 and let the retrieval score decide the order
            AvgScore = 0.0

        lDocLESScore = [
            item if item != 0 else AvgScore for item in lDocLESScore
        ]

        lScore = [
            self.OrigQWeight * math.exp(doc.score) +
            (1 - self.OrigQWeight) * LESScore
            for doc, LESScore in zip(lDoc, lDocLESScore)
        ]

        # sorted() is stable and accepts any iterable, so ties keep the
        # retrieval order
        lDocNoScore = sorted(
            zip([doc.DocNo for doc in lDoc], lScore),
            key=lambda item: item[1],
            reverse=True)
        lRankedDocNo = [item[0] for item in lDocNoScore]

        logging.info('query [%s] ranked', qid)

        return lRankedDocNo

    def Process(self, QIn, OutName):
        """
        Rank every query of QIn, evaluate, and write the per-query results.

        QIn holds tab-separated "qid, query" lines (blank lines skipped).
        OutName receives one tab-separated "qid, EvaRes.dumps()" line per
        entry returned by self.Evaluator.EvaluateFullRes.

        Always returns True.
        """
        # with-block so the query file handle is not leaked
        with open(QIn) as In:
            lQidQuery = [
                line.split('\t') for line in In.read().splitlines()
                if line.strip()
            ]

        llDocNo = [self.RankingForOneQ(qid, query) for qid, query in lQidQuery]

        logging.info('start evaluation')

        lQid = [item[0] for item in lQidQuery]
        lQuery = [item[1] for item in lQidQuery]
        lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo)

        with open(OutName, 'w') as out:
            for qid, EvaRes in lPerQEvaRes:
                out.write(qid + '\t' + EvaRes.dumps() + '\n')

        # log the last entry only when there is one
        if lPerQEvaRes:
            logging.info('%s %s', lPerQEvaRes[-1][0],
                         lPerQEvaRes[-1][1].dumps())

        return True

예제 #26

0

파일 보기

파일: ContinuousLmRankingEva.py 프로젝트: wayne9qiu/EmbeddingForIR

 def ShowConf(cls):
     """
     Call cxBaseC.ShowConf, print the class name and this component's own
     entries (presumably the supported configuration keys), then call
     ShowConf on IndriSearchCenterC and AdhocEvaC.
     """
     cxBaseC.ShowConf()
     print cls.__name__
     print 'word2vecin\nkernel\nlmname\nbandwidth\nin\nout'
     IndriSearchCenterC.ShowConf()
     AdhocEvaC.ShowConf()

예제 #27

0

파일 보기

파일: NodeCollectCenter.py 프로젝트: wayne9qiu/GraphRepresentation

class NodeCollectorCenterC(cxBaseC):
    """
    Collect object nodes for a query and for its retrieved documents, and
    dump/load them in JSON or in a raw tab-separated per-query format.
    """

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.QueryNodePreFetchedCollector = QueryPreFetchedNodeCollectorC()
        self.DocNodeFaccAnaCollector = DocNodeFaccAnaCollectorC()

        self.lQueryNodeGroup = []
        self.lDocNodeGroup = []

    def SetConf(self, ConfIn):
        """
        Configure this center and its components from ConfIn.

        Reads 'querynodegroup' and 'docnodegroup'; the collectors are only
        configured when their group ('ana' / 'facc') is enabled.
        """
        cxBaseC.SetConf(self, ConfIn)
        self.lQueryNodeGroup = self.conf.GetConf('querynodegroup',
                                                 self.lQueryNodeGroup)
        self.lDocNodeGroup = self.conf.GetConf('docnodegroup',
                                               self.lDocNodeGroup)
        self.Searcher.SetConf(ConfIn)
        if 'ana' in self.lQueryNodeGroup:
            self.QueryNodePreFetchedCollector.SetConf(ConfIn)
        if 'facc' in self.lDocNodeGroup:
            self.DocNodeFaccAnaCollector.SetConf(ConfIn)

        logging.info('node collector center conf set')
        return

    @staticmethod
    def ShowConf():
        """Show the configuration entries of this class and its components."""
        cxBaseC.ShowConf()
        QueryPreFetchedNodeCollectorC.ShowConf()
        DocNodeFaccAnaCollectorC.ShowConf()
        IndriSearchCenterC.ShowConf()
        print('querynodegroup ana')
        print('docnodegroup facc')

    def process(self, qid, query):
        '''
        retrieval lDoc
        call query node generator
        call doc node generator

        Returns (lDoc, lQObj, llDocObj).
        '''

        lDoc = self.Searcher.RunQuery(query, qid)

        lQObj = self.CollectQueryNode(qid, query)

        llDocObj = self.CollectDocNode(lDoc, qid, query)

        logging.info('[%s][%s] node collected', qid, query)
        return lDoc, lQObj, llDocObj

    def CollectQueryNode(self, qid, query):
        """
        Return the distinct query nodes (first element of each item the
        collector returns); empty when the 'ana' group is not enabled.
        The order of the returned list is not defined (it goes through a set).
        """
        lQNodeScore = []

        if 'ana' in self.lQueryNodeGroup:
            lQNodeScore.extend(
                self.QueryNodePreFetchedCollector.process(qid, query))

        lQObj = list(set(item[0] for item in lQNodeScore))
        return lQObj

    def CollectDocNode(self, lDoc, qid, query):
        """
        Return, for each entry the 'facc' collector yields, the list of its
        distinct nodes; an empty list when the 'facc' group is not enabled.
        """
        llDocObj = []
        if 'facc' in self.lDocNodeGroup:
            llDocNodeScore = self.DocNodeFaccAnaCollector.process(
                lDoc, qid, query)
            llDocObj = [
                list(set(item[0] for item in lDocNodeScore))
                for lDocNodeScore in llDocNodeScore
            ]

        return llDocObj

    def PipeRun(self, QInName, OutName, OutFormat='json'):
        '''
        read qid,query
        run
        output to out name

        OutFormat 'json': OutName is a file, each line a json dumped
            [qid,query,lDoc,lQObj,lDocObj]
        OutFormat 'dir': OutName is a directory filled by DumpRawFormat,
            one file per query
        '''

        # with-block so the query file handle is not leaked
        with open(QInName) as QIn:
            lQidQuery = [
                line.split('\t') for line in QIn.read().splitlines()
                if line.strip()
            ]

        out = open(OutName, 'w') if OutFormat == 'json' else None
        try:
            for qid, query in lQidQuery:
                lDoc, lQObj, llDocObj = self.process(qid, query)
                if OutFormat == 'json':
                    # NOTE(review): this requires the items of lDoc to be
                    # JSON serializable -- TODO confirm for the doc type
                    # returned by the searcher.
                    out.write(
                        json.dumps([qid, query, lDoc, lQObj, llDocObj]) + '\n')
                if OutFormat == 'dir':
                    # doc id, obj id lines (doc id could be the query key,
                    # indicating a query obj)
                    self.DumpRawFormat(qid, query, lDoc, lQObj, llDocObj,
                                       OutName)
        finally:
            # close the output even when a query fails half-way
            if out is not None:
                out.close()

        logging.info('query in [%s] node genereated, dumped to [%s]', QInName,
                     OutName)

    def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
        '''
        Write the nodes of one query as tab-separated "key, object" lines.

        OutName is a directory (created when missing); the file inside it is
        named IndriSearchCenterC.GenerateQueryTargetName(query). Query
        objects are written first with the key 'q_' + qid, then every
        document's objects with the document number as key.
        '''

        if not os.path.exists(OutName):
            os.makedirs(OutName)

        TargetName = OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)

        # with-block so the file is closed even if a write fails
        with open(TargetName, 'w') as out:
            logging.info('q[%s] has [%d] q node', qid, len(lQObj))
            for QObj in lQObj:
                out.write('q_' + qid + '\t' + QObj + '\n')

            if not llDocObj:
                logging.info('no doc node')
            else:
                for doc, lDocObj in zip(lDoc, llDocObj):
                    logging.info('doc [%s] has [%d] node', doc.DocNo,
                                 len(lDocObj))
                    for DocObj in lDocObj:
                        out.write(doc.DocNo + '\t' + DocObj + '\n')

        logging.info('q [%s] raw node res dumpped', qid)
        return

    @staticmethod
    def LoadRawFormatNodeRes(query, InDir):
        '''
        read results from the disk as dumped

        Lines whose key starts with 'q_' are query objects; all other lines
        are document lines, grouped by runs of the same document number.

        Returns (lDocNo, lQObj, llDocObj).
        '''
        lDocNo = []
        llDocObj = []

        InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # with-block so the handle is released even if reading fails
        with open(InName) as In:
            lLines = In.read().splitlines()
        # blank lines carry no data and would break the two-column unpacking
        lvCol = [line.split('\t') for line in lLines if line.strip()]

        lQCol = [vCol for vCol in lvCol if vCol[0].startswith('q_')]
        lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]

        lQObj = [vCol[1] for vCol in lQCol]
        logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

        LastDocNo = ""
        for DocNo, ObjId in lDocCol:
            if DocNo != LastDocNo:
                llDocObj.append([])
                lDocNo.append(DocNo)
                LastDocNo = DocNo
            llDocObj[-1].append(ObjId)

        return lDocNo, lQObj, llDocObj

예제 #28

0

파일 보기

파일: LeToRFeatureExtractCenter.py 프로젝트: wayne9qiu/EmbeddingForIR

class LeToRFeatureExtractCenterC(cxBaseC):
    """
    Extract the enabled feature groups ('givenfeature', 'termpairemb',
    'emblm') for query-document pairs and dump them as LeToRDataBaseC lines.
    """

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)
        self.Prepared = False

        self.Word2VecInName = ""
        self.Word2VecModel = None

        self.lFeatureGroup = []
        self.Searcher = IndriSearchCenterC()
        self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
        self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
        self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
        self.QRelCenter = AdhocQRelC()
        self.QRelIn = ""

    def SetConf(self, ConfIn):
        """
        Configure this center and the enabled extractors from ConfIn.

        Reads 'word2vecin', 'featuregroup' and 'qrel'; the qrel file is
        loaded immediately. Always returns True.
        """
        cxBaseC.SetConf(self, ConfIn)
        self.Word2VecInName = self.conf.GetConf('word2vecin')

        self.lFeatureGroup = self.conf.GetConf('featuregroup')

        self.QRelIn = self.conf.GetConf('qrel')
        self.QRelCenter.Load(self.QRelIn)
        # a single configured group comes back as a non-list value; wrap it
        # so the membership tests below work on a list
        if not isinstance(self.lFeatureGroup, list):
            self.lFeatureGroup = [self.lFeatureGroup]

        self.Searcher.SetConf(ConfIn)

        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.SetConf(ConfIn)

        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.SetConf(ConfIn)

        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.SetConf(ConfIn)

        return True

    @staticmethod
    def ShowConf():
        """Show the configuration entries of this class and its components."""
        cxBaseC.ShowConf()
        print('word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm')
        LeToRGivenFeatureExtractorC.ShowConf()
        EmbeddingTermPairFeatureExtractorC.ShowConf()
        EmbeddingLmFeatureExtractorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def Prepare(self):
        """
        Load the word2vec model and prepare the enabled extractors.

        Does nothing when self.Prepared is already set, so repeated calls
        are cheap.
        """
        if self.Prepared:
            return

        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(self.Word2VecInName)
        logging.info('word2vec loaded')
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.Prepare()
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.Prepare()
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.Prepare()

        self.Prepared = True
        return

    def Process(self, qid,query,doc):
        '''
        extract all features here

        Returns one dictionary merging the results of every enabled
        extractor for this (qid, query, doc).
        '''
        self.Prepare()

        hFeature = {}
        logging.debug('extracting for [%s][%s]',qid,doc.DocNo)
        if 'givenfeature' in self.lFeatureGroup:
            hFeature.update(self.GivenFeatureExtractor.Extract(qid, query, doc))
            logging.debug('given feature extracted')

        if 'termpairemb' in self.lFeatureGroup:
            hFeature.update(self.EmbTermPairFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('termpairemb feature extracted')

        if 'emblm' in self.lFeatureGroup:
            hFeature.update(self.EmbLmFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('emblm feature extracted')

        return hFeature

    def PipeLineRun(self,QInName,OutName):
        '''
        will make a feature hash myself... It should be OK right?

        Reads tab-separated "qid, query" lines from QInName (blank lines
        skipped), extracts features for every retrieved document and writes
        one LTRData.dumps() line per document to OutName. The feature
        name/id table goes to OutName + '_FeatureName' as tab-separated
        "id, name" lines.
        '''
        hFeatureName = {}
        self.Prepare()
        # with-blocks so no handle is leaked and files are closed on error
        with open(QInName) as QIn:
            lQidQuery = [
                line.split('\t') for line in QIn.read().splitlines()
                if line.strip()
            ]

        with open(OutName,'w') as out:
            logging.info('start extracting for file [%s]',QInName)
            for qid,query in lQidQuery:
                lDoc = self.Searcher.RunQuery(query, qid)
                for doc in lDoc:
                    hFeature = self.Process(qid, query, doc)
                    LTRData = LeToRDataBaseC()
                    LTRData.qid = qid
                    LTRData.DocNo = doc.DocNo
                    LTRData.hFeature = hFeature

                    LTRData.score = self.QRelCenter.GetScore(qid, doc.DocNo)
                    hFeatureName = LTRData.HashFeatureName(hFeatureName)
                    out.write('%s\n' % LTRData.dumps())

                logging.info('qid [%s] extracted',qid)

        with open(OutName + '_FeatureName','w') as NameOut:
            for name,Id in hFeatureName.items():
                NameOut.write('%d\t%s\n' % (Id, name))
        logging.info('finished')
        return

예제 #29

0

파일 보기

파일: SplitDocAnaResPerQuery.py 프로젝트: wayne9qiu/GraphRepresentation

 def ShowConf():
     """
     Call cxBaseC.ShowConf, print this component's own entries (presumably
     the supported configuration keys), then call
     IndriSearchCenterC.ShowConf.
     """
     cxBaseC.ShowConf()
     print 'docanain\noutdir\nin\ndoctextin'
     IndriSearchCenterC.ShowConf()

예제 #30

0

파일 보기

파일: SplitDocAnaResPerQuery.py 프로젝트: wayne9qiu/GraphRepresentation

class DocAnaResSERPSplitterC(cxBaseC):
    """
    Split document annotation data into one file per query, keeping only
    the documents the searcher returns for that query.
    """

    def Init(self):
        """Set the default attribute values, after running cxBaseC.Init."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.hDocAnaData = {}
        self.hDocText = {}
        self.OutDir = ''
        self.QInName = ""

    def SetConf(self, ConfIn):
        """
        Configure from ConfIn: reads 'docanain', 'doctextin', 'outdir' and
        'in', and loads the annotation and text files immediately.
        """
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        DocAnaIn = self.conf.GetConf('docanain')
        DocTextIn = self.conf.GetConf('doctextin')
        self.ReadDocAna(DocAnaIn, DocTextIn)
        self.OutDir = self.conf.GetConf('outdir')
        self.QInName = self.conf.GetConf('in')

    @staticmethod
    def ShowConf():
        """Show the configuration entries of this class and of the searcher."""
        cxBaseC.ShowConf()
        print('docanain\noutdir\nin\ndoctextin')
        IndriSearchCenterC.ShowConf()

    def ReadDocAna(self, DocAnaIn, DocTextIn):
        """
        Load both lookup tables.

        self.hDocAnaData maps the first whitespace-separated token of each
        DocAnaIn line to the whole line. self.hDocText is built from
        DocTextIn: the part of each line before the first '#' is stripped
        and split on tab into a (key, value) pair.

        Blank lines (and DocTextIn lines that are empty before '#') are
        skipped. Always returns True.
        """
        # with-blocks so the handles are not leaked
        with open(DocAnaIn) as In:
            lLines = In.read().splitlines()
        # a blank line has no first token to use as key
        self.hDocAnaData = dict(
            (line.split()[0], line) for line in lLines if line.strip())

        with open(DocTextIn) as In:
            lLines = In.read().splitlines()

        lContent = [line.split('#')[0].strip() for line in lLines]
        # empty content would give a one-element row and make dict() fail
        self.hDocText = dict(
            content.split('\t') for content in lContent if content)
        return True

    def DumpOneQ(self, qid, query):
        """
        Write the annotation block of every retrieved document of one query
        to self.OutDir/<query with spaces replaced by '_'>.

        Documents missing from either lookup table are skipped. Each block
        is "<doc>", a tab-separated "id, text" line, the annotation columns
        in rows of eight (a trailing partial row is dropped), then "</doc>"
        followed by blank lines. Always returns True.
        """
        lDoc = self.Searcher.RunQuery(query, qid)
        OutName = self.OutDir + '/%s' % (query.replace(' ', '_'))

        # with-block so the file is closed even if a write fails
        with open(OutName, 'w') as out:
            for doc in lDoc:
                if (doc.DocNo not in self.hDocAnaData
                        or doc.DocNo not in self.hDocText):
                    continue
                out.write("<doc>\n")
                line = self.hDocAnaData[doc.DocNo]

                vCol = line.split('\t')
                text = self.hDocText[doc.DocNo]
                out.write(vCol[0] + '\t' + text + '\n')

                if len(vCol) > 2:
                    vAna = vCol[1:]
                    # floor division: the bound must stay an integer
                    for i in range(len(vAna) // 8):
                        out.write('\t'.join(vAna[8 * i:8 * i + 8]) + '\n')

                # three newlines from the literal plus the line terminator
                out.write("</doc>\n\n\n" + "\n")

        logging.info('[%s] data dumped', query)
        return True

    def Process(self):
        """
        Call DumpOneQ for every tab-separated "qid, query" line of
        self.QInName; blank lines are skipped.
        """
        with open(self.QInName) as QIn:
            lQidQuery = [
                line.split('\t') for line in QIn.read().splitlines()
                if line.strip()
            ]

        for qid, query in lQidQuery:
            self.DumpOneQ(qid, query)

        logging.info('finished')