def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, then creates a fresh IndriSearchCenterC and
    resets the lookup tables and path settings to empty values.
    """
    cxBaseC.Init(self)
    self.Searcher = IndriSearchCenterC()
    # Both lookup tables start out empty.
    self.hDocAnaData = {}
    self.hDocText = {}
    # Path settings start blank -- presumably filled in from configuration
    # elsewhere (not visible in this block).
    self.OutDir = ''
    self.QInName = ""
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, then creates the searcher and the two node
    collectors, and resets both group lists to empty.
    """
    cxBaseC.Init(self)
    self.Searcher = IndriSearchCenterC()
    self.QueryNodePreFetchedCollector = QueryPreFetchedNodeCollectorC()
    self.DocNodeFaccAnaCollector = DocNodeFaccAnaCollectorC()
    # No node group is selected by default.
    self.lQueryNodeGroup = []
    self.lDocNodeGroup = []
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, then blanks the path settings, leaves the
    word2vec model unloaded, creates the searcher and sets BinNumber to 100.
    """
    cxBaseC.Init(self)
    self.QIn = ""
    self.OutDir = ""
    self.Word2VecInName = ""
    # No model is loaded here; the attribute only starts as None.
    self.Word2VecModel = None
    self.Searcher = IndriSearchCenterC()
    self.BinNumber = 100
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, then creates the evaluator and the searcher
    and resets the word2vec, language-model and file-name settings.
    """
    cxBaseC.Init(self)
    self.Evaluator = AdhocEvaC()
    self.Searcher = IndriSearchCenterC()
    self.Word2VecInName = ""
    # No model is loaded here; the attribute only starts as None.
    self.Word2VecModel = None
    self.lLmName = []
    # No language-model class is chosen by default.
    self.LmClass = None
    self.lOutName = []
    self.QueryInName = ""
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, then creates the evaluator and the searcher,
    leaves the word2vec model unloaded, and selects "kde" /
    KernelDensityLmC as the default language model.
    """
    cxBaseC.Init(self)
    self.Evaluator = AdhocEvaC()
    self.Searcher = IndriSearchCenterC()
    self.Word2VecInName = ""
    # No model is loaded here; the attribute only starts as None.
    self.Word2VecModel = None
    self.cLmName = "kde"
    # The class object itself is stored, not an instance of it.
    self.LmClass = KernelDensityLmC
    #lm conf input
    self.ParaConf = cxConfC()
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, blanks NodeDir, then creates the searcher,
    the object cache and one feature extractor per edge type.
    """
    cxBaseC.Init(self)
    self.NodeDir = ""
    self.Searcher = IndriSearchCenterC()
    self.ObjCenter = FbObjCacheCenterC()
    # One extractor per pair type: query-doc, query-object, doc-object and
    # object-object (as the attribute and class names spell out).
    self.QDocFeatureExtractor = LeToRFeatureExtractCenterC()
    self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC()
    self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC()
    self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC()
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, then creates the searcher, object cache,
    evaluator and inferencer, and sets the default weight and flag.
    """
    cxBaseC.Init(self)
    self.Searcher = IndriSearchCenterC()
    self.ObjCenter = FbObjCacheCenterC()
    self.Evaluator = AdhocEvaC()
    # NOTE(review): the attribute is spelled "Inferener"; any code reading
    # it has to use the same spelling.
    self.Inferener = LESInferencerC()
    self.QDocNodeDataDir = ""
    self.OrigQWeight = 0.5
    self.UseQObjOnly = True
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, marks the instance as not prepared, leaves
    the word2vec model unloaded, and creates the searcher, the three
    feature extractors and the qrel holder.
    """
    cxBaseC.Init(self)
    self.Prepared = False
    self.Word2VecInName = ""
    # No model is loaded here; the attribute only starts as None.
    self.Word2VecModel = None
    # No feature group is selected by default.
    self.lFeatureGroup = []
    self.Searcher = IndriSearchCenterC()
    self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
    self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
    self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
    self.QRelCenter = AdhocQRelC()
    self.QRelIn = ""
def ShowConf():
    """Show the configuration help of this class and its components.

    Delegates to ShowConf of the base class, the two node collectors and
    the searcher (in that order), then prints this class's own two keys.
    """
    lComponent = (
        cxBaseC,
        QueryPreFetchedNodeCollectorC,
        DocNodeFaccAnaCollectorC,
        IndriSearchCenterC,
    )
    for Component in lComponent:
        Component.ShowConf()
    print('querynodegroup ana')
    print('docnodegroup facc')
def LoadRawFormatNodeRes(query, InDir):
    """Read one query's node file back from disk, as it was dumped.

    The file read is
    InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query).
    Every non-blank line holds two tab-separated columns: a key and an
    object id. Rows whose key starts with 'q_' are query rows; all other
    rows are document rows, keyed by the document number.

    Args:
        query: query text used to derive the file name.
        InDir: directory that holds the per-query files.

    Returns:
        (lDocNo, lQObj, llDocObj): the document numbers in file order, the
        object ids of the query rows, and one list of object ids per
        document (aligned with lDocNo).

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if a document row does not have exactly two columns.
    """
    InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
    # Close the handle deterministically instead of relying on the GC.
    with open(InName) as InFile:
        lLines = InFile.read().splitlines()

    # Blank lines carry no data and would break the two-column unpacking.
    lvCol = [line.split('\t') for line in lLines if line.strip()]
    lQCol = [vCol for vCol in lvCol if vCol[0].startswith('q_')]
    lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]

    lQObj = [vCol[1] for vCol in lQCol]
    logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

    lDocNo = []
    llDocObj = []
    # None can never equal a key read from the file, so the first row
    # always opens a new group (an "" sentinel would not if the key is "").
    LastDocNo = None
    for DocNo, ObjId in lDocCol:
        # Only consecutive rows with the same key are merged into one group.
        if DocNo != LastDocNo:
            llDocObj.append([])
            lDocNo.append(DocNo)
            LastDocNo = DocNo
        llDocObj[-1].append(ObjId)
    return lDocNo, lQObj, llDocObj
def ShowConf():
    """Show the configuration help of this class and its components.

    Calls the base-class ShowConf, prints this class's own keys, then
    delegates to the three feature extractors and the searcher.
    """
    cxBaseC.ShowConf()
    print('word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm')
    lComponent = (
        LeToRGivenFeatureExtractorC,
        EmbeddingTermPairFeatureExtractorC,
        EmbeddingLmFeatureExtractorC,
        IndriSearchCenterC,
    )
    for Component in lComponent:
        Component.ShowConf()
def ShowConf(cls):
    """Show the configuration help of this class and its components.

    Delegates to ShowConf of the base class, the searcher, the object
    cache and the evaluator (in that order), then prints this class's own
    keys. cls is accepted but not used.
    """
    lComponent = (cxBaseC, IndriSearchCenterC, FbObjCacheCenterC, AdhocEvaC)
    for Component in lComponent:
        Component.ShowConf()
    print('qdocnodedatadir\norigqweight 0.5\nqobjonly 1')
def PipeRun(self, QInName, OutDir):
    """Extract edge features for every query in a file and dump them.

    For now the output is the raw type: each file holds one query's edge
    features, and each line is
    query|doc|obj \\t obj \\t json.dumps(hFeature).

    Args:
        QInName: path of a text file with one "qid<TAB>query" per line.
        OutDir: directory receiving one file per query, named by
            IndriSearchCenterC.GenerateQueryTargetName(query).

    Raises:
        IOError: if QInName cannot be opened.
        ValueError: if a line does not split into exactly two columns.
    """
    # Read the whole query list up front and release the handle at once.
    with open(QInName) as QIn:
        lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]

    for qid, query in lQidQuery:
        logging.info('start extracting for [%s][%s]', qid, query)
        (lDoc, lObj, lQObjFeature,
         llDocObjFeature, llObjObjFeature) = self.Process(qid, query)
        OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query,
                     OutName)
        self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                     llDocObjFeature, llObjObjFeature)

    logging.info('all finished')
    return
def FormulateNodes(self, qid, query):
    """Collect the documents and objects (nodes) for one query.

    Runs the query through the searcher, reads the stored object ids via
    NodeCollectorCenterC.LoadRawFormatNodeRes, reorders the documents with
    IndriSearchCenterC.RearrangeDocOrder and fetches every object from the
    object cache.

    Returns:
        (lDoc, lQObj, llDocObj): the documents, the query objects, and one
        object list per document.
    """
    logging.info('formulating node for q [%s][%s]', qid, query)

    lRetrieved = self.Searcher.RunQuery(query, qid)
    lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes(
        query, self.NodeDir)
    # match lDoc dim lDocNo dim
    lDoc = IndriSearchCenterC.RearrangeDocOrder(lRetrieved, lDocNo)

    # Query objects are fetched first, then the per-document objects.
    lQObj = []
    for QObjId in lQObjId:
        lQObj.append(self.ObjCenter.FetchObj(QObjId))

    llDocObj = []
    for lOneDocObjId in llDocObjId:
        lOneDocObj = []
        for DocObjId in lOneDocObjId:
            lOneDocObj.append(self.ObjCenter.FetchObj(DocObjId))
        llDocObj.append(lOneDocObj)

    # add empty list for docs have no objects (thus will restrict to EsdRank)
    # if lQObj is also empty, then it is LeToR
    NumMissing = len(lDoc) - len(llDocObj)
    if NumMissing > 0:
        llDocObj.extend([] for _ in range(NumMissing))

    lQObjIdFetched = [QObj.GetId() for QObj in lQObj]
    logging.info('q[%s] all node fetched, q node %s', qid,
                 json.dumps(lQObjIdFetched))
    return lDoc, lQObj, llDocObj
def Init(self):
    """Set every attribute of the instance to its default value.

    Calls cxBaseC.Init first, empties the three feature-group lists, then
    creates the five edge-feature extractors, the searcher and the object
    cache, and blanks NodeDir.
    """
    cxBaseC.Init(self)
    # No feature group is selected by default.
    self.lQObjFeatureGroup = []
    self.lObjObjFeatureGroup = []
    self.lDocObjFeatureGroup = []
    self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC()
    self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC()
    self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC()
    self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC()
    self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC()
    self.Searcher = IndriSearchCenterC()
    self.ObjCenter = FbObjCacheCenterC()
    self.NodeDir = ""
def LoadQueryQid(self, QIn):
    """Build self.hQueryQid, mapping a query's target name to its qid.

    Args:
        QIn: path of a text file whose lines start with "qid<TAB>query".

    The key of each entry is
    IndriSearchCenterC.GenerateQueryTargetName(query) and the value is the
    qid. When two queries share a target name, the later line wins.

    Raises:
        IOError: if QIn cannot be opened.
        IndexError: if a line has fewer than two tab-separated columns.
    """
    # Read everything and release the handle before building the mapping.
    with open(QIn) as QInFile:
        lQidQuery = [line.split('\t') for line in QInFile.read().splitlines()]

    self.hQueryQid = dict(
        (IndriSearchCenterC.GenerateQueryTargetName(item[1]), item[0])
        for item in lQidQuery)
def LoadOneQueryObjSim(self, query):
    """Load the pre-calculated object-pair similarities of one query.

    For every directory in self.lPreCalcDir, the pickle file named by
    IndriSearchCenterC.GenerateQueryTargetName(query) is loaded and stored
    in self.lhQueryObjPairSim[i][query], where i is the directory's index.

    Args:
        query: query text.

    Returns:
        True when every directory had the file; False as soon as one file
        is missing (results loaded before that point stay stored).
    """
    for i, PreCalcDir in enumerate(self.lPreCalcDir):
        InName = PreCalcDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        if not os.path.exists(InName):
            return False
        # NOTE(review): pickle must only be used on trusted files; loading
        # a crafted pickle can execute arbitrary code.
        # NOTE(review): the file is opened in text mode as before; binary
        # pickle protocols need 'rb' -- confirm how the files are written.
        with open(InName) as SimIn:
            self.lhQueryObjPairSim[i][query] = pickle.load(SimIn)
    logging.info('query [%s] obj sim loaded', query)
    return True
def ShowConf():
    """Show the configuration help of this class and its components.

    Delegates to ShowConf of the base class, the searcher and the object
    cache, prints this class's own keys, then delegates to the five
    edge-feature extractors.
    """
    for Component in (cxBaseC, IndriSearchCenterC, FbObjCacheCenterC):
        Component.ShowConf()
    print('nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup')
    lExtractor = (
        QueryObjEdgeFeatureAnaExtractorC,
        DocObjEdgeFeatureFaccExtractorC,
        ObjObjEdgeFeatureKGExtractorC,
        ObjObjEdgeFeaturePreCalcSimExtractorC,
        ObjObjEdgeFeatureTextSimExtractorC,
    )
    for Extractor in lExtractor:
        Extractor.ShowConf()
def ShowConf(cls):
    """Show the configuration help of this class and its components.

    Calls the base-class ShowConf, prints the class name and this class's
    own key, then delegates to the searcher, the object cache and the four
    feature-extraction centers.
    """
    cxBaseC.ShowConf()
    print(cls.__name__)
    print('nodedir')
    lComponent = (
        IndriSearchCenterC,
        FbObjCacheCenterC,
        LeToRFeatureExtractCenterC,
        FbQObjFeatureExtractCenterC,
        FbObjDocFeatureExtractCenterC,
        ObjObjFeatureExtractCenterC,
    )
    for Component in lComponent:
        Component.ShowConf()
def OutputDocText(hQueryDocText, OutDir):
    """Write the document texts of every query to one file per query.

    Args:
        hQueryDocText: dict mapping a query to a list of (DocNo, text)
            pairs.
        OutDir: existing directory that receives the files; each file is
            named by IndriSearchCenterC.GenerateQueryTargetName(query).

    Each output line is "DocNo<TAB>text".

    Returns:
        True.

    Raises:
        IOError: if an output file cannot be opened or written.
    """
    for query, lDocNoText in hQueryDocText.items():
        OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # "with" closes the file even when a write fails half-way.
        with open(OutName, 'w') as out:
            for DocNo, text in lDocNoText:
                out.write(DocNo + '\t' + text + '\n')
        logging.info('query [%s] [%d] doc text outputed', query,
                     len(lDocNoText))
    logging.info('doc text dumped to [%s]', OutDir)
    return True
def LoadQDocObj(self, query):
    """Load the key -> object-id lists stored for one query.

    The file read is self.QDocNodeDataDir +
    IndriSearchCenterC.GenerateQueryTargetName(query); no separator is
    inserted, so QDocNodeDataDir has to carry its own trailing slash.
    Every non-blank line holds "key<TAB>ObjId".

    Args:
        query: query text used to derive the file name.

    Returns:
        dict mapping each key to the list of object ids seen for it, in
        file order.

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly two columns.
    """
    InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName(
        query)
    hQDocObj = {}
    # "with" closes the handle even if a malformed line raises.
    with open(InName) as InFile:
        for line in InFile:
            line = line.strip()
            if not line:
                # Blank lines carry no data; skip them instead of crashing.
                continue
            key, ObjId = line.split('\t')
            hQDocObj.setdefault(key, []).append(ObjId)
    logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
    return hQDocObj
class SearchResDocGraphConstructorC(DocGraphConstructorC):
    """Build and dump a document graph for every search result of a query.

    Extends DocGraphConstructorC with an IndriSearchCenterC that supplies
    the documents of each query.
    """

    def Init(self):
        """Run the parent initializer and create the searcher."""
        DocGraphConstructorC.Init(self)
        self.Searcher = IndriSearchCenterC()

    def SetConf(self, ConfIn):
        """Pass the configuration ConfIn to the parent and the searcher."""
        DocGraphConstructorC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        """Show the configuration help of the parent and the searcher."""
        DocGraphConstructorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def FormForOneQ(self, qid, query):
        """Form and dump the graph of every document retrieved for a query.

        Graphs are written to self.OutDir/<qid>/<DocNo>; the query's
        directory is created when missing.

        Returns:
            True.
        """
        lDoc = self.Searcher.RunQuery(query, qid)
        lDocKg = [self.GraphFormer.FillDocGraph(doc.DocNo) for doc in lDoc]

        QueryOutDir = self.OutDir + '/' + qid
        if not os.path.exists(QueryOutDir):
            os.makedirs(QueryOutDir)

        for DocKg in lDocKg:
            DocKg.dump(QueryOutDir + '/' + DocKg.DocNo)
            logging.debug('[%s] dummped [%d] node', DocKg.DocNo, len(DocKg))
        logging.info('[%s-%s] doc kg formed', qid, query)
        return True

    def Process(self, QInName):
        """Run FormForOneQ for every "qid<TAB>query" line of QInName.

        Returns:
            True.

        Raises:
            IOError: if QInName cannot be opened.
            ValueError: if a line does not split into exactly two columns.
        """
        # Read the query list and release the handle before the long loop.
        with open(QInName) as QIn:
            lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]

        for qid, query in lQidQuery:
            self.FormForOneQ(qid, query)

        logging.info('[%s] query finished', QInName)
        return True
def FormulateNodes(self, qid, query):
    """Collect the documents and the objects (nodes) for one query.

    Runs the query through the searcher, reads one object id per line from
    self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(query) (no
    separator is inserted between the two parts) and fetches each object
    from the object cache.

    Returns:
        (lDoc, lObj): the retrieved documents and the fetched objects.

    Raises:
        IOError: if the object-id file cannot be opened.
    """
    logging.info('formulating node for q [%s][%s]', qid, query)
    lDoc = self.Searcher.RunQuery(query, qid)

    InName = self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(query)
    # Read the ids and release the handle before fetching the objects.
    with open(InName) as InFile:
        lObjId = InFile.read().splitlines()

    lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
    logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
    return lDoc, lObj
def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
    """Dump the nodes of one query as "key<TAB>object id" lines.

    Query rows use the key 'q_' + qid; document rows use the document's
    DocNo. The file is OutName/<GenerateQueryTargetName(query)>; the
    directory OutName is created when missing.

    Args:
        qid: query id (string).
        query: query text, used to derive the file name.
        lDoc: documents; only their DocNo attribute is read.
        lQObj: object ids of the query.
        llDocObj: one list of object ids per document, aligned with lDoc;
            an empty outer list means there are no document nodes.
        OutName: output directory.

    Raises:
        IOError: if the output file cannot be opened or written.
    """
    if not os.path.exists(OutName):
        os.makedirs(OutName)

    OutFileName = OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(
        query)
    # "with" closes the file even when a write fails half-way.
    with open(OutFileName, 'w') as out:
        logging.info('q[%s] has [%d] q node', qid, len(lQObj))
        for QObj in lQObj:
            out.write('q_' + qid + '\t' + QObj + '\n')

        if llDocObj == []:
            logging.info('no doc node')
        else:
            for doc, lDocObj in zip(lDoc, llDocObj):
                logging.info('doc [%s] has [%d] node', doc.DocNo,
                             len(lDocObj))
                for DocObj in lDocObj:
                    out.write(doc.DocNo + '\t' + DocObj + '\n')

    logging.info('q [%s] raw node res dumpped', qid)
    return
class LESRanker(cxBaseC):
    """Re-rank retrieved documents by mixing retrieval and LES scores.

    The final score of a document is
    OrigQWeight * exp(doc.score) + (1 - OrigQWeight) * LES score,
    where the LES score comes from LESInferencerC.inference.
    """

    def Init(self):
        """Set every attribute to its default value."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.ObjCenter = FbObjCacheCenterC()
        self.Evaluator = AdhocEvaC()
        self.Inferener = LESInferencerC()
        self.QDocNodeDataDir = ""
        self.OrigQWeight = 0.5
        self.UseQObjOnly = True

    @classmethod
    def ShowConf(cls):
        """Show the configuration help of this class and its components."""
        cxBaseC.ShowConf()
        IndriSearchCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()
        AdhocEvaC.ShowConf()
        print('qdocnodedatadir\norigqweight 0.5\nqobjonly 1')

    def SetConf(self, ConfIn):
        """Read the configuration ConfIn and configure the components."""
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        self.Evaluator.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)
        # The trailing '/' is needed because LoadQDocObj appends the file
        # name directly to this directory.
        self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/'
        self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight)
        # NOTE(review): bool() of a non-empty string such as '0' is True;
        # this assumes GetConf returns a number here -- confirm.
        self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1))

    def LoadQDocObj(self, query):
        """Load the key -> object-id lists stored for one query.

        Every non-blank line of the file holds "key<TAB>ObjId".

        Returns:
            dict mapping each key to its list of object ids, in file order.

        Raises:
            IOError: if the file cannot be opened.
            ValueError: if a non-blank line does not have two columns.
        """
        InName = (self.QDocNodeDataDir +
                  IndriSearchCenterC.GenerateQueryTargetName(query))
        hQDocObj = {}
        # "with" closes the handle even if a malformed line raises.
        with open(InName) as InFile:
            for line in InFile:
                line = line.strip()
                if not line:
                    # Blank lines carry no data; skip them.
                    continue
                key, ObjId = line.split('\t')
                hQDocObj.setdefault(key, []).append(ObjId)
        logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
        return hQDocObj

    def RankingForOneQ(self, qid, query):
        """Rank the documents retrieved for one query.

        Returns:
            list of DocNo, best first. When the query has no object the
            original retrieval order is returned unchanged.
        """
        logging.info('Start LES ranking for [%s-%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)
        logging.info('doc fetched')

        hQDocObj = self.LoadQDocObj(query)
        QKey = 'q_%s' % (qid)
        if QKey not in hQDocObj:
            # do nothing
            logging.info('query [%s] has no object, return raw raning', qid)
            return [doc.DocNo for doc in lDoc]

        lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]]

        lDocLESScore = []
        LesCnt = 0
        for doc in lDoc:
            if self.UseQObjOnly:
                lDocObj = lQObj
            else:
                if doc.DocNo not in hQDocObj:
                    # No annotation: placeholder, replaced by the average
                    # below.
                    lDocLESScore.append(0)
                    continue
                lDocObj = [self.ObjCenter.FetchObj(ObjId)
                           for ObjId in hQDocObj[doc.DocNo]]
            score = self.Inferener.inference(query, doc, lQObj, lDocObj)
            if score != 0:
                # if 0, means the obj has no desp (or very short one),
                # doesn't count as valid score
                LesCnt += 1
            lDocLESScore.append(score)

        # add average score to doc without annotation
        # using zero is not very proper
        # Guard: with no valid score at all the average is undefined; every
        # score is 0 in that case, so 0.0 keeps them unchanged.
        if LesCnt:
            AvgScore = sum(lDocLESScore) / float(LesCnt)
        else:
            AvgScore = 0.0
        lDocLESScore = [item if item != 0 else AvgScore
                        for item in lDocLESScore]

        lScore = [self.OrigQWeight * math.exp(doc.score) +
                  (1 - self.OrigQWeight) * LESScore
                  for doc, LESScore in zip(lDoc, lDocLESScore)]

        # sorted() is stable like list.sort and also accepts the iterator
        # that zip returns on Python 3.
        lDocNoScore = sorted(zip([doc.DocNo for doc in lDoc], lScore),
                             key=lambda item: item[1], reverse=True)
        lRankedDocNo = [item[0] for item in lDocNoScore]
        logging.info('query [%s] ranked', qid)
        return lRankedDocNo

    def Process(self, QIn, OutName):
        """Rank every query of QIn, evaluate, and write per-query results.

        Args:
            QIn: path of a text file with one "qid<TAB>query" per line.
            OutName: output path; each line is "qid<TAB>EvaRes.dumps()".

        Returns:
            True.
        """
        with open(QIn) as QInFile:
            lQidQuery = [line.split('\t')
                         for line in QInFile.read().splitlines()]

        llDocNo = [self.RankingForOneQ(qid, query)
                   for qid, query in lQidQuery]
        logging.info('start evaluation')
        lQid = [item[0] for item in lQidQuery]
        lQuery = [item[1] for item in lQidQuery]
        lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo)

        with open(OutName, 'w') as out:
            for qid, EvaRes in lPerQEvaRes:
                out.write(qid + '\t' + EvaRes.dumps() + '\n')

        # The last entry is logged as the summary; skip when there is none.
        if lPerQEvaRes:
            logging.info('%s %s', lPerQEvaRes[-1][0],
                         lPerQEvaRes[-1][1].dumps())
        return True
def ShowConf(cls):
    """Show the configuration help of this class and its components.

    Calls the base-class ShowConf, prints the class name and this class's
    own keys, then delegates to the searcher and the evaluator.
    """
    cxBaseC.ShowConf()
    print(cls.__name__)
    print('word2vecin\nkernel\nlmname\nbandwidth\nin\nout')
    for Component in (IndriSearchCenterC, AdhocEvaC):
        Component.ShowConf()
class NodeCollectorCenterC(cxBaseC):
    """Collect the query nodes and document nodes (objects) of a query.

    Documents come from IndriSearchCenterC; query nodes from
    QueryPreFetchedNodeCollectorC when 'ana' is in lQueryNodeGroup;
    document nodes from DocNodeFaccAnaCollectorC when 'facc' is in
    lDocNodeGroup.
    """

    def Init(self):
        """Set every attribute to its default value."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.QueryNodePreFetchedCollector = QueryPreFetchedNodeCollectorC()
        self.DocNodeFaccAnaCollector = DocNodeFaccAnaCollectorC()
        self.lQueryNodeGroup = []
        self.lDocNodeGroup = []

    def SetConf(self, ConfIn):
        """Read the configuration ConfIn; configure only selected collectors."""
        cxBaseC.SetConf(self, ConfIn)
        self.lQueryNodeGroup = self.conf.GetConf('querynodegroup',
                                                 self.lQueryNodeGroup)
        self.lDocNodeGroup = self.conf.GetConf('docnodegroup',
                                               self.lDocNodeGroup)
        self.Searcher.SetConf(ConfIn)
        if 'ana' in self.lQueryNodeGroup:
            self.QueryNodePreFetchedCollector.SetConf(ConfIn)
        if 'facc' in self.lDocNodeGroup:
            self.DocNodeFaccAnaCollector.SetConf(ConfIn)
        logging.info('node collector center conf set')
        return

    @staticmethod
    def ShowConf():
        """Show the configuration help of this class and its components."""
        cxBaseC.ShowConf()
        QueryPreFetchedNodeCollectorC.ShowConf()
        DocNodeFaccAnaCollectorC.ShowConf()
        IndriSearchCenterC.ShowConf()
        print('querynodegroup ana')
        print('docnodegroup facc')

    def process(self, qid, query):
        """Retrieve the documents, then collect query and document nodes.

        Returns:
            (lDoc, lQObj, llDocObj).
        """
        lDoc = self.Searcher.RunQuery(query, qid)
        lQObj = self.CollectQueryNode(qid, query)
        llDocObj = self.CollectDocNode(lDoc, qid, query)
        logging.info('[%s][%s] node collected', qid, query)
        return lDoc, lQObj, llDocObj

    def CollectQueryNode(self, qid, query):
        """Return the distinct query nodes (order is not defined)."""
        lQNodeScore = []
        if 'ana' in self.lQueryNodeGroup:
            lQNodeScore.extend(
                self.QueryNodePreFetchedCollector.process(qid, query))
        # Only the first element of each item is kept; duplicates collapse.
        return list(set(item[0] for item in lQNodeScore))

    def CollectDocNode(self, lDoc, qid, query):
        """Return one list of distinct nodes per document.

        The result is an empty list when 'facc' is not selected.
        """
        llDocObj = []
        if 'facc' in self.lDocNodeGroup:
            llDocNodeScore = self.DocNodeFaccAnaCollector.process(
                lDoc, qid, query)
            llDocObj = [list(set(item[0] for item in lDocNodeScore))
                        for lDocNodeScore in llDocNodeScore]
        return llDocObj

    def PipeRun(self, QInName, OutName, OutFormat='json'):
        """Collect nodes for every query of QInName and write them out.

        Args:
            QInName: path of a text file with one "qid<TAB>query" per line.
            OutName: output file ('json') or output directory ('dir').
            OutFormat: 'json' writes one json-dumped
                [qid, query, lDoc, lQObj, llDocObj] per line; 'dir' writes
                one raw-format file per query via DumpRawFormat. Any other
                value collects the nodes but writes nothing.
        """
        with open(QInName) as QIn:
            lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]

        out = open(OutName, 'w') if OutFormat == 'json' else None
        try:
            for qid, query in lQidQuery:
                lDoc, lQObj, llDocObj = self.process(qid, query)
                if OutFormat == 'json':
                    out.write(
                        json.dumps([qid, query, lDoc, lQObj, llDocObj]) + '\n')
                if OutFormat == 'dir':
                    # print doc id\t obj id (doc id could be query
                    # indicating query obj)
                    self.DumpRawFormat(qid, query, lDoc, lQObj, llDocObj,
                                       OutName)
        finally:
            # Close the json file even when a query fails half-way.
            if out is not None:
                out.close()

        logging.info('query in [%s] node genereated, dumped to [%s]', QInName,
                     OutName)

    def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
        """Dump the nodes of one query as "key<TAB>object id" lines.

        Query rows use the key 'q_' + qid; document rows use the DocNo.
        The file is OutName/<GenerateQueryTargetName(query)>; the
        directory OutName is created when missing.
        """
        if not os.path.exists(OutName):
            os.makedirs(OutName)

        OutFileName = (OutName + '/' +
                       IndriSearchCenterC.GenerateQueryTargetName(query))
        # "with" closes the file even when a write fails half-way.
        with open(OutFileName, 'w') as out:
            logging.info('q[%s] has [%d] q node', qid, len(lQObj))
            for QObj in lQObj:
                out.write('q_' + qid + '\t' + QObj + '\n')

            if llDocObj == []:
                logging.info('no doc node')
            else:
                for doc, lDocObj in zip(lDoc, llDocObj):
                    logging.info('doc [%s] has [%d] node', doc.DocNo,
                                 len(lDocObj))
                    for DocObj in lDocObj:
                        out.write(doc.DocNo + '\t' + DocObj + '\n')

        logging.info('q [%s] raw node res dumpped', qid)
        return

    @staticmethod
    def LoadRawFormatNodeRes(query, InDir):
        """Read one query's node file back from disk, as dumped.

        Returns:
            (lDocNo, lQObj, llDocObj): document numbers in file order, the
            object ids of the 'q_' rows, and one object-id list per
            document (aligned with lDocNo).

        Raises:
            IOError: if the file cannot be opened.
            ValueError: if a document row does not have two columns.
        """
        InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        with open(InName) as InFile:
            lLines = InFile.read().splitlines()

        # Blank lines carry no data and would break the unpacking below.
        lvCol = [line.split('\t') for line in lLines if line.strip()]
        lQCol = [vCol for vCol in lvCol if vCol[0].startswith('q_')]
        lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]

        lQObj = [vCol[1] for vCol in lQCol]
        logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

        lDocNo = []
        llDocObj = []
        # None never equals a key from the file, so the first row always
        # opens a group (an "" sentinel would not if the key is "").
        LastDocNo = None
        for DocNo, ObjId in lDocCol:
            if DocNo != LastDocNo:
                llDocObj.append([])
                lDocNo.append(DocNo)
                LastDocNo = DocNo
            llDocObj[-1].append(ObjId)
        return lDocNo, lQObj, llDocObj
class LeToRFeatureExtractCenterC(cxBaseC):
    """Extract learning-to-rank features for query-document pairs.

    Features come from up to three extractors, selected by name in
    lFeatureGroup: 'givenfeature', 'termpairemb' and 'emblm'.
    """

    def Init(self):
        """Set every attribute to its default value."""
        cxBaseC.Init(self)
        self.Prepared = False
        self.Word2VecInName = ""
        self.Word2VecModel = None
        self.lFeatureGroup = []
        self.Searcher = IndriSearchCenterC()
        self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
        self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
        self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
        self.QRelCenter = AdhocQRelC()
        self.QRelIn = ""

    def SetConf(self, ConfIn):
        """Read the configuration ConfIn; configure only selected extractors.

        Returns:
            True.
        """
        cxBaseC.SetConf(self, ConfIn)
        self.Word2VecInName = self.conf.GetConf('word2vecin')
        self.lFeatureGroup = self.conf.GetConf('featuregroup')
        self.QRelIn = self.conf.GetConf('qrel')
        self.QRelCenter.Load(self.QRelIn)
        # A single configured group arrives as a scalar; normalise to list.
        if not isinstance(self.lFeatureGroup, list):
            self.lFeatureGroup = [self.lFeatureGroup]

        self.Searcher.SetConf(ConfIn)
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.SetConf(ConfIn)
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.SetConf(ConfIn)
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.SetConf(ConfIn)
        return True

    @staticmethod
    def ShowConf():
        """Show the configuration help of this class and its components."""
        cxBaseC.ShowConf()
        print('word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm')
        LeToRGivenFeatureExtractorC.ShowConf()
        EmbeddingTermPairFeatureExtractorC.ShowConf()
        EmbeddingLmFeatureExtractorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def Prepare(self):
        """Load the word2vec model and prepare the selected extractors.

        Does nothing after the first successful call.
        """
        if self.Prepared:
            return
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(
            self.Word2VecInName)
        logging.info('word2vec loaded')

        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.Prepare()
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.Prepare()
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.Prepare()

        self.Prepared = True
        return

    def Process(self, qid, query, doc):
        """Extract all selected features for one query-document pair.

        Returns:
            dict with the merged features of every selected extractor;
            a later extractor overwrites an earlier one on a name clash.
        """
        self.Prepare()
        hFeature = {}
        logging.debug('extracting for [%s][%s]', qid, doc.DocNo)
        if 'givenfeature' in self.lFeatureGroup:
            hFeature.update(
                self.GivenFeatureExtractor.Extract(qid, query, doc))
            logging.debug('given feature extracted')
        if 'termpairemb' in self.lFeatureGroup:
            hFeature.update(self.EmbTermPairFeatureExtractor.Extract(
                qid, query, doc, self.Word2VecModel))
            logging.debug('termpairemb feature extracted')
        if 'emblm' in self.lFeatureGroup:
            hFeature.update(self.EmbLmFeatureExtractor.Extract(
                qid, query, doc, self.Word2VecModel))
            logging.debug('emblm feature extracted')
        return hFeature

    def PipeLineRun(self, QInName, OutName):
        """Extract features for every retrieved document of every query.

        Args:
            QInName: path of a text file with one "qid<TAB>query" per line.
            OutName: output path, one LeToRDataBaseC.dumps() per line. The
                feature-name table is written to OutName + '_FeatureName'
                as "Id<TAB>name" lines.
        """
        hFeatureName = {}
        self.Prepare()

        with open(QInName) as QIn:
            lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]

        # "with" closes the output even when an extraction fails half-way.
        with open(OutName, 'w') as out:
            logging.info('start extracting for file [%s]', QInName)
            for qid, query in lQidQuery:
                lDoc = self.Searcher.RunQuery(query, qid)
                for doc in lDoc:
                    LTRData = LeToRDataBaseC()
                    LTRData.qid = qid
                    LTRData.DocNo = doc.DocNo
                    LTRData.hFeature = self.Process(qid, query, doc)
                    LTRData.score = self.QRelCenter.GetScore(qid, doc.DocNo)
                    # The name table is threaded through every record.
                    hFeatureName = LTRData.HashFeatureName(hFeatureName)
                    out.write('%s\n' % LTRData.dumps())
                logging.info('qid [%s] extracted', qid)

        with open(OutName + '_FeatureName', 'w') as NameOut:
            for name, Id in hFeatureName.items():
                NameOut.write('%d\t%s\n' % (Id, name))
        logging.info('finished')
        return
def ShowConf():
    """Show the configuration help of this class and the searcher.

    Calls the base-class ShowConf, prints this class's own keys, then
    calls IndriSearchCenterC.ShowConf.
    """
    cxBaseC.ShowConf()
    # Own keys, one per output line.
    print('docanain\noutdir\nin\ndoctextin')
    IndriSearchCenterC.ShowConf()
class DocAnaResSERPSplitterC(cxBaseC):
    """Split document annotation data into one file per query.

    For each query, the retrieved documents that have both annotation data
    and text are written to OutDir in a <doc>...</doc> layout.
    """

    def Init(self):
        """Set every attribute to its default value."""
        cxBaseC.Init(self)
        self.Searcher = IndriSearchCenterC()
        self.hDocAnaData = {}
        self.hDocText = {}
        self.OutDir = ''
        self.QInName = ""

    def SetConf(self, ConfIn):
        """Read the configuration ConfIn and load annotation and text data."""
        cxBaseC.SetConf(self, ConfIn)
        self.Searcher.SetConf(ConfIn)
        DocAnaIn = self.conf.GetConf('docanain')
        DocTextIn = self.conf.GetConf('doctextin')
        self.ReadDocAna(DocAnaIn, DocTextIn)
        self.OutDir = self.conf.GetConf('outdir')
        self.QInName = self.conf.GetConf('in')

    @staticmethod
    def ShowConf():
        """Show the configuration help of this class and the searcher."""
        cxBaseC.ShowConf()
        print('docanain\noutdir\nin\ndoctextin')
        IndriSearchCenterC.ShowConf()

    def ReadDocAna(self, DocAnaIn, DocTextIn):
        """Load the annotation file and the text file into two dicts.

        self.hDocAnaData maps the first whitespace-separated token of each
        annotation line to the whole line. self.hDocText is built from the
        part of each text line before the first '#', split on tabs; each
        such line must yield exactly two columns.

        Returns:
            True.
        """
        with open(DocAnaIn) as AnaIn:
            lLines = AnaIn.read().splitlines()
        self.hDocAnaData = dict([line.split()[0], line] for line in lLines)

        with open(DocTextIn) as TextIn:
            lLines = TextIn.read().splitlines()
        self.hDocText = dict(
            line.split('#')[0].strip().split('\t') for line in lLines)
        return True

    def DumpOneQ(self, qid, query):
        """Write the annotated documents of one query to its own file.

        The file is self.OutDir/<query with spaces replaced by '_'>.
        Documents lacking annotation data or text are skipped.

        Returns:
            True.
        """
        lDoc = self.Searcher.RunQuery(query, qid)
        OutName = self.OutDir + '/%s' % (query.replace(' ', '_'))
        # "with" closes the file even when a write fails half-way.
        with open(OutName, 'w') as out:
            for doc in lDoc:
                # Logical "or" (short-circuit) instead of bitwise "|".
                if (doc.DocNo not in self.hDocAnaData or
                        doc.DocNo not in self.hDocText):
                    continue
                out.write("<doc>" + '\n')
                vCol = self.hDocAnaData[doc.DocNo].split('\t')
                text = self.hDocText[doc.DocNo]
                out.write(vCol[0] + '\t' + text + '\n')
                if len(vCol) > 2:
                    vAna = vCol[1:]
                    # Annotation columns are written in groups of 8; "//"
                    # keeps the chunk count an integer; a trailing
                    # incomplete group is dropped.
                    for i in range(len(vAna) // 8):
                        out.write('\t'.join(vAna[8 * i:8 * i + 8]) + '\n')
                out.write("</doc>\n\n\n" + '\n')
        logging.info('[%s] data dumped', query)
        return True

    def Process(self):
        """Run DumpOneQ for every "qid<TAB>query" line of self.QInName."""
        with open(self.QInName) as QIn:
            lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]
        for qid, query in lQidQuery:
            self.DumpOneQ(qid, query)
        logging.info('finished')