class RawGraphPerEdgeFeatureConstructorC(SearchResDocGraphConstructorC):
    """
    Builds one edge-feature matrix per (feature name, doc graph) for each
    document retrieved for a query, and dumps each graph under
    OutDir/<FeatureName>/<qid>/<DocNo>.
    """

    def Init(self):
        SearchResDocGraphConstructorC.Init(self)
        # extractor producing per obj-obj-pair feature dicts
        self.EdgeFeatureCenter = ObjObjFeatureExtractCenterC()
        # Freebase object cache used to materialize node ids into objects
        self.ObjCenter = FbObjCacheCenterC()

    def SetConf(self, ConfIn):
        SearchResDocGraphConstructorC.SetConf(self, ConfIn)
        self.EdgeFeatureCenter.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        SearchResDocGraphConstructorC.ShowConf()
        ObjObjFeatureExtractCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

    def FormForOneQ(self, qid, query):
        """
        Form and dump edge matrices for one query.

        For every retrieved doc's knowledge graph, extract obj-obj feature
        dicts once, then slice out one numeric matrix per feature name and
        dump the graph per feature directory.
        Returns True on completion.
        """
        lDoc = self.Searcher.RunQuery(query, qid)
        lDocKg = [self.GraphFormer.FillDocGraph(doc.DocNo) for doc in lDoc]
        for DocKg in lDocKg:
            logging.info('forming edge mtx for [%s] [%d] obj', DocKg.DocNo,
                         len(DocKg.hNodeId))
            lObjId = DocKg.hNodeId.keys()
            lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
            # full obj x obj feature-dict matrix; extracted once per doc,
            # then projected per feature name below
            mhFeature = self.EdgeFeatureCenter.ExtractObjObjFeature(
                lObj, query)
            for FeatureName in self.EdgeFeatureCenter.FeatureDims():
                OutDir = self.OutDir + '/' + FeatureName + '/' + qid
                if not os.path.exists(OutDir):
                    os.makedirs(OutDir)
                llEdgeFeatureScore = [[
                    hFeature[FeatureName] for hFeature in lhFeature
                ] for lhFeature in mhFeature]
                DocKg.mEdgeMatrix = np.array(llEdgeFeatureScore)
                DocKg.dump(OutDir + '/' + DocKg.DocNo)
                # BUG FIX: log messages said "dummped"
                logging.debug('[%s] feature for doc [%s] dumped', FeatureName,
                              DocKg.DocNo)
            logging.info('[%s] dumped [%d] node', DocKg.DocNo, len(DocKg))
        logging.info('[%s-%s] doc kg formed', qid, query)
        return True
class LESRanker(cxBaseC): def Init(self): cxBaseC.Init(self) self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.Evaluator = AdhocEvaC() self.Inferener = LESInferencerC() self.QDocNodeDataDir = "" self.OrigQWeight = 0.5 self.UseQObjOnly = True @classmethod def ShowConf(cls): cxBaseC.ShowConf() IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() AdhocEvaC.ShowConf() print 'qdocnodedatadir\norigqweight 0.5\nqobjonly 1' def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.Evaluator.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/' self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight) self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1)) def LoadQDocObj(self, query): InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName( query) hQDocObj = {} for line in open(InName): key, ObjId = line.strip().split('\t') if not key in hQDocObj: hQDocObj[key] = [ObjId] else: hQDocObj[key].append(ObjId) logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj)) return hQDocObj def RankingForOneQ(self, qid, query): logging.info('Start LES ranking for [%s-%s]', qid, query) lDoc = self.Searcher.RunQuery(query, qid) logging.info('doc fetched') hQDocObj = self.LoadQDocObj(query) QKey = 'q_%s' % (qid) if not QKey in hQDocObj: #do nothing logging.info('query [%s] has no object, return raw raning', qid) return [doc.DocNo for doc in lDoc] lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]] lDocLESScore = [] LesCnt = 0 for doc in lDoc: if self.UseQObjOnly: lDocObj = lQObj else: if not doc.DocNo in hQDocObj: lDocLESScore.append(0) continue lDocObj = [ self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[doc.DocNo] ] score = self.Inferener.inference(query, doc, lQObj, lDocObj) if score != 0: #if 0, means the obj has no desp (or very short one), doesn't count as valid score LesCnt += 1 
lDocLESScore.append(score) #add average score to doc without annotation #using zero is not very proper AvgScore = sum(lDocLESScore) / float(LesCnt) lDocLESScore = [ item if item != 0 else AvgScore for item in lDocLESScore ] lScore= [self.OrigQWeight * math.exp(doc.score) + (1-self.OrigQWeight) * LESScore \ for doc,LESScore in zip(lDoc,lDocLESScore)] lDocNoScore = zip([doc.DocNo for doc in lDoc], lScore) lDocNoScore.sort(key=lambda item: item[1], reverse=True) lRankedDocNo = [item[0] for item in lDocNoScore] logging.info('query [%s] ranked', qid) return lRankedDocNo def Process(self, QIn, OutName): lQidQuery = [ line.split('\t') for line in open(QIn).read().splitlines() ] llDocNo = [self.RankingForOneQ(qid, query) for qid, query in lQidQuery] logging.info('start evaluation') lQid = [item[0] for item in lQidQuery] lQuery = [item[1] for item in lQidQuery] lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo) out = open(OutName, 'w') for qid, EvaRes in lPerQEvaRes: print >> out, qid + '\t' + EvaRes.dumps() out.close() logging.info('%s %s', lPerQEvaRes[-1][0], lPerQEvaRes[-1][1].dumps()) return True
class LESRanker(cxBaseC): def Init(self): cxBaseC.Init(self) self.ObjCenter = FbObjCacheCenterC() self.Inferener = LESInferencerC() self.DocKgDir = "" self.hQObj = {} self.OrigQWeight = 0.5 @classmethod def ShowConf(cls): cxBaseC.ShowConf() FbObjCacheCenterC.ShowConf() print 'origqweight 0.5' def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.DocKgDir = self.conf.GetConf('dockgdir') QAnaInName = self.conf.GetConf('qanain') self.LoadQObj(QAnaInName) self.ObjCenter.SetConf(ConfIn) self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight) def LoadQObj(self,QAnaInName): for line in open(QAnaInName).read().splitlines(): vCol = line.strip().split('\t') qid = vCol[0] ObjId = vCol[2] score = vCol[-1] if not qid in self.hQObj: self.hQObj[qid] = [[ObjId,score]] else: self.hQObj[qid].append([ObjId,score]) logging.info('qobj loaded from [%s]',QAnaInName) return True def RankScoreForDoc(self,qid,query,doc): DocKg = SearchResDocGraphConstructorC.LoadDocGraph(self.DocKgDir, qid, doc.DocNo) lQObjId = [item[0] for item in self.hQObj[qid]] lDocObjId = DocKg.hNodeId.keys() lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId] lDocObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId] score = self.Inferener.inference(query, doc, lQObj, lDocObj) return score def Rank(self,qid,query,lDoc): if not qid in self.hQObj: logging.warn('qid [%s] no ana obj, withdraw to given score',qid) return [doc.DocNo for doc in lDoc] lScore = [self.RankScoreForDoc(qid, query, doc) for doc in lDoc] lDocNoScore = zip([doc.DocNo for doc in lDoc],lScore) lDocNoScore.sort(key=lambda item: item[1], reverse = True) lRankRes = [item[0] for item in lDocNoScore] return lRankRes
class GraphFullFeatureExtractCenterC(cxBaseC): def Init(self): cxBaseC.Init(self) self.NodeDir = "" self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.QDocFeatureExtractor = LeToRFeatureExtractCenterC() self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC() self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC() self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC() def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.NodeDir = self.conf.GetConf('nodedir') + '/' self.Searcher.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.QDocFeatureExtractor.SetConf(ConfIn) self.QObjFeatureExtractor.SetConf(ConfIn) self.DocObjFeatureExtractor.SetConf(ConfIn) self.ObjObjFeatureExtractor.SetConf(ConfIn) logging.info('graph full feature extractor conf setted') @classmethod def ShowConf(cls): cxBaseC.ShowConf() print cls.__name__ print 'nodedir' IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() LeToRFeatureExtractCenterC.ShowConf() FbQObjFeatureExtractCenterC.ShowConf() FbObjDocFeatureExtractCenterC.ShowConf() ObjObjFeatureExtractCenterC.ShowConf() def FormulateNodes(self, qid, query): ''' get ldoc and read lObjId fill lObjId ''' logging.info('formulating node for q [%s][%s]', qid, query) lDoc = self.Searcher.RunQuery(query, qid) lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes( query, self.NodeDir) #match lDoc dim lDocNo dim lDoc = IndriSearchCenterC.RearrangeDocOrder(lDoc, lDocNo) lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId] llDocObj = [[self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId] for lDocObjId in llDocObjId] while len(llDocObj) < len(lDoc): #add empty list for docs have no objects (thus will restrict to EsdRank) #if lQObj is also empty, then it is LeToR llDocObj.append([]) logging.info('q[%s] all node fetched, q node %s', qid, json.dumps([Obj.GetId() for Obj in lQObj])) return lDoc, lQObj, llDocObj def Process(self, qid, query, OutDir): ''' ''' lDoc, lQObj, llDocObj = 
self.FormulateNodes(qid, query) for doc, lDocObj in zip(lDoc, llDocObj): hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature = self.ExtractFeatureForOneQDoc( qid, query, doc, lQObj + lDocObj) self.DumpPerQRes(qid, query, doc, lQObj + lDocObj, hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature, OutDir) logging.info('q [%s] processed') return True def PipeRun(self, QInName, OutDir): lQidQuery = [ line.split('\t') for line in open(QInName).read().splitlines() ] for qid, query in lQidQuery: self.Process(qid, query, OutDir) logging.info('queries in [%s] processed features at [%s]', QInName, OutDir) return True def ExtractFeatureForOneQDoc(self, qid, query, doc, lObj): #if wanna speed up, cache features #for clearity, now just extract multiple times hQDocFeature = self.QDocFeatureExtractor.Process(qid, query, doc) logging.debug('q[%s][%s] ltr feature extracted', query, doc.DocNo) lhQObjFeature = self.QObjFeatureExtractor.ProcessOneQuery([qid, query], lObj) logging.debug('q[%s][%s] obj feature extracted', query, doc.DocNo) lhDocObjFeature = self.DocObjFeatureExtractor.ProcessOneQueryDocPair( [qid, query], doc, lObj) logging.debug('q[%s][%s] doc obj feature extracted', query, doc.DocNo) llhObjObjFeature = self.ObjObjFeatureExtractor.Process( qid, query, lObj) #symetric matrix logging.debug('q[%s] [%s] obj obj feature extracted', query, doc.DocNo) logging.debug('q [%s][%s] all doc graph feature extracted', query, doc.DocNo) return hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature def DumpPerQRes(self, qid, query, doc, lObj, hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature, OutDir): ''' raw: a dir for this q a file for each doc node a, node b, hFeature.json ''' if not os.path.exists(OutDir + '/' + qid): os.makedirs(OutDir + '/' + qid) OutName = OutDir + '/' + qid + '/' + doc.DocNo out = open(OutName, 'w') #q doc print >> out, 'q_%s' % (qid) + '\t' + doc.DocNo + '\t' + json.dumps( hQDocFeature) #obj doc for Obj, hDocObjFeature 
in zip(lObj, lhDocObjFeature): print >> out, Obj.GetId() + '\t' + doc.DocNo + '\t' + json.dumps( hDocObjFeature) #q obj for Obj, hQObjFeature in zip(lObj, lhQObjFeature): print >> out, 'q_%s' % ( qid) + '\t' + Obj.GetId() + '\t' + json.dumps(hQObjFeature) print >> out, Obj.GetId() + '\t' + 'q_%s' % ( qid) + '\t' + json.dumps(hQObjFeature) #make it symmetric #obj obj for i in range(len(lObj)): for j in range(len(lObj)): if i == j: continue print >> out, lObj[i].GetId() + '\t' + lObj[j].GetId( ) + '\t' + json.dumps(llhObjObjFeature[i][j]) logging.info('q[%s] doc [%s] graph dumped to file [%s]', qid, doc.DocNo, OutName) return True
# Script: expand each annotated object in a TSV file with its Freebase
# neighbors, writing the original line followed by one line per neighbor.
# NOTE(review): relies on `site` being imported earlier in the file --
# not visible in this chunk; confirm before running standalone.
site.addsitedir('/bos/usr0/cx/PyCode/GoogleAPI')
from ObjCenter.FbObjCacheCenter import FbObjCacheCenterC
from cxBase.Conf import cxConfC
import sys

# usage: script <conf>; the conf must provide 'in' and 'out'
if 2 != len(sys.argv):
    FbObjCacheCenterC.ShowConf()
    print 'in\nout'
    sys.exit()

conf = cxConfC(sys.argv[1])
CacheCenter = FbObjCacheCenterC(sys.argv[1])
InName = conf.GetConf('in')
OutName = conf.GetConf('out')
out = open(OutName, 'w')
for line in open(InName):
    line = line.strip()
    vCol = line.split('\t')
    # column 2 holds the annotated object's id
    ObjId = vCol[2]
    lLinkedObj = CacheCenter.FetchObj(ObjId).GetNeighbor()
    # echo the original annotation line, then one line per neighbor:
    # first two original columns + neighbor id/name + fixed tag + edge name
    print >> out, line
    for edge, ApiObj in lLinkedObj:
        print >> out, '\t'.join(vCol[:2]) + '\t' + ApiObj.GetId(
        ) + '\t' + ApiObj.GetName() + '\tNeighbor\t1\t' + edge
out.close()
print "finished"
class EdgeFeatureExtractCenterC(cxBaseC): def Init(self): cxBaseC.Init(self) self.lQObjFeatureGroup = [] self.lObjObjFeatureGroup = [] self.lDocObjFeatureGroup = [] self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC() self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC() self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC() self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC() self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC() self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.NodeDir = "" def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.NodeDir = self.conf.GetConf('nodedir') + '/' self.lQObjFeatureGroup = self.conf.GetConf('qobjfeaturegroup', self.lQObjFeatureGroup) self.lDocObjFeatureGroup = self.conf.GetConf('docobjfeaturegroup', self.lDocObjFeatureGroup) self.lObjObjFeatureGroup = self.conf.GetConf('objobjfeaturegroup', self.lObjObjFeatureGroup) if 'ana' in self.lQObjFeatureGroup: self.QObjAnaExtractor.SetConf(ConfIn) if 'facc' in self.lDocObjFeatureGroup: self.DocObjFaccExtractor.SetConf(ConfIn) if 'kg' in self.lObjObjFeatureGroup: self.ObjObjKGExtractor.SetConf(ConfIn) if 'precalc' in self.lObjObjFeatureGroup: self.ObjObjPreCalcExtractor.SetConf(ConfIn) if 'textsim' in self.lObjObjFeatureGroup: self.ObjObjTextSimExtractor.SetConf(ConfIn) logging.info('edge feature center confs setted') @staticmethod def ShowConf(): cxBaseC.ShowConf() IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() print 'nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup' QueryObjEdgeFeatureAnaExtractorC.ShowConf() DocObjEdgeFeatureFaccExtractorC.ShowConf() ObjObjEdgeFeatureKGExtractorC.ShowConf() ObjObjEdgeFeaturePreCalcSimExtractorC.ShowConf() ObjObjEdgeFeatureTextSimExtractorC.ShowConf() def FormulateNodes(self, qid, query): ''' get ldoc and read lObjId fill lObjId ''' logging.info('formulating node for q [%s][%s]', 
qid, query) lDoc = self.Searcher.RunQuery(query, qid) lObjId = open(self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(query)).read( ).splitlines() lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId] logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj)) return lDoc, lObj def ExtractPerQObj(self, qid, query, obj): hFeature = {} logging.debug('start extracting q[%s]-obj[%s] feature', query, obj.GetId()) if 'ana' in self.lQObjFeatureGroup: hFeature.update(self.QObjAnaExtractor.process(qid, query, obj)) logging.debug('q[%s]-obj[%s] feature extracted', query, obj.GetId()) return hFeature def ExtractQObjFeature(self, qid, query, lObj): lhFeature = [] logging.info('start extracting [%s][%s] q-obj feature [%d] obj', qid, query, len(lObj)) for obj in lObj: hFeature = self.ExtractPerQObj(qid, query, obj) lhFeature.append(hFeature) logging.info('q obj feature extracted') return lhFeature def ExtractPerDocObj(self, doc, obj): hFeature = {} logging.debug('start extracting doc[%s]-obj[%s] feature', doc.DocNo, obj.GetId()) if 'facc' in self.lDocObjFeatureGroup: hFeature.update(self.DocObjFaccExtractor.process(doc, obj)) logging.debug('doc[%s]-obj[%s] feature extracted', doc.DocNo, obj.GetId()) return hFeature def ExtractDocObjFeature(self, lDoc, lObj): llhFeature = [] #doc \times obj logging.info('start extract [%d] doc - [%d] obj feature mtx', len(lDoc), len(lObj)) for doc in lDoc: lhFeature = [] for obj in lObj: hFeature = self.ExtractPerDocObj(doc, obj) lhFeature.append(hFeature) llhFeature.append(lhFeature) logging.info('doc obj feature extracted') return llhFeature def ExtractPerObjObj(self, ObjA, ObjB, query): hFeature = {} logging.debug('start extracting for obj pair [%s-%s]', ObjA.GetId(), ObjB.GetId()) if 'kg' in self.lObjObjFeatureGroup: hFeature.update(self.ObjObjKGExtractor.process(ObjA, ObjB)) if 'precalc' in self.lObjObjFeatureGroup: hFeature.update( self.ObjObjPreCalcExtractor.process(ObjA, ObjB, query)) if 'textsim' in 
self.lObjObjFeatureGroup: hFeature.update(self.ObjObjTextSimExtractor.process(ObjA, ObjB)) logging.debug('obj pair [%s-%s] feature extracted', ObjA.GetId(), ObjB.GetId()) return hFeature def ExtractObjObjFeature(self, lObj, query): llhFeature = [] #obj -> obj, diagonal is empty logging.info('start extract [%d] obj pair feature mtx', len(lObj)) for ObjA in lObj: lhFeature = [] for ObjB in lObj: if ObjA.GetId() == ObjB.GetId(): continue hFeature = self.ExtractPerObjObj(ObjA, ObjB, query) lhFeature.append(hFeature) llhFeature.append(lhFeature) logging.info('obj obj feature extracted') return llhFeature def Process(self, qid, query): lDoc, lObj = self.FormulateNodes(qid, query) logging.info('nodes fetched') lQObjFeature = self.ExtractQObjFeature(qid, query, lObj) llDocObjFeature = self.ExtractDocObjFeature(lDoc, lObj) llObjObjFeature = self.ExtractObjObjFeature(lObj, query) return lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature def DumpRes(self, OutName, query, lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature): out = open(OutName, 'w') for obj, hFeature in zip(lObj, lQObjFeature): print >> out, query + '\t' + obj.GetId() + '\t' + json.dumps( hFeature) for doc, lhFeature in zip(lDoc, llDocObjFeature): for obj, hFeature in zip(lObj, lhFeature): print >> out, doc.DocNo + '\t' + obj.GetId( ) + '\t' + json.dumps(hFeature) for ObjA, lhFeature in zip(lObj, llObjObjFeature): for ObjB, hFeature in zip(lObj, lhFeature): print >> out, ObjA.GetId() + '\t' + ObjB.GetId( ) + '\t' + json.dumps(hFeature) out.close() logging.info('query [%s] feature dumped', query) def PipeRun(self, QInName, OutDir): ''' for now: output raw type each file is a query's edge features each line is query|doc|obj \t obj \t json.dumps(hFeature) ''' lQidQuery = [ line.split('\t') for line in open(QInName).read().splitlines() ] for qid, query in lQidQuery: logging.info('start extracting for [%s][%s]', qid, query) lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = self.Process( 
qid, query) OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName( query) logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query, OutName) self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature) logging.info('all finished') return