class RawGraphPerEdgeFeatureConstructorC(SearchResDocGraphConstructorC):
    """
    Builds per-edge feature matrices for the doc graphs of a query's
    retrieved documents and dumps one graph file per (feature, qid, DocNo).
    """

    def Init(self):
        SearchResDocGraphConstructorC.Init(self)
        self.EdgeFeatureCenter = ObjObjFeatureExtractCenterC()  # obj-obj edge feature extractor
        self.ObjCenter = FbObjCacheCenterC()  # Freebase object cache

    def SetConf(self, ConfIn):
        SearchResDocGraphConstructorC.SetConf(self, ConfIn)
        self.EdgeFeatureCenter.SetConf(ConfIn)
        self.ObjCenter.SetConf(ConfIn)

    @staticmethod
    def ShowConf():
        SearchResDocGraphConstructorC.ShowConf()
        ObjObjFeatureExtractCenterC.ShowConf()
        FbObjCacheCenterC.ShowConf()

    def FormForOneQ(self, qid, query):
        """
        Form and dump the per-feature edge matrices for one query.

        For each retrieved doc's graph, extract obj-obj features and dump
        the graph (with mEdgeMatrix set) under OutDir/<feature>/<qid>/<DocNo>.
        Returns True.
        """
        lDoc = self.Searcher.RunQuery(query, qid)
        lDocKg = [self.GraphFormer.FillDocGraph(doc.DocNo) for doc in lDoc]
        for DocKg in lDocKg:
            logging.info('forming edge mtx for [%s] [%d] obj', DocKg.DocNo,
                         len(DocKg.hNodeId))
            #BUGFIX: sort object ids by their node index (hNodeId maps
            #ObjId -> index) so matrix rows/cols align with the graph's node
            #numbering; plain .keys() iterated in arbitrary dict order
            lItem = DocKg.hNodeId.items()
            lItem.sort(key=lambda item: item[1])
            lObjId = [item[0] for item in lItem]
            lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
            mhFeature = self.EdgeFeatureCenter.ExtractObjObjFeature(
                lObj, query)
            for FeatureName in self.EdgeFeatureCenter.FeatureDims():
                OutDir = self.OutDir + '/' + FeatureName + '/' + qid
                if not os.path.exists(OutDir):
                    os.makedirs(OutDir)
                #slice one feature dimension out of the per-pair dicts
                llEdgeFeatureScore = [[
                    hFeature[FeatureName] for hFeature in lhFeature
                ] for lhFeature in mhFeature]
                DocKg.mEdgeMatrix = np.array(llEdgeFeatureScore)
                DocKg.dump(OutDir + '/' + DocKg.DocNo)
                logging.debug('[%s] feature for doc [%s] dumped', FeatureName,
                              DocKg.DocNo)
            logging.info('[%s] dumped [%d] node', DocKg.DocNo, len(DocKg))
        logging.info('[%s-%s] doc kg formed', qid, query)
        return True
class LESRanker(cxBaseC): def Init(self): cxBaseC.Init(self) self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.Evaluator = AdhocEvaC() self.Inferener = LESInferencerC() self.QDocNodeDataDir = "" self.OrigQWeight = 0.5 self.UseQObjOnly = True @classmethod def ShowConf(cls): cxBaseC.ShowConf() IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() AdhocEvaC.ShowConf() print 'qdocnodedatadir\norigqweight 0.5\nqobjonly 1' def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.Evaluator.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.QDocNodeDataDir = self.conf.GetConf('qdocnodedatadir') + '/' self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight) self.UseQObjOnly = bool(self.conf.GetConf('qobjonly', 1)) def LoadQDocObj(self, query): InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName( query) hQDocObj = {} for line in open(InName): key, ObjId = line.strip().split('\t') if not key in hQDocObj: hQDocObj[key] = [ObjId] else: hQDocObj[key].append(ObjId) logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj)) return hQDocObj def RankingForOneQ(self, qid, query): logging.info('Start LES ranking for [%s-%s]', qid, query) lDoc = self.Searcher.RunQuery(query, qid) logging.info('doc fetched') hQDocObj = self.LoadQDocObj(query) QKey = 'q_%s' % (qid) if not QKey in hQDocObj: #do nothing logging.info('query [%s] has no object, return raw raning', qid) return [doc.DocNo for doc in lDoc] lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[QKey]] lDocLESScore = [] LesCnt = 0 for doc in lDoc: if self.UseQObjOnly: lDocObj = lQObj else: if not doc.DocNo in hQDocObj: lDocLESScore.append(0) continue lDocObj = [ self.ObjCenter.FetchObj(ObjId) for ObjId in hQDocObj[doc.DocNo] ] score = self.Inferener.inference(query, doc, lQObj, lDocObj) if score != 0: #if 0, means the obj has no desp (or very short one), doesn't count as valid score LesCnt += 1 
lDocLESScore.append(score) #add average score to doc without annotation #using zero is not very proper AvgScore = sum(lDocLESScore) / float(LesCnt) lDocLESScore = [ item if item != 0 else AvgScore for item in lDocLESScore ] lScore= [self.OrigQWeight * math.exp(doc.score) + (1-self.OrigQWeight) * LESScore \ for doc,LESScore in zip(lDoc,lDocLESScore)] lDocNoScore = zip([doc.DocNo for doc in lDoc], lScore) lDocNoScore.sort(key=lambda item: item[1], reverse=True) lRankedDocNo = [item[0] for item in lDocNoScore] logging.info('query [%s] ranked', qid) return lRankedDocNo def Process(self, QIn, OutName): lQidQuery = [ line.split('\t') for line in open(QIn).read().splitlines() ] llDocNo = [self.RankingForOneQ(qid, query) for qid, query in lQidQuery] logging.info('start evaluation') lQid = [item[0] for item in lQidQuery] lQuery = [item[1] for item in lQidQuery] lPerQEvaRes = self.Evaluator.EvaluateFullRes(lQid, lQuery, llDocNo) out = open(OutName, 'w') for qid, EvaRes in lPerQEvaRes: print >> out, qid + '\t' + EvaRes.dumps() out.close() logging.info('%s %s', lPerQEvaRes[-1][0], lPerQEvaRes[-1][1].dumps()) return True
class LESRanker(cxBaseC): def Init(self): cxBaseC.Init(self) self.ObjCenter = FbObjCacheCenterC() self.Inferener = LESInferencerC() self.DocKgDir = "" self.hQObj = {} self.OrigQWeight = 0.5 @classmethod def ShowConf(cls): cxBaseC.ShowConf() FbObjCacheCenterC.ShowConf() print 'origqweight 0.5' def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.DocKgDir = self.conf.GetConf('dockgdir') QAnaInName = self.conf.GetConf('qanain') self.LoadQObj(QAnaInName) self.ObjCenter.SetConf(ConfIn) self.OrigQWeight = self.conf.GetConf('origqweight', self.OrigQWeight) def LoadQObj(self,QAnaInName): for line in open(QAnaInName).read().splitlines(): vCol = line.strip().split('\t') qid = vCol[0] ObjId = vCol[2] score = vCol[-1] if not qid in self.hQObj: self.hQObj[qid] = [[ObjId,score]] else: self.hQObj[qid].append([ObjId,score]) logging.info('qobj loaded from [%s]',QAnaInName) return True def RankScoreForDoc(self,qid,query,doc): DocKg = SearchResDocGraphConstructorC.LoadDocGraph(self.DocKgDir, qid, doc.DocNo) lQObjId = [item[0] for item in self.hQObj[qid]] lDocObjId = DocKg.hNodeId.keys() lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId] lDocObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId] score = self.Inferener.inference(query, doc, lQObj, lDocObj) return score def Rank(self,qid,query,lDoc): if not qid in self.hQObj: logging.warn('qid [%s] no ana obj, withdraw to given score',qid) return [doc.DocNo for doc in lDoc] lScore = [self.RankScoreForDoc(qid, query, doc) for doc in lDoc] lDocNoScore = zip([doc.DocNo for doc in lDoc],lScore) lDocNoScore.sort(key=lambda item: item[1], reverse = True) lRankRes = [item[0] for item in lDocNoScore] return lRankRes
class GraphFullFeatureExtractCenterC(cxBaseC): def Init(self): cxBaseC.Init(self) self.NodeDir = "" self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.QDocFeatureExtractor = LeToRFeatureExtractCenterC() self.QObjFeatureExtractor = FbQObjFeatureExtractCenterC() self.DocObjFeatureExtractor = FbObjDocFeatureExtractCenterC() self.ObjObjFeatureExtractor = ObjObjFeatureExtractCenterC() def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.NodeDir = self.conf.GetConf('nodedir') + '/' self.Searcher.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.QDocFeatureExtractor.SetConf(ConfIn) self.QObjFeatureExtractor.SetConf(ConfIn) self.DocObjFeatureExtractor.SetConf(ConfIn) self.ObjObjFeatureExtractor.SetConf(ConfIn) logging.info('graph full feature extractor conf setted') @classmethod def ShowConf(cls): cxBaseC.ShowConf() print cls.__name__ print 'nodedir' IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() LeToRFeatureExtractCenterC.ShowConf() FbQObjFeatureExtractCenterC.ShowConf() FbObjDocFeatureExtractCenterC.ShowConf() ObjObjFeatureExtractCenterC.ShowConf() def FormulateNodes(self, qid, query): ''' get ldoc and read lObjId fill lObjId ''' logging.info('formulating node for q [%s][%s]', qid, query) lDoc = self.Searcher.RunQuery(query, qid) lDocNo, lQObjId, llDocObjId = NodeCollectorCenterC.LoadRawFormatNodeRes( query, self.NodeDir) #match lDoc dim lDocNo dim lDoc = IndriSearchCenterC.RearrangeDocOrder(lDoc, lDocNo) lQObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lQObjId] llDocObj = [[self.ObjCenter.FetchObj(ObjId) for ObjId in lDocObjId] for lDocObjId in llDocObjId] while len(llDocObj) < len(lDoc): #add empty list for docs have no objects (thus will restrict to EsdRank) #if lQObj is also empty, then it is LeToR llDocObj.append([]) logging.info('q[%s] all node fetched, q node %s', qid, json.dumps([Obj.GetId() for Obj in lQObj])) return lDoc, lQObj, llDocObj def Process(self, qid, query, OutDir): ''' ''' lDoc, lQObj, llDocObj = 
self.FormulateNodes(qid, query) for doc, lDocObj in zip(lDoc, llDocObj): hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature = self.ExtractFeatureForOneQDoc( qid, query, doc, lQObj + lDocObj) self.DumpPerQRes(qid, query, doc, lQObj + lDocObj, hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature, OutDir) logging.info('q [%s] processed') return True def PipeRun(self, QInName, OutDir): lQidQuery = [ line.split('\t') for line in open(QInName).read().splitlines() ] for qid, query in lQidQuery: self.Process(qid, query, OutDir) logging.info('queries in [%s] processed features at [%s]', QInName, OutDir) return True def ExtractFeatureForOneQDoc(self, qid, query, doc, lObj): #if wanna speed up, cache features #for clearity, now just extract multiple times hQDocFeature = self.QDocFeatureExtractor.Process(qid, query, doc) logging.debug('q[%s][%s] ltr feature extracted', query, doc.DocNo) lhQObjFeature = self.QObjFeatureExtractor.ProcessOneQuery([qid, query], lObj) logging.debug('q[%s][%s] obj feature extracted', query, doc.DocNo) lhDocObjFeature = self.DocObjFeatureExtractor.ProcessOneQueryDocPair( [qid, query], doc, lObj) logging.debug('q[%s][%s] doc obj feature extracted', query, doc.DocNo) llhObjObjFeature = self.ObjObjFeatureExtractor.Process( qid, query, lObj) #symetric matrix logging.debug('q[%s] [%s] obj obj feature extracted', query, doc.DocNo) logging.debug('q [%s][%s] all doc graph feature extracted', query, doc.DocNo) return hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature def DumpPerQRes(self, qid, query, doc, lObj, hQDocFeature, lhQObjFeature, lhDocObjFeature, llhObjObjFeature, OutDir): ''' raw: a dir for this q a file for each doc node a, node b, hFeature.json ''' if not os.path.exists(OutDir + '/' + qid): os.makedirs(OutDir + '/' + qid) OutName = OutDir + '/' + qid + '/' + doc.DocNo out = open(OutName, 'w') #q doc print >> out, 'q_%s' % (qid) + '\t' + doc.DocNo + '\t' + json.dumps( hQDocFeature) #obj doc for Obj, hDocObjFeature 
in zip(lObj, lhDocObjFeature): print >> out, Obj.GetId() + '\t' + doc.DocNo + '\t' + json.dumps( hDocObjFeature) #q obj for Obj, hQObjFeature in zip(lObj, lhQObjFeature): print >> out, 'q_%s' % ( qid) + '\t' + Obj.GetId() + '\t' + json.dumps(hQObjFeature) print >> out, Obj.GetId() + '\t' + 'q_%s' % ( qid) + '\t' + json.dumps(hQObjFeature) #make it symmetric #obj obj for i in range(len(lObj)): for j in range(len(lObj)): if i == j: continue print >> out, lObj[i].GetId() + '\t' + lObj[j].GetId( ) + '\t' + json.dumps(llhObjObjFeature[i][j]) logging.info('q[%s] doc [%s] graph dumped to file [%s]', qid, doc.DocNo, OutName) return True
class BoeLmWeighterC(BoeLmC): def __init__(self,ConfIn = ""): self.Init() if "" != ConfIn: self.SetConf(ConfIn) def Init(self): BoeLmC.Init(self) self.DocTextDir = "" self.ObjCenter = FbObjCacheCenterC() self.CtfCenter = TermCtfC() self.lInferenceWeight = [1,0,0] self.hDocText = {} def SetConf(self,ConfIn): conf = cxConfC(ConfIn) self.DocTextDir = conf.GetConf('doctextdir') self.LoadDocText() self.ObjCenter.SetConf(ConfIn) CtfInName = conf.GetConf('objctf') self.CtfCenter.Load(CtfInName) @classmethod def ShowConf(): print 'doctextdir\nobjctf' FbObjCacheCenterC.ShowConf() def LoadDocText(self): for fname in WalkDir(self.DocTextDir): for line in open(fname): DocNo,text = line.strip().split('\t') self.hDocText[DocNo] = text logging.info('doc text loaded') def GetAllIdf(self,DocKg): lItem = DocKg.hNodeId.items() lItem.sort(key=lambda item:item[1]) lObjId = [item[0] for item in lItem] lRes = [] for ObjId in lObjId: idf = self.CtfCenter.GetLogIdf(ObjId) lRes.append(idf) return lRes def GetAllTf(self,DocKg): return list(DocKg.vNodeWeight) def GetAllTextCosine(self,DocKg): DocText = "" if DocKg.DocNo in self.hDocText: DocText = self.hDocText[DocKg.DocNo] lCos = [] if "" == DocText: return [0] * len(DocKg) DocLm = LmBaseC(DocText) lItem = DocKg.hNodeId.items() lItem.sort(key=lambda item:item[1]) lObjId = [item[0] for item in lItem] for ObjId in lObjId: desp = self.ObjCenter.FetchObjDesp(ObjId) lm = LmBaseC(desp) lCos.append(LmBaseC.Cosine(lm, DocLm)) return lCos def GetTextCosine(self,ObjId,DocKg): DocText = "" if DocKg.DocNo in self.hDocText: DocText = self.hDocText[DocKg.DocNo] DocLm = LmBaseC(DocText) desp = self.ObjCenter.FetchObjDesp(ObjId) lm = LmBaseC(desp) score = LmBaseC.Cosine(lm, DocLm) if 0 == score: return self.MinLogProb return math.log(score) def LinearWeightTfIdfTextSim(self,ObjId,DocKg,TfScore = 1,IdfScore = 0, TextSimScore = 0): if not ObjId in DocKg: return self.MinLogProb lTf = np.zeros(len(DocKg)) lIdf = np.zeros(len(DocKg)) if TfScore != 0: lTf = 
np.array(self.GetAllTf(DocKg)) if IdfScore != 0: lIdf = np.array(self.GetAllIdf(DocKg)) # lCos = np.array(self.GetAllTextCosine(DocKg)) TextSim = 0 if TextSimScore != 0: TextSim = self.GetTextCosine(ObjId,DocKg) W = np.array([TfScore,IdfScore,TextSimScore]) W = W / float(sum(W)) lScore = lTf * W[0] + lIdf * W[1] res = self.MinLogProb * (W[0] + W[1]) if ObjId in DocKg: p = DocKg.hNodeId[ObjId] res = lScore[p] res = res + TextSim * TextSimScore return res def inference(self, ObjId, DocKg): return self.LinearWeightTfIdfTextSim(ObjId, DocKg, self.lInferenceWeight[0], self.lInferenceWeight[1], self.lInferenceWeight[2])
class EdgeFeatureExtractCenterC(cxBaseC): def Init(self): cxBaseC.Init(self) self.lQObjFeatureGroup = [] self.lObjObjFeatureGroup = [] self.lDocObjFeatureGroup = [] self.QObjAnaExtractor = QueryObjEdgeFeatureAnaExtractorC() self.DocObjFaccExtractor = DocObjEdgeFeatureFaccExtractorC() self.ObjObjKGExtractor = ObjObjEdgeFeatureKGExtractorC() self.ObjObjPreCalcExtractor = ObjObjEdgeFeaturePreCalcSimExtractorC() self.ObjObjTextSimExtractor = ObjObjEdgeFeatureTextSimExtractorC() self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.NodeDir = "" def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.NodeDir = self.conf.GetConf('nodedir') + '/' self.lQObjFeatureGroup = self.conf.GetConf('qobjfeaturegroup', self.lQObjFeatureGroup) self.lDocObjFeatureGroup = self.conf.GetConf('docobjfeaturegroup', self.lDocObjFeatureGroup) self.lObjObjFeatureGroup = self.conf.GetConf('objobjfeaturegroup', self.lObjObjFeatureGroup) if 'ana' in self.lQObjFeatureGroup: self.QObjAnaExtractor.SetConf(ConfIn) if 'facc' in self.lDocObjFeatureGroup: self.DocObjFaccExtractor.SetConf(ConfIn) if 'kg' in self.lObjObjFeatureGroup: self.ObjObjKGExtractor.SetConf(ConfIn) if 'precalc' in self.lObjObjFeatureGroup: self.ObjObjPreCalcExtractor.SetConf(ConfIn) if 'textsim' in self.lObjObjFeatureGroup: self.ObjObjTextSimExtractor.SetConf(ConfIn) logging.info('edge feature center confs setted') @staticmethod def ShowConf(): cxBaseC.ShowConf() IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() print 'nodedir\nqobjfeaturegroup\ndocobjfeaturegroup\nobjobjfeaturegroup' QueryObjEdgeFeatureAnaExtractorC.ShowConf() DocObjEdgeFeatureFaccExtractorC.ShowConf() ObjObjEdgeFeatureKGExtractorC.ShowConf() ObjObjEdgeFeaturePreCalcSimExtractorC.ShowConf() ObjObjEdgeFeatureTextSimExtractorC.ShowConf() def FormulateNodes(self, qid, query): ''' get ldoc and read lObjId fill lObjId ''' logging.info('formulating node for q [%s][%s]', 
qid, query) lDoc = self.Searcher.RunQuery(query, qid) lObjId = open(self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(query)).read( ).splitlines() lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId] logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj)) return lDoc, lObj def ExtractPerQObj(self, qid, query, obj): hFeature = {} logging.debug('start extracting q[%s]-obj[%s] feature', query, obj.GetId()) if 'ana' in self.lQObjFeatureGroup: hFeature.update(self.QObjAnaExtractor.process(qid, query, obj)) logging.debug('q[%s]-obj[%s] feature extracted', query, obj.GetId()) return hFeature def ExtractQObjFeature(self, qid, query, lObj): lhFeature = [] logging.info('start extracting [%s][%s] q-obj feature [%d] obj', qid, query, len(lObj)) for obj in lObj: hFeature = self.ExtractPerQObj(qid, query, obj) lhFeature.append(hFeature) logging.info('q obj feature extracted') return lhFeature def ExtractPerDocObj(self, doc, obj): hFeature = {} logging.debug('start extracting doc[%s]-obj[%s] feature', doc.DocNo, obj.GetId()) if 'facc' in self.lDocObjFeatureGroup: hFeature.update(self.DocObjFaccExtractor.process(doc, obj)) logging.debug('doc[%s]-obj[%s] feature extracted', doc.DocNo, obj.GetId()) return hFeature def ExtractDocObjFeature(self, lDoc, lObj): llhFeature = [] #doc \times obj logging.info('start extract [%d] doc - [%d] obj feature mtx', len(lDoc), len(lObj)) for doc in lDoc: lhFeature = [] for obj in lObj: hFeature = self.ExtractPerDocObj(doc, obj) lhFeature.append(hFeature) llhFeature.append(lhFeature) logging.info('doc obj feature extracted') return llhFeature def ExtractPerObjObj(self, ObjA, ObjB, query): hFeature = {} logging.debug('start extracting for obj pair [%s-%s]', ObjA.GetId(), ObjB.GetId()) if 'kg' in self.lObjObjFeatureGroup: hFeature.update(self.ObjObjKGExtractor.process(ObjA, ObjB)) if 'precalc' in self.lObjObjFeatureGroup: hFeature.update( self.ObjObjPreCalcExtractor.process(ObjA, ObjB, query)) if 'textsim' in 
self.lObjObjFeatureGroup: hFeature.update(self.ObjObjTextSimExtractor.process(ObjA, ObjB)) logging.debug('obj pair [%s-%s] feature extracted', ObjA.GetId(), ObjB.GetId()) return hFeature def ExtractObjObjFeature(self, lObj, query): llhFeature = [] #obj -> obj, diagonal is empty logging.info('start extract [%d] obj pair feature mtx', len(lObj)) for ObjA in lObj: lhFeature = [] for ObjB in lObj: if ObjA.GetId() == ObjB.GetId(): continue hFeature = self.ExtractPerObjObj(ObjA, ObjB, query) lhFeature.append(hFeature) llhFeature.append(lhFeature) logging.info('obj obj feature extracted') return llhFeature def Process(self, qid, query): lDoc, lObj = self.FormulateNodes(qid, query) logging.info('nodes fetched') lQObjFeature = self.ExtractQObjFeature(qid, query, lObj) llDocObjFeature = self.ExtractDocObjFeature(lDoc, lObj) llObjObjFeature = self.ExtractObjObjFeature(lObj, query) return lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature def DumpRes(self, OutName, query, lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature): out = open(OutName, 'w') for obj, hFeature in zip(lObj, lQObjFeature): print >> out, query + '\t' + obj.GetId() + '\t' + json.dumps( hFeature) for doc, lhFeature in zip(lDoc, llDocObjFeature): for obj, hFeature in zip(lObj, lhFeature): print >> out, doc.DocNo + '\t' + obj.GetId( ) + '\t' + json.dumps(hFeature) for ObjA, lhFeature in zip(lObj, llObjObjFeature): for ObjB, hFeature in zip(lObj, lhFeature): print >> out, ObjA.GetId() + '\t' + ObjB.GetId( ) + '\t' + json.dumps(hFeature) out.close() logging.info('query [%s] feature dumped', query) def PipeRun(self, QInName, OutDir): ''' for now: output raw type each file is a query's edge features each line is query|doc|obj \t obj \t json.dumps(hFeature) ''' lQidQuery = [ line.split('\t') for line in open(QInName).read().splitlines() ] for qid, query in lQidQuery: logging.info('start extracting for [%s][%s]', qid, query) lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = self.Process( 
qid, query) OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName( query) logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query, OutName) self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature) logging.info('all finished') return
class EntityCorrelationFromTextSimC(cxBaseC): def Init(self): cxBaseC.Init(self) self.Searcher = IndriSearchCenterC() self.ObjCenter = FbObjCacheCenterC() self.NeighborNum = 50 def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.Searcher.SetConf(ConfIn) self.ObjCenter.SetConf(ConfIn) self.NeighborNum = self.conf.GetConf('neighbornum', self.NeighborNum) @staticmethod def ShowConf(): cxBaseC.ShowConf() IndriSearchCenterC.ShowConf() FbObjCacheCenterC.ShowConf() print 'neighbornum' def ProcessOneObj(self, ObjId, name): ''' return lObjNeighbor=[objid,KL score] top self.NeighborNum ''' #search in index, get top 1000 query = TextBaseC.RawClean(name) if "" == query: return [] lObjDoc = self.Searcher.RunQuery(query) lObjNeighbor = [] ThisDesp = self.ObjCenter.FetchObjDesp(ObjId) ThisLm = LmBaseC(ThisDesp) ThisVec = VectorC(ThisLm.hTermTF) # print '[%s:%s] desp : [%s]' %(ObjId,name,ThisDesp) if len(ThisLm.hTermTF) == 0: return [] for ObjDoc in lObjDoc: Id = ObjDoc.DocNo if Id == ObjId: continue if not Id.startswith('/m/'): print "[%s %s] neighbor id [%s] format error" % (ObjId, name, Id) continue # print "get neighbor [%s] [%s]" %(Id,ObjDoc.GetContent()) # NeighborDesp = ObjDoc.GetContent() NeighborLm = LmBaseC(ObjDoc) NeighborVec = VectorC(NeighborLm.hTermTF) if len(NeighborVec.hDim) == 0: continue score = VectorC.KL(ThisVec, NeighborVec) lObjNeighbor.append([Id, -score]) # print "[%s %s] KL [%f]" %(ObjId,Id,score) # print "%s\n%s" %(json.dumps(ThisVec.hDim),json.dumps(NeighborVec.hDim)) lObjNeighbor.sort(key=lambda item: item[1], reverse=True) print "[%s:%s] neighbor id score get" % (ObjId, name) return lObjNeighbor def Process(self, ObjInName, OutName): out = open(OutName, 'w') for line in open(ObjInName): vCol = line.strip().split('\t') if len(vCol) < 2: continue lObjNeighbor = self.ProcessOneObj(vCol[0], vCol[1]) for NeighborId, score in lObjNeighbor[:self.NeighborNum]: print >> out, '%s\t%s\t%f\t%s\t%s' % ( vCol[0], NeighborId, score, vCol[1], 
self.ObjCenter.FetchObjName(NeighborId)) print "[%s:%s] done" % (vCol[0], vCol[1]) out.close() print "finished"