def Init(self):
    """Reset members: query-name -> qid map, qrel store, and io dirs."""
    cxBaseC.Init(self)
    # query name -> qid lookup (filled later from the qin file)
    self.hQueryQid = {}
    # relevance judgments
    self.QRelCenter = AdhocQRelC()
    # input/output directory paths, populated from conf
    self.InDir = ""
    self.OutDir = ""
def Init(self):
    """Reset members: qrel store, feature-id hashes, and io dirs."""
    cxBaseC.Init(self)
    # relevance judgments
    self.RelCenter = AdhocQRelC()
    # feature name -> integer id, nodes and edges kept separate
    self.hNodeFeatureId = {}
    self.hEdgeFeatureId = {}
    # input/output directory paths, populated from conf
    self.InDir = ""
    self.OutDir = ""
def Init(self):
    """Reset extractor state: lazy-load flag, model handles, sub-extractors, qrels."""
    cxBaseC.Init(self)
    # guard for lazy setup; flipped once the word2vec model is loaded
    self.Prepared = False
    # word2vec model handle and the path it is loaded from
    self.Word2VecModel = None
    self.Word2VecInName = ""
    # names of the feature groups enabled via conf
    self.lFeatureGroup = []
    # retrieval + per-group feature extractors
    self.Searcher = IndriSearchCenterC()
    self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
    self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
    self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
    # relevance labels and their input path
    self.QRelCenter = AdhocQRelC()
    self.QRelIn = ""
class LeToRFeatureExtractCenterC(cxBaseC):
    """
    Hub for learning-to-rank feature extraction.

    Runs queries through an Indri searcher, then for each returned doc
    collects features from the enabled feature groups ('givenfeature',
    'termpairemb', 'emblm') into one hash, attaches the qrel label, and
    dumps one LeToRDataBaseC line per query-doc pair.
    """

    def Init(self):
        cxBaseC.Init(self)
        self.Prepared = False  # lazy-setup guard, see Prepare()
        self.Word2VecInName = ""
        self.Word2VecModel = None
        self.lFeatureGroup = []  # enabled feature group names
        self.Searcher = IndriSearchCenterC()
        self.GivenFeatureExtractor = LeToRGivenFeatureExtractorC()
        self.EmbTermPairFeatureExtractor = EmbeddingTermPairFeatureExtractorC()
        self.EmbLmFeatureExtractor = EmbeddingLmFeatureExtractorC()
        self.QRelCenter = AdhocQRelC()
        self.QRelIn = ""

    def SetConf(self, ConfIn):
        """Read conf keys (word2vecin, featuregroup, qrel) and cascade the
        conf file to the searcher and each enabled sub-extractor."""
        cxBaseC.SetConf(self, ConfIn)
        self.Word2VecInName = self.conf.GetConf('word2vecin')
        self.lFeatureGroup = self.conf.GetConf('featuregroup')
        self.QRelIn = self.conf.GetConf('qrel')
        self.QRelCenter.Load(self.QRelIn)
        # a single configured group comes back as a scalar; normalize to list
        if type(self.lFeatureGroup) != list:
            self.lFeatureGroup = [self.lFeatureGroup]
        self.Searcher.SetConf(ConfIn)
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.SetConf(ConfIn)
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.SetConf(ConfIn)
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.SetConf(ConfIn)
        return True

    @staticmethod
    def ShowConf():
        # print this class's conf keys, then those of every component
        cxBaseC.ShowConf()
        print 'word2vecin\nfeaturegroup givenfeature|termpairemb\nqrel\nemblm'
        LeToRGivenFeatureExtractorC.ShowConf()
        EmbeddingTermPairFeatureExtractorC.ShowConf()
        EmbeddingLmFeatureExtractorC.ShowConf()
        IndriSearchCenterC.ShowConf()

    def Prepare(self):
        # idempotent lazy setup: load the word2vec model once and prepare
        # each enabled sub-extractor
        if self.Prepared:
            return
        logging.info('start load word2vec input')
        self.Word2VecModel = gensim.models.Word2Vec.load_word2vec_format(self.Word2VecInName)
        logging.info('word2vec loaded')
        if 'givenfeature' in self.lFeatureGroup:
            self.GivenFeatureExtractor.Prepare()
        if 'termpairemb' in self.lFeatureGroup:
            self.EmbTermPairFeatureExtractor.Prepare()
        if 'emblm' in self.lFeatureGroup:
            self.EmbLmFeatureExtractor.Prepare()
        self.Prepared = True
        return

    def Process(self, qid,query,doc):
        '''
        extract all features here

        Returns one merged {feature name -> score} hash for (qid, query, doc);
        embedding-based groups also receive the loaded word2vec model.
        '''
        self.Prepare()
        hFeature = {}
        logging.debug('extracting for [%s][%s]',qid,doc.DocNo)
        if 'givenfeature' in self.lFeatureGroup:
            hFeature.update(self.GivenFeatureExtractor.Extract(qid, query, doc))
            logging.debug('given feature extracted')
        if 'termpairemb' in self.lFeatureGroup:
            hFeature.update(self.EmbTermPairFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('termpairemb feature extracted')
        if 'emblm' in self.lFeatureGroup:
            hFeature.update(self.EmbLmFeatureExtractor.Extract(qid, query, doc, self.Word2VecModel))
            logging.debug('emblm feature extracted')
        return hFeature

    def PipeLineRun(self,QInName,OutName):
        '''
        will make a feature hash myself...
        It should be OK right?

        Reads "qid<TAB>query" lines from QInName, retrieves docs per query,
        extracts features for each doc, and writes one serialized
        LeToRDataBaseC per line to OutName. The feature-name -> id hash
        accumulated across all docs is written to OutName_FeatureName.
        '''
        hFeatureName = {}  # feature name -> id, grown as new names appear
        self.Prepare()
        lLines = open(QInName).read().splitlines()
        lQidQuery = [line.split('\t') for line in lLines]
        out = open(OutName,'w')
        logging.info('start extracting for file [%s]',QInName)
        for qid,query in lQidQuery:
            lDoc = self.Searcher.RunQuery(query, qid)
            for doc in lDoc:
                hFeature = self.Process(qid, query, doc)
                LTRData = LeToRDataBaseC()
                LTRData.qid = qid
                LTRData.DocNo = doc.DocNo
                LTRData.hFeature = hFeature
                # qrel label for this query-doc pair
                LTRData.score = self.QRelCenter.GetScore(qid, doc.DocNo)
                hFeatureName = LTRData.HashFeatureName(hFeatureName)
                print >>out,LTRData.dumps()
            logging.info('qid [%s] extracted',qid)
        out.close()
        NameOut = open(OutName + '_FeatureName','w')
        for name,Id in hFeatureName.items():
            print >>NameOut,'%d\t%s' %(Id,name)
        NameOut.close()
        logging.info('finished')
        return
def ShowConf(cls):
    # Print the conf keys this class understands: base keys first, then the
    # class name, this class's own keys, and the qrel component's keys.
    cxBaseC.ShowConf()
    print cls.__name__
    print 'indir\noutdir'
    AdhocQRelC.ShowConf()
class GraphFeaturePostProcessorC(cxBaseC): def Init(self): cxBaseC.Init(self) self.RelCenter = AdhocQRelC() self.InDir = "" self.OutDir = "" self.hNodeFeatureId = {} #the id of node features self.hEdgeFeatureId = {} #the id of edge features def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.InDir = self.conf.GetConf('indir') + '/' self.OutDir = self.conf.GetConf('outdir') + '/' self.RelCenter.SetConf(ConfIn) @classmethod def ShowConf(cls): cxBaseC.ShowConf() print cls.__name__ print 'indir\noutdir' AdhocQRelC.ShowConf() def HashFeatureName(self, hFeature={}): ''' go through the full input dir, hash node features and edge features ''' if {} != hFeature: self.MakeFeatureHashFromNames(hFeature.keys()) else: self.MakeFeatureFromRawData() return True def MakeFeatureFromRawData(self): sNodeFeatureName = set() sEdgeFeatureName = set() lFName = WalkDir(self.InDir) for FName in lFName: # logging.info('checking feature names in [%s]',FName) lLines = open(FName).read().splitlines() lNodeLines = [ line for line in lLines if self.IsNodeFeatureLine(line) ] lEdgeLines = [ line for line in lLines if not self.IsNodeFeatureLine(line) ] sNodeFeatureName.update(self.GetFeatureName(lNodeLines)) sEdgeFeatureName.update(self.GetFeatureName(lEdgeLines)) self.MakeNodeFeatureHash(sNodeFeatureName) self.MakeEdgeFeatureHash(sEdgeFeatureName) logging.info('feature hash id assigned from raw data') return True def MakeFeatureHashFromNames(self, lName): lEdgeFeatureName = [ name for name in lName if name.startswith('ObjObj') | name.startswith('QObj') ] sEdgeFeatureName = set(lEdgeFeatureName) sNodeFeatureName = set(lName) - sEdgeFeatureName self.MakeNodeFeatureHash(sNodeFeatureName) self.MakeEdgeFeatureHash(sEdgeFeatureName) logging.info('Node Feature hash: %s', json.dumps(self.hNodeFeatureId)) logging.info('Edge Feature hash: %s', json.dumps(self.hEdgeFeatureId)) return True def FindGlobalFeatureMaxMin(self): hMin = {} hMax = {} for QDir, mid, lFname in os.walk(self.InDir): if QDir == 
self.InDir: continue hQMax, hQMin = self.FindMaxMinFeatureValuesForQ(QDir) hMax = FeatureProcessorC.Max(hMax, hQMax) hMin = FeatureProcessorC.Min(hMin, hQMin) logging.info('Global feature max-min found') return hMax, hMin def MakeNodeFeatureHash(self, sNodeFeatureName): ''' put LeToR features first ''' lName = list(sNodeFeatureName) lLtrName = [name for name in lName if name.startswith('LeToR')] lObjName = [name for name in lName if not name.startswith('LeToR')] lLtrName.sort() lObjName.sort() lName = lLtrName + lObjName self.hNodeFeatureId = dict(zip(lName, range(len(lName)))) return True def MakeEdgeFeatureHash(self, sEdgeFeatureName): ''' put QObj features first ''' lName = list(sEdgeFeatureName) lQObjName = [name for name in lName if name.startswith('QObj')] lObjObjName = [name for name in lName if not name.startswith('QObj')] lQObjName.sort() lObjObjName.sort() lName = lQObjName + lObjObjName self.hEdgeFeatureId = dict(zip(lName, range(len(lName)))) return True def GetFeatureName(self, lLines): # lhFeature = [] # for line in lLines: # FStr = line.split('\t')[-1] # try: # hFeature = json.loads(FStr) # lhFeature.append(hFeature) # except ValueError: # logging.error('[%s] cannot be json loaded', FStr) # sys.exit() lhFeature = [json.loads(line.split('\t')[-1]) for line in lLines] lName = [] for hFeature in lhFeature: lName.extend(hFeature.keys()) return set(lName) def FindMaxMinFeatureValuesForQ(self, QDir): ''' find the max and min feature values of this query so I perform max-min normalization per query level Should work too and is simple ''' hFeatureMax = {} hFeatureMin = {} lDocName = WalkDir(QDir) for DocName in lDocName: logging.info('finding max min of [%s]', DocName) for line in open(DocName): vCol = line.strip().split('\t') hFeature = json.loads(vCol[-1]) hFeatureMax = FeatureProcessorC.Max(hFeature, hFeatureMax) hFeatureMin = FeatureProcessorC.Min(hFeature, hFeatureMin) logging.info('q [%s] max-min feature score get', ntpath.basename(QDir)) 
logging.info('q [%s] max %s', ntpath.basename(QDir), json.dumps(hFeatureMax)) logging.info('q [%s] min %s', ntpath.basename(QDir), json.dumps(hFeatureMin)) return hFeatureMax, hFeatureMin def ProcessOneDoc(self, Qid, DocInName, hFeatureMax, hFeatureMin): ''' read data hash to node id normalize fetch rel label dump node mtx dump edge tensor dump rel label dump node name -> id ''' lLines = open(DocInName).read().splitlines() lNodeLines = [line for line in lLines if self.IsNodeFeatureLine(line)] lEdgeLines = [ line for line in lLines if not self.IsNodeFeatureLine(line) ] hNodeId = self.HashPerDocNode(lNodeLines) NodeMtx = self.FormNodeMtx(lNodeLines, hNodeId, hFeatureMax, hFeatureMin) EdgeTensor = self.FormEdgeTensor(lEdgeLines, hNodeId, hFeatureMax, hFeatureMin) DocNo = ntpath.basename(DocInName) rel = self.RelCenter.GetScore(Qid, DocNo) OutName = self.OutDir + '/' + Qid + '/' + DocNo if not os.path.exists(self.OutDir + '/' + Qid): os.makedirs(self.OutDir + '/' + Qid) out = open(OutName, 'w') pickle.dump([NodeMtx, EdgeTensor, rel, hNodeId], out) logging.info('[%s] processed and dumped', OutName) return True def HashPerDocNode(self, lLines): lNode = [] QNode = "" for line in lLines: vCol = line.split('\t') for NodeName in vCol[:2]: if self.IsObjNode(NodeName): lNode.append(NodeName) if self.IsQNode(NodeName): QNode = NodeName lNode = list(set(lNode)) lNode.sort() lTotalNode = [QNode] + lNode hNodeId = dict(zip(lTotalNode, range(len(lTotalNode)))) return hNodeId def FormNodeMtx(self, lNodeLines, hNodeId, hFeatureMax, hFeatureMin): ''' make lines to node id, hFeature pair normalize hFeature put it in corresponding rows in NodeMtx ''' NodeMtx = numpy.zeros([len(hNodeId), len(self.hNodeFeatureId)]) for line in lNodeLines: vCol = line.split('\t') NodeP = hNodeId[vCol[0]] hFeature = json.loads(vCol[-1]) hFeature = FeatureProcessorC.MaxMinNormalization( hFeature, hFeatureMax, hFeatureMin) FeatureVec = FeatureProcessorC.VectorlizeFeature( hFeature, self.hNodeFeatureId) 
NodeMtx[NodeP] = FeatureVec logging.info('node feature matrix converted') return NodeMtx def FormEdgeTensor(self, lEdgeLines, hNodeId, hFeatureMax, hFeatureMin): ''' make lines to node a, node b, hFeature triple normalize put it in corresponding cell in EdgeTensor ''' EdgeTensor = numpy.zeros( [len(hNodeId), len(hNodeId), len(self.hEdgeFeatureId)]) for line in lEdgeLines: vCol = line.split('\t') NodeA = hNodeId[vCol[0]] NodeB = hNodeId[vCol[1]] hFeature = json.loads(vCol[2]) hFeature = FeatureProcessorC.MaxMinNormalization( hFeature, hFeatureMax, hFeatureMin) FeatureVec = FeatureProcessorC.VectorlizeFeature( hFeature, self.hEdgeFeatureId) EdgeTensor[NodeA, NodeB] = FeatureVec logging.info('edge feature tensor converted') return EdgeTensor def Process(self): hGlobalFeatureMax, hGlobalFeatureMin = self.FindGlobalFeatureMaxMin() self.HashFeatureName(hGlobalFeatureMax) for QDir, mid, lDocName in os.walk(self.InDir): if QDir == self.InDir: continue logging.info('start working on query dir [%s]', QDir) # hFeatureMax,hFeatureMin = self.FindMaxMinFeatureValuesForQ(QDir) qid = ntpath.basename(QDir) for DocName in lDocName: self.ProcessOneDoc(qid, QDir + '/' + DocName, hGlobalFeatureMax, hGlobalFeatureMin) logging.info('q [%s] processed', qid) self.DumpFeatureHash() logging.info('feature normalized and transformed') return True def DumpFeatureHash(self): out = open(self.OutDir + 'NodeFeatureId', 'w') lNodeF = self.hNodeFeatureId.items() lNodeF.sort(key=lambda item: int(item[1])) print >> out, '\n'.join( ['%s\t%s' % (item[0], item[1]) for item in lNodeF]) out.close() out = open(self.OutDir + 'EdgeFeatureId', 'w') lEdgeF = self.hEdgeFeatureId.items() lEdgeF.sort(key=lambda item: int(item[1])) print >> out, '\n'.join( ['%s\t%s' % (item[0], item[1]) for item in lEdgeF]) out.close() logging.info('feature id name dumped') return def IsQNode(self, NodeName): return NodeName.startswith('q_') def IsObjNode(self, NodeName): return NodeName.startswith('/m/') def IsNodeFeatureLine(self, 
line): vCol = line.split('\t') if self.IsQNode(vCol[1]) | self.IsObjNode(vCol[1]): return False return True
class GraphDataPreparationcC(cxBaseC): def Init(self): cxBaseC.Init(self) self.InDir = "" self.OutDir = "" self.QRelCenter = AdhocQRelC() self.hQueryQid = {} #query name -> qid def SetConf(self, ConfIn): cxBaseC.SetConf(self, ConfIn) self.InDir = self.conf.GetConf('indir') self.OutDir = self.conf.GetConf('outdir') QRelInName = self.conf.GetConf('qrel') self.QRelCenter.Load(QRelInName) QIn = self.conf.GetConf('qin') self.LoadQueryQid(QIn) @staticmethod def ShowConf(): cxBaseC.ShowConf() print 'indir\noutdir\nqrelnqin' def LoadQueryQid(self, QIn): lQidQuery = [ line.split('\t') for line in open(QIn).read().splitlines() ] lQueryNameQid = [[ IndriSearchCenterC.GenerateQueryTargetName(item[1]), item[0] ] for item in lQidQuery] self.hQueryQid = dict(lQueryNameQid) def UpdateHashId(self, name, hDict): if not name in hDict: hDict[name] = len(hDict) def GeneratePerQHashMapping(self, InName): hNodeId = {} lEdgeFeatureName = [] for line in open(InName): NodeA, NodeB, FeatureStr = line.strip().split('\t') self.UpdateHashId(NodeA, hNodeId) self.UpdateHashId(NodeB, hNodeId) hFeature = json.loads(FeatureStr) lEdgeFeatureName.extend(hFeature.keys()) lEdgeFeatureName = list(set(lEdgeFeatureName)) lEdgeFeatureName.sort() #make sure feature id is uniq hEdgeFeatureId = dict( zip(lEdgeFeatureName, range(len(lEdgeFeatureName)))) logging.info('[%s] id made [%d] node [%d] edge feature', InName, len(hNodeId), len(hEdgeFeatureId)) return hNodeId, hEdgeFeatureId def FormGraphTensorPerFile(self, InName, hNodeId, hEdgeFeatureId): ''' form tensor for data in InName ''' NodeN = len(hNodeId) FeatureDim = len(hEdgeFeatureId) logging.info('initializing [%d^2,-%d] graph tensor', NodeN, FeatureDim) GraphTensor = np.zeros((NodeN, NodeN, FeatureDim)) for line in open(InName): NodeA, NodeB, FeatureStr = line.strip().split('\t') hFeature = json.loads(FeatureStr) AId = hNodeId[NodeA] BId = hNodeId[NodeB] for key, score in hFeature.items(): FId = hEdgeFeatureId[key] GraphTensor[AId, BId, FId] = score 
return GraphTensor def FetchQRelVec(self, hNodeId, qid): ''' fetch the relevance score from self.QRelCenter if the node is a query or a object, then rel score is np.nan ''' QRelVec = np.zeros(len(hNodeId)) for name, p in hNodeId: if not name.startswith('clueweb'): QRelVec[p] = np.nan continue QRelVec[p] = self.QRelCenter.GetScore(qid, name) return QRelVec def ProcessOneQuery(self, InName): QName = ntpath.basename(InName) qid = self.hQueryQid[QName] OutPre = self.OutDir + '/' + qid hNodeId, hEdgeFeatureId = self.GeneratePerQHashMapping(InName) pickle.dump(hNodeId, open(OutPre + '_NodeId', 'w')) pickle.dump(hEdgeFeatureId, open(OutPre + '_EdgeFeatureId', 'w')) logging.info('[%s] hash id dumped', QName) GraphTensor = self.FormGraphTensorPerFile(InName, hNodeId, hEdgeFeatureId) pickle.dump(GraphTensor, open(OutPre + '_Graph', 'w')) logging.info('[%s] graph tensor dumped', QName) QRelVec = self.FetchQRelVec(hNodeId, qid) pickle.dump(QRelVec, open(OutPre + '_Label', 'w')) logging.info('[%s] label vec dumped', QName) @staticmethod def LoadOneQuery(InPre): GraphTensor = pickle.load(open(InPre + '_Graph')) QRelVec = pickle.load(open(InPre + '_Label')) return GraphTensor, QRelVec @staticmethod def LoadData(InDir): lInName = WalkDir(InDir) lInName = list( set(['_'.join(line.split('_')[:-1]) for line in lInName])) lInName.sort(key=lambda item: int(ntpath.basename(item))) lGraph = [] lLabel = [] for InName in lInName: GraphTensor, QRelVec = GraphDataPreparationcC.LoadOneQuery(InName) lGraph.append(GraphTensor) lLabel.append(QRelVec) logging.info('[%s] data loaded', ntpath.basename(InName)) logging.info('add graph data and label loaded [%d] query', len(lGraph)) return lGraph, lLabel def Process(self): lInName = WalkDir(self.InDir) for InName in lInName: self.ProcessOneQuery(InName) logging.info('finished, data in [%s]', self.OutDir)