def PipeRun(self, QInName, OutDir):
    '''
    Extract features for every query listed in QInName and dump them.

    QInName: path of a text file holding one "qid<TAB>query" pair per line.
    OutDir: output directory prefix; the result of each query goes to
        OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query).

    For now the output is the raw type. According to the original note each
    file holds one query's edge features and each line is
    "query|doc|obj <TAB> obj <TAB> json.dumps(hFeature)"; the exact layout is
    produced by self.DumpRes, which is not defined in this block.

    Returns None. Raises ValueError when a line of QInName does not split
    into exactly two tab-separated columns.
    '''
    # Load the complete query list first so the input handle is closed
    # before the per-query extraction starts.
    with open(QInName) as QIn:
        lQidQuery = [line.split('\t') for line in QIn.read().splitlines()]

    for qid, query in lQidQuery:
        logging.info('start extracting for [%s][%s]', qid, query)
        (lDoc, lObj, lQObjFeature,
         llDocObjFeature, llObjObjFeature) = self.Process(qid, query)

        OutName = (OutDir + '/' +
                   IndriSearchCenterC.GenerateQueryTargetName(query))
        logging.info('[%s][%s] extracted, dumpping to [%s]',
                     qid, query, OutName)
        self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                     llDocObjFeature, llObjObjFeature)

    logging.info('all finished')
    return
def LoadRawFormatNodeRes(query, InDir):
    '''
    read results from the disk as dumped

    The file InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
    is parsed as tab-separated lines:
      * lines whose first column starts with 'q_' are query rows; their
        second column is collected as a query object id;
      * every other line must be exactly "DocNo<TAB>ObjId".

    Returns (lDocNo, lQObj, llDocObj):
      lDocNo   - doc numbers, one entry per run of consecutive lines that
                 share the same DocNo, in file order;
      lQObj    - object ids of the query rows;
      llDocObj - llDocObj[i] lists the object ids of the run lDocNo[i].

    Raises ValueError when a non-query line does not have exactly two
    columns.
    '''
    InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
    with open(InName) as In:
        lvCol = [line.split('\t') for line in In.read().splitlines()]

    lQObj = [vCol[1] for vCol in lvCol if vCol[0].startswith('q_')]
    lDocCol = [vCol for vCol in lvCol if not vCol[0].startswith('q_')]
    logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

    lDocNo = []
    llDocObj = []
    for DocNo, ObjId in lDocCol:
        # Open a new group whenever the doc number changes. Comparing with
        # the last recorded DocNo (rather than an "" sentinel) also copes
        # with a first row whose DocNo is the empty string.
        if not lDocNo or DocNo != lDocNo[-1]:
            lDocNo.append(DocNo)
            llDocObj.append([])
        llDocObj[-1].append(ObjId)
    return lDocNo, lQObj, llDocObj
def LoadQueryQid(self, QIn):
    '''
    Build self.hQueryQid from the query file QIn.

    QIn: path of a text file with one "qid<TAB>query" pair per line.

    After the call self.hQueryQid maps
    IndriSearchCenterC.GenerateQueryTargetName(query) -> qid. When two
    queries produce the same target name the later line wins.

    Raises IndexError when a line has no tab-separated second column.
    '''
    with open(QIn) as In:
        lQidQuery = [line.split('\t') for line in In.read().splitlines()]

    self.hQueryQid = dict(
        (IndriSearchCenterC.GenerateQueryTargetName(item[1]), item[0])
        for item in lQidQuery)
def LoadOneQueryObjSim(self, query):
    '''
    Load the pre-calculated object-pair similarities of one query.

    For the i-th directory in self.lPreCalcDir the pickle stored at
    <dir>/<IndriSearchCenterC.GenerateQueryTargetName(query)> is loaded into
    self.lhQueryObjPairSim[i][query].

    Returns True when every directory provided the file. Returns False as
    soon as one file is missing; entries loaded from earlier directories
    are kept in that case.
    '''
    for i, PreCalcDir in enumerate(self.lPreCalcDir):
        InName = (PreCalcDir + '/' +
                  IndriSearchCenterC.GenerateQueryTargetName(query))
        if not os.path.exists(InName):
            return False
        # Pickles are binary data, so read them in 'rb' mode.
        # NOTE(review): unpickling can run arbitrary code; lPreCalcDir must
        # only point at trusted, locally produced files.
        with open(InName, 'rb') as In:
            hObjPairSim = pickle.load(In)
        self.lhQueryObjPairSim[i][query] = hObjPairSim

    logging.info('query [%s] obj sim loaded', query)
    return True
def LoadQDocObj(self, query):
    '''
    Read the "key<TAB>ObjId" file of one query into a dict.

    The file name is self.QDocNodeDataDir directly followed by
    IndriSearchCenterC.GenerateQueryTargetName(query); no path separator is
    inserted, so QDocNodeDataDir has to carry its own trailing separator.

    Returns a dict mapping each key to the list of object ids found for it,
    in file order. Raises ValueError when a stripped line does not split
    into exactly two tab-separated columns.
    '''
    InName = (self.QDocNodeDataDir +
              IndriSearchCenterC.GenerateQueryTargetName(query))
    hQDocObj = {}
    with open(InName) as In:
        for line in In:
            key, ObjId = line.strip().split('\t')
            hQDocObj.setdefault(key, []).append(ObjId)
    logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
    return hQDocObj
def OutputDocText(hQueryDocText, OutDir):
    '''
    Write the document texts of every query to its own file under OutDir.

    hQueryDocText: dict mapping a query to a list of (DocNo, text) string
        pairs.
    OutDir: directory prefix; each query is written to
        OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query),
        one "DocNo<TAB>text" line per pair.

    Returns True.
    '''
    for query, lDocNoText in hQueryDocText.items():
        OutName = (OutDir + '/' +
                   IndriSearchCenterC.GenerateQueryTargetName(query))
        # 'with' closes the file even if a write fails; write() replaces the
        # Python-2-only "print >> out" form and emits the same bytes.
        with open(OutName, 'w') as out:
            for DocNo, text in lDocNoText:
                out.write(DocNo + '\t' + text + '\n')
        logging.info('query [%s] [%d] doc text outputed',
                     query, len(lDocNoText))

    logging.info('doc text dumped to [%s]', OutDir)
    return True
def FormulateNodes(self, qid, query):
    '''
    get ldoc and read lObjId
    fill lObjId

    The documents come from self.Searcher.RunQuery(query, qid). The object
    ids are the lines of the file named self.NodeDir directly followed by
    IndriSearchCenterC.GenerateQueryTargetName(query) (no path separator is
    inserted), and each id is resolved with self.ObjCenter.FetchObj.

    Returns (lDoc, lObj).
    '''
    logging.info('formulating node for q [%s][%s]', qid, query)
    lDoc = self.Searcher.RunQuery(query, qid)

    NodeName = self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(query)
    with open(NodeName) as In:
        lObjId = In.read().splitlines()

    lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
    logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
    return lDoc, lObj
def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
    '''
    Dump the query nodes and the per-document nodes of one query as text.

    qid, query: id and text of the query.
    lDoc: documents; each must expose a DocNo string attribute.
    lQObj: object ids (strings) attached to the query.
    llDocObj: llDocObj[i] lists the object ids (strings) of lDoc[i]; an
        empty value means there are no document nodes.
    OutName: output DIRECTORY (created when missing); the file written is
        OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(query).

    The file first holds one "q_<qid><TAB>obj" line per query object, then
    one "DocNo<TAB>obj" line per document object. Returns None.
    '''
    if not os.path.exists(OutName):
        os.makedirs(OutName)

    OutPath = OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
    # 'with' guarantees the file is closed on errors; write() replaces the
    # Python-2-only "print >> out" form and emits the same bytes.
    with open(OutPath, 'w') as out:
        logging.info('q[%s] has [%d] q node', qid, len(lQObj))
        for QObj in lQObj:
            out.write('q_' + qid + '\t' + QObj + '\n')

        if not llDocObj:
            logging.info('no doc node')
        else:
            for doc, lDocObj in zip(lDoc, llDocObj):
                logging.info('doc [%s] has [%d] node',
                             doc.DocNo, len(lDocObj))
                for DocObj in lDocObj:
                    out.write(doc.DocNo + '\t' + DocObj + '\n')

    logging.info('q [%s] raw node res dumpped', qid)
    return