Ejemplo n.º 1
0
    def PipeRun(self, QInName, OutDir):
        '''
        Extract features for every query listed in QInName and dump them.

        QInName is read as one query per line, tab-separated: qid, query.
        Blank lines are skipped; any other line that does not split into
        exactly two columns raises ValueError when unpacked.

        For each query, self.Process(qid, query) supplies the documents,
        objects and feature lists, and self.DumpRes writes them to
        OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query).

        Note kept from the original author (the writing itself happens in
        self.DumpRes, which is outside this block):
            output raw type
            each file is a query's edge features
                each line is query|doc|obj, obj, json.dumps(hFeature),
                tab-separated
        '''

        # Read the whole list up front so the handle is closed before the
        # extraction loop starts.
        with open(QInName) as QIn:
            lQidQuery = [
                line.split('\t') for line in QIn.read().splitlines()
                if line.strip()  # tolerate blank / trailing empty lines
            ]

        for qid, query in lQidQuery:
            logging.info('start extracting for [%s][%s]', qid, query)
            lDoc, lObj, lQObjFeature, llDocObjFeature, llObjObjFeature = self.Process(
                qid, query)
            OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
                query)
            logging.info('[%s][%s] extracted, dumpping to [%s]', qid, query,
                         OutName)
            self.DumpRes(OutName, query, lDoc, lObj, lQObjFeature,
                         llDocObjFeature, llObjObjFeature)

        logging.info('all finished')
        return
    def LoadRawFormatNodeRes(query, InDir):
        '''
        Read one query's raw node file from disk and regroup its content.

        The file InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
        is parsed as tab-separated lines:
            - lines whose first column starts with 'q_' are query-object
              lines; their second column is collected into lQObj
            - every other line must have exactly two columns, DocNo and
              ObjId (ValueError otherwise); consecutive lines sharing a
              DocNo are grouped together

        Returns (lDocNo, lQObj, llDocObj), where llDocObj[i] holds the
        object ids listed for lDocNo[i].

        NOTE(review): the signature has no self/cls although the def is
        indented like a method; presumably it is meant to be a
        staticmethod -- confirm against the full class.
        '''
        InName = InDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        with open(InName) as In:
            lvCol = [line.split('\t') for line in In.read().splitlines()]

        # Single pass: separate query-object lines from doc-object lines.
        lQObj = []
        lDocCol = []
        for vCol in lvCol:
            if vCol[0].startswith('q_'):
                lQObj.append(vCol[1])
            else:
                lDocCol.append(vCol)
        logging.debug('q[%s] get q obj %s', query, json.dumps(lQObj))

        lDocNo = []
        llDocObj = []
        # None (rather than "") as the sentinel, so that an empty DocNo on
        # the first line still opens a group instead of indexing an empty
        # list.
        LastDocNo = None
        for DocNo, ObjId in lDocCol:
            if DocNo != LastDocNo:
                llDocObj.append([])
                lDocNo.append(DocNo)
                LastDocNo = DocNo
            llDocObj[-1].append(ObjId)

        return lDocNo, lQObj, llDocObj
Ejemplo n.º 3
0
 def LoadQueryQid(self, QIn):
     '''
     Build self.hQueryQid from the query file QIn.

     QIn is read as tab-separated lines whose first column is the qid and
     whose second column is the query text. Blank lines are skipped; a
     non-blank line without a second column raises IndexError.

     The resulting dict maps
     IndriSearchCenterC.GenerateQueryTargetName(query) to qid. If two
     queries produce the same target name, the later line wins.
     '''
     with open(QIn) as In:
         lQidQuery = [
             line.split('\t') for line in In.read().splitlines()
             if line.strip()  # tolerate blank / trailing empty lines
         ]
     self.hQueryQid = {
         IndriSearchCenterC.GenerateQueryTargetName(vCol[1]): vCol[0]
         for vCol in lQidQuery
     }
Ejemplo n.º 4
0
 def LoadOneQueryObjSim(self,query):
     '''
     Load the pickled object-pair data of one query from every directory
     in self.lPreCalcDir.

     For the i-th directory, the file
     dir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
     is unpickled and stored in self.lhQueryObjPairSim[i][query].

     Returns False as soon as one of the files does not exist; entries
     already stored for earlier directories are left in place. Returns
     True when every directory has been loaded.
     '''
     for i, PreCalcDir in enumerate(self.lPreCalcDir):
         InName = PreCalcDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
         if not os.path.exists(InName):
             return False
         # NOTE(review): pickle.load can execute arbitrary code from the
         # file; the pre-calc directories must contain trusted data only.
         # The open mode is left at the default, as before.
         with open(InName) as In:
             hObjPairSim = pickle.load(In)
         self.lhQueryObjPairSim[i][query] = hObjPairSim
     logging.info('query [%s] obj sim loaded',query)
     return True
Ejemplo n.º 5
0
 def LoadQDocObj(self, query):
     '''
     Read the node file of one query into a dict of object-id lists.

     The file name is self.QDocNodeDataDir directly followed by
     IndriSearchCenterC.GenerateQueryTargetName(query); no separator is
     inserted, so the directory attribute presumably ends with '/'
     (confirm against where it is set).

     Each line is stripped and must split on tab into exactly two
     columns, key and ObjId (ValueError otherwise).

     Returns a dict mapping each key to the list of its ObjIds in file
     order.
     '''
     InName = self.QDocNodeDataDir + IndriSearchCenterC.GenerateQueryTargetName(
         query)
     hQDocObj = {}
     with open(InName) as In:
         for line in In:
             key, ObjId = line.strip().split('\t')
             hQDocObj.setdefault(key, []).append(ObjId)
     logging.info('query [%s] q doc obj [%d] loaded', query, len(hQDocObj))
     return hQDocObj
Ejemplo n.º 6
0
def OutputDocText(hQueryDocText, OutDir):
    '''
    Write the document texts of every query to its own file in OutDir.

    hQueryDocText maps a query to a list of (DocNo, text) pairs. For each
    query the file
    OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(query)
    is (over)written with one tab-separated "DocNo, text" line per pair.

    Always returns True.
    '''
    for query, lDocNoText in hQueryDocText.items():
        OutName = OutDir + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # "with" closes the file even if a write fails; out.write emits the
        # same bytes as the former Python-2-only "print >> out" statement.
        with open(OutName, 'w') as out:
            for DocNo, text in lDocNoText:
                out.write(DocNo + '\t' + text + '\n')
        logging.info('query [%s] [%d] doc text  outputed', query,
                     len(lDocNoText))
    logging.info('doc text dumped to [%s]', OutDir)
    return True
Ejemplo n.º 7
0
    def FormulateNodes(self, qid, query):
        '''
        Collect the document and object nodes of one query.

        Documents come from self.Searcher.RunQuery(query, qid). Object ids
        are read, one per line, from the file named self.NodeDir directly
        followed by IndriSearchCenterC.GenerateQueryTargetName(query); no
        separator is inserted, so self.NodeDir presumably ends with '/'
        (confirm against where it is set). Each id is resolved with
        self.ObjCenter.FetchObj.

        Returns (lDoc, lObj).
        '''
        logging.info('formulating node for q [%s][%s]', qid, query)
        lDoc = self.Searcher.RunQuery(query, qid)

        NodeName = self.NodeDir + IndriSearchCenterC.GenerateQueryTargetName(
            query)
        # Close the handle as soon as the ids are in memory.
        with open(NodeName) as In:
            lObjId = In.read().splitlines()

        lObj = [self.ObjCenter.FetchObj(ObjId) for ObjId in lObjId]
        logging.info('q[%s] [%d] doc [%d] obj', query, len(lDoc), len(lObj))
        return lDoc, lObj
    def DumpRawFormat(self, qid, query, lDoc, lQObj, llDocObj, OutName):
        '''
        Dump the query and document nodes of one query as tab-separated text.

        OutName is treated as a directory and created if missing. The file
        written inside it is named
        IndriSearchCenterC.GenerateQueryTargetName(query).

        Output lines, in order:
            'q_' + qid, QObj        one line per entry of lQObj
            doc.DocNo, DocObj       one line per object of each document

        lDoc and llDocObj are paired with zip, so surplus entries of the
        longer list are ignored. Each element of lDoc must expose a DocNo
        attribute.
        '''

        if not os.path.exists(OutName):
            os.makedirs(OutName)

        OutPath = OutName + '/' + IndriSearchCenterC.GenerateQueryTargetName(
            query)

        # "with" closes the file even if a write fails; out.write emits the
        # same bytes as the former Python-2-only "print >> out" statement.
        with open(OutPath, 'w') as out:
            logging.info('q[%s] has [%d] q node', qid, len(lQObj))
            for QObj in lQObj:
                out.write('q_' + qid + '\t' + QObj + '\n')

            if llDocObj == []:
                logging.info('no doc node')
            else:
                for doc, lDocObj in zip(lDoc, llDocObj):
                    logging.info('doc [%s] has [%d] node', doc.DocNo,
                                 len(lDocObj))
                    for DocObj in lDocObj:
                        out.write(doc.DocNo + '\t' + DocObj + '\n')

        logging.info('q [%s] raw node res dumpped', qid)
        return