Example #1
def runMatrixJobs(outFname, datasets, wordListFname, posPmidFname, negPmidFname,
        skipMap, outFormat, onlyTest, docIdFname, posPmids=None, negPmids=None, runner=None):
    """ run jobs to convert the articles to a bag-of-words matrix """
    # relies on module-level imports of logging, pubGeneric and pubAlg and on the
    # local helpers parsePmids and parseTerms
    assert outFormat in ["svml", "arff", "pmidsvml"]

    # accept a single dataset name as well as a list of names
    # (Python 2: basestring covers both str and unicode)
    if isinstance(datasets, basestring):
        datasets = [datasets]

    if runner is None:
        runner = pubGeneric.makeClusterRunner(__file__)

    # PMID list files, when given, take precedence over the posPmids/negPmids arguments
    logging.debug("pos and neg pmid fnames are: %s, %s" % (posPmidFname, negPmidFname))
    if posPmidFname is not None:
        posPmids = parsePmids(posPmidFname)
    if negPmidFname is not None:
        negPmids = parsePmids(negPmidFname)

    termList = parseTerms(wordListFname)

    paramDict = {"termList": termList, "posPmids": posPmids,
                 "negPmids": negPmids, "outFormat": outFormat}
    paramDict["docIdOutFname"] = docIdFname

    pubAlg.mapReduce(__file__ + ":MatrixMaker", datasets, paramDict, outFname,
        skipMap=skipMap, runTest=True, runner=runner, onlyTest=onlyTest)
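As a rough usage sketch, not taken from the original code base: only the parameter order and the keyword defaults come from the signature above, while every file name, the dataset name "pmc" and the "svml" format choice are placeholders.

# Hypothetical invocation of the function above; all file names and the
# dataset name are made-up placeholders.
runMatrixJobs("out/matrix.svml", "pmc", "terms.txt", "pos.pmids", "neg.pmids",
              skipMap=False, outFormat="svml", onlyTest=False, docIdFname="out/docIds.txt")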
Example #2
def getRunner(self, step):
    """ return a runner object for the current dataset and pipeline step """
    # look up the optional head node configured for this pipeline step
    headNode = pubConf.stepHosts.get(step, None)
    logging.debug("Headnode for step %s is %s" % (step, headNode))
    return pubGeneric.makeClusterRunner("pubMap-" + self.dataset + "-" + step,
                                        headNode=headNode)
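The runner returned here is used the same way as in the submitJobs example below: jobs are queued with submitPythonFunc and finish(wait=True) blocks until they are done. A minimal sketch, assuming an object with a dataset attribute; the step name, the job function name and its arguments are made up:

# Hypothetical use inside the pipeline class; "annotate", "annotateOneChunk"
# and the argument list are placeholders.
runner = self.getRunner("annotate")
runner.submitPythonFunc(__file__, "annotateOneChunk", [inFname, outFname])
runner.finish(wait=True)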
Example #3
def submitJobs(inSpec, filterSpec, outDir):
    # relies on module-level imports of glob, pubConf and pubGeneric and on
    # join/basename/dirname from os.path
    inDirs = pubConf.resolveTextDirs(inSpec)
    runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, algName=inSpec)

    outFnames = []
    for inDir in inDirs:
        inFnames = glob.glob(join(inDir, "*.articles.gz"))
        for inFname in inFnames:
            # one output file per input chunk, prefixed with the name of its directory
            outFname = join(outDir, basename(dirname(inFname)) + "-" + basename(inFname))
            outFnames.append(outFname)
            outFnames.append(outFname.replace('.articles.gz', '.files.gz'))
            #command = "%s %s filterJob {check in exists %s} %s %s" % \
            #    (sys.executable, __file__, inFname, pmidFname, outFname)
            runner.submitPythonFunc(__file__, "filterOneChunk", [inFname, filterSpec, outFname])
    # block until all cluster jobs have completed
    runner.finish(wait=True)
    return outFnames
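A sketch of how this might be called; the dataset spec, the filter spec and the output directory are placeholder values. The function submits one filterOneChunk job per .articles.gz chunk and only returns the list of expected output file names after all cluster jobs have finished.

# Hypothetical call; "pmc", "myFilter" and the output path are placeholders.
chunkOutFnames = submitJobs("pmc", "myFilter", "/data/filtered")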