Example #1
0
def launchSnakemake(snakeFilePath,useCluster=True,parameters=None):
    """Run Snakemake on the given Snakefile and raise if it fails.

    snakeFilePath -- path of the Snakefile passed to ``snakemake -s``.
    useCluster    -- when True and the global settings contain a "cluster"
                     section, add the cluster submission flags.
    parameters    -- optional mapping of extra environment variables that
                     are added to a copy of os.environ for the child process.

    Raises RuntimeError if the cluster section provides neither a usable
    "drmaa" value nor "options", or if Snakemake exits with a non-zero code.
    """
    globalSettings = pubrunner.getGlobalSettings()

    # Build the command as an argument list rather than formatting a string
    # and re-tokenising it, so that paths or cluster options containing
    # spaces, quotes or backslashes reach Snakemake unchanged.
    command = ["snakemake"]
    if useCluster and "cluster" in globalSettings:
        clusterSettings = globalSettings["cluster"]
        jobs = 1
        if "jobs" in clusterSettings:
            jobs = int(clusterSettings["jobs"])
        command += ["--jobs", "%d" % jobs, "--latency-wait", "60"]

        drmaa = clusterSettings["drmaa"] if "drmaa" in clusterSettings else None
        if drmaa == True:
            command.append("--drmaa")
        elif drmaa:
            # The leading space is kept from the original command; Snakemake
            # documents that DRMAA arguments need leading whitespace.
            command += ["--drmaa", " %s" % drmaa]
        elif "options" in clusterSettings:
            # Also reached when drmaa is explicitly false/empty, which
            # previously produced the bogus flag --drmaa ' False'.
            command += ["--cluster", "%s" % clusterSettings["options"]]
        else:
            raise RuntimeError("Cluster must either have drmaa = true or provide options (e.g. using qsub)")

    command += ["--nolock", "-s", snakeFilePath]

    env = os.environ.copy()
    if parameters:
        env.update(parameters)

    retval = subprocess.call(command,env=env)
    if retval != 0:
        raise RuntimeError("Snake make call FAILED (file:%s)" % snakeFilePath)
Example #2
0
def cleanWorkingDirectory(directory, doTest, execute=False):
    """Delete the workspace directory belonging to the tool in `directory`.

    Changes the current working directory to `directory`, reads the tool
    name from its pubrunner.yml and removes <workspace>/<tool name>/<mode>,
    where mode is "test" when `doTest` is truthy and "full" otherwise.
    The `execute` argument is accepted but not used.

    Raises RuntimeError if `directory` has no pubrunner.yml file.
    """
    globalSettings = pubrunner.getGlobalSettings()
    os.chdir(directory)

    toolYamlFile = 'pubrunner.yml'
    if not os.path.isfile(toolYamlFile):
        message = "Expected a %s file in root of codebase" % toolYamlFile
        raise RuntimeError(message)

    toolName = pubrunner.loadYAML(toolYamlFile)["name"]

    if doTest:
        mode = "test"
    else:
        mode = "full"

    workspaceRoot = os.path.expanduser(globalSettings["storage"]["workspace"])
    targetDir = os.path.join(workspaceRoot, toolName, mode)

    # Nothing on disk: report where we looked and stop.
    if not os.path.isdir(targetDir):
        print("No working directory to remove for tool %s" % toolName)
        print("Expected directory: %s" % targetDir)
        return

    print("Removing working directory for tool %s" % toolName)
    print("Directory: %s" % targetDir)
    shutil.rmtree(targetDir)
Example #3
0
def getResource(resource):
    """Fetch or refresh a named resource and return its directory.

    The resource description comes from getResourceInfo(resource) and its
    'type' selects the behaviour:

    git    -- pull the existing repository, or clone resourceInfo['url'].
    zenodo -- download the Zenodo record resourceInfo['record'].
    remote -- download one or more URLs, then optionally unzip archives,
              delete files not matching 'filter' and generate Pubmed hashes.
    local  -- symlink resourceInfo['directory'] into the resources area.

    Returns the path <resources dir>/<resource> for every type.
    Raises RuntimeError for an unknown resource type; malformed resource
    descriptions fail the assertions below.
    """
    print("Fetching resource: %s" % resource)

    globalSettings = pubrunner.getGlobalSettings()
    resourceDir = os.path.expanduser(globalSettings["storage"]["resources"])
    thisResourceDir = os.path.join(resourceDir, resource)

    resourceInfo = getResourceInfo(resource)

    if resourceInfo['type'] == 'git':
        assert isinstance(
            resourceInfo['url'], six.string_types
        ), 'The URL for a git resource must be a single address'

        if os.path.isdir(thisResourceDir):
            # Assume it is an existing git repo
            repo = git.Repo(thisResourceDir)
            repo.remote().pull()
        else:
            os.makedirs(thisResourceDir)
            git.Repo.clone_from(resourceInfo["url"], thisResourceDir)

        return thisResourceDir
    elif resourceInfo['type'] == 'zenodo':
        assert isinstance(resourceInfo['record'],
                          int), 'The Zenodo record must be an integer'

        print("  Starting Zenodo download...")
        downloadZenodo(resourceInfo['record'], thisResourceDir)

        return thisResourceDir
    elif resourceInfo['type'] == 'remote':
        assert isinstance(resourceInfo['url'], six.string_types) or isinstance(
            resourceInfo['url'], list
        ), 'The URL for a remote resource must be a single or multiple addresses'
        if isinstance(resourceInfo['url'], six.string_types):
            urls = [resourceInfo['url']]
        else:
            urls = resourceInfo['url']

        fileSuffixFilter = resourceInfo.get('filter') if isinstance(
            resourceInfo, dict) else (resourceInfo['filter']
                                      if 'filter' in resourceInfo else None)

        if not os.path.isdir(thisResourceDir):
            print("  Creating directory...")
            os.makedirs(thisResourceDir)

        print("  Starting download...")
        for url in urls:
            # Validate before using string methods on the value, so a bad
            # entry gives the intended message instead of an AttributeError.
            assert isinstance(
                url, six.string_types
            ), 'Each URL for the dir resource must be a string'
            basename = url.split('/')[-1]
            download(url, os.path.join(thisResourceDir, basename),
                     fileSuffixFilter)

        if 'unzip' in resourceInfo and resourceInfo['unzip'] == True:
            print("  Unzipping archives...")
            for filename in os.listdir(thisResourceDir):
                archivePath = os.path.join(thisResourceDir, filename)
                if filename.endswith(('.tar.gz', '.tgz')):
                    # NOTE(review): extractall trusts member paths inside the
                    # archive; only safe for archives from trusted sources.
                    # The with-block closes the archive even if extraction fails.
                    with tarfile.open(archivePath, "r:gz") as tar:
                        tar.extractall(thisResourceDir)
                elif filename.endswith('.gz'):
                    unzippedName = filename[:-3]
                    gunzip(archivePath,
                           os.path.join(thisResourceDir, unzippedName),
                           deleteSource=True)

        if fileSuffixFilter is not None:
            print("  Removing files not matching filter (%s)..." %
                  fileSuffixFilter)
            for root, subdirs, files in os.walk(thisResourceDir):
                for f in files:
                    if not f.endswith(fileSuffixFilter):
                        os.unlink(os.path.join(root, f))

        if 'generatePubmedHashes' in resourceInfo and resourceInfo[
                'generatePubmedHashes'] == True:
            print("  Generating Pubmed hashes...")
            hashDir = os.path.join(resourceDir, resource + '.hashes')
            if not os.path.isdir(hashDir):
                os.makedirs(hashDir)

            generatePubmedHashes(thisResourceDir, hashDir)

        return thisResourceDir
    elif resourceInfo['type'] == 'local':
        assert isinstance(
            resourceInfo['directory'], six.string_types) and os.path.isdir(
                resourceInfo['directory']
            ), 'The directory for a local resource must be a string and exist'

        # A real directory at the target (e.g. left over from another
        # resource type) is replaced by the symlink.
        if not os.path.islink(thisResourceDir) and os.path.isdir(
                thisResourceDir):
            shutil.rmtree(thisResourceDir)

        if not os.path.islink(thisResourceDir):
            os.symlink(resourceInfo['directory'], thisResourceDir)

        # Previously this branch fell off the end and returned None, unlike
        # every other resource type.
        return thisResourceDir
    else:
        raise RuntimeError("Unknown resource type (%s) for resource: %s" %
                           (resourceInfo['type'], resource))
Example #4
0
def main():
    """Command-line entry point for PubRunner.

    Parses the arguments, prints the banner and then does one of:
    fetch a single resource (--getresource), clean the working directory of
    a code base (--clean), or run the code base with pubrunner.pubrun.
    The code base may be a local directory or a https://github.com/ URL,
    which is cloned to a temporary directory that is removed afterwards.

    Exits with status 1 when no code base is given; raises RuntimeError for
    an archive file (not implemented) or an unrecognised code base.
    """
    parser = argparse.ArgumentParser(
        description=
        'PubRunner will manage the download of needed resources for a text mining tool, build and execute it and then share the results publicly'
    )
    parser.add_argument(
        'codebase',
        nargs='?',
        type=str,
        help=
        'Code base containing the text mining tool to execute. Code base should contain a pubrunner.yml file. The code base can be a directory, Github repo or archive'
    )
    parser.add_argument(
        '--defaultsettings',
        action='store_true',
        help=
        'Use default .pubrunner.settings.xml. Ignore ~/.pubrunner.settings.yml if it exists.'
    )
    parser.add_argument(
        '--ignorecluster',
        action='store_true',
        help='Ignore any cluster settings and run everything locally')
    parser.add_argument('--clean',
                        action='store_true',
                        help='Remove the existing working directory')
    parser.add_argument(
        '--forceresource_dir',
        type=str,
        required=False,
        help=
        'Ignore the resources for the project and use this directory instead for the first one only (all others are empty)'
    )
    parser.add_argument(
        '--forceresource_format',
        type=str,
        required=False,
        help='The format of the resource to use instead for this run')
    parser.add_argument(
        '--outputdir',
        type=str,
        required=False,
        help=
        'Where to store the results of the run (instead of the default location defined by ~/.pubrunner.settings.yml)'
    )
    parser.add_argument(
        '--nogetresource',
        action='store_true',
        help=
        'Do not fetch resources before executing a project. Will fail if old versions of resources do not already exists.'
    )
    parser.add_argument(
        '--test',
        action='store_true',
        help='Run the test functionality instead of the full run')
    parser.add_argument(
        '--getresource',
        required=False,
        type=str,
        help=
        'Fetch a specific resource (instead of doing a normal PubRunner run). This is really only needed for debugging and understanding resources.'
    )

    args = parser.parse_args()

    print(
        pyfiglet.figlet_format('PubRunner',
                               font='cyberlarge',
                               justify='center'))
    print()

    if args.defaultsettings:
        # Return value is not needed here.
        # NOTE(review): presumably this call makes later getGlobalSettings()
        # calls use the default settings - confirm against its implementation.
        pubrunner.getGlobalSettings(useDefault=True)

    # Resolve relative paths now, since pubrun changes the working directory.
    if args.forceresource_dir:
        args.forceresource_dir = os.path.abspath(args.forceresource_dir)
    if args.outputdir:
        args.outputdir = os.path.abspath(args.outputdir)

    if args.getresource:
        location = pubrunner.getResource(args.getresource)
        print("Downloaded latest version of resource '%s' to location:" %
              args.getresource)
        print(location)
        print("")
        print("Exiting without doing PubRun")
        return

    if not args.codebase:
        print(
            "ERROR: codebase must be provided (if not downloading individual resources)"
        )
        print()
        parser.print_help()
        sys.exit(1)

    if args.ignorecluster:
        # NOTE(review): only effective if getGlobalSettings() hands back the
        # same shared settings object on later calls - confirm.
        globalSettings = pubrunner.getGlobalSettings()
        if "cluster" in globalSettings:
            del globalSettings["cluster"]

    if os.path.isdir(args.codebase):
        if args.clean:
            pubrunner.cleanWorkingDirectory(args.codebase, args.test)
        else:
            pubrunner.pubrun(args.codebase,
                             args.test, (not args.nogetresource),
                             forceresource_dir=args.forceresource_dir,
                             forceresource_format=args.forceresource_format,
                             outputdir=args.outputdir)
    elif args.codebase.startswith('https://github.com/'):
        tempDir = ''
        try:
            print("Cloning Github repo")
            tempDir = cloneGithubRepoToTempDir(args.codebase)
            if args.clean:
                # Same call shape as the local-directory branch; the
                # get-resources flag is not a cleanWorkingDirectory argument.
                pubrunner.cleanWorkingDirectory(tempDir, args.test)
            else:
                pubrunner.pubrun(
                    tempDir,
                    args.test, (not args.nogetresource),
                    forceresource_dir=args.forceresource_dir,
                    forceresource_format=args.forceresource_format,
                    outputdir=args.outputdir)
        except Exception:
            logging.error(traceback.format_exc())
            raise
        finally:
            # Single cleanup path for success, failure and interruption.
            if os.path.isdir(tempDir):
                shutil.rmtree(tempDir)

    elif os.path.isfile(args.codebase):
        raise RuntimeError("Not implemented")
    else:
        raise RuntimeError(
            "Not sure what to do with codebase: %s. Doesn't appear to be a directory, Github repo or archive"
            % args.codebase)
Example #5
0
def pubrun(directory,
           doTest,
           doGetResources,
           forceresource_dir=None,
           forceresource_format=None,
           outputdir=None):
    """Run the tool described by pubrunner.yml in `directory`.

    directory            -- code base root; becomes the working directory.
    doTest               -- use the "test" mode instead of "full".
    doGetResources       -- fetch resources, gather PMIDs from hashes and
                            run the format conversions before the tool.
    forceresource_dir,
    forceresource_format -- must currently both be None (asserted below).
    outputdir            -- optional directory the output files are copied to.

    Creates a .pubrunner_lock directory in the code base for the duration of
    the run and registers `cleanup` with atexit. After the build and run
    commands finish, output paths are printed, optionally copied, and (in
    full mode) pushed to any upload targets in the global settings.

    Raises RuntimeError if the lock directory already exists or if there is
    no pubrunner.yml file.
    """
    mode = "test" if doTest else "full"

    assert forceresource_dir is None and forceresource_format is None

    globalSettings = pubrunner.getGlobalSettings()

    os.chdir(directory)

    if os.path.isdir('.pubrunner_lock'):
        raise RuntimeError(
            "A .pubrunner_lock directory exists in this project directory. These are created by PubRunner during an incomplete run. Are you sure another instance of PubRunner is not currently running? If you're sure, you will need to delete this directory before continuing. The directory is: %s"
            % os.path.join(directory, '.pubrunner_lock'))

    os.mkdir('.pubrunner_lock')
    atexit.register(cleanup)

    toolYamlFile = 'pubrunner.yml'
    if not os.path.isfile(toolYamlFile):
        raise RuntimeError("Expected a %s file in root of codebase" %
                           toolYamlFile)

    toolSettings = pubrunner.loadYAML(toolYamlFile)
    toolName = toolSettings["name"]

    workspacesDir = os.path.expanduser(globalSettings["storage"]["workspace"])
    workingDirectory = os.path.join(workspacesDir, toolName, mode)
    if not os.path.isdir(workingDirectory):
        os.makedirs(workingDirectory)

    print("Working directory: %s" % workingDirectory)

    # Fill in optional sections so the code below can index them directly.
    if not "build" in toolSettings:
        toolSettings["build"] = []
    if not "all" in toolSettings["resources"]:
        toolSettings["resources"]["all"] = []
    if not mode in toolSettings["resources"]:
        toolSettings["resources"][mode] = []

    preprocessResourceSettings(toolSettings)

    prepareConversionAndHashingRuns(toolSettings, mode, workingDirectory)

    resourcesInUse = toolSettings["resources"]['all'] + toolSettings[
        "resources"][mode]
    if doGetResources:
        print("\nGetting resources")
        for resName, _ in resourcesInUse:
            if resName in ['PUBMED_CUSTOM', 'PMCOA_CUSTOM']:
                continue
            pubrunner.getResource(resName)

        pmidsFromPMCFile, pmcidsToLastUpdate = None, None
        needPMIDsFromPMC = any(hashesInfo['removePMCOADuplicates']
                               for hashesInfo in toolSettings["pubmed_hashes"])
        if needPMIDsFromPMC:
            print(
                "\nGetting Pubmed Central metadata for PMID info and/or file dates"
            )
            pmidsFromPMCFile, pmcidsToLastUpdate = downloadPMCOAMetadata(
                workingDirectory)

        # Resource directories that have hashes; conversions reading from
        # them also get PMID chunk files further down.
        directoriesWithHashes = set()
        if toolSettings["pubmed_hashes"] != []:
            print("\nUsing Pubmed Hashes to identify updates")
            for hashesInfo in toolSettings["pubmed_hashes"]:
                hashDirectory = hashesInfo['hashDir']
                whichHashes = hashesInfo['whichHashes']
                removePMCOADuplicates = hashesInfo['removePMCOADuplicates']

                directoriesWithHashes.add(hashesInfo['resourceDir'])

                pmidDirectory = hashesInfo["resourceDir"].rstrip(
                    '/') + '.pmids'
                print("Using hashes in %s to identify PMID updates" %
                      hashDirectory)
                if removePMCOADuplicates:
                    assert not pmidsFromPMCFile is None
                    pubrunner.gatherPMIDs(hashDirectory,
                                          pmidDirectory,
                                          whichHashes=whichHashes,
                                          pmidExclusions=pmidsFromPMCFile)
                else:
                    pubrunner.gatherPMIDs(hashDirectory,
                                          pmidDirectory,
                                          whichHashes=whichHashes)

        print("\nRunning conversions")
        for conversionInfo in toolSettings["conversions"]:
            inDir, inFormat = conversionInfo['inDir'], conversionInfo[
                'inFormat']
            outDir, outFormat = conversionInfo['outDir'], conversionInfo[
                'outFormat']
            chunkSize = conversionInfo['chunkSize']
            useHashes = (inDir in directoriesWithHashes)

            # Chunk assignments from the previous run are stored next to
            # the output directory and passed to assignFilesForConversion.
            chunksFile = outDir + '.json'
            previousChunks = {}
            if os.path.isfile(chunksFile):
                with open(chunksFile, 'r') as f:
                    previousChunks = json.load(f)

            # Order the input files by modification time (oldest first) and
            # remember each file's timestamp for the chunk files below.
            allInputFiles = findFiles(inDir)
            timestamps = [os.path.getmtime(f) for f in allInputFiles]
            allInputFiles = sorted(list(zip(timestamps, allInputFiles)))
            timestampMap = {f: timestamp for timestamp, f in allInputFiles}
            allInputFiles = [f for timestamp, f in allInputFiles]

            outPattern = os.path.basename(inDir).replace(
                '_UNCONVERTED', '') + ".%08d." + outFormat
            newChunks = assignFilesForConversion(allInputFiles, previousChunks,
                                                 outDir, outPattern, chunkSize)

            with open(chunksFile, 'w') as f:
                json.dump(newChunks, f, indent=2)

            # Write one chunk file per output file, listing its inputs and
            # carrying the newest input's modification time.
            chunkDir = outDir + '.CHUNKS'
            if os.path.isdir(chunkDir):
                shutil.rmtree(chunkDir)
            os.makedirs(chunkDir)
            for outputfile, inputfiles in newChunks.items():
                latestTimestamp = max(timestampMap[inputfile]
                                      for inputfile in inputfiles)
                chunkFile = os.path.join(chunkDir,
                                         os.path.basename(outputfile))
                with open(chunkFile, 'w') as f:
                    json.dump(inputfiles, f)
                os.utime(chunkFile, (latestTimestamp, latestTimestamp))

            parameters = {
                'CHUNKDIR': chunkDir,
                'OUTDIR': outDir,
                'INFORMAT': inFormat,
                'OUTFORMAT': outFormat
            }

            if useHashes:
                pmidDir = inDir.rstrip('/') + '.pmids'
                # Check the PMID directory of THIS conversion. The previous
                # code tested `pmidDirectory`, a leftover from the last
                # iteration of the hashes loop above, which is the wrong
                # directory whenever more than one resource has hashes.
                assert os.path.isdir(
                    pmidDir
                ), "Cannot find PMIDs directory for resource. Tried: %s" % pmidDir

                pmidChunkDir = outDir + '.PMIDCHUNKS'
                if os.path.isdir(pmidChunkDir):
                    shutil.rmtree(pmidChunkDir)
                os.makedirs(pmidChunkDir)
                for outputfile, inputfiles in newChunks.items():
                    pmidfiles = [
                        os.path.join(pmidDir,
                                     os.path.basename(f) + '.pmids')
                        for f in inputfiles
                    ]
                    latestTimestamp = max(
                        os.path.getmtime(pmidfile) for pmidfile in pmidfiles)
                    pmidChunkFile = os.path.join(pmidChunkDir,
                                                 os.path.basename(outputfile))
                    with open(pmidChunkFile, 'w') as f:
                        json.dump(pmidfiles, f)
                    os.utime(pmidChunkFile, (latestTimestamp, latestTimestamp))

                parameters['PMIDCHUNKDIR'] = pmidChunkDir

            convertSnakeFile = os.path.join(pubrunner.__path__[0],
                                            'Snakefiles', 'Convert.py')
            pubrunner.launchSnakemake(convertSnakeFile, parameters=parameters)

    else:
        print("\nNot getting resources (--nogetresource)")

    # Run every build command, then every run command, one Snakemake
    # invocation per command.
    runSnakeFile = os.path.join(pubrunner.__path__[0], 'Snakefiles', 'Run.py')
    for commandGroup in ["build", "run"]:
        for i, command in enumerate(toolSettings[commandGroup]):
            print("\nStarting '%s' command #%d: %s" %
                  (commandGroup, i + 1, command))
            useClusterIfPossible = True
            parameters = {'COMMAND': command, 'DATADIR': workingDirectory}
            pubrunner.launchSnakemake(runSnakeFile,
                                      useCluster=useClusterIfPossible,
                                      parameters=parameters)
            print("")

    if "output" in toolSettings:
        # "output" may be a single entry or a list of entries.
        outputList = toolSettings["output"]
        if not isinstance(outputList, list):
            outputList = [outputList]

        outputLocList = [os.path.join(workingDirectory, o) for o in outputList]

        print(
            "\nExecution of tool is complete. Full paths of output files are below:"
        )
        for f in outputLocList:
            print('  %s' % f)
        print()

        if outputdir is not None:
            print("\nCopying results to output directory: %s" % outputdir)
            if not os.path.isdir(outputdir):
                os.makedirs(outputdir)
            for o in outputList:
                fromFile = os.path.join(workingDirectory, o)
                toFile = os.path.join(outputdir, o)
                shutil.copy(fromFile, toFile)

        # Results are only uploaded for full runs, never for test runs.
        if mode != 'test':

            dataurl = None
            if "upload" in globalSettings:
                if "ftp" in globalSettings["upload"]:
                    print("Uploading results to FTP")
                    pubrunner.pushToFTP(outputLocList, toolSettings,
                                        globalSettings)
                if "local-directory" in globalSettings["upload"]:
                    print("Uploading results to local directory")
                    pubrunner.pushToLocalDirectory(outputLocList, toolSettings,
                                                   globalSettings)
                if "zenodo" in globalSettings["upload"]:
                    print("Uploading results to Zenodo")
                    dataurl = pubrunner.pushToZenodo(outputLocList,
                                                     toolSettings,
                                                     globalSettings)
Example #6
0
def getResourceLocation(resource):
    """Return the path <resources dir>/<resource> from the global settings.

    The resources directory is read from the "storage" -> "resources" entry
    of the global settings, with a leading ~ expanded. Nothing is created or
    checked on disk.
    """
    storageSettings = pubrunner.getGlobalSettings()["storage"]
    resourcesRoot = os.path.expanduser(storageSettings["resources"])
    return os.path.join(resourcesRoot, resource)