# Example 1
def main():
    """
    Clean out any previous build artefacts, run PyInstaller to build a fresh
    distribution, and zip up the result for distribution.
    """
    log.info("Deleting old distribution...")
    for stale in (BUILD_DIR, DIST_SUBDIR):  # NOT DIST_DIR
        shutil.rmtree(stale, ignore_errors=True)
    for needed in (BUILD_DIR, DIST_DIR):
        os.makedirs(needed, exist_ok=True)

    log.info("Building new distribution...")
    pyinstaller_args = ['pyinstaller', '--clean', '--log-level=INFO']
    pyinstaller_args += PYINSTALLER_EXTRA_OPTIONS
    pyinstaller_args += [SPECFILE]
    with pushd(CWD_FOR_PYINSTALLER):
        log.debug("In directory: {}".format(os.getcwd()))
        log.debug("Running PyInstaller with args: {!r}".format(
            pyinstaller_args))
        subprocess.check_call(pyinstaller_args)

    log.info("Zipping to {!r}...".format(ZIPFILEBASE))
    zipfile = shutil.make_archive(ZIPFILEBASE, ZIPFORMAT, DIST_SUBDIR)

    log.info("""
The {DIST_SUBDIR} directory should contain everything you need to run.
Run with: {LAUNCHFILE}
Look for warnings in: {WARNFILE}
To distribute, use {zipfile}
    """.format(
        DIST_SUBDIR=DIST_SUBDIR,
        LAUNCHFILE=LAUNCHFILE,
        WARNFILE=WARNFILE,
        zipfile=zipfile,
    ))
# Example 2
def untar_to_directory(tarfile: str,
                       directory: str,
                       verbose: bool = False,
                       gzipped: bool = False,
                       skip_if_dir_exists: bool = True,
                       run_func: RunFuncType = None,
                       chdir_via_python: bool = True,
                       tar_executable: str = None,
                       tar_supports_force_local: bool = None) -> None:
    """
    Unpacks a TAR file into a specified directory.

    Args:
        tarfile:
            filename of the ``.tar`` file
        directory:
            destination directory
        verbose:
            be verbose?
        gzipped:
            is the ``.tar`` also gzipped, e.g. a ``.tar.gz`` file?
        skip_if_dir_exists:
            don't do anything if the destination directory exists?
        run_func:
            function to use to call an external command; if ``None``,
            :func:`subprocess.check_call` is used
        chdir_via_python:
            change directory via Python, not via ``tar``. Consider using this
            via Windows, because Cygwin ``tar`` v1.29 falls over when given a
            Windows path for its ``-C`` (or ``--directory``) option.
        tar_executable:
            name of the ``tar`` executable (default is ``tar``)
        tar_supports_force_local:
            does tar support the ``--force-local`` switch? If you pass ``None``
            (the default), this is checked directly via ``tar --help``.
            Linux/GNU tar does; MacOS tar doesn't; Cygwin tar does; Windows 10
            (build 17063+) tar doesn't.
    """
    if skip_if_dir_exists and os.path.isdir(directory):
        log.info("Skipping extraction of {} as directory {} exists", tarfile,
                 directory)
        return
    if run_func is None:
        # BUG FIX: previously, leaving run_func at its default of None led to
        # "'NoneType' object is not callable" below. Fall back to a plain
        # (exception-raising) subprocess call.
        import subprocess
        run_func = subprocess.check_call
    tar = which_and_require(tar_executable or "tar")
    if tar_supports_force_local is None:
        # Probe the actual executable; support varies by platform (see above).
        tar_supports_force_local = tar_supports_force_local_switch(tar)
    log.info("Extracting {} -> {}", tarfile, directory)
    mkdir_p(directory)
    args = [tar, "-x"]  # -x: extract
    if verbose:
        args.append("-v")  # -v: verbose
    if gzipped:
        args.append("-z")  # -z: decompress using gzip
    if tar_supports_force_local:
        args.append("--force-local")  # allows filenames with colons in
    args.extend(["-f", tarfile])  # -f: filename follows
    if chdir_via_python:
        with pushd(directory):
            run_func(args)
    else:
        # chdir via tar
        args.extend(["-C", directory])  # -C: change to directory
        run_func(args)
# Example 3
def prepare_umls_for_bioyodie(cfg: UmlsBioyodieConversionConfig) -> None:
    """
    Prepare downloaded UMLS data for Bio-YODIE, according to the instructions
    at https://github.com/GateNLP/bio-yodie-resource-prep.

    The steps, in order: clone the Bio-YODIE preparation repository; unpack
    the UMLS download and its bundled MetamorphoSys (MMSYS) tool; run
    MetamorphoSys in batch mode to produce a Metathesaurus subset; then run
    the Bio-YODIE builder script over that subset and copy its output to
    ``cfg.dest_dir``.

    Args:
        cfg:
            configuration object; the attributes read here are ``java_home``,
            ``gate_home``, ``umls_zip``, ``tmp_dir``, ``dest_dir``,
            ``scala_url``, ``bioyodie_prep_repo_url`` and
            ``groovy_executable``.
    """
    # -------------------------------------------------------------------------
    # Parameter checks
    # -------------------------------------------------------------------------
    assert cfg.java_home
    assert cfg.gate_home

    # -------------------------------------------------------------------------
    # Establish the release (version)
    # -------------------------------------------------------------------------
    # There are two releases per year, e.g. 2017AA and 2017AB.
    release_regex = regex.compile(r"umls-(\d\d\d\dA[AB])-full.zip")
    umls_zip_basename = os.path.basename(cfg.umls_zip)
    try:
        release = release_regex.match(umls_zip_basename).group(1)
    except AttributeError:  # 'NoneType' object has no attribute 'group'
        release = None  # for type-checker only (below)
        die(f"Unable to work out UMLS release from filename: "
            f"{umls_zip_basename!r}")

    # -------------------------------------------------------------------------
    # Directory names
    # -------------------------------------------------------------------------
    umls_root_dir = join(cfg.tmp_dir, "umls_data_with_mmsys")
    umls_metadir = umls_root_dir
    umls_mmsys_home = umls_metadir
    # ... because the GUI installer wants "release.dat" (which is in the root
    # and config/2017AA directories of "mmsys.zip") to be in the same directory
    # as the Metathesaurus files. Do NOT put it in a "MMSYS" subdirectory,
    # despite
    # https://www.nlm.nih.gov/research/umls/implementation_resources/community/mmsys/BatchMRCXTBuilder.html
    umls_lib_dir = join(umls_mmsys_home, "lib")
    umls_plugins_dir = join(umls_mmsys_home, "plugins")

    umls_output_dir = join(cfg.tmp_dir, "umls_output")
    # ... Where we tell it to store data.
    # Log files and other output go here.

    bioyodie_repo_dir = join(cfg.tmp_dir, "bio-yodie-resource-prep")
    bioyodie_db_dir = join(bioyodie_repo_dir, "databases")
    bioyodie_scala_dir = join(bioyodie_repo_dir, "scala")
    bioyodie_tmpdata_dir = join(bioyodie_repo_dir, "tmpdata")
    bioyodie_umls_dir_containing_symlink = join(
        bioyodie_repo_dir, "srcs", "umls", "2015AB")  # hard-coded "2015AB"
    bioyodie_umls_input_dir = join(bioyodie_umls_dir_containing_symlink,
                                   "META")  # hard-coded "META"
    bioyodie_output_dir = join(bioyodie_repo_dir, "output")

    # -------------------------------------------------------------------------
    # Filenames
    # -------------------------------------------------------------------------
    scala_tgz = join(bioyodie_scala_dir, "scala.tgz")
    builder_script = join(bioyodie_repo_dir, "bin", "all.sh")
    mmsys_zip = join(umls_root_dir, "mmsys.zip")
    config_file = join(umls_metadir, "config.properties")
    boot_config = join(umls_mmsys_home, "etc", "subset.boot.properties")
    log4j_config = join(umls_mmsys_home, "etc",
                        "rudolf.log4j.properties")  # new  # noqa
    # ... a fresh log4j config file that we write ourselves (below).

    system_java_home = cfg.java_home
    umls_java_home = join(umls_mmsys_home, "jre", "linux")  # it brings its own

    # -------------------------------------------------------------------------
    # Checks
    # -------------------------------------------------------------------------
    # Fail early: the final copytree (below) requires that dest_dir not exist.
    if os.path.exists(cfg.dest_dir):
        die(f"Directory already exists: {cfg.dest_dir}")
    system_unzip = require_external_tool("unzip")
    # These are required by the Bio-YODIE preprocessor:
    groovy_executable = cfg.groovy_executable or require_external_tool(
        "groovy")  # noqa
    require_external_tool("gzip")
    require_external_tool("zcat")

    # -------------------------------------------------------------------------
    # Environment variables
    # -------------------------------------------------------------------------
    # For UMLS: use the JRE that ships inside mmsys.zip.
    umls_env = os.environ.copy()
    umls_env[EnvVar.JAVA_HOME] = umls_java_home
    # For Bio-YODIE preprocessor: use the system Java and GATE, with Groovy's
    # directory prepended to PATH.
    bioyodie_env = os.environ.copy()
    bioyodie_env[EnvVar.JAVA_HOME] = system_java_home
    bioyodie_env[EnvVar.GATE_HOME] = cfg.gate_home
    groovy_dir = os.path.dirname(os.path.abspath(groovy_executable))
    old_path = bioyodie_env.get(EnvVar.PATH, "")
    new_path_with_groovy = os.pathsep.join(x for x in [groovy_dir, old_path]
                                           if x)
    bioyodie_env[EnvVar.PATH] = new_path_with_groovy

    # -------------------------------------------------------------------------
    log.info("Cloning Bio-YODIE resource prep repository...")
    # -------------------------------------------------------------------------
    check_call_verbose(
        ["git", "clone", cfg.bioyodie_prep_repo_url, bioyodie_repo_dir])

    # -------------------------------------------------------------------------
    log.info("Making directories...")
    # -------------------------------------------------------------------------
    mkdir_p(umls_output_dir)
    mkdir_p(bioyodie_db_dir)
    # mkdir_p(bioyodie_scala_dir)  # already exists
    mkdir_p(bioyodie_tmpdata_dir)
    mkdir_p(bioyodie_umls_dir_containing_symlink)
    mkdir_p(bioyodie_output_dir)

    # -------------------------------------------------------------------------
    log.info("Fetching/building Scala for the BioYODIE processor...")
    # -------------------------------------------------------------------------
    # ... either before we set JAVA_HOME (to use the system Java) or after
    # we've unpacked MMSYS (which brings its own JRE), but not in between!
    download(cfg.scala_url, scala_tgz)
    with pushd(bioyodie_scala_dir):
        check_call_verbose(["tar", "-xzvf", scala_tgz])
        check_call_verbose(["ant"], env=bioyodie_env)

    # -------------------------------------------------------------------------
    log.info("Unzipping UMLS data...")
    # -------------------------------------------------------------------------
    check_call_verbose(["unzip", "-j", cfg.umls_zip, "-d", umls_root_dir])
    # -j: junk paths (extract "flat" into the specified directory)

    # -------------------------------------------------------------------------
    log.info("Unzipping UMLS MetamorphoSys (MMSYS) program (and its JRE)...")
    # -------------------------------------------------------------------------
    check_call_verbose(["unzip", mmsys_zip, "-d", umls_mmsys_home])
    # "To ensure proper functionality users must unzip mmsys.zip to the same
    # directory as the other downloaded files."
    # -- https://www.ncbi.nlm.nih.gov/books/NBK9683/
    # ... but see also example above.

    # -------------------------------------------------------------------------
    log.info("Running MetamorphoSys in batch mode...")
    # -------------------------------------------------------------------------
    # https://www.nlm.nih.gov/research/umls/implementation_resources/community/mmsys/BatchMetaMorphoSys.html  # noqa
    # NOTE: ":" as the classpath separator implies a Unix-like host.
    classpath = ":".join([
        umls_mmsys_home,
        umls_plugins_dir,  # RNC extra
        join(umls_lib_dir, "jpf-boot.jar"),
        join(umls_lib_dir, "jpf.jar"),  # RNC extra
        # You can use "dir/*" to mean "all JAR files in a directory":
        # https://en.wikipedia.org/wiki/Classpath
        join(umls_plugins_dir, "gov.nih.nlm.umls.meta", "lib",
             "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.mmsys", "lib",
             "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.mmsys.gui", "lib",
             "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.mmsys.io", "lib",
             "*"),  # RNC extra  # noqa
        join(umls_plugins_dir, "gov.nih.nlm.umls.util", "lib",
             "*"),  # RNC extra  # noqa
    ])
    write_text(
        config_file,
        get_mmsys_configfile_text(metadir=umls_metadir,
                                  mmsys_home=umls_mmsys_home,
                                  release=release))
    write_text(log4j_config, LOG4J_PROPERTIES_TEXT)
    with pushd(umls_mmsys_home):
        log.warning(f"The next step is slow, and doesn't say much. "
                    f"It produces roughly 29 Gb at peak. "
                    f"Watch progress with: "
                    f"watch 'du -bc {cfg.tmp_dir} | tail -1'")
        check_call_verbose(
            [
                join(cfg.java_home, "bin", "java"),
                "-classpath",
                classpath,
                "-Djava.awt.headless=true",
                f"-Djpf.boot.config={boot_config}",
                f"-Dlog4j.configurationFile={log4j_config}",
                # not "log4j.configuration" as in the original! Argh.
                # http://logging.apache.org/log4j/2.x/manual/configuration.html
                f"-Dinput.uri={umls_metadir}",
                f"-Doutput.uri={umls_output_dir}",
                f"-Dmmsys.config.uri={config_file}",
                # Additional from run_linux.sh:
                "-client",  # JVM option: client rather than server mode
                "-Dunzip.native=true",
                f"-Dunzip.path={system_unzip}",
                "-Dfile.encoding=UTF-8",
                "-Xms1000M",  # was 300M, but it's 1000M in run_linux.sh
                "-Xmx2000M",  # was 1000M, but it's 2000M in run_linux.sh
                "org.java.plugin.boot.Boot"
            ],
            env=umls_env)

    # -------------------------------------------------------------------------
    log.info("Converting UMLS data to Bio-YODIE format...")
    # -------------------------------------------------------------------------
    # Bio-YODIE expects its UMLS input at the hard-coded ".../2015AB/META"
    # path (see directory names above); symlink that path to the
    # MetamorphoSys output rather than copying it.
    os.symlink(src=umls_output_dir,
               dst=bioyodie_umls_input_dir,
               target_is_directory=True)
    with pushd(bioyodie_repo_dir):
        log.warning("The next step is also slow.")
        check_call_verbose([builder_script], env=bioyodie_env)

    # -------------------------------------------------------------------------
    log.info(f"Moving Bio-YODIE data to destination directory: {cfg.dest_dir}")
    # -------------------------------------------------------------------------
    output_files = os.listdir(bioyodie_output_dir)
    if output_files:
        shutil.copytree(bioyodie_output_dir, cfg.dest_dir)
        # ... destination should not already exist
        # ... it will make intermediate directories happily
    else:
        log.error(f"No output files in {bioyodie_output_dir}! "
                  f"Did the Bio-YODIE preprocessor partly crash?")
# Example 4
def launch_slurm(jobname: str,
                 cmd: str,
                 memory_mb: int,
                 project: str,
                 qos: str,
                 email: str,
                 duration: timedelta,
                 tasks_per_node: int,
                 cpus_per_task: int,
                 partition: str = "",
                 modules: List[str] = None,
                 directory: str = "",
                 encoding: str = "ascii") -> None:
    """
    Launch a job into the SLURM environment.

    Builds an ``sbatch`` submission script (with ``#SBATCH`` directives for
    the parameters below) and pipes it to ``sbatch`` on stdin.

    Args:
        jobname: name of the job
        cmd: command to be executed
        memory_mb: maximum memory requirement per process (Mb)
        project: project name
        qos: quality-of-service name
        email: user's e-mail address
        duration: maximum duration per job
        tasks_per_node: tasks per (cluster) node
        cpus_per_task: CPUs per task
        partition: cluster partition name (if blank, no partition directive
            is emitted)
        modules: SLURM modules to load (default: ``["default-wbic"]``)
        directory: directory to change to before submitting (default: the
            current working directory at call time)
        encoding: encoding to apply to launch script as sent to ``sbatch``
    """
    # BUG FIX: the previous default, "directory: str = os.getcwd()", was
    # evaluated ONCE at import time; if the process changed directory later,
    # the stale import-time directory was silently used. An empty-string
    # sentinel resolved here gives the current directory at call time.
    if not directory:
        directory = os.getcwd()
    if partition:
        partition_cmd = f"#SBATCH -p {partition}"
    else:
        partition_cmd = ""
    if modules is None:
        # Avoid a mutable default argument; fill in the standard module here.
        modules = ["default-wbic"]

    log.info("Launching SLURM job: {}", jobname)
    script = f"""#!/bin/bash

#! Name of the job:
#SBATCH -J {jobname}

#! Which project should jobs run under:
#SBATCH -A {project}

#! What QoS [Quality of Service] should the job run in?
#SBATCH --qos={qos}

#! How much resource should be allocated?
#SBATCH --tasks-per-node={tasks_per_node}
#SBATCH --cpus-per-task={cpus_per_task}

#! Memory requirements
#SBATCH --mem={memory_mb}

#! How much wall-clock time will be required?
#SBATCH --time={strfdelta(duration, SLURM_TIMEDELTA_FMT)}

#! What e-mail address to use for notifications?
#SBATCH --mail-user={email}

#! What types of email messages do you wish to receive?
#SBATCH --mail-type=ALL

#! Uncomment this to prevent the job from being requeued (e.g. if
#! interrupted by node failure or system downtime):
#! SBATCH --no-requeue

#! Partition
{partition_cmd}

#! sbatch directives end here (put any additional directives above this line)

#! ############################################################
#! Modify the settings below to specify the application's environment, location
#! and launch method:

#! Optionally modify the environment seen by the application
#! (note that SLURM reproduces the environment at submission irrespective of ~/.bashrc):
. /etc/profile.d/modules.sh                # Leave this line (enables the module command)
module purge                               # Removes all modules still loaded
module load {" ".join(modules)}            # Basic one, e.g. default-wbic, is REQUIRED - loads the basic environment

#! Insert additional module load commands after this line if needed:

#! Full path to your application executable:
application="hostname"

#! Run options for the application:
options=""

#! Work directory (i.e. where the job will run):
workdir="$SLURM_SUBMIT_DIR"  # The value of SLURM_SUBMIT_DIR sets workdir to the directory
                             # in which sbatch is run.

#! Are you using OpenMP (NB this is **unrelated to OpenMPI**)? If so increase this
#! safe value to no more than 24:
export OMP_NUM_THREADS=24

# Command line to be submited by SLURM:
CMD="{cmd}"

###############################################################
### You should not have to change anything below this line ####
###############################################################

cd $workdir
echo -e "Changed directory to `pwd`.\n"

JOBID=$SLURM_JOB_ID

echo -e "JobID: $JOBID\n======"
echo "Time: `date`"
echo "Running on master node: `hostname`"
echo "Current directory: `pwd`"

if [ "$SLURM_JOB_NODELIST" ]; then
    #! Create a machine file:
    export NODEFILE=`/usr/bin/generate_pbs_nodefile`
    cat $NODEFILE | uniq > machine.file.$JOBID
    echo -e "\nNodes allocated:\n================"
    echo `cat machine.file.$JOBID | sed -e 's/\..*$//g'`
fi

echo -e "\nExecuting command:\n==================\n$CMD\n"

eval $CMD
    """  # noqa
    cmdargs = ["sbatch"]
    with pushd(directory):
        # Feed the generated script to sbatch via stdin.
        p = Popen(cmdargs, stdin=PIPE)
        p.communicate(input=script.encode(encoding))