Example #1
def verify_conf(parser):
    # Verify whatever preconditions we can verify
    ref = GlobalConf['reference_archive']
    if not (ref.endswith('.tar.gz') or ref.endswith('.tar.bz2')
            or ref.endswith('.tar')):
        parser.error("Reference {} doesn't seem to be an archive!".format(ref))
    if not phdfs.path.exists(ref):
        parser.error("Reference {} doesn't seem to exist".format(ref))

    if GlobalConf['job_manager_mem'] <= 100:
        parser.error("job_manager_mem of {:d} is too low".format(
            GlobalConf['job_manager_mem']))

    if GlobalConf['task_manager_mem'] <= 1000:
        parser.error("task_manager_mem of {:d} is too low".format(
            GlobalConf['task_manager_mem']))

    if GlobalConf.get('session_wait', 0) < 0:
        parser.error(
            "session_wait, if present, must be >= 0 (found {})".format(
                GlobalConf['session_wait']))

    # test whether we can find the executables we need to run
    for e in ('yarn-session.sh', 'flink', 'seal', 'yarn', 'hdfs'):
        get_exec(e)
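
All of these examples call a get_exec helper that isn't shown on this page. A minimal sketch of what such a helper could look like, assuming it only has to locate the named program on PATH and fail loudly otherwise (hypothetical, not the project's actual implementation):

import os

def get_exec(name):
    # Hypothetical sketch: search PATH for an executable file with this name.
    for directory in os.environ.get('PATH', '').split(os.pathsep):
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    raise RuntimeError("Couldn't find executable '{}' on PATH".format(name))

Used this way, verify_conf's final loop simply fails at configuration time if any of the required tools (yarn-session.sh, flink, seal, yarn, hdfs) is missing.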
Example #2
def parse_args(args):
    p = make_parser()
    options = p.parse_args(args)

    # check bcl converter and bwa path
    if options.converter_path:
        if not os.path.exists(options.converter_path):
            p.error("Specified converter doesn't exist")
        if not os.access(options.converter_path, os.X_OK | os.R_OK):
            p.error("Specified converter is not executable")
    else:
        options.converter_path = get_exec('bcl2fastq')

    if options.bwa_path:
        if not os.path.exists(options.bwa_path):
            p.error("Specified bwa doesn't exist")
        if not os.access(options.bwa_path, os.X_OK | os.R_OK):
            p.error("Specified bwa is not executable")
    else:
        options.bwa_path = get_exec('bwa')

    if options.keep_intermediate and options.skip_bcl:
        p.error("--keep-intermediate and --skip-bcl are incompatible")

    try:
        log_level = getattr(logging, options.log_level)
        options.log_level = log_level # overwrite the existing value
    except AttributeError as e:
        # this should never happen since we restricted the valid
        # choices at the level of the argument parser
        p.error("Invalid log level! " + e.message)

    verify_conf(p)

    return options
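
Example #2 calls make_parser, which isn't shown. Judging only from the attributes read back from the parsed options (converter_path, bwa_path, keep_intermediate, skip_bcl, log_level), it might be built roughly like the sketch below; apart from --keep-intermediate and --skip-bcl, which appear verbatim in the error message above, the flag spellings, help texts and log-level choices are assumptions:

import argparse

def make_parser():
    # Hypothetical reconstruction of the parser used by parse_args() above.
    p = argparse.ArgumentParser(description="bcl conversion and alignment pipeline")
    p.add_argument('--converter-path', help="path to the bcl2fastq executable (assumed flag name)")
    p.add_argument('--bwa-path', help="path to the bwa executable (assumed flag name)")
    p.add_argument('--keep-intermediate', action='store_true',
                   help="keep intermediate conversion output")
    p.add_argument('--skip-bcl', action='store_true',
                   help="skip the bcl conversion step")
    p.add_argument('--log-level', default='INFO',
                   choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'])
    return p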
Example #3
def run_alignments(bcl_output_dir, output_dir):
    sample_directories = _get_samples_from_bcl_output(bcl_output_dir)
    logger.info("Found %d samples in bcl output directory", len(sample_directories))
    logger.debug("Making base output directory %s", output_dir)
    phdfs.mkdir(output_dir)
    # launch all the jobs
    base_cmd = [
            get_exec('seal'), 'seqal', '--align-only',
            '-D', 'seal.seqal.nthreads={:d}'.format(GlobalConf['seqal_nthreads']),
            '-D', 'mapreduce.map.cpu.vcores={:d}'.format(GlobalConf['seqal_yarn_cores']),
            '--input-format', GlobalConf.get('seqal_input_fmt', 'prq'),
            '--output-format', GlobalConf.get('seqal_output_fmt', 'sam'),
            '--ref-archive', GlobalConf['reference_archive'],
        ]
    def start_job(sample_dir):
        sample_output_dir = phdfs.path.join(output_dir, os.path.basename(sample_dir))
        cmd = base_cmd + [ sample_dir, sample_output_dir ]
        # LP: should refactor to start the job within the AlignJob object
        job = AlignJob(cmd=cmd, inputp=sample_dir, outputp=sample_output_dir)
        logger.info("Launching alignment of sample %s", os.path.basename(sample_dir))
        logger.debug("executing command: %s", cmd)
        job.popen_obj = subprocess.Popen(map(str, cmd), bufsize=4096)
        job.popen_obj.poll()
        logger.debug("job running with PID %d", job.popen_obj.pid)
        return job

    jobs = [ start_job(s) for s in sample_directories ]
    ok = _wait(jobs, GlobalConf['remove_output'])
    if not ok:
        errored_jobs = [ j for j in jobs if j.failed ]
        logger.error("%d alignment jobs failed", len(errored_jobs))
        logger.error("Here are the return codes: %s", ', '.join([ str(j.retcode) for j in errored_jobs ]))
        raise RuntimeError("Some alignment jobs failed")
Example #4
def _start_flink_yarn_session(n_nodes):
    """
    :return: yarn application id of the session
    """
    cmd = [
        get_exec('yarn-session.sh'),
        '-n',
        n_nodes * 2,
        '-jm',
        GlobalConf['job_manager_mem'],  # job manager memory
        '-tm',
        GlobalConf['task_manager_mem'],  # task manager memory
        '-s',
        GlobalConf['slots'],
        '-d',  # run in detached mode
    ]
    logger.info("Starting flink session on Yarn in detached mode")
    logger.info(
        "Configuration:\n\tnodes: %d\n\tjm mem: %d\n\ttm mem: %d\n\tslots: %d",
        n_nodes, GlobalConf['job_manager_mem'], GlobalConf['task_manager_mem'],
        GlobalConf['slots'])
    logger.debug("executing command: %s", cmd)
    try:
        output = subprocess.check_output(map(str, cmd))
    except subprocess.CalledProcessError:
        logger.error("Failed to start Flink session on Yarn!")
        raise

    logger.debug(
        "Session output\n============================================================\n"
        "%s\n============================================================",
        output)
    app_id = _parse_session_output(output)
    logger.info("Flink session started with application id '%s'", app_id)
    state, final_state = _get_app_status(app_id)

    while state != 'RUNNING' and final_state == 'UNDEFINED':
        logger.debug(
            "Waiting for session to enter the RUNNING state (currently in %s)",
            state)
        time.sleep(2)
        state, final_state = _get_app_status(app_id)

    if final_state != 'UNDEFINED':
        raise RuntimeError(
            "Problem!! Flink session {} has terminated!  Final state: {}".
            format(app_id, final_state))

    logger.info("Flink session %s RUNNING", app_id)

    if GlobalConf.get('session_wait', 0) > 0:
        logger.info(
            "Waiting for %d seconds to flink session to start TaskManagers",
            GlobalConf['session_wait'])
        time.sleep(GlobalConf['session_wait'])
        logger.debug("Wait finished.")

    return app_id
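
Example #4 leans on two helpers that aren't shown: _parse_session_output, which pulls the YARN application id out of the yarn-session.sh output, and _get_app_status, which returns the (state, final state) pair for that application. The sketches below are assumptions about how they could work; in particular, the application-id pattern and the 'State' / 'Final-State' fields of the yarn application -status report are assumptions about the YARN CLI output, not something shown here:

import re
import subprocess

def _parse_session_output(output):
    # YARN application ids have the form application_<cluster timestamp>_<sequence>
    match = re.search(r'application_\d+_\d+', output)
    if not match:
        raise RuntimeError("Couldn't find a YARN application id in the session output")
    return match.group(0)

def _get_app_status(app_id):
    # Assumes the report printed by `yarn application -status <app_id>` contains
    # lines such as "State : RUNNING" and "Final-State : UNDEFINED".
    report = subprocess.check_output([get_exec('yarn'), 'application', '-status', app_id])
    state = final_state = None
    for line in report.splitlines():
        key, _, value = line.partition(':')
        key = key.strip()
        if key == 'State':
            state = value.strip()
        elif key == 'Final-State':
            final_state = value.strip()
    return state, final_state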
Example #5
def _run_converter_and_yarn_session(input_dir, output_dir, n_nodes, jar_path):
    # setup properties file
    run_dir = tempfile.mkdtemp(prefix="bclconverter_run_dir")
    try:
        ## start by preparing the properties file (at the moment the program
        # doesn't accept command line arguments)
        tmp_conf_dir = os.path.join(run_dir, "conf")
        os.makedirs(tmp_conf_dir)
        props_file = os.path.join(tmp_conf_dir, GlobalConf['props_filename'])
        with open(props_file, 'w') as f:
            f.write("root = {}/\n".format(input_dir.rstrip('/')))
            f.write("fout = {}/\n".format(output_dir.rstrip('/')))
            f.write("numTasks = {:d}\n".format(GlobalConf['tasksPerNode'] *
                                               n_nodes))
            f.write("flinkpar = {:d}\n".format(GlobalConf['flinkpar']))
            f.write("jnum = {:d}\n".format(GlobalConf['jnum']))

        logger.info("Wrote properties in file %s", props_file)
        if logger.isEnabledFor(logging.DEBUG):
            with open(props_file) as f:
                logger.debug(
                    "\n=============================\n%s\n=====================\n",
                    f.read())
        # now run the program
        logger.debug("Running flink cwd %s", run_dir)
        cmd = [
            get_exec("flink"),
            "run",
            "-m",
            "yarn-cluster",
            '-yn',
            n_nodes,
            '-yjm',
            GlobalConf['job_manager_mem'],  # job manager memory
            '-ytm',
            GlobalConf['task_manager_mem'],  # task manager memory
            '-ys',
            GlobalConf['slots'],
            "-c",
            "bclconverter.bclreader.test",  # class name
            jar_path
        ]
        logger.debug("executing command: %s", cmd)
        with chdir(run_dir):
            logger.debug("In CWD, where we're going to run flink")
            logger.debug("cat conf/bclconverter.properties gives:")
            subprocess.check_call("cat conf/bclconverter.properties",
                                  shell=True)
            logger.debug("Now running flink")
            subprocess.check_call(map(str, cmd), cwd=run_dir)
    finally:
        logger.debug("Removing run directory %s", run_dir)
        try:
            shutil.rmtree(run_dir)
        except (IOError, OSError) as e:  # rmtree raises OSError on failure
            logger.debug("Error cleaning up temporary dir %s", run_dir)
            logger.debug(str(e))
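
The with chdir(run_dir): block above relies on a small context manager that isn't part of the standard library. Assuming it only needs to switch the working directory and restore the previous one on exit, a minimal sketch could be:

import os
from contextlib import contextmanager

@contextmanager
def chdir(path):
    # Temporarily change the current working directory, then restore it.
    old_cwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(old_cwd)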
Example #6
def _yarn_kill_all_apps():
    error = False
    yarn_exec = get_exec('yarn')
    for app_id in _yarn_get_app_ids():
        cmd = [ yarn_exec, 'application', '-kill', app_id ]
        logger.debug("killing application %s: %s", app_id, cmd)
        retcode = subprocess.call(cmd)
        if retcode != 0:
            logger.info("Failed to kill yarn application %s", app_id)
            error = True
    if error:
        raise RuntimeError("Failed to kill some running yarn applications")
Example #7
    def _clear_caches(self):
        logger.info("Clearing system caches on cluster")
        nodes = yarn_get_node_list()
        hostnames = set([n.split(':')[0].strip() for n in nodes])
        logger.debug("Found %d yarn nodemanager hosts", len(hostnames))

        clean_cmd = "sudo sh -c 'echo 3 >/proc/sys/vm/drop_caches'"
        logger.debug("Using pdsh")
        pdsh_cmd = [
            get_exec('pdsh'), '-R', 'ssh', '-w', ','.join(hostnames), clean_cmd
        ]

        logger.debug("cmd: %s", pdsh_cmd)
        subprocess.check_call(pdsh_cmd)
Example #8
def _wait(jobs, remove_output):
    logger.info("Waiting for jobs to finish")
    running = list(jobs)
    secs = 0
    poll_freq = 2
    failed = False
    while running and not failed:
        failed = any((j.failed for j in running))
        if failed:
            break
        # update running list
        new_running = []
        for j in running:
            if j.done:  # job just finished
                logger.info("Alignment job writing to %s just finished",
                            phdfs.path.basename(j.output_path))
                if remove_output:
                    logger.info("Removing output path %s", j.output_path)
                    _try_remove_hdfs_dir(j.output_path)
            else:
                new_running.append(j)
        running = new_running
        if secs % 8 == 0:
            logger.info("%d jobs (out of %d) haven't finished", len(running),
                        len(jobs))
        if secs % 60 == 0:
            logger.debug("Logging free disk space situation")
            subprocess.call([get_exec('hdfs'), 'dfsadmin', '-report'])
        if running:
            time.sleep(poll_freq)
            secs += poll_freq
    if failed:
        logger.error("We have failed jobs :-(")
        logger.error("Killing  all remaining jobs on Yarn cluster")
        try:
            _yarn_kill_all_apps()
        except StandardError as e:
            logger.error("Failed to clean up yarn cluster.  Sorry!")
            logger.exception(e)
    else:
        logger.info("All jobs finished")

    return not failed
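
_try_remove_hdfs_dir isn't shown either. Assuming phdfs is pydoop's hdfs module (which matches the phdfs.path and phdfs.mkdir calls used elsewhere in these examples) and that failures should be logged rather than propagated, a possible sketch:

def _try_remove_hdfs_dir(path):
    # Hypothetical sketch: best-effort recursive removal of an HDFS directory.
    try:
        phdfs.rmr(path)
        return True
    except IOError as e:
        logger.error("Failed to remove HDFS path %s", path)
        logger.exception(e)
    return False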
Example #9
def _yarn_get_app_ids():
    yarn_exec = get_exec('yarn')
    yarn_output = subprocess.check_output([ yarn_exec, 'application', '-list' ])
    # skip the two header lines of the listing and ignore blank lines
    app_ids = [ line.split('\t', 1)[0] for line in yarn_output.splitlines()[2:] if line.strip() ]
    return app_ids