Example #1
import os
from subprocess import check_output

# spawn_spark_cluster is assumed to come from toil-lib's spark helpers;
# _count_child is defined elsewhere in the original module.
from toil_lib.spark import spawn_spark_cluster


def _count(job, workers):
    
    # if we are on Mac OS X and using docker-machine to run docker, we need to
    # get the IP of the docker-machine box
    #
    # this is necessary because docker-machine runs docker in a virtualbox
    # vm which has a different IP address from localhost
    ip = None
    if os.uname()[0] == "Darwin":
        # list the machines that docker-machine is running;
        # check_output returns bytes on Python 3, so decode before splitting
        machines = check_output(["docker-machine", "ls"]).decode().strip().split("\n")
        
        # we expect exactly one running docker-machine, i.e. two lines of
        # output including the header
        if len(machines) != 2:
            raise RuntimeError('Expected a single docker-machine to be running. '
                               'Got %d:\n%r.' % (len(machines) - 1, machines))

        machine = machines[1].split()[0]
        ip = check_output(["docker-machine", "ip", machine]).decode().strip()

    # set up cluster
    masterHostname = spawn_spark_cluster(job,
                                         workers,
                                         cores=1,
                                         overrideLeaderIP=ip)

    job.addChildJobFn(_count_child, masterHostname)
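
A minimal launcher for the snippet above, assuming _count is the root job of the workflow; the job-store path and worker count are placeholders, and only Toil's standard Job.Runner / Toil APIs are used:

from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # Placeholder options; a real workflow would parse these from its CLI.
    options = Job.Runner.getDefaultOptions("./spark-count-jobstore")
    options.logLevel = "INFO"

    # Wrap _count as the root job with two Spark workers (arbitrary choice).
    root = Job.wrapJobFn(_count, 2)

    with Toil(options) as toil:
        toil.start(root)
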
Example #2
import multiprocessing

# download_run_and_upload and spawn_spark_cluster are provided by the
# surrounding workflow module and toil-lib, respectively.
def static_avocado_dag(job, inputs, sample, output_dir, suffix=''):
    """
    A Toil job function performing Avocado preprocessing on a single sample
    """
    inputs.sample = sample
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    if inputs.master_ip is not None or inputs.run_local:
        # Static, external Spark cluster
        spark_on_toil = False
        spark_work = job.wrapJobFn(download_run_and_upload, inputs.master_ip,
                                   inputs, spark_on_toil)
        job.addChild(spark_work)
    else:
        # Dynamic subclusters, i.e. Spark-on-Toil
        spark_on_toil = True
        cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(
            job,
            inputs.num_nodes - 1,
            cores=cores,
            memory=inputs.memory,
            sparkMasterContainer="fnothaft/apache-spark-master",
            sparkWorkerContainer="fnothaft/apache-spark-worker")
        spark_work = job.wrapJobFn(download_run_and_upload, master_ip, inputs,
                                   spark_on_toil)
        job.addChild(spark_work)
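
The snippet reads several fields from inputs (master_ip, run_local, num_nodes, memory) whose construction is not shown; the sketch below assumes a plain argparse.Namespace and placeholder S3 paths to show how the DAG might be kicked off:

from argparse import Namespace

from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    # Hypothetical inputs object; the original pipeline builds this from its CLI.
    inputs = Namespace(master_ip=None,   # None -> spawn a Spark-on-Toil subcluster
                       run_local=False,
                       num_nodes=3,
                       memory="8g")

    options = Job.Runner.getDefaultOptions("./avocado-jobstore")
    root = Job.wrapJobFn(static_avocado_dag, inputs,
                         "s3a://bucket/sample.bam",  # placeholder sample URL
                         "s3a://bucket/output/")     # placeholder output directory

    with Toil(options) as toil:
        toil.start(root)
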
Example #3
import multiprocessing

# download_run_and_upload and spawn_spark_cluster are provided by the
# surrounding workflow module and toil-lib, respectively.
def static_cannoli_dag(job, inputs, sample, sample_id, output_dir, suffix=''):
    """
    A Toil job function performing alignment using cannoli on a single sample
    """
    inputs.sample = sample
    inputs.sample_id = sample_id
    inputs.output_dir = output_dir
    inputs.suffix = suffix

    if inputs.master_ip is not None or inputs.run_local:
        # Static, external Spark cluster
        spark_on_toil = False
        spark_work = job.wrapJobFn(download_run_and_upload, inputs.master_ip,
                                   inputs, spark_on_toil)
        job.addChild(spark_work)
    else:
        # Dynamic subclusters, i.e. Spark-on-Toil
        spark_on_toil = True
        cores = multiprocessing.cpu_count()
        master_ip = spawn_spark_cluster(
            job,
            inputs.num_nodes - 1,
            cores=cores,
            memory=inputs.memory,
            sparkMasterContainer="fnothaft/apache-spark-master:2.1.0--74e45e9a58550e14db0e1ad48624c839ebd5e8f8",
            sparkWorkerContainer="fnothaft/apache-spark-worker:2.1.0--74e45e9a58550e14db0e1ad48624c839ebd5e8f8")
        spark_work = job.wrapJobFn(download_run_and_upload, master_ip, inputs,
                                   spark_on_toil)
        job.addChild(spark_work)
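
Examples #2 and #3 repeat the same static-versus-dynamic branching; a hypothetical refactor (not part of the source) could pull it into one helper that either reuses an external Spark master or spawns a Spark-on-Toil subcluster:

import multiprocessing

def attach_spark_work(job, inputs, work_fn,
                      master_container=None, worker_container=None):
    """Schedule work_fn against an external Spark master when one is
    configured (or in local mode), otherwise against a Spark-on-Toil
    subcluster spawned via toil-lib's spawn_spark_cluster."""
    if inputs.master_ip is not None or inputs.run_local:
        # Static, external Spark cluster (or local mode): nothing to spawn.
        spark_on_toil = False
        master_ip = inputs.master_ip
    else:
        # Dynamic Spark-on-Toil subcluster.
        spark_on_toil = True
        kwargs = {}
        if master_container is not None:
            kwargs["sparkMasterContainer"] = master_container
        if worker_container is not None:
            kwargs["sparkWorkerContainer"] = worker_container
        master_ip = spawn_spark_cluster(job,
                                        inputs.num_nodes - 1,
                                        cores=multiprocessing.cpu_count(),
                                        memory=inputs.memory,
                                        **kwargs)

    spark_work = job.wrapJobFn(work_fn, master_ip, inputs, spark_on_toil)
    job.addChild(spark_work)
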
Example #4
import multiprocessing
import os

# download_url_job, call_deca_cnvs, require, and spawn_spark_cluster are
# provided by the surrounding workflow module and toil-lib.
def setup_deca_state(job, input_files, targets, output, memory, run_local,
                     num_nodes, aws_access_key_id, aws_secret_access_key):
    """
    A Toil job function that stages inputs and schedules DECA CNV calling,
    either locally or against a spawned Spark cluster
    """

    if run_local:

        # import bams
        loaded_files = []
        for f in input_files:

            file_name = os.path.basename(f)
            file_id = job.wrapJobFn(download_url_job, f)
            job.addChild(file_id)

            loaded_files.append((file_name, file_id.rv()))

        # import target file
        target_id = job.wrapJobFn(download_url_job, targets)
        job.addChild(target_id)
        target = (os.path.basename(targets), target_id.rv())

        call_cnvs = job.wrapJobFn(call_deca_cnvs, loaded_files, target, output,
                                  memory, run_local, None, aws_access_key_id,
                                  aws_secret_access_key)
        job.addFollowOn(call_cnvs)

    else:

        # all files must have s3 urls
        def is_s3(f):
            require(f.startswith("s3a"),
                    "url for file %s did not start with s3a scheme" % f)

        is_s3(targets)
        is_s3(output)
        for f in input_files:
            is_s3(f)

        # launch the spark cluster
        master_ip = spawn_spark_cluster(job,
                                        int(num_nodes) - 1,
                                        cores=multiprocessing.cpu_count(),
                                        memory=memory)

        call_cnvs = job.wrapJobFn(call_deca_cnvs, input_files, targets, output,
                                  memory, False, master_ip, aws_access_key_id,
                                  aws_secret_access_key)
        job.addChild(call_cnvs)
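
A minimal launcher for this job function might look like the sketch below; the bucket paths and credentials are placeholders, and arguments are passed positionally because Toil's wrapJobFn reserves keyword arguments such as memory for job resource requirements:

from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./deca-jobstore")

    # Placeholder inputs: when run_local is False, every URL must use the
    # s3a:// scheme, as enforced by the is_s3 check above.
    bams = ["s3a://bucket/sample1.bam", "s3a://bucket/sample2.bam"]
    root = Job.wrapJobFn(setup_deca_state,
                         bams,
                         "s3a://bucket/targets.bed",  # targets/intervals file
                         "s3a://bucket/cnvs/",        # output location
                         "8g",                        # memory handed to DECA
                         False,                       # run_local
                         3,                           # num_nodes
                         None,                        # aws_access_key_id (placeholder)
                         None)                        # aws_secret_access_key (placeholder)

    with Toil(options) as toil:
        toil.start(root)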