Example #1: apply_vqsr_indel
def apply_vqsr_indel(job, shared_ids, input_args):
    """
    Apply variant quality score recalibration (VQSR) to indel variants.
    Writes the recalibrated VCF to the output directory.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    suffix = input_args['suffix']
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'unified.raw.BOTH.gatk.vcf',
                   'HAPINDEL.recal', 'HAPINDEL.tranches', 'HAPINDEL.plots']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '{}.HAPSNP.vqsr.INDEL{}.vcf'.format(uuid, suffix)
    command = ['-T', 'ApplyRecalibration',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-o', output,
               '-R', 'ref.fa',
               '-nt', '1',
               '-ts_filter_level', '99.0',
               '-tranchesFile', 'HAPINDEL.tranches',
               '-recalFile', 'HAPINDEL.recal',
               '-mode', 'INDEL']
    docker_call(work_dir = work_dir,
                tool_parameters = command,
                tool = 'quay.io/ucsc_cgl/gatk',
                sudo = input_args['sudo'])

    upload_or_move_hc(work_dir, input_args, output)
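
Every example in this listing shells out through a shared docker_call helper. Below is a minimal sketch of that helper, inferred only from the call sites shown here; the mount convention (work_dir exposed to the container as /data) is an assumption, not something the source confirms.

import subprocess

def docker_call(work_dir, tool, tool_parameters, docker_parameters=None,
                no_rm=False, sudo=False, check_output=False):
    """Sketch: run `tool` in Docker with work_dir mounted (assumed at /data)."""
    base = ['sudo'] if sudo else []
    base += ['docker', 'run']
    if not no_rm:
        base.append('--rm')
    base += ['-v', work_dir + ':/data']
    if docker_parameters:
        base += docker_parameters
    cmd = base + [tool] + list(tool_parameters)
    if check_output:
        # Callers strip the trailing newline themselves via [:-1].
        return subprocess.check_output(cmd)
    subprocess.check_call(cmd)
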
Example #2: start (Spark and HDFS master containers)
    def start(self):
        """
        Start the Spark and HDFS master containers.
        """
        log.write("start masters\n")
        log.flush()
        
        if os.uname()[0] == "Darwin":
            machine = check_output(["docker-machine", "ls"]).split("\n")[1].split()[0]
            self.IP = check_output(["docker-machine", "ip", machine]).strip()
        else:
            self.IP = check_output(["hostname", "-f",])[:-1]

        self.sparkContainerID = docker_call(no_rm = True,
                                            work_dir = os.getcwd(),
                                            tool = "quay.io/ucsc_cgl/apache-spark-master:1.5.2",
                                            docker_parameters = ["--net=host",
                                                                 "-d",
                                                                 "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                                                                 "-e", "SPARK_MASTER_IP="+self.IP,
                                                                 "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                                                                 "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"],
                                            tool_parameters = [],
                                            sudo = self.sudo,
                                            check_output = True)[:-1]
        self.hdfsContainerID = docker_call(no_rm = True,
                                           work_dir = os.getcwd(),
                                           tool = "quay.io/ucsc_cgl/apache-hadoop-master:2.6.2",
                                           docker_parameters = ["--net=host",
                                                                "-d"],
                                           tool_parameters = [self.IP],
                                           sudo = self.sudo,
                                           check_output = True)[:-1]
        return self.IP
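
The class only shows startup here; below is a hypothetical companion stop method (not in the source), sketched from the docker kill pattern that appears in the worker retry loop of Example #12.

    def stop(self):
        """
        Hypothetical teardown: kill the master containers started by start().
        """
        from subprocess import check_call
        for container_id in (self.sparkContainerID, self.hdfsContainerID):
            check_call(["docker", "kill", container_id])
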
Example #3: vqsr_snp
def vqsr_snp(job, shared_ids, input_args):
    """
    Variant quality score recalibration for SNP variants.
    Calls the next step in the pipeline: apply SNP recalibration.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'unified.raw.BOTH.gatk.vcf',
                   'hapmap.vcf', 'omni.vcf', 'dbsnp.vcf', 'phase.vcf']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    outputs = ['HAPSNP.recal', 'HAPSNP.tranches', 'HAPSNP.plots']
    command = ['-T', 'VariantRecalibrator',
               '-R', 'ref.fa',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-nt', input_args['cpu_count'],
               '-resource:hapmap,known=false,training=true,truth=true,prior=15.0', 'hapmap.vcf',
               '-resource:omni,known=false,training=true,truth=false,prior=12.0', 'omni.vcf',
               '-resource:dbsnp,known=true,training=false,truth=false,prior=2.0', 'dbsnp.vcf',
               '-resource:1000G,known=false,training=true,truth=false,prior=10.0', 'phase.vcf',
               '-an', 'QD', '-an', 'DP', '-an', 'FS', '-an', 'ReadPosRankSum',
               '-mode', 'SNP', '-minNumBad', '1000',
               '-recalFile', 'HAPSNP.recal',
               '-tranchesFile', 'HAPSNP.tranches',
               '-rscriptFile', 'HAPSNP.plots']
    docker_call(work_dir = work_dir,
                tool_parameters = command,
                tool ='quay.io/ucsc_cgl/gatk',
                sudo = input_args['sudo'])
    shared_ids = write_to_filestore(job, work_dir, shared_ids, *outputs)
    job.addChildJobFn(apply_vqsr_snp, shared_ids, input_args)
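
The child job apply_vqsr_snp is referenced above but not reproduced in this listing. Below is a hedged reconstruction that assumes it mirrors apply_vqsr_indel from Example #1 and consumes the HAPSNP outputs written by vqsr_snp; the real implementation and output naming may differ.

def apply_vqsr_snp(job, shared_ids, input_args):
    """
    Apply variant quality score recalibration for SNP variants.
    (Sketch mirroring apply_vqsr_indel; the real implementation may differ.)
    """
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    suffix = input_args['suffix']
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'unified.raw.BOTH.gatk.vcf',
                   'HAPSNP.recal', 'HAPSNP.tranches', 'HAPSNP.plots']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '{}.HAPSNP.vqsr.SNP{}.vcf'.format(uuid, suffix)
    command = ['-T', 'ApplyRecalibration',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-o', output,
               '-R', 'ref.fa',
               '-nt', '1',
               '-ts_filter_level', '99.0',
               '-tranchesFile', 'HAPSNP.tranches',
               '-recalFile', 'HAPSNP.recal',
               '-mode', 'SNP']
    docker_call(work_dir = work_dir,
                tool_parameters = command,
                tool = 'quay.io/ucsc_cgl/gatk',
                sudo = input_args['sudo'])
    upload_or_move_hc(work_dir, input_args, output)
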
Example #4: vqsr_indel
def vqsr_indel(job, shared_ids, input_args):
    """
    Variant quality score recalibration for indel variants.
    Calls the next step in the pipeline: apply indel recalibration.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'unified.raw.BOTH.gatk.vcf', 'mills.vcf']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    outputs = ['HAPINDEL.recal', 'HAPINDEL.tranches', 'HAPINDEL.plots']
    command = ['-T', 'VariantRecalibrator',
               '-R', 'ref.fa',
               '-input', 'unified.raw.BOTH.gatk.vcf',
               '-nt', input_args['cpu_count'],
               '-resource:mills,known=true,training=true,truth=true,prior=12.0', 'mills.vcf',
               '-an', 'DP', '-an', 'FS', '-an', 'ReadPosRankSum',
               '-mode', 'INDEL',
               '-minNumBad', '1000',
               '-recalFile', 'HAPINDEL.recal',
               '-tranchesFile', 'HAPINDEL.tranches',
               '-rscriptFile', 'HAPINDEL.plots',
               '--maxGaussians', '4']
    docker_call(work_dir = work_dir,
                tool_parameters = command,
                tool ='quay.io/ucsc_cgl/gatk',
                sudo = input_args['sudo'])
    shared_ids = write_to_filestore(job, work_dir, shared_ids, *outputs)
    job.addChildJobFn(apply_vqsr_indel, shared_ids, input_args)
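
Both recalibration jobs lean on the file-store helpers read_from_filestore_hc and write_to_filestore. Below are minimal sketches of the behavior those helpers are assumed to have, based only on how they are called in these examples.

import os

def read_from_filestore_hc(job, work_dir, shared_ids, *filenames):
    # Assumed behavior: localize each promised file from the Toil file store
    # into work_dir under its well-known name.
    for name in filenames:
        destination = os.path.join(work_dir, name)
        if not os.path.exists(destination):
            job.fileStore.readGlobalFile(shared_ids[name], destination)

def write_to_filestore(job, work_dir, shared_ids, *filenames):
    # Assumed behavior: register each local output with the file store and
    # return the updated mapping, as the call sites above expect.
    for name in filenames:
        shared_ids[name] = job.fileStore.writeGlobalFile(os.path.join(work_dir, name))
    return shared_ids
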
Example #5: call_conductor
def call_conductor(masterIP, inputs, src, dst):
    """
    Invokes the Conductor container to transfer data from src to dst.
    """
    docker_call(no_rm = True,
                work_dir = os.getcwd(),
                tool = "quay.io/ucsc_cgl/conductor",
                docker_parameters = ["--net=host"],
                tool_parameters = ["--master", "spark://"+masterIP+":"+SPARK_MASTER_PORT,
                 "--conf", "spark.driver.memory=%sg" % inputs["driverMemory"],
                 "--conf", "spark.executor.memory=%sg" % inputs["executorMemory"],
                 "--", "-C", src, dst],
                sudo = inputs['sudo'])
Example #6: call_adam
def call_adam(masterIP, inputs, arguments):
    """
    Invokes the ADAM container, prepending the Spark master, memory, and HDFS
    configuration to the caller-supplied arguments.
    """

    default_params = ["--master", ("spark://%s:%s" % (masterIP, SPARK_MASTER_PORT)), 
                      "--conf", ("spark.driver.memory=%sg" % inputs["driverMemory"]),
                      "--conf", ("spark.executor.memory=%sg" % inputs["executorMemory"]),
                      "--conf", ("spark.hadoop.fs.default.name=hdfs://%s:%s" % (masterIP, HDFS_MASTER_PORT)),
                      "--"]

    docker_call(no_rm = True,
                work_dir = os.getcwd(),
                tool = "quay.io/ucsc_cgl/adam:962-ehf--6e7085f8cac4b9a927dc9fb06b48007957256b80",
                docker_parameters = ["--net=host"],
                tool_parameters = default_params + arguments,
                sudo = inputs['sudo'])
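
Below is a usage sketch for the two wrappers above. The inputs values, bucket, and ADAM subcommand are illustrative placeholders rather than values from the pipeline; only the dictionary keys and the masterIP/port wiring come from the code above.

inputs = {'driverMemory': '4', 'executorMemory': '8', 'sudo': False}  # illustrative
master_ip = "10.0.0.1"  # illustrative

# Stage a file onto HDFS with Conductor, then run ADAM against it.
hdfs_prefix = "hdfs://" + master_ip + ":" + HDFS_MASTER_PORT
call_conductor(master_ip, inputs,
               "s3://example-bucket/sample.bam", hdfs_prefix + "/sample.bam")
call_adam(master_ip, inputs,
          ["transform", hdfs_prefix + "/sample.bam", hdfs_prefix + "/sample.adam"])
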
Example #7: haplotype_caller
def haplotype_caller(job, shared_ids, input_args):
    """
    Uses GATK HaplotypeCaller to identify SNPs and indels and writes a gVCF.
    Calls the per-sample genotyper to genotype the gVCF.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['ref.fa', 'ref.fa.fai', 'ref.dict', 'toil.bam', 'toil.bam.bai']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = '%s.raw.BOTH%s.gvcf' % (input_args['uuid'],
                                     input_args['suffix'])
    
    # Call GATK -- HaplotypeCaller
    command = ['-nct', input_args['cpu_count'],
               '-R', 'ref.fa',
               '-T', 'HaplotypeCaller',
               '--genotyping_mode', 'Discovery',
               '--emitRefConfidence', 'GVCF',
               '-I', 'toil.bam',
               '-o', output,
               '-variant_index_type', 'LINEAR',
               '-variant_index_parameter', '128000',
               '--annotation', 'QualByDepth',
               '--annotation', 'DepthPerSampleHC',
               '--annotation', 'FisherStrand',
               '--annotation', 'ReadPosRankSumTest']
    try:
        docker_call(work_dir = work_dir,
                    tool_parameters = command,
                    tool = 'quay.io/ucsc_cgl/gatk',
                    sudo = input_args['sudo'])
    except:
        sys.stderr.write("Running haplotype caller with %s in %s failed.\n" % (
            " ".join(command), work_dir))
        raise

    # Update fileStore and spawn child job
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # upload gvcf
    upload_or_move_hc(work_dir, input_args, output)

    # call variants prior to vqsr
    job.addChildJobFn(genotype_gvcf, shared_ids, input_args)
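
upload_or_move_hc is only ever called with the work dir, the input arguments, and an output filename. Below is a local-only sketch of the assumed behavior; the output_dir key is an assumption, and any remote-upload path the real helper supports is deliberately omitted.

import os
import shutil

def upload_or_move_hc(work_dir, input_args, output):
    # Assumed behavior: deliver the finished file to a configured output
    # directory. 'output_dir' is a hypothetical key; remote upload is omitted.
    output_dir = input_args['output_dir']
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    shutil.move(os.path.join(work_dir, output), os.path.join(output_dir, output))
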
Example #8: genotype_gvcf
def genotype_gvcf(job, shared_ids, input_args):
    """
    Genotypes the gVCF generated by the HaplotypeCaller.
    Calls variant quality score recalibration functions.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """

    work_dir = job.fileStore.getLocalTempDir()
    input_files = ['%s.raw.BOTH%s.gvcf' % (input_args['uuid'],
                                           input_args['suffix']),
                   'ref.fa', 'ref.fa.fai', 'ref.dict']
    read_from_filestore_hc(job, work_dir, shared_ids, *input_files)
    output = 'unified.raw.BOTH.gatk.vcf'
    
    command = ['-nt', input_args['cpu_count'],
               '-R', 'ref.fa',
               '-T', 'GenotypeGVCFs',
               '--variant', '%s.raw.BOTH%s.gvcf' % (input_args['uuid'], input_args['suffix']),
               '--out', output,
               '-stand_emit_conf', '10.0',
               '-stand_call_conf', '30.0']

    try:
        docker_call(work_dir = work_dir,
                    tool_parameters = command,
                    tool = 'quay.io/ucsc_cgl/gatk',
                    sudo = input_args['sudo'])
    except:
        sys.stderr.write("Running GenotypeGVCFs with %s in %s failed.\n" % (
            " ".join(command), work_dir))
        raise

    # Update fileStore and spawn child job
    shared_ids[output] = job.fileStore.writeGlobalFile(os.path.join(work_dir, output))

    # run vqsr
    job.addChildJobFn(vqsr_snp, shared_ids, input_args)
    job.addChildJobFn(vqsr_indel, shared_ids, input_args)
Example #9: index
def index(job, shared_ids, input_args):
    """
    Indexes the sample BAM using samtools, then calls the HaplotypeCaller step.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    work_dir = job.fileStore.getLocalTempDir()
    # Localize the BAM into the work dir (the returned path is not needed)
    return_input_paths(job, work_dir, shared_ids, 'toil.bam')
    output_path = os.path.join(work_dir, 'toil.bam.bai')
    # Call: index the normal.bam
    parameters = ['index', 'toil.bam']
    docker_call(work_dir = work_dir,
                tool_parameters = parameters,
                tool = 'quay.io/ucsc_cgl/samtools',
                sudo = input_args['sudo'])
    # Update FileStore and call child
    shared_ids['toil.bam.bai'] = job.fileStore.writeGlobalFile(output_path)
    job.addChildJobFn(haplotype_caller, shared_ids, input_args)
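
return_input_paths appears in Examples #9 to #11 only for its side effect of localizing files into the work dir. Below is a sketch of the assumed helper, inferred from those call sites; the real helper may behave differently.

import os

def return_input_paths(job, work_dir, shared_ids, *filenames):
    # Assumed behavior: localize each promised file into work_dir and return
    # its path (a single path for one name, a list of paths for several).
    paths = []
    for name in filenames:
        path = os.path.join(work_dir, name)
        if not os.path.exists(path):
            job.fileStore.readGlobalFile(shared_ids[name], path)
        paths.append(path)
    return paths[0] if len(paths) == 1 else paths
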
Example #10: create_reference_dict_hc
def create_reference_dict_hc(job, shared_ids, input_args):
    """
    Uses Picard to create a sequence dictionary for the reference genome.
    Calls the next step in the pipeline: spawn batch variant-calling jobs.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Localize the reference FASTA into the work dir (the returned path is not needed)
    return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    # Call: picardtools
    picard_output = os.path.join(work_dir, 'ref.dict')
    command = ['CreateSequenceDictionary', 'R=ref.fa', 'O=ref.dict']
    docker_call(work_dir = work_dir,
                tool_parameters = command,
                tool = 'quay.io/ucsc_cgl/picardtools',
                sudo = input_args['sudo'])
    # Update fileStore for output
    shared_ids['ref.dict'] = job.fileStore.writeGlobalFile(picard_output)
    job.addChildJobFn(spawn_batch_variant_calling, shared_ids, input_args)
Example #11: create_reference_index_hc
def create_reference_index_hc(job, shared_ids, input_args):
    """
    Uses samtools to create the reference index file in the working directory,
    then spawns the next job in the pipeline: create the reference dictionary.

    :param job: Job instance
    :param shared_ids: dictionary of shared file promises
    :param input_args: dictionary of input arguments
    """
    # Unpack convenience variables for job
    work_dir = job.fileStore.getLocalTempDir()
    # Localize the reference FASTA into the work dir (the returned path is not needed)
    return_input_paths(job, work_dir, shared_ids, 'ref.fa')
    faidx_output = os.path.join(work_dir, 'ref.fa.fai')
    # Call: Samtools
    faidx_command = ['faidx', 'ref.fa']
    docker_call(work_dir = work_dir,
                tool_parameters = faidx_command,
                tool = 'quay.io/ucsc_cgl/samtools',
                sudo = input_args['sudo'])
    # Update fileStore for output
    shared_ids['ref.fa.fai'] = job.fileStore.writeGlobalFile(faidx_output)
    job.addChildJobFn(create_reference_dict_hc, shared_ids, input_args)
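
Examples #9 to #11 chain into the rest of the pipeline through child jobs, starting from the reference-index step. Below is a hedged sketch of how a Toil entry point might launch that chain; the real script's argument parsing and the jobs that populate shared_ids with downloaded inputs are not shown in this listing and are omitted here.

from toil.job import Job

def run_pipeline(shared_ids, input_args):
    # Hypothetical driver: create_reference_index_hc chains
    # create_reference_dict_hc -> spawn_batch_variant_calling -> ... on its own.
    options = Job.Runner.getDefaultOptions('./toilJobStore')
    root = Job.wrapJobFn(create_reference_index_hc, shared_ids, input_args)
    Job.Runner.startToil(root, options)
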
Example #12: start (Spark and HDFS worker containers)
    def start(self):
        """
        Start the Spark and HDFS worker containers.
        """
        log.write("start workers\n")
        log.flush()

        self.sparkContainerID = docker_call(no_rm = True,
                                            work_dir = os.getcwd(),
                                            tool = "quay.io/ucsc_cgl/apache-spark-worker:1.5.2",
                                            docker_parameters = ["--net=host",
                                                                 "-d",
                                                                 "-v", "/mnt/ephemeral/:/ephemeral/:rw",
                                                                 "-e", "SPARK_MASTER_IP="+self.masterIP+":"+SPARK_MASTER_PORT,
                                                                 "-e", "SPARK_LOCAL_DIRS=/ephemeral/spark/local",
                                                                 "-e", "SPARK_WORKER_DIR=/ephemeral/spark/work"],
                                            tool_parameters = [self.masterIP+":"+SPARK_MASTER_PORT],
                                            sudo = self.sudo,
                                            check_output = True)[:-1]
        
        self.hdfsContainerID = docker_call(no_rm = True,
                                           work_dir = os.getcwd(),
                                           tool = "quay.io/ucsc_cgl/apache-hadoop-worker:2.6.2",
                                           docker_parameters = ["--net=host",
                                                                "-d",
                                                                "-v", "/mnt/ephemeral/:/ephemeral/:rw"],
                                           tool_parameters = [self.masterIP],
                                           sudo = self.sudo,
                                           check_output = True)[:-1]
        
        # fake do/while to check if HDFS is up
        hdfs_down = True
        retries = 0
        while hdfs_down and (retries < 5):

            sys.stderr.write("Sleeping 30 seconds before checking HDFS startup.\n")
            time.sleep(30)
            clusterID = ""
            try:
                clusterID = check_output(["docker",
                                          "exec",
                                          self.hdfsContainerID,
                                          "grep",
                                          "clusterID",
                                          "-R",
                                          "/opt/apache-hadoop/logs"])
            except:
                # grep returns a non-zero exit code if the pattern is not found
                # we expect to not find the pattern, so a non-zero code is OK
                pass

            if "Incompatible" in clusterID:
                sys.stderr.write("Hadoop Datanode failed to start with: %s\n" % clusterID)
                sys.stderr.write("Retrying container startup, retry #%d.\n" % retries)
                retries += 1

                sys.stderr.write("Removing ephemeral hdfs directory.\n")
                check_call(["docker",
                            "exec",
                            self.hdfsContainerID,
                            "rm",
                            "-rf",
                            "/ephemeral/hdfs"])

                sys.stderr.write("Killing container %s.\n" % self.hdfsContainerID)
                check_call(["docker",
                            "kill",
                            self.hdfsContainerID])

                # todo: this is copied code. clean up!
                sys.stderr.write("Restarting datanode.\n")
                self.hdfsContainerID = docker_call(no_rm = True,
                                                   work_dir = os.getcwd(),
                                                   tool = "quay.io/ucsc_cgl/apache-hadoop-worker:2.6.2",
                                                   docker_parameters = ["--net=host",
                                                                        "-d",
                                                                        "-v", "/mnt/ephemeral/:/ephemeral/:rw"],
                                                   tool_parameters = [self.masterIP],
                                                   sudo = self.sudo,
                                                   check_output = True)[:-1]

            else:
                sys.stderr.write("HDFS datanode started up OK!\n")
                hdfs_down = False

        if retries >= 5:
            raise RuntimeError("Failed %d times trying to start HDFS datanode." % retries)
                                   
        return
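
The retry loop above flags its duplicated datanode startup with a todo. One hedged way to factor it out while keeping the exact same docker_call parameters (the helper name is illustrative, not from the source):

    def _start_hdfs_worker(self):
        """
        Hypothetical helper extracting the duplicated datanode start call so
        start() and the retry loop stay in sync.
        """
        return docker_call(no_rm = True,
                           work_dir = os.getcwd(),
                           tool = "quay.io/ucsc_cgl/apache-hadoop-worker:2.6.2",
                           docker_parameters = ["--net=host",
                                                "-d",
                                                "-v", "/mnt/ephemeral/:/ephemeral/:rw"],
                           tool_parameters = [self.masterIP],
                           sudo = self.sudo,
                           check_output = True)[:-1]
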