Example 1
def run_adam_markdups(job, sampleId):
    """
    Convert a BAM from the Toil file store to ADAM format, then mark
    duplicate reads with ADAM running in a local Docker container.
    """

    work_dir = job.fileStore.getLocalTempDir()

    # Copy the input BAM out of the file store into the local temp dir
    job.fileStore.readGlobalFile(sampleId, os.path.join(work_dir, 'reads.bam'))

    add_docker_parameters = ['-v', '{}:/data'.format(work_dir)]
    _log.info("Converting BAM to ADAM format.")
    call_adam(job,
              None, ["transform", "/data/reads.bam", "/data/reads.adam"],
              memory=str(job.memory),
              run_local=True,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    _log.info("Marking duplicate reads using ADAM.")
    call_adam(job,
              None, [
                  "transform", "/data/reads.adam", "/data/reads.sorted.adam",
                  "-mark_duplicate_reads", "-limit_projection"
              ],
              memory=str(job.memory),
              run_local=True,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)
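
A minimal driver sketch, not part of the source above: it stages a BAM into the Toil file store and hands its ID to run_adam_markdups. The job store path, file URL, module name, and memory value are all placeholders.

from toil.common import Toil
from toil.job import Job

from pipeline import run_adam_markdups  # hypothetical module name

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')  # placeholder job store
    with Toil(options) as toil:
        # importFile copies the BAM into the job store and returns the file ID
        # that run_adam_markdups later reads back with readGlobalFile.
        sample_id = toil.importFile('file:///data/sample.bam')  # placeholder path
        toil.start(Job.wrapJobFn(run_adam_markdups, sample_id, memory='8G'))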
Example 2
def adam_convert(job, master_ip, inputs, in_file, in_snps, adam_file,
                 adam_snps, spark_on_toil):
    """
    Convert input sam/bam file and known SNPs file into ADAM format
    """

    add_docker_parameters = []
    if inputs.run_local:
        add_docker_parameters.extend(
            ['-v', '{}:/data'.format(inputs.local_dir)])

    _log.info("Converting input BAM to ADAM.")
    call_adam(job,
              master_ip, ["transform", in_file, adam_file],
              memory=inputs.memory,
              run_local=inputs.run_local,
              native_adam_path=inputs.native_adam_path,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    in_file_name = in_file.split("/")[-1]
    remove_file(master_ip, in_file_name, spark_on_toil)

    _log.info("Converting known sites VCF to ADAM.")

    call_adam(job,
              master_ip, ["vcf2adam", "-only_variants", in_snps, adam_snps],
              memory=inputs.memory,
              run_local=inputs.run_local,
              native_adam_path=inputs.native_adam_path,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    in_snps_name = in_snps.split("/")[-1]
    remove_file(master_ip, in_snps_name, spark_on_toil)
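
For reference, a hedged sketch of the inputs namespace these jobs read from; the attribute names are taken from the code above, while every value is purely illustrative.

from argparse import Namespace

# Values are illustrative; only the attribute names come from the examples.
inputs = Namespace(
    memory='8G',             # forwarded to call_adam for the ADAM container
    run_local=True,          # operate on local files instead of HDFS
    local_dir='/tmp/adam',   # host directory mounted into the container as /data
    native_adam_path=None,   # path to a native ADAM install; None selects Docker
)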
Example 3
def run_adam_bqsr(job, sampleId, dbsnp):
    """
    Convert a BAM and a dbSNP VCF to ADAM format, then recalibrate base
    qualities with ADAM using the known SNPs.
    """

    work_dir = job.fileStore.getLocalTempDir()

    # Copy the input BAM out of the file store into the local temp dir
    job.fileStore.readGlobalFile(sampleId, os.path.join(work_dir, 'reads.bam'))

    add_docker_parameters = ['-v', '{}:/data'.format(work_dir)]
    _log.info("Converting BAM to ADAM format.")
    call_adam(job,
              None, ["transform", "/data/reads.bam", "/data/reads.adam"],
              memory=str(job.memory),
              run_local=True,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    # Copy the dbSNP VCF into the same temp dir (already mounted as /data)
    job.fileStore.readGlobalFile(dbsnp, os.path.join(work_dir, 'dbsnp.vcf'))

    _log.info("Converting dbSNP VCF to ADAM format.")
    call_adam(
        job,
        None,
        ["vcf2adam", "/data/dbsnp.vcf", "/data/dbsnp.adam", "-only_variants"],
        memory=str(job.memory),
        run_local=True,
        container='fnothaft/adam',
        add_docker_parameters=add_docker_parameters)

    _log.info("Recalibrating base qualities using ADAM.")
    call_adam(job,
              None, [
                  "transform", "/data/reads.adam", "/data/reads.sorted.adam",
                  "-recalibrate_base_qualities", "-known_snps",
                  "/data/dbsnp.adam", "-limit_projection"
              ],
              memory=str(job.memory),
              run_local=True,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)
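
The driver pattern from the sketch after Example 1 extends naturally here, staging both input files and passing their file-store IDs along. Everything below the Toil imports is a placeholder.

from toil.common import Toil
from toil.job import Job

from pipeline import run_adam_bqsr  # hypothetical module name

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./bqsr-jobstore')  # placeholder
    with Toil(options) as toil:
        # Both files get file-store IDs that the job reads back locally.
        bam_id = toil.importFile('file:///data/sample.bam')
        dbsnp_id = toil.importFile('file:///data/dbsnp.vcf')
        toil.start(Job.wrapJobFn(run_adam_bqsr, bam_id, dbsnp_id, memory='8G'))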
Example 4
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, call, upload.
    """
    master_ip = MasterAddress(master_ip)

    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = os.path.splitext(bam_name)[0]

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT,
                                               hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name

        if not inputs.run_local:
            _log.info("Downloading input BAM %s to %s.", bam_name, hdfs_bam)
            call_conductor(job,
                           master_ip,
                           inputs.sample,
                           hdfs_bam,
                           container='fnothaft/conductor',
                           memory=inputs.memory)
        else:
            copy_files([inputs.sample], inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        _log.info("Converting input BAM to ADAM.")
        call_adam(job,
                  master_ip, ["transform", hdfs_bam, adam_input],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        avocado_output = hdfs_prefix + ".gt.adam"
        _log.info("Calling variants with avocado.")
        call_avocado(
            job,
            master_ip,
            ["biallelicGenotyper", "-is_not_grc", adam_input, avocado_output],
            memory=inputs.memory,
            container='fnothaft/avocado')

        output_vcf = hdfs_prefix + ".vcf"
        _log.info("Converting output ADAM Genotypes to VCF.")
        call_adam(job,
                  master_ip, [
                      "adam2vcf", avocado_output, output_vcf, "-single",
                      "-sort_on_save", "-stringency", "LENIENT"
                  ],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        # the pipeline emits a VCF, so name the output accordingly
        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".vcf"

        if not inputs.run_local:
            _log.info("Uploading output VCF %s to %s.", output_vcf, out_file)
            call_conductor(job,
                           master_ip,
                           output_vcf,
                           out_file,
                           memory=inputs.memory,
                           container='fnothaft/conductor')
            remove_file(master_ip, output_vcf, spark_on_toil)
        else:
            # In local mode the VCF was written directly into local_dir;
            # move it into the output directory.
            move_files([output_vcf], inputs.output_dir)

        remove_file(master_ip, hdfs_subdir, spark_on_toil)
    except:
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
        raise
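
A hedged launch sketch for the monolithic job. The master address, sample URL, output directory, and resource values are placeholders; the module name is hypothetical, and spark_on_toil is stubbed with None.

from argparse import Namespace
from toil.common import Toil
from toil.job import Job

from pipeline import download_run_and_upload  # hypothetical module name

if __name__ == '__main__':
    # All values below are illustrative.
    inputs = Namespace(sample='s3://my-bucket/sample.bam', memory='40G',
                       run_local=False, local_dir=None, suffix='',
                       output_dir='s3://my-bucket/out')
    options = Job.Runner.getDefaultOptions('./avocado-jobstore')
    with Toil(options) as toil:
        toil.start(Job.wrapJobFn(download_run_and_upload,
                                 '10.0.0.1', inputs, None))  # None: spark_on_toil stub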
Example 5
def adam_transform(job, master_ip, inputs, in_file, snp_file, hdfs_dir,
                   out_file, spark_on_toil):
    """
    Preprocess in_file with known SNPs snp_file:
        - mark duplicates
        - realign indels
        - recalibrate base quality scores
    """

    add_docker_parameters = []
    if inputs.run_local:
        add_docker_parameters.extend(
            ['-v', '{}:/data'.format(inputs.local_dir)])

    _log.info("Marking duplicate reads.")
    call_adam(job,
              master_ip, [
                  "transform", in_file, hdfs_dir + "/mkdups.adam",
                  "-aligned_read_predicate", "-limit_projection",
                  "-mark_duplicate_reads"
              ],
              memory=inputs.memory,
              run_local=inputs.run_local,
              native_adam_path=inputs.native_adam_path,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    #FIXME
    in_file_name = in_file.split("/")[-1]
    remove_file(master_ip, in_file_name + "*", spark_on_toil)

    _log.info("Realigning INDELs.")
    call_adam(job,
              master_ip, [
                  "transform", hdfs_dir + "/mkdups.adam",
                  hdfs_dir + "/ri.adam", "-realign_indels"
              ],
              memory=inputs.memory,
              run_local=inputs.run_local,
              native_adam_path=inputs.native_adam_path,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    remove_file(master_ip, hdfs_dir + "/mkdups.adam*", spark_on_toil)

    _log.info("Recalibrating base quality scores.")
    call_adam(job,
              master_ip, [
                  "transform", hdfs_dir + "/ri.adam", hdfs_dir + "/bqsr.adam",
                  "-recalibrate_base_qualities", "-known_snps", snp_file
              ],
              memory=inputs.memory,
              run_local=inputs.run_local,
              native_adam_path=inputs.native_adam_path,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    remove_file(master_ip, "ri.adam*", spark_on_toil)

    _log.info("Sorting reads and saving a single BAM file.")
    call_adam(job,
              master_ip, [
                  "transform", hdfs_dir + "/bqsr.adam", out_file,
                  "-sort_reads", "-single"
              ],
              memory=inputs.memory,
              run_local=inputs.run_local,
              native_adam_path=inputs.native_adam_path,
              container='fnothaft/adam',
              add_docker_parameters=add_docker_parameters)

    remove_file(master_ip, "bqsr.adam*", spark_on_toil)

    return out_file
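
Finally, a sketch, assumed rather than taken from the source, of chaining Example 2's conversion ahead of this transform with Toil's follow-on mechanism, so the .adam inputs exist before preprocessing starts. All paths and the module name are illustrative.

from toil.job import Job

from pipeline import adam_convert, adam_transform  # hypothetical module name

def make_preprocessing_root(master_ip, inputs, spark_on_toil, hdfs_dir):
    # adam_convert writes the .adam files that adam_transform then consumes.
    convert = Job.wrapJobFn(adam_convert, master_ip, inputs,
                            hdfs_dir + '/reads.bam', hdfs_dir + '/snps.vcf',
                            hdfs_dir + '/reads.adam', hdfs_dir + '/snps.adam',
                            spark_on_toil)
    convert.addFollowOnJobFn(adam_transform, master_ip, inputs,
                             hdfs_dir + '/reads.adam', hdfs_dir + '/snps.adam',
                             hdfs_dir, hdfs_dir + '/reads.processed.bam',
                             spark_on_toil)
    return convert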