Example #1
from typing import List

import hailtop.batch as hb


def merge_vcf(b: hb.batch.Batch,
              gvcf_list: List = None,
              output_vcf_name: str = None,
              merge_vcfs_img: str = None,
              memory: int = 3,
              out_dir: str = None,
              storage: int = None):
    # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
    docker_image = merge_vcfs_img if merge_vcfs_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'

    # disk_size = round(sum(bytes_to_gb(f) for f in gvcf_list) * 2.5) + 10

    merge_vcf_i = ''

    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} '

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(
        f'java -Xms2000m -jar /usr/gitc/picard.jar MergeVcfs '
        f'{merge_vcf_i} O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
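
# --- Usage sketch (not part of the original example) ---
# A minimal driver showing how merge_vcf might be called. The billing
# project, bucket, and gs:// paths below are placeholders/assumptions,
# not values from the source.
if __name__ == '__main__':
    backend = hb.ServiceBackend(billing_project='my-billing-project',
                                bucket='my-hail-bucket')
    batch = hb.Batch(backend=backend, name='merge-gvcfs')
    merge_vcf(batch,
              gvcf_list=['gs://my-bucket/gvcfs/sample.chr1.g.vcf.gz',
                         'gs://my-bucket/gvcfs/sample.chr2.g.vcf.gz'],
              output_vcf_name='sample',
              out_dir='gs://my-bucket/outputs',
              storage=10)
    batch.run()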
Example #2
from typing import List

import hailtop.batch as hb


def merge_vcf(b: hb.batch.Batch,
              gvcf_list: List = None,
              output_vcf_name: str = None,
              merge_vcfs_img: str = None,
              memory: int = 3,
              out_dir: str = None,
              storage: int = None):
    """
    Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
    :param b: batch
    :param gvcf_list: list of GVCF files to merge
    :param output_vcf_name: output GVCF name
    :param merge_vcfs_img: image to use for the job
    :param memory: job memory in GiB
    :param out_dir: output directory
    :param storage: storage to attach to the job, in GiB
    :return: the merge job
    """

    docker_image = merge_vcfs_img if merge_vcfs_img else\
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    outname = output_vcf_name + '.g.vcf.gz'

    merge_vcf_i = ''

    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} '

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(
        f'java -Xms2000m -jar /usr/gitc/picard.jar MergeVcfs '
        f'{merge_vcf_i} O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
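
# --- Helper sketch (not part of the original examples) ---
# Example #1 (in a commented-out line) and Example #3 call bytes_to_gb()
# without defining it. A minimal sketch is given below, assuming inputs are
# gs:// paths and that google-cloud-storage is installed; the original
# helper may be implemented differently.
from google.cloud import storage


def bytes_to_gb(path: str) -> float:
    """Return the size of a gs:// object in GiB."""
    bucket_name, blob_name = path.replace('gs://', '', 1).split('/', 1)
    blob = storage.Client().bucket(bucket_name).get_blob(blob_name)
    return blob.size / (1024 ** 3)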
Example #3
import hailtop.batch as hb

# Assumes a bytes_to_gb(path) -> float helper is available in this module
# (see the helper sketch preceding this example).


def cram_to_bam(b: hb.batch.Batch,
                input_cram_file: str = None,
                ref_fasta: str = None,
                ref_dict: str = None,
                ref_ind: str = None,
                bam_out_name: str = None,
                memory: int = 15,
                samtools_image: str = None,
                out_dir: str = None):
    """
    Convert a CRAM file back to BAM using samtools
    :param b: batch
    :param input_cram_file: input CRAM file
    :param ref_fasta: reference FASTA
    :param ref_dict: reference sequence dictionary
    :param ref_ind: reference FASTA index (.fai)
    :param bam_out_name: output BAM name
    :param memory: job memory in GiB
    :param samtools_image: image to use for the job
    :param out_dir: output directory
    :return: the CRAM-to-BAM job
    """
    docker_image = samtools_image if samtools_image else 'gcr.io/genomics-tools/samtools'

    out_bam_name = bam_out_name + '.bam'

    # Estimate the BAM at ~2.5x the CRAM size (CRAM assumed to be ~40% of
    # the BAM) and pad the disk request by 25 GiB.
    output_bam_size: float = bytes_to_gb(input_cram_file) / 0.40
    ref_size: float = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_ind)
    disk_size: int = round(
        bytes_to_gb(input_cram_file) + output_bam_size + ref_size) + 25

    job_memory = str(memory) + 'Gi'
    job_storage = str(disk_size) + 'Gi'

    crams_to_bams = b.new_job(name=out_bam_name)
    in_cram = b.read_input(input_cram_file)
    fasta = b.read_input_group(**{
        'fasta': ref_fasta,
        'fasta.fai': ref_ind,
        'dict': ref_dict
    })

    crams_to_bams.memory(job_memory)
    crams_to_bams.image(docker_image)
    crams_to_bams.storage(job_storage)
    crams_to_bams.command(
        f'samtools view -b -T {fasta.fasta} -o {out_bam_name} {in_cram}')
    crams_to_bams.command(f'samtools index {out_bam_name}')
    crams_to_bams.command(f'mv {out_bam_name} {crams_to_bams.bamout}')
    crams_to_bams.command(f'mv {out_bam_name}.bai {crams_to_bams.bamind}')
    b.write_output(crams_to_bams.bamout, f'{out_dir}/BAMS/{out_bam_name}')
    b.write_output(crams_to_bams.bamind, f'{out_dir}/BAMS/{out_bam_name}.bai')

    return crams_to_bams
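
# --- Usage sketch (not part of the original example) ---
# A minimal driver showing how cram_to_bam might be called. The backend
# settings and gs:// paths below are placeholders/assumptions, not values
# from the source.
if __name__ == '__main__':
    backend = hb.ServiceBackend(billing_project='my-billing-project',
                                bucket='my-hail-bucket')
    batch = hb.Batch(backend=backend, name='cram-to-bam')
    cram_to_bam(batch,
                input_cram_file='gs://my-bucket/crams/sample.cram',
                ref_fasta='gs://my-bucket/ref/Homo_sapiens_assembly38.fasta',
                ref_dict='gs://my-bucket/ref/Homo_sapiens_assembly38.dict',
                ref_ind='gs://my-bucket/ref/Homo_sapiens_assembly38.fasta.fai',
                bam_out_name='sample',
                out_dir='gs://my-bucket/outputs')
    batch.run()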