import hailtop.batch as hb
from typing import List, Union

# NOTE: `bytes_to_gb` (returns a file's size in GB) is assumed to be defined elsewhere
# in this codebase; it is used below for sizing job storage.


def index_gvcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, output_vcf_ind_name: str = None,
               memory: int = 3, storage: int = 5, docker_img: str = None, out_dir: str = None):
    """
    Index a GVCF file

    :param b: batch
    :param input_vcf: GVCF file to index
    :param output_vcf_ind_name: output GVCF index name
    :param memory: job memory
    :param storage: storage to use for the job
    :param docker_img: image to use for the job
    :param out_dir: output directory
    :return:
    """
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    outname = output_vcf_ind_name + '.g.vcf.gz.tbi'

    index_gvcf_file = b.new_job(name=f'index-{output_vcf_ind_name}')
    index_gvcf_file.image(docker_image)
    index_gvcf_file.memory(f'{memory}Gi')
    index_gvcf_file.storage(f'{storage}Gi')
    index_gvcf_file.command(f'gatk IndexFeatureFile \
        -I {input_vcf} \
        -O {outname}')
    index_gvcf_file.command(f'mv {outname} {index_gvcf_file.ofile}')
    b.write_output(index_gvcf_file.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_ind_name}/{outname}')

    return index_gvcf_file
def merge_vcf(b: hb.batch.Batch, gvcf_list: List = None, output_vcf_name: str = None, merge_vcfs_img: str = None,
              memory: int = 3, out_dir: str = None, storage: int = None):
    # Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
    docker_image = merge_vcfs_img if merge_vcfs_img else \
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'
    # disk_size = bytes_to_gb((inputs_vcfs_list * 2.5)) + 10

    merge_vcf_i = ''

    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} \t'

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        MergeVcfs \
        {merge_vcf_i} \
        O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile, f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
def validate_vcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, ref_fasta: hb.resource.ResourceGroup,
                 dbsnp_vcf_file: hb.resource.ResourceGroup, calling_int_file: hb.resource.ResourceFile,
                 validate_vcf_img: str = None, memory: int = 7, storage: int = None,
                 output_vcf_ind_name: str = None):
    # Validate the (g)VCF output of HaplotypeCaller
    docker_image = validate_vcf_img if validate_vcf_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    # ref_size = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_fasta_index) + bytes_to_gb(ref_dict)
    # disk_size = bytes_to_gb(input_vcf) + bytes_to_gb(dbsnp_vcf) + ref_size + 20

    validate_gvcf = b.new_job(name=output_vcf_ind_name)
    validate_gvcf.image(docker_image)
    validate_gvcf.memory(f'{memory}Gi')
    validate_gvcf.storage(f'{storage}Gi')
    validate_gvcf.command(f'gatk IndexFeatureFile \
        -I {input_vcf}')
    validate_gvcf.command(f'gatk --java-options -Xms6000m \
        ValidateVariants \
        -V {input_vcf} \
        -R {ref_fasta.fasta} \
        -L {calling_int_file} \
        -gvcf \
        --validation-type-to-exclude ALLELES \
        --dbsnp {dbsnp_vcf_file.vcf}')

    return validate_gvcf
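# A minimal usage sketch (not part of the pipeline itself): wire merge_vcf, index_gvcf and
# validate_vcf together in one Batch. Every gs:// path, the billing project and the sample
# name are hypothetical placeholders; the reference and dbSNP resource-group keys are chosen
# to match what the helpers above expect (.fasta/.dict and .vcf respectively).
def example_post_process_gvcfs():
    backend = hb.ServiceBackend(billing_project='my-billing-project',
                                remote_tmpdir='gs://my-tmp-bucket/batch')
    b = hb.Batch(backend=backend, name='post-process-gvcfs')
    out_dir = 'gs://my-bucket/out'

    ref = b.read_input_group(**{'fasta': 'gs://my-bucket/ref.fasta',
                                'fasta.fai': 'gs://my-bucket/ref.fasta.fai',
                                'dict': 'gs://my-bucket/ref.dict'})
    dbsnp = b.read_input_group(vcf='gs://my-bucket/dbsnp.vcf.gz',
                               ind='gs://my-bucket/dbsnp.vcf.gz.tbi')
    calling_intervals = b.read_input('gs://my-bucket/wgs_calling_regions.interval_list')

    merged = merge_vcf(b, gvcf_list=['gs://my-bucket/NA12878.scatter1.g.vcf.gz',
                                     'gs://my-bucket/NA12878.scatter2.g.vcf.gz'],
                       output_vcf_name='NA12878', storage=20, out_dir=out_dir)
    index_gvcf(b, input_vcf=merged.ofile, output_vcf_ind_name='NA12878', out_dir=out_dir)
    validate_vcf(b, input_vcf=merged.ofile, ref_fasta=ref, dbsnp_vcf_file=dbsnp,
                 calling_int_file=calling_intervals, storage=30, output_vcf_ind_name='NA12878')
    b.run()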
def scatter_interval_list(b: hb.batch.Batch, interval_list_file: hb.resource.ResourceFile, scatter_count: int = 50,
                          break_bands_at_multiples_of: int = 1000000, scatter_img: str = None, memory: int = 2,
                          out_dir: str = None):
    """
    Break the calling interval list into sub-intervals

    :param b: batch
    :param interval_list_file: one or more interval lists
    :param scatter_count: the number of files into which to scatter the resulting list by locus
    :param break_bands_at_multiples_of: if set to a positive value, will create a new interval list with the
        original intervals broken up at integer multiples of this value. Set to 0 to NOT break up intervals
    :param scatter_img: image to use for the job
    :param memory: job memory
    :param out_dir: output directory
    :return:
    """
    # break the calling interval list into sub-intervals
    docker_image = scatter_img if scatter_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    scatter_list = b.new_job(name='scatter-interval-list')
    scatter_list.image(docker_image)
    scatter_list.cpu(1)
    scatter_list.memory(f'{memory}Gi')
    scatter_list.command('mkdir /scatter_intervals')
    scatter_list.command(f'java -Xms1g -jar /usr/gitc/picard.jar \
        IntervalListTools \
        SCATTER_COUNT={scatter_count} \
        SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
        UNIQUE=true \
        SORT=true \
        BREAK_BANDS_AT_MULTIPLES_OF={break_bands_at_multiples_of} \
        INPUT={interval_list_file} \
        OUTPUT=/scatter_intervals')
    scatter_list.command('''
cat > my_script.py <<EOF
import sys
import os
import glob

intervals = sorted(glob.glob('/scatter_intervals/*/*.interval_list'))
for i, interval in enumerate(intervals):
    (directory, filename) = os.path.split(interval)
    newName = os.path.join(directory, str(i + 1) + filename)
    os.rename(interval, newName)
EOF
python3 my_script.py
''')
    scatter_list.command(f'mv /scatter_intervals {scatter_list.outfiles}')
    b.write_output(scatter_list.outfiles, f'{out_dir}/scatter-intervals')

    # We return the `scatter_list` Job object that can be used in downstream jobs.
    return scatter_list
def collect_variant_calling_metrics(b: hb.batch.Batch, input_vcf: hb.resource.ResourceGroup,
                                    dbsnp_vcf_file: hb.resource.ResourceGroup, ref_dict: hb.resource.ResourceGroup,
                                    evaluation_int_list: hb.resource.ResourceFile, metrics_basename: str = None,
                                    memory: int = 6, docker_img: str = None, storage: int = None,
                                    out_dir: str = None):
    """
    Collect variant-calling metrics for a GVCF

    :param b: batch
    :param input_vcf: GVCF file to collect variant calling metrics for
    :param dbsnp_vcf_file: DBSNP VCF and its index to use in collecting metrics
    :param ref_dict: reference dictionary file from a reference ResourceGroup (fasta, index, and dict)
    :param evaluation_int_list: evaluation interval list file
    :param metrics_basename: name to be used for output
    :param memory: job memory
    :param docker_img: image to use for the job
    :param storage: storage to use for the job
    :param out_dir: output directory
    :return:
    """
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    collect_vc_metrics = b.new_job(name=metrics_basename)
    collect_vc_metrics.image(docker_image)
    collect_vc_metrics.memory(f'{memory}Gi')
    collect_vc_metrics.storage(f'{storage}Gi')
    collect_vc_metrics.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        CollectVariantCallingMetrics \
        INPUT={input_vcf.vcf} \
        OUTPUT={metrics_basename} \
        DBSNP={dbsnp_vcf_file.vcf} \
        SEQUENCE_DICTIONARY={ref_dict.dict} \
        TARGET_INTERVALS={evaluation_int_list} \
        GVCF_INPUT=true')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_detail_metrics {collect_vc_metrics.detail}')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_summary_metrics {collect_vc_metrics.summary}')
    b.write_output(
        collect_vc_metrics.detail,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_detail_metrics')
    b.write_output(
        collect_vc_metrics.summary,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_summary_metrics')

    return collect_vc_metrics
def haplotype_caller_gatk(b: hb.batch.Batch, input_bam: hb.resource.ResourceGroup,
                          ref_fasta: hb.resource.ResourceGroup, interval_list_file: hb.resource.ResourceFile,
                          bam_filename_no_ext: str = None, out_dir: str = None, interval_list_name: str = None,
                          storage: int = None, contamination: float = None, gatk_img: str = None,
                          memory: float = 6.5, ncpu: int = 2):
    """
    Call germline SNPs and indels

    :param b: batch
    :param input_bam: BAM file
    :param ref_fasta: reference files, including fasta and index
    :param interval_list_file: interval list file with intervals to run variant calling on
    :param bam_filename_no_ext: BAM filename without extension
    :param interval_list_name: interval list name, used to name output GVCF
    :param storage: storage to use for the job
    :param contamination: fraction of contamination in sequencing data to aggressively remove
    :param gatk_img: image to use for the job
    :param out_dir: output directory
    :param memory: job memory
    :param ncpu: number of CPUs
    :return:
    """
    docker_image = gatk_img if gatk_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    output_file_name = bam_filename_no_ext + '_' + interval_list_name + '.g.vcf.gz'

    variant_calling = b.new_job(name=bam_filename_no_ext)
    variant_calling.image(docker_image)
    variant_calling.cpu(ncpu)
    variant_calling.memory(f'{memory}Gi')
    variant_calling.storage(f'{storage}Gi')
    variant_calling.command(f'gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
        HaplotypeCaller \
        -R {ref_fasta.fasta} \
        -I {input_bam.bam} \
        -L {interval_list_file} \
        -O {variant_calling.ofile} \
        -contamination {contamination} \
        -G StandardAnnotation -G StandardHCAnnotation -G AS_StandardAnnotation \
        -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \
        -ERC GVCF')
    # variant_calling.command(f'mv {output_file_name} {variant_calling.ofile}')
    b.write_output(variant_calling.ofile,
                   f'{out_dir}/variant-calling/{bam_filename_no_ext}/{output_file_name}')

    return variant_calling
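# Usage sketch (hypothetical paths): fan HaplotypeCaller out over the sub-interval lists that
# scatter_interval_list wrote to {out_dir}/scatter-intervals in a previous run. Listing those
# files with hl.hadoop_ls is an assumption about how the caller discovers them.
def example_scatter_haplotypecaller():
    import hail as hl

    out_dir = 'gs://my-bucket/out'
    b = hb.Batch(name='scatter-haplotypecaller')

    bam = b.read_input_group(bam='gs://my-bucket/NA12878.bam',
                             bai='gs://my-bucket/NA12878.bam.bai')
    ref = b.read_input_group(**{'fasta': 'gs://my-bucket/ref.fasta',
                                'fasta.fai': 'gs://my-bucket/ref.fasta.fai',
                                'dict': 'gs://my-bucket/ref.dict'})

    # scatter-intervals contains one sub-directory per scatter shard, each holding one interval list
    for sub_dir in hl.hadoop_ls(f'{out_dir}/scatter-intervals'):
        for entry in hl.hadoop_ls(sub_dir['path']):
            if not entry['path'].endswith('.interval_list'):
                continue
            interval_file = b.read_input(entry['path'])
            interval_name = entry['path'].split('/')[-1].replace('.interval_list', '')
            haplotype_caller_gatk(b, input_bam=bam, ref_fasta=ref, interval_list_file=interval_file,
                                  bam_filename_no_ext='NA12878', interval_list_name=interval_name,
                                  storage=50, contamination=0.0, out_dir=out_dir)
    b.run()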
def imputation(b: hb.batch.Batch, vcf: str = None, vcf_filename_no_ext: str = None, ref: hb.ResourceGroup = None,
               ref_size: Union[int, float] = None, region: str = None, chromosome: str = None, cpu: int = 8,
               memory: str = 'highmem', img: str = 'docker.io/lindonkambule/gwaspy:v1', threads: int = 7,
               out_dir: str = None):
    # in_vcf = b.read_input(vcf)
    in_vcf = b.read_input_group(**{'bcf': vcf, 'bcf.csi': f'{vcf}.csi'})
    vcf_size = bytes_to_gb(vcf)

    output_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    disk_size = ref_size + (vcf_size * 4)

    map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz'

    impute = b.new_job(name=output_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{disk_size}Gi')
    impute.image(img)

    cmd = f'''
        impute5_1.1.5_static \
            --h {ref.bcf} \
            --m {map_file} \
            --g {in_vcf.bcf} \
            --r {region} \
            --out-gp-field \
            --o {output_file_name} \
            --threads {threads}
    '''

    impute.command(cmd)
    # index file to use when merging
    impute.command(f'bcftools index {output_file_name}')

    impute.command(f'mv {output_file_name} {impute.ofile}')
    impute.command(f'mv {output_file_name}.csi {impute.ind}')
    b.write_output(impute.ofile,
                   f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}')
    b.write_output(impute.ind,
                   f'{out_dir}/GWASpy/Imputation/{file_dir}/imputed_chunks/{output_file_name}.csi')
def concat_vcfs(b: hb.batch.Batch, vcf_basename: str = None, vcfs_to_merge: List = None, output_type: str = 'vcf',
                chrom: str = None, cpu: int = 16, memory: str = 'standard',
                docker_img: str = 'docker.io/lindonkambule/gwaspy:v1', out_dir: str = None):
    out_type = 'b' if output_type == 'bcf' else 'z'
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.merged.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.merged.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.merged.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 2 + bytes_to_gb(line)

    disk_size = int(round(10 + (2 * vcfs_sizes_sum)))
    threads = cpu - 1

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(memory)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line, ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index --force {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_filename}')
    b.write_output(concat.idx, f'{out_dir}/GWASpy/{vcf_basename}/Imputation/imputed_merged/{out_index_name}')
def aut_impute(b: hb.batch.Batch, vcf: hb.ResourceGroup = None, vcf_filename_no_ext: str = None,
               ref: hb.ResourceGroup = None, region: str = None, chromosome: str = None, buffer: int = 250,
               storage: int = None, memory: str = None, cpu: int = None,
               img: str = 'docker.io/lindonkambule/gwaspy:v1', out_dir: str = None):
    out_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    map_file = f'/shapeit4/maps/b38/{chromosome}.b38.gmap.gz'

    threads = cpu - 1

    impute = b.new_job(name=out_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{storage}Gi')
    impute.image(img)

    cmd = f'''
        impute5_1.1.5_static \
            --h {ref.bcf} \
            --m {map_file} \
            --g {vcf.bcf} \
            --r {region} \
            --out-gp-field \
            --o {out_file_name} \
            --b {buffer} \
            --threads {threads}
    '''

    impute.command(cmd)
    # index file to use when merging
    impute.command(f'bcftools index {out_file_name}')

    impute.command(f'mv {out_file_name} {impute.ofile}')
    impute.command(f'mv {out_file_name}.csi {impute.idx}')
    b.write_output(impute.ofile,
                   f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
    b.write_output(impute.idx,
                   f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')
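# Usage sketch (hypothetical paths, chunk size and resources): run aut_impute over fixed-size
# chunks of one autosome. A follow-up batch can stitch the written chunk files back together
# with the imputation variant of concat_vcfs above, once they exist in out_dir.
def example_impute_chr20_in_chunks():
    b = hb.Batch(name='impute-chr20')
    out_dir = 'gs://my-bucket/out'

    target = b.read_input_group(**{'bcf': 'gs://my-bucket/study.chr20.phased.bcf',
                                   'bcf.csi': 'gs://my-bucket/study.chr20.phased.bcf.csi'})
    ref_panel = b.read_input_group(**{'bcf': 'gs://my-ref/reference.chr20.bcf',
                                      'bcf.csi': 'gs://my-ref/reference.chr20.bcf.csi'})

    chunk_size = 20_000_000
    chrom_len = 64_444_167  # GRCh38 chr20 length
    for start in range(1, chrom_len, chunk_size):
        end = min(start + chunk_size - 1, chrom_len)
        aut_impute(b, vcf=target, vcf_filename_no_ext=f'study.chr20.{start}.{end}',
                   ref=ref_panel, region=f'chr20:{start}-{end}', chromosome='chr20',
                   storage=30, memory='highmem', cpu=8, out_dir=out_dir)
    b.run()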
def clump(batch: hb.batch.Batch, image: str, bfile: hb.resource.ResourceGroup, assoc: hb.resource.ResourceFile,
          chr: int) -> hb.job.Job:
    """
    Clump association results with PLINK.
    https://zzz.bwh.harvard.edu/plink/clump.shtml
    """
    # Define a new job `c` in Batch `batch` with name `clump-CHR`
    c = batch.new_job(name=f'clump-{chr}')

    # TODO: Tell the new `c` job the name of the image you pushed to GCR that contains your Hail script.
    # This image name is defined by the `image` variable passed as an argument to the function.
    c.image(image)

    # Tell Batch to use 1Gi of memory for this job
    c.memory('1Gi')
    # Tell Batch to use 1 cpu for this job
    c.cpu(1)

    # Notice that we can simply call plink here because we put it on the PATH in the Dockerfile.
    # TODO: Fill in the argument <BFILE> which uses the `bfile` argument passed in to the function above.
    #   `bfile` is a resource group and is expected to have three files at a common root name:
    #   {root}.bed, {root}.bim, {root}.fam
    # TODO: Fill in the argument <ASSOC> which uses the `assoc` argument passed in to the function above.
    #   This file has the p-values of the GWAS.
    # TODO: Fill in the argument <CHR> which uses the `chr` argument passed in to the function above.
    #   This tells PLINK to only compute the clumping results for this chromosome; this is how we
    #   achieve parallelism by chromosome.
    c.command(f'''
plink --bfile {bfile} \
      --clump {assoc} \
      --chr {chr} \
      --clump-p1 0.0001 \
      --clump-p2 0.001 \
      --clump-r2 0.5 \
      --clump-kb 1000 \
      --memory 1024 \
      --threads 1

mv plink.clumped {c.clumped} || \
  echo " CHR F SNP BP P TOTAL NSIG S05 S01 S001 S0001 SP2" > {c.clumped}
''')
    # PLINK outputs the results at a hardcoded path, so we move it to a path Batch will know to copy.
    # PLINK doesn't output a file if there are no results, so we make an empty one in that case.

    # We return the `c` Job object that can be used in downstream jobs.
    return c
def scatter_interval_list(b: hb.batch.Batch, interval_list_file: hb.resource.ResourceFile, scatter_count: int = 50,
                          break_bands_at_multiples_of: int = 1000000, scatter_img: str = None, memory: int = 2,
                          out_dir: str = None):
    # break the calling interval list into sub-intervals
    docker_image = scatter_img if scatter_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    scatter_list = b.new_job(name='scatter-interval-list')
    scatter_list.image(docker_image)
    scatter_list.cpu(1)  # this should be lower, check DSP pipeline
    scatter_list.memory(f'{memory}Gi')
    scatter_list.command('mkdir /scatter_intervals')
    scatter_list.command(f'java -Xms1g -jar /usr/gitc/picard.jar \
        IntervalListTools \
        SCATTER_COUNT={scatter_count} \
        SUBDIVISION_MODE=BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \
        UNIQUE=true \
        SORT=true \
        BREAK_BANDS_AT_MULTIPLES_OF={break_bands_at_multiples_of} \
        INPUT={interval_list_file} \
        OUTPUT=/scatter_intervals')
    scatter_list.command('''
cat > my_script.py <<EOF
import sys
import os
import glob

intervals = sorted(glob.glob('/scatter_intervals/*/*.interval_list'))
for i, interval in enumerate(intervals):
    (directory, filename) = os.path.split(interval)
    newName = os.path.join(directory, str(i + 1) + filename)
    os.rename(interval, newName)
EOF
python3 my_script.py
''')
    scatter_list.command(f'mv /scatter_intervals {scatter_list.outfiles}')
    b.write_output(scatter_list.outfiles, f'{out_dir}/scatter-intervals')

    # We return the `scatter_list` Job object that can be used in downstream jobs.
    return scatter_list
def run_gwas(batch: hb.batch.Batch, image: str, vcf: hb.resource.ResourceFile,
             phenotypes: hb.resource.ResourceFile) -> hb.job.Job:
    """
    QC data
    Get association test statistics
    Also, export PLINK file
    """
    # TODO: Take the input batch and create a new job object called `gwas`. You can give it the name 'run-gwas',
    # which will be useful when looking at the Batch UI to see what job corresponds to your code.
    gwas = batch.new_job(name='run-gwas')

    # TODO: Tell the new `gwas` job the name of the image you pushed to GCR that contains your Hail script.
    # This image name is defined by the `image` variable passed as an argument to the function.
    gwas.image(image)

    # This is how we tell Batch that we want this job to have 4 cores. This number must match the argument
    # to `gwas_hail.py`, which tells Hail to run in local mode with 4 cores available.
    gwas.cpu(4)

    # This is how we tell Batch that we're defining a new ResourceGroup that is the output of the `gwas` Job.
    # PLINK will output four files here with a common root name. We designate this common file root with `{root}`
    # and hard-code the extensions pertaining to each file. Now we can reference the common file root as `gwas.ofile`.
    # To reference the bim file specifically, we can use `gwas.ofile.bim` or `gwas.ofile['bim']`.
    gwas.declare_resource_group(ofile={
        'bed': '{root}.bed',
        'bim': '{root}.bim',
        'fam': '{root}.fam',
        'assoc': '{root}.assoc'
    })

    # The command definition below uses f-strings. The contents in between curly braces ({, }) are evaluated
    # as Python expressions.
    # TODO: Fill in the <PATH> to the Python script `gwas_hail.py` with its location in the Docker image specified above.
    # TODO: Fill in the argument <VCF> which represents the VCF file we passed in to the function above.
    # TODO: Fill in the argument <OUTPUT_FILE> to the `--output-file` argument below. This should be the file root
    #   of the resource group declared above.
    gwas.command(f'''
python3 /gwas_hail.py \
    --vcf {vcf} \
    --phenotypes {phenotypes} \
    --output-file {gwas.ofile} \
    --cores 4
''')

    # We return the `gwas` Job object that can be used in downstream jobs.
    return gwas
def merge_vcf(b: hb.batch.Batch, gvcf_list: List = None, output_vcf_name: str = None, merge_vcfs_img: str = None,
              memory: int = 3, out_dir: str = None, storage: int = None):
    """
    Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs

    :param b: batch
    :param gvcf_list: list of GVCF files to merge
    :param output_vcf_name: output GVCF name
    :param merge_vcfs_img: image to use for the job
    :param storage: storage to use for the job
    :param out_dir: output directory
    :param memory: job memory
    :return:
    """
    docker_image = merge_vcfs_img if merge_vcfs_img else \
        'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'
    outname = output_vcf_name + '.g.vcf.gz'

    merge_vcf_i = ''

    for line in gvcf_list:
        input_gvcf = b.read_input(line)
        merge_vcf_i += f'I={input_gvcf} \t'

    merge_vcfs = b.new_job(name=output_vcf_name)
    merge_vcfs.image(docker_image)
    merge_vcfs.memory(f'{memory}Gi')
    merge_vcfs.storage(f'{storage}Gi')
    merge_vcfs.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        MergeVcfs \
        {merge_vcf_i} \
        O={outname}')
    merge_vcfs.command(f'mv {outname} {merge_vcfs.ofile}')
    b.write_output(merge_vcfs.ofile, f'{out_dir}/merged-gvcf/{output_vcf_name}/{outname}')

    return merge_vcfs
def collect_variant_calling_metrics(b: hb.batch.Batch, input_vcf: hb.resource.ResourceGroup,
                                    dbsnp_vcf_file: hb.resource.ResourceGroup, ref_dict: hb.resource.ResourceGroup,
                                    evaluation_int_list: hb.resource.ResourceFile, metrics_basename: str = None,
                                    memory: int = 6, docker_img: str = None, storage: int = None,
                                    out_dir: str = None):
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.3-1564508330'

    collect_vc_metrics = b.new_job(name=metrics_basename)
    collect_vc_metrics.image(docker_image)
    collect_vc_metrics.memory(f'{memory}Gi')
    collect_vc_metrics.storage(f'{storage}Gi')
    collect_vc_metrics.command(f'java -Xms2000m -jar /usr/gitc/picard.jar \
        CollectVariantCallingMetrics \
        INPUT={input_vcf.vcf} \
        OUTPUT={metrics_basename} \
        DBSNP={dbsnp_vcf_file.vcf} \
        SEQUENCE_DICTIONARY={ref_dict.dict} \
        TARGET_INTERVALS={evaluation_int_list} \
        GVCF_INPUT=true')
    # collect_vc_metrics.command(f'ls')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_detail_metrics {collect_vc_metrics.detail}')
    collect_vc_metrics.command(
        f'mv {metrics_basename}.variant_calling_summary_metrics {collect_vc_metrics.summary}')
    b.write_output(
        collect_vc_metrics.detail,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_detail_metrics')
    b.write_output(
        collect_vc_metrics.summary,
        f'{out_dir}/variant-calling-metrics/{metrics_basename}/{metrics_basename}.variant_calling_summary_metrics')

    return collect_vc_metrics
def cram_to_bam(b: hb.batch.Batch, input_cram_file: str = None, ref_fasta: str = None, ref_dict: str = None,
                ref_ind: str = None, bam_out_name: str = None, memory: int = 15, samtools_image: str = None,
                out_dir: str = None):
    docker_image = samtools_image if samtools_image else 'gcr.io/genomics-tools/samtools'

    out_bam_name = bam_out_name + '.bam'

    output_bam_size: float = bytes_to_gb(input_cram_file) / 0.40
    ref_size: float = bytes_to_gb(ref_fasta) + bytes_to_gb(ref_ind)
    disk_size: int = round(bytes_to_gb(input_cram_file) + output_bam_size + ref_size) + 25

    job_memory = str(memory) + 'Gi'
    job_storage = str(disk_size) + 'Gi'

    crams_to_bams = b.new_job(name=out_bam_name)

    in_cram = b.read_input(input_cram_file)
    fasta = b.read_input_group(**{'fasta': ref_fasta,
                                  'fasta.fai': ref_ind,
                                  'dict': ref_dict})

    crams_to_bams.memory(job_memory)
    crams_to_bams.image(docker_image)
    crams_to_bams.storage(job_storage)
    crams_to_bams.command(f'samtools view -b -T {fasta.fasta} -o {out_bam_name} {in_cram}')
    crams_to_bams.command(f'samtools index {out_bam_name}')
    crams_to_bams.command(f'mv {out_bam_name} {crams_to_bams.bamout}')
    crams_to_bams.command(f'mv {out_bam_name}.bai {crams_to_bams.bamind}')
    b.write_output(crams_to_bams.bamout, f'{out_dir}/BAMS/{out_bam_name}')
    b.write_output(crams_to_bams.bamind, f'{out_dir}/BAMS/{out_bam_name}.bai')

    return crams_to_bams
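# Usage sketch (hypothetical paths): convert a CRAM to an indexed BAM so it can later be fed to
# haplotype_caller_gatk. bytes_to_gb is assumed to work on gs:// paths for the disk-size estimate.
def example_cram_to_bam():
    b = hb.Batch(name='cram-to-bam')
    cram_to_bam(b,
                input_cram_file='gs://my-bucket/NA12878.cram',
                ref_fasta='gs://my-bucket/ref.fasta',
                ref_ind='gs://my-bucket/ref.fasta.fai',
                ref_dict='gs://my-bucket/ref.dict',
                bam_out_name='NA12878',
                out_dir='gs://my-bucket/out')
    b.run()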
def haplotype_caller_gatk(b: hb.batch.Batch, input_bam: hb.resource.ResourceGroup,
                          ref_fasta: hb.resource.ResourceGroup, interval_list_file: hb.resource.ResourceFile,
                          bam_filename_no_ext: str = None, out_dir: str = None, interval_list_name: str = None,
                          storage: int = None, contamination: float = None, gatk_img: str = None,
                          memory: float = 6.5, ncpu: int = 2):
    docker_image = gatk_img if gatk_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    output_file_name = bam_filename_no_ext + '_' + interval_list_name + '.g.vcf.gz'

    variant_calling = b.new_job(name=bam_filename_no_ext)
    variant_calling.image(docker_image)
    variant_calling.cpu(ncpu)
    variant_calling.memory(f'{memory}Gi')
    variant_calling.storage(f'{storage}Gi')
    variant_calling.command(f'gatk --java-options "-Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
        HaplotypeCaller \
        -R {ref_fasta.fasta} \
        -I {input_bam.bam} \
        -L {interval_list_file} \
        -O {variant_calling.ofile} \
        -contamination {contamination} \
        -G StandardAnnotation -G StandardHCAnnotation -G AS_StandardAnnotation \
        -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \
        -ERC GVCF')
    # variant_calling.command(f'mv {output_file_name} {variant_calling.ofile}')
    b.write_output(variant_calling.ofile,
                   f'{out_dir}/variant-calling/{bam_filename_no_ext}/{output_file_name}')

    # We return the `variant_calling` Job object that can be used in downstream jobs.
    return variant_calling
def index_gvcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, output_vcf_ind_name: str = None,
               memory: int = 3, storage: int = 5, docker_img: str = None, out_dir: str = None):
    docker_image = docker_img if docker_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'
    outname = output_vcf_ind_name + '.g.vcf.gz.tbi'

    index_gvcf_file = b.new_job(name=f'index-{output_vcf_ind_name}')
    index_gvcf_file.image(docker_image)
    index_gvcf_file.memory(f'{memory}Gi')
    index_gvcf_file.storage(f'{storage}Gi')
    index_gvcf_file.command(f'gatk IndexFeatureFile \
        -I {input_vcf} \
        -O {outname}')
    index_gvcf_file.command(f'mv {outname} {index_gvcf_file.ofile}')
    b.write_output(index_gvcf_file.ofile,
                   f'{out_dir}/merged-gvcf/{output_vcf_ind_name}/{outname}')

    return index_gvcf_file
def validate_vcf(b: hb.batch.Batch, input_vcf: hb.resource.ResourceFile, ref_fasta: hb.resource.ResourceGroup,
                 dbsnp_vcf_file: hb.resource.ResourceGroup, calling_int_file: hb.resource.ResourceFile,
                 validate_vcf_img: str = None, memory: int = 7, storage: int = None,
                 output_vcf_ind_name: str = None):
    """
    Validate the GVCF output of HaplotypeCaller

    :param b: batch
    :param input_vcf: GVCF file to validate
    :param ref_fasta: reference files, including fasta and index
    :param dbsnp_vcf_file: DBSNP VCF and its index to use in the validation
    :param calling_int_file: calling interval file
    :param validate_vcf_img: image to use for the job
    :param memory: job memory
    :param storage: storage to use for the job
    :param output_vcf_ind_name: name to be used for the job
    :return:
    """
    docker_image = validate_vcf_img if validate_vcf_img else 'us.gcr.io/broad-gatk/gatk:4.2.0.0'

    validate_gvcf = b.new_job(name=output_vcf_ind_name)
    validate_gvcf.image(docker_image)
    validate_gvcf.memory(f'{memory}Gi')
    validate_gvcf.storage(f'{storage}Gi')
    validate_gvcf.command(f'gatk IndexFeatureFile \
        -I {input_vcf}')
    validate_gvcf.command(f'gatk --java-options -Xms6000m \
        ValidateVariants \
        -V {input_vcf} \
        -R {ref_fasta.fasta} \
        -L {calling_int_file} \
        -gvcf \
        --validation-type-to-exclude ALLELES \
        --dbsnp {dbsnp_vcf_file.vcf}')

    return validate_gvcf
def merge(batch: hb.batch.Batch, results: List[hb.resource.ResourceFile]) -> hb.job.Job:
    """
    Merge clumped results files together
    """
    # Define a new job `merger` in Batch `batch` with name `merge-results`
    merger = batch.new_job(name='merge-results')

    # Use the ubuntu:18.04 image which Batch caches
    merger.image('ubuntu:18.04')

    # Do some file munging to concatenate all of the clumped results together for all chromosomes
    if results:
        merger.command(f'''
head -n 1 {results[0]} > {merger.ofile}
for result in {" ".join(str(r) for r in results)}
do
    tail -n +2 "$result" >> {merger.ofile}
done
sed -i '/^$/d' {merger.ofile}
''')

    # We return the `merger` Job object that can be used in downstream jobs.
    return merger
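# Usage sketch for the GWAS trio above (run_gwas, clump, merge): one Hail QC/association job,
# clumping fanned out per chromosome, and a final merge of the clumped results. The image name,
# bucket paths and phenotype file are hypothetical placeholders.
def example_gwas_clump_merge():
    backend = hb.ServiceBackend(billing_project='my-billing-project',
                                remote_tmpdir='gs://my-tmp-bucket/batch')
    batch = hb.Batch(backend=backend, name='gwas-clump-merge')

    vcf = batch.read_input('gs://my-bucket/genotypes.vcf.bgz')
    phenotypes = batch.read_input('gs://my-bucket/phenotypes.tsv')

    gwas = run_gwas(batch, image='gcr.io/my-project/1kg-gwas:latest', vcf=vcf, phenotypes=phenotypes)

    clumped_per_chrom = []
    for chrom in range(1, 23):
        c = clump(batch, image='gcr.io/my-project/1kg-gwas:latest', bfile=gwas.ofile,
                  assoc=gwas.ofile.assoc, chr=chrom)
        clumped_per_chrom.append(c.clumped)

    merger = merge(batch, clumped_per_chrom)
    batch.write_output(merger.ofile, 'gs://my-bucket/out/clumped_results.txt')
    batch.run()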
def sex_impute(b: hb.batch.Batch, vcf: hb.ResourceGroup = None, vcf_filename_no_ext: str = None,
               females_list: hb.ResourceFile = None, ref: hb.ResourceGroup = None, region: str = None,
               buffer: int = 250, storage: int = None, memory: str = None, cpu: int = None,
               img: str = 'docker.io/lindonkambule/gwaspy:v1', out_dir: str = None):
    in_females = females_list

    out_file_name = vcf_filename_no_ext + '.imputed.bcf'
    file_dir = vcf_filename_no_ext.split('.')[0]

    threads = cpu - 1

    impute = b.new_job(name=out_file_name)
    impute.cpu(cpu)
    impute.memory(memory)
    impute.storage(f'{storage}Gi')
    impute.image(img)

    start = int(region.split(':')[1].split('-')[0])
    end = int(region.split(':')[1].split('-')[1])

    # A. PAR1 REGION ONLY
    if end <= 2781479:
        # run diploid imputation
        impute.command('echo THIS CHUNK IS IN PAR1 REGION, SO WE WILL RUN DIPLOID IMPUTATION')
        map_file = '/shapeit4/maps/b38/chrX_par1.b38.gmap.gz'

        if start < 10001:
            start_new = 10001
            end_new = end
        else:
            start_new = start
            end_new = end

        new_imp_region = f'chrX:{start_new}-{end_new}'

        cmd = f'''
            impute5_1.1.5_static \
                --h {ref.bcf} \
                --m {map_file} \
                --g {vcf.bcf} \
                --r {new_imp_region} \
                --out-gp-field \
                --o {out_file_name} \
                --b {buffer} \
                --threads {threads}
        '''

        impute.command(cmd)
        # index file to use when merging
        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')

    # B. PAR2 REGION ONLY
    elif start >= 155701383:
        # run diploid imputation
        impute.command('echo THIS CHUNK IS IN PAR2 REGION, SO WE WILL RUN DIPLOID IMPUTATION')
        map_file = '/shapeit4/maps/b38/chrX_par2.b38.gmap.gz'

        if end > 156030895:
            end_new = 156030895
            start_new = start
        else:
            start_new = start
            end_new = end

        new_imp_region = f'chrX:{start_new}-{end_new}'

        cmd = f'''
            impute5_1.1.5_static \
                --h {ref.bcf} \
                --m {map_file} \
                --g {vcf.bcf} \
                --r {new_imp_region} \
                --out-gp-field \
                --o {out_file_name} \
                --b {buffer} \
                --threads {threads}
        '''

        impute.command(cmd)
        # index file to use when merging
        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')

    # C. NON-PAR REGION ONLY
    elif (start >= 2781479) & (end <= 155701382):
        map_file = '/shapeit4/maps/b38/chrX.b38.gmap.gz'

        start_new = start
        end_new = end
        new_imp_region = f'chrX:{start_new}-{end_new}'

        # (1) split by sex. NB: THE USER SHOULD SUPPLY A FILE CONTAINING EITHER ONLY FEMALE OR ONLY MALE
        # SAMPLE IDS, WITH ONE SAMPLE ID PER LINE
        cmd_split = f'''
            echo THIS CHUNK IS ONLY IN THE NON-PAR REGION, SO WE WILL SPLIT IT BY SEX BEFORE RUNNING IMPUTATION
            echo GETTING THE ORDER OF SAMPLES
            bcftools query -l {vcf.bcf} > samples_order.txt
            echo SPLITTING SAMPLES BY SEX
            bcftools view -S {in_females} {vcf.bcf} --output-type b --output females.bcf
            bcftools view -S ^{in_females} {vcf.bcf} --output-type b --output males.bcf
            echo INDEXING THE FILES BEFORE RUNNING IMPUTATION
            bcftools index females.bcf
            bcftools index males.bcf
        '''
        impute.command(cmd_split)

        # (2) run imputation separately for females and males
        cmd_impute = f'''
            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g females.bcf --r {new_imp_region} --out-gp-field \
                --o females.imputed.bcf --b {buffer} --threads {threads}

            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g males.bcf --r {new_imp_region} --out-gp-field \
                --o males.imputed.bcf --b {buffer} --threads {threads} -- haploid
        '''
        impute.command(cmd_impute)

        # (3) merge the imputed files back together into one chunk
        # (3a) index females and males files to be merged and create a file with these for bcftools
        cmd_index_chunks = f'''
            bcftools index females.imputed.bcf
            bcftools index males.imputed.bcf
            rm females.bcf* males.bcf*
        '''
        impute.command(cmd_index_chunks)

        # (3b) sometimes there are duplicates (raw and imputed with flipped alleles) and this causes an error
        # when merging, so we remove any duplicates
        cmd_sort = f'''
            echo CHECKING FOR DUPLICATE VARIANTS AND REMOVING THEM
            bcftools norm -d any females.imputed.bcf --output-type b --output females.imputed.sorted.bcf
            rm females.imputed.bcf*
            bcftools norm -d any males.imputed.bcf --output-type b --output males.imputed.sorted.bcf
            rm males.imputed.bcf*
            bcftools index females.imputed.sorted.bcf
            bcftools index males.imputed.sorted.bcf
            echo -e "females.imputed.sorted.bcf\nmales.imputed.sorted.bcf" > merge.txt
        '''
        impute.command(cmd_sort)

        # (3c) merge the files into one
        cmd_merge = f'''
            echo MERGING THE MALES AND FEMALES FILE BACK TOGETHER
            bcftools merge --file-list merge.txt --output-type b --output merged.sex.bcf
        '''
        impute.command(cmd_merge)

        # (3d) reorder samples back to how they were initially
        # splitting and re-merging samples changes their order compared to the initial file,
        # so we have to restore the initial sample order for the per-chromosome concat step of imputation
        cmd_order = f'''
            echo ORDERING SAMPLES TO HOW THEY WERE INITIALLY
            bcftools view -S samples_order.txt merged.sex.bcf --output-type b --output {out_file_name}
        '''
        impute.command(cmd_order)

        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')

    # D. MIXED REGIONS
    else:
        if (start <= 2781479) & (end >= 2781479):
            map_file_par = '/shapeit4/maps/b38/chrX_par1.b38.gmap.gz'
            par_region = f'chrX:{start}-{2781479}'
            non_par_region = f'chrX:{2781479}-{end}'
            # to be used in step (3d)
            cmd_merge_regs = f'''
                echo ORDERING PAR1 SAMPLES TO HOW THEY WERE INITIALLY
                bcftools view -S samples_order.txt par.imputed.bcf --output-type b --output par.imputed.sort.bcf
                bcftools index par.imputed.sort.bcf
                rm par.imputed.bcf*
                echo MERGING THE PAR1 AND NON-PAR FILES BACK TOGETHER
                echo -e "par.imputed.sort.bcf\nnonpar.imputed.sort.bcf" > merge_regions.txt
                bcftools concat --naive --file-list merge_regions.txt --output-type b --output {out_file_name}
            '''
        else:
            map_file_par = '/shapeit4/maps/b38/chrX_par2.b38.gmap.gz'
            if end > 156030895:
                par_region = f'chrX:{155701383}-{156030895}'
            else:
                par_region = f'chrX:{155701383}-{end}'
            non_par_region = f'chrX:{start}-{155701382}'
            # to be used in step (3d)
            cmd_merge_regs = f'''
                echo ORDERING PAR2 SAMPLES TO HOW THEY WERE INITIALLY
                bcftools view -S samples_order.txt par.imputed.bcf --output-type b --output par.imputed.sort.bcf
                bcftools index par.imputed.sort.bcf
                rm par.imputed.bcf*
                echo MERGING THE PAR2 AND NON-PAR FILES BACK TOGETHER
                echo -e "nonpar.imputed.sort.bcf\npar.imputed.sort.bcf" > merge_regions.txt
                bcftools concat --naive --file-list merge_regions.txt --output-type b --output {out_file_name}
            '''

        # (1) split out the PAR and non-PAR regions, then split the non-PAR region by sex
        cmd_split = f'''
            echo THIS CHUNK IS IN PAR1/2 and NON-PAR REGION, SO WE WILL SPLIT THESE TWO REGIONS
            echo GETTING THE ORDER OF SAMPLES
            bcftools query -l {vcf.bcf} > samples_order.txt
            echo SPLITTING OUT THE PAR REGION
            bcftools view {vcf.bcf} --regions {par_region} --output-type b --output par.bcf
            echo SPLITTING OUT THE NON-PAR REGION
            bcftools view {vcf.bcf} --regions {non_par_region} --output-type b --output nonpar.bcf
            echo SPLITTING THE NON-PAR REGION BY SEX
            bcftools view -S {in_females} nonpar.bcf --output-type b --output females.bcf
            bcftools view -S ^{in_females} nonpar.bcf --output-type b --output males.bcf
            echo INDEXING THE FILES BEFORE RUNNING IMPUTATION
            bcftools index par.bcf
            bcftools index females.bcf
            bcftools index males.bcf
            rm {vcf.bcf}
            rm nonpar.bcf
        '''
        impute.command(cmd_split)

        # (2) run imputation separately for par, females, and males
        map_file = '/shapeit4/maps/b38/chrX.b38.gmap.gz'
        cmd_impute = f'''
            impute5_1.1.5_static --h {ref.bcf} --m {map_file_par} --g par.bcf --r {par_region} --out-gp-field \
                --o par.imputed.bcf --b {buffer} --threads {threads}

            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g females.bcf --r {non_par_region} --out-gp-field \
                --o females.imputed.bcf --b {buffer} --threads {threads}

            impute5_1.1.5_static --h {ref.bcf} --m {map_file} --g males.bcf --r {non_par_region} --out-gp-field \
                --o males.imputed.bcf --b {buffer} --threads {threads} -- haploid
        '''
        impute.command(cmd_impute)

        # (3) merge the imputed files back together into one chunk
        # (3a) index par, females, and males files to be merged
        cmd_index_chunks = f'''
            bcftools index par.imputed.bcf
            bcftools index females.imputed.bcf
            bcftools index males.imputed.bcf
        '''
        impute.command(cmd_index_chunks)

        # (3b) sometimes there are duplicates (raw and imputed with flipped alleles) and this causes an error
        # when merging, so we remove any duplicates
        # bcftools view Nepal_PTSD_GSA_Updated_May2021_qced.chrX.phased.bcf | grep "#"
        cmd_sort = f'''
            echo CHECKING FOR DUPLICATE VARIANTS AND REMOVING THEM
            bcftools norm -d any females.imputed.bcf --output-type b --output females.imputed.sorted.bcf
            rm females.imputed.bcf*
            bcftools norm -d any males.imputed.bcf --output-type b --output males.imputed.sorted.bcf
            rm males.imputed.bcf*
            bcftools index females.imputed.sorted.bcf
            bcftools index males.imputed.sorted.bcf
            echo -e "females.imputed.sorted.bcf\nmales.imputed.sorted.bcf" > merge_sex.txt
        '''
        impute.command(cmd_sort)

        # (3c) merge the sex files into one
        cmd_merge_sex = f'''
            echo MERGING THE MALES AND FEMALES FILE BACK TOGETHER
            bcftools merge --file-list merge_sex.txt --output-type b --output nonpar.imputed.bcf
            bcftools index nonpar.imputed.bcf
            echo ORDERING NON-PAR SAMPLES TO HOW THEY WERE INITIALLY
            bcftools view -S samples_order.txt nonpar.imputed.bcf --output-type b --output nonpar.imputed.sort.bcf
            bcftools index nonpar.imputed.sort.bcf
            rm nonpar.imputed.bcf* females.imputed.sorted.bcf* males.imputed.sorted.bcf
        '''
        impute.command(cmd_merge_sex)

        # (3d) merge the non-par and par regions together
        impute.command(cmd_merge_regs)

        impute.command(f'bcftools index {out_file_name}')

        impute.command(f'mv {out_file_name} {impute.ofile}')
        impute.command(f'mv {out_file_name}.csi {impute.idx}')
        b.write_output(impute.ofile,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}')
        b.write_output(impute.idx,
                       f'{out_dir}/GWASpy/{file_dir}/Imputation/imputed_chunks/{out_file_name}.csi')
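# Usage sketch (hypothetical paths): impute one chrX chunk with sex_impute. The function picks the
# PAR1 / PAR2 / non-PAR / mixed strategy from the region string; the females list is expected to
# contain one sample ID per line.
def example_impute_chrx_chunk():
    b = hb.Batch(name='impute-chrX-chunk')
    out_dir = 'gs://my-bucket/out'

    target = b.read_input_group(**{'bcf': 'gs://my-bucket/study.chrX.phased.bcf',
                                   'bcf.csi': 'gs://my-bucket/study.chrX.phased.bcf.csi'})
    ref_panel = b.read_input_group(**{'bcf': 'gs://my-ref/reference.chrX.bcf',
                                      'bcf.csi': 'gs://my-ref/reference.chrX.bcf.csi'})
    females = b.read_input('gs://my-bucket/female_sample_ids.txt')

    sex_impute(b, vcf=target, vcf_filename_no_ext='study.chrX.10001.20000000',
               females_list=females, ref=ref_panel, region='chrX:10001-20000000',
               storage=30, memory='highmem', cpu=8, out_dir=out_dir)
    b.run()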
def concat_vcfs(b: hb.batch.Batch, vcf_basename: str = None, vcfs_to_merge: List = None, output_type: str = 'bcf',
                software: str = None, chrom: str = None, docker_img: str = 'docker.io/lindonkambule/gwaspy:v1',
                cpu: int = 8, out_dir: str = None):
    out_type = 'b' if output_type == 'bcf' else 'z'
    threads = cpu - 1
    vcfs_sizes_sum = 0
    merge_vcf_i = ''

    out_filename = f'{vcf_basename}.{chrom}.phased.{software}.bcf' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz'
    out_index_name = f'{vcf_basename}.{chrom}.phased.{software}.bcf.csi' if output_type == 'bcf' else \
        f'{vcf_basename}.{chrom}.phased.{software}.vcf.gz.csi'

    for line in vcfs_to_merge:
        vcfs_sizes_sum += 1 + bytes_to_gb(line)

    mem = 'highmem' if vcfs_sizes_sum > 2 else 'standard'
    disk_size = 10 + vcfs_sizes_sum

    concat = b.new_job(name=f'concat-{vcf_basename}')
    concat.memory(mem)
    concat.storage(f'{disk_size}Gi')
    concat.image(docker_img)
    concat.cpu(cpu)

    for line in vcfs_to_merge:
        input_vcf = b.read_input_group(vcf=line, ind=f'{line}.csi')
        merge_vcf_i += f'{input_vcf.vcf} '

    cmd = f'''
        bcftools concat \
            --no-version \
            --output-type {out_type} \
            --output {out_filename} \
            --threads {threads} \
            --ligate \
            {merge_vcf_i}
    '''

    concat.command(cmd)
    # index the merged output
    concat.command(f'bcftools index {out_filename}')

    concat.command(f'mv {out_filename} {concat.ofile}')
    concat.command(f'mv {out_index_name} {concat.idx}')
    b.write_output(concat.ofile,
                   f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_filename}')
    b.write_output(concat.idx,
                   f'{out_dir}/GWASpy/{vcf_basename}/Phasing/phased_merged/{out_index_name}')
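# Usage sketch (hypothetical chunk paths and naming): ligate per-chunk phased BCFs for one
# chromosome with the phasing variant of concat_vcfs defined directly above. Each chunk is
# expected to have a matching .csi index next to it.
def example_concat_phased_chunks():
    b = hb.Batch(name='concat-phased-chr22')
    chunks = [f'gs://my-bucket/out/GWASpy/study/Phasing/phased_chunks/study.chr22.chunk{i}.shapeit.bcf'
              for i in range(1, 4)]
    concat_vcfs(b, vcf_basename='study', vcfs_to_merge=chunks, output_type='bcf',
                software='shapeit', chrom='chr22', out_dir='gs://my-bucket/out')
    b.run()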