Example #1
0
def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """
    Merge per-sample gVCF files into a single cohort gVCF with GATK CombineGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    #combined multi-sample gVCF:
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz --variant sample2.g.vcf.gz -O cohort.g.vcf.gz

    Parameters:
        args: object exposing ``output``, ``reference`` and ``memory``; it is
            also handed to ``obtain_output_dir``.
        recalibrate: when true the "GVCF_recal" directory is used, otherwise
            the "GVCF" directory.
        all_gvcf: optional path to an extra folder whose ``*.g.vcf`` files are
            added to the cohort. Any falsy value (False, None, "") disables it.

    The cohort file is written as ``<group>.cohort.g.vcf`` inside the same
    directory that is scanned for inputs, so it is explicitly excluded from
    the ``--variant`` list; otherwise a re-run would feed the previous output
    back into CombineGVCFs as an input.
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = os.path.basename(output)  #group_name

    gvcf_input_dir = obtain_output_dir(
        args, "GVCF_recal" if recalibrate else "GVCF")

    gvcf_output_full = os.path.join(gvcf_input_dir,
                                    group_name + ".cohort.g.vcf")

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs", "--java-options", memory_param, "--reference",
        input_reference, "--output", gvcf_output_full
    ]

    _append_gvcf_variants(cmd, gvcf_input_dir, skip=gvcf_output_full)

    #Truthiness test: a None/empty value must not reach os.path.isdir
    if all_gvcf:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enricment folder:" + all_gvcf)
            _append_gvcf_variants(cmd, all_gvcf, skip=gvcf_output_full)
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)


def _append_gvcf_variants(cmd, directory, skip=None):
    """
    Append "--variant <file>" to cmd for every *.g.vcf found under directory.

    The tree is walked recursively with os.walk. A file whose absolute path
    equals ``skip`` (the cohort output) is ignored.
    """
    skip_path = os.path.abspath(skip) if skip else None
    for root, _, files in os.walk(directory):
        for name in files:
            filename = os.path.join(root, name)
            if not filename.endswith(".g.vcf"):
                continue
            if skip_path is not None and os.path.abspath(
                    filename) == skip_path:
                continue
            cmd.extend(["--variant", filename])
Example #2
0
def bbduk_trimming(args):
    """
    Run bbduk.sh on a read pair to trim adapters and low quality bases.

    Reads ``r1_file``, ``r2_file``, ``memory`` and ``threads`` from args.
    The cleaned fastq files and the trimming stats file are named after the
    sample returned by extract_sample and are placed in the directory
    returned by obtain_output_dir(args, "Trimmed").

    TODO : handle params (trimming thresholds are hard-coded below)
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)

    trimmed_dir = obtain_output_dir(args, "Trimmed")
    sample_id = extract_sample(fastq_r1, fastq_r2)

    #Common prefix of every file produced for this sample
    out_prefix = trimmed_dir + "/" + sample_id

    adapters = "ref=" + get_bbduk_adapters()

    java_memory = "-Xmx{}g".format(args.memory)
    threads = "threads={}".format(args.threads)

    check_create_dir(trimmed_dir)

    #bbduk.sh
    cmd = ["bbduk.sh", java_memory]
    cmd += ["in1=" + fastq_r1, "in2=" + fastq_r2]
    cmd += [
        "out1=" + out_prefix + "_R1.clean.fastq.gz",
        "out2=" + out_prefix + "_R2.clean.fastq.gz",
    ]
    cmd += [
        adapters, "trimq=15", "qtrim=rl", "minlen=40", "ktrim=r", "k=21",
        "mink=11", "hammingdistance=2", threads, "tpe", "tbo"
    ]
    cmd.append("stats=" + out_prefix + "_trim.stats")

    execute_subprocess(cmd)
Example #3
0
def bwa_mapping(args):
    """
    Index the reference with "bwa index" and align the read pair with "bwa mem".

    Reads ``r1_file``, ``r2_file``, ``reference`` and ``threads`` from args.
    The alignment is written through the -o option to <sample>.sam inside
    the directory returned by obtain_output_dir(args, "Bam").
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)
    ref_fasta = os.path.abspath(args.reference)

    sample_id = extract_sample(fastq_r1, fastq_r2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample_id + ".sam")

    check_create_dir(bam_dir)

    #bwa index
    execute_subprocess(["bwa", "index", ref_fasta])

    #bwa mem
    mem_cmd = ["bwa", "mem", "-t", str(args.threads)]
    mem_cmd += ["-o", sam_path, ref_fasta, fastq_r1, fastq_r2]
    execute_subprocess(mem_cmd)
    """
Example #4
0
def bowtie2_mapping(args):
    """
    Build a bowtie2 index for the reference and align the read pair against it.

    Reads ``r1_file``, ``r2_file``, ``reference``, ``threads`` and
    ``extensive_mapping`` from args. The SAM output goes to <sample>.sam
    inside the directory returned by obtain_output_dir(args, "Bam").

    When ``args.extensive_mapping`` is true the "-a" flag is added. When it
    is false nothing is added: an empty string must not be placed in the
    command list, because it would reach bowtie2 as a real (empty) argument.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    output_file = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    #bowtie2 index (the reference path doubles as the index basename)
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    #bowtie map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p",
        str(args.threads), "-x", reference
    ]
    if args.extensive_mapping:
        cmd_map.append("-a")

    execute_subprocess(cmd_map)
Example #5
0
def call_variants(args, recalibrate=False, group=True):
    """
    Genotype a gVCF into a raw VCF with GATK GenotypeGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    #Call variants:
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V input.g.vcf.gz -O output.vcf.gz

    recalibrate selects the "GVCF_recal"/"VCF_recal" directories instead of
    "GVCF"/"VCF". group selects the cohort file (named after the last
    component of args.output) instead of the per-sample file (named after
    args.sample).

    NOTE: args.sample is set to "nosample" in place when it is empty.
    """
    out_root = os.path.abspath(args.output)
    ref_fasta = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    dir_suffix = "_recal" if recalibrate else ""
    gvcf_dir = obtain_output_dir(args, "GVCF" + dir_suffix)
    vcf_dir = obtain_output_dir(args, "VCF" + dir_suffix)

    #Shared stem of the input gVCF and the output VCF
    if group:
        stem = out_root.split("/")[-1] + ".cohort"
    else:
        stem = args.sample

    gvcf_in = os.path.join(gvcf_dir, stem + ".g.vcf")
    vcf_out = os.path.join(vcf_dir, stem + ".raw.vcf")

    for directory in (gvcf_dir, vcf_dir):
        check_create_dir(directory)

    cmd = ["gatk", "GenotypeGVCFs"]
    cmd += ["--java-options", "-Xmx{}g".format(args.memory)]
    cmd += ["--reference", ref_fasta]
    cmd += ["--variant", gvcf_in]
    cmd += ["--output", vcf_out]

    execute_subprocess(cmd)
Example #6
0
def sam_to_index_bam(args):
    """
    Convert the sample SAM into BAM and hand it to add_SG.

    The SAM is expected at <sample>.sam inside the directory returned by
    obtain_output_dir(args, "Bam"). Steps, in order:
      1. samtools view -Sb writes <sample>.bam
      2. the SAM is removed with check_remove_file
      3. add_SG is called with the BAM and the <sample>.rg.sorted.bam path
      4. the intermediate BAM is removed with check_remove_file
    """
    fastq_r1 = os.path.abspath(args.r1_file)
    fastq_r2 = os.path.abspath(args.r2_file)

    sample_id = extract_sample(fastq_r1, fastq_r2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample_id + ".sam")

    #File name without its last extension
    stem = os.path.basename(sam_path).rsplit(".", 1)[0]

    bam_path = os.path.join(bam_dir, stem + ".bam")
    rg_sorted_path = os.path.join(bam_dir, stem + ".rg.sorted.bam")

    check_create_dir(bam_dir)

    #sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    view_cmd = ["samtools", "view", "-Sb", sam_path, "-o", bam_path]
    view_cmd += ["--threads", str(args.threads)]
    execute_subprocess(view_cmd)

    check_remove_file(sam_path)

    add_SG(args, bam_path, rg_sorted_path)

    check_remove_file(bam_path)
    """
Example #7
0
def picard_markdup(args):
    """
    Mark duplicates with Picard, then sort and index the result with samtools.

    #java -jar picard.jar MarkDuplicates \\
    #  I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt

    Reads ``input_bam`` from args. Output files are created next to the
    input BAM as <name>.rg.markdup.bam (intermediate) and
    <name>.rg.markdup.sorted.bam, where <name> is the input file name up to
    its first dot. The metrics file goes to the directory returned by
    obtain_output_dir(args, "Stats").

    The input BAM and the unsorted markdup BAM are removed at the end.

    Raises:
        subprocess.CalledProcessError: if "samtools index" exits non-zero.
    """
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)
    in_param = "I=" + input_bam

    #Strip extensions from the file name only: splitting the whole path at
    #the first dot would truncate it inside any directory containing a dot
    bam_dir, bam_basename = os.path.split(input_bam)
    file_name = bam_basename.split(".")[0]
    path_file_name = os.path.join(bam_dir, file_name)

    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"
    out_param = "O=" + output_markdup

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_file = file_name + ".markdup.metrics.txt"
    stat_output_full = os.path.join(stat_output_dir, stat_output_file)
    stats_param = "M=" + stat_output_full

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates", in_param, out_param,
        stats_param
    ]
    execute_subprocess(cmd_markdup)

    #samtools sort: samtools sort $output_dir/$sample".markdup.bam" -o $output_dir/$sample".markdup.sorted.bam"
    cmd_sort = [
        "samtools", "sort", output_markdup, "-o", output_markdup_sorted
    ]
    execute_subprocess(cmd_sort)

    #samtools index: samtools index $output_dir/$sample".sorted.bam"
    subprocess.run(["samtools", "index", output_markdup_sorted],
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE,
                   check=True)

    #Only the sorted, indexed BAM is kept
    check_remove_file(input_bam)
    check_remove_file(output_markdup)
def mash_screen(
        args,
        winner=True,
        r2=False,
        mash_database="/home/laura/DATABASES/Mash/refseq.genomes.k21s1000.msh"
):
    """
    Screen the sample reads against a Mash sketch database.

    #https://mash.readthedocs.io/en/latest/index.html
    #https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh #MASH refseq database
    # mash screen -w -p 4 ../refseq.genomes.k21s1000.msh 4_R1.fastq.gz 4_R2.fastq.gz > 4.winner.screen.tab
    #identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment

    Parameters:
        args: object exposing ``threads``, ``r1_file`` and ``r2_file``; it is
            also handed to ``obtain_output_dir``.
        winner: add the "-w" flag to mash screen.
        r2: also screen the R2 file (R1 alone is faster).
        mash_database: path to the .msh database file.

    The stdout of mash is written to <sample>.screen.tab inside the
    directory returned by obtain_output_dir(args, "Species"). The process
    exits with status 1 when the database file is missing, and with an error
    message when the program cannot be started. A non-zero exit code from
    mash is reported on stdout but does not stop the caller.
    """
    if not os.path.isfile(mash_database):
        print(RED + BOLD + "Mash database can't be found\n" + END_FORMATTING +
              "You can download it typing:\n\
            wget https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh")
        sys.exit(1)

    threads = args.threads

    #Keep the paths in their own names: reusing "r2" for the path would
    #clobber the boolean flag and R2 would never be screened
    r1_path = os.path.abspath(args.r1_file)
    r2_path = os.path.abspath(args.r2_file)

    sample = extract_sample(r1_path, r2_path)

    species_output_dir = obtain_output_dir(args, "Species")
    check_create_dir(species_output_dir)
    species_output_name = sample + ".screen.tab"
    species_output_file = os.path.join(species_output_dir, species_output_name)

    cmd = ["mash", "screen"]
    if winner:
        cmd.append("-w")
    cmd.extend(["-p", str(threads), mash_database, r1_path])
    #Use both r1 and r2 instead of just r1(faster)
    if r2:
        cmd.append(r2_path)

    prog = cmd[0]
    param = cmd[1:]

    try:
        with open(species_output_file, "w") as outfile:
            #calculate mash distance and save it in output file
            command = subprocess.run(cmd,
                                     stdout=outfile,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
    except OSError as e:
        sys.exit(RED + BOLD + "failed to execute program '%s': %s" %
                 (prog, str(e)) + END_FORMATTING)

    if command.returncode == 0:
        print(GREEN + "Program %s successfully executed" % prog +
              END_FORMATTING)
    else:
        print(RED + BOLD + "Command %s FAILED\n" % prog + END_FORMATTING +
              BOLD + "WITH PARAMETERS: " + END_FORMATTING +
              " ".join(param) + "\n" + BOLD +
              "EXIT-CODE: %d\n" % command.returncode + "ERROR:\n" +
              END_FORMATTING + command.stderr)
Example #9
0
def recalibrate_bam(args, tb=False):
    """
    Recalibrate base qualities of the sample BAM with GATK BQSR.

    BaseRecalibrator
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    #Recalibrate bam:
    gatk BaseRecalibrator --input my_reads.bam --reference reference.fasta --known-sites sites_of_variation.vcf
    --known-sites another/optional/setOfSitesToMask.vcf --output recal_data.table
    ApplyBQSR
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php
    gatk ApplyBQSR --reference reference.fasta --input input.bam --bqsr-recal-file recalibration.table --output output.bam

    Parameters:
        args: object exposing ``reference``, ``sample`` and ``memory``; it is
            also handed to ``obtain_output_dir``.
        tb: request the bundled M. tuberculosis known-sites table. The table
            is chosen from the reference path, and it is also added
            automatically whenever the reference path matches, regardless
            of this flag.

    Input is <sample>.rg.markdup.sorted.bam in the "Bam" directory; every
    *.hf.pass.vcf found under the "VCF_recal" directory is passed as
    --known-sites. Outputs are <sample>.recall.table ("VCF_recal") and
    <sample>.bqsr.bam ("Bam").
    """
    input_reference = os.path.abspath(args.reference)

    #Automate M. tuberculosis reference for additional recalibration positions
    if ("NC_000962.3" in input_reference
            or "h37rv" in input_reference.lower()):
        known_sites_table = "190508_ddtb.NC_000962.3.BQSR.table"
    elif "ancestor" in input_reference:
        known_sites_table = "190508_ddtb.BQSR.table"
    else:
        known_sites_table = None

    reference_file = None
    if known_sites_table is not None:
        script_dir = os.path.dirname(os.path.realpath(__file__))
        reference_file = os.path.join(script_dir, "reference",
                                      known_sites_table)
    elif tb:
        #tb was requested but no bundled table matches this reference;
        #continue with the VCF known-sites only instead of failing on an
        #undefined table path
        print("No bundled BQSR known-sites table matches reference: " +
              input_reference)

    sample_name = args.sample
    bam_input_dir = obtain_output_dir(args, "Bam")
    vcf_input_dir = obtain_output_dir(args, "VCF_recal")

    bam_input_file = os.path.join(bam_input_dir,
                                  sample_name + ".rg.markdup.sorted.bam")
    table_output_file = os.path.join(vcf_input_dir,
                                     sample_name + ".recall.table")

    memory_param = "-Xmx" + str(args.memory) + "g"

    #BaseRecalibrator

    cmd_bqsr = [
        "gatk", "BaseRecalibrator", "--java-options", memory_param,
        "--reference", input_reference, "--input", bam_input_file, "--output",
        table_output_file
    ]

    if reference_file is not None:
        cmd_bqsr.extend(["--known-sites", reference_file])

    for root, _, files in os.walk(vcf_input_dir):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".hf.pass.vcf"):
                cmd_bqsr.extend(["--known-sites", filename])

    execute_subprocess(cmd_bqsr)

    #ApplyBQSR

    bam_output_file = os.path.join(bam_input_dir, sample_name + ".bqsr.bam")

    cmd_apply = [
        "gatk", "ApplyBQSR", "--reference", input_reference, "--input",
        bam_input_file, "--bqsr-recal-file", table_output_file, "--output",
        bam_output_file
    ]

    execute_subprocess(cmd_apply)
Example #10
0
def haplotype_caller(args,
                     recalibrate=False,
                     ploidy=2,
                     bamout=False,
                     forceactive=False,
                     intervals=False):
    """
    Call variants for one sample with GATK HaplotypeCaller in GVCF mode.

    #No excuses
    https://software.broadinstitute.org/gatk/documentation/article?id=11081

    recalibrate: read <sample>.rg.markdup.sorted.bam and write to
        "GVCF_recal"; otherwise read <sample>.bqsr.bam and write to "GVCF".
    ploidy: value passed to --sample-ploidy.
    bamout: also write <sample>.p<ploidy>.out.bam in the "Bamout" directory.
    forceactive: add --force-active and --disable-optimizations.
    intervals: accepted but not used by this function.
    """
    ref_fasta = os.path.abspath(args.reference)

    bam_dir = obtain_output_dir(args, "Bam")
    sample_id = args.sample

    if recalibrate:
        bam_name = sample_id + ".rg.markdup.sorted.bam"
        gvcf_subdir = "GVCF_recal"
    else:
        bam_name = sample_id + ".bqsr.bam"
        gvcf_subdir = "GVCF"

    gvcf_dir = obtain_output_dir(args, gvcf_subdir)
    check_create_dir(gvcf_dir)

    bam_to_call = os.path.join(bam_dir, bam_name)
    gvcf_path = os.path.join(gvcf_dir, sample_id + ".g.vcf")

    hc_args = ["gatk", "HaplotypeCaller"]
    hc_args += ["--java-options", "-Xmx{}g".format(args.memory)]
    hc_args += ["--reference", ref_fasta]
    hc_args += ["--input", bam_to_call]
    hc_args += ["--output", gvcf_path]
    hc_args += ["--emit-ref-confidence", "GVCF"]
    hc_args += ["--annotation-group", "AS_StandardAnnotation"]
    hc_args += ["--sample-ploidy", str(ploidy)]

    if bamout:
        bamout_dir = obtain_output_dir(args, "Bamout")
        bamout_name = "{}.p{}.out.bam".format(sample_id, ploidy)
        bamout_path = os.path.join(bamout_dir, bamout_name)
        check_create_dir(bamout_dir)
        hc_args += ["--bam-output", bamout_path]

    if forceactive:
        hc_args += ["--force-active", "--disable-optimizations"]

    execute_subprocess(hc_args)
    """