Exemple #1
0
def bowtie2_mapping(args):
    """Index the reference with bowtie2-build and map a read pair with bowtie2.

    The alignment is written as ``<sample>.sam`` inside the "Bam" output
    directory resolved by ``obtain_output_dir``.

    Args:
        args: Parsed options; this function reads ``r1_file``, ``r2_file``,
            ``reference``, ``threads`` and ``extensive_mapping``.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    output_file = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    report_all_alignments = args.extensive_mapping

    # bowtie2 index: the reference path doubles as the index prefix.
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    # bowtie2 map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p",
        str(args.threads), "-x", reference
    ]
    # Only add the flag when requested: an unconditional "" entry would be
    # handed to bowtie2 as a stray empty positional argument.
    if report_all_alignments:
        cmd_map.append("-a")
    execute_subprocess(cmd_map)
Exemple #2
0
def bbduk_trimming(args):
    """Trim a paired-end sample with bbduk.sh.

    Reads ``r1_file``, ``r2_file``, ``memory`` and ``threads`` from *args*.
    The cleaned reads and the trimming stats file are placed in the "Trimmed"
    output directory returned by ``obtain_output_dir``.

    TODO: expose the hard-coded trimming parameters.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    output_dir = obtain_output_dir(args, "Trimmed")
    sample = extract_sample(r1, r2)

    # Every output file shares the "<output_dir>/<sample>" stem.
    stem = output_dir + "/" + sample

    io_params = [
        "in1=" + r1,
        "in2=" + r2,
        "out1=" + stem + "_R1.clean.fastq.gz",
        "out2=" + stem + "_R2.clean.fastq.gz",
    ]
    stats_param = "stats=" + stem + "_trim.stats"
    adapters_param = "ref=" + get_bbduk_adapters()

    java_heap = "-Xmx" + str(args.memory) + "g"
    threads_param = "threads=" + str(args.threads)

    check_create_dir(output_dir)

    # bbduk.sh
    cmd = ["bbduk.sh", java_heap]
    cmd.extend(io_params)
    cmd.append(adapters_param)
    cmd.extend([
        "trimq=15", "qtrim=rl", "minlen=40", "ktrim=r", "k=21", "mink=11",
        "hammingdistance=2",
    ])
    cmd.extend([threads_param, "tpe", "tbo", stats_param])

    execute_subprocess(cmd)
Exemple #3
0
def select_variants(raw_vcf, select_type='SNP'):
    """Extract SNP or INDEL records (plus MIXED sites) with gatk SelectVariants.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    gatk SelectVariants -V cohort.vcf.gz -select-type SNP -O snps.vcf.gz

    The output path is the input path with its last two dot-separated
    components replaced by ``.snp.vcf`` or ``.indel.vcf``.

    Args:
        raw_vcf: Path to the VCF file to filter.
        select_type: Either ``"SNP"`` or ``"INDEL"``.

    Raises:
        ValueError: If *select_type* is not one of the supported values.
    """
    if select_type == "SNP":
        extension = ".snp.vcf"
    elif select_type == "INDEL":
        extension = ".indel.vcf"
    else:
        print(RED + BOLD + "Choose a correct type to filter" + END_FORMATTING)
        # Fail explicitly here instead of hitting an unbound 'extension'
        # a few lines further down.
        raise ValueError(
            "select_type must be 'SNP' or 'INDEL', got {!r}".format(
                select_type))

    input_vcf = os.path.abspath(raw_vcf)
    check_file_exists(input_vcf)

    # Strip the last two extensions (e.g. ".raw.vcf") from the input path.
    raw_vcf_file_name = ".".join(input_vcf.split(".")[:-2])
    vcf_selected_output_file = raw_vcf_file_name + extension

    cmd = [
        "gatk", "SelectVariants", "--variant", input_vcf,
        "--select-type-to-include", select_type, "--select-type-to-include",
        "MIXED", "--output", vcf_selected_output_file
    ]

    execute_subprocess(cmd)
Exemple #4
0
def select_pass_variants(raw_vcf, nocall_fr=0.1):
    """Write a ``.pass.vcf`` copy of *raw_vcf* without filtered records.

    Runs gatk SelectVariants with ``--exclude-filtered``,
    ``--remove-unused-alternates`` and ``--max-nocall-fraction``. The original
    note states this step is also used because it creates the necessary vcf
    index.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://gatkforums.broadinstitute.org/gatk/discussion/13127/do-gatk4-tools-ignore-vcf-sites-marked-as-filtered-or-must-they-be-removed-from-the-file

    Args:
        raw_vcf: Path to the VCF file to filter.
        nocall_fr: Value passed to ``--max-nocall-fraction``.
    """
    vcf_path = os.path.abspath(raw_vcf)
    check_file_exists(vcf_path)

    # Everything before the last dot, then the ".pass.vcf" suffix.
    stem = vcf_path.rpartition(".")[0]
    pass_vcf_path = stem + ".pass.vcf"

    gatk_call = ["gatk", "SelectVariants"]
    gatk_call += ["--variant", vcf_path]
    gatk_call += ["--max-nocall-fraction", str(nocall_fr)]
    gatk_call += ["--exclude-filtered", "--remove-unused-alternates"]
    gatk_call += ["--output", pass_vcf_path]

    execute_subprocess(gatk_call)
Exemple #5
0
def make_blast(query_fasta,
               database,
               sample,
               output_folder,
               db_type="nucl",
               query_type="nucl",
               evalue=0.0001,
               threads=8):
    """Build a temporary BLAST database and search *query_fasta* against it.

    ``makeblastdb`` writes ``<db name>.blast.tmp`` into *output_folder*, then
    ``blastn`` (nucleotide queries) or ``blastp`` (any other *query_type*)
    writes a tabular report to ``<sample>.<db name>.blast``.
    """
    if query_type == "nucl":
        search_tool = 'blastn'
    else:
        search_tool = 'blastp'

    # Database name: last path component, cut at its first dot.
    db_name = database.rsplit("/", 1)[-1].partition(".")[0]
    tmp_db_path = os.path.join(output_folder, db_name + ".blast.tmp")
    report_path = os.path.join(output_folder,
                               sample + "." + db_name + ".blast")

    build_db = ['makeblastdb', '-in', database]
    build_db += ['-out', tmp_db_path]
    build_db += ['-dbtype', db_type]
    logger.info(','.join(build_db))
    execute_subprocess(build_db)

    tabular_format = "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
    run_search = [search_tool]
    run_search += ["-query", query_fasta]
    run_search += ["-db", tmp_db_path]
    run_search += ["-out", report_path]
    run_search += ["-evalue", str(evalue)]
    run_search += ["-num_threads", str(threads)]
    run_search += ["-outfmt", tabular_format]
    logger.info(','.join(run_search))
    execute_subprocess(run_search)
Exemple #6
0
def bwa_mapping(args):
    """Index the reference with ``bwa index`` and map a read pair with ``bwa mem``.

    The alignment goes to ``<sample>.sam`` in the "Bam" output directory
    (written by bwa itself through ``-o``).

    Background on redirecting stdout to a file with subprocess:
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python

    Args:
        args: Parsed options; reads ``r1_file``, ``r2_file``, ``reference``
            and ``threads``.
    """
    reads_1 = os.path.abspath(args.r1_file)
    reads_2 = os.path.abspath(args.r2_file)
    ref_fasta = os.path.abspath(args.reference)

    sample = extract_sample(reads_1, reads_2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample + ".sam")

    check_create_dir(bam_dir)

    execute_subprocess(["bwa", "index", ref_fasta])

    mem_call = ["bwa", "mem"]
    mem_call += ["-t", str(args.threads)]
    mem_call += ["-o", sam_path]
    mem_call += [ref_fasta, reads_1, reads_2]
    execute_subprocess(mem_call)
    """
Exemple #7
0
def fastp_trimming(r1,
                   r2,
                   sample,
                   output_dir,
                   threads=6,
                   min_qual=20,
                   window_size=10,
                   min_len=35):
    """Trim a read pair with fastp.

    Trimmed reads are written to *output_dir*; the HTML and JSON reports go to
    its ``html`` and ``json`` subdirectories.

    Args:
        r1, r2: Paths to the forward and reverse reads.
        sample: Sample name used as the prefix of every output file.
        output_dir: Directory receiving the trimmed reads and reports.
        threads: Value passed to ``--thread``.
        min_qual: Value passed to ``--cut_mean_quality``.
        window_size: Value passed to ``--cut_window_size``.
        min_len: Value passed to ``--length_required``.
    """
    check_create_dir(output_dir)

    trimmed_r1 = os.path.join(output_dir, sample + ".trimmed_R1.fastq.gz")
    trimmed_r2 = os.path.join(output_dir, sample + ".trimmed_R2.fastq.gz")

    # One subdirectory per report format.
    html_dir = os.path.join(output_dir, 'html')
    json_dir = os.path.join(output_dir, 'json')
    check_create_dir(html_dir)
    check_create_dir(json_dir)

    html_report = os.path.join(html_dir, sample + '_fastp.html')
    json_report = os.path.join(json_dir, sample + '_fastp.json')

    fastp_call = ['fastp']
    fastp_call += ['--in1', r1, '--in2', r2]
    fastp_call += ['--out1', trimmed_r1, '--out2', trimmed_r2]
    fastp_call += ['--detect_adapter_for_pe', '--cut_tail']
    fastp_call += ['--cut_window_size', str(window_size)]
    fastp_call += ['--cut_mean_quality', str(min_qual)]
    fastp_call += ['--length_required', str(min_len)]
    fastp_call += ['--json', json_report, '--html', html_report]
    fastp_call += ['--thread', str(threads)]

    execute_subprocess(fastp_call)
Exemple #8
0
def add_SG(args, input_bam, output_bg_sorted):
    """Add read-group tags to a BAM with picard AddOrReplaceReadGroups.

    The read-group values are derived from the first header line of the
    gzipped R1 fastq. Example headers:

    @MN00227:45:000H255J3:1:11102:21214:1110 1:N:0:18
    @NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC
    @NS500454:27:HJJ32BGXX:1:11101:12392:1099 1:N:0:2

    @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:
    <is filtered>:<control number>:<sample number | barcode1'+barcode2'>

    ID = Read group identifier {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE}
    PU = Platform Unit #optional
    SM = Sample
    PL = Platform/technology used to produce the read (ILLUMINA, SOLID, LS454, HELICOS and PACBIO)
    LB = DNA preparation library identifier

    Equivalent command line:
    java -jar picard.jar AddOrReplaceReadGroups INPUT=reads.bam
    OUTPUT=reads_addRG.bam RGID=H0164.2 RGLB=library1 RGPL=illumina
    RGPU=H0164ALXX140820.2 RGSM=sample1 SORT_ORDER=coordinate

    Args:
        args: Parsed options; reads ``r1_file`` and ``r2_file``.
        input_bam: BAM file to annotate.
        output_bg_sorted: Path of the coordinate-sorted BAM to write.
    """
    reads_1 = os.path.abspath(args.r1_file)
    reads_2 = os.path.abspath(args.r2_file)

    sample = extract_sample(reads_1, reads_2)

    with gzip.open(reads_1) as fastq:
        header = fastq.readline().strip().decode()

    # Colon-separated header: field 2 and field 3 plus the trailing field
    # form the read-group identifier.
    fields = header.split(":")
    unit = ".".join([fields[2], fields[3], fields[-1]])

    read_group_params = [
        "RGID=" + unit,
        "RGLB=" + "lib_" + sample,
        "RGPL=" + "ILLUMINA",
        "RGPU=" + unit,
        "RGSM=" + sample,
    ]

    picard_jar = get_picard_path()

    picard_call = ["java", "-jar", picard_jar, "AddOrReplaceReadGroups"]
    picard_call.append("INPUT=" + input_bam)
    picard_call.append("OUTPUT=" + output_bg_sorted)
    picard_call.extend(read_group_params)
    picard_call.append("SORT_ORDER=coordinate")
    execute_subprocess(picard_call)
Exemple #9
0
def samtools_faidx(args):
    """Create the ``.fai`` index of ``args.reference`` unless it already exists.

    Equivalent command line: samtools faidx reference.fa
    """
    reference_path = os.path.abspath(args.reference)
    index_path = reference_path + ".fai"

    if not os.path.exists(index_path):
        execute_subprocess(["samtools", "faidx", reference_path])
        return

    logger.info(index_path + " already EXIST")
Exemple #10
0
def hard_filter(selected_vcf, select_type='SNP'):
    """Annotate a VCF with hard filters through gatk VariantFiltration.

    https://software.broadinstitute.org/gatk/documentation/article.php?id=6925
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php
    https://software.broadinstitute.org/gatk/documentation/article?id=23216

    Filters applied (expression -> filter name):

    SNP:   QD < 2.0 -> QD2, QUAL < 30.0 -> QUAL30, SOR > 3.5 -> SOR3,
           FS > 60.0 -> FS60, MQ < 40.0 -> MQ40, DP < 10 -> DP10,
           MQRankSum < -12.5 -> MQRankSum-12.5,
           ReadPosRankSum < -8.0 -> ReadPosRankSum-8
    INDEL: QD < 2.0 -> QD2, QUAL < 30.0 -> QUAL30, SOR > 10.0 -> SOR10,
           FS > 200.0 -> FS200, ReadPosRankSum < -20.0 -> ReadPosRankSum-20

    Note that the SNP SOR threshold is 3.5 although the filter is named
    "SOR3"; both values are kept as they were.

    The output path is the input path with its last two dot-separated
    components replaced by ``.snp.hf.vcf`` or ``.indel.hf.vcf``.

    Args:
        selected_vcf: Path to the VCF to filter.
        select_type: Either ``"SNP"`` or ``"INDEL"``.

    Raises:
        ValueError: If *select_type* is not one of the supported values.
    """
    input_vcf = os.path.abspath(selected_vcf)
    check_file_exists(input_vcf)

    # Strip the last two extensions (e.g. ".snp.vcf") from the input path.
    selected_vcf_file_name = ".".join(input_vcf.split(".")[:-2])

    if select_type == "SNP":
        extension = ".snp.hf.vcf"
        filters = [
            ("QD < 2.0", "QD2"),
            ("QUAL < 30.0", "QUAL30"),
            ("SOR > 3.5", "SOR3"),
            ("FS > 60.0", "FS60"),
            ("MQ < 40.0", "MQ40"),
            ("DP < 10", "DP10"),
            ("MQRankSum < -12.5", "MQRankSum-12.5"),
            ("ReadPosRankSum < -8.0", "ReadPosRankSum-8"),
        ]
    elif select_type == "INDEL":
        extension = ".indel.hf.vcf"
        filters = [
            ("QD < 2.0", "QD2"),
            ("QUAL < 30.0", "QUAL30"),
            ("SOR > 10.0", "SOR10"),
            ("FS > 200.0", "FS200"),
            ("ReadPosRankSum < -20.0", "ReadPosRankSum-20"),
        ]
    else:
        print(RED + BOLD + "Choose a correct type to filter" + END_FORMATTING)
        # Fail explicitly here instead of reaching execute_subprocess with
        # an unbound 'cmd'.
        raise ValueError(
            "select_type must be 'SNP' or 'INDEL', got {!r}".format(
                select_type))

    vcf_hard_filtered_output_file = selected_vcf_file_name + extension

    cmd = ["gatk", "VariantFiltration", "--variant", input_vcf]
    for expression, filter_name in filters:
        cmd += [
            "--filter-expression", expression, "--filter-name", filter_name
        ]
    cmd += ["--output", vcf_hard_filtered_output_file]

    execute_subprocess(cmd)
Exemple #11
0
def annotate_pangolin(input_file,
                      output_folder,
                      output_filename,
                      threads=8,
                      max_ambig=0.6):
    """Run pangolin on *input_file*.

    Args:
        input_file: File handed to pangolin as its positional argument.
        output_folder: Value passed to ``--outdir``.
        output_filename: Value passed to ``--outfile``.
        threads: Value passed to ``--threads``.
        max_ambig: Value passed to ``--max-ambig``.
    """
    pangolin_call = ["pangolin", input_file]
    pangolin_call += ["--outdir", output_folder]
    pangolin_call += ["--outfile", output_filename]
    pangolin_call += ["--threads", str(threads)]
    pangolin_call += ["--max-ambig", str(max_ambig)]
    execute_subprocess(pangolin_call)
Exemple #12
0
def _collect_gvcf_files(directory, excluded_path):
    """Return every ``*.g.vcf`` path under *directory* except *excluded_path*."""
    excluded_path = os.path.abspath(excluded_path)
    found = []
    for root, _, files in os.walk(directory):
        for name in files:
            filename = os.path.join(root, name)
            if not filename.endswith(".g.vcf"):
                continue
            if os.path.abspath(filename) == excluded_path:
                continue
            found.append(filename)
    return found


def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """Merge per-sample gVCF files into one cohort gVCF with gatk CombineGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    #combined multi-sample gVCF:
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz --variant sample2.g.vcf.gz -O cohort.g.vcf.gz

    Every ``*.g.vcf`` found under the "GVCF" (or "GVCF_recal") directory is
    used as input, and the cohort file ``<group>.cohort.g.vcf`` is written to
    that same directory. A cohort file left over from a previous run is
    skipped so the output is never fed back in as one of its own inputs.

    Args:
        args: Parsed options; reads ``output``, ``reference`` and ``memory``.
        recalibrate: Use the "GVCF_recal" directory instead of "GVCF".
        all_gvcf: Optional extra folder whose ``*.g.vcf`` files are added as
            inputs; a falsy value disables it.
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = output.split("/")[-1]  #group_name

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")

    gvcf_output_file = group_name + ".cohort.g.vcf"
    gvcf_output_full = os.path.join(gvcf_input_dir, gvcf_output_file)

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs", "--java-options", memory_param, "--reference",
        input_reference, "--output", gvcf_output_full
    ]

    for filename in _collect_gvcf_files(gvcf_input_dir, gvcf_output_full):
        cmd.extend(["--variant", filename])

    if all_gvcf:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enricment folder:" + all_gvcf)
            for filename in _collect_gvcf_files(all_gvcf, gvcf_output_full):
                cmd.extend(["--variant", filename])
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)
Exemple #13
0
def run_snippy_core(input_dir, output_dir, reference, filter_sample=None):
    """Run snippy-core over the sample folders directly inside *input_dir*.

    Args:
        input_dir: Directory whose immediate subdirectories are the per-sample
            snippy folders.
        output_dir: Base output directory; ``/core`` is appended and used as
            the snippy-core prefix (``-p``).
        reference: Reference passed to ``--ref``.
        filter_sample: Folder names to leave out of the core; ``None`` (the
            default) excludes nothing.
    """
    excluded = [] if filter_sample is None else filter_sample

    output_dir = output_dir + "/core"

    # Only the first os.walk() entry is needed: it lists the immediate
    # subdirectories, so the rest of the tree is never traversed. The
    # fallback covers a missing or unreadable input_dir, for which os.walk()
    # yields nothing.
    _, sample_dirs, _ = next(os.walk(input_dir), (input_dir, [], []))

    samples_snippy = []
    for name in sample_dirs:
        if name in excluded:
            logger.debug(name + " discarded from core FAULTY")
        else:
            samples_snippy.append(os.path.join(input_dir, name))

    cmd = ["snippy-core", "-p", output_dir, "--ref", reference
           ] + samples_snippy

    execute_subprocess(cmd)
Exemple #14
0
def call_variants(args, recalibrate=False, group=True):
    """Genotype a gVCF into a raw VCF with gatk GenotypeGVCFs.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    #Call variants:
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V input.g.vcf.gz -O output.vcf.gz

    Args:
        args: Parsed options; reads ``output``, ``reference``, ``sample`` and
            ``memory``. An empty ``args.sample`` is replaced in place by
            ``"nosample"``.
        recalibrate: Use the "GVCF_recal"/"VCF_recal" directories instead of
            "GVCF"/"VCF".
        group: Genotype the cohort gVCF named after the output folder when
            true, otherwise the gVCF named after ``args.sample``.
    """
    output = os.path.abspath(args.output)
    reference_path = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    sample_prefix = args.sample
    cohort_prefix = output.rsplit("/", 1)[-1]

    dir_suffix = "_recal" if recalibrate else ""
    gvcf_dir = obtain_output_dir(args, "GVCF" + dir_suffix)
    vcf_dir = obtain_output_dir(args, "VCF" + dir_suffix)

    if group:
        gvcf_name = cohort_prefix + ".cohort.g.vcf"
        vcf_name = cohort_prefix + ".cohort.raw.vcf"
    else:
        gvcf_name = sample_prefix + ".g.vcf"
        vcf_name = sample_prefix + ".raw.vcf"

    gvcf_path = os.path.join(gvcf_dir, gvcf_name)
    vcf_path = os.path.join(vcf_dir, vcf_name)

    check_create_dir(gvcf_dir)
    check_create_dir(vcf_dir)

    java_heap = "-Xmx" + str(args.memory) + "g"

    gatk_call = ["gatk", "GenotypeGVCFs"]
    gatk_call += ["--java-options", java_heap]
    gatk_call += ["--reference", reference_path]
    gatk_call += ["--variant", gvcf_path]
    gatk_call += ["--output", vcf_path]

    execute_subprocess(gatk_call)
Exemple #15
0
def refseq_masher(r1_file, r2_file, output_file, threads=16, max_results=50):
    """Run ``refseq_masher contains`` on a read pair.

    Example command line:
    refseq_masher contains --top-n-results 50 -p 16 -o HPR3641322-50.contains2.tsv
    HPR3641322-50_S27_L000_R1_001.fastq.gz HPR3641322-50_S27_L000_R2_001.fastq.gz

    Args:
        r1_file, r2_file: Paths to the forward and reverse reads.
        output_file: Path passed to ``-o``.
        threads: Value passed to ``-p``.
        max_results: Value passed to ``--top-n-results``.
    """
    reads_1, reads_2, report_path = (
        os.path.abspath(path) for path in (r1_file, r2_file, output_file))

    masher_call = ["refseq_masher", "contains"]
    masher_call += ["--top-n-results", str(max_results)]
    masher_call += ["-p", str(threads)]
    masher_call += ["-o", report_path]
    masher_call += [reads_1, reads_2]

    execute_subprocess(masher_call)
Exemple #16
0
def picard_dictionary(args):
    """Create the ``.dict`` sequence dictionary of ``args.reference`` if missing.

    Equivalent command line:
    java -jar picard.jar CreateSequenceDictionary R=reference.fasta O=reference.dict
    """
    reference_path = os.path.abspath(args.reference)

    # Swap the reference's final extension for ".dict".
    dict_path = reference_path.rpartition(".")[0] + ".dict"

    if os.path.exists(dict_path):
        logger.info(dict_path + " already EXIST")
        return

    execute_subprocess([
        "picard",
        "CreateSequenceDictionary",
        "R=" + reference_path,
        "O=" + dict_path,
    ])
def sam_to_index_bam(sample, output_dir, r1, threads):
    """Convert ``<sample>.sam`` in *output_dir* into a sorted BAM with read groups.

    Steps: ``samtools view`` (SAM to BAM), ``samtools sort``, then ``add_SG``.
    Each intermediate file is handed to ``check_remove_file`` once the next
    step has run.

    Args:
        sample: Sample name; the input is ``<output_dir>/<sample>.sam``.
        output_dir: Directory holding the SAM and receiving the BAM files.
        r1: Forwarded to ``add_SG``.
        threads: Value passed to ``samtools view --threads``.
    """
    sam_path = os.path.join(output_dir, sample + ".sam")

    # File stem shared by every derived BAM name.
    stem = os.path.basename(sam_path).rpartition(".")[0]

    bam_path = os.path.join(output_dir, stem + ".bam")
    sorted_path = os.path.join(output_dir, stem + ".sorted.bam")
    rg_sorted_path = os.path.join(output_dir, stem + ".rg.sorted.bam")

    view_call = ["samtools", "view", "-Sb", sam_path]
    view_call += ["--threads", str(threads)]
    view_call += ["-o", bam_path]
    execute_subprocess(view_call)
    check_remove_file(sam_path)

    execute_subprocess(["samtools", "sort", bam_path, "-o", sorted_path])
    check_remove_file(bam_path)

    # NOTE(review): this four-argument call assumes an
    # add_SG(sample, input_bam, output_bam, r1) signature - confirm it matches
    # the add_SG definition actually in scope.
    add_SG(sample, sorted_path, rg_sorted_path, r1)
    check_remove_file(sorted_path)
    """
Exemple #18
0
def sam_to_index_bam(args):
    """Convert the sample's SAM in the "Bam" directory into a BAM with read groups.

    Steps: ``samtools view`` (SAM to BAM), then ``add_SG``, which writes
    ``<sample>.rg.sorted.bam``. The SAM and the intermediate BAM are handed to
    ``check_remove_file`` once the following step has run.

    Args:
        args: Parsed options; reads ``r1_file``, ``r2_file`` and ``threads``.
    """
    reads_1 = os.path.abspath(args.r1_file)
    reads_2 = os.path.abspath(args.r2_file)

    sample = extract_sample(reads_1, reads_2)
    bam_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(bam_dir, sample + ".sam")

    # File stem shared by every derived BAM name.
    stem = os.path.basename(sam_path).rpartition(".")[0]

    bam_path = os.path.join(bam_dir, stem + ".bam")
    rg_sorted_path = os.path.join(bam_dir, stem + ".rg.sorted.bam")

    check_create_dir(bam_dir)

    # sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    view_call = ["samtools", "view", "-Sb", sam_path]
    view_call += ["-o", bam_path]
    view_call += ["--threads", str(args.threads)]
    execute_subprocess(view_call)
    check_remove_file(sam_path)

    add_SG(args, bam_path, rg_sorted_path)
    check_remove_file(bam_path)
    """
Exemple #19
0
def ivar_consensus(input_bam,
                   output_consensus,
                   sample,
                   min_quality=20,
                   min_frequency_threshold=0.8,
                   min_depth=20,
                   uncovered_character='N'):
    """Build a consensus sequence by piping samtools mpileup into ivar consensus.

    Usage: samtools mpileup -aa -A -d 0 -Q 0 <input.bam> | ivar consensus -p <prefix>
    Note : samtools mpileup output must be piped into ivar consensus

    ivar consensus options used here (defaults quoted from the ivar usage):
       -q    Minimum quality score threshold to count base (Default: 20)
       -t    Minimum frequency threshold (0 - 1) to call consensus (Default: 0)
                0   | Majority or most common base
                0.2 | Bases that make up at least 20% of the depth at a position
                0.5 | Bases that make up at least 50% of the depth at a position
                0.9 | Bases that make up at least 90% of the depth at a position
                1   | Bases that make up 100% of the depth at a position
       -m    Minimum depth to call consensus (Default: 10)
       -n    (N/-) Character to print in regions with less than minimum
             coverage (Default: N)
       -p    (Required) Prefix for the output fasta file and quality file

    Args:
        input_bam: BAM file to pile up.
        output_consensus: Directory for the consensus files; the ivar prefix
            is ``<output_consensus>/<sample>``.
        sample: Sample name used in the prefix.
        min_quality: Value passed to ``-q``.
        min_frequency_threshold: Value passed to ``-t``.
        min_depth: Value passed to ``-m``.
        uncovered_character: Value passed to ``-n``.
    """
    import shlex

    prefix = output_consensus + '/' + sample

    def quoted(value):
        # The command runs through a shell, so every interpolated value is
        # quoted to survive spaces and shell metacharacters.
        return shlex.quote(str(value))

    mpileup_cmd = "samtools mpileup -aa -A -d 0 -B -Q 0 {}".format(
        quoted(input_bam))
    consensus_cmd = "ivar consensus -p {} -q {} -t {} -m {} -n {}".format(
        quoted(prefix), quoted(min_quality), quoted(min_frequency_threshold),
        quoted(min_depth), quoted(uncovered_character))

    cmd = mpileup_cmd + " | " + consensus_cmd

    execute_subprocess(cmd, isShell=True)
Exemple #20
0
def picard_markdup(args):
    """Mark duplicates with picard, then sort and index the result.

    Equivalent command line:
    java -jar picard.jar MarkDuplicates I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt

    Outputs are written next to the input BAM as ``<name>.rg.markdup.bam``
    (intermediate) and ``<name>.rg.markdup.sorted.bam`` plus its index, where
    ``<name>`` is the input file name up to its first dot. The metrics file
    goes to the "Stats" output directory. The input BAM and the unsorted
    intermediate are handed to ``check_remove_file`` at the end.

    Args:
        args: Parsed options; reads ``input_bam`` (other attributes may be
            used by ``obtain_output_dir``).
    """
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)
    in_param = "I=" + input_bam

    # Cut only the file name at its first dot. Splitting the whole absolute
    # path would truncate it at the first dot in any directory name.
    bam_dir, bam_name = os.path.split(input_bam)
    file_name = bam_name.split(".")[0]
    path_file_name = os.path.join(bam_dir, file_name)

    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"
    out_param = "O=" + output_markdup

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_file = file_name + ".markdup.metrics.txt"
    stat_output_full = os.path.join(stat_output_dir, stat_output_file)
    stats_param = "M=" + stat_output_full

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates", in_param, out_param,
        stats_param
    ]
    execute_subprocess(cmd_markdup)

    # samtools sort <markdup.bam> -o <markdup.sorted.bam>
    cmd_sort = [
        "samtools", "sort", output_markdup, "-o", output_markdup_sorted
    ]
    execute_subprocess(cmd_sort)

    # samtools index <markdup.sorted.bam>
    subprocess.run(["samtools", "index", output_markdup_sorted],
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE,
                   check=True)
    check_remove_file(input_bam)
    check_remove_file(output_markdup)
0
def ivar_variants(reference,
                  input_bam,
                  output_variant,
                  sample,
                  annotation,
                  min_quality=20,
                  min_frequency_threshold=0.8,
                  min_depth=20):
    """Call variants by piping samtools mpileup into ivar variants.

    Usage: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file]
    Note : samtools mpileup output must be piped into ivar variants

    ivar variants options used here (defaults quoted from the ivar usage):
       -q    Minimum quality score threshold to count base (Default: 20)
       -t    Minimum frequency threshold (0 - 1) to call variants (Default: 0.03)
       -m    Minimum read depth to call variants (Default: 0)
       -r    Reference file used for alignment. This is used to translate the
             nucleotide sequences and identify intra host single nucleotide
             variants
       -g    A GFF file in the GFF3 format can be supplied to specify
             coordinates of open reading frames (ORFs). In absence of GFF
             file, amino acid translation will not be done.
       -p    (Required) Prefix for the output tsv variant file

    Args:
        reference: Reference fasta given to both mpileup and ``-r``.
        input_bam: BAM file to pile up.
        output_variant: Base directory; results go to its ``ivar_raw``
            subfolder with the prefix ``<ivar_raw>/<sample>``.
        sample: Sample name used in the prefix.
        annotation: GFF file passed to ``-g``.
        min_quality: Value passed to ``-q``.
        min_frequency_threshold: Value passed to ``-t``.
        min_depth: Value passed to ``-m``.
    """
    import shlex

    ivar_folder = os.path.join(output_variant, 'ivar_raw')
    check_create_dir(ivar_folder)
    prefix = ivar_folder + '/' + sample

    def quoted(value):
        # The command runs through a shell, so every interpolated value is
        # quoted to survive spaces and shell metacharacters.
        return shlex.quote(str(value))

    mpileup_cmd = (
        "samtools mpileup -aa -A -d 0 -B -Q 0 --reference {} {}".format(
            quoted(reference), quoted(input_bam)))
    variants_cmd = "ivar variants -p {} -q {} -t {} -m {} -r {} -g {}".format(
        quoted(prefix), quoted(min_quality), quoted(min_frequency_threshold),
        quoted(min_depth), quoted(reference), quoted(annotation))

    cmd = mpileup_cmd + " | " + variants_cmd

    execute_subprocess(cmd, isShell=True)
Exemple #22
0
def run_snippy(r1,
               r2,
               reference,
               output_dir,
               sample,
               threads=16,
               minqual=20,
               minfrac=0.1,
               mincov=1):
    """Run snippy on a read pair, writing to ``<output_dir>/<sample>``.

    Example command line:
    snippy --cpus 16 --outdir mysnps --ref Listeria.gbk --R1 FDA_R1.fastq.gz --R2 FDA_R2.fastq.gz

    Args:
        r1, r2: Paths to the forward and reverse reads.
        reference: Value passed to ``--ref``.
        output_dir: Parent directory of the per-sample snippy folder.
        sample: Name of the per-sample snippy folder.
        threads: Value passed to ``--cpus``.
        minqual: Value passed to ``--minqual``.
        minfrac: Value passed to ``--minfrac``.
        mincov: Value passed to ``--mincov``.
    """
    sample_dir = os.path.join(output_dir, sample)

    snippy_call = ["snippy"]
    snippy_call += ["--cpus", str(threads)]
    snippy_call += ["--outdir", sample_dir]
    snippy_call += ["--minqual", str(minqual)]
    snippy_call += ["--mincov", str(mincov)]
    snippy_call += ["--minfrac", str(minfrac)]
    snippy_call += ["--ref", reference]
    snippy_call += ["--R1", r1, "--R2", r2]

    execute_subprocess(snippy_call)
Exemple #23
0
def split_vcf_saples(vcf_file, sample_list=False, nocall_fr=0.1):
    """Split a multi-sample VCF into one VCF per sample with gatk SelectVariants.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://www.biostars.org/p/224702/
    #TODO: check if argument --exclude-filtered is suitable here. It would save select_pass_variants() step

    Each output is written next to *vcf_file* as ``<sample>.<extension>``,
    where the extension is the input file name from its third dot-separated
    component onwards. Samples whose output file already exists are skipped.

    Args:
        vcf_file: Multi-sample VCF to split.
        sample_list: Sample names to extract; when left at ``False`` they are
            obtained from ``samples_from_vcf``.
        nocall_fr: Value passed to ``--max-nocall-fraction``.
    """
    # Equality test (not identity/truthiness) kept on purpose so that an
    # explicitly passed empty list is still honoured.
    if sample_list == False:
        sample_list = samples_from_vcf(vcf_file)

    target_dir = os.path.dirname(vcf_file)
    base_name = os.path.abspath(vcf_file).rsplit("/", 1)[-1]
    extension = ".".join(base_name.split(".")[2:])

    for sample_name in sample_list:
        sample_vcf = os.path.join(target_dir, sample_name + "." + extension)

        gatk_call = ["gatk", "SelectVariants"]
        gatk_call += ["--max-nocall-fraction", str(nocall_fr)]
        gatk_call += ["--variant", vcf_file]
        gatk_call += ["--sample-name", sample_name]
        gatk_call += ["--exclude-non-variants"]
        gatk_call += ["--output", sample_vcf]

        if os.path.isfile(sample_vcf):
            continue
        execute_subprocess(gatk_call)
Exemple #24
0
def ivar_trim(input_bam,
              primers_file,
              sample,
              min_length=30,
              min_quality=20,
              sliding_window_width=4):
    """Trim primers with ivar trim, then sort and index the trimmed BAM.

    Usage: ivar trim -i <input.bam> -b <primers.bed> -p <prefix> [-m <min-length>] [-q <min-quality>] [-s <sliding-window-width>]
       -i    (Required) Sorted bam file, with aligned reads, to trim primers and quality
       -b    (Required) BED file with primer sequences and positions
       -m    Minimum length of read to retain after trimming (Default: 30)
       -q    Minimum quality threshold for sliding window to pass (Default: 20)
       -s    Width of sliding window (Default: 4)
       -e    Include reads with no primers. By default, reads with no primers are excluded
       -p    (Required) Prefix for the output BAM file

    Outputs are written next to the input BAM, named after the input file
    name up to its first dot: ``<name>.rg.markdup.trimmed.bam`` (intermediate)
    and ``<name>.rg.markdup.trimmed.sorted.bam`` plus its index. The input
    BAM, its ``.bai`` and the unsorted intermediate are handed to
    ``check_remove_file``.

    Args:
        input_bam: Sorted BAM to trim.
        primers_file: BED file with the primer positions.
        sample: Not used by this function.
        min_length: Value passed to ``-m``.
        min_quality: Value passed to ``-q``.
        sliding_window_width: Value passed to ``-s``.
    """
    input_bam = os.path.abspath(input_bam)
    input_bai = input_bam + ".bai"
    primers_file = os.path.abspath(primers_file)

    # Cut only the file name at its first dot. Splitting the whole absolute
    # path would truncate it at the first dot in any directory name.
    bam_dir, bam_name = os.path.split(input_bam)
    base_path = os.path.join(bam_dir, bam_name.split('.')[0])

    prefix = base_path + ".rg.markdup.trimmed"
    output_trimmed_bam = prefix + ".bam"
    output_trimmed_sorted_bam = base_path + ".rg.markdup.trimmed.sorted.bam"

    cmd = [
        "ivar", "trim", "-i", input_bam, "-b", primers_file, "-p", prefix,
        "-m",
        str(min_length), "-q",
        str(min_quality), "-s",
        str(sliding_window_width), "-e"
    ]
    execute_subprocess(cmd)

    check_remove_file(input_bam)

    cmd_sort = [
        "samtools", "sort", output_trimmed_bam, "-o", output_trimmed_sorted_bam
    ]
    execute_subprocess(cmd_sort)

    check_remove_file(output_trimmed_bam)

    cmd_index = ["samtools", "index", output_trimmed_sorted_bam]
    execute_subprocess(cmd_index)

    check_remove_file(input_bai)
Exemple #25
0
def fastqc_quality(r1, r2, output_dir, threads=8):
    """Run fastqc on a read pair, writing the reports to *output_dir*."""
    check_create_dir(output_dir)

    fastqc_call = ['fastqc', r1, r2]
    fastqc_call += ['-o', output_dir]
    fastqc_call += ['--threads', str(threads)]

    execute_subprocess(fastqc_call)
Exemple #26
0
def create_coverage(input_bam, output_dir, sample):
    """
    Write the output of "samtools depth -aa" for input_bam to
    <output_dir>/<sample>.cov.

    input_bam   BAM file handed to samtools depth
    output_dir  directory that receives the coverage file
    sample      sample name; the output file is named <sample>.cov

    The command runs through the shell because the result is captured with a
    ">" redirection, so both paths are shell-quoted before being interpolated.
    """
    # Local import so this block does not depend on the file's import header.
    import shlex

    output_file = os.path.join(output_dir, sample + ".cov")
    # Quoting keeps paths containing spaces or shell metacharacters intact;
    # plain paths are left unchanged by shlex.quote.
    cmd = "samtools depth -aa {} > {}".format(shlex.quote(str(input_bam)),
                                              shlex.quote(str(output_file)))
    execute_subprocess(cmd, isShell=True)
Exemple #27
0
def create_bamstat(input_bam, output_dir, sample, threads=8):
    """
    Write the output of "samtools flagstat" for input_bam to
    <output_dir>/<sample>.bamstats.

    input_bam   BAM file handed to samtools flagstat
    output_dir  directory that receives the stats file
    sample      sample name; the output file is named <sample>.bamstats
    threads     value forwarded to samtools --threads (default 8)

    The command runs through the shell because the result is captured with a
    ">" redirection, so every interpolated value is shell-quoted first.
    """
    # Local import so this block does not depend on the file's import header.
    import shlex

    output_file = os.path.join(output_dir, sample + ".bamstats")
    # Quoting keeps paths containing spaces or shell metacharacters intact;
    # plain values (including an integer thread count) are left unchanged.
    cmd = "samtools flagstat --threads {} {} > {}".format(
        shlex.quote(str(threads)), shlex.quote(str(input_bam)),
        shlex.quote(str(output_file)))
    execute_subprocess(cmd, isShell=True)
Exemple #28
0
def haplotype_caller(args,
                     recalibrate=False,
                     ploidy=2,
                     bamout=False,
                     forceactive=False,
                     intervals=False):
    """
    Run "gatk HaplotypeCaller" in GVCF mode for the sample named by args.sample.
    https://software.broadinstitute.org/gatk/documentation/article?id=11081

    args         object providing reference, sample and memory, and passed on
                 to obtain_output_dir to resolve the working directories
    recalibrate  when true, call on <sample>.rg.markdup.sorted.bam and write
                 the result under the "GVCF_recal" directory; otherwise call
                 on <sample>.bqsr.bam and write under "GVCF"
    ploidy       value forwarded to --sample-ploidy
    bamout       when true, add --bam-output pointing at
                 <sample>.p<ploidy>.out.bam in the "Bamout" directory
    forceactive  when true, add --force-active and --disable-optimizations
    intervals    accepted for interface compatibility; not used here

    The input BAM is looked up in the "Bam" directory and the output is always
    named <sample>.g.vcf. This function does not index the input BAM.
    """
    reference_path = os.path.abspath(args.reference)
    bam_dir = obtain_output_dir(args, "Bam")
    sample_id = args.sample

    # Both modes differ only in the BAM they read and the folder they fill.
    if recalibrate:
        bam_suffix, gvcf_folder = ".rg.markdup.sorted.bam", "GVCF_recal"
    else:
        bam_suffix, gvcf_folder = ".bqsr.bam", "GVCF"

    bam_name = sample_id + bam_suffix
    gvcf_dir = obtain_output_dir(args, gvcf_folder)
    gvcf_name = sample_id + ".g.vcf"

    check_create_dir(gvcf_dir)

    bam_path = os.path.join(bam_dir, bam_name)
    gvcf_path = os.path.join(gvcf_dir, gvcf_name)

    java_heap = "-Xmx" + str(args.memory) + "g"

    command = ["gatk", "HaplotypeCaller"]
    command += ["--java-options", java_heap]
    command += ["--reference", reference_path]
    command += ["--input", bam_path]
    command += ["--output", gvcf_path]
    command += ["--emit-ref-confidence", "GVCF"]
    command += ["--annotation-group", "AS_StandardAnnotation"]
    command += ["--sample-ploidy", str(ploidy)]

    if bamout:
        bamout_dir = obtain_output_dir(args, "Bamout")
        bamout_name = sample_id + ".p" + str(ploidy) + ".out.bam"
        bamout_path = os.path.join(bamout_dir, bamout_name)
        check_create_dir(bamout_dir)
        command += ["--bam-output", bamout_path]

    if forceactive:
        command += ["--force-active", "--disable-optimizations"]

    execute_subprocess(command)
    """
Exemple #29
0
def recalibrate_bam(args, tb=False):
    """
    Recalibrate the base qualities of <sample>.rg.markdup.sorted.bam with GATK.

    Two tools are run in sequence:

    BaseRecalibrator
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
        gatk BaseRecalibrator --input my_reads.bam --reference reference.fasta
            --known-sites sites_of_variation.vcf
            --known-sites another/optional/setOfSitesToMask.vcf
            --output recal_data.table
    ApplyBQSR
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php
        gatk ApplyBQSR --reference reference.fasta --input input.bam
            --bqsr-recal-file recalibration.table --output output.bam

    args  object providing reference, sample and memory, and passed on to
          obtain_output_dir to resolve the "Bam" and "VCF_recal" directories
    tb    request the bundled M. tuberculosis known-sites table. It is switched
          on automatically when the absolute reference path contains
          "NC_000962.3", "h37rv" (any case) or "ancestor".

    Known sites passed to BaseRecalibrator are the bundled table (when one
    applies) plus every file ending in ".hf.pass.vcf" found under the
    "VCF_recal" directory. The recalibration table is written to
    <VCF_recal>/<sample>.recall.table and the recalibrated BAM to
    <Bam>/<sample>.bqsr.bam.

    Raises ValueError when tb is true but the reference path matches none of
    the references a bundled table exists for.
    """
    input_reference = os.path.abspath(args.reference)

    # Automate M. tuberculosis reference for additional recalibration positions.
    # NOTE: the match is done on the whole absolute path, so a directory name
    # can trigger it as well as the file name.
    matches_h37rv = ("NC_000962.3" in input_reference
                     or "h37rv" in input_reference.lower())
    matches_ancestor = "ancestor" in input_reference

    reference_file = None
    if matches_h37rv or matches_ancestor:
        script_dir = os.path.dirname(os.path.realpath(__file__))
        reference_dir = os.path.join(script_dir, "reference")
        # H37Rv takes precedence when both patterns are present in the path.
        if matches_h37rv:
            table_name = "190508_ddtb.NC_000962.3.BQSR.table"
        else:
            table_name = "190508_ddtb.BQSR.table"
        reference_file = os.path.join(reference_dir, table_name)
    elif tb:
        # Previously this path reached the command assembly with
        # reference_file unbound; fail early with an explicit message instead.
        raise ValueError(
            "tb=True was requested but no bundled known-sites table matches "
            "reference '{}'".format(input_reference))

    sample_name = args.sample
    bam_input_dir = obtain_output_dir(args, "Bam")
    vcf_input_dir = obtain_output_dir(args, "VCF_recal")

    bam_input_file = os.path.join(bam_input_dir,
                                  sample_name + ".rg.markdup.sorted.bam")
    table_output_file = os.path.join(vcf_input_dir,
                                     sample_name + ".recall.table")

    memory_param = "-Xmx" + str(args.memory) + "g"

    # BaseRecalibrator
    cmd_bqsr = [
        "gatk", "BaseRecalibrator", "--java-options", memory_param,
        "--reference", input_reference, "--input", bam_input_file, "--output",
        table_output_file
    ]

    if reference_file is not None:
        cmd_bqsr.extend(["--known-sites", reference_file])

    # Every filtered VCF found under the VCF_recal tree is masked as well.
    for root, _, files in os.walk(vcf_input_dir):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".hf.pass.vcf"):
                cmd_bqsr.extend(["--known-sites", filename])

    execute_subprocess(cmd_bqsr)

    # ApplyBQSR
    bam_output_file = os.path.join(bam_input_dir, sample_name + ".bqsr.bam")

    cmd_apply = [
        "gatk", "ApplyBQSR", "--reference", input_reference, "--input",
        bam_input_file, "--bqsr-recal-file", table_output_file, "--output",
        bam_output_file
    ]

    execute_subprocess(cmd_apply)