コード例 #1
0
ファイル: pe_mapper.py プロジェクト: pedroscampoy/SNPTB
def bowtie2_mapping(args):
    """Index the reference with bowtie2-build and map a read pair with bowtie2.

    Reads ``args.r1_file``, ``args.r2_file``, ``args.reference``,
    ``args.threads`` and ``args.extensive_mapping``. The alignment is written
    to ``<sample>.sam`` inside the directory returned by
    ``obtain_output_dir(args, "Bam")``.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    output_file = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    # bowtie2 index: the reference path doubles as the index prefix.
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    # bowtie2 map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p",
        str(args.threads), "-x", reference
    ]
    # Add the flag only when requested: an unconditional "" placeholder would
    # be handed to the program as a spurious empty argument.
    if args.extensive_mapping:
        cmd_map.append("-a")
    execute_subprocess(cmd_map)
コード例 #2
0
ファイル: bbduk_trimmer.py プロジェクト: pedroscampoy/SNPTB
def bbduk_trimming(args):
    """Trim a read pair with bbduk.sh.

    Inputs are ``args.r1_file`` and ``args.r2_file``; ``args.memory`` is
    passed as ``-Xmx<memory>g`` and ``args.threads`` as ``threads=<n>``.
    Trimmed reads and a ``_trim.stats`` file are written to the directory
    returned by ``obtain_output_dir(args, "Trimmed")``.

    TODO : handle params
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    output_dir = obtain_output_dir(args, "Trimmed")
    sample = extract_sample(r1, r2)

    # Every output shares the "<output_dir>/<sample>" prefix.
    out_prefix = output_dir + "/" + sample
    adapter_path = "ref=" + get_bbduk_adapters()

    check_create_dir(output_dir)

    # bbduk.sh
    cmd = ["bbduk.sh", "-Xmx" + str(args.memory) + "g"]
    cmd += ["in1=" + r1, "in2=" + r2]
    cmd += [
        "out1=" + out_prefix + "_R1.clean.fastq.gz",
        "out2=" + out_prefix + "_R2.clean.fastq.gz",
    ]
    cmd += [
        adapter_path, "trimq=15", "qtrim=rl", "minlen=40", "ktrim=r", "k=21",
        "mink=11", "hammingdistance=2"
    ]
    cmd += ["threads=" + str(args.threads), "tpe", "tbo"]
    cmd.append("stats=" + out_prefix + "_trim.stats")

    execute_subprocess(cmd)
コード例 #3
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def select_variants(raw_vcf, select_type='SNP'):
    """Extract one variant type (plus MIXED records) with gatk SelectVariants.

    The output path is the input path with its last two dot-separated
    components replaced by ``.snp.vcf`` or ``.indel.vcf``.

    Args:
        raw_vcf: path to the VCF file to read.
        select_type: ``"SNP"`` or ``"INDEL"``.

    Raises:
        ValueError: if ``select_type`` is neither ``"SNP"`` nor ``"INDEL"``.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    gatk SelectVariants -V cohort.vcf.gz -select-type SNP -O snps.vcf.gz
    """
    extensions = {"SNP": ".snp.vcf", "INDEL": ".indel.vcf"}
    if select_type not in extensions:
        print(RED + BOLD + "Choose a correct type to filter" + END_FORMATTING)
        # Stop here: carrying on would fail later on an unbound output
        # extension instead of reporting the real problem.
        raise ValueError(
            "select_type must be 'SNP' or 'INDEL', got {!r}".format(
                select_type))
    extension = extensions[select_type]

    input_vcf = os.path.abspath(raw_vcf)
    check_file_exists(input_vcf)

    # Drop the last two dot-separated components (e.g. ".raw.vcf").
    raw_vcf_file_name = (".").join(input_vcf.split(".")[:-2])
    vcf_selected_output_file = raw_vcf_file_name + extension

    cmd = [
        "gatk", "SelectVariants", "--variant", input_vcf,
        "--select-type-to-include", select_type, "--select-type-to-include",
        "MIXED", "--output", vcf_selected_output_file
    ]

    execute_subprocess(cmd)
コード例 #4
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def select_pass_variants(raw_vcf, nocall_fr=0.1):
    """Write a ``.pass.vcf`` copy of a VCF with filtered records excluded.

    Runs gatk SelectVariants with ``--exclude-filtered``,
    ``--remove-unused-alternates`` and ``--max-nocall-fraction``. The output
    path is the input path with its last extension replaced by ``.pass.vcf``.
    According to the original author's note, this step is also used because
    it creates the necessary VCF index.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://gatkforums.broadinstitute.org/gatk/discussion/13127/do-gatk4-tools-ignore-vcf-sites-marked-as-filtered-or-must-they-be-removed-from-the-file
    """
    source_vcf = os.path.abspath(raw_vcf)
    check_file_exists(source_vcf)

    # Everything before the final dot; empty when the path has no dot.
    stem = source_vcf.rpartition(".")[0]
    target_vcf = stem + ".pass.vcf"

    cmd = ["gatk", "SelectVariants"]
    cmd += ["--variant", source_vcf]
    cmd += ["--max-nocall-fraction", str(nocall_fr)]
    cmd += ["--exclude-filtered", "--remove-unused-alternates"]
    cmd += ["--output", target_vcf]

    execute_subprocess(cmd)
コード例 #5
0
def make_blast(query_fasta,
               database,
               sample,
               output_folder,
               db_type="nucl",
               query_type="nucl",
               evalue=0.0001,
               threads=8):
    """Build a BLAST database from *database* and search *query_fasta* in it.

    The database is written to ``<output_folder>/<db name>.blast.tmp`` and
    the tabular hits (outfmt 6 with qlen/slen appended) to
    ``<output_folder>/<sample>.<db name>.blast``. ``blastn`` is used when
    ``query_type`` is ``"nucl"``, ``blastp`` otherwise. Both command lines
    are logged before they run.
    """
    if query_type == "nucl":
        blast_command = 'blastn'
    else:
        blast_command = 'blastp'

    # File name of the database path, up to its first dot.
    database_name = database.rpartition("/")[2].partition(".")[0]
    db_prefix = os.path.join(output_folder, database_name + ".blast.tmp")
    hits_file = os.path.join(output_folder,
                             sample + "." + database_name + ".blast")

    blastdb_cmd = ['makeblastdb']
    blastdb_cmd += ['-in', database]
    blastdb_cmd += ['-out', db_prefix]
    blastdb_cmd += ['-dbtype', db_type]

    logger.info((',').join(blastdb_cmd))
    execute_subprocess(blastdb_cmd)

    blast_cmd = [blast_command]
    blast_cmd += ["-query", query_fasta]
    blast_cmd += ["-db", db_prefix]
    blast_cmd += ["-out", hits_file]
    blast_cmd += ["-evalue", str(evalue)]
    blast_cmd += ["-num_threads", str(threads)]
    blast_cmd += [
        "-outfmt",
        "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
    ]

    logger.info((',').join(blast_cmd))
    execute_subprocess(blast_cmd)
コード例 #6
0
ファイル: pe_mapper.py プロジェクト: pedroscampoy/SNPTB
def bwa_mapping(args):
    """Index the reference with bwa and align the read pair with ``bwa mem``.

    The alignment is written to ``<sample>.sam`` through the ``-o`` option,
    inside the directory returned by ``obtain_output_dir(args, "Bam")``.

    Background on storing a tool's stdout in a file instead:
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(output_dir, sample + ".sam")

    check_create_dir(output_dir)

    execute_subprocess(["bwa", "index", reference])

    mem_cmd = ["bwa", "mem"]
    mem_cmd += ["-t", str(args.threads)]
    mem_cmd += ["-o", sam_path]
    mem_cmd += [reference, r1, r2]
    execute_subprocess(mem_cmd)
    """
コード例 #7
0
def fastp_trimming(r1,
                   r2,
                   sample,
                   output_dir,
                   threads=6,
                   min_qual=20,
                   window_size=10,
                   min_len=35):
    """Trim a read pair with fastp.

    Trimmed reads go to ``<output_dir>/<sample>.trimmed_R{1,2}.fastq.gz``;
    the HTML and JSON reports go to the ``html`` and ``json`` subfolders of
    ``output_dir``, which are created when missing.
    """
    check_create_dir(output_dir)

    trimmed_r1 = os.path.join(output_dir, sample + ".trimmed_R1.fastq.gz")
    trimmed_r2 = os.path.join(output_dir, sample + ".trimmed_R2.fastq.gz")

    # One report subfolder per format: <output_dir>/<format>/<sample>_fastp.<format>
    reports = {}
    for kind in ('html', 'json'):
        report_dir = os.path.join(output_dir, kind)
        check_create_dir(report_dir)
        reports[kind] = os.path.join(report_dir, sample + '_fastp.' + kind)

    cmd = ['fastp']
    cmd += ['--in1', r1, '--in2', r2]
    cmd += ['--out1', trimmed_r1, '--out2', trimmed_r2]
    cmd += ['--detect_adapter_for_pe', '--cut_tail']
    cmd += ['--cut_window_size', str(window_size)]
    cmd += ['--cut_mean_quality', str(min_qual)]
    cmd += ['--length_required', str(min_len)]
    cmd += ['--json', reports['json'], '--html', reports['html']]
    cmd += ['--thread', str(threads)]

    execute_subprocess(cmd)
コード例 #8
0
ファイル: pe_mapper.py プロジェクト: pedroscampoy/SNPTB
def add_SG(args, input_bam, output_bg_sorted):
    """Add read-group tags to a BAM with picard AddOrReplaceReadGroups.

    The read group is derived from the first header line of the gzipped R1
    fastq (``args.r1_file``). Example headers:

    @MN00227:45:000H255J3:1:11102:21214:1110 1:N:0:18
    @NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC
    @NS500454:27:HJJ32BGXX:1:11101:12392:1099 1:N:0:2

    @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:
    <is filtered>:<control number>:<sample number | barcode1'+barcode2'>

    ID = Read group identifier {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE}
    PU = Platform Unit (set to the same value as ID here)
    SM = Sample
    PL = Platform/technology used to produce the read (always ILLUMINA here)
    LB = DNA preparation library identifier ("lib_<sample>")

    The output is written coordinate-sorted to ``output_bg_sorted``.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)

    with gzip.open(r1) as fastq:
        header = fastq.readline().strip().decode()
    fields = header.split(":")

    # Colon-separated fields 2 and 3 plus the last one, joined with dots.
    read_group = ".".join((fields[2], fields[3], fields[-1]))

    picard_jar = get_picard_path()

    # Equivalent command line:
    # java -jar picard.jar AddOrReplaceReadGroups INPUT=reads.bam
    #   OUTPUT=reads_addRG.bam RGID=... RGLB=... RGPL=... RGPU=... RGSM=...
    #   SORT_ORDER=coordinate
    cmd = ["java", "-jar", picard_jar, "AddOrReplaceReadGroups"]
    cmd.append("INPUT=" + input_bam)
    cmd.append("OUTPUT=" + output_bg_sorted)
    cmd.append("RGID=" + read_group)
    cmd.append("RGLB=" + "lib_" + sample)
    cmd.append("RGPL=" + "ILLUMINA")
    cmd.append("RGPU=" + read_group)
    cmd.append("RGSM=" + sample)
    cmd.append("SORT_ORDER=coordinate")
    execute_subprocess(cmd)
コード例 #9
0
def samtools_faidx(args):
    """Create the ``.fai`` index of ``args.reference`` unless it already exists."""
    reference_path = os.path.abspath(args.reference)
    index_path = reference_path + ".fai"

    if not os.path.exists(index_path):
        # samtools faidx reference.fa
        execute_subprocess(["samtools", "faidx", reference_path])
        return

    logger.info(index_path + " already EXIST")
コード例 #10
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def hard_filter(selected_vcf, select_type='SNP'):
    """Annotate a VCF with hard filters using gatk VariantFiltration.

    Each (expression, name) pair below becomes one
    ``--filter-expression ... --filter-name ...`` pair on the command line.
    The output path is the input path with its last two dot-separated
    components replaced by ``.snp.hf.vcf`` or ``.indel.hf.vcf``.

    Args:
        selected_vcf: path to the VCF to filter.
        select_type: ``"SNP"`` or ``"INDEL"``; selects the filter set.

    Raises:
        ValueError: if ``select_type`` is neither ``"SNP"`` nor ``"INDEL"``.

    https://software.broadinstitute.org/gatk/documentation/article.php?id=6925
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php
    https://software.broadinstitute.org/gatk/documentation/article?id=23216
    """
    filter_sets = {
        "SNP": (".snp.hf.vcf", [
            ("QD < 2.0", "QD2"),
            ("QUAL < 30.0", "QUAL30"),
            # NOTE(review): named SOR3 but the threshold is 3.5 -- confirm
            # which of the two is intended before changing either.
            ("SOR > 3.5", "SOR3"),
            ("FS > 60.0", "FS60"),
            ("MQ < 40.0", "MQ40"),
            ("DP < 10", "DP10"),
            ("MQRankSum < -12.5", "MQRankSum-12.5"),
            ("ReadPosRankSum < -8.0", "ReadPosRankSum-8"),
        ]),
        "INDEL": (".indel.hf.vcf", [
            ("QD < 2.0", "QD2"),
            ("QUAL < 30.0", "QUAL30"),
            ("SOR > 10.0", "SOR10"),
            ("FS > 200.0", "FS200"),
            ("ReadPosRankSum < -20.0", "ReadPosRankSum-20"),
        ]),
    }

    if select_type not in filter_sets:
        print(RED + BOLD + "Choose a correct type to filter" + END_FORMATTING)
        # Stop here: without a filter set there is no command to run, and
        # carrying on would only fail later on an unbound variable.
        raise ValueError(
            "select_type must be 'SNP' or 'INDEL', got {!r}".format(
                select_type))
    extension, filters = filter_sets[select_type]

    input_vcf = os.path.abspath(selected_vcf)
    check_file_exists(input_vcf)

    # Drop the last two dot-separated components (e.g. ".snp.vcf").
    selected_vcf_file_name = (".").join(input_vcf.split(".")[:-2])
    vcf_hard_filtered_output_file = selected_vcf_file_name + extension

    cmd = ["gatk", "VariantFiltration", "--variant", input_vcf]
    for expression, filter_name in filters:
        cmd += ["--filter-expression", expression, "--filter-name", filter_name]
    cmd += ["--output", vcf_hard_filtered_output_file]

    execute_subprocess(cmd)
コード例 #11
0
def annotate_pangolin(input_file,
                      output_folder,
                      output_filename,
                      threads=8,
                      max_ambig=0.6):
    """Run pangolin on *input_file*, writing *output_filename* in *output_folder*."""
    cmd = ["pangolin", input_file]
    cmd.extend(["--outdir", output_folder])
    cmd.extend(["--outfile", output_filename])
    cmd.extend(["--threads", str(threads)])
    cmd.extend(["--max-ambig", str(max_ambig)])
    execute_subprocess(cmd)
コード例 #12
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def _find_gvcf_files(directory, exclude=None):
    """Return every ``*.g.vcf`` path found under *directory*, skipping *exclude*."""
    found = []
    for root, _, files in os.walk(directory):
        for name in files:
            if not name.endswith(".g.vcf"):
                continue
            path = os.path.join(root, name)
            if path != exclude:
                found.append(path)
    return found


def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """Merge per-sample gVCFs into one cohort gVCF with gatk CombineGVCFs.

    Every ``*.g.vcf`` under the ``GVCF`` (or ``GVCF_recal``) output directory
    is passed as ``--variant``. The result is written to
    ``<group>.cohort.g.vcf`` in that same directory, where ``<group>`` is the
    last component of ``args.output``.

    Args:
        args: namespace providing ``output``, ``reference`` and ``memory``.
        recalibrate: read from / write to ``GVCF_recal`` instead of ``GVCF``.
        all_gvcf: optional extra folder whose ``*.g.vcf`` files are added to
            the cohort; falsy values disable it.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz --variant sample2.g.vcf.gz -O cohort.g.vcf.gz
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = output.split("/")[-1]

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")

    gvcf_output_file = group_name + ".cohort.g.vcf"
    gvcf_output_full = os.path.join(gvcf_input_dir, gvcf_output_file)

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs", "--java-options", memory_param, "--reference",
        input_reference, "--output", gvcf_output_full
    ]

    # The cohort file lives in the scanned directory and also ends in
    # ".g.vcf"; exclude it so a re-run does not feed a previous output back
    # in as an input.
    for gvcf in _find_gvcf_files(gvcf_input_dir, exclude=gvcf_output_full):
        cmd += ["--variant", gvcf]

    if all_gvcf:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enricment folder:" + all_gvcf)
            for gvcf in _find_gvcf_files(all_gvcf):
                cmd += ["--variant", gvcf]
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)
コード例 #13
0
def run_snippy_core(input_dir, output_dir, reference, filter_sample=None):
    """Run snippy-core over the sample folders directly inside *input_dir*.

    Args:
        input_dir: directory whose immediate subdirectories are the samples.
        output_dir: directory for the results; ``<output_dir>/core`` is
            passed as the ``-p`` value.
        reference: reference passed as ``--ref``.
        filter_sample: names of subdirectories to leave out (default: none).
    """
    excluded = () if filter_sample is None else filter_sample

    output_dir = output_dir + "/core"

    # Only first-level directories are samples, so take the first os.walk
    # entry instead of traversing the whole tree. The fallback tuple covers a
    # missing or unreadable input_dir, for which os.walk yields nothing.
    _, sample_dirs, _ = next(os.walk(input_dir), (input_dir, [], []))

    samples_snippy = []
    for name in sample_dirs:
        if name in excluded:
            logger.debug(name + " discarded from core FAULTY")
        else:
            samples_snippy.append(os.path.join(input_dir, name))

    cmd = ["snippy-core", "-p", output_dir, "--ref", reference
           ] + samples_snippy

    execute_subprocess(cmd)
コード例 #14
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def call_variants(args, recalibrate=False, group=True):
    """Genotype a gVCF into a raw VCF with gatk GenotypeGVCFs.

    With ``group`` set, ``<group>.cohort.g.vcf`` is read and
    ``<group>.cohort.raw.vcf`` written, where ``<group>`` is the last
    component of ``args.output``. Otherwise the per-sample
    ``<sample>.g.vcf`` / ``<sample>.raw.vcf`` pair is used. ``recalibrate``
    switches both directories to their ``_recal`` variants.

    Side effect: an empty ``args.sample`` is replaced by ``"nosample"``.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V input.g.vcf.gz -O output.vcf.gz
    """
    output = os.path.abspath(args.output)
    reference_path = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    dir_suffix = "_recal" if recalibrate else ""
    gvcf_dir = obtain_output_dir(args, "GVCF" + dir_suffix)
    vcf_dir = obtain_output_dir(args, "VCF" + dir_suffix)

    # Common stem of the input and output file names.
    if group:
        stem = output.rpartition("/")[2] + ".cohort"
    else:
        stem = args.sample

    gvcf_path = os.path.join(gvcf_dir, stem + ".g.vcf")
    vcf_path = os.path.join(vcf_dir, stem + ".raw.vcf")

    check_create_dir(gvcf_dir)
    check_create_dir(vcf_dir)

    cmd = ["gatk", "GenotypeGVCFs"]
    cmd += ["--java-options", "-Xmx" + str(args.memory) + "g"]
    cmd += ["--reference", reference_path]
    cmd += ["--variant", gvcf_path]
    cmd += ["--output", vcf_path]

    execute_subprocess(cmd)
コード例 #15
0
def refseq_masher(r1_file, r2_file, output_file, threads=16, max_results=50):
    """Run ``refseq_masher contains`` on a read pair.

    All three paths are made absolute before the call. Example command:

    refseq_masher contains --top-n-results 50 -p 16 -o HPR3641322-50.contains2.tsv \
     HPR3641322-50_S27_L000_R1_001.fastq.gz HPR3641322-50_S27_L000_R2_001.fastq.gz
    """
    reads = [os.path.abspath(r1_file), os.path.abspath(r2_file)]
    output_file = os.path.abspath(output_file)

    cmd = ["refseq_masher", "contains"]
    cmd += ["--top-n-results", str(max_results)]
    cmd += ["-p", str(threads)]
    cmd += ["-o", output_file]
    cmd += reads

    execute_subprocess(cmd)
コード例 #16
0
def picard_dictionary(args):
    """Create the ``.dict`` sequence dictionary of ``args.reference`` if missing.

    The dictionary path is the reference path with its last extension
    replaced by ``.dict``. Equivalent command line:
    picard CreateSequenceDictionary R=reference.fasta O=reference.dict
    """
    reference_path = os.path.abspath(args.reference)

    # Everything before the final dot of the reference path.
    dict_path = reference_path.rpartition(".")[0] + ".dict"

    if os.path.exists(dict_path):
        logger.info(dict_path + " already EXIST")
        return

    execute_subprocess([
        "picard", "CreateSequenceDictionary", "R=" + reference_path,
        "O=" + dict_path
    ])
コード例 #17
0
def sam_to_index_bam(sample, output_dir, r1, threads):
    """Convert ``<sample>.sam`` to a sorted BAM with read groups.

    Steps, all inside *output_dir*: SAM -> BAM (samtools view), BAM -> sorted
    BAM (samtools sort), sorted BAM -> ``<sample>.rg.sorted.bam`` (add_SG).
    Each intermediate file is removed once the next one has been produced.
    """
    sam_path = os.path.join(output_dir, sample + ".sam")

    # SAM file name without its last extension.
    stem = os.path.basename(sam_path).rpartition(".")[0]

    bam_path = os.path.join(output_dir, stem + ".bam")
    sorted_path = os.path.join(output_dir, stem + ".sorted.bam")
    rg_sorted_path = os.path.join(output_dir, stem + ".rg.sorted.bam")

    view_cmd = ["samtools", "view", "-Sb", sam_path]
    view_cmd += ["--threads", str(threads)]
    view_cmd += ["-o", bam_path]
    execute_subprocess(view_cmd)
    check_remove_file(sam_path)

    execute_subprocess(["samtools", "sort", bam_path, "-o", sorted_path])
    check_remove_file(bam_path)

    add_SG(sample, sorted_path, rg_sorted_path, r1)
    check_remove_file(sorted_path)
    """
コード例 #18
0
ファイル: pe_mapper.py プロジェクト: pedroscampoy/SNPTB
def sam_to_index_bam(args):
    """Convert the sample's SAM into a BAM with read groups.

    The SAM is expected at ``<Bam dir>/<sample>.sam``. It is converted with
    ``samtools view -Sb`` and removed, then add_SG writes
    ``<sample>.rg.sorted.bam`` and the intermediate BAM is removed.
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sam_path = os.path.join(output_dir, sample + ".sam")

    # SAM file name without its last extension.
    stem = os.path.basename(sam_path).rpartition(".")[0]

    bam_path = os.path.join(output_dir, stem + ".bam")
    rg_sorted_path = os.path.join(output_dir, stem + ".rg.sorted.bam")

    check_create_dir(output_dir)

    # sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    # (-o is used rather than redirecting stdout into the file)
    view_cmd = ["samtools", "view", "-Sb", sam_path]
    view_cmd += ["-o", bam_path]
    view_cmd += ["--threads", str(args.threads)]
    execute_subprocess(view_cmd)
    check_remove_file(sam_path)

    add_SG(args, bam_path, rg_sorted_path)
    check_remove_file(bam_path)
    """
コード例 #19
0
def ivar_consensus(input_bam,
                   output_consensus,
                   sample,
                   min_quality=20,
                   min_frequency_threshold=0.8,
                   min_depth=20,
                   uncovered_character='N'):
    """Build a consensus sequence by piping samtools mpileup into ivar consensus.

    Args:
        input_bam: BAM file handed to ``samtools mpileup``.
        output_consensus: output directory; ``<output_consensus>/<sample>``
            is used as the ivar ``-p`` prefix.
        sample: sample name, last part of the prefix.
        min_quality: ivar ``-q`` value.
        min_frequency_threshold: ivar ``-t`` value.
        min_depth: ivar ``-m`` value.
        uncovered_character: ivar ``-n`` value.

    ivar consensus usage, from the tool's help text:
        Usage: samtools mpileup -aa -A -d 0 -Q 0 <input.bam> | ivar consensus -p <prefix>
        Note : samtools mpileup output must be piped into ivar consensus
           -q    Minimum quality score threshold to count base (Default: 20)
           -t    Minimum frequency threshold(0 - 1) to call consensus. (Default: 0)
                 0 = majority base; 0.2 / 0.5 / 0.9 = bases making up at least
                 20% / 50% / 90% of the depth; 1 = identical bases only
           -m    Minimum depth to call consensus (Default: 10)
           -k    Regions below the minimum depth are left out of the consensus;
                 overrides -n
           -n    (N/-) Character to print in regions with less than minimum
                 coverage (Default: N)
           -p    (Required) Prefix for the output fasta file and quality file
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    prefix = output_consensus + '/' + sample

    # The command runs through a shell because of the pipe, so every
    # interpolated value is quoted: a path containing spaces or shell
    # metacharacters would otherwise split or be interpreted.
    params = {
        'input_bam': shlex.quote(str(input_bam)),
        'prefix': shlex.quote(str(prefix)),
        'min_quality': shlex.quote(str(min_quality)),
        'min_frequency_threshold': shlex.quote(str(min_frequency_threshold)),
        'min_depth': shlex.quote(str(min_depth)),
        'uncovered_character': shlex.quote(str(uncovered_character))
    }

    cmd = ("samtools mpileup -aa -A -d 0 -B -Q 0 {input_bam} | "
           "ivar consensus -p {prefix} -q {min_quality} "
           "-t {min_frequency_threshold} -m {min_depth} "
           "-n {uncovered_character}").format(**params)

    execute_subprocess(cmd, isShell=True)
コード例 #20
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def picard_markdup(args):
    """Mark duplicates with picard, then sort and index the result.

    For ``args.input_bam`` named ``<dir>/<sample>.<...>.bam`` this writes
    ``<dir>/<sample>.rg.markdup.bam`` (removed at the end),
    ``<dir>/<sample>.rg.markdup.sorted.bam`` plus its index, and
    ``<Stats dir>/<sample>.markdup.metrics.txt``. The input BAM is removed
    once everything has succeeded.

    Equivalent picard call:
    java -jar picard.jar MarkDuplicates I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt
    """
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)
    in_param = "I=" + input_bam

    # Cut at the first dot of the file NAME only. Splitting the whole path
    # would truncate it at any dot found in a directory name.
    bam_dir, bam_name = os.path.split(input_bam)
    file_name = bam_name.split(".")[0]
    path_file_name = os.path.join(bam_dir, file_name)

    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"
    out_param = "O=" + output_markdup

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_file = file_name + ".markdup.metrics.txt"
    stat_output_full = os.path.join(stat_output_dir, stat_output_file)
    stats_param = "M=" + stat_output_full

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates", in_param, out_param,
        stats_param
    ]
    execute_subprocess(cmd_markdup)

    # samtools sort <markdup.bam> -o <markdup.sorted.bam>
    cmd_sort = [
        "samtools", "sort", output_markdup, "-o", output_markdup_sorted
    ]
    execute_subprocess(cmd_sort)

    # samtools index <markdup.sorted.bam>; output is captured and discarded,
    # and check=True raises CalledProcessError on a non-zero exit.
    subprocess.run(["samtools", "index", output_markdup_sorted],
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE,
                   check=True)
    check_remove_file(input_bam)
    check_remove_file(output_markdup)
コード例 #21
0
def ivar_variants(reference,
                  input_bam,
                  output_variant,
                  sample,
                  annotation,
                  min_quality=20,
                  min_frequency_threshold=0.8,
                  min_depth=20):
    """Call variants by piping samtools mpileup into ivar variants.

    Results are written with the prefix ``<output_variant>/ivar_raw/<sample>``;
    the ``ivar_raw`` folder is created when missing.

    Args:
        reference: reference fasta, used by both mpileup and ivar (``-r``).
        input_bam: BAM file handed to ``samtools mpileup``.
        output_variant: parent directory of the ``ivar_raw`` folder.
        sample: sample name, last part of the output prefix.
        annotation: GFF file passed as ``-g``.
        min_quality: ivar ``-q`` value.
        min_frequency_threshold: ivar ``-t`` value.
        min_depth: ivar ``-m`` value.

    ivar variants usage, from the tool's help text:
        Usage: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file]
        Note : samtools mpileup output must be piped into ivar variants
           -q    Minimum quality score threshold to count base (Default: 20)
           -t    Minimum frequency threshold(0 - 1) to call variants (Default: 0.03)
           -m    Minimum read depth to call variants (Default: 0)
           -r    Reference file used for alignment; used to translate the
                 nucleotide sequences and identify intra host single
                 nucleotide variants
           -g    GFF3 file with ORF coordinates; without it no amino acid
                 translation is done
           -p    (Required) Prefix for the output tsv variant file
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    ivar_folder = os.path.join(output_variant, 'ivar_raw')
    check_create_dir(ivar_folder)
    prefix = ivar_folder + '/' + sample

    # The command runs through a shell because of the pipe, so every
    # interpolated value is quoted: a path containing spaces or shell
    # metacharacters would otherwise split or be interpreted.
    params = {
        'reference': shlex.quote(str(reference)),
        'input_bam': shlex.quote(str(input_bam)),
        'prefix': shlex.quote(str(prefix)),
        'min_quality': shlex.quote(str(min_quality)),
        'min_frequency_threshold': shlex.quote(str(min_frequency_threshold)),
        'min_depth': shlex.quote(str(min_depth)),
        'annotation': shlex.quote(str(annotation))
    }

    cmd = ("samtools mpileup -aa -A -d 0 -B -Q 0 --reference {reference} "
           "{input_bam} | "
           "ivar variants -p {prefix} -q {min_quality} "
           "-t {min_frequency_threshold} -m {min_depth} "
           "-r {reference} -g {annotation}").format(**params)

    execute_subprocess(cmd, isShell=True)
コード例 #22
0
def run_snippy(r1,
               r2,
               reference,
               output_dir,
               sample,
               threads=16,
               minqual=20,
               minfrac=0.1,
               mincov=1):
    """Run snippy on a read pair, writing to ``<output_dir>/<sample>``.

    Example command:
    snippy --cpus 16 --outdir mysnps --ref Listeria.gbk --R1 FDA_R1.fastq.gz --R2 FDA_R2.fastq.gz
    """
    sample_outdir = os.path.join(output_dir, sample)

    cmd = ["snippy"]
    cmd += ["--cpus", str(threads)]
    cmd += ["--outdir", sample_outdir]
    cmd += ["--minqual", str(minqual)]
    cmd += ["--mincov", str(mincov)]
    cmd += ["--minfrac", str(minfrac)]
    cmd += ["--ref", reference]
    cmd += ["--R1", r1, "--R2", r2]

    execute_subprocess(cmd)
コード例 #23
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def split_vcf_saples(vcf_file, sample_list=False, nocall_fr=0.1):
    """Split a multi-sample VCF into one VCF per sample with gatk SelectVariants.

    Each output is written next to *vcf_file* as ``<sample>.<rest>``, where
    ``<rest>`` is the input file name without its first two dot-separated
    components. Samples whose output file already exists are skipped. When
    ``sample_list`` is ``False`` the names come from ``samples_from_vcf``.

    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    https://www.biostars.org/p/224702/
    #TODO: check if argument --exclude-filtered is suitable here. It would save select_pass_variants() step
    """
    # Equality test kept as is: an explicitly passed empty list compares
    # unequal to False and therefore does not trigger the lookup.
    if sample_list == False:
        sample_list = samples_from_vcf(vcf_file)

    target_dir = os.path.dirname(vcf_file)
    base_name = os.path.abspath(vcf_file).rpartition("/")[2]
    # Everything after the second dot of the file name.
    name_tail = ".".join(base_name.split(".", 2)[2:])

    for sample_name in sample_list:
        sample_vcf = os.path.join(target_dir, sample_name + "." + name_tail)
        if os.path.isfile(sample_vcf):
            continue

        cmd = ["gatk", "SelectVariants"]
        cmd += ["--max-nocall-fraction", str(nocall_fr)]
        cmd += ["--variant", vcf_file]
        cmd += ["--sample-name", sample_name]
        cmd += ["--exclude-non-variants"]
        cmd += ["--output", sample_vcf]
        execute_subprocess(cmd)
コード例 #24
0
def ivar_trim(input_bam,
              primers_file,
              sample,
              min_length=30,
              min_quality=20,
              sliding_window_width=4):
    """Trim primers with ``ivar trim``, then sort and index the trimmed BAM.

    For an input named ``<dir>/<name>.<...>.bam`` the final output is
    ``<dir>/<name>.rg.markdup.trimmed.sorted.bam`` plus its index. The input
    BAM, its ``.bai`` and the unsorted trimmed BAM are removed along the way.

    Args:
        input_bam: sorted BAM with aligned reads.
        primers_file: BED file with primer positions.
        sample: unused here; kept for interface compatibility.
        min_length: ivar ``-m`` value.
        min_quality: ivar ``-q`` value.
        sliding_window_width: ivar ``-s`` value.

    ivar trim usage, from the tool's help text:
        Usage: ivar trim -i <input.bam> -b <primers.bed> -p <prefix> [-m <min-length>] [-q <min-quality>] [-s <sliding-window-width>]
           -i    (Required) Sorted bam file, with aligned reads, to trim primers and quality
           -b    (Required) BED file with primer sequences and positions
           -m    Minimum length of read to retain after trimming (Default: 30)
           -q    Minimum quality threshold for sliding window to pass (Default: 20)
           -s    Width of sliding window (Default: 4)
           -e    Include reads with no primers. By default, reads with no primers are excluded
           -p    (Required) Prefix for the output BAM file
    """
    input_bam = os.path.abspath(input_bam)
    input_bai = input_bam + ".bai"
    primers_file = os.path.abspath(primers_file)

    # Cut at the first dot of the file NAME only. Splitting the whole path
    # would truncate it at any dot found in a directory name.
    bam_dir, bam_name = os.path.split(input_bam)
    base_path = os.path.join(bam_dir, bam_name.split('.')[0])

    prefix = base_path + ".rg.markdup.trimmed"
    output_trimmed_bam = prefix + ".bam"
    output_trimmed_sorted_bam = base_path + ".rg.markdup.trimmed.sorted.bam"

    cmd = [
        "ivar", "trim", "-i", input_bam, "-b", primers_file, "-p", prefix,
        "-m",
        str(min_length), "-q",
        str(min_quality), "-s",
        str(sliding_window_width), "-e"
    ]
    execute_subprocess(cmd)

    check_remove_file(input_bam)

    cmd_sort = [
        "samtools", "sort", output_trimmed_bam, "-o", output_trimmed_sorted_bam
    ]
    execute_subprocess(cmd_sort)

    check_remove_file(output_trimmed_bam)

    cmd_index = ["samtools", "index", output_trimmed_sorted_bam]
    execute_subprocess(cmd_index)

    # The old index belongs to the input BAM that was removed above.
    check_remove_file(input_bai)
コード例 #25
0
def fastqc_quality(r1, r2, output_dir, threads=8):
    """
    Run FastQC on a pair of read files.

    Parameters:
        r1: first reads file handed to ``fastqc``.
        r2: second reads file handed to ``fastqc``.
        output_dir: directory passed to ``-o``; ``check_create_dir`` is
            called on it before FastQC runs.
        threads: value passed to ``--threads`` (default 8).
    """
    check_create_dir(output_dir)

    fastqc_cmd = ['fastqc', r1, r2]
    fastqc_cmd += ['-o', output_dir]
    fastqc_cmd += ['--threads', str(threads)]

    execute_subprocess(fastqc_cmd)
コード例 #26
0
def create_coverage(input_bam, output_dir, sample):
    """
    Write the output of ``samtools depth -aa`` for a BAM to
    ``<output_dir>/<sample>.cov``.

    The command runs through the shell because stdout is redirected with
    ``>``; both paths are shell-quoted so whitespace or shell metacharacters
    in them cannot break or alter the command.

    Parameters:
        input_bam: path of the BAM file to analyse.
        output_dir: directory receiving the ``.cov`` file.
        sample: sample name used as the output file stem.
    """
    # Local import so the block does not rely on a module-level import
    # that may be missing from the file.
    import shlex

    output_file = os.path.join(output_dir, sample + ".cov")
    cmd = "samtools depth -aa {} > {}".format(shlex.quote(str(input_bam)),
                                              shlex.quote(str(output_file)))
    execute_subprocess(cmd, isShell=True)
コード例 #27
0
def create_bamstat(input_bam, output_dir, sample, threads=8):
    """
    Write the output of ``samtools flagstat`` for a BAM to
    ``<output_dir>/<sample>.bamstats``.

    The command runs through the shell because stdout is redirected with
    ``>``; every interpolated value is shell-quoted so whitespace or shell
    metacharacters cannot break or alter the command.

    Parameters:
        input_bam: path of the BAM file to analyse.
        output_dir: directory receiving the ``.bamstats`` file.
        sample: sample name used as the output file stem.
        threads: value passed to ``--threads`` (default 8).
    """
    # Local import so the block does not rely on a module-level import
    # that may be missing from the file.
    import shlex

    output_file = os.path.join(output_dir, sample + ".bamstats")
    cmd = "samtools flagstat --threads {} {} > {}".format(
        shlex.quote(str(threads)), shlex.quote(str(input_bam)),
        shlex.quote(str(output_file)))
    execute_subprocess(cmd, isShell=True)
コード例 #28
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def haplotype_caller(args,
                     recalibrate=False,
                     ploidy=2,
                     bamout=False,
                     forceactive=False,
                     intervals=False):
    """
    Run GATK HaplotypeCaller in GVCF mode for the sample described by ``args``.

    https://software.broadinstitute.org/gatk/documentation/article?id=11081

    Parameters:
        args: object providing ``reference``, ``sample`` and ``memory``; it
            is also handed to ``obtain_output_dir`` to resolve folders.
        recalibrate: when true, call on ``<sample>.rg.markdup.sorted.bam``
            and write the GVCF into the "GVCF_recal" folder; otherwise call
            on ``<sample>.bqsr.bam`` and write into "GVCF".
        ploidy: value passed to ``--sample-ploidy``.
        bamout: when true, also request ``--bam-output`` into the "Bamout"
            folder as ``<sample>.p<ploidy>.out.bam``.
        forceactive: when true, add ``--force-active`` and
            ``--disable-optimizations``.
        intervals: not used by this function.
    """
    reference_path = os.path.abspath(args.reference)

    bam_dir = obtain_output_dir(args, "Bam")
    sample_id = args.sample

    # The input BAM and the destination folder depend on the calling round.
    if recalibrate:
        bam_suffix, gvcf_folder = ".rg.markdup.sorted.bam", "GVCF_recal"
    else:
        bam_suffix, gvcf_folder = ".bqsr.bam", "GVCF"

    bam_name = sample_id + bam_suffix
    gvcf_dir = obtain_output_dir(args, gvcf_folder)
    gvcf_name = sample_id + ".g.vcf"

    check_create_dir(gvcf_dir)

    bam_to_call = os.path.join(bam_dir, bam_name)
    gvcf_path = os.path.join(gvcf_dir, gvcf_name)

    java_memory = "-Xmx{}g".format(args.memory)

    command = ["gatk", "HaplotypeCaller"]
    command += ["--java-options", java_memory]
    command += ["--reference", reference_path]
    command += ["--input", bam_to_call]
    command += ["--output", gvcf_path]
    command += ["--emit-ref-confidence", "GVCF"]
    command += ["--annotation-group", "AS_StandardAnnotation"]
    command += ["--sample-ploidy", str(ploidy)]

    if bamout:
        bamout_dir = obtain_output_dir(args, "Bamout")
        bamout_path = os.path.join(
            bamout_dir, "{}.p{}.out.bam".format(sample_id, ploidy))
        check_create_dir(bamout_dir)
        command += ["--bam-output", bamout_path]

    if forceactive:
        command += ["--force-active", "--disable-optimizations"]

    execute_subprocess(command)
    """
コード例 #29
0
ファイル: bam_recall.py プロジェクト: pedroscampoy/SNPTB
def recalibrate_bam(args, tb=False):
    """
    Recalibrate base qualities of ``<sample>.rg.markdup.sorted.bam`` with
    GATK BaseRecalibrator followed by ApplyBQSR.

    BaseRecalibrator
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    #Recalibrate bam:
    gatk BaseRecalibrator --input my_reads.bam --reference reference.fasta --known-sites sites_of_variation.vcf \
    --known-sites another/optional/setOfSitesToMask.vcf --output recal_data.table
    ApplyBQSR
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php
    gatk ApplyBQSR --reference reference.fasta --input input.bam --bqsr-recal-file recalibration.table --output output.bam

    Parameters:
        args: object providing ``reference``, ``sample`` and ``memory``; it
            is also handed to ``obtain_output_dir`` to resolve folders.
        tb: request the bundled M. tuberculosis known-sites table. It is
            enabled automatically when the absolute reference path contains
            "NC_000962.3", "h37rv" (any case) or "ancestor".

    Raises:
        ValueError: ``tb`` is true but the reference path matches none of the
            recognised patterns, so no bundled table can be chosen.

    Every ``*.hf.pass.vcf`` found under the "VCF_recal" folder is added as an
    extra ``--known-sites``. The recalibration table is written there as
    ``<sample>.recall.table`` and the recalibrated BAM goes to the "Bam"
    folder as ``<sample>.bqsr.bam``.
    """
    input_reference = os.path.abspath(args.reference)

    # Automate M. tuberculosis reference for additional recalibration
    # positions. The table is resolved before anything else so that an
    # unusable tb=True request fails early with a clear message instead of a
    # NameError on an unassigned variable further down.
    is_h37rv = ("NC_000962.3" in input_reference) or (
        "h37rv" in input_reference.lower())
    is_ancestor = "ancestor" in input_reference

    reference_file = None
    if is_h37rv or is_ancestor:
        script_dir = os.path.dirname(os.path.realpath(__file__))
        reference_dir = os.path.join(script_dir, "reference")
        # The H37Rv patterns take precedence over "ancestor".
        if is_h37rv:
            table_name = "190508_ddtb.NC_000962.3.BQSR.table"
        else:
            table_name = "190508_ddtb.BQSR.table"
        reference_file = os.path.join(reference_dir, table_name)
    elif tb:
        raise ValueError(
            "tb=True was requested but the reference '{}' does not match "
            "NC_000962.3, h37rv or ancestor; no bundled known-sites table "
            "can be selected".format(input_reference))

    sample_name = args.sample
    bam_input_dir = obtain_output_dir(args, "Bam")
    vcf_input_dir = obtain_output_dir(args, "VCF_recal")

    bam_input_file_name = sample_name + ".rg.markdup.sorted.bam"
    bam_input_file = os.path.join(bam_input_dir, bam_input_file_name)

    table_output_file_name = sample_name + ".recall.table"
    table_output_file = os.path.join(vcf_input_dir, table_output_file_name)

    memory_param = "-Xmx" + str(args.memory) + "g"

    #BaseRecalibrator

    cmd_bqsr = [
        "gatk", "BaseRecalibrator", "--java-options", memory_param,
        "--reference", input_reference, "--input", bam_input_file, "--output",
        table_output_file
    ]

    if reference_file is not None:
        cmd_bqsr.extend(["--known-sites", reference_file])

    # Hard-filtered passing variants found under VCF_recal are used as
    # additional known sites.
    for root, _, files in os.walk(vcf_input_dir):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".hf.pass.vcf"):
                cmd_bqsr.extend(["--known-sites", filename])

    execute_subprocess(cmd_bqsr)

    #ApplyBQSR

    bam_output_file_name = sample_name + ".bqsr.bam"
    bam_output_file = os.path.join(bam_input_dir, bam_output_file_name)

    cmd_apply = [
        "gatk", "ApplyBQSR", "--reference", input_reference, "--input",
        bam_input_file, "--bqsr-recal-file", table_output_file, "--output",
        bam_output_file
    ]

    execute_subprocess(cmd_apply)