Example #1
def bbduk_trimming(args):
    """
    TODO: handle params
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    output_dir = obtain_output_dir(args, "Trimmed")

    in1_param = "in1=" + r1
    in2_param = "in2=" + r2

    sample = extract_sample(r1, r2)

    out1_param = "out1=" + output_dir + "/" + sample + "_R1.clean.fastq.gz"
    out2_param = "out2=" + output_dir + "/" + sample + "_R2.clean.fastq.gz"

    stats_param = "stats=" + output_dir + "/" + sample + "_trim.stats"

    adapter_path = "ref=" + get_bbduk_adapters()

    memory_param = "-Xmx" + str(args.memory) + "g"
    threads_param = "threads=" + str(args.threads)

    check_create_dir(output_dir)

    #bbduk.sh
    cmd = [
        "bbduk.sh", memory_param, in1_param, in2_param, out1_param, out2_param,
        adapter_path, "trimq=15", "qtrim=rl", "minlen=40", "ktrim=r", "k=21",
        "mink=11", "hammingdistance=2", threads_param, "tpe", "tbo",
        stats_param
    ]

    execute_subprocess(cmd)
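
Every example below calls the same small helpers (check_create_dir, obtain_output_dir, extract_sample, execute_subprocess), which are defined elsewhere in the repository. A minimal sketch of what they are assumed to do, inferred from the call sites (the real bodies may differ):

import os
import re
import subprocess

def check_create_dir(path):
    # Create the directory (and parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)

def obtain_output_dir(args, subfolder):
    # Resolve a named subfolder inside the main output directory.
    return os.path.join(os.path.abspath(args.output), subfolder)

def extract_sample(r1, r2):
    # Derive the sample name from the R1 file name,
    # e.g. "sample_R1.fastq.gz" -> "sample".
    return re.split(r"_R1", os.path.basename(r1))[0]

def execute_subprocess(cmd, isShell=False):
    # Thin wrapper around subprocess.run that reports failures.
    result = subprocess.run(cmd, shell=isShell, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, universal_newlines=True)
    if result.returncode != 0:
        print("Command failed: %s\n%s" % (cmd, result.stderr))
    return result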
Example #2
def bwa_mapping(args):
    """
    #Store the output in a file when the tool writes it to stdout:
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    cmd_index = ["bwa", "index", reference]
    execute_subprocess(cmd_index)

    cmd_map = [
        "bwa", "mem", "-t",
        str(args.threads), "-o", output_file, reference, r1, r2
    ]
    execute_subprocess(cmd_map)
    """
Example #3
def bowtie2_mapping(args):
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    #bowtie2 index
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    #bowtie map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p",
        str(args.threads), "-x", reference
    ]
    # Append "-a" only when requested; an empty string in the argument list
    # would be passed to bowtie2 verbatim and make it fail.
    if args.extensive_mapping:
        cmd_map.append("-a")
    execute_subprocess(cmd_map)
Example #4
def mash_screen(r1_file,
                out_dir,
                r2_file=False,
                winner=True,
                threads=16,
                mash_database="/home/laura/DATABASES/Mash/bacteria_mash.msh"):
    # https://mash.readthedocs.io/en/latest/index.html
    # https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh #MASH refseq database
    # mash screen -w -p 4 ../refseq.genomes.k21s1000.msh 4_R1.fastq.gz 4_R2.fastq.gz > 4.winner.screen.tab
    # identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment

    if not os.path.isfile(mash_database):
        logger.info(RED + BOLD + "Mash database can't be found\n" +
                    END_FORMATTING + "You can download it by typing:\n"
                    "wget https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh")
        sys.exit(1)

    r1_file = os.path.abspath(r1_file)

    sample = extract_sample(r1_file, r2_file)

    check_create_dir(out_dir)
    species_output_name = sample + ".screen.tab"
    species_output_file = os.path.join(out_dir, species_output_name)

    cmd = ["mash", "screen", "-p", str(threads), mash_database, r1_file]

    if winner:
        cmd.insert(2, "-w")
    # Use both r1 and r2 instead of just r1 (r1 alone is faster)
    if r2_file:
        r2_file = os.path.abspath(r2_file)
        cmd.append(r2_file)

    prog = cmd[0]
    param = cmd[1:]

    try:
        # execute_subprocess(cmd)
        with open(species_output_file, "w+") as outfile:
            # calculate mash distance and save it in output file
            command = subprocess.run(cmd,
                                     stdout=outfile,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
        if command.returncode == 0:
            logger.info(GREEN + "Program %s successfully executed" % prog +
                        END_FORMATTING)
        else:
            logger.info(RED + BOLD + "Command %s FAILED\n" % prog + END_FORMATTING +
                        BOLD + "WITH PARAMETERS: " + END_FORMATTING +
                        " ".join(param) + "\n" + BOLD +
                        "EXIT-CODE: %d\n" % command.returncode + "ERROR:\n" +
                        END_FORMATTING + command.stderr)
    except OSError as e:
        sys.exit(RED + BOLD + "failed to execute program '%s': %s" %
                 (prog, str(e)) + END_FORMATTING)
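
The output columns listed at the top of the function make it easy to rank hits afterwards. A small illustrative helper (not part of the source) that extracts the best hit from the resulting .screen.tab:

def best_mash_hit(screen_tab):
    # Columns: identity, shared-hashes, median-multiplicity,
    #          p-value, query-ID, query-comment
    best = None
    with open(screen_tab) as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            identity = float(fields[0])
            if best is None or identity > best[0]:
                best = (identity, fields[4])
    return best  # (identity, query-ID), or None for an empty file

# best_mash_hit("Species/sample.screen.tab")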
Example #5
def combine_gvcf(args, recalibrate=False, all_gvcf=False):
    """
    https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_CombineGVCFs.php
    #combined multi-sample gVCF:
    gatk CombineGVCFs -R reference.fasta --variant sample1.g.vcf.gz --variant sample2.g.vcf.gz -O cohort.g.vcf.gz
    """
    output = os.path.abspath(args.output)
    input_reference = os.path.abspath(args.reference)

    group_name = output.split("/")[-1]  #group_name

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")

    gvcf_output_file = group_name + ".cohort.g.vcf"
    gvcf_output_full = os.path.join(gvcf_input_dir, gvcf_output_file)

    check_create_dir(gvcf_input_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "CombineGVCFs", "--java-options", memory_param, "--reference",
        input_reference, "--output", gvcf_output_full
    ]

    for root, _, files in os.walk(gvcf_input_dir):
        for name in files:
            filename = os.path.join(root, name)
            if filename.endswith(".g.vcf"):
                cmd.append("--variant")
                cmd.append(filename)
    if all_gvcf:
        if os.path.isdir(all_gvcf):
            all_gvcf = os.path.abspath(all_gvcf)
            print("Using gvcf from enrichment folder: " + all_gvcf)
            for root, _, files in os.walk(all_gvcf):
                for name in files:
                    filename = os.path.join(root, name)
                    if filename.endswith(".g.vcf"):
                        cmd.append("--variant")
                        cmd.append(filename)
        else:
            print("GVCF enrichment folder does not exist")

    execute_subprocess(cmd)
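
Note that the combined output file also lands in gvcf_input_dir and ends in ".g.vcf", so on a rerun the os.walk above would pick the previous cohort file back up as an extra --variant. A guard like this (illustrative) avoids that:

for root, _, files in os.walk(gvcf_input_dir):
    for name in files:
        filename = os.path.join(root, name)
        # Skip any previously combined cohort output.
        if filename.endswith(".g.vcf") and not name.endswith(".cohort.g.vcf"):
            cmd.append("--variant")
            cmd.append(filename)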
Example #6
def call_variants(args, recalibrate=False, group=True):
    """
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php
    #Call variants:
    gatk --java-options "-Xmx4g" GenotypeGVCFs -R Homo_sapiens_assembly38.fasta -V input.g.vcf.gz -O output.vcf.gz
    """
    output = os.path.abspath(args.output)

    input_reference = os.path.abspath(args.reference)

    if not args.sample:
        args.sample = "nosample"

    file_name = args.sample  #sample_name
    group_name = output.split("/")[-1]  #group_name

    if recalibrate:
        gvcf_input_dir = obtain_output_dir(args, "GVCF_recal")
        vcf_output_dir = obtain_output_dir(args, "VCF_recal")
    else:
        gvcf_input_dir = obtain_output_dir(args, "GVCF")
        vcf_output_dir = obtain_output_dir(args, "VCF")

    if group:
        gvcf_input_file = group_name + ".cohort.g.vcf"
        vcf_output_file = group_name + ".cohort.raw.vcf"
    else:
        gvcf_input_file = file_name + ".g.vcf"
        vcf_output_file = file_name + ".raw.vcf"

    gvcf_input_full = os.path.join(gvcf_input_dir, gvcf_input_file)
    vcf_output_full = os.path.join(vcf_output_dir, vcf_output_file)

    check_create_dir(gvcf_input_dir)
    check_create_dir(vcf_output_dir)

    memory_param = "-Xmx" + str(args.memory) + "g"

    cmd = [
        "gatk", "GenotypeGVCFs", "--java-options", memory_param, "--reference",
        input_reference, "--variant", gvcf_input_full, "--output",
        vcf_output_full
    ]

    execute_subprocess(cmd)
Example #7
def sam_to_index_bam(args):
    # input_sam_path = os.path.abspath(input_sam)
    # if output_bam == "inputdir":
    #     output_bam = os.path.dirname(input_sam_path)
    # else:
    #     output_bam = output_bam

    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    input_sam_path = os.path.join(output_dir, sample_name)

    input_name = (".").join(os.path.basename(input_sam_path).split(".")[:-1])

    output_bam_name = input_name + ".bam"
    output_bam_path = os.path.join(output_dir, output_bam_name)

    output_bg_sorted_name = input_name + ".rg.sorted.bam"
    output_bg_sorted_path = os.path.join(output_dir, output_bg_sorted_name)

    check_create_dir(output_dir)
    """
    #sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    with open(output_bam_path, "w") as outfile:
        #map reads and save it in th eoutput file
        subprocess.run(["samtools", "view", "-Sb", input_sam_path], 
        stdout=outfile, stderr=subprocess.PIPE, check=True, universal_newlines=True)
    """
    cmd = [
        "samtools", "view", "-Sb", input_sam_path, "-o", output_bam_path,
        "--threads",
        str(args.threads)
    ]
    execute_subprocess(cmd)

    check_remove_file(input_sam_path)

    add_SG(args, output_bam_path, output_bg_sorted_path)

    check_remove_file(output_bam_path)
    """
Example #8
def picard_markdup(args):
    #java -jar picard.jar MarkDuplicates \
    #  I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt
    picard_jar = get_picard_path()

    input_bam = os.path.abspath(args.input_bam)
    in_param = "I=" + input_bam

    path_file_name = input_bam.split(".")[0]
    file_name = path_file_name.split("/")[-1]
    output_markdup = path_file_name + ".rg.markdup.bam"
    output_markdup_sorted = path_file_name + ".rg.markdup.sorted.bam"
    out_param = "O=" + output_markdup

    stat_output_dir = obtain_output_dir(args, "Stats")
    stat_output_file = file_name + ".markdup.metrics.txt"
    stat_output_full = os.path.join(stat_output_dir, stat_output_file)
    stats_param = "M=" + stat_output_full

    check_create_dir(stat_output_dir)

    cmd_markdup = [
        "java", "-jar", picard_jar, "MarkDuplicates", in_param, out_param,
        stats_param
    ]
    execute_subprocess(cmd_markdup)

    #samtools sort: samtools sort $output_dir/$sample".sorted.bam" -o $output_dir/$sample".sorted.bam"
    cmd_sort = [
        "samtools", "sort", output_markdup, "-o", output_markdup_sorted
    ]
    execute_subprocess(cmd_sort)

    #Handled in Haplotype Caller function
    #samtools index: samtools index $output_dir/$sample".sorted.bam"
    subprocess.run(["samtools", "index", output_markdup_sorted],
                   stdout=subprocess.PIPE,
                   stderr=subprocess.PIPE,
                   check=True)
    check_remove_file(input_bam)
    check_remove_file(output_markdup)
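
Recent Picard releases accept both the legacy I=/O=/M= style used here and the newer -I/-O/-M style; the same MarkDuplicates call in the newer form would be (equivalent, shown for reference):

cmd_markdup = [
    "java", "-jar", picard_jar, "MarkDuplicates",
    "-I", input_bam,
    "-O", output_markdup,
    "-M", stat_output_full
]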
Example #9
def fastp_trimming(r1,
                   r2,
                   sample,
                   output_dir,
                   threads=6,
                   min_qual=20,
                   window_size=10,
                   min_len=35):
    check_create_dir(output_dir)

    output_trimmed_r1 = os.path.join(output_dir,
                                     sample + ".trimmed_R1.fastq.gz")
    output_trimmed_r2 = os.path.join(output_dir,
                                     sample + ".trimmed_R2.fastq.gz")

    html_dir = os.path.join(output_dir, 'html')
    json_dir = os.path.join(output_dir, 'json')

    check_create_dir(html_dir)
    check_create_dir(json_dir)

    html_file = os.path.join(html_dir, sample + '_fastp.html')
    json_file = os.path.join(json_dir, sample + '_fastp.json')

    cmd = [
        'fastp', '--in1', r1, '--in2', r2, '--out1', output_trimmed_r1,
        '--out2', output_trimmed_r2, '--detect_adapter_for_pe', '--cut_tail',
        '--cut_window_size',
        str(window_size), '--cut_mean_quality',
        str(min_qual), '--length_required',
        str(min_len), '--json', json_file, '--html', html_file, '--thread',
        str(threads)
    ]

    execute_subprocess(cmd)
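
A typical call matching the defaults above (paths are illustrative):

fastp_trimming("raw/sample_R1.fastq.gz",
               "raw/sample_R2.fastq.gz",
               "sample",
               "output/Trimmed",
               threads=6,
               min_qual=20,
               window_size=10,
               min_len=35)
# Writes output/Trimmed/sample.trimmed_R1.fastq.gz (and _R2), plus
# per-sample reports under output/Trimmed/html and output/Trimmed/json.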
Example #10
def ivar_variants(reference,
                  input_bam,
                  output_variant,
                  sample,
                  annotation,
                  min_quality=20,
                  min_frequency_threshold=0.8,
                  min_depth=20):
    """
    Usage: samtools mpileup -aa -A -d 0 -B -Q 0 --reference [<reference-fasta>] <input.bam> | ivar variants -p <prefix> [-q <min-quality>] [-t <min-frequency-threshold>] [-m <minimum depth>] [-r <reference-fasta>] [-g GFF file]
        Note : samtools mpileup output must be piped into ivar variants
        Input Options    Description
           -q    Minimum quality score threshold to count base (Default: 20)
           -t    Minimum frequency threshold(0 - 1) to call variants (Default: 0.03)
           -m    Minimum read depth to call variants (Default: 0)
           -r    Reference file used for alignment. This is used to translate the nucleotide sequences and identify intra host single nucleotide variants
           -g    A GFF file in the GFF3 format can be supplied to specify coordinates of open reading frames (ORFs). In absence of GFF file, amino acid translation will not be done.
        Output Options   Description
           -p    (Required) Prefix for the output tsv variant file
    """
    ivar_folder = os.path.join(output_variant, 'ivar_raw')
    check_create_dir(ivar_folder)
    prefix = ivar_folder + '/' + sample

    # "params" avoids shadowing the built-in input()
    params = {
        'reference': reference,
        'input_bam': input_bam,
        'prefix': prefix,
        'min_quality': str(min_quality),
        'min_frequency_threshold': str(min_frequency_threshold),
        'min_depth': str(min_depth),
        'annotation': annotation
    }

    cmd = "samtools mpileup -aa -A -d 0 -B -Q 0 --reference {reference} {input_bam} | \
        ivar variants -p {prefix} -q {min_quality} -t {min_frequency_threshold} -m {min_depth} -r {reference} -g {annotation}".format(
        **params)

    execute_subprocess(cmd, isShell=True)
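
The pipe between samtools mpileup and ivar variants is why cmd is a single string executed with isShell=True: subprocess only honours | when a shell interprets the command line. The shell branch of execute_subprocess is assumed to boil down to something like:

# Illustrative: a pipeline string needs shell interpretation.
result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE, universal_newlines=True)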
Example #11
def replace_reference(input_vcf, output, ref_old=False, ref_new="Chromosome"):
    """
    This function replace all instances of a reference in a vcf file
    Depends on extract_reference_vcf
    190909 - Function now uses chromosome name in file and replaces it with term provided (default "Chromosome")
    """
    input_file = os.path.abspath(input_vcf)
    output_file = os.path.abspath(output)
    output_dir = os.path.dirname(output)

    check_create_dir(output_dir)

    if not ref_old:
        ref_old = extract_reference_vcf(input_file)

    ref = ref_old + "\t"
    new = ref_new + "\t"

    with open(input_file, 'r') as fi, open(output_file, 'w') as fo:
        for line in fi:
            fo.write(line.replace(ref, new))
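
Example usage (paths illustrative): rewrite whatever reference name the input VCF carries to the default "Chromosome" tag:

replace_reference("Variants/sample/snps.vcf",
                  "Variants/sample/snps.chromosome.vcf")
# Every "<old reference>\t" occurrence is rewritten as "Chromosome\t",
# which covers the CHROM column of each variant line.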
Example #12
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    #annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD +
                "STARTING PIPELINE IN GROUP: " + group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(sample_list_F), ",".join(sample_list_F)))
    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))
    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    #script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_variant_dir = os.path.join(output, "Variants")
    out_core_dir = os.path.join(output, "Core")

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(
        out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(
        out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder
    out_annot_blast_dir = os.path.join(out_annot_dir, "blast")  # subfolder

    out_species_dir = os.path.join(output, "Species")
    new_sample_number = 0
    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:
            # VARIANT SAMPLE DIR
            sample_variant_dir = os.path.join(out_variant_dir, sample)

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))
            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" + " (" + new_sample_number + "/" + new_sample_total + ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" + END_FORMATTING)

            output_final_vcf = os.path.join(
                sample_variant_dir, 'snps.all.ivar.tsv')

            if not os.path.isfile(output_final_vcf):

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                # check_file_exists(r1_file)
                # check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(r1_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(r2_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Checking quality in sample " + sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file,
                                   out_qc_pre_dir, args.threads)

                """
                TODO: Human filter
                """

                # VARIANT CALLING WITH SNIPPY
                ###################################################

                output_vcf_sub = os.path.join(
                    sample_variant_dir, "snps.subs.vcf")
                output_vcf = os.path.join(sample_variant_dir, "snps.vcf")

                if os.path.isfile(output_vcf_sub) and os.path.isfile(output_vcf):
                    logger.info(YELLOW + DIM + output_vcf +
                                " EXISTS\nOmitting variant calling in " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Calling variants with snippy " + sample + END_FORMATTING)
                    run_snippy(r1_file, r2_file, reference, out_variant_dir, sample,
                               threads=args.threads, minqual=10, minfrac=0.1, mincov=1)
                    old_bam = os.path.join(sample_variant_dir, "snps.bam")
                    old_bai = os.path.join(sample_variant_dir, "snps.bam.bai")
                    new_bam = os.path.join(sample_variant_dir, sample + ".bam")
                    new_bai = os.path.join(
                        sample_variant_dir, sample + ".bam.bai")
                    os.rename(old_bam, new_bam)
                    os.rename(old_bai, new_bai)

                #VARIANT FORMAT COMBINATION (REMOVE COMPLEX) ########
                #####################################################
                out_variant_indel_sample = os.path.join(
                    sample_variant_dir, "snps.indel.vcf")
                out_variant_all_sample = os.path.join(
                    sample_variant_dir, "snps.all.vcf")

                if os.path.isfile(out_variant_indel_sample):
                    logger.info(YELLOW + DIM + out_variant_indel_sample +
                                " EXISTS\nOmitting indel filtering in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Filtering INDELS in " +
                                sample + END_FORMATTING)
                    extract_indels(output_vcf)

                if os.path.isfile(out_variant_all_sample):
                    logger.info(YELLOW + DIM + out_variant_all_sample +
                                " EXISTS\nOmitting vcf combination in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Combining vcf in " +
                                sample + END_FORMATTING)
                    merge_vcf(output_vcf_sub, out_variant_indel_sample)

                #VARIANT FORMAT ADAPTATION TO IVAR ##################
                #####################################################
                out_variant_tsv_file = os.path.join(
                    sample_variant_dir, 'snps.all.ivar.tsv')

                if os.path.isfile(out_variant_tsv_file):
                    logger.info(YELLOW + DIM + out_variant_tsv_file +
                                " EXISTS\nOmitting format adaptation for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Adapting variants format in sample " + sample + END_FORMATTING)
                    prior = datetime.datetime.now()
                    vcf_to_ivar_tsv(out_variant_all_sample,
                                    out_variant_tsv_file)
                    after = datetime.datetime.now()
                    print(("Done with function in: %s" % (after - prior)))

            # SPECIES DETERMINATION
            ###################################################
            check_create_dir(out_species_dir)

            output_species = os.path.join(
                out_species_dir, sample + ".screen.tab")

            if os.path.isfile(output_species):
                logger.info(YELLOW + DIM + output_species +
                            " EXISTS\nOmitting species determination in " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Determining species in " + sample + END_FORMATTING)
                mash_screen(r1_file, out_species_dir, r2_file=r2_file, winner=True, threads=args.threads,
                            mash_database=args.mash_database)

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(
                out_stats_bamstats_dir, out_bamstats_name)
            bam_sample_file = os.path.join(sample_variant_dir, sample + ".bam")

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " +
                            sample + END_FORMATTING)
                create_bamstat(
                    bam_sample_file, out_stats_bamstats_dir, sample, threads=args.threads)

            #CREATE Coverage######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(
                out_stats_coverage_dir, out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting coverage for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " +
                            sample + END_FORMATTING)
                create_coverage(bam_sample_file,
                                out_stats_coverage_dir, sample)

    # COVERAGE OUTPUT SUMMARY
    ######################################################
    prior_recal = datetime.datetime.now()
    logger.info(GREEN + "Creating summary report for coverage result in group " +
                group_name + END_FORMATTING)
    obtain_group_cov_stats(out_stats_dir, group_name)
    after_recal = datetime.datetime.now()
    logger.info("Done with report for coverage: %s" %
                (after_recal - prior_recal))

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report in group " +
                group_name + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples in group " +
                group_name + END_FORMATTING)
    uncovered_samples = remove_low_quality(
        output, min_coverage=args.coverage20, min_hq_snp=args.min_snp, type_remove='Uncovered')

    if len(uncovered_samples) > 0:
        logger.info(GREEN + "Uncovered samples: " +
                    (",").join(uncovered_samples) + END_FORMATTING)
    else:
        logger.info(GREEN + "NO uncovered samples found" + END_FORMATTING)

    # RUN SNIPPY CORE
    ##############################################################################################################################
    if args.core:
        check_create_dir(out_core_dir)
        logger.info(GREEN + "Running snippy-core " +
                    group_name + END_FORMATTING)
        run_snippy_core(out_variant_dir, out_core_dir, reference)

        logger.info(GREEN + "Adapting core-snp to compare format " +
                    group_name + END_FORMATTING)
        core_vcf_file = os.path.join(out_core_dir, "core.vcf")
        core_vcf_file_adapted = os.path.join(
            out_core_dir, "core.vcf.adapted.tsv")
        core_vcf_file_removed = os.path.join(
            out_core_dir, "core.vcf.adapted.final.tsv")

        core_vcf_df_adapted = import_VCF4_core_to_compare(core_vcf_file)
        core_vcf_df_adapted.to_csv(
            core_vcf_file_adapted, sep="\t", index=False)

        logger.info(GREEN + "Obtaining clustered positions " +
                    group_name + END_FORMATTING)

        close_positions_list = extract_close_snps(
            core_vcf_df_adapted, snps_in_10=1)
        logger.info(GREEN + "Obtaining uncovered positions " +
                    group_name + END_FORMATTING)
        uncovered_list = identify_uncovered(
            out_stats_coverage_dir, min_coverage=10, nocall_fr=0.5)

        logger.debug('Clustered positions in core SNP:\n{}'.format(
            (",".join([str(x) for x in close_positions_list]))))
        logger.debug('Uncovered positions in all samples:\n{}'.format(
            (",".join([str(x) for x in uncovered_list]))))

        to_remove_list = close_positions_list + uncovered_list

        remove_df = remove_position_from_compare(
            core_vcf_df_adapted, to_remove_list)
        remove_df.to_csv(core_vcf_file_removed, sep="\t", index=False)

        ddtb_compare(core_vcf_file_removed, distance=10)

    #ANNOTATION WITH SNPEFF AND USER INPUT ##############
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    # SNPEFF
    if args.snpeff_database:
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.vcf':
                    sample = root.split('/')[-1]
                    filename = os.path.join(root, name)
                    chrom_filename = os.path.join(
                        root, 'snps.all.chromosome.vcf')
                    out_annot_file = os.path.join(
                        out_annot_snpeff_dir, sample + ".annot")
                    if os.path.isfile(out_annot_file):
                        logger.info(YELLOW + DIM + out_annot_file +
                                    " EXISTS\nOmitting snpEff annotation for sample " + sample + END_FORMATTING)
                    else:
                        logger.info(
                            GREEN + "Annotating sample with snpEff: " + sample + END_FORMATTING)
                        rename_reference_snpeff(filename, chrom_filename)
                        annotate_snpeff(chrom_filename, out_annot_file,
                                        database=args.snpeff_database)
    else:
        logger.info(YELLOW + DIM + " No SnpEff database suplied, skipping annotation in group " +
                    group_name + END_FORMATTING)
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(
            YELLOW + BOLD + "Omitting user annotation, no BED or VCF files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.ivar.tsv':
                    sample = root.split('/')[-1]
                    logger.info(
                        'User bed/vcf annotation in sample {}'.format(sample))
                    filename = os.path.join(root, name)
                    out_annot_file = os.path.join(
                        out_annot_user_dir, sample + ".tsv")
                    user_annotation(
                        filename, out_annot_file, vcf_files=args.annot_vcf, bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Omitting user aa annotation, no AA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(
                                out_annot_aa_file, out_annot_aa_file, aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(
                                filename, out_annot_aa_file, aa_files=args.annot_aa)
    # USER FASTA ANNOTATION
    if not args.annot_fasta:
        logger.info(
            YELLOW + BOLD + "Omitting user FASTA annotation, no FASTA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_blast_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name.endswith('.consensus.subs.fa'):
                    filename = os.path.join(root, name)
                    sample = root.split('/')[-1]
                    logger.info(
                        'User FASTA annotation in sample {}'.format(sample))
                    # out_annot_aa_file = os.path.join(
                    #    out_annot_user_aa_dir, sample + ".tsv")
                    for db in args.annot_fasta:
                        make_blast(filename, db, sample, out_annot_blast_dir,
                                   db_type="nucl", query_type="nucl", evalue=0.0001, threads=8)

    # USER AA TO HTML
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Omitting user aa annotation to HTML, no AA files supplied" + END_FORMATTING)
    else:
        annotated_samples = []
        logger.info('Adapting annotation to html in {}'.format(group_name))
        for root, _, files in os.walk(out_annot_user_aa_dir):
            if root == out_annot_user_aa_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        annotated_samples.append(sample)
                        filename = os.path.join(root, name)
                        annotation_to_html(filename, sample)
        annotated_samples = [str(x) for x in annotated_samples]
        report_samples_html_all = report_samples_html.replace(
            'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
        with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'), 'w+') as f:
            f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_recal_mpileup = full_path_compare + \
        ".revised_intermediate_vcf.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    # Create intermediate

    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_dir, out_stats_coverage_dir, min_freq_discard=0.1, min_alt_dp=10, only_snp=False)
    # recalibrated_snp_matrix_intermediate.to_csv(
    #     compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Remove SNPs from BED file (PE/PPE)

    if args.remove_bed:
        recalibrated_snp_matrix_intermediate = remove_bed_positions(
            recalibrated_snp_matrix_intermediate, args.remove_bed)

    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Recalibrate intermediate with VCF

    prior_recal = datetime.datetime.now()
    recalibrated_snp_matrix_mpileup = recalibrate_ddbb_vcf_intermediate(
        compare_snp_matrix_recal_intermediate, out_variant_dir, min_cov_low_freq=10)
    recalibrated_snp_matrix_mpileup.to_csv(
        compare_snp_matrix_recal_mpileup, sep="\t", index=False)

    after_recal = datetime.datetime.now()
    logger.debug("Done with recalibration vcf: %s" %
                 (after_recal - prior_recal))

    # Remove SNPs located within INDELs

    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_mpileup)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    # Extract all positions marked as complex
    complex_variants = extract_complex_list(out_variant_dir)
    logger.debug('Complex positions in all samples:\n{}'.format(
        (",".join([str(x) for x in complex_variants]))))

    # Clean all faulty positions and samples => Final table

    recalibrated_revised_INDEL_df = revised_df(compare_snp_matrix_INDEL_intermediate_df,
                                               path_compare,
                                               complex_pos=complex_variants,
                                               min_freq_include=0.8,
                                               min_threshold_discard_uncov_sample=args.min_threshold_discard_uncov_sample,
                                               min_threshold_discard_uncov_pos=args.min_threshold_discard_uncov_pos,
                                               min_threshold_discard_htz_sample=args.min_threshold_discard_htz_sample,
                                               min_threshold_discard_htz_pos=args.min_threshold_discard_htz_pos,
                                               min_threshold_discard_all_pos=args.min_threshold_discard_all_pos,
                                               min_threshold_discard_all_sample=args.min_threshold_discard_all_sample,
                                               remove_faulty=True,
                                               drop_samples=True,
                                               drop_positions=True,
                                               windows_size_discard=args.window)
    recalibrated_revised_INDEL_df.to_csv(
        compare_snp_matrix_recal, sep="\t", index=False)

    # Matrix to pairwise and mwk

    ddtb_compare(compare_snp_matrix_recal, distance=5)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE AUTOSNIPPY ANALYSIS#####" + END_FORMATTING + "\n")
Example #13
            args.r1_file = r1_file
            args.r2_file = r2_file

            print("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                  END_FORMATTING)

            ##############START PIPELINE#####################
            #################################################

            #INPUT ARGUMENTS
            ################
            check_file_exists(args.r1_file)
            check_file_exists(args.r2_file)

            args.output = os.path.abspath(args.output)
            check_create_dir(args.output)
            #QUALITY CHECK
            ##############
            """
            TODO: Quality check 
            """

            #QUALITY TRIMMING AND ADAPTER REMOVAL WITH bbduk.sh
            ###################################################
            out_trim_name_r1 = sample + "_R1.clean.fastq.gz"
            out_trim_name_r2 = sample + "_R2.clean.fastq.gz"
            output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                   out_trim_name_r1)
            output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                   out_trim_name_r2)
Example #14
def haplotype_caller(args,
                     recalibrate=False,
                     ploidy=2,
                     bamout=False,
                     forceactive=False,
                     intervals=False):
    #base_quality=13,
    """
    #No excuses
    https://software.broadinstitute.org/gatk/documentation/article?id=11081
    """
    #input_bam = os.path.abspath(args.input_bam)
    input_reference = os.path.abspath(args.reference)

    bam_output_dir = obtain_output_dir(args, "Bam")
    #file_name = path_file_name.split("/")[-1] #sample_name
    file_name = args.sample
    #path_file_name = os.path.join(output_dir, gvcf_output_file)

    if recalibrate:
        input_bam_to_call_name = file_name + ".rg.markdup.sorted.bam"

        gvcf_output_dir = obtain_output_dir(args, "GVCF_recal")
        gvcf_output_file = file_name + ".g.vcf"
    else:
        input_bam_to_call_name = file_name + ".bqsr.bam"

        gvcf_output_dir = obtain_output_dir(args, "GVCF")
        gvcf_output_file = file_name + ".g.vcf"

    check_create_dir(gvcf_output_dir)

    input_bam_to_call = os.path.join(bam_output_dir, input_bam_to_call_name)
    gvcf_output_full = os.path.join(gvcf_output_dir, gvcf_output_file)

    memory_param = "-Xmx" + str(args.memory) + "g"

    hc_args = [
        "gatk", "HaplotypeCaller", "--java-options", memory_param,
        "--reference", input_reference, "--input", input_bam_to_call,
        "--output", gvcf_output_full, "--emit-ref-confidence", "GVCF",
        "--annotation-group", "AS_StandardAnnotation", "--sample-ploidy",
        str(ploidy)
    ]

    #"--min-base-quality-score", str(base_quality),

    #Create bam index
    #cmd_index = ["samtools", "index", input_bam_to_call]
    #execute_subprocess(cmd_index)

    if bamout:
        bamout_output_dir = obtain_output_dir(args, "Bamout")
        bamout_output_file = file_name + ".p" + str(ploidy) + ".out.bam"
        bamout_output_full = os.path.join(bamout_output_dir,
                                          bamout_output_file)
        check_create_dir(bamout_output_dir)
        bamout_params = ["--bam-output", bamout_output_full]
        hc_args.extend(bamout_params)

    if forceactive:
        force_params = ["--force-active", "--disable-optimizations"]
        hc_args.extend(force_params)

    execute_subprocess(hc_args)
    """
Example #15
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    # ARGUMENTS

    def get_arguments():

        parser = argparse.ArgumentParser(
            prog='covidma.py',
            description=
            'Pipeline to call variants (SNVs) in any non-model organism. Specialised in SARS-CoV-2'
        )

        input_group = parser.add_argument_group('Input', 'Input parameters')

        input_group.add_argument(
            '-i',
            '--input',
            dest="input_dir",
            metavar="input_directory",
            type=str,
            required=True,
            help='REQUIRED. Input directory containing all fast[aq] files')
        input_group.add_argument('-r',
                                 '--reference',
                                 metavar="reference",
                                 type=str,
                                 required=True,
                                 help='REQUIRED. File to map against')
        input_group.add_argument(
            '-a',
            '--annotation',
            metavar="annotation",
            type=str,
            required=True,
            help='REQUIRED. gff3 file to annotate variants')
        input_group.add_argument('-s',
                                 '--sample',
                                 metavar="sample",
                                 type=str,
                                 required=False,
                                 help='Sample to identify further files')
        input_group.add_argument(
            '-L',
            '--sample_list',
            type=str,
            required=False,
            help='Sample names to analyse only in the file supplied')
        input_group.add_argument(
            '-p',
            '--primers',
            type=str,
            default=
            '/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed',
            required=False,
            help='Bed file including primers to trim')

        quality_group = parser.add_argument_group(
            'Quality parameters', 'parameters for different trimming conditions')

        quality_group.add_argument(
            '-c',
            '--coverage20',
            type=int,
            default=90,
            required=False,
            help=
            'Minimum percentage of coverage at 20x to classify as uncovered (Default 90)'
        )
        quality_group.add_argument('-n',
                                   '--min_snp',
                                   type=int,
                                   required=False,
                                   default=1,
                                   help='SNP number to pass quality threshold')

        output_group = parser.add_argument_group(
            'Output', 'Required parameter to output results')

        output_group.add_argument(
            '-o',
            '--output',
            type=str,
            required=True,
            help='REQUIRED. Output directory to extract all results')
        output_group.add_argument(
            '-C',
            '--noclean',
            required=False,
            action='store_false',
            help='Clean unwanted files for standard execution')

        params_group = parser.add_argument_group(
            'Parameters', 'parameters for different stringent conditions')

        params_group.add_argument('-T',
                                  '--threads',
                                  type=int,
                                  dest="threads",
                                  required=False,
                                  default=16,
                                  help='Threads to use')
        params_group.add_argument('-M',
                                  '--memory',
                                  type=int,
                                  dest="memory",
                                  required=False,
                                  default=32,
                                  help='Max memory to use')

        annot_group = parser.add_argument_group(
            'Annotation', 'parameters for variant annotation')

        annot_group.add_argument('-B',
                                 '--annot_bed',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='bed file to annotate')
        annot_group.add_argument('-V',
                                 '--annot_vcf',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='vcf file to annotate')
        annot_group.add_argument('-A',
                                 '--annot_aa',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='amino acid file to annotate')
        annot_group.add_argument('-R',
                                 '--remove_bed',
                                 type=str,
                                 default=False,
                                 required=False,
                                 help='BED file with positions to remove')
        annot_group.add_argument(
            '--mash_database',
            type=str,
            required=False,
            default=False,
            help='MASH ncbi annotation containing all species database')
        annot_group.add_argument('--snpeff_database',
                                 type=str,
                                 required=False,
                                 default='NC_045512.2',
                                 help='snpEFF annotation database')

        compare_group = parser.add_argument_group(
            'Compare', 'parameters for compare_snp')

        compare_group.add_argument('-S',
                                   '--only_snp',
                                   required=False,
                                   action='store_true',
                                   help='Use only SNPs when comparing (exclude INDELs)')

        arguments = parser.parse_args()

        return arguments

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #PREPARE REFERENCE FOR MAPPING + FAI + DICT #########
    #####################################################

    # picard_dictionary(args)
    samtools_faidx(args)

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_qc_post_dir = os.path.join(out_qc_dir, "processed")  # subfolder
    out_trim_dir = os.path.join(output, "Trimmed")
    out_map_dir = os.path.join(output, "Bam")
    out_variant_dir = os.path.join(output, "Variants")
    out_variant_ivar_dir = os.path.join(out_variant_dir,
                                        "ivar_raw")  # subfolder
    out_filtered_ivar_dir = os.path.join(out_variant_dir,
                                         "ivar_filtered")  # subfolder
    out_consensus_dir = os.path.join(output, "Consensus")
    out_consensus_ivar_dir = os.path.join(out_consensus_dir,
                                          "ivar")  # subfolder

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir,
                                          "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir,
                                          "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_pangolin_dir = os.path.join(out_annot_dir,
                                          "pangolin")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam"
            output_markdup_trimmed_file = os.path.join(
                out_map_dir, out_markdup_trimmed_name)

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" + new_sample_total +
                            ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            if not os.path.isfile(output_markdup_trimmed_file):

                args.r1_file = r1_file
                args.r2_file = r2_file

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                check_file_exists(r1_file)
                check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(
                        output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)
                """
                TODO: Human filter
                """

                # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp
                ###################################################
                out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz"
                out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz"
                output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                       out_trim_name_r1)
                output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                       out_trim_name_r2)

                if os.path.isfile(output_trimming_file_r1) and os.path.isfile(
                        output_trimming_file_r2):
                    logger.info(YELLOW + DIM + output_trimming_file_r1 +
                                " EXISTS\nOmitting Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming sample " + sample +
                                END_FORMATTING)
                    fastp_trimming(r1_file,
                                   r2_file,
                                   sample,
                                   out_trim_dir,
                                   threads=args.threads,
                                   min_qual=20,
                                   window_size=10,
                                   min_len=35)
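                    # Assumption: fastp_trimming wraps fastp, where these values
                    # would correspond to --cut_mean_quality 20, --cut_window_size
                    # 10 and --length_required 35 on the command line.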

                # QUALITY CHECK in TRIMMED with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html"
                out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html"
                output_qc_processed_file_r1 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r1)
                output_qc_processed_file_r2 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r2)

                if os.path.isfile(
                        output_qc_processed_file_r1) and os.path.isfile(
                            output_qc_processed_file_r2):
                    logger.info(YELLOW + DIM + output_qc_processed_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN +
                                "Checking quality in processed sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2)
                    fastqc_quality(output_trimming_file_r1,
                                   output_trimming_file_r2, out_qc_post_dir,
                                   args.threads)

                # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER SG
                #####################################################
                out_map_name = sample + ".rg.sorted.bam"
                output_map_file = os.path.join(out_map_dir, out_map_name)

                if os.path.isfile(output_map_file):
                    logger.info(YELLOW + DIM + output_map_file +
                                " EXISTS\nOmitting Mapping for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Mapping sample " + sample +
                                END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2 + "\nReference: " +
                                reference)
                    bwa_mapping(output_trimming_file_r1,
                                output_trimming_file_r2,
                                reference,
                                sample,
                                out_map_dir,
                                threads=args.threads)
                    sam_to_index_bam(sample,
                                     out_map_dir,
                                     output_trimming_file_r1,
                                     threads=args.threads)
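                    # sam_to_index_bam presumably converts and sorts the SAM into
                    # sample.rg.sorted.bam and indexes it, roughly equivalent to
                    # "samtools sort -o sample.rg.sorted.bam sample.sam &&
                    #  samtools index sample.rg.sorted.bam".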

                #MARK DUPLICATES WITH PICARDTOOLS ###################
                #####################################################
                out_markdup_name = sample + ".rg.markdup.sorted.bam"
                output_markdup_file = os.path.join(out_map_dir,
                                                   out_markdup_name)

                if os.path.isfile(output_markdup_file):
                    logger.info(YELLOW + DIM + output_markdup_file +
                                " EXISTS\nOmitting Duplicate Marking for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Marking Dupes in sample " + sample +
                                END_FORMATTING)
                    logger.info("Input Bam: " + output_map_file)
                    picard_markdup(output_map_file)
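                    # Picard MarkDuplicates flags optical/PCR duplicates instead
                    # of removing them; judging by the file names, the wrapper
                    # also writes sample.rg.markdup.sorted.bam (assumption).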

                #TRIM PRIMERS WITH ivar trim ########################
                #####################################################

                if os.path.isfile(output_markdup_trimmed_file):
                    logger.info(YELLOW + DIM + output_markdup_trimmed_file +
                                " EXISTS\nOmitting Primer Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming primers in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_markdup_file)
                    ivar_trim(output_markdup_file,
                              args.primers,
                              sample,
                              min_length=30,
                              min_quality=20,
                              sliding_window_width=4)
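                    # The keyword arguments mirror ivar trim's CLI flags
                    # (-b primers.bed, -m min length, -q min quality, -s sliding
                    # window width), assuming ivar_trim shells out to "ivar trim".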
            else:
                logger.info(
                    YELLOW + DIM + output_markdup_trimmed_file +
                    " EXISTS\nOmitting mapping and BAM manipulation for sample "
                    + sample + END_FORMATTING)

            ########################END OF MAPPING AND BAM MANIPULATION#####################################################################
            ################################################################################################################################

            #VARIANT CALLING WITH ivar variants##################
            #####################################################
            check_create_dir(out_variant_dir)
            out_ivar_variant_name = sample + ".tsv"
            out_ivar_variant_file = os.path.join(out_variant_ivar_dir,
                                                 out_ivar_variant_name)

            if os.path.isfile(out_ivar_variant_file):
                logger.info(YELLOW + DIM + out_ivar_variant_file +
                            " EXISTS\nOmitting Variant calling for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Calling variants with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_variants(reference,
                              output_markdup_trimmed_file,
                              out_variant_dir,
                              sample,
                              annotation,
                              min_quality=15,
                              min_frequency_threshold=0.01,
                              min_depth=1)
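                # ivar variants is documented to read a pileup, e.g.
                # "samtools mpileup -aa -A -d 0 -B -Q 0 --reference ref.fa in.bam |
                #  ivar variants -p sample -q 15 -t 0.01 -m 1 -r ref.fa -g ref.gff";
                # assumption: the wrapper builds an equivalent pipe.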

            #VARIANT FILTERING ##################################
            #####################################################
            check_create_dir(out_filtered_ivar_dir)
            out_ivar_filtered_file = os.path.join(out_filtered_ivar_dir,
                                                  out_ivar_variant_name)

            if os.path.isfile(out_ivar_filtered_file):
                logger.info(YELLOW + DIM + out_ivar_filtered_file +
                            " EXISTS\nOmitting Variant filtering for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering variants in sample " + sample +
                            END_FORMATTING)
                filter_tsv_variants(out_ivar_variant_file,
                                    out_filtered_ivar_dir,
                                    min_frequency=0.7,
                                    min_total_depth=10,
                                    min_alt_dp=4,
                                    is_pass=True,
                                    only_snp=False)

            #CREATE CONSENSUS with ivar consensus##################
            #######################################################
            check_create_dir(out_consensus_dir)
            check_create_dir(out_consensus_ivar_dir)
            out_ivar_consensus_name = sample + ".fa"
            out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir,
                                                   out_ivar_consensus_name)

            if os.path.isfile(out_ivar_consensus_file):
                logger.info(YELLOW + DIM + out_ivar_consensus_file +
                            " EXISTS\nOmitting Consensus for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating consensus with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_consensus(output_markdup_trimmed_file,
                               out_consensus_ivar_dir,
                               sample,
                               min_quality=20,
                               min_frequency_threshold=0.8,
                               min_depth=20,
                               uncovered_character='N')
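                # ivar consensus also reads a pileup, e.g.
                # "samtools mpileup -aa -A -d 0 -Q 0 in.bam |
                #  ivar consensus -p sample -q 20 -t 0.8 -m 20 -n N";
                # positions below 20x are masked with 'N' (assumed wrapper behaviour).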
                logger.info(GREEN + "Replacing consensus header in " + sample +
                            END_FORMATTING)
                replace_consensus_header(out_ivar_consensus_file)

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(output_markdup_trimmed_file,
                               out_stats_bamstats_dir,
                               sample,
                               threads=args.threads)

            #CREATE Coverage#######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting Coverage for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(output_markdup_trimmed_file,
                                out_stats_coverage_dir, sample)

    # fastqc OUTPUT FORMAT FOR COMPARISON
    ######################################################
    logger.info(GREEN + "Creating summary report for quality result " +
                END_FORMATTING)
    # format_html_image(out_qc_dir)

    # coverage OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating summary report for coverage result " +
                END_FORMATTING)
    obtain_group_cov_stats(out_stats_coverage_dir, group_name)

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report " + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples" + END_FORMATTING)
    # remove_low_quality(output, min_percentage_20x=args.coverage20,
    #                   min_hq_snp=args.min_snp, type_remove='Uncovered')

    #ANNOTATION WITH SNPEFF, USER INPUT AND PANGOLIN ####
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    check_create_dir(out_annot_pangolin_dir)
    # SNPEFF
    if args.snpeff_database is not False:
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_filtered_ivar_dir):
            if root == out_filtered_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                      sample + ".annot")
                        if os.path.isfile(out_annot_file):
                            logger.info(
                                YELLOW + DIM + out_annot_file +
                                " EXISTS\nOmitting snpEff Annotation for sample "
                                + sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Annotating sample with snpEff: " +
                                        sample + END_FORMATTING)
                            output_vcf = os.path.join(out_annot_snpeff_dir,
                                                      sample + '.vcf')
                            annotate_snpeff(filename,
                                            output_vcf,
                                            out_annot_file,
                                            database=args.snpeff_database)
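                            # Assumption: the ivar TSV is first converted to VCF
                            # (output_vcf) and then annotated along the lines of
                            # "snpEff <database> sample.vcf > sample.annot".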
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Omitting User Annotation, no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_variant_ivar_dir):
            if root == out_variant_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User bed/vcf annotation in sample {}'.format(
                                sample))
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_user_dir,
                                                      sample + ".tsv")
                        user_annotation(filename,
                                        out_annot_file,
                                        vcf_files=args.annot_vcf,
                                        bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting User aa Annotation, no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # PANGOLIN
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.threads) as executor:
        futures_pangolin = []
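        # One pangolin run per consensus FASTA is submitted to the pool and the
        # results are gathered with as_completed below. Assumption:
        # annotate_pangolin shells out to something like
        # "pangolin sample.fa --outdir <dir> --outfile sample.lineage.csv
        #  -t <threads> --max-ambig 0.6".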

        for root, _, files in os.walk(out_consensus_ivar_dir):
            if root == out_consensus_ivar_dir:
                for name in files:
                    if name.endswith('.fa'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_pangolin_filename = sample + ".lineage.csv"
                        out_pangolin_file = os.path.join(
                            out_annot_pangolin_dir, out_pangolin_filename)
                        if os.path.isfile(out_pangolin_file):
                            logger.info(
                                YELLOW + DIM + out_pangolin_file +
                                " EXISTS\nOmitting Lineage for sample " +
                                sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Obtaining Lineage in sample " +
                                        sample + END_FORMATTING)
                            future = executor.submit(annotate_pangolin,
                                                     filename,
                                                     out_annot_pangolin_dir,
                                                     out_pangolin_filename,
                                                     threads=args.threads,
                                                     max_ambig=0.6)
                            futures_pangolin.append(future)
                for future in concurrent.futures.as_completed(
                        futures_pangolin):
                    logger.info(future.result())
                    # annotate_pangolin(filename, out_annot_pangolin_dir,
                    #                out_pangolin_filename, threads=args.threads, max_ambig=0.6)

    # USER AA TO HTML
    annotated_samples = []
    logger.info('Adapting annotation to html in {}'.format(group_name))
    for root, _, files in os.walk(out_annot_user_aa_dir):
        if root == out_annot_user_aa_dir:
            for name in files:
                if name.endswith('.tsv'):
                    sample = name.split('.')[0]
                    annotated_samples.append(sample)
                    filename = os.path.join(root, name)
                    annotation_to_html(filename, sample)
    annotated_samples = [str(x) for x in annotated_samples]
    report_samples_html_all = report_samples_html.replace(
        'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
    with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'),
              'w+') as f:
        f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    # ddtb_add(out_filtered_ivar_dir, full_path_compare)
    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_ivar_dir,
        out_stats_coverage_dir,
        min_freq_discard=0.1,
        min_alt_dp=4,
        only_snp=args.only_snp)
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_intermediate)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)
    recalibrated_revised_df = revised_df(recalibrated_snp_matrix_intermediate,
                                         path_compare,
                                         min_freq_include=0.7,
                                         min_threshold_discard_sample=0.07,
                                         min_threshold_discard_position=0.4,
                                         remove_faulty=True,
                                         drop_samples=True,
                                         drop_positions=True)
    recalibrated_revised_df.to_csv(compare_snp_matrix_recal,
                                   sep="\t",
                                   index=False)
    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df,
        path_compare,
        min_freq_include=0.7,
        min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4,
        remove_faulty=True,
        drop_samples=True,
        drop_positions=True)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL,
                                         sep="\t",
                                         index=False)

    ddtb_compare(compare_snp_matrix_recal, distance=0)
    ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    #####################CONSENSUS WITH REFINED CALL######
    ######################################################
    logger.info(GREEN + "Creating refined consensus" + END_FORMATTING)
    create_consensus(reference, compare_snp_matrix_recal,
                     out_stats_coverage_dir, out_consensus_dir)

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE COVID MULTI ANALYSIS#####" +
                END_FORMATTING + "\n")
Beispiel #16
0
            args.r1_file = r1_file
            args.r2_file = r2_file

            print("\n" + WHITE_BG + "STARTING SAMPLE: " + sample + " (" +
                  sample_number + "/" + sample_total + ")" + END_FORMATTING)

            ##############START PIPELINE#####################
            #################################################

            #INPUT ARGUMENTS
            ################
            check_file_exists(args.r1_file)
            check_file_exists(args.r2_file)

            args.output = os.path.abspath(args.output)
            check_create_dir(args.output)
            #QUALITY CHECK
            ##############
            """
            TODO: Quality check 
            TODO: Human filter
            """

            #QUALITY TRIMMING AND ADAPTER REMOVAL WITH bbduk.sh
            ###################################################
            out_trim_name_r1 = sample + "_R1.clean.fastq.gz"
            out_trim_name_r2 = sample + "_R2.clean.fastq.gz"
            output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                   out_trim_name_r1)
            output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                   out_trim_name_r2)
Beispiel #17
0
def fastqc_quality(r1, r2, output_dir, threads=8):
    check_create_dir(output_dir)

    cmd = ['fastqc', r1, r2, '-o', output_dir, '--threads', str(threads)]

    execute_subprocess(cmd)
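
# A minimal usage sketch (hypothetical file names; assumes fastqc is on the PATH):
# fastqc_quality("sample_R1.fastq.gz", "sample_R2.fastq.gz", "Quality/raw", threads=4)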
Beispiel #18
0
######################################################################
#####################START PIPELINE###################################
######################################################################
#Annotation related
script_dir = os.path.dirname(os.path.realpath(__file__))
annotation_dir = os.path.join(script_dir, "annotation/genes")
if args.bed_remove == "TB":
    bed_polymorphism = os.path.join(annotation_dir, "MTB_repeats_annot.bed")


output = os.path.abspath(args.output)
#input_dir = os.path.abspath(args.input)
group_name = output.split("/")[-1]
out_gvcf_dir = os.path.join(args.output, "GVCF")
out_vcf_dir = os.path.join(args.output, "VCF")
check_create_dir(out_vcf_dir)

gvcf_input_dir = os.path.abspath(args.input)



print("\n\n" + BLUE + BOLD + "STARTING COHORT GVCF TO SPLIT SAMPLE VCF IN GROUP: " + group_name + END_FORMATTING)

#CALL VARIANTS 2/2 FOR HARD FILTERING AND RECALIBRATION
#######################################################
out_gvcf_name = group_name + ".cohort.g.vcf"
output_gvcf_file = os.path.join(out_gvcf_dir, out_gvcf_name)

if os.path.isfile(output_gvcf_file):
    print(YELLOW + DIM + output_gvcf_file + " EXISTS\nOmitting GVCF Combination for group " + group_name + END_FORMATTING)
else:
Beispiel #19
0
def vcf_consensus_filter(vcf_file, distance=1, AF=0.75, QD=15, window_10=3, dp_limit=8, dp_AF=10, AF_dp=0.80, 
    highly_hetz=False, non_genotyped=False, poorly_covered=False, bed_to_filter=False, var_type="SNP"):
    """
    Apply custom filter to individual vcf based on:
    AF
    snp distance --> Replaced by window_10
    QD
    Window_10, 20 and 30
    gatk asigned genotype for diploid calls
    Highly heterozygous positions 
    Poorly covered positions
    """
    df_vcf = import_VCF42_to_pandas(vcf_file)

    vcf_path = os.path.abspath(vcf_file)
    output_dir = ("/").join(vcf_path.split("/")[:-2])
    vcf_name = vcf_path.split("/")[-1]

    tab_name = (".").join(vcf_name.split(".")[:-1])
    extend_raw = ".raw.tab"
    extend_final = "." + var_type + ".final.vcf"

    table_output_dir = os.path.join(output_dir, "Table")
    check_create_dir(table_output_dir)

    #Add polymorphic regions info (Phage, Transposon or PE/PPE regions for TB)
    if bed_to_filter is False:
        df_vcf['is_polymorphic'] = False
    else:
        annotate_bed_s(df_vcf, bed_to_filter)

    if highly_hetz is not False:
        annotate_bed_s(df_vcf, highly_hetz)

    if non_genotyped is not False:
        annotate_bed_s(df_vcf, non_genotyped)

    if poorly_covered is not False:
        annotate_bed_s(df_vcf, poorly_covered)
    

    #Add info of nearby positions
    add_snp_distance(df_vcf)
    add_indel_distance(df_vcf)

    #Add info of clustered positions in sliding window
    add_window_distance(df_vcf, window_size=10)
    add_window_distance(df_vcf, window_size=20)
    add_window_distance(df_vcf, window_size=30)
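    # window_10/20/30 count how many variants fall within a 10/20/30 bp window
    # of each position; dense clusters are a typical mapping-artefact signature,
    # so they are penalised in the filter below (semantics inferred from the
    # column names used there).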

    #Manage SNP INDEL filter
    if var_type == "SNP":
        var_to_filter = "INDEL"
    elif var_type == "INDEL":
        var_to_filter = "SNP"
    elif var_type == "ALL":
        var_to_filter = "*"
    else:
        print("Wrong variant type to filter, use SNP/INDEL/ALL")
        sys.exit(1)

    #output all raw info into a file in 'Table' folder
    new_out_file = tab_name + extend_raw
    output_raw_tab = os.path.join(table_output_dir, new_out_file)
    df_vcf.to_csv(output_raw_tab, sep='\t', index=False)
    
    #Apply all filters and extract positions as a table to filter the final vcf
    list_positions_to_filter = df_vcf['POS'][((df_vcf.AF < AF) |
                                (df_vcf.snp_left_distance <= distance) |
                                (df_vcf.snp_right_distance <= distance) |
                                (df_vcf.window_10 > window_10) |
                                (df_vcf.AF <= 0.0) |
                                (df_vcf.QD <= QD) |
                                (df_vcf.dp == 0) |
                                (df_vcf.len_AD > 2) |
                                (df_vcf.ALT_AD < 2) |
                                (df_vcf.ALT == '*') |
                                (df_vcf.TYPE == var_to_filter) |
                                (df_vcf.dp < dp_limit) |
                                (df_vcf.FILTER != "PASS") |
                                ((df_vcf.gt0 == 0) & (df_vcf.window_10 > 1)) |
                                ((df_vcf.gt0 == 0) & (df_vcf.window_20 >= 2)) |
                                ((df_vcf.gt0 == 0) & (df_vcf.window_30 >= 3)) |
                                ((df_vcf.dp < dp_AF) & (df_vcf.AF < AF_dp)) |
                                (df_vcf.highly_hetz == True) |
                                (df_vcf.poorly_covered == True) |
                                (df_vcf.non_genotyped == True) |
                                (df_vcf.is_polymorphic == True))].tolist()

    final_vcf_name = tab_name + extend_final
    filter_vcf_list(vcf_path, list_positions_to_filter, final_vcf_name)
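
# A minimal usage sketch (hypothetical path; the function derives its output
# folders from the vcf location, two directory levels up):
# vcf_consensus_filter("analysis/VCF/sample.raw.vcf", distance=1, AF=0.75,
#                      QD=15, var_type="SNP")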