Example #1
def zcat_concat_reads(args):
    """
    Decompress gzipped R1 and R2 into a single combined .fastq, named after the common sample, in the same directory
    """

    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)

    output_dir = ("/").join(r1.split("/")[:-1])
    output_name = sample + ".fastq"
    output_file = os.path.join(output_dir, output_name)

    cmd = ["zcat", r1, r2]
    # execute_subprocess(cmd)
    with open(output_file, "w+") as outfile:
        # concatenate the reads and save them in the output file
        subprocess.run(cmd,
                       stdout=outfile,
                       stderr=subprocess.PIPE,
                       check=True,
                       universal_newlines=True)

    return output_file
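These snippets call two helpers, execute_subprocess and check_create_dir, that are defined elsewhere in the source module. Below is a minimal sketch of what they presumably do, with the error handling modeled on the inline try/except pattern in Example #5; only the names come from the source, the bodies are assumptions:

import os
import subprocess
import sys


def check_create_dir(path):
    # Assumed behavior: create the directory if it does not exist yet.
    os.makedirs(path, exist_ok=True)


def execute_subprocess(cmd):
    # Assumed behavior: run an external command and report success or failure,
    # mirroring the inline error handling in Example #5.
    prog = cmd[0]
    param = cmd[1:]
    try:
        command = subprocess.run(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
        if command.returncode == 0:
            print("Program %s successfully executed" % prog)
        else:
            print("Command %s FAILED with parameters %s\nEXIT-CODE: %d\nERROR:\n%s"
                  % (prog, " ".join(param), command.returncode, command.stderr))
    except OSError as e:
        sys.exit("failed to execute program '%s': %s" % (prog, str(e)))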
Example #2
def bwa_mapping(args):
    """
    # Store output in a file when the tool writes it to stdout
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    cmd_index = ["bwa", "index", reference]
    execute_subprocess(cmd_index)

    cmd_map = [
        "bwa", "mem", "-t",
        str(args.threads), "-o", output_file, reference, r1, r2
    ]
    execute_subprocess(cmd_map)
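The StackOverflow link in the docstring covers the case where a tool writes its result to stdout instead of accepting an output flag. bwa mem supports -o as used above, but the redirection pattern from Example #1 would work as well; a minimal sketch with placeholder paths:

import subprocess

# Placeholder paths, for illustration only.
with open("sample.sam", "w") as outfile:
    subprocess.run(["bwa", "mem", "-t", "4", "reference.fasta",
                    "sample_R1.fastq.gz", "sample_R2.fastq.gz"],
                   stdout=outfile,
                   stderr=subprocess.PIPE,
                   check=True,
                   universal_newlines=True)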
    """
Example #3
def bbduk_trimming(args):
    """
    TODO: handle params
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    output_dir = obtain_output_dir(args, "Trimmed")

    in1_param = "in1=" + r1
    in2_param = "in2=" + r2

    sample = extract_sample(r1, r2)

    out1_param = "out1=" + os.path.join(output_dir, sample + "_R1.clean.fastq.gz")
    out2_param = "out2=" + os.path.join(output_dir, sample + "_R2.clean.fastq.gz")

    stats_param = "stats=" + os.path.join(output_dir, sample + "_trim.stats")

    adapter_path = "ref=" + get_bbduk_adapters()

    memory_param = "-Xmx" + str(args.memory) + "g"
    threads_param = "threads=" + str(args.threads)

    check_create_dir(output_dir)

    #bbduk.sh
    cmd = [
        "bbduk.sh", memory_param, in1_param, in2_param, out1_param, out2_param,
        adapter_path, "trimq=15", "qtrim=rl", "minlen=40", "ktrim=r", "k=21",
        "mink=11", "hammingdistance=2", threads_param, "tpe", "tbo",
        stats_param
    ]

    execute_subprocess(cmd)
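A usage sketch with a hypothetical argparse.Namespace carrying the attributes bbduk_trimming reads (r1_file, r2_file, memory, threads, plus an output attribute that obtain_output_dir presumably derives the folder from); all values are placeholders:

import argparse

# Hypothetical invocation; the attribute names match what the function accesses.
args = argparse.Namespace(r1_file="sample_R1.fastq.gz",
                          r2_file="sample_R2.fastq.gz",
                          output="results",
                          memory=32,
                          threads=16)
bbduk_trimming(args)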
Example #4
def bowtie2_mapping(args):
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    #bowtie2 index
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    #bowtie2 map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p",
        str(args.threads), "-x", reference
    ]
    # Append the flag conditionally: an empty string in the argv list would be
    # passed to bowtie2 as a literal empty argument and make the call fail.
    if args.extensive_mapping:
        cmd_map.append("-a")
    execute_subprocess(cmd_map)
Example #5
def mash_screen(r1_file,
                out_dir,
                r2_file=False,
                winner=True,
                threads=16,
                mash_database="/home/laura/DATABASES/Mash/bacteria_mash.msh"):
    # https://mash.readthedocs.io/en/latest/index.html
    # https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh #MASH refseq database
    # mash screen -w -p 4 ../refseq.genomes.k21s1000.msh 4_R1.fastq.gz 4_R2.fastq.gz > 4.winner.screen.tab
    # identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment

    if not os.path.isfile(mash_database):
        logger.info(RED + BOLD + "Mash database can't be found\n" +
                    END_FORMATTING + "You can download it by typing:\n"
                    "wget https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh")
        sys.exit(1)

    r1_file = os.path.abspath(r1_file)

    sample = extract_sample(r1_file, r2_file)

    check_create_dir(out_dir)
    species_output_name = sample + ".screen.tab"
    species_output_file = os.path.join(out_dir, species_output_name)

    cmd = ["mash", "screen", "-p", str(threads), mash_database, r1_file]

    if winner:
        cmd.insert(2, "-w")
    # Use both R1 and R2 instead of just R1 (screening R1 alone is faster)
    if r2_file:
        r2_file = os.path.abspath(r2_file)
        cmd.append(r2_file)

    prog = cmd[0]
    param = cmd[1:]

    try:
        # execute_subprocess(cmd)
        with open(species_output_file, "w+") as outfile:
            # run mash screen and save the report in the output file
            command = subprocess.run(cmd,
                                     stdout=outfile,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
        if command.returncode == 0:
            logger.info(GREEN + "Program %s successfully executed" % prog +
                        END_FORMATTING)
        else:
            logger.info(RED + BOLD + "Command %s FAILED\n" % prog + END_FORMATTING +
                        BOLD + "WITH PARAMETERS: " + END_FORMATTING +
                        " ".join(param) + "\n" + BOLD +
                        "EXIT-CODE: %d\n" % command.returncode + "ERROR:\n" +
                        END_FORMATTING + command.stderr)
    except OSError as e:
        sys.exit(RED + BOLD + "failed to execute program '%s': %s" %
                 (prog, str(e)) + END_FORMATTING)
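The comments above list the screen.tab columns (identity, shared-hashes, median-multiplicity, p-value, query-ID, query-comment). A minimal sketch, assuming tab-separated lines, that picks the hit with the highest identity:

def top_mash_hit(screen_tab):
    # Columns per line: identity, shared-hashes, median-multiplicity,
    # p-value, query-ID, query-comment.
    best = None
    with open(screen_tab) as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            identity = float(fields[0])
            if best is None or identity > best[0]:
                best = (identity, fields[4])
    return best  # (identity, query-ID), or None if the file is empty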
Example #6
def add_SG(args, input_bam, output_bg_sorted):
    """
    @MN00227:45:000H255J3:1:11102:21214:1110 1:N:0:18
    @NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC
    @NS500454:27:HJJ32BGXX:1:11101:12392:1099 1:N:0:2

    @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:
    <is filtered>:<control number>:<sample number | barcode1+barcode2>
    ID = Read group identifier {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE} 
    PU = Platform Unit #optional
    SM = Sample
    PL = Platform/technology used to produce the read (ILLUMINA, SOLID, LS454, HELICOS and PACBIO)
    LB = DNA preparation library identifier
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)

    with gzip.open(r1) as f:
        first_line = f.readline().strip().decode()
    #print(first_line)
    first_line_list = first_line.split(":")

    rg_id = ".".join(
        [first_line_list[2], first_line_list[3], first_line_list[-1]])
    rg_pu = ".".join(
        [first_line_list[2], first_line_list[3], first_line_list[-1]])
    rg_sm = sample
    rg_pl = "ILLUMINA"
    rg_lb = "lib_" + sample

    rg_id_param = "RGID=" + rg_id
    rg_pu_param = "RGPU=" + rg_pu
    rg_sm_param = "RGSM=" + rg_sm
    rg_pl_param = "RGPL=" + rg_pl
    rg_lb_param = "RGLB=" + rg_lb

    picard_jar = get_picard_path()

    input_param = "INPUT=" + input_bam
    output_param = "OUTPUT=" + output_bg_sorted

    # java -jar picard.jar AddOrReplaceReadGroups \
    # INPUT=reads.bam \ OUTPUT=reads_addRG.bam \ RGID=H0164.2 \ #be sure to change from default of 1
    # RGLB= library1 \ RGPL=illumina \ RGPU=H0164ALXX140820.2 \ RGSM=sample1 \
    # SORT_ORDER=coordinate \ CREATE_INDEX=true

    cmd = [
        "java", "-jar", picard_jar, "AddOrReplaceReadGroups", input_param,
        output_param, rg_id_param, rg_lb_param, rg_pl_param, rg_pu_param,
        rg_sm_param, "SORT_ORDER=coordinate"
    ]
    execute_subprocess(cmd)
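A quick illustration of the field extraction against the second header example quoted in the docstring (expected output shown in the comment):

# Illustrative only: one of the header lines quoted in the docstring.
first_line = "@NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC"
fields = first_line.split(":")
# fields[2] is the flowcell ID, fields[3] the lane, fields[-1] the barcode(s).
rg_id = ".".join([fields[2], fields[3], fields[-1]])
print(rg_id)  # HKG57BGXX.1.TCCTGAGC+TCTTACGC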
Example #7
def sam_to_index_bam(args):
    # input_sam_path = os.path.abspath(input_sam)
    # if output_bam == "inputdir":
    #     output_bam = os.path.dirname(input_sam_path)
    # else:
    #     output_bam = output_bam

    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    input_sam_path = os.path.join(output_dir, sample_name)

    input_name = (".").join(os.path.basename(input_sam_path).split(".")[:-1])

    output_bam_name = input_name + ".bam"
    output_bam_path = os.path.join(output_dir, output_bam_name)

    output_bg_sorted_name = input_name + ".rg.sorted.bam"
    output_bg_sorted_path = os.path.join(output_dir, output_bg_sorted_name)

    check_create_dir(output_dir)
    """
    #sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    with open(output_bam_path, "w") as outfile:
        # convert SAM to BAM and save it in the output file
        subprocess.run(["samtools", "view", "-Sb", input_sam_path], 
        stdout=outfile, stderr=subprocess.PIPE, check=True, universal_newlines=True)
    """
    cmd = [
        "samtools", "view", "-Sb", input_sam_path, "-o", output_bam_path,
        "--threads",
        str(args.threads)
    ]
    execute_subprocess(cmd)

    check_remove_file(input_sam_path)

    add_SG(args, output_bam_path, output_bg_sorted_path)

    check_remove_file(output_bam_path)
    """
Example #8
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    #annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)
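    # Sketch of what the docstring's StackOverflow link suggests (an assumption,
    # not present in the original source): route uncaught exceptions through the
    # logger so they also reach the log file, e.g.
    #
    #   def handle_exception(exc_type, exc_value, exc_traceback):
    #       logger.error("Uncaught exception",
    #                    exc_info=(exc_type, exc_value, exc_traceback))
    #   sys.excepthook = handle_exception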

    logger.info("\n\n" + BLUE + BOLD +
                "STARTING PIPELINE IN GROUP: " + group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(sample_list_F), ",".join(sample_list_F)))
    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))
    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    #script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_variant_dir = os.path.join(output, "Variants")
    out_core_dir = os.path.join(output, "Core")

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(
        out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(
        out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder
    out_annot_blast_dir = os.path.join(out_annot_dir, "blast")  # subfolder

    out_species_dir = os.path.join(output, "Species")
    new_sample_number = 0
    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:
            # VARIANT SAMPLE DIR
            sample_variant_dir = os.path.join(out_variant_dir, sample)

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))
            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" + " (" + new_sample_number + "/" + new_sample_total + ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" + END_FORMATTING)

            output_final_vcf = os.path.join(
                sample_variant_dir, 'snps.all.ivar.tsv')

            if not os.path.isfile(output_final_vcf):

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                # check_file_exists(r1_file)
                # check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(r1_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(r2_file.split(
                    '/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(
                    out_qc_pre_dir, out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Checking quality in sample " + sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file,
                                   out_qc_pre_dir, args.threads)

                """
                TODO: Human filter
                """

                # VARIANT CALLING WITH SNIPPY
                ###################################################

                output_vcf_sub = os.path.join(
                    sample_variant_dir, "snps.subs.vcf")
                output_vcf = os.path.join(sample_variant_dir, "snps.vcf")

                if os.path.isfile(output_vcf_sub) and os.path.isfile(output_vcf):
                    logger.info(YELLOW + DIM + output_vcf +
                                " EXISTS\nOmitting variant calling in " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Calling variants with snippy " + sample + END_FORMATTING)
                    run_snippy(r1_file, r2_file, reference, out_variant_dir, sample,
                               threads=args.threads, minqual=10, minfrac=0.1, mincov=1)
                    old_bam = os.path.join(sample_variant_dir, "snps.bam")
                    old_bai = os.path.join(sample_variant_dir, "snps.bam.bai")
                    new_bam = os.path.join(sample_variant_dir, sample + ".bam")
                    new_bai = os.path.join(
                        sample_variant_dir, sample + ".bam.bai")
                    os.rename(old_bam, new_bam)
                    os.rename(old_bai, new_bai)

                #VARIANT FORMAT COMBINATION (REMOVE COMPLEX) ########
                #####################################################
                out_variant_indel_sample = os.path.join(
                    sample_variant_dir, "snps.indel.vcf")
                out_variant_all_sample = os.path.join(
                    sample_variant_dir, "snps.all.vcf")

                if os.path.isfile(out_variant_indel_sample):
                    logger.info(YELLOW + DIM + out_variant_indel_sample +
                                " EXISTS\nOmitting indel filtering in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Filtering INDELS in " +
                                sample + END_FORMATTING)
                    extract_indels(output_vcf)

                if os.path.isfile(out_variant_all_sample):
                    logger.info(YELLOW + DIM + out_variant_all_sample +
                                " EXISTS\nOmitting vcf combination in sample " + sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Combining vcf in " +
                                sample + END_FORMATTING)
                    merge_vcf(output_vcf_sub, out_variant_indel_sample)

                #VARIANT FORMAT ADAPTATION TO IVAR ##################
                #####################################################
                out_variant_tsv_file = os.path.join(
                    sample_variant_dir, 'snps.all.ivar.tsv')

                if os.path.isfile(out_variant_tsv_file):
                    logger.info(YELLOW + DIM + out_variant_tsv_file +
                                " EXISTS\nOmitting format adaptation for sample " + sample + END_FORMATTING)
                else:
                    logger.info(
                        GREEN + "Adapting variants format in sample " + sample + END_FORMATTING)
                    prior = datetime.datetime.now()
                    vcf_to_ivar_tsv(out_variant_all_sample,
                                    out_variant_tsv_file)
                    after = datetime.datetime.now()
                    print(("Done with function in: %s" % (after - prior)))

            # SPECIES DETERMINATION
            ###################################################
            check_create_dir(out_species_dir)

            output_species = os.path.join(
                out_species_dir, sample + ".screen.tab")

            if os.path.isfile(output_species):
                logger.info(YELLOW + DIM + output_species +
                            " EXISTS\nOmitting species determination in " + sample + END_FORMATTING)
            else:
                logger.info(
                    GREEN + "Determining species in " + sample + END_FORMATTING)
                mash_screen(r1_file, out_species_dir, r2_file=r2_file, winner=True, threads=args.threads,
                            mash_database=args.mash_database)

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(
                out_stats_bamstats_dir, out_bamstats_name)
            bam_sample_file = os.path.join(sample_variant_dir, sample + ".bam")

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " +
                            sample + END_FORMATTING)
                create_bamstat(
                    bam_sample_file, out_stats_bamstats_dir, sample, threads=args.threads)

            #CREATE Coverage######################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(
                out_stats_coverage_dir, out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting Coverage for sample " + sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " +
                            sample + END_FORMATTING)
                create_coverage(bam_sample_file,
                                out_stats_coverage_dir, sample)

    # coverage OUTPUT SUMMARY
    ######################################################
    prior_recal = datetime.datetime.now()
    logger.info(GREEN + "Creating summary report for coverage result in group " +
                group_name + END_FORMATTING)
    obtain_group_cov_stats(out_stats_dir, group_name)
    after_recal = datetime.datetime.now()
    logger.info("Done with report for coverage: %s" %
                (after_recal - prior_recal))

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report in group " +
                group_name + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples in group " +
                group_name + END_FORMATTING)
    uncovered_samples = remove_low_quality(
        output, min_coverage=args.coverage20, min_hq_snp=args.min_snp, type_remove='Uncovered')

    if len(uncovered_samples) > 0:
        logger.info(GREEN + "Uncovered samples: " +
                    (",").join(uncovered_samples) + END_FORMATTING)
    else:
        logger.info(GREEN + "NO uncovered samples found" + END_FORMATTING)

    # RUN SNIPPY CORE
    ##############################################################################################################################
    if args.core:
        check_create_dir(out_core_dir)
        logger.info(GREEN + "Running snippy-core " +
                    group_name + END_FORMATTING)
        run_snippy_core(out_variant_dir, out_core_dir, reference)

        logger.info(GREEN + "Adapting core-snp to compare format " +
                    group_name + END_FORMATTING)
        core_vcf_file = os.path.join(out_core_dir, "core.vcf")
        core_vcf_file_adapted = os.path.join(
            out_core_dir, "core.vcf.adapted.tsv")
        core_vcf_file_removed = os.path.join(
            out_core_dir, "core.vcf.adapted.final.tsv")

        core_vcf_df_adapted = import_VCF4_core_to_compare(core_vcf_file)
        core_vcf_df_adapted.to_csv(
            core_vcf_file_adapted, sep="\t", index=False)

        logger.info(GREEN + "Obtaining clustered positions " +
                    group_name + END_FORMATTING)

        close_positions_list = extract_close_snps(
            core_vcf_df_adapted, snps_in_10=1)
        logger.info(GREEN + "Obtaining uncovered positions " +
                    group_name + END_FORMATTING)
        uncovered_list = identify_uncovered(
            out_stats_coverage_dir, min_coverage=10, nocall_fr=0.5)

        logger.debug('Clustered positions in core SNP:\n{}'.format(
            (",".join([str(x) for x in close_positions_list]))))
        logger.debug('Uncovered positions in all samples:\n{}'.format(
            (",".join([str(x) for x in uncovered_list]))))

        to_remove_list = close_positions_list + uncovered_list

        remove_df = remove_position_from_compare(
            core_vcf_df_adapted, to_remove_list)
        remove_df.to_csv(core_vcf_file_removed, sep="\t", index=False)

        ddtb_compare(core_vcf_file_removed, distance=10)

    #ANNOTATION WITH SNPEFF AND USER INPUT ##############
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    # SNPEFF
    if args.snpeff_database:
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.vcf':
                    sample = root.split('/')[-1]
                    filename = os.path.join(root, name)
                    chrom_filename = os.path.join(
                        root, 'snps.all.chromosome.vcf')
                    out_annot_file = os.path.join(
                        out_annot_snpeff_dir, sample + ".annot")
                    if os.path.isfile(out_annot_file):
                        logger.info(YELLOW + DIM + out_annot_file +
                                    " EXISTS\nOmitting snpEff annotation for sample " + sample + END_FORMATTING)
                    else:
                        logger.info(
                            GREEN + "Annotating sample with snpEff: " + sample + END_FORMATTING)
                        rename_reference_snpeff(filename, chrom_filename)
                        annotate_snpeff(chrom_filename, out_annot_file,
                                        database=args.snpeff_database)
    else:
        logger.info(YELLOW + DIM + " No SnpEff database suplied, skipping annotation in group " +
                    group_name + END_FORMATTING)
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(
            YELLOW + BOLD + "Ommiting User Annotation, no BED or VCF files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.ivar.tsv':
                    sample = root.split('/')[-1]
                    logger.info(
                        'User bed/vcf annotation in sample {}'.format(sample))
                    filename = os.path.join(root, name)
                    out_annot_file = os.path.join(
                        out_annot_user_dir, sample + ".tsv")
                    user_annotation(
                        filename, out_annot_file, vcf_files=args.annot_vcf, bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Ommiting User aa Annotation, no AA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(
                                out_annot_aa_file, out_annot_aa_file, aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(
                                filename, out_annot_aa_file, aa_files=args.annot_aa)
    # USER FASTA ANNOTATION
    if not args.annot_fasta:
        logger.info(
            YELLOW + BOLD + "Ommiting User FASTA Annotation, no FASTA files supplied" + END_FORMATTING)
    else:
        check_create_dir(out_annot_blast_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name.endswith('.consensus.subs.fa'):
                    filename = os.path.join(root, name)
                    sample = root.split('/')[-1]
                    logger.info(
                        'User FASTA annotation in sample {}'.format(sample))
                    # out_annot_aa_file = os.path.join(
                    #    out_annot_user_aa_dir, sample + ".tsv")
                    for db in args.annot_fasta:
                        make_blast(filename, db, sample, out_annot_blast_dir,
                                   db_type="nucl", query_type="nucl", evalue=0.0001, threads=8)

    # USER AA TO HTML
    if not args.annot_aa:
        logger.info(
            YELLOW + BOLD + "Ommiting User aa Annotation to HTML, no AA files supplied" + END_FORMATTING)
    else:
        annotated_samples = []
        logger.info('Adapting annotation to html in {}'.format(group_name))
        for root, _, files in os.walk(out_annot_user_aa_dir):
            if root == out_annot_user_aa_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        annotated_samples.append(sample)
                        filename = os.path.join(root, name)
                        annotation_to_html(filename, sample)
        annotated_samples = [str(x) for x in annotated_samples]
        report_samples_html_all = report_samples_html.replace(
            'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
        with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'), 'w+') as f:
            f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_recal_mpileup = full_path_compare + \
        ".revised_intermediate_vcf.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    # Create intermediate

    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_dir, out_stats_coverage_dir, min_freq_discard=0.1, min_alt_dp=10, only_snp=False)
    # recalibrated_snp_matrix_intermediate.to_csv(
    #     compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Remove SNPs from BED file (PE/PPE)

    if args.remove_bed:
        recalibrated_snp_matrix_intermediate = remove_bed_positions(
            recalibrated_snp_matrix_intermediate, args.remove_bed)

    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Recalibrate intermediate with VCF

    prior_recal = datetime.datetime.now()
    recalibrated_snp_matrix_mpileup = recalibrate_ddbb_vcf_intermediate(
        compare_snp_matrix_recal_intermediate, out_variant_dir, min_cov_low_freq=10)
    recalibrated_snp_matrix_mpileup.to_csv(
        compare_snp_matrix_recal_mpileup, sep="\t", index=False)

    after_recal = datetime.datetime.now()
    logger.debug("Done with recalibration vcf: %s" %
                 (after_recal - prior_recal))

    # Remove SNPs located within INDELs

    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_mpileup)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    # Extract all positions marked as complex
    complex_variants = extract_complex_list(out_variant_dir)
    logger.debug('Complex positions in all samples:\n{}'.format(
        (",".join([str(x) for x in complex_variants]))))

    # Clean all faulty positions and samples => Final table

    recalibrated_revised_INDEL_df = revised_df(compare_snp_matrix_INDEL_intermediate_df,
                                               path_compare,
                                               complex_pos=complex_variants,
                                               min_freq_include=0.8,
                                               min_threshold_discard_uncov_sample=args.min_threshold_discard_uncov_sample,
                                               min_threshold_discard_uncov_pos=args.min_threshold_discard_uncov_pos,
                                               min_threshold_discard_htz_sample=args.min_threshold_discard_htz_sample,
                                               min_threshold_discard_htz_pos=args.min_threshold_discard_htz_pos,
                                               min_threshold_discard_all_pos=args.min_threshold_discard_all_pos,
                                               min_threshold_discard_all_sample=args.min_threshold_discard_all_sample,
                                               remove_faulty=True,
                                               drop_samples=True,
                                               drop_positions=True,
                                               windows_size_discard=args.window)
    recalibrated_revised_INDEL_df.to_csv(
        compare_snp_matrix_recal, sep="\t", index=False)

    # Matrix to pairwise and mwk

    ddtb_compare(compare_snp_matrix_recal, distance=5)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE AUTOSNIPPY ANALYSIS#####" + END_FORMATTING + "\n")
Example #9
def main():
    """
    Create main function to capture code errors: https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    # ARGUMENTS

    def get_arguments():

        parser = argparse.ArgumentParser(
            prog='covidma.py',
            description=
            'Pipeline to call variants (SNVs) in any non-model organism. Specialised in SARS-CoV-2'
        )

        input_group = parser.add_argument_group('Input', 'Input parameters')

        input_group.add_argument(
            '-i',
            '--input',
            dest="input_dir",
            metavar="input_directory",
            type=str,
            required=True,
            help='REQUIRED. Input directory containing all fast[aq] files')
        input_group.add_argument('-r',
                                 '--reference',
                                 metavar="reference",
                                 type=str,
                                 required=True,
                                 help='REQUIRED. File to map against')
        input_group.add_argument(
            '-a',
            '--annotation',
            metavar="annotation",
            type=str,
            required=True,
            help='REQUIRED. gff3 file to annotate variants')
        input_group.add_argument('-s',
                                 '--sample',
                                 metavar="sample",
                                 type=str,
                                 required=False,
                                 help='Sample to identify further files')
        input_group.add_argument(
            '-L',
            '--sample_list',
            type=str,
            required=False,
            help='Sample names to analyse only in the file supplied')
        input_group.add_argument(
            '-p',
            '--primers',
            type=str,
            default=
            '/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed',
            required=False,
            help='Bed file including primers to trim')

        quality_group = parser.add_argument_group(
            'Quality parameters', 'parameters for different trimming conditions')

        quality_group.add_argument(
            '-c',
            '--coverage20',
            type=int,
            default=90,
            required=False,
            help=
            'Minimum percentage of coverage at 20x to classify as uncovered (Default 90)'
        )
        quality_group.add_argument('-n',
                                   '--min_snp',
                                   type=int,
                                   required=False,
                                   default=1,
                                   help='SNP number to pass quality threshold')

        output_group = parser.add_argument_group(
            'Output', 'Required parameter to output results')

        output_group.add_argument(
            '-o',
            '--output',
            type=str,
            required=True,
            help='REQUIRED. Output directory to extract all results')
        output_group.add_argument(
            '-C',
            '--noclean',
            required=False,
            action='store_false',
            help='Do not clean unwanted intermediate files')

        params_group = parser.add_argument_group(
            'Parameters', 'parameters for different stringency conditions')

        params_group.add_argument('-T',
                                  '--threads',
                                  type=int,
                                  dest="threads",
                                  required=False,
                                  default=16,
                                  help='Threads to use')
        params_group.add_argument('-M',
                                  '--memory',
                                  type=int,
                                  dest="memory",
                                  required=False,
                                  default=32,
                                  help='Max memory to use')

        annot_group = parser.add_argument_group(
            'Annotation', 'parameters for variant annotation')

        annot_group.add_argument('-B',
                                 '--annot_bed',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='bed file to annotate')
        annot_group.add_argument('-V',
                                 '--annot_vcf',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='vcf file to annotate')
        annot_group.add_argument('-A',
                                 '--annot_aa',
                                 type=str,
                                 default=[],
                                 required=False,
                                 action='append',
                                 help='amino acid file to annotate')
        annot_group.add_argument('-R',
                                 '--remove_bed',
                                 type=str,
                                 default=False,
                                 required=False,
                                 help='BED file with positions to remove')
        annot_group.add_argument(
            '--mash_database',
            type=str,
            required=False,
            default=False,
            help='MASH database with NCBI annotation containing all species')
        annot_group.add_argument('--snpeff_database',
                                 type=str,
                                 required=False,
                                 default='NC_045512.2',
                                 help='snpEFF annotation database')

        compare_group = parser.add_argument_group(
            'Compare', 'parameters for compare_snp')

        compare_group.add_argument('-S',
                                   '--only_snp',
                                   required=False,
                                   action='store_true',
                                   help='Use only SNPs (exclude INDELs) when comparing')

        arguments = parser.parse_args()

        return arguments

    args = get_arguments()

    ######################################################################
    #####################START PIPELINE###################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list is None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    #PREPARE REFERENCE FOR MAPPING + FAI + DICT #########
    #####################################################

    # picard_dictionary(args)
    samtools_faidx(args)

    #DECLARE FOLDERS CREATED IN PIPELINE ################
    #AND KEY FILES ######################################
    #####################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_qc_post_dir = os.path.join(out_qc_dir, "processed")  # subfolder
    out_trim_dir = os.path.join(output, "Trimmed")
    out_map_dir = os.path.join(output, "Bam")
    out_variant_dir = os.path.join(output, "Variants")
    out_variant_ivar_dir = os.path.join(out_variant_dir,
                                        "ivar_raw")  # subfolder
    out_filtered_ivar_dir = os.path.join(out_variant_dir,
                                         "ivar_filtered")  # subfolder
    out_consensus_dir = os.path.join(output, "Consensus")
    out_consensus_ivar_dir = os.path.join(out_consensus_dir,
                                          "ivar")  # subfolder

    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir,
                                          "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir,
                                          "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")

    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_pangolin_dir = os.path.join(out_annot_dir,
                                          "pangolin")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam"
            output_markdup_trimmed_file = os.path.join(
                out_map_dir, out_markdup_trimmed_name)

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" + new_sample_total +
                            ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            if not os.path.isfile(output_markdup_trimmed_file):

                args.r1_file = r1_file
                args.r2_file = r2_file

                ##############START PIPELINE#####################
                #################################################

                # INPUT ARGUMENTS
                ################
                check_file_exists(r1_file)
                check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(
                        output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)
                """
                TODO: Human filter
                """

                # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp
                ###################################################
                out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz"
                out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz"
                output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                       out_trim_name_r1)
                output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                       out_trim_name_r2)

                if os.path.isfile(output_trimming_file_r1) and os.path.isfile(
                        output_trimming_file_r2):
                    logger.info(YELLOW + DIM + output_trimming_file_r1 +
                                " EXISTS\nOmitting Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming sample " + sample +
                                END_FORMATTING)
                    fastp_trimming(r1_file,
                                   r2_file,
                                   sample,
                                   out_trim_dir,
                                   threads=args.threads,
                                   min_qual=20,
                                   window_size=10,
                                   min_len=35)

                # QUALITY CHECK in TRIMMED with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html"
                out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html"
                output_qc_processed_file_r1 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r1)
                output_qc_processed_file_r2 = os.path.join(
                    out_qc_post_dir, out_qc_pos_r2)

                if os.path.isfile(
                        output_qc_processed_file_r1) and os.path.isfile(
                            output_qc_processed_file_r2):
                    logger.info(YELLOW + DIM + output_qc_processed_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN +
                                "Checking quality in processed sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2)
                    fastqc_quality(output_trimming_file_r1,
                                   output_trimming_file_r2, out_qc_post_dir,
                                   args.threads)

                # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER SG
                #####################################################
                out_map_name = sample + ".rg.sorted.bam"
                output_map_file = os.path.join(out_map_dir, out_map_name)

                if os.path.isfile(output_map_file):
                    logger.info(YELLOW + DIM + output_map_file +
                                " EXISTS\nOmitting Mapping for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Mapping sample " + sample +
                                END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2 + "\nReference: " +
                                reference)
                    bwa_mapping(output_trimming_file_r1,
                                output_trimming_file_r2,
                                reference,
                                sample,
                                out_map_dir,
                                threads=args.threads)
                    sam_to_index_bam(sample,
                                     out_map_dir,
                                     output_trimming_file_r1,
                                     threads=args.threads)

                #MARK DUPLICATES WITH PICARDTOOLS ###################
                #####################################################
                out_markdup_name = sample + ".rg.markdup.sorted.bam"
                output_markdup_file = os.path.join(out_map_dir,
                                                   out_markdup_name)

                if os.path.isfile(output_markdup_file):
                    logger.info(YELLOW + DIM + output_markdup_file +
                                " EXISTS\nOmitting Duplicate Marking for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Marking Dupes in sample " + sample +
                                END_FORMATTING)
                    logger.info("Input Bam: " + output_map_file)
                    picard_markdup(output_map_file)

                #TRIM PRIMERS WITH ivar trim ########################
                #####################################################

                if os.path.isfile(output_markdup_trimmed_file):
                    logger.info(YELLOW + DIM + output_markdup_trimmed_file +
                                " EXISTS\nOmitting Primer Trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming primers in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_markdup_file)
                    ivar_trim(output_markdup_file,
                              args.primers,
                              sample,
                              min_length=30,
                              min_quality=20,
                              sliding_window_width=4)
            else:
                logger.info(
                    YELLOW + DIM + output_markdup_trimmed_file +
                    " EXISTS\nOmitting BAM mapping and BAM manipulation in sample "
                    + sample + END_FORMATTING)

            ########################END OF MAPPING AND BAM MANIPULATION#####################################################################
            ################################################################################################################################

            #VARIANT CALLING WITH ivar variants##################
            #####################################################
            check_create_dir(out_variant_dir)
            out_ivar_variant_name = sample + ".tsv"
            out_ivar_variant_file = os.path.join(out_variant_ivar_dir,
                                                 out_ivar_variant_name)

            if os.path.isfile(out_ivar_variant_file):
                logger.info(YELLOW + DIM + out_ivar_variant_file +
                            " EXISTS\nOmitting Variant call for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Calling variants with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_variants(reference,
                              output_markdup_trimmed_file,
                              out_variant_dir,
                              sample,
                              annotation,
                              min_quality=15,
                              min_frequency_threshold=0.01,
                              min_depth=1)

            #VARIANT FILTERING ##################################
            #####################################################
            check_create_dir(out_filtered_ivar_dir)
            out_ivar_filtered_file = os.path.join(out_filtered_ivar_dir,
                                                  out_ivar_variant_name)

            if os.path.isfile(out_ivar_filtered_file):
                logger.info(YELLOW + DIM + out_ivar_filtered_file +
                            " EXISTS\nOmitting variant filtering for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering variants in sample " + sample +
                            END_FORMATTING)
                filter_tsv_variants(out_ivar_variant_file,
                                    out_filtered_ivar_dir,
                                    min_frequency=0.7,
                                    min_total_depth=10,
                                    min_alt_dp=4,
                                    is_pass=True,
                                    only_snp=False)
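                # Judging by the parameter names (an inference, not documented
                # here): a variant is kept only if ALT frequency >= 0.7, total
                # depth >= 10, ALT depth >= 4 and the PASS flag is set; indels
                # are retained because only_snp=False.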

            #CREATE CONSENSUS with ivar consensus##################
            #######################################################
            check_create_dir(out_consensus_dir)
            check_create_dir(out_consensus_ivar_dir)
            out_ivar_consensus_name = sample + ".fa"
            out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir,
                                                   out_ivar_consensus_name)

            if os.path.isfile(out_ivar_consensus_file):
                logger.info(YELLOW + DIM + out_ivar_consensus_file +
                            " EXISTS\nOmitting consensus creation for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating consensus with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_consensus(output_markdup_trimmed_file,
                               out_consensus_ivar_dir,
                               sample,
                               min_quality=20,
                               min_frequency_threshold=0.8,
                               min_depth=20,
                               uncovered_character='N')
                logger.info(GREEN + "Replacing consensus header in " + sample +
                            END_FORMATTING)
                replace_consensus_header(out_ivar_consensus_file)
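                # ivar_consensus presumably follows ivar's documented usage, e.g.
                #   samtools mpileup -aa -A -d 0 -Q 0 <bam> | \
                #     ivar consensus -p <sample> -q 20 -t 0.8 -m 20 -n N
                # (-n N prints 'N' wherever depth falls below -m). This is an
                # assumption from the parameters, not the verified wrapper.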

            ########################CREATE STATS AND QUALITY FILTERS########################################################################
            ################################################################################################################################
            #CREATE Bamstats#######################################
            #######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting bamstats for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(output_markdup_trimmed_file,
                               out_stats_bamstats_dir,
                               sample,
                               threads=args.threads)
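                # create_bamstat likely wraps a samtools command such as
                #   samtools flagstat -@ <threads> <bam> > <sample>.bamstats
                # (an assumption inferred from the .bamstats extension).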

            #CREATE Coverage stats#################################
            #######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting coverage for sample " + sample +
                            END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(output_markdup_trimmed_file,
                                out_stats_coverage_dir, sample)
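                # create_coverage presumably produces per-position depth, e.g.
                #   samtools depth -aa <bam> > <sample>.cov
                # (an assumption from the .cov extension and its later use in
                # obtain_group_cov_stats).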

    # fastqc OUTPUT FORMAT FOR COMPARISON
    ######################################################
    logger.info(GREEN + "Creating summary report for quality result " +
                END_FORMATTING)
    # format_html_image(out_qc_dir)

    # coverage OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating summary report for coverage result " +
                END_FORMATTING)
    obtain_group_cov_stats(out_stats_coverage_dir, group_name)

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overal summary report " + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED
    ##############################################################################################################################
    logger.info(GREEN + "Removing low quality samples" + END_FORMATTING)
    # remove_low_quality(output, min_percentage_20x=args.coverage20,
    #                   min_hq_snp=args.min_snp, type_remove='Uncovered')

    #ANNOTATION WITH SNPEFF, USER INPUT AND PANGOLIN ####
    #####################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    check_create_dir(out_annot_pangolin_dir)
    # SNPEFF
    if args.snpeff_database:
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_filtered_ivar_dir):
            if root == out_filtered_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                      sample + ".annot")
                        if os.path.isfile(out_annot_file):
                            logger.info(
                                YELLOW + DIM + out_annot_file +
                                " EXISTS\nOmitting snpEff annotation for sample "
                                + sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Annotating sample with snpEff: " +
                                        sample + END_FORMATTING)
                            output_vcf = os.path.join(out_annot_snpeff_dir,
                                                      sample + '.vcf')
                            annotate_snpeff(filename,
                                            output_vcf,
                                            out_annot_file,
                                            database=args.snpeff_database)
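                            # annotate_snpeff presumably converts the ivar TSV
                            # to VCF and then runs something along the lines of
                            #   snpEff ann <database> <sample>.vcf > <sample>.annot
                            # (assumed from snpEff's documented CLI, not the
                            # project's verified wrapper).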
    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Omitting user annotation: no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_variant_ivar_dir):
            if root == out_variant_ivar_dir:  # CHANGE FOR RAW/FILTERED ANNOTATION
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User bed/vcf annotation in sample {}'.format(
                                sample))
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_user_dir,
                                                      sample + ".tsv")
                        user_annotation(filename,
                                        out_annot_file,
                                        vcf_files=args.annot_vcf,
                                        bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting user amino acid annotation: no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
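                            # The output already exists: re-annotate it in
                            # place so annotations from earlier runs accumulate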
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # PANGOLIN
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.threads) as executor:
        futures_pangolin = []

        for root, _, files in os.walk(out_consensus_ivar_dir):
            if root == out_consensus_ivar_dir:
                for name in files:
                    if name.endswith('.fa'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_pangolin_filename = sample + ".lineage.csv"
                        out_pangolin_file = os.path.join(
                            out_annot_pangolin_dir, out_pangolin_filename)
                        if os.path.isfile(out_pangolin_file):
                            logger.info(
                                YELLOW + DIM + out_pangolin_file +
                                " EXISTS\nOmitting lineage assignment for sample " +
                                sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Obtaining Lineage in sample " +
                                        sample + END_FORMATTING)
                            future = executor.submit(annotate_pangolin,
                                                     filename,
                                                     out_annot_pangolin_dir,
                                                     out_pangolin_filename,
                                                     threads=args.threads,
                                                     max_ambig=0.6)
                            futures_pangolin.append(future)
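                            # annotate_pangolin presumably shells out to
                            #   pangolin <sample>.fa --outdir <dir> --outfile <sample>.lineage.csv -t <threads> --max-ambig 0.6
                            # (assumed from pangolin's documented flags).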
                for future in concurrent.futures.as_completed(
                        futures_pangolin):
                    logger.info(future.result())
                    # annotate_pangolin(filename, out_annot_pangolin_dir,
                    #                out_pangolin_filename, threads=args.threads, max_ambig=0.6)

    # USER AA TO HTML
    annotated_samples = []
    logger.info('Adapting annotation to html in {}'.format(group_name))
    for root, _, files in os.walk(out_annot_user_aa_dir):
        if root == out_annot_user_aa_dir:
            for name in files:
                if name.endswith('.tsv'):
                    sample = name.split('.')[0]
                    annotated_samples.append(sample)
                    filename = os.path.join(root, name)
                    annotation_to_html(filename, sample)
    annotated_samples = [str(x) for x in annotated_samples]
    report_samples_html_all = report_samples_html.replace(
        'ALLSAMPLES', '","'.join(annotated_samples))  # NEW
    with open(os.path.join(out_annot_user_aa_dir, '00_all_samples.html'),
              'w+') as f:
        f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    # ddtb_add(out_filtered_ivar_dir, full_path_compare)
    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + ".revised_intermediate.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_ivar_dir,
        out_stats_coverage_dir,
        min_freq_discard=0.1,
        min_alt_dp=4,
        only_snp=args.only_snp)
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_intermediate)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)
    recalibrated_revised_df = revised_df(recalibrated_snp_matrix_intermediate,
                                         path_compare,
                                         min_freq_include=0.7,
                                         min_threshold_discard_sample=0.07,
                                         min_threshold_discard_position=0.4,
                                         remove_faulty=True,
                                         drop_samples=True,
                                         drop_positions=True)
    recalibrated_revised_df.to_csv(compare_snp_matrix_recal,
                                   sep="\t",
                                   index=False)
    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df,
        path_compare,
        min_freq_include=0.7,
        min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4,
        remove_faulty=True,
        drop_samples=True,
        drop_positions=True)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL,
                                         sep="\t",
                                         index=False)

    ddtb_compare(compare_snp_matrix_recal, distance=0)
    ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    #####################CONSENSUS WITH REFINED CALL######
    ######################################################
    logger.info(GREEN + "Creating refined consensus" + END_FORMATTING)
    create_consensus(reference, compare_snp_matrix_recal,
                     out_stats_coverage_dir, out_consensus_dir)

    logger.info("\n\n" + MAGENTA + BOLD +
                "#####END OF PIPELINE COVID MULTI ANALYSIS#####" +
                END_FORMATTING + "\n")
Example #10
0
args = get_arguments()

print("ARGUMENTS\n")
print(args)

check_reanalysis(args.output)

#Obtain all R1 and R2 from folder
r1, r2 = extract_read_list(args.input_dir)

#Check if there are samples to filter
sample_list_F = []
if args.sample_list is None:
    print("\nNo samples to filter")
    for r1_file, r2_file in zip(r1, r2):
        sample = extract_sample(r1_file, r2_file)
        sample_list_F.append(sample)
else:
    print("samples will be filtered")
    sample_list_F = file_to_list(args.sample_list)
print("\n%d samples will be analysed: %s" %
      (len(sample_list_F), ",".join(sample_list_F)))

######################################################################
#####################START PIPELINE###################################
######################################################################
output = os.path.abspath(args.output)
group_name = os.path.basename(output)

print("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " + group_name +
      END_FORMATTING)
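# extract_read_list and extract_sample are project helpers whose definitions
# are not shown in this snippet. A minimal sketch of the behaviour they appear
# to have, assuming the common _R1/_R2 fastq.gz naming convention (names and
# conventions here are assumptions, not the verified implementation):
import os
import re


def extract_read_list_sketch(input_dir):
    # Collect paired R1/R2 fastq.gz files, sorted so zip() pairs them correctly
    r1 = sorted(f for f in os.listdir(input_dir)
                if "_R1" in f and f.endswith(".fastq.gz"))
    r2 = sorted(f for f in os.listdir(input_dir)
                if "_R2" in f and f.endswith(".fastq.gz"))
    r1 = [os.path.join(input_dir, f) for f in r1]
    r2 = [os.path.join(input_dir, f) for f in r2]
    return r1, r2


def extract_sample_sketch(r1_file, r2_file):
    # Derive the sample name from the filename prefix shared by R1 and R2
    name1 = re.split(r"_R1", os.path.basename(r1_file))[0]
    name2 = re.split(r"_R2", os.path.basename(r2_file))[0]
    assert name1 == name2, "R1/R2 files do not belong to the same sample"
    return name1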