def zcat_concat_reads(args):
    """
    Decompress gzipped R1 and R2 into a combined .fastq named after the
    common sample name, placed in the same directory as R1
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)
    output_dir = ("/").join(r1.split("/")[:-1])
    output_name = sample + ".fastq"
    output_file = os.path.join(output_dir, output_name)

    cmd = ["zcat", r1, r2]
    # Decompress both read files and write the concatenated reads to the output file
    with open(output_file, "w+") as outfile:
        subprocess.run(cmd, stdout=outfile, stderr=subprocess.PIPE,
                       check=True, universal_newlines=True)

    return output_file
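# `extract_sample` is a project helper defined elsewhere in this repository.
# The sketch below is only a minimal, hypothetical illustration of the assumed
# behaviour (deriving the sample name shared by the R1/R2 file names), not the
# project's actual implementation.
def extract_sample_sketch(r1_file, r2_file):
    """Return the common prefix of both basenames, stripped of read-pair tags."""
    import re
    base_r1 = os.path.basename(r1_file)
    base_r2 = os.path.basename(r2_file)
    # Longest common prefix of the two basenames, e.g. "sample_R" for
    # "sample_R1.fastq.gz" / "sample_R2.fastq.gz"
    prefix = os.path.commonprefix([base_r1, base_r2])
    # Drop a trailing separator plus optional "R" ("_R", ".R", "-", ...)
    return re.sub(r'[._-]R?$', '', prefix)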
def bwa_mapping(args):
    """
    Map reads with bwa mem, storing the stdout output in a file:
    https://stackoverflow.com/questions/4965159/how-to-redirect-output-with-subprocess-in-python
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    cmd_index = ["bwa", "index", reference]
    execute_subprocess(cmd_index)

    cmd_map = [
        "bwa", "mem", "-t", str(args.threads), "-o", output_file,
        reference, r1, r2
    ]
    execute_subprocess(cmd_map)
def bbduk_trimming(args):
    """
    TODO: handle params
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    output_dir = obtain_output_dir(args, "Trimmed")

    in1_param = "in1=" + r1
    in2_param = "in2=" + r2

    sample = extract_sample(r1, r2)

    out1_param = "out1=" + output_dir + "/" + sample + "_R1.clean.fastq.gz"
    out2_param = "out2=" + output_dir + "/" + sample + "_R2.clean.fastq.gz"

    stats_param = "stats=" + output_dir + "/" + sample + "_trim.stats"
    adapter_path = "ref=" + get_bbduk_adapters()

    memory_param = "-Xmx" + str(args.memory) + "g"
    threads_param = "threads=" + str(args.threads)

    check_create_dir(output_dir)

    cmd = [
        "bbduk.sh", memory_param, in1_param, in2_param, out1_param,
        out2_param, adapter_path, "trimq=15", "qtrim=rl", "minlen=40",
        "ktrim=r", "k=21", "mink=11", "hammingdistance=2", threads_param,
        "tpe", "tbo", stats_param
    ]
    execute_subprocess(cmd)
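# For reference, with the defaults used above the assembled command is
# equivalent to the following shell invocation (paths abbreviated):
#   bbduk.sh -Xmx<memory>g in1=<R1> in2=<R2> \
#     out1=<sample>_R1.clean.fastq.gz out2=<sample>_R2.clean.fastq.gz \
#     ref=<adapters> trimq=15 qtrim=rl minlen=40 ktrim=r k=21 mink=11 \
#     hammingdistance=2 threads=<threads> tpe tbo stats=<sample>_trim.stats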
def bowtie2_mapping(args):
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    reference = os.path.abspath(args.reference)

    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    output_file = os.path.join(output_dir, sample_name)

    check_create_dir(output_dir)

    # bowtie2 index
    cmd_index = ["bowtie2-build", reference, reference]
    execute_subprocess(cmd_index)

    # bowtie2 map
    cmd_map = [
        "bowtie2", "-1", r1, "-2", r2, "-S", output_file, "-q",
        "--very-sensitive-local", "-p", str(args.threads), "-x", reference
    ]
    # Append "-a" only when extensive mapping is requested; passing an
    # empty-string argument to bowtie2 would make the command fail
    if args.extensive_mapping:
        cmd_map.append("-a")
    execute_subprocess(cmd_map)
def mash_screen(r1_file, out_dir, r2_file=False, winner=True, threads=16,
                mash_database="/home/laura/DATABASES/Mash/bacteria_mash.msh"):
    # https://mash.readthedocs.io/en/latest/index.html
    # MASH RefSeq database:
    # https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh
    # Example:
    # mash screen -w -p 4 ../refseq.genomes.k21s1000.msh 4_R1.fastq.gz 4_R2.fastq.gz > 4.winner.screen.tab
    # Output columns: identity, shared-hashes, median-multiplicity, p-value,
    # query-ID, query-comment

    if not os.path.isfile(mash_database):
        logger.info(RED + BOLD + "Mash database can't be found\n" +
                    END_FORMATTING + "You can download it by typing:\n"
                    "wget https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh")
        sys.exit(1)

    r1_file = os.path.abspath(r1_file)
    sample = extract_sample(r1_file, r2_file)

    check_create_dir(out_dir)
    species_output_name = sample + ".screen.tab"
    species_output_file = os.path.join(out_dir, species_output_name)

    cmd = ["mash", "screen", "-p", str(threads), mash_database, r1_file]

    if winner == True:
        cmd.insert(2, "-w")
    # Use both R1 and R2 instead of just R1 (R1 alone is faster)
    if r2_file:
        r2_file = os.path.abspath(r2_file)
        cmd.append(r2_file)

    prog = cmd[0]
    param = cmd[1:]

    try:
        with open(species_output_file, "w+") as outfile:
            # Run mash screen and save the report in the output file
            command = subprocess.run(cmd, stdout=outfile,
                                     stderr=subprocess.PIPE,
                                     universal_newlines=True)
        if command.returncode == 0:
            logger.info(GREEN + "Program %s successfully executed" % prog +
                        END_FORMATTING)
        else:
            logger.info(RED + BOLD + "Command %s FAILED\n" % prog +
                        END_FORMATTING + BOLD + "WITH PARAMETERS: " +
                        END_FORMATTING + " ".join(param) + "\n" + BOLD +
                        "EXIT-CODE: %d\n" % command.returncode +
                        "ERROR:\n" + END_FORMATTING + command.stderr)
    except OSError as e:
        sys.exit(RED + BOLD + "Failed to execute program '%s': %s"
                 % (prog, str(e)) + END_FORMATTING)
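# `execute_subprocess` is a project helper used throughout this module. The
# sketch below is a minimal, hypothetical version consistent with the inline
# success/failure handling in mash_screen() above; the repository's real
# implementation may differ in details (e.g. return value, colour codes).
def execute_subprocess_sketch(cmd):
    prog = cmd[0]
    param = cmd[1:]
    try:
        command = subprocess.run(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
        if command.returncode == 0:
            logger.info(GREEN + "Program %s successfully executed" % prog +
                        END_FORMATTING)
        else:
            logger.info(RED + BOLD + "Command %s FAILED\n" % prog +
                        END_FORMATTING + BOLD + "WITH PARAMETERS: " +
                        END_FORMATTING + " ".join(param) + "\n" +
                        "ERROR:\n" + END_FORMATTING + command.stderr)
    except OSError as e:
        sys.exit(RED + BOLD + "Failed to execute program '%s': %s"
                 % (prog, str(e)) + END_FORMATTING)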
def add_SG(args, input_bam, output_bg_sorted):
    """
    Add read-group (@RG) information to a BAM with Picard AddOrReplaceReadGroups.

    Example Illumina read names:
    @MN00227:45:000H255J3:1:11102:21214:1110 1:N:0:18
    @NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC
    @NS500454:27:HJJ32BGXX:1:11101:12392:1099 1:N:0:2
    @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<sample number | barcode1'+barcode2'>

    ID = Read group identifier {FLOWCELL_BARCODE}.{LANE}.{SAMPLE_BARCODE}
    PU = Platform Unit (optional)
    SM = Sample
    PL = Platform/technology used to produce the read (ILLUMINA, SOLID, LS454, HELICOS and PACBIO)
    LB = DNA preparation library identifier
    """
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)

    sample = extract_sample(r1, r2)

    # Derive the read-group fields from the first read name in R1
    with gzip.open(r1) as f:
        first_line = f.readline().strip().decode()

    first_line_list = first_line.split(":")

    rg_id = ".".join([first_line_list[2], first_line_list[3],
                      first_line_list[-1]])
    rg_pu = ".".join([first_line_list[2], first_line_list[3],
                      first_line_list[-1]])
    rg_sm = sample
    rg_pl = "ILLUMINA"
    rg_lb = "lib_" + sample

    rg_id_param = "RGID=" + rg_id
    rg_pu_param = "RGPU=" + rg_pu
    rg_sm_param = "RGSM=" + rg_sm
    rg_pl_param = "RGPL=" + rg_pl
    rg_lb_param = "RGLB=" + rg_lb

    picard_jar = get_picard_path()

    input_param = "INPUT=" + input_bam
    output_param = "OUTPUT=" + output_bg_sorted

    # java -jar picard.jar AddOrReplaceReadGroups \
    #   INPUT=reads.bam OUTPUT=reads_addRG.bam \
    #   RGID=H0164.2 \  # be sure to change from the default of 1
    #   RGLB=library1 RGPL=illumina RGPU=H0164ALXX140820.2 RGSM=sample1 \
    #   SORT_ORDER=coordinate CREATE_INDEX=true
    cmd = [
        "java", "-jar", picard_jar, "AddOrReplaceReadGroups", input_param,
        output_param, rg_id_param, rg_lb_param, rg_pl_param, rg_pu_param,
        rg_sm_param, "SORT_ORDER=coordinate"
    ]
    execute_subprocess(cmd)
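# Worked example of the read-group derivation above. For the header
# "@NS500454:48:HKG57BGXX:1:11101:17089:1032 2:N:0:TCCTGAGC+TCTTACGC",
# first_line.split(":") yields "HKG57BGXX" at index 2 (flowcell ID), "1" at
# index 3 (lane) and "TCCTGAGC+TCTTACGC" at index -1 (barcode), so both
# RGID and RGPU become "HKG57BGXX.1.TCCTGAGC+TCTTACGC".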
def sam_to_index_bam(args):
    r1 = os.path.abspath(args.r1_file)
    r2 = os.path.abspath(args.r2_file)
    sample = extract_sample(r1, r2)
    output_dir = obtain_output_dir(args, "Bam")
    sample_name = sample + ".sam"
    input_sam_path = os.path.join(output_dir, sample_name)

    input_name = (".").join(os.path.basename(input_sam_path).split(".")[:-1])
    output_bam_name = input_name + ".bam"
    output_bam_path = os.path.join(output_dir, output_bam_name)

    output_bg_sorted_name = input_name + ".rg.sorted.bam"
    output_bg_sorted_path = os.path.join(output_dir, output_bg_sorted_name)

    check_create_dir(output_dir)

    # sam to bam: samtools view -Sb $input_file -o $output_dir/$sample.bam
    cmd = [
        "samtools", "view", "-Sb", input_sam_path, "-o", output_bam_path,
        "--threads", str(args.threads)
    ]
    execute_subprocess(cmd)

    check_remove_file(input_sam_path)

    add_SG(args, output_bam_path, output_bg_sorted_path)

    check_remove_file(output_bam_path)
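# Hypothetical driver showing how the two helpers above chain together. It
# assumes `args` is the argparse.Namespace built by this script, with
# r1_file/r2_file/reference/threads set; bwa_mapping() writes Bam/<sample>.sam,
# which sam_to_index_bam() then converts to <sample>.rg.sorted.bam and removes.
def map_and_index_sketch(args):
    bwa_mapping(args)       # index reference and map reads into Bam/<sample>.sam
    sam_to_index_bam(args)  # sam -> bam -> add read groups -> sorted bam, clean up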
def main():
    """
    Create main function to capture code errors:
    https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """
    args = get_arguments()

    ######################################################################
    ##################### START PIPELINE #################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    # annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list == None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("Samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d samples will be analysed: %s" %
                (len(sample_list_F), ",".join(sample_list_F)))
    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    # DECLARE FOLDERS CREATED IN PIPELINE ################
    # AND KEY FILES ######################################
    ######################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_variant_dir = os.path.join(output, "Variants")
    out_core_dir = os.path.join(output, "Core")
    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")
    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder
    out_annot_blast_dir = os.path.join(out_annot_dir, "blast")  # subfolder
    out_species_dir = os.path.join(output, "Species")

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample
        if sample in sample_list_F:
            # VARIANT SAMPLE DIR
            sample_variant_dir = os.path.join(out_variant_dir, sample)

            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" + new_sample_total +
                            ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            output_final_vcf = os.path.join(sample_variant_dir,
                                            'snps.all.ivar.tsv')

            if not os.path.isfile(output_final_vcf):

                ############## START PIPELINE #####################
                ###################################################
                # INPUT ARGUMENTS
                ##################
                # check_file_exists(r1_file)
                # check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)

                """
                TODO: Human filter
                """

                # VARIANT CALLING WITH SNIPPY
                ###################################################
                output_vcf_sub = os.path.join(sample_variant_dir,
                                              "snps.subs.vcf")
                output_vcf = os.path.join(sample_variant_dir, "snps.vcf")

                if os.path.isfile(output_vcf_sub) and os.path.isfile(output_vcf):
                    logger.info(YELLOW + DIM + output_vcf +
                                " EXISTS\nOmitting variant calling in " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Calling variants with snippy in " +
                                sample + END_FORMATTING)
                    run_snippy(r1_file, r2_file, reference, out_variant_dir,
                               sample, threads=args.threads, minqual=10,
                               minfrac=0.1, mincov=1)
                    old_bam = os.path.join(sample_variant_dir, "snps.bam")
                    old_bai = os.path.join(sample_variant_dir, "snps.bam.bai")
                    new_bam = os.path.join(sample_variant_dir, sample + ".bam")
                    new_bai = os.path.join(sample_variant_dir,
                                           sample + ".bam.bai")
                    os.rename(old_bam, new_bam)
                    os.rename(old_bai, new_bai)

                # VARIANT FORMAT COMBINATION (REMOVE COMPLEX) ########
                ######################################################
                out_variant_indel_sample = os.path.join(sample_variant_dir,
                                                        "snps.indel.vcf")
                out_variant_all_sample = os.path.join(sample_variant_dir,
                                                      "snps.all.vcf")

                if os.path.isfile(out_variant_indel_sample):
                    logger.info(YELLOW + DIM + out_variant_indel_sample +
                                " EXISTS\nOmitting indel filtering in sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Filtering INDELs in " + sample +
                                END_FORMATTING)
                    extract_indels(output_vcf)

                if os.path.isfile(out_variant_all_sample):
                    logger.info(YELLOW + DIM + out_variant_all_sample +
                                " EXISTS\nOmitting vcf combination in sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Combining vcf in " + sample +
                                END_FORMATTING)
                    merge_vcf(output_vcf_sub, out_variant_indel_sample)

                # VARIANT FORMAT ADAPTATION TO IVAR ##################
                ######################################################
                out_variant_tsv_file = os.path.join(sample_variant_dir,
                                                    'snps.all.ivar.tsv')

                if os.path.isfile(out_variant_tsv_file):
                    logger.info(YELLOW + DIM + out_variant_tsv_file +
                                " EXISTS\nOmitting format adaptation for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Adapting variants format in sample " +
                                sample + END_FORMATTING)
                    prior = datetime.datetime.now()
                    vcf_to_ivar_tsv(out_variant_all_sample,
                                    out_variant_tsv_file)
                    after = datetime.datetime.now()
                    logger.debug("Done with function in: %s" % (after - prior))

            # SPECIES DETERMINATION
            ###################################################
            check_create_dir(out_species_dir)
            output_species = os.path.join(out_species_dir,
                                          sample + ".screen.tab")

            if os.path.isfile(output_species):
                logger.info(YELLOW + DIM + output_species +
                            " EXISTS\nOmitting species determination in " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Determining species in " + sample +
                            END_FORMATTING)
                mash_screen(r1_file, out_species_dir, r2_file=r2_file,
                            winner=True, threads=args.threads,
                            mash_database=args.mash_database)

            ######################## CREATE STATS AND QUALITY FILTERS ########
            ##################################################################

            # CREATE Bamstats ####################################
            ######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)
            bam_sample_file = os.path.join(sample_variant_dir,
                                           sample + ".bam")

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(bam_sample_file, out_stats_bamstats_dir,
                               sample, threads=args.threads)

            # CREATE Coverage ####################################
            ######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting coverage for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(bam_sample_file, out_stats_coverage_dir,
                                sample)

    # coverage OUTPUT SUMMARY
    ######################################################
    prior_recal = datetime.datetime.now()
    logger.info(GREEN + "Creating summary report for coverage result in group " +
                group_name + END_FORMATTING)
    obtain_group_cov_stats(out_stats_dir, group_name)
    after_recal = datetime.datetime.now()
    logger.info("Done with report for coverage: %s" %
                (after_recal - prior_recal))

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overall summary report in group " +
                group_name + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED ###################################
    ######################################################
    logger.info(GREEN + "Removing low quality samples in group " +
                group_name + END_FORMATTING)
    uncovered_samples = remove_low_quality(output,
                                           min_coverage=args.coverage20,
                                           min_hq_snp=args.min_snp,
                                           type_remove='Uncovered')

    if len(uncovered_samples) > 0:
        logger.info(GREEN + "Uncovered samples: " +
                    (",").join(uncovered_samples) + END_FORMATTING)
    else:
        logger.info(GREEN + "No uncovered samples found" + END_FORMATTING)

    # RUN SNIPPY CORE ####################################
    ######################################################
    if args.core:
        check_create_dir(out_core_dir)

        logger.info(GREEN + "Running snippy-core " + group_name +
                    END_FORMATTING)
        run_snippy_core(out_variant_dir, out_core_dir, reference)

        logger.info(GREEN + "Adapting core-snp to compare format " +
                    group_name + END_FORMATTING)
        core_vcf_file = os.path.join(out_core_dir, "core.vcf")
        core_vcf_file_adapted = os.path.join(out_core_dir,
                                             "core.vcf.adapted.tsv")
        core_vcf_file_removed = os.path.join(out_core_dir,
                                             "core.vcf.adapted.final.tsv")
        core_vcf_df_adapted = import_VCF4_core_to_compare(core_vcf_file)
        core_vcf_df_adapted.to_csv(core_vcf_file_adapted, sep="\t",
                                   index=False)

        logger.info(GREEN + "Obtaining clustered positions " + group_name +
                    END_FORMATTING)
        close_positions_list = extract_close_snps(core_vcf_df_adapted,
                                                  snps_in_10=1)

        logger.info(GREEN + "Obtaining uncovered positions " + group_name +
                    END_FORMATTING)
        uncovered_list = identify_uncovered(out_stats_coverage_dir,
                                            min_coverage=10, nocall_fr=0.5)

        logger.debug('Clustered positions in core SNP:\n{}'.format(
            ",".join([str(x) for x in close_positions_list])))
        logger.debug('Uncovered positions in all samples:\n{}'.format(
            ",".join([str(x) for x in uncovered_list])))

        to_remove_list = close_positions_list + uncovered_list

        remove_df = remove_position_from_compare(core_vcf_df_adapted,
                                                 to_remove_list)
        remove_df.to_csv(core_vcf_file_removed, sep="\t", index=False)

        ddtb_compare(core_vcf_file_removed, distance=10)

    # ANNOTATION WITH SNPEFF AND USER INPUT ##############
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)

    # SNPEFF
    if args.snpeff_database != False:
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.vcf':
                    sample = root.split('/')[-1]
                    filename = os.path.join(root, name)
                    chrom_filename = os.path.join(root,
                                                  'snps.all.chromosome.vcf')
                    out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                  sample + ".annot")

                    if os.path.isfile(out_annot_file):
                        logger.info(YELLOW + DIM + out_annot_file +
                                    " EXISTS\nOmitting snpEff annotation for sample " +
                                    sample + END_FORMATTING)
                    else:
                        logger.info(GREEN +
                                    "Annotating sample with snpEff: " +
                                    sample + END_FORMATTING)
                        rename_reference_snpeff(filename, chrom_filename)
                        annotate_snpeff(chrom_filename, out_annot_file,
                                        database=args.snpeff_database)
    else:
        logger.info(YELLOW + DIM +
                    "No snpEff database supplied, skipping annotation in group " +
                    group_name + END_FORMATTING)

    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Omitting user annotation, no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name == 'snps.all.ivar.tsv':
                    sample = root.split('/')[-1]
                    logger.info('User bed/vcf annotation in sample {}'.format(
                        sample))
                    filename = os.path.join(root, name)
                    out_annot_file = os.path.join(out_annot_user_dir,
                                                  sample + ".tsv")
                    user_annotation(filename, out_annot_file,
                                    vcf_files=args.annot_vcf,
                                    bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting user aa annotation, no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info('User aa annotation in sample {}'.format(
                            sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename, out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # USER FASTA ANNOTATION
    if not args.annot_fasta:
        logger.info(YELLOW + BOLD +
                    "Omitting user FASTA annotation, no FASTA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_blast_dir)
        for root, _, files in os.walk(out_variant_dir):
            for name in files:
                if name.endswith('.consensus.subs.fa'):
                    filename = os.path.join(root, name)
                    sample = root.split('/')[-1]
                    logger.info('User FASTA annotation in sample {}'.format(
                        sample))
                    for db in args.annot_fasta:
                        make_blast(filename, db, sample, out_annot_blast_dir,
                                   db_type="nucl", query_type="nucl",
                                   evalue=0.0001, threads=8)

    # USER AA TO HTML
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting user aa annotation to HTML, no AA files supplied" +
                    END_FORMATTING)
    else:
        annotated_samples = []
        logger.info('Adapting annotation to html in {}'.format(group_name))
        for root, _, files in os.walk(out_annot_user_aa_dir):
            if root == out_annot_user_aa_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        annotated_samples.append(sample)
                        filename = os.path.join(root, name)
                        annotation_to_html(filename, sample)
        annotated_samples = [str(x) for x in annotated_samples]
        report_samples_html_all = report_samples_html.replace(
            'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
        with open(os.path.join(out_annot_user_aa_dir,
                               '00_all_samples.html'), 'w+') as f:
            f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + \
        ".revised_intermediate.tsv"
    compare_snp_matrix_recal_mpileup = full_path_compare + \
        ".revised_intermediate_vcf.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    # Create intermediate
    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_dir, out_stats_coverage_dir, min_freq_discard=0.1,
        min_alt_dp=10, only_snp=False)

    # Remove SNPs from BED file (PE/PPE)
    if args.remove_bed:
        recalibrated_snp_matrix_intermediate = remove_bed_positions(
            recalibrated_snp_matrix_intermediate, args.remove_bed)

    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    # Recalibrate intermediate with VCF
    prior_recal = datetime.datetime.now()
    recalibrated_snp_matrix_mpileup = recalibrate_ddbb_vcf_intermediate(
        compare_snp_matrix_recal_intermediate, out_variant_dir,
        min_cov_low_freq=10)
    recalibrated_snp_matrix_mpileup.to_csv(compare_snp_matrix_recal_mpileup,
                                           sep="\t", index=False)
    after_recal = datetime.datetime.now()
    logger.debug("Done with recalibration vcf: %s" %
                 (after_recal - prior_recal))

    # Remove SNPs located within INDELs
    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_mpileup)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    # Extract all positions marked as complex
    complex_variants = extract_complex_list(out_variant_dir)
    logger.debug('Complex positions in all samples:\n{}'.format(
        ",".join([str(x) for x in complex_variants])))

    # Clean all faulty positions and samples => final table
    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df, path_compare,
        complex_pos=complex_variants, min_freq_include=0.8,
        min_threshold_discard_uncov_sample=args.min_threshold_discard_uncov_sample,
        min_threshold_discard_uncov_pos=args.min_threshold_discard_uncov_pos,
        min_threshold_discard_htz_sample=args.min_threshold_discard_htz_sample,
        min_threshold_discard_htz_pos=args.min_threshold_discard_htz_pos,
        min_threshold_discard_all_pos=args.min_threshold_discard_all_pos,
        min_threshold_discard_all_sample=args.min_threshold_discard_all_sample,
        remove_faulty=True, drop_samples=True, drop_positions=True,
        windows_size_discard=args.window)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_recal, sep="\t",
                                         index=False)

    # Matrix to pairwise and mwk
    ddtb_compare(compare_snp_matrix_recal, distance=5)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    logger.info("\n\n" + MAGENTA + BOLD +
                "##### END OF PIPELINE AUTOSNIPPY ANALYSIS #####" +
                END_FORMATTING + "\n")
def main():
    """
    Create main function to capture code errors:
    https://stackoverflow.com/questions/6234405/logging-uncaught-exceptions-in-python
    """

    # ARGUMENTS
    def get_arguments():
        parser = argparse.ArgumentParser(
            prog='covidma.py',
            description='Pipeline to call variants (SNVs) with any non-model '
                        'organism. Specialised in SARS-CoV-2')

        input_group = parser.add_argument_group('Input', 'Input parameters')

        input_group.add_argument('-i', '--input', dest="input_dir",
                                 metavar="input_directory", type=str,
                                 required=True,
                                 help='REQUIRED. Input directory containing all fast[aq] files')
        input_group.add_argument('-r', '--reference', metavar="reference",
                                 type=str, required=True,
                                 help='REQUIRED. File to map against')
        input_group.add_argument('-a', '--annotation', metavar="annotation",
                                 type=str, required=True,
                                 help='REQUIRED. gff3 file to annotate variants')
        input_group.add_argument('-s', '--sample', metavar="sample",
                                 type=str, required=False,
                                 help='Sample to identify further files')
        input_group.add_argument('-L', '--sample_list', type=str,
                                 required=False,
                                 help='Sample names to analyse only in the file supplied')
        input_group.add_argument('-p', '--primers', type=str,
                                 default='/home/laura/DATABASES/Anotacion/COVID/primers/nCoV-2019.bed',
                                 required=False,
                                 help='Bed file including primers to trim')

        quality_group = parser.add_argument_group(
            'Quality parameters', 'Parameters for different trimming conditions')

        quality_group.add_argument('-c', '--coverage20', type=int,
                                   default=90, required=False,
                                   help='Minimum percentage of coverage at 20x to classify as uncovered (Default 90)')
        quality_group.add_argument('-n', '--min_snp', type=int,
                                   required=False, default=1,
                                   help='SNP number to pass quality threshold')

        output_group = parser.add_argument_group(
            'Output', 'Required parameter to output results')

        output_group.add_argument('-o', '--output', type=str, required=True,
                                  help='REQUIRED. Output directory to extract all results')
        output_group.add_argument('-C', '--noclean', required=False,
                                  action='store_false',
                                  help='Do not clean unwanted files for standard execution')

        params_group = parser.add_argument_group(
            'Parameters', 'Parameters for different stringent conditions')

        params_group.add_argument('-T', '--threads', type=int,
                                  dest="threads", required=False, default=16,
                                  help='Threads to use')
        params_group.add_argument('-M', '--memory', type=int, dest="memory",
                                  required=False, default=32,
                                  help='Max memory to use')

        annot_group = parser.add_argument_group(
            'Annotation', 'Parameters for variant annotation')

        annot_group.add_argument('-B', '--annot_bed', type=str, default=[],
                                 required=False, action='append',
                                 help='BED file to annotate')
        annot_group.add_argument('-V', '--annot_vcf', type=str, default=[],
                                 required=False, action='append',
                                 help='VCF file to annotate')
        annot_group.add_argument('-A', '--annot_aa', type=str, default=[],
                                 required=False, action='append',
                                 help='Amino acid file to annotate')
        annot_group.add_argument('-R', '--remove_bed', type=str,
                                 default=False, required=False,
                                 help='BED file with positions to remove')
        annot_group.add_argument('--mash_database', type=str, required=False,
                                 default=False,
                                 help='MASH NCBI annotation containing all species database')
        annot_group.add_argument('--snpeff_database', type=str,
                                 required=False, default='NC_045512.2',
                                 help='snpEff annotation database')

        compare_group = parser.add_argument_group(
            'Compare', 'Parameters for compare_snp')

        compare_group.add_argument('-S', '--only_snp', required=False,
                                   action='store_true',
                                   help='Use only SNPs (exclude INDELs) while comparing')

        arguments = parser.parse_args()
        return arguments

    args = get_arguments()

    ######################################################################
    ##################### START PIPELINE #################################
    ######################################################################
    output = os.path.abspath(args.output)
    group_name = output.split("/")[-1]
    reference = os.path.abspath(args.reference)
    annotation = os.path.abspath(args.annotation)

    # LOGGING
    # Create log file with date and time
    right_now = str(datetime.datetime.now())
    right_now_full = "_".join(right_now.split(" "))
    log_filename = group_name + "_" + right_now_full + ".log"
    log_folder = os.path.join(output, 'Logs')
    check_create_dir(log_folder)
    log_full_path = os.path.join(log_folder, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter('%(asctime)s:%(message)s')

    file_handler = logging.FileHandler(log_full_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    # stream_handler.setFormatter(formatter)

    logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    logger.info("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " +
                group_name + END_FORMATTING)

    today = str(datetime.date.today())

    logger.info("ARGUMENTS:")
    logger.info(str(args))

    # Obtain all R1 and R2 from folder
    r1, r2 = extract_read_list(args.input_dir)

    # Check if there are samples to filter out
    sample_list_F = []
    if args.sample_list == None:
        logger.info("\n" + "No samples to filter")
        for r1_file, r2_file in zip(r1, r2):
            sample = extract_sample(r1_file, r2_file)
            sample_list_F.append(sample)
    else:
        logger.info("Samples will be filtered")
        sample_list_F = file_to_list(args.sample_list)

    new_samples = check_reanalysis(args.output, sample_list_F)

    logger.info("\n%d NEW samples will be analysed: %s" %
                (len(new_samples), ",".join(new_samples)))

    # PREPARE REFERENCE FOR MAPPING + FAI + DICT #########
    ######################################################
    # picard_dictionary(args)
    samtools_faidx(args)

    # DECLARE FOLDERS CREATED IN PIPELINE ################
    # AND KEY FILES ######################################
    ######################################################
    # Annotation related parameters
    # script_dir = os.path.dirname(os.path.realpath(__file__))

    # Output related
    out_qc_dir = os.path.join(output, "Quality")
    out_qc_pre_dir = os.path.join(out_qc_dir, "raw")  # subfolder
    out_qc_post_dir = os.path.join(out_qc_dir, "processed")  # subfolder
    out_trim_dir = os.path.join(output, "Trimmed")
    out_map_dir = os.path.join(output, "Bam")
    out_variant_dir = os.path.join(output, "Variants")
    out_variant_ivar_dir = os.path.join(out_variant_dir, "ivar_raw")  # subfolder
    out_filtered_ivar_dir = os.path.join(out_variant_dir, "ivar_filtered")  # subfolder
    out_consensus_dir = os.path.join(output, "Consensus")
    out_consensus_ivar_dir = os.path.join(out_consensus_dir, "ivar")  # subfolder
    out_stats_dir = os.path.join(output, "Stats")
    out_stats_bamstats_dir = os.path.join(out_stats_dir, "Bamstats")  # subfolder
    out_stats_coverage_dir = os.path.join(out_stats_dir, "Coverage")  # subfolder
    out_compare_dir = os.path.join(output, "Compare")
    out_annot_dir = os.path.join(output, "Annotation")
    out_annot_snpeff_dir = os.path.join(out_annot_dir, "snpeff")  # subfolder
    out_annot_pangolin_dir = os.path.join(out_annot_dir, "pangolin")  # subfolder
    out_annot_user_dir = os.path.join(out_annot_dir, "user")  # subfolder
    out_annot_user_aa_dir = os.path.join(out_annot_dir, "user_aa")  # subfolder

    new_sample_number = 0

    for r1_file, r2_file in zip(r1, r2):
        # Extract sample name
        sample = extract_sample(r1_file, r2_file)
        args.sample = sample

        if sample in sample_list_F:
            sample_number = str(sample_list_F.index(sample) + 1)
            sample_total = str(len(sample_list_F))

            out_markdup_trimmed_name = sample + ".rg.markdup.trimmed.sorted.bam"
            output_markdup_trimmed_file = os.path.join(
                out_map_dir, out_markdup_trimmed_name)

            if sample in new_samples:
                new_sample_number = str(int(new_sample_number) + 1)
                new_sample_total = str(len(new_samples))
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            " (" + new_sample_number + "/" + new_sample_total +
                            ")" + END_FORMATTING)
            else:
                logger.info("\n" + WHITE_BG + "STARTING SAMPLE: " + sample +
                            " (" + sample_number + "/" + sample_total + ")" +
                            END_FORMATTING)

            if not os.path.isfile(output_markdup_trimmed_file):

                args.r1_file = r1_file
                args.r2_file = r2_file

                ############## START PIPELINE #####################
                ###################################################
                # INPUT ARGUMENTS
                ##################
                check_file_exists(r1_file)
                check_file_exists(r2_file)

                args.output = os.path.abspath(args.output)
                check_create_dir(args.output)

                # QUALITY CHECK in RAW with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_raw_name_r1 = (".").join(
                    r1_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                out_qc_raw_name_r2 = (".").join(
                    r2_file.split('/')[-1].split('.')[0:-2]) + '_fastqc.html'
                output_qc_raw_file_r1 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r1)
                output_qc_raw_file_r2 = os.path.join(out_qc_pre_dir,
                                                     out_qc_raw_name_r2)

                if os.path.isfile(output_qc_raw_file_r1) and os.path.isfile(
                        output_qc_raw_file_r2):
                    logger.info(YELLOW + DIM + output_qc_raw_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN + "Checking quality in sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + r1_file + "\nR2: " + r2_file)
                    fastqc_quality(r1_file, r2_file, out_qc_pre_dir,
                                   args.threads)

                """
                TODO: Human filter
                """

                # QUALITY TRIMMING AND ADAPTER REMOVAL WITH fastp
                ###################################################
                out_trim_name_r1 = sample + ".trimmed_R1.fastq.gz"
                out_trim_name_r2 = sample + ".trimmed_R2.fastq.gz"
                output_trimming_file_r1 = os.path.join(out_trim_dir,
                                                       out_trim_name_r1)
                output_trimming_file_r2 = os.path.join(out_trim_dir,
                                                       out_trim_name_r2)

                if os.path.isfile(output_trimming_file_r1) and os.path.isfile(
                        output_trimming_file_r2):
                    logger.info(YELLOW + DIM + output_trimming_file_r1 +
                                " EXISTS\nOmitting trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming sample " + sample +
                                END_FORMATTING)
                    fastp_trimming(r1_file, r2_file, sample, out_trim_dir,
                                   threads=args.threads, min_qual=20,
                                   window_size=10, min_len=35)

                # QUALITY CHECK in TRIMMED with fastqc
                ######################################################
                check_create_dir(out_qc_dir)

                out_qc_pos_r1 = sample + ".trimmed_R1_fastqc.html"
                out_qc_pos_r2 = sample + ".trimmed_R2_fastqc.html"
                output_qc_precessed_file_r1 = os.path.join(out_qc_post_dir,
                                                           out_qc_pos_r1)
                output_qc_precessed_file_r2 = os.path.join(out_qc_post_dir,
                                                           out_qc_pos_r2)

                if os.path.isfile(output_qc_precessed_file_r1) and \
                        os.path.isfile(output_qc_precessed_file_r2):
                    logger.info(YELLOW + DIM + output_qc_precessed_file_r1 +
                                " EXISTS\nOmitting QC for sample " + sample +
                                END_FORMATTING)
                else:
                    logger.info(GREEN +
                                "Checking quality in processed sample " +
                                sample + END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2)
                    fastqc_quality(output_trimming_file_r1,
                                   output_trimming_file_r2, out_qc_post_dir,
                                   args.threads)

                # MAPPING WITH BWA - SAM TO SORTED BAM - ADD HEADER SG
                ######################################################
                out_map_name = sample + ".rg.sorted.bam"
                output_map_file = os.path.join(out_map_dir, out_map_name)

                if os.path.isfile(output_map_file):
                    logger.info(YELLOW + DIM + output_map_file +
                                " EXISTS\nOmitting mapping for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Mapping sample " + sample +
                                END_FORMATTING)
                    logger.info("R1: " + output_trimming_file_r1 + "\nR2: " +
                                output_trimming_file_r2 + "\nReference: " +
                                reference)
                    bwa_mapping(output_trimming_file_r1,
                                output_trimming_file_r2, reference, sample,
                                out_map_dir, threads=args.threads)
                    sam_to_index_bam(sample, out_map_dir,
                                     output_trimming_file_r1,
                                     threads=args.threads)

                # MARK DUPLICATES WITH PICARDTOOLS ###################
                ######################################################
                out_markdup_name = sample + ".rg.markdup.sorted.bam"
                output_markdup_file = os.path.join(out_map_dir,
                                                   out_markdup_name)

                if os.path.isfile(output_markdup_file):
                    logger.info(YELLOW + DIM + output_markdup_file +
                                " EXISTS\nOmitting duplicate marking for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Marking duplicates in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_map_file)
                    picard_markdup(output_map_file)

                # TRIM PRIMERS WITH ivar trim ########################
                ######################################################
                if os.path.isfile(output_markdup_trimmed_file):
                    logger.info(YELLOW + DIM + output_markdup_trimmed_file +
                                " EXISTS\nOmitting primer trimming for sample " +
                                sample + END_FORMATTING)
                else:
                    logger.info(GREEN + "Trimming primers in sample " +
                                sample + END_FORMATTING)
                    logger.info("Input Bam: " + output_markdup_file)
                    ivar_trim(output_markdup_file, args.primers, sample,
                              min_length=30, min_quality=20,
                              sliding_window_width=4)
            else:
                logger.info(YELLOW + DIM + output_markdup_trimmed_file +
                            " EXISTS\nOmitting BAM mapping and BAM manipulation in sample " +
                            sample + END_FORMATTING)

            ################ END OF MAPPING AND BAM MANIPULATION #############
            ##################################################################

            # VARIANT CALLING WITH ivar variants #################
            ######################################################
            check_create_dir(out_variant_dir)
            out_ivar_variant_name = sample + ".tsv"
            out_ivar_variant_file = os.path.join(out_variant_ivar_dir,
                                                 out_ivar_variant_name)

            if os.path.isfile(out_ivar_variant_file):
                logger.info(YELLOW + DIM + out_ivar_variant_file +
                            " EXISTS\nOmitting variant call for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Calling variants with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_variants(reference, output_markdup_trimmed_file,
                              out_variant_dir, sample, annotation,
                              min_quality=15, min_frequency_threshold=0.01,
                              min_depth=1)

            # VARIANT FILTERING ##################################
            ######################################################
            check_create_dir(out_filtered_ivar_dir)
            out_ivar_filtered_file = os.path.join(out_filtered_ivar_dir,
                                                  out_ivar_variant_name)

            if os.path.isfile(out_ivar_filtered_file):
                logger.info(YELLOW + DIM + out_ivar_filtered_file +
                            " EXISTS\nOmitting variant filtering for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Filtering variants in sample " + sample +
                            END_FORMATTING)
                filter_tsv_variants(out_ivar_variant_file,
                                    out_filtered_ivar_dir, min_frequency=0.7,
                                    min_total_depth=10, min_alt_dp=4,
                                    is_pass=True, only_snp=False)

            # CREATE CONSENSUS with ivar consensus ###############
            ######################################################
            check_create_dir(out_consensus_dir)
            check_create_dir(out_consensus_ivar_dir)
            out_ivar_consensus_name = sample + ".fa"
            out_ivar_consensus_file = os.path.join(out_consensus_ivar_dir,
                                                   out_ivar_consensus_name)

            if os.path.isfile(out_ivar_consensus_file):
                logger.info(YELLOW + DIM + out_ivar_consensus_file +
                            " EXISTS\nOmitting consensus for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN +
                            "Creating consensus with ivar in sample " +
                            sample + END_FORMATTING)
                ivar_consensus(output_markdup_trimmed_file,
                               out_consensus_ivar_dir, sample,
                               min_quality=20, min_frequency_threshold=0.8,
                               min_depth=20, uncovered_character='N')
                logger.info(GREEN + "Replacing consensus header in " +
                            sample + END_FORMATTING)
                replace_consensus_header(out_ivar_consensus_file)

            ######################## CREATE STATS AND QUALITY FILTERS ########
            ##################################################################

            # CREATE Bamstats ####################################
            ######################################################
            check_create_dir(out_stats_dir)
            check_create_dir(out_stats_bamstats_dir)
            out_bamstats_name = sample + ".bamstats"
            out_bamstats_file = os.path.join(out_stats_bamstats_dir,
                                             out_bamstats_name)

            if os.path.isfile(out_bamstats_file):
                logger.info(YELLOW + DIM + out_bamstats_file +
                            " EXISTS\nOmitting Bamstats for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating bamstats in sample " + sample +
                            END_FORMATTING)
                create_bamstat(output_markdup_trimmed_file,
                               out_stats_bamstats_dir, sample,
                               threads=args.threads)

            # CREATE Coverage ####################################
            ######################################################
            check_create_dir(out_stats_coverage_dir)
            out_coverage_name = sample + ".cov"
            out_coverage_file = os.path.join(out_stats_coverage_dir,
                                             out_coverage_name)

            if os.path.isfile(out_coverage_file):
                logger.info(YELLOW + DIM + out_coverage_file +
                            " EXISTS\nOmitting coverage for sample " +
                            sample + END_FORMATTING)
            else:
                logger.info(GREEN + "Creating coverage in sample " + sample +
                            END_FORMATTING)
                create_coverage(output_markdup_trimmed_file,
                                out_stats_coverage_dir, sample)

    # fastqc OUTPUT FORMAT FOR COMPARISON
    ######################################################
    logger.info(GREEN + "Creating summary report for quality result " +
                END_FORMATTING)
    # format_html_image(out_qc_dir)

    # coverage OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating summary report for coverage result " +
                END_FORMATTING)
    obtain_group_cov_stats(out_stats_coverage_dir, group_name)

    # READS and VARIANTS OUTPUT SUMMARY
    ######################################################
    logger.info(GREEN + "Creating overall summary report " + END_FORMATTING)
    obtain_overal_stats(output, group_name)

    # REMOVE UNCOVERED ###################################
    ######################################################
    logger.info(GREEN + "Removing low quality samples" + END_FORMATTING)
    # remove_low_quality(output, min_percentage_20x=args.coverage20,
    #                    min_hq_snp=args.min_snp, type_remove='Uncovered')

    # ANNOTATION WITH SNPEFF, USER INPUT AND PANGOLIN ####
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING ANNOTATION IN GROUP: " +
                group_name + END_FORMATTING + "\n")
    check_create_dir(out_annot_dir)
    check_create_dir(out_annot_snpeff_dir)
    check_create_dir(out_annot_pangolin_dir)

    # SNPEFF
    if args.snpeff_database != False:
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_filtered_ivar_dir):
            # CHANGE FOR RAW/FILTERED ANNOTATION
            if root == out_filtered_ivar_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_snpeff_dir,
                                                      sample + ".annot")

                        if os.path.isfile(out_annot_file):
                            logger.info(YELLOW + DIM + out_annot_file +
                                        " EXISTS\nOmitting snpEff annotation for sample " +
                                        sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Annotating sample with snpEff: " +
                                        sample + END_FORMATTING)
                            output_vcf = os.path.join(out_annot_snpeff_dir,
                                                      sample + '.vcf')
                            annotate_snpeff(filename, output_vcf,
                                            out_annot_file,
                                            database=args.snpeff_database)

    # USER DEFINED
    if not args.annot_bed and not args.annot_vcf:
        logger.info(YELLOW + BOLD +
                    "Omitting user annotation, no BED or VCF files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_dir)
        # CHANGE FOR RAW/FILTERED ANNOTATION
        for root, _, files in os.walk(out_variant_ivar_dir):
            # CHANGE FOR RAW/FILTERED ANNOTATION
            if root == out_variant_ivar_dir:
                for name in files:
                    if name.endswith('.tsv'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User bed/vcf annotation in sample {}'.format(
                                sample))
                        filename = os.path.join(root, name)
                        out_annot_file = os.path.join(out_annot_user_dir,
                                                      sample + ".tsv")
                        user_annotation(filename, out_annot_file,
                                        vcf_files=args.annot_vcf,
                                        bed_files=args.annot_bed)

    # USER AA DEFINED
    if not args.annot_aa:
        logger.info(YELLOW + BOLD +
                    "Omitting user aa annotation, no AA files supplied" +
                    END_FORMATTING)
    else:
        check_create_dir(out_annot_user_aa_dir)
        for root, _, files in os.walk(out_annot_snpeff_dir):
            if root == out_annot_snpeff_dir:
                for name in files:
                    if name.endswith('.annot'):
                        sample = name.split('.')[0]
                        logger.info(
                            'User aa annotation in sample {}'.format(sample))
                        filename = os.path.join(root, name)
                        out_annot_aa_file = os.path.join(
                            out_annot_user_aa_dir, sample + ".tsv")
                        if os.path.isfile(out_annot_aa_file):
                            user_annotation_aa(out_annot_aa_file,
                                               out_annot_aa_file,
                                               aa_files=args.annot_aa)
                        else:
                            user_annotation_aa(filename, out_annot_aa_file,
                                               aa_files=args.annot_aa)

    # PANGOLIN
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=args.threads) as executor:
        futures_pangolin = []

        for root, _, files in os.walk(out_consensus_ivar_dir):
            if root == out_consensus_ivar_dir:
                for name in files:
                    if name.endswith('.fa'):
                        sample = name.split('.')[0]
                        filename = os.path.join(root, name)
                        out_pangolin_filename = sample + ".lineage.csv"
                        out_pangolin_file = os.path.join(
                            out_annot_pangolin_dir, out_pangolin_filename)

                        if os.path.isfile(out_pangolin_file):
                            logger.info(YELLOW + DIM + out_pangolin_file +
                                        " EXISTS\nOmitting lineage for sample " +
                                        sample + END_FORMATTING)
                        else:
                            logger.info(GREEN +
                                        "Obtaining lineage in sample " +
                                        sample + END_FORMATTING)
                            future = executor.submit(
                                annotate_pangolin, filename,
                                out_annot_pangolin_dir,
                                out_pangolin_filename,
                                threads=args.threads, max_ambig=0.6)
                            futures_pangolin.append(future)

        for future in concurrent.futures.as_completed(futures_pangolin):
            logger.info(future.result())
        # annotate_pangolin(filename, out_annot_pangolin_dir,
        #                   out_pangolin_filename, threads=args.threads,
        #                   max_ambig=0.6)

    # USER AA TO HTML
    annotated_samples = []
    logger.info('Adapting annotation to html in {}'.format(group_name))
    for root, _, files in os.walk(out_annot_user_aa_dir):
        if root == out_annot_user_aa_dir:
            for name in files:
                if name.endswith('.tsv'):
                    sample = name.split('.')[0]
                    annotated_samples.append(sample)
                    filename = os.path.join(root, name)
                    annotation_to_html(filename, sample)

    annotated_samples = [str(x) for x in annotated_samples]
    report_samples_html_all = report_samples_html.replace(
        'ALLSAMPLES', ('","').join(annotated_samples))  # NEW
    with open(os.path.join(out_annot_user_aa_dir,
                           '00_all_samples.html'), 'w+') as f:
        f.write(report_samples_html_all)

    # SNP COMPARISON using tsv variant files
    ######################################################
    logger.info("\n\n" + BLUE + BOLD + "STARTING COMPARISON IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    check_create_dir(out_compare_dir)
    folder_compare = today + "_" + group_name
    path_compare = os.path.join(out_compare_dir, folder_compare)
    check_create_dir(path_compare)
    full_path_compare = os.path.join(path_compare, group_name)

    # ddtb_add(out_filtered_ivar_dir, full_path_compare)
    compare_snp_matrix_recal = full_path_compare + ".revised.final.tsv"
    compare_snp_matrix_INDEL = full_path_compare + ".revised_INDEL.final.tsv"
    compare_snp_matrix_recal_intermediate = full_path_compare + \
        ".revised_intermediate.tsv"
    compare_snp_matrix_INDEL_intermediate = full_path_compare + \
        ".revised_INDEL_intermediate.tsv"

    recalibrated_snp_matrix_intermediate = ddbb_create_intermediate(
        out_variant_ivar_dir, out_stats_coverage_dir, min_freq_discard=0.1,
        min_alt_dp=4, only_snp=args.only_snp)
    recalibrated_snp_matrix_intermediate.to_csv(
        compare_snp_matrix_recal_intermediate, sep="\t", index=False)

    compare_snp_matrix_INDEL_intermediate_df = remove_position_range(
        recalibrated_snp_matrix_intermediate)
    compare_snp_matrix_INDEL_intermediate_df.to_csv(
        compare_snp_matrix_INDEL_intermediate, sep="\t", index=False)

    recalibrated_revised_df = revised_df(
        recalibrated_snp_matrix_intermediate, path_compare,
        min_freq_include=0.7, min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4, remove_faulty=True,
        drop_samples=True, drop_positions=True)
    recalibrated_revised_df.to_csv(compare_snp_matrix_recal, sep="\t",
                                   index=False)

    recalibrated_revised_INDEL_df = revised_df(
        compare_snp_matrix_INDEL_intermediate_df, path_compare,
        min_freq_include=0.7, min_threshold_discard_sample=0.07,
        min_threshold_discard_position=0.4, remove_faulty=True,
        drop_samples=True, drop_positions=True)
    recalibrated_revised_INDEL_df.to_csv(compare_snp_matrix_INDEL, sep="\t",
                                         index=False)

    ddtb_compare(compare_snp_matrix_recal, distance=0)
    ddtb_compare(compare_snp_matrix_INDEL, distance=0, indel=True)

    logger.info("\n\n" + MAGENTA + BOLD + "COMPARING FINISHED IN GROUP: " +
                group_name + END_FORMATTING + "\n")

    ##################### CONSENSUS WITH REFINED CALL ####
    ######################################################
    logger.info(GREEN + "Creating refined consensus" + END_FORMATTING)
    create_consensus(reference, compare_snp_matrix_recal,
                     out_stats_coverage_dir, out_consensus_dir)

    logger.info("\n\n" + MAGENTA + BOLD +
                "##### END OF PIPELINE COVID MULTI ANALYSIS #####" +
                END_FORMATTING + "\n")
args = get_arguments()

print("ARGUMENTS\n")
print(args)

check_reanalysis(args.output)

# Obtain all R1 and R2 from folder
r1, r2 = extract_read_list(args.input_dir)

# Check if there are samples to filter
sample_list_F = []
if args.sample_list == None:
    print("\n" + "No samples to filter")
    for r1_file, r2_file in zip(r1, r2):
        sample = extract_sample(r1_file, r2_file)
        sample_list_F.append(sample)
else:
    print("Samples will be filtered")
    sample_list_F = file_to_list(args.sample_list)

print("\n%d samples will be analysed: %s" %
      (len(sample_list_F), ",".join(sample_list_F)))

######################################################################
##################### START PIPELINE #################################
######################################################################
output = os.path.abspath(args.output)
group_name = output.split("/")[-1]

print("\n\n" + BLUE + BOLD + "STARTING PIPELINE IN GROUP: " + group_name +
      END_FORMATTING)