import os
import pathlib
import sys

# py3_fasta_iter, fasta_to_dct and try_except_continue_on_fail are this project's own
# helpers, defined in its other modules (a sketch of the two FASTA helpers' assumed
# behaviour follows this function).


def main(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name):
    print("aligning consensus sequence from all samples\n")
    tmp_file = pathlib.Path(project_path, "temp_aligned_file.fasta")
    mafft_cmd = f"mafft --globalpair --maxiterate 1000 {str(all_samples_consens_seqs)} > {str(tmp_file)}"
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    print(mafft_cmd)
    run = try_except_continue_on_fail(mafft_cmd)
    if not run:
        print(f"could not align {all_samples_consens_seqs}")
        sys.exit("exiting")
    else:
        # replace the unaligned master file with the aligned version
        all_samples_consens_seqs.unlink()
        os.rename(str(tmp_file), str(all_samples_consens_seqs))

    # calculate genome coverage per sample, relative to the reference
    ref_length = len(ref_seq)
    coverage_outfile = pathlib.Path(project_path, f"{run_name}_genome_coverage.csv")
    all_consensus_d = fasta_to_dct(all_samples_consens_seqs)
    # the reference was written to the master file first, so the first record is the aligned reference
    ref_lookup_name = list(all_consensus_d.keys())[0]
    aligned_ref = all_consensus_d[ref_lookup_name]
    del all_consensus_d[ref_lookup_name]
    with open(coverage_outfile, 'w') as fh:
        fh.write("sample_name,genome_coverage\n")
        for v_name, v_seq in all_consensus_d.items():
            # count positions that are called (not gap, not N) where the reference has a base
            seq_coverage = 0
            for i, base in enumerate(v_seq.upper()):
                if base != "-" and base != "N" and aligned_ref[i] != "-":
                    seq_coverage += 1
            percent_coverage = round((seq_coverage / ref_length) * 100, 2)
            fh.write(f"{v_name},{percent_coverage}\n")
    print("done")
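# The two FASTA helpers used above are defined elsewhere in this project. A minimal
# sketch of their assumed behaviour, inferred from how they are called here (names
# suffixed "_sketch" are hypothetical, not the project's actual implementations):
# py3_fasta_iter yields (header, sequence) tuples and fasta_to_dct wraps that in a dict.

def py3_fasta_iter_sketch(fasta_file):
    """Yield (header, sequence) tuples from a FASTA file (hypothetical helper)."""
    header, seq_parts = None, []
    with open(fasta_file) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                if header is not None:
                    yield header, "".join(seq_parts)
                header, seq_parts = line[1:], []
            elif line:
                seq_parts.append(line)
        if header is not None:
            yield header, "".join(seq_parts)


def fasta_to_dct_sketch(fasta_file):
    """Return {header: sequence} for all records in a FASTA file (hypothetical helper)."""
    return dict(py3_fasta_iter_sketch(fasta_file))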
import csv
import os
import pathlib
import sys

# py3_fasta_iter, fasta_to_dct and try_except_continue_on_fail are this project's own
# helpers (a sketch of the assumed shell-command wrappers follows this function).


def main(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name):
    print("aligning consensus sequence from all samples\n")
    tmp_file = pathlib.Path(project_path, "temp_aligned_file.fasta")
    mafft_cmd = f"mafft --thread -1 --auto {str(all_samples_consens_seqs)} > {str(tmp_file)}"
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    print(mafft_cmd)
    run = try_except_continue_on_fail(mafft_cmd)
    if not run:
        print(f"could not align {all_samples_consens_seqs}")
        sys.exit("exiting")
    else:
        # replace the unaligned master file with the aligned version
        all_samples_consens_seqs.unlink()
        os.rename(str(tmp_file), str(all_samples_consens_seqs))

    # calculate & collect seq stats
    ref_length = len(ref_seq)
    seqstats_outfile = pathlib.Path(project_path, f"{run_name}_seqstats.csv")
    all_consensus_d = fasta_to_dct(all_samples_consens_seqs)
    # the reference was written to the master file first, so the first record is the aligned reference
    ref_lookup_name = list(all_consensus_d.keys())[0]
    aligned_ref = all_consensus_d[ref_lookup_name]
    sample_folder = pathlib.Path(project_path, "samples")
    del all_consensus_d[ref_lookup_name]
    with open(seqstats_outfile, 'w') as fh:
        fh.write("sample_name,genome_coverage,mean_depth\n")
        for v_name, v_seq in all_consensus_d.items():
            print(v_name)
            # strip the consensus-name suffix to recover the sample folder name
            seqname = v_name[0:-11]
            depth_csv = pathlib.Path(sample_folder, seqname, f"{seqname}_depth.csv")
            mean_depth = ""
            with open(depth_csv) as depth_fh:
                for _key, value in csv.reader(depth_fh):
                    mean_depth = value  # keep the last value in the file
            # count positions that are called (not gap, not N) where the reference has a base
            seq_coverage = 0
            for i, base in enumerate(v_seq.upper()):
                if base != "-" and base != "N" and aligned_ref[i] != "-":
                    seq_coverage += 1
            percent_coverage = round((seq_coverage / ref_length) * 100, 2)
            fh.write(f"{v_name},{percent_coverage},{mean_depth}\n")
    print("done")
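# try_except_continue_on_fail and try_except_exit_on_fail are the project's shell-command
# wrappers. A minimal sketch of their assumed behaviour (run a command through a shell and
# report success); the real helpers may also log output, so treat these as illustrative:

import subprocess
import sys


def try_except_continue_on_fail_sketch(cmd):
    """Run a shell command; return True on success, False on failure (hypothetical)."""
    try:
        subprocess.run(cmd, shell=True, check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"command failed: {e}")
        return False


def try_except_exit_on_fail_sketch(cmd):
    """Run a shell command; exit the pipeline if it fails (hypothetical)."""
    if not try_except_continue_on_fail_sketch(cmd):
        sys.exit(f"exiting: command failed: {cmd}")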
import datetime
import os
import pathlib
import re
import shutil
import sys
import time

import pandas as pd

# gupppy_basecall, guppy_demultiplex, filter_length, cat_sample_names, sample_summary,
# py3_fasta_iter, fasta_to_dct, try_except_exit_on_fail and try_except_continue_on_fail
# are this project's own helpers (a sketch of filter_length's assumed behaviour follows
# this function).


def main(project_path, reference, ref_start, ref_end, min_len, max_len, min_depth, run_step,
         rerun_step_only, basecall_mode, msa_cons, artic, cpu_cores, gpu_cores, gpu_buffers,
         use_gaps, use_bwa, guppy_path, real_time):
    # set threads
    threads = cpu_cores

    # set the primer_scheme directory
    script_folder = pathlib.Path(__file__).absolute().parent
    primer_scheme_dir = pathlib.Path(script_folder, "primer-schemes")

    # get folder paths
    project_path = pathlib.Path(project_path).absolute()
    plot_folder = pathlib.Path(project_path, "seq_depth_plots")
    if os.path.exists(plot_folder):
        shutil.rmtree(plot_folder)
    plot_folder.mkdir(mode=0o777, parents=True, exist_ok=True)
    run_name = project_path.parts[-1]
    fast5_dir = pathlib.Path(project_path, "fast5")
    fastq_dir = pathlib.Path(project_path, "fastq")
    sample_names = pathlib.Path(project_path, "sample_names.csv")
    if not sample_names.is_file():
        sys.exit("Could not find sample_names.csv in project folder")
    demultiplexed_folder = pathlib.Path(project_path, "demultiplexed")
    sample_folder = pathlib.Path(project_path, "samples")
    print(sample_folder)
    time_stamp = str('{:%Y-%m-%d_%H_%M}'.format(datetime.datetime.now()))
    log_file = pathlib.Path(project_path, f"{time_stamp}_{run_name}_log_file.txt")
    with open(log_file, "w") as handle:
        handle.write(f"# start of pipeline run for project: {run_name}\n")
    now = datetime.datetime.now()
    date_time = now.strftime("%d/%m/%Y, %H:%M:%S")
    print(f"\nstart time = {date_time}\n\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nstart time = {date_time}\n\n")

    # set dir to project dir so that output is written in correct place by external tools
    os.chdir(project_path)

    # set the reference genome
    reference_scheme = {
        "ChikECSA_V1_800": pathlib.Path(primer_scheme_dir, "ChikECSA800", "V1", "ChikECSA800.reference.fasta"),
        "ChikAsian_V1_400": pathlib.Path(primer_scheme_dir, "ChikAsian400", "V1", "ChikAsian400.reference.fasta"),
        "ZikaAsian_V1_400": pathlib.Path(primer_scheme_dir, "ZikaAsian400", "V1", "ZikaAsian400.reference.fasta"),
        "SARS2_V1_800": pathlib.Path(primer_scheme_dir, "SARS2_800", "V1", "SARS2_800.reference.fasta"),
        "SARS2_V1_400": pathlib.Path(primer_scheme_dir, "SARS2_400", "V1", "SARS2_400.reference.fasta"),
        "DENV1_V1_400": pathlib.Path(primer_scheme_dir, "DENV1_400", "V1", "DENV1_400.reference.fasta"),
        "DENV1_V2_400": pathlib.Path(primer_scheme_dir, "DENV1_400", "V2", "DENV1_400.reference.fasta"),
        "DENV2_V1_400": pathlib.Path(primer_scheme_dir, "DENV2_400", "V1", "DENV2_400.reference.fasta"),
    }
    chosen_ref_scheme = str(reference_scheme[reference])
    chosen_ref_scheme_bed_file = chosen_ref_scheme.replace(".reference.fasta", ".scheme.bed")
    scheme_name = reference.replace("_V1_", "_")
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    ref_name = ref_name.split()[0]
    if not ref_start or ref_start == 0:
        ref_start = 1
    if not ref_end or ref_end > len(ref_seq):
        ref_end = len(ref_seq)
    print(f"\nReference is {chosen_ref_scheme}\n")
    print(f"\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nReference is {chosen_ref_scheme}\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")

    if run_step == 0:
        run = gupppy_basecall(fast5_dir, guppy_path, fastq_dir, gpu_cores, basecall_mode,
                              real_time, reference, script_folder)
        faildir = pathlib.Path(fastq_dir, "fail")
        shutil.rmtree(faildir)
        if run and not rerun_step_only:
            run_step = 1
        elif run and rerun_step_only:
            sys.exit("Run step only completed, exiting")
        else:
            sys.exit("Basecalling failed")

    if run_step == 1:
        # demultiplex
        print(f"\nrunning: demultiplexing")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: demultiplexing")
        if not list(fastq_dir.glob("*.fastq*")):
            fastq_dir = pathlib.Path(fastq_dir, "pass")
            if not list(fastq_dir.glob("*.fastq*")):
                print(f"No fastq files found in {str(fastq_dir)} or {str(fastq_dir.parent)}")
                sys.exit("fastq files not found")
        run = guppy_demultiplex(fastq_dir, guppy_path, demultiplexed_folder, threads, gpu_buffers, gpu_cores)
        if run and not rerun_step_only:
            run_step = 2
        elif run and rerun_step_only:
            sys.exit("demultiplexing completed, exiting")
        else:
            sys.exit("demultiplexing failed")

    if run_step == 2:
        # length-filter each barcode's reads and rename the files to run_name_barcodeNN.fastq
        pre_existing_files = list(demultiplexed_folder.glob("*.fastq"))
        if pre_existing_files:
            print("Found existing files in top level of demultiplex folder.\nThese files will be deleted")
            for file in pre_existing_files:
                os.unlink(str(file))
        for folder in demultiplexed_folder.glob("barcode*"):
            search = list(pathlib.Path(folder).glob("*.fastq"))
            if not search:
                print(f"no files in folder\nskipping folder: {folder}\n")
                continue
            if len(search) > 1:
                barcode_number = pathlib.Path(search[0]).parent.parts[-1]
                concat_outfile = f"cat_barcode_{barcode_number}.fastq"
                cat_cmd = "cat "
                for file in search:
                    cat_cmd += f"{str(file)} "
                cat_cmd += f" > {concat_outfile}"
                try_except_exit_on_fail(cat_cmd)
                new_name = pathlib.Path(demultiplexed_folder, f"{run_name}_{barcode_number}.fastq")
                filtered_file = filter_length(concat_outfile, new_name, max_len, min_len)
                os.unlink(str(concat_outfile))
                if not filtered_file:
                    print(f"no sequences in file after length filtering for {concat_outfile}\n")
            else:
                file = pathlib.Path(search[0])
                barcode_number = file.parent.parts[-1]
                new_name = pathlib.Path(demultiplexed_folder, f"{run_name}_{barcode_number}.fastq")
                filtered_file = filter_length(file, new_name, max_len, min_len)
                if not filtered_file:
                    print(f"no sequences in file after length filtering for {file}\n")
        if not rerun_step_only:
            run_step = 3
        elif rerun_step_only:
            sys.exit("filter demultiplexed files and rename them completed, exiting")
        else:
            sys.exit("filtering and renaming demultiplexed files failed")

    if run_step == 3:
        # concatenate demultiplexed files for each sample and set up sample name / barcode combinations
        print("collecting demultiplexed files into sample.fastq files based on specified sample barcode combinations\n")
        with open(log_file, "a") as handle:
            handle.write(f"\ncollecting demultiplexed files into sample.fastq files based on specified sample "
                         f"barcode combinations\n")
        sample_names_df = pd.read_csv(sample_names, sep=None, keep_default_na=False,
                                      na_values=['NA'], engine="python")
        sample_names_df['barcode_1'] = sample_names_df['barcode_1'].apply(lambda x: cat_sample_names(x, run_name))
        sample_names_df['barcode_2'] = sample_names_df['barcode_2'].apply(lambda x: cat_sample_names(x, run_name))
        sample_names_dict = sample_names_df.set_index('sample_name').T.to_dict(orient='list')
        for sample_name, [barcode_1, barcode_2] in sample_names_dict.items():
            sample_dir = pathlib.Path(sample_folder, sample_name)
            if not sample_dir.exists():
                pathlib.Path(sample_dir).mkdir(mode=0o777, parents=True, exist_ok=True)
            # allow for the case where only one barcode was specified per sample
            barcode_1_file = pathlib.Path(demultiplexed_folder, barcode_1)
            if barcode_2 == " ":
                barcode_2_file = ""
            else:
                barcode_2_file = pathlib.Path(demultiplexed_folder, barcode_2)
            cat_outfile = pathlib.Path(sample_dir, f"{sample_name}.fastq")
            cat_cmd = f"cat {str(barcode_1_file)} {str(barcode_2_file)} > {cat_outfile}"
            print(cat_cmd)
            run = try_except_continue_on_fail(cat_cmd)
            if not run:
                print("missing one or more demultiplexed files for this sample")
                with open(log_file, "a") as handle:
                    handle.write("\nmissing one or more demultiplexed files for this sample\n")
                continue
        if not rerun_step_only:
            run_step = 4
        else:
            sys.exit("Run step only completed, exiting")

    if run_step == 4:
        print("Running variant calling on samples")
        with open(log_file, "a") as handle:
            handle.write(f"\nRunning variant calling on samples\n")
        if use_bwa:
            make_index_cmd = f"bwa index {chosen_ref_scheme}"
            with open(log_file, "a") as handle:
                handle.write(f"\n{make_index_cmd}\n")
            try_except_exit_on_fail(make_index_cmd)
        all_sample_files = pathlib.Path(sample_folder).glob("*/*.fastq")
        number_samples = len(list(sample_folder.glob('*/*.fastq')))

        # make variable for project file containing all samples' consensus sequences
        project_name = project_path.parts[-1]
        all_samples_consens_seqs = pathlib.Path(project_path, project_name + "_all_samples.fasta")

        # initialize the file, and add reference to all consensus file
        with open(all_samples_consens_seqs, 'w') as fh:
            fh.write(f">{ref_name}\n{ref_seq}\n")
        p = pathlib.Path(project_path, project_name + '_mapping.csv')
        with open(p, 'w'):
            pass  # just create/empty the mapping file

        samples_run = 1
        old_number_png_files = 0
        for sample_fastq in all_sample_files:
            # get folder paths
            sample_folder = pathlib.Path(sample_fastq).parent
            sample_name = pathlib.Path(sample_fastq).stem
            os.chdir(sample_folder)
            seq_summary_file_name = ""
            for file in project_path.glob('sequencing_summary*.txt'):
                seq_summary_file_name = file
            seq_summary_file = pathlib.Path(seq_summary_file_name).resolve()
            artic_folder = pathlib.Path(sample_folder, "artic")
            if os.path.exists(artic_folder):
                shutil.rmtree(artic_folder)
            artic_folder.mkdir(mode=0o777, parents=True, exist_ok=True)

            # check if fastq is present
            if not sample_fastq.is_file():
                print(f"\nCould not find the concatenated sample fastq file: {sample_fastq}\nskipping sample")
                with open(log_file, "a") as handle:
                    handle.write(f"\nCould not find the concatenated sample fastq file: "
                                 f"{sample_fastq}\nskipping sample")
                continue
            print(f"\n________________\n\nStarting processing sample: {sample_name}\n________________\n")
            with open(log_file, "a") as handle:
                handle.write(f"\n________________\n\nStarting processing sample: "
                             f"{sample_name}\n________________\n")

            # start artic pipeline in new window
            if artic:
                print(f"\n------->Running Artic's pipeline in new window\n")
                with open(log_file, "a") as handle:
                    handle.write(f"\n------->Running Artic's pipeline in new window\n\n")
                artic_cmd = f"artic minion --normalise 400 --threads {threads} " \
                            f"--scheme-directory ~/artic-ncov2019/primer_schemes " \
                            f"--read-file {sample_fastq} --fast5-directory {fast5_dir} " \
                            f"--sequencing-summary {seq_summary_file} {scheme_name} {sample_name} " \
                            f"2>&1 | tee -a {log_file}"
                print(artic_cmd)
                try_except_continue_on_fail(
                    f"gnome-terminal -- /bin/sh -c 'conda run -n artic-ncov2019 {artic_cmd}'")
                # wait for the artic run in the other window to produce its final file
                last_file_made = pathlib.Path(sample_folder, sample_name + ".muscle.out.fasta")
                while not last_file_made.exists():
                    time.sleep(5)
                time.sleep(2)
                all_files = os.listdir(sample_folder)
                # write consensus to master consensus file
                artic_cons_file = pathlib.Path(sample_folder, sample_name + ".consensus.fasta")
                artic_d = fasta_to_dct(artic_cons_file)
                with open(all_samples_consens_seqs, 'a') as fh:
                    for name, seq in artic_d.items():
                        newname = re.sub("/ARTIC.*", "_art", name)
                        fh.write(f">{newname}\n{seq.replace('-', '')}\n")
                # move artic's intermediate files out of the sample folder
                for filename in all_files:
                    if os.path.isfile(filename) and not filename.endswith('.fastq'):
                        file = os.path.join(sample_folder, filename)
                        shutil.move(file, artic_folder)

            # start majority consensus pipeline in new window
            if msa_cons:
                print(f"\n\n------->Running majority consensus pipeline in new window\n")
                with open(log_file, "a") as handle:
                    handle.write(f"\n\n------->Running majority consensus pipeline in new window\n")
                majority_cmd = f"python ~/nanopore_pipeline_wrapper/msa_consensus.py -in {sample_fastq} " \
                               f"-pf {plot_folder} -lf {log_file} {use_bwa} -rs {chosen_ref_scheme} " \
                               f"-bf {chosen_ref_scheme_bed_file} -t {threads} -d {min_depth} {use_gaps} " \
                               f"-ac {all_samples_consens_seqs}"
                print(majority_cmd)
                try_except_continue_on_fail(
                    f"gnome-terminal -- /bin/sh -c 'conda run -n nanop {majority_cmd}'")
                # wait for the consensus run in the other window to produce its MSA file
                last_file_made_2 = pathlib.Path(sample_folder, sample_name + "_msa_from_bam_file.fasta")
                while not last_file_made_2.exists():
                    time.sleep(5)
                if samples_run + 1 <= number_samples:
                    print(f"\ncontinuing with sample {samples_run + 1}\n")
                # keep threads balanced across the backgrounded sample runs
                number_png_files = len(list(plot_folder.glob('*_sequencing_depth.png')))
                print(f'{number_png_files} png files created')
                difference = number_png_files - old_number_png_files
                old_number_png_files = number_png_files
                threads = threads - 1 + difference
                samples_run += 1

        # run sample summary as soon as all sequencing_depth.png files have been made
        number_pngs = len(list(plot_folder.glob('*_sequencing_depth.png')))
        if number_pngs < number_samples:
            print('waiting for all msa to be completed')
            while number_pngs < number_samples:
                time.sleep(10)
                number_pngs = len(list(plot_folder.glob('*_sequencing_depth.png')))
        sample_summary(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name)
        now = datetime.datetime.now()
        date_time = now.strftime("%d/%m/%Y, %H:%M:%S")
        print(f"\nend time = {date_time}\n\n")
        with open(log_file, "a") as handle:
            handle.write(f"\nend time = {date_time}\n\n")

    print("sample processing completed\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nsample processing completed\n\n")

    # compress fast5 files
    targzpath = pathlib.Path(project_path.parent, run_name + ".tar.gz")
    tarcmd = f"tar cf - {fast5_dir} | pigz -7 -p 16 > {targzpath}"
    try_except_exit_on_fail(tarcmd)
    print(tarcmd)
    with open(log_file, "a") as handle:
        handle.write(f"\n{tarcmd}\n\n")
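# filter_length is called in step 2 above to length-filter each barcode's reads.
# A minimal sketch, assuming plain (uncompressed) 4-line fastq records and that the
# helper returns the output path when any read survives, else False; the project's
# real helper may differ in both respects.

def filter_length_sketch(infile, outfile, max_len, min_len):
    """Keep fastq reads whose sequence length is within [min_len, max_len] (hypothetical)."""
    kept = 0
    with open(infile) as fh_in, open(outfile, "w") as fh_out:
        while True:
            record = [fh_in.readline() for _ in range(4)]  # header, seq, "+", qual
            if not record[0]:
                break  # end of file
            if min_len <= len(record[1].strip()) <= max_len:
                fh_out.writelines(record)
                kept += 1
    return outfile if kept else False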
import collections
import json
import os
import pathlib

# py3_fasta_iter, fasta_to_dct, rename_fasta, vcf_processing, vcf_plots, plot_primer_depth,
# plot_depth, consensus_maker and try_except_continue_on_fail are this project's own helpers
# (a sketch of consensus_maker's assumed behaviour follows this function).


def main(infile, plot_folder, log_file, use_minmap2, chosen_ref_scheme, chosen_ref_scheme_bed_file,
         threads, msa_cons_also, min_depth, use_gaps, all_samples_consens_seqs):
    # force absolute file paths
    sample_fastq = pathlib.Path(infile).absolute()
    script_folder = pathlib.Path(__file__).absolute().parent
    if not sample_fastq.is_file():
        print(f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample")
        with open(log_file, "a") as handle:
            handle.write(f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample")
        return False

    # set the reference coordinates to use
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    ref_name = ref_name.split()[0]
    reference_slice = f"{ref_name}:0-{len(ref_seq)}"

    # set input and output file names
    sample_name = pathlib.Path(sample_fastq).stem
    sample_folder = pathlib.Path(sample_fastq).parent
    sam_name = pathlib.Path(sample_folder, sample_name + "_mapped.sam")
    trimmed_sam_file = pathlib.Path(sample_folder, sample_name + ".primerclipped.sam")
    trimmed_bam_file = pathlib.Path(sample_folder, sample_name + ".primerclipped.bam")
    sorted_trimmed_bam_file = pathlib.Path(sample_folder, sample_name + ".primerclipped_sorted.bam")
    bcftools_vcf_file = pathlib.Path(sample_folder, sample_name + "_bcftools.vcf")
    bcftools_cons_file = pathlib.Path(sample_folder, sample_name + "_consensus_bcftools.fasta")
    msa_fasta = pathlib.Path(sample_folder, sample_name + "_msa_from_bam_file.fasta")
    msa_cons = pathlib.Path(sample_folder, sample_name + "_msa_consensus.fasta")

    # make sure cwd is the sample folder, as some programs output to cwd
    os.chdir(sample_folder)
    print(f"\n\n________________\nStarting processing sample: {sample_name}\n\n________________\n")
    with open(log_file, "a") as handle:
        handle.write(f"\n\n________________\nStarting processing sample: {sample_name}\n\n________________\n")

    if use_minmap2:
        # run read mapping using minimap2
        # "-x map-ont" is minimap2's preset for mapping nanopore reads to a reference
        print(f"\nrunning: minimap2 read mapping\n")
        minimap2_cmd = f"minimap2 -a -Y -t {threads} -x map-ont {chosen_ref_scheme} {sample_fastq} " \
                       f"-o {sam_name} 2>&1 | tee -a {log_file}"
        print("\n", minimap2_cmd, "\n")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: minimap2 read mapping\n")
            handle.write(f"{minimap2_cmd}\n")
        run = try_except_continue_on_fail(minimap2_cmd)
        if not run:
            return False
    else:
        # run read mapping using bwa
        print(f"\nrunning: bwa read mapping\n")
        bwa_cmd = f"bwa mem -t {threads} -x ont2d {chosen_ref_scheme} {sample_fastq} -o {sam_name} " \
                  f"2>&1 | tee -a {log_file}"
        print("\n", bwa_cmd, "\n")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: bwa read mapping\n")
            handle.write(f"{bwa_cmd}\n")
        run = try_except_continue_on_fail(bwa_cmd)
        if not run:
            return False

    # remove primer sequences with custom script
    print(f"\nrunning: trim primer sequences from bam file\n")
    trim_script = pathlib.Path(script_folder, "src", "clip_primers_from_bed_file.py")
    trim_primer = f"python {trim_script} -in {sam_name} -o {trimmed_sam_file} " \
                  f"-b {chosen_ref_scheme_bed_file} 2>&1 | tee -a {log_file}"
    print("\n", trim_primer, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: soft clipping primer sequences from bam file\n")
        handle.write(f"{trim_primer}\n")
    run = try_except_continue_on_fail(trim_primer)
    if not run:
        return False

    # convert sam to bam
    print(f"\nrunning: sam to bam conversion of trimmed file")
    sam_bam_cmd = f"samtools view -bS {trimmed_sam_file} -o {trimmed_bam_file} 2>&1 | tee -a {log_file}"
    print("\n", sam_bam_cmd, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: sam to bam conversion\n")
        handle.write(f"{sam_bam_cmd}\n")
    run = try_except_continue_on_fail(sam_bam_cmd)
    if not run:
        return False

    # sort bam file
    print(f"\nrunning: sorting bam file")
    sort_sam_cmd = f"samtools sort -T {sample_name} {trimmed_bam_file} -o {sorted_trimmed_bam_file} " \
                   f"2>&1 | tee -a {log_file}"
    print("\n", sort_sam_cmd, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: sorting bam file\n{sort_sam_cmd}\n")
    run = try_except_continue_on_fail(sort_sam_cmd)
    if not run:
        return False

    # index trimmed bam file
    print(f"\nrunning: indexing bam file")
    index_bam_cmd = f"samtools index {sorted_trimmed_bam_file} 2>&1 | tee -a {log_file}"
    print("\n", index_bam_cmd, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: indexing bam file\n")
        handle.write(f"{index_bam_cmd}\n")
    run = try_except_continue_on_fail(index_bam_cmd)
    if not run:
        return False

    # make bcftools consensus
    print(f"\nrunning: making consensus sequence from bcftools\n")
    min_base_qual = 30  # default=13
    p_val_of_variant = 0.2  # default=0.5
    bcf_vcf_cmd = f"bcftools mpileup --threads {threads} --max-depth 10000 --min-BQ {min_base_qual} -Oz " \
                  f"-f {chosen_ref_scheme} {sorted_trimmed_bam_file} | bcftools call -c -p {p_val_of_variant} " \
                  f"--ploidy 1 -Oz -o {bcftools_vcf_file} 2>&1 | tee -a {log_file}"
    bcf_index_cmd = f"bcftools index {bcftools_vcf_file} 2>&1 | tee -a {log_file}"
    bcf_cons_cmd = f"bcftools consensus -H A -f {chosen_ref_scheme} {bcftools_vcf_file} " \
                   f"-o {bcftools_cons_file} 2>&1 | tee -a {log_file}"
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: making consensus sequence from bcftools:\n")
        handle.write(f"{bcf_vcf_cmd}\n\n{bcf_index_cmd}\n\n{bcf_cons_cmd}\n")
    run = try_except_continue_on_fail(bcf_vcf_cmd)
    if not run:
        return False
    run = try_except_continue_on_fail(bcf_index_cmd)
    if not run:
        return False
    run = try_except_continue_on_fail(bcf_cons_cmd)
    if not run:
        return False

    # rename the fasta header to the sample name
    rename_fasta(bcftools_cons_file, sample_name, "bcftools_cons")
    bcf_cons_d = fasta_to_dct(bcftools_cons_file)

    # write consensus to master consensus file
    with open(all_samples_consens_seqs, 'a') as fh:
        for name, seq in bcf_cons_d.items():
            fh.write(f">{name}\n{seq.replace('-', '')}\n")

    # generate manual vcf consensus and seq depth + qual output
    depth_qual_outfile = vcf_processing(bcftools_vcf_file, min_depth, sample_folder)
    vcf_plots(depth_qual_outfile, plot_folder)

    # get json dump of reads and primer pairs
    json_file = list(pathlib.Path(sample_folder).glob("*read_primer_pair_lookup.json"))[0]
    if not json_file.is_file():
        print("the json file containing primer pair depth info was not found")
    with open(str(json_file), 'r') as jd:
        read_primer_pairs_dct = json.load(jd)
    primer_pair_depth_outfile = pathlib.Path(plot_folder, sample_name + "_per_primer_depth.png")
    primer_pairs = []
    primers_depth = []
    for primer_pair, names_list in read_primer_pairs_dct.items():
        primers_depth.append(len(names_list))
        primer_pairs.append(primer_pair)
    max_depth = max(primers_depth)
    percent_primers_depth = [round(val / max_depth * 100, 2) for val in primers_depth]
    primers_and_depths = zip(primer_pairs, primers_depth)
    plot_primer_depth(primer_pairs, primers_depth, percent_primers_depth,
                      sample_name, primer_pair_depth_outfile)

    if msa_cons_also:
        # convert bam file to a multi fasta alignment
        print(f"\nrunning: making consensus sequence from bam to MSA with jvarkit\n")
        sam4web = pathlib.Path(script_folder, "jvarkit", "dist", "sam4weblogo.jar")
        msa_from_bam = f"java -jar {sam4web} -r '{reference_slice}' -o {msa_fasta} " \
                       f"{sorted_trimmed_bam_file} 2>&1 | tee -a {log_file}"
        print(msa_from_bam)
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: making consensus sequence from bam to MSA with jvarkit\n")
            handle.write(f"{msa_from_bam}\n")
        run = try_except_continue_on_fail(msa_from_bam)
        if not run:
            return False

        # convert multi fasta alignment to consensus sequence
        fasta_msa_d = fasta_to_dct(msa_fasta)
        if len(fasta_msa_d) == 0:
            print(f"{sam_name} alignment had no sequences\nskipping to next sample\n")
            with open(log_file, "a") as handle:
                handle.write(f"{sam_name} alignment had no sequences\nskipping to next sample\n")
            return False

        # set minimum depth for calling a position in the consensus sequence per primer region
        positional_depth = collections.defaultdict(int)
        for primerpair, depth in primers_and_depths:
            start_pos = int(primerpair.split("_")[0])
            end_pos = int(primerpair.split("_")[1])
            for i in range(start_pos, end_pos + 1):
                positional_depth[str(i).zfill(4)] += depth

        # build the consensus sequence
        try:
            cons, depth_profile = consensus_maker(fasta_msa_d, positional_depth, min_depth, use_gaps)
        except IndexError as e:
            with open(log_file, "a") as handle:
                handle.write(f"\nNo MSA made from Bam file\nno reads may have been mapped\n{e}\n")
        else:
            with open(msa_cons, 'w') as handle:
                handle.write(f">{sample_name}\n{cons}\n")
            # write consensus to master consensus file
            with open(all_samples_consens_seqs, 'a') as fh:
                fh.write(f">{sample_name}\n{cons.replace('-', '')}\n")
            # plot depth for sample
            depth_list = depth_profile["non_gap"]
            depth_outfile = pathlib.Path(plot_folder, sample_name + "_sequencing_depth.png")
            plot_depth(depth_list, sample_name, depth_outfile)

    print(f"Completed processing sample: {sample_name}\n\n")
    with open(log_file, "a") as handle:
        handle.write(f"\n\n________________\nCompleted processing sample: "
                     f"{sample_name}\n\n________________\n")
    print("done")
import datetime
import os
import pathlib
import shutil
import sys

import pandas as pd

# gupppy_basecall, guppy_demultiplex, filter_length, cat_sample_names, sample_analysis,
# sample_summary, py3_fasta_iter, try_except_exit_on_fail and try_except_continue_on_fail
# are this project's own helpers (a sketch of cat_sample_names' assumed behaviour follows
# this function).


def main(project_path, sample_names, reference, ref_start, ref_end, min_len, max_len, min_depth,
         run_step, rerun_step_only, basecall_mode, msa_cons_only, threads, gpu_cores, gpu_buffers,
         use_gaps, use_minmap2, guppy_path, real_time):
    # set the primer_scheme directory
    script_folder = pathlib.Path(__file__).absolute().parent
    primer_scheme_dir = pathlib.Path(script_folder, "primer-schemes")

    # get folder paths
    project_path = pathlib.Path(project_path).absolute()
    plot_folder = pathlib.Path(project_path, "seq_depth_plots")
    plot_folder.mkdir(mode=0o777, parents=True, exist_ok=True)
    run_name = project_path.parts[-1]
    fast5_dir = pathlib.Path(project_path, "fast5")
    fastq_dir = pathlib.Path(project_path, "fastq")
    sequencing_summary_file = pathlib.Path(fastq_dir, "sequencing_summary.txt")
    sample_names = pathlib.Path(sample_names).absolute()
    demultiplexed_folder = pathlib.Path(project_path, "demultiplexed")
    sample_folder = pathlib.Path(project_path, "samples")
    master_reads_file = pathlib.Path(project_path, run_name + "_all.fastq")
    time_stamp = str('{:%Y-%m-%d_%H_%M}'.format(datetime.datetime.now()))
    log_file = pathlib.Path(project_path, f"{time_stamp}_{run_name}_log_file.txt")
    with open(log_file, "w") as handle:
        handle.write(f"# start of pipeline run for project: {run_name}\n")

    # set dir to project dir so that output is written in correct place by external tools
    os.chdir(project_path)

    # set the reference genome
    reference_scheme = {
        "ChikECSA_V1_800": pathlib.Path(primer_scheme_dir, "ChikECSA800", "V1", "ChikECSA800.reference.fasta"),
        "ChikAsian_V1_400": pathlib.Path(primer_scheme_dir, "ChikAsian400", "V1", "ChikAsian400.reference.fasta"),
        "ZikaAsian_V1_400": pathlib.Path(primer_scheme_dir, "ZikaAsian400", "V1", "ZikaAsian400.reference.fasta"),
        "SARS2_V1_800": pathlib.Path(primer_scheme_dir, "SARS2_800", "V1", "SARS2_800.reference.fasta"),
        "SARS2_V1_400": pathlib.Path(primer_scheme_dir, "SARS2_400", "V1", "SARS2_400.reference.fasta"),
    }
    chosen_ref_scheme = str(reference_scheme[reference])
    chosen_ref_scheme_bed_file = chosen_ref_scheme.replace(".reference.fasta", ".scheme.bed")
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    ref_name = ref_name.split()[0]
    if not ref_start or ref_start == 0:
        ref_start = 1
    if not ref_end or ref_end > len(ref_seq):
        ref_end = len(ref_seq)
    reference_slice = f'{ref_name}:{ref_start}-{ref_end}'
    print(f"\nReference is {chosen_ref_scheme}\n")
    print(f"\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nReference is {chosen_ref_scheme}\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")

    if run_step == 0:
        run = gupppy_basecall(fast5_dir, guppy_path, fastq_dir, gpu_cores, basecall_mode,
                              real_time, reference, script_folder)
        faildir = pathlib.Path(fastq_dir, "fail")
        shutil.rmtree(faildir)
        if run and not rerun_step_only:
            run_step = 1
        elif run and rerun_step_only:
            sys.exit("Run step only completed, exiting")
        else:
            sys.exit("Basecalling failed")

    if run_step == 1:
        # demultiplex
        print(f"\nrunning: demultiplexing")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: demultiplexing")
        if not list(fastq_dir.glob("*.fastq*")):
            fastq_dir = pathlib.Path(fastq_dir, "pass")
            if not list(fastq_dir.glob("*.fastq*")):
                print(f"No fastq files found in {str(fastq_dir)} or {str(fastq_dir.parent)}")
                sys.exit("fastq files not found")
        run = guppy_demultiplex(fastq_dir, guppy_path, demultiplexed_folder, threads, gpu_buffers, gpu_cores)
        if run and not rerun_step_only:
            run_step = 2
        elif run and rerun_step_only:
            sys.exit("demultiplexing completed, exiting")
        else:
            sys.exit("demultiplexing failed")

    if run_step == 2:
        # length-filter each barcode's reads and rename the files to run_name_barcodeNN.fastq
        pre_existing_files = list(demultiplexed_folder.glob("*.fastq"))
        if pre_existing_files:
            print("Found existing files in top level of demultiplex folder.\nThese files will be deleted")
            for file in pre_existing_files:
                os.unlink(str(file))
        for folder in demultiplexed_folder.glob("barcode*"):
            search = list(pathlib.Path(folder).glob("*.fastq"))
            if not search:
                print(f"no files in folder\nskipping folder: {folder}\n")
                continue
            if len(search) > 1:
                barcode_number = pathlib.Path(search[0]).parent.parts[-1]
                concat_outfile = f"cat_barcode_{barcode_number}.fastq"
                cat_cmd = "cat "
                for file in search:
                    cat_cmd += f"{str(file)} "
                cat_cmd += f" > {concat_outfile}"
                try_except_exit_on_fail(cat_cmd)
                new_name = pathlib.Path(demultiplexed_folder, f"{run_name}_{barcode_number}.fastq")
                filtered_file = filter_length(concat_outfile, new_name, max_len, min_len)
                os.unlink(str(concat_outfile))
                if not filtered_file:
                    print(f"no sequences in file after length filtering for {concat_outfile}\n")
            else:
                file = pathlib.Path(search[0])
                barcode_number = file.parent.parts[-1]
                new_name = pathlib.Path(demultiplexed_folder, f"{run_name}_{barcode_number}.fastq")
                filtered_file = filter_length(file, new_name, max_len, min_len)
                if not filtered_file:
                    print(f"no sequences in file after length filtering for {file}\n")
        if not rerun_step_only and not msa_cons_only:
            run_step = 3
        elif not rerun_step_only and msa_cons_only:
            run_step = 4
        elif rerun_step_only:
            sys.exit("filter demultiplexed files and rename them completed, exiting")
        else:
            sys.exit("filtering and renaming demultiplexed files failed")

    if run_step == 3 and not msa_cons_only:
        # index concatenated fastq with nanopolish
        print(f"\nrunning: nanopolish index on fast5/fastq files")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: nanopolish index on fast5/fastq files\n")
            if not sequencing_summary_file.is_file():
                handle.write(f"\nSequencing summary file not found")
                nanopolish_index_cmd = f"nanopolish index -d {fast5_dir} {master_reads_file} "
            else:
                nanopolish_index_cmd = f"nanopolish index -s {sequencing_summary_file} -d {fast5_dir} " \
                                       f"{master_reads_file} "
        try_except_exit_on_fail(nanopolish_index_cmd)
        if not rerun_step_only:
            run_step = 4
        else:
            sys.exit("Run step only completed, exiting")

    if run_step == 4:
        # concatenate demultiplexed files for each sample and set up sample name / barcode combinations
        print("collecting demultiplexed files into sample.fastq files based on specified sample barcode combinations\n")
        with open(log_file, "a") as handle:
            handle.write(f"\ncollecting demultiplexed files into sample.fastq files based on specified sample "
                         f"barcode combinations\n")
        sample_names_df = pd.read_csv(sample_names, sep=None, keep_default_na=False,
                                      na_values=['NA'], engine="python")
        sample_names_df['barcode_1'] = sample_names_df['barcode_1'].apply(lambda x: cat_sample_names(x, run_name))
        sample_names_df['barcode_2'] = sample_names_df['barcode_2'].apply(lambda x: cat_sample_names(x, run_name))
        sample_names_dict = sample_names_df.set_index('sample_name').T.to_dict(orient='list')
        for sample_name, [barcode_1, barcode_2] in sample_names_dict.items():
            sample_dir = pathlib.Path(sample_folder, sample_name)
            if not sample_dir.exists():
                pathlib.Path(sample_dir).mkdir(mode=0o777, parents=True, exist_ok=True)
            # allow for the case where only one barcode was specified per sample
            barcode_1_file = pathlib.Path(demultiplexed_folder, barcode_1)
            if barcode_2 == " ":
                barcode_2_file = ""
            else:
                barcode_2_file = pathlib.Path(demultiplexed_folder, barcode_2)
            cat_outfile = pathlib.Path(sample_dir, f"{sample_name}.fastq")
            cat_cmd = f"cat {str(barcode_1_file)} {str(barcode_2_file)} > {cat_outfile}"
            print(cat_cmd)
            run = try_except_continue_on_fail(cat_cmd)
            if not run:
                print("missing one or more demultiplexed files for this sample")
                with open(log_file, "a") as handle:
                    handle.write("\nmissing one or more demultiplexed files for this sample\n")
                continue
        for fastq in demultiplexed_folder.glob('*.fastq'):
            os.remove(str(fastq))
        if not rerun_step_only:
            run_step = 5
        else:
            sys.exit("Run step only completed, exiting")

    if run_step == 5:
        print("Running variant calling on samples")
        with open(log_file, "a") as handle:
            handle.write(f"\nRunning variant calling on samples\n")
        if not use_minmap2:
            make_index_cmd = f"bwa index {chosen_ref_scheme}"
            with open(log_file, "a") as handle:
                handle.write(f"\n{make_index_cmd}\n")
            try_except_exit_on_fail(make_index_cmd)
        all_sample_files = pathlib.Path(sample_folder).glob("*/*.fastq")

        # make variable for project file containing all samples' consensus sequences
        project_name = project_path.parts[-1]
        all_samples_consens_seqs = pathlib.Path(project_path, project_name + "_all_samples.fasta")

        # initialize the file, and add reference to all consensus file
        with open(all_samples_consens_seqs, 'w') as fh:
            fh.write(f">{ref_name}\n{ref_seq}\n")
        p = pathlib.Path(project_path, project_name + '_mapping.csv')
        with open(p, 'w'):
            pass  # just create/empty the mapping file

        for sample_fastq in all_sample_files:
            if not sample_fastq.is_file():
                print(f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample")
                with open(log_file, "a") as handle:
                    handle.write(f"could not find the concatenated sample fastq file: "
                                 f"{sample_fastq}\nskipping sample")
                continue
            run = sample_analysis(sample_fastq, plot_folder, log_file, use_minmap2, chosen_ref_scheme,
                                  chosen_ref_scheme_bed_file, threads, msa_cons_only, min_depth,
                                  use_gaps, all_samples_consens_seqs)
            if not run:
                continue

        # align the master consensus file
        sample_summary(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name)

    print("sample processing completed\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nsample processing completed\n\n")

    # compress fast5 files
    targzpath = pathlib.Path(project_path.parent, run_name + ".tar.gz")
    tarcmd = f"tar cf - {fast5_dir} | pigz -7 -p 16 > {targzpath}"
    try_except_exit_on_fail(tarcmd)
    print(tarcmd)
    with open(log_file, "a") as handle:
        handle.write(f"\n{tarcmd}\n\n")
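# cat_sample_names maps each barcode cell from sample_names.csv onto the filename produced
# by the demultiplex/filter step (run_name_barcodeNN.fastq). A minimal sketch inferred only
# from how its output is used above: an empty cell must come back as " ", which the caller
# treats as "no second barcode for this sample". The project's real helper may normalise
# the barcode value differently.

def cat_sample_names_sketch(barcode, run_name):
    """Return the demultiplexed filename for a barcode cell, or " " if the cell is empty."""
    if barcode:
        return f"{run_name}_{str(barcode)}.fastq"
    return " "

# usage, matching the pd.read_csv/apply call in step 4 above:
#   cat_sample_names_sketch("barcode01", "run42")  ->  "run42_barcode01.fastq"
#   cat_sample_names_sketch("", "run42")           ->  " "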