def setUp(self):
    self.paths = Paths(base_path=self.test_folder)
    self.folder_names = [
        self.paths.input_folder,
        self.paths.output_folder,
        self.paths.align_report_folder,
        self.paths.raw_stat_data_folder,
        self.paths.read_fasta_folder,
        self.paths.ref_seq_folder,
        self.paths.annotation_folder,
        self.paths.read_alignment_index_folder,
        self.paths.read_alignments_folder,
        self.paths.processed_reads_folder,
        self.paths.unaligned_reads_folder,
        self.paths.coverage_raw_folder,
        self.paths.coverage_tnoar_min_norm_folder,
        self.paths.coverage_tnoar_mil_norm_folder,
        self.paths.gene_quanti_base_folder,
        self.paths.gene_wise_quanti_combined_path,
    ]
    self.static_files = [
        self.paths.read_processing_stats_path,
        self.paths.read_alignments_stats_path,
        self.paths.read_file_stats,
        self.paths.read_alignment_stats_table_path,
        self.paths.ref_seq_file_stats,
        self.paths.index_path,
    ]
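# A minimal companion test, assuming the setUp above lives in a
# unittest.TestCase; the test itself is a sketch and not part of the
# original suite:
def test_folder_names_are_distinct(self):
    """Every required folder should map to its own distinct path."""
    self.assertEqual(len(self.folder_names), len(set(self.folder_names)))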
def data_paths():
    # The global declarations must precede any assignment to these
    # names; declaring them after the assignments (as before) raises a
    # SyntaxError. Note that base_path is declared but never assigned
    # in this function.
    global base_path
    global test_folder
    global test_files
    global test_lib_names
    global paths
    global folder_names
    global static_files
    parser = argparse.ArgumentParser()
    parser.add_argument("project_path", default="/tmp/test", nargs="?")
    args = parser.parse_args()
    args.project_path = "/tmp/test"
    paths = Paths(args)
    test_folder = "/tmp/test"
    test_files = ["foo.fa", "bar.fa"]
    test_lib_names = ["foo", "bar"]
    folder_names = [
        paths.input_folder,
        paths.output_folder,
        paths.align_report_folder,
        paths.raw_stat_data_folder,
        paths.read_fasta_folder,
        paths.ref_seq_folder,
        paths.annotation_folder,
        paths.read_alignment_index_folder,
        paths.read_alignments_folder,
        paths.processed_reads_folder,
        paths.unaligned_reads_folder,
        paths.coverage_raw_folder,
        paths.coverage_tnoar_min_norm_folder,
        paths.coverage_tnoar_mil_norm_folder,
        paths.gene_quanti_base_folder,
        paths.gene_wise_quanti_combined_path,
    ]
    static_files = [
        paths.read_processing_stats_path,
        paths.read_alignments_stats_path,
        paths.read_file_stats,
        paths.read_alignment_stats_table_path,
        paths.ref_seq_file_stats,
        paths.index_path,
    ]
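# Hypothetical consumer of the fixture above (whether the test suite
# really uses the module-level names this way is an assumption; only
# the names themselves come from data_paths()):
import os

def test_required_folders_can_be_created():
    data_paths()
    for folder in folder_names:
        os.makedirs(folder, exist_ok=True)
    assert all(os.path.exists(folder) for folder in folder_names)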
class CreateProject(object):
    """Create a READemption project including the folders needed for
    the subsequent processing steps."""

    def __init__(self, args):
        """Create an instance."""
        self._args = args
        self._paths = Paths(args)

    def create_project(self, version):
        """Create a new project."""
        sys.stdout.write(
            "   ___  _______   ___                 __  _\n"
            "  / _ \/ __/ _ | / _ \___ __ _  ___  / /_(_)__  ___\n"
            " / , _/ _// __ |/ // / -_)  ' \/ _ \/ __/ / _ \/ _ \\\n"
            "/_/|_/___/_/ |_/____/\__/_/_/_/ .__/\__/_/\___/_//_/\n"
            "                             / /\n"
            "====================================================\n"
            "========================================\n"
            "=======================\n"
            "==============\n\n"
            "[http://pythonhosted.org/READemption/]\n\n")
        project_creator = ProjectCreator()
        project_creator.create_root_folder(self._args.project_path)
        project_creator.create_subfolders(self._paths.required_folders())
        project_creator.create_version_file(self._paths.version_path, version)
        sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
            self._args.project_path))
        print(self._paths.base_path)
        sys.stdout.write(
            "Please copy read files into folder \"%s\" and "
            "reference sequence files into folder \"%s\".\n" % (
                self._paths.read_fasta_folder, self._paths.ref_seq_folder))
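# Invocation sketch for CreateProject, assuming Paths(args) only needs
# the "project_path" attribute here; the folder name and version string
# are illustrative, not taken from the repository:
import argparse

def _demo_create_project():
    args = argparse.Namespace(project_path="my_rnaseq_project")
    CreateProject(args).create_project(version="0.0.0")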
class Helpers(object):

    def __init__(self, args):
        self._paths = Paths(args)
        self._args = args

    def test_folder_existance(self, task_specific_folders):
        """Test the existence of the required folders."""
        for folder in (
                self._paths.required_base_folders() + task_specific_folders):
            if not os.path.exists(folder):
                self.write_err_msg_and_quit(
                    "Error! Folder '%s' does not exist! Is the given project "
                    "folder name correct?\n" % folder)

    def file_needs_to_be_created(self, file_path, quiet=False):
        """Test if a file exists or needs to be created."""
        if not self._args.check_for_existing_files:
            return True
        if os.path.exists(file_path):
            if not quiet:
                sys.stderr.write(
                    "File %s exists. Skipping its generation.\n" % file_path)
            return False
        return True

    def check_job_completeness(self, jobs):
        """Check the completeness of each job in a list."""
        for job in concurrent.futures.as_completed(jobs):
            if job.exception():
                raise job.exception()

    def write_err_msg_and_quit(self, msg):
        """Write an error message and close the program gracefully."""
        sys.stderr.write(msg)
        sys.exit(1)

    def was_paired_end_alignment(self, lib_names):
        """Check if the mapping was done in paired- or single-end mode."""
        if len(lib_names) * 2 == len(self._paths.get_read_files()):
            return True
        return False
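# Usage sketch for Helpers (assumed, not from the repository):
# file_needs_to_be_created() honors the check_for_existing_files
# argument, so callers guard expensive steps with it. The output path
# below is hypothetical.
def _demo_file_needs_to_be_created(args):
    helpers = Helpers(args)
    if helpers.file_needs_to_be_created("output/align/alignments/foo.bam"):
        pass  # ... generate foo.bam here ...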
class PerformAlignment(object):
    """Perform the alignment with either Segemehl or STAR."""

    def __init__(self, args):
        """Create an instance."""
        self._args = args
        self._paths = Paths(args)
        self._helpers = Helpers(args)
        self._read_files = None
        self._ref_seq_files = None
        self._align_viz = AlignViz()

    def align_reads(self):
        """Perform the alignment of the reads."""
        self._helpers.test_folder_existance(
            self._paths.required_read_alignment_folders())
        assert self._args.paired_end in [True, False]
        self._ref_seq_files = self._paths.get_ref_seq_files()
        self._paths.set_ref_seq_paths(self._ref_seq_files)
        self._test_align_file_existance()
        if not self._args.paired_end:
            # Single end reads
            self._read_files = self._paths.get_read_files()
            self._lib_names = self._paths.get_lib_names_single_end()
            self._paths.set_read_files_dep_file_lists_single_end(
                self._read_files, self._lib_names)
            if not self._args.realign:
                self._set_primary_aligner_paths_to_final_paths()
            if not self._args.cutadapt:
                self._prepare_reads_single_end()
            else:
                self._prepare_reads_se_cutadapt()
            if self._args.segemehl:
                self._align_single_end_reads()
            else:
                self._align_se_star()
        else:
            # Paired end reads
            self._read_file_pairs = self._paths.get_read_file_pairs()
            self._lib_names = self._paths.get_lib_names_paired_end()
            self._paths.set_read_files_dep_file_lists_paired_end(
                self._read_file_pairs, self._lib_names)
            if not self._args.realign:
                self._set_primary_aligner_paths_to_final_paths()
            if not self._args.cutadapt:
                self._prepare_reads_paired_end()
            else:
                self._prepare_reads_pe_cutadapt()
            if self._args.segemehl:
                self._align_paired_end_reads()
            else:
                self._align_pe_star()
        self._sam_to_bam(
            self._paths.primary_read_aligner_sam_paths,
            self._paths.primary_read_aligner_bam_prefix_paths,
            self._paths.primary_read_aligner_bam_paths)
        self._generate_read_alignment_stats(
            self._lib_names,
            self._paths.primary_read_aligner_bam_paths,
            self._paths.unaligned_reads_paths,
            self._paths.primary_read_aligner_stats_path)
        final_unaligned_reads_paths = self._paths.unaligned_reads_paths
        if self._args.realign:
            self._run_realigner_and_process_alignments()
            self._merge_bam_files()
            final_unaligned_reads_paths = (
                self._paths.realigned_unaligned_reads_paths)
        if self._args.crossalign_cleaning_str is not None:
            self._remove_crossaligned_reads()
        if not self._args.cutadapt:
            self._generate_read_alignment_stats(
                self._lib_names,
                self._paths.read_alignment_bam_paths,
                final_unaligned_reads_paths,
                self._paths.read_alignments_stats_path)
            self._write_alignment_stat_table()
        self._align_viz.alignment_viz(
            self._paths.read_alignments_stats_path,
            "{}".format(self._paths.viz_align_base_folder))
        self._align_viz.processing_viz(
            self._paths.read_processing_stats_path,
            "{}".format(self._paths.viz_align_base_folder))
        self._align_viz.alignment_processing_overview(
            self._paths.read_processing_stats_path,
            self._paths.read_alignments_stats_path,
            "{}".format(self._paths.viz_align_base_folder))

    def _test_align_file_existance(self):
        """Test if the input files for the align subcommand exist."""
        if len(self._paths.get_read_files()) == 0:
            self._helpers.write_err_msg_and_quit(
                "Error! No read libraries given!\n")
        if len(self._ref_seq_files) == 0:
            self._helpers.write_err_msg_and_quit(
                "Error! No reference sequence files given!\n")

    def _set_primary_aligner_paths_to_final_paths(self):
        # If no remapping is performed, the paths of the final BAM
        # files are the paths of the primary mapper.
        self._paths.primary_read_aligner_bam_prefix_paths = (
            self._paths.read_alignment_bam_prefix_paths)
        self._paths.primary_read_aligner_bam_paths = (
            self._paths.read_alignment_bam_paths)
        self._paths.primary_read_aligner_stats_path = (
            self._paths.read_alignments_stats_path)

    def _prepare_reads_single_end(self):
        """Manage the preparation of reads before the actual mappings."""
        read_files_and_jobs = {}
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_path, processed_read_path in zip(
                    self._lib_names, self._paths.read_paths,
                    self._paths.processed_read_paths):
                if not self._helpers.file_needs_to_be_created(
                        processed_read_path):
                    continue
                read_processor = ReadProcessor(
                    poly_a_clipping=self._args.poly_a_clipping,
                    min_read_length=self._args.min_read_length,
                    min_phred_score=self._args.min_phred_score,
                    adapter=self._args.adapter,
                    reverse_complement=self._args.reverse_complement)
                read_files_and_jobs[lib_name] = executor.submit(
                    read_processor.process_single_end, read_path,
                    processed_read_path)
        self._evaluet_job_and_generate_stat_file(read_files_and_jobs)

    def _prepare_reads_se_cutadapt(self):
        cutadapt = Cutadapt(self._args.cutadapt_bin)
        read_files_and_jobs = {}
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_path, processed_read_path in zip(
                    self._lib_names, self._paths.read_paths,
                    self._paths.processed_read_paths):
                if not self._helpers.file_needs_to_be_created(
                        processed_read_path):
                    continue
                cutadapt = Cutadapt(self._args.cutadapt_options,
                                    self._args.cutadapt_bin)
                read_files_and_jobs[lib_name] = executor.submit(
                    cutadapt.run_cutadapt_se, read_path,
                    self._paths.processed_reads_folder, lib_name)
        # self._paths.gzip_processed_reads()
        self._helpers.check_job_completeness(read_files_and_jobs.values())

    def _evaluet_job_and_generate_stat_file(self, read_files_and_jobs):
        raw_stat_data_writer = RawStatDataWriter(pretty=True)
        # Evaluate thread outcome
        self._helpers.check_job_completeness(read_files_and_jobs.values())
        if not self._helpers.file_needs_to_be_created(
                self._paths.read_processing_stats_path):
            return
        # Create a dict of the read file names and the processing
        # counting results
        read_files_and_stats = dict(
            [(lib_name, job.result())
             for lib_name, job in read_files_and_jobs.items()])
        raw_stat_data_writer.write(
            read_files_and_stats, self._paths.read_processing_stats_path)

    def _align_se_star(self):
        read_aligner = STAR(self._args.STAR_bin)
        if self._helpers.file_needs_to_be_created(
                self._paths.index_path_star):
            read_aligner.build_index(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                " ".join([self._paths.ref_seq_folder + '/' + ref
                          for ref in self._paths.get_ref_seq_files()]),
                int(self._args.indexN))
        for read_path, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_paths,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.primary_read_aligner_bam_paths):
            if not self._helpers.file_needs_to_be_created(output_path):
                continue
            elif not self._helpers.file_needs_to_be_created(bam_path):
                continue
            read_aligner.align_reads(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                read_path,
                output_path,
                (self._paths.annotation_folder + '/' +
                 " ".join(self._paths.get_annotation_files())),
                paired_end=False,
                include_annotation=False)
        self._paths.relocate_and_rename_star_output_se()

    def _align_single_end_reads(self):
        """Manage the actual alignment of single end reads."""
        read_aligner = Segemehl(self._args.segemehl_bin, self._args.progress)
        if self._helpers.file_needs_to_be_created(self._paths.index_path):
            read_aligner.build_index(self._paths.ref_seq_paths,
                                     self._paths.index_path)
        for read_path, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_paths,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.read_alignment_bam_paths):
            if not self._helpers.file_needs_to_be_created(output_path):
                continue
            elif not self._helpers.file_needs_to_be_created(bam_path):
                continue
            read_aligner.run_alignment(
                read_path, self._paths.index_path,
                self._paths.ref_seq_paths, output_path, nomatch_path,
                int(self._args.processes), int(self._args.hit_strategy),
                int(self._args.segemehl_accuracy),
                float(self._args.segemehl_evalue), self._args.split,
                paired_end=False)

    def _prepare_reads_paired_end(self):
        read_files_and_jobs = {}
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_path_pair, processed_read_path_pair in zip(
                    self._lib_names, self._paths.read_path_pairs,
                    self._paths.processed_read_path_pairs):
                for processed_read_path in processed_read_path_pair:
                    if not self._helpers.file_needs_to_be_created(
                            processed_read_path):
                        continue
                    read_processor = ReadProcessor(
                        poly_a_clipping=False,
                        min_read_length=self._args.min_read_length,
                        min_phred_score=self._args.min_phred_score,
                        adapter=self._args.adapter)
                    read_files_and_jobs[lib_name] = executor.submit(
                        read_processor.process_paired_end, read_path_pair,
                        processed_read_path_pair)
        self._evaluet_job_and_generate_stat_file(read_files_and_jobs)

    def _prepare_reads_pe_cutadapt(self):
        cutadapt = Cutadapt(self._args.cutadapt_bin)
        read_files_and_jobs = {}
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_path_pair, processed_read_path_pair in zip(
                    self._lib_names, self._paths.read_path_pairs,
                    self._paths.processed_read_path_pairs):
                if not self._helpers.file_needs_to_be_created(
                        processed_read_path_pair):
                    continue
                cutadapt = Cutadapt(self._args.cutadapt_options,
                                    self._args.cutadapt_bin)
                read_files_and_jobs[lib_name] = executor.submit(
                    cutadapt.run_cutadapt_pe, read_path_pair,
                    self._paths.processed_reads_folder, lib_name)
        self._helpers.check_job_completeness(read_files_and_jobs.values())

    def _align_pe_star(self):
        read_aligner = STAR(self._args.STAR_bin)
        if self._helpers.file_needs_to_be_created(
                self._paths.index_path_star):
            read_aligner.build_index(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                " ".join([self._paths.ref_seq_folder + '/' + ref
                          for ref in self._paths.get_ref_seq_files()]),
                int(self._args.indexN))
        for read_path_pair, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_path_pairs,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.primary_read_aligner_bam_paths):
            if not self._helpers.file_needs_to_be_created(output_path):
                continue
            elif not self._helpers.file_needs_to_be_created(bam_path):
                continue
            read_aligner.align_reads(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                read_path_pair,
                (self._paths.read_alignments_folder + '/' +
                 " ".join(self._paths.get_lib_names_paired_end()) + '_'),
                (self._paths.annotation_folder + '/' +
                 " ".join(self._paths.get_annotation_files())),
                paired_end=True,
                include_annotation=False)
        self._paths.relocate_and_rename_star_output_pe()
        self._paths.relocate_and_rename_star_output()

    def _align_paired_end_reads(self):
        """Manage the actual alignment of paired end reads."""
        read_aligner = Segemehl(self._args.segemehl_bin, self._args.progress)
        if self._helpers.file_needs_to_be_created(self._paths.index_path):
            read_aligner.build_index(self._paths.ref_seq_paths,
                                     self._paths.index_path)
        for read_path_pair, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_path_pairs,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.primary_read_aligner_bam_paths):
            if not self._helpers.file_needs_to_be_created(output_path):
                continue
            elif not self._helpers.file_needs_to_be_created(bam_path):
                continue
            read_aligner.run_alignment(
                read_path_pair, self._paths.index_path,
                self._paths.ref_seq_paths, output_path,
                int(self._args.processes), nomatch_path,
                int(self._args.hit_strategy),
                int(self._args.segemehl_accuracy),
                float(self._args.segemehl_evalue), self._args.split,
                paired_end=True)

    def _sam_to_bam(self, sam_paths, bam_prefixes_paths, bam_paths):
        """Manage the conversion of mapped reads from SAM to BAM format."""
        sam_to_bam_converter = SamToBamConverter()
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for sam_path, bam_prefix_path, bam_path in zip(
                    sam_paths, bam_prefixes_paths, bam_paths):
                if not self._helpers.file_needs_to_be_created(bam_path):
                    continue
                jobs.append(executor.submit(
                    sam_to_bam_converter.sam_to_bam, sam_path,
                    bam_prefix_path))
        # Evaluate thread outcome
        self._helpers.check_job_completeness(jobs)

    def _generate_read_alignment_stats(
            self, lib_names, result_bam_paths, unaligned_reads_paths,
            output_stats_path):
        """Manage the generation of alignment statistics."""
        raw_stat_data_writer = RawStatDataWriter(pretty=True)
        read_files_and_jobs = {}
        if not self._helpers.file_needs_to_be_created(output_stats_path):
            return
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for (lib_name, read_alignment_bam_path,
                 unaligned_reads_path) in zip(
                    lib_names, result_bam_paths, unaligned_reads_paths):
                read_aligner_stats = ReadAlignerStats()
                read_files_and_jobs[lib_name] = executor.submit(
                    read_aligner_stats.count, read_alignment_bam_path,
                    unaligned_reads_path)
        # Evaluate thread outcome
        self._helpers.check_job_completeness(read_files_and_jobs.values())
        read_files_and_stats = dict(
            [(lib_name, job.result())
             for lib_name, job in read_files_and_jobs.items()])
        raw_stat_data_writer.write(read_files_and_stats, output_stats_path)

    def _run_realigner_and_process_alignments(self):
        # As the realigner needs a *sorted* SAM file
        self._generate_sorted_tmp_sam_file()
        self._realign_unmapped_reads()
        self._sam_to_bam(
            self._paths.read_realigner_sam_paths,
            self._paths.read_realigner_bam_prefixes_paths,
            self._paths.read_realigner_sam_paths)
        self._generate_read_alignment_stats(
            self._lib_names,
            self._paths.read_realigner_bam_paths,
            self._paths.realigned_unaligned_reads_paths,
            self._paths.read_realigner_stats_path)

    def _generate_sorted_tmp_sam_file(self):
        sam_to_bam_converter = SamToBamConverter()
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for bam_path, sam_path in zip(
                    self._paths.primary_read_aligner_bam_paths,
                    self._paths.read_realigner_tmp_sam_paths):
                jobs.append(executor.submit(
                    sam_to_bam_converter.bam_to_sam, bam_path, sam_path))
        # Evaluate thread outcome
        self._helpers.check_job_completeness(jobs)

    def _realign_unmapped_reads(self):
        read_realigner = ReadRealigner(self._args.lack_bin,
                                       self._args.progress)
        for (query_fasta_path, query_sam_path, realignment_sam_path,
             unaligned_reads_path) in zip(
                self._paths.unaligned_reads_paths,
                self._paths.read_realigner_tmp_sam_paths,
                self._paths.read_realigner_sam_paths,
                self._paths.realigned_unaligned_reads_paths):
            read_realigner.run_alignment(
                query_fasta_path, query_sam_path, self._paths.ref_seq_paths,
                realignment_sam_path, unaligned_reads_path,
                int(self._args.processes),
                int(self._args.segemehl_accuracy))
            os.remove(query_sam_path)

    def _merge_bam_files(self):
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for merged_bam, primary_aligner_bam, realigner_bam in zip(
                    self._paths.read_alignment_bam_paths,
                    self._paths.primary_read_aligner_bam_paths,
                    self._paths.read_realigner_bam_paths):
                bam_merger = BamMerger()
                jobs.append(executor.submit(
                    bam_merger.merge, merged_bam, primary_aligner_bam,
                    realigner_bam))
        self._helpers.check_job_completeness(jobs)
        if not self._args.keep_original_alignments:
            for bam_file_list in [
                    self._paths.primary_read_aligner_bam_paths,
                    self._paths.read_realigner_bam_paths]:
                for bam_file in bam_file_list:
                    os.remove(bam_file)
                    os.remove("%s.bai" % bam_file)

    def _remove_crossaligned_reads(self):
        self._string_to_species_and_sequence_ids()
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for (bam_path, bam_with_crossmappings_path,
                 bam_cleaned_tmp_path, crossmapped_reads_path) in zip(
                    self._paths.read_alignment_bam_paths,
                    self._paths.read_alignment_bam_with_crossmappings_paths,
                    self._paths.read_alignment_bam_cross_cleaned_tmp_paths,
                    self._paths.crossmapped_reads_paths):
                jobs.append(executor.submit(
                    self._remove_crossaligned_reads_for_lib, bam_path,
                    bam_with_crossmappings_path, bam_cleaned_tmp_path,
                    crossmapped_reads_path))
        # Evaluate thread outcome
        self._helpers.check_job_completeness(jobs)

    def _string_to_species_and_sequence_ids(self):
        self._species_and_sequence_ids = {}
        orgs_and_seq_ids_strs = self._args.crossalign_cleaning_str.split(";")
        if len(orgs_and_seq_ids_strs) < 2:
            self._helpers.write_err_msg_and_quit(
                "Error! Only one organism is defined for the cross align "
                "removal. This does not make sense.\nYou gave the "
                "following input:\n%s\n" %
                self._args.crossalign_cleaning_str)
        for org_and_seq_ids_str in orgs_and_seq_ids_strs:
            org, seq_ids_str = org_and_seq_ids_str.strip().split(":")
            seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")]
            if "" in seq_ids:
                seq_ids.remove("")
            if len(seq_ids) < 1:
                self._helpers.write_err_msg_and_quit(
                    "Error! No sequence ID was given for the species '%s'. "
                    "This does not make sense.\nYou gave the "
                    "following input:\n%s\n" % (
                        org, self._args.crossalign_cleaning_str))
            self._species_and_sequence_ids[org] = seq_ids

    def _remove_crossaligned_reads_for_lib(
            self, bam_path, bam_with_crossmappings_path,
            bam_cleaned_tmp_path, crossmapped_reads_path):
        # Perform the removal of cross aligned reads
        cross_align_filter = CrossAlignFilter(
            bam_path, bam_cleaned_tmp_path, crossmapped_reads_path,
            self._species_and_sequence_ids)
        cross_align_filter.determine_crossmapped_reads()
        cross_align_filter.write_crossmapping_free_bam()
        # Rename the original mapping file that potentially
        # contains cross aligned reads
        os.rename(bam_path, bam_with_crossmappings_path)
        os.rename(bam_path + ".bai", bam_with_crossmappings_path + ".bai")
        # Move the cross aligned filtered file to the final mapping
        # path
        os.rename(bam_cleaned_tmp_path, bam_path)
        os.rename(bam_cleaned_tmp_path + ".bai", bam_path + ".bai")

    def _write_alignment_stat_table(self):
        """Manage the creation of the mapping statistic output table."""
        raw_stat_data_reader = RawStatDataReader()
        read_processing_stats = raw_stat_data_reader.read(
            self._paths.read_processing_stats_path)
        final_alignment_stats = raw_stat_data_reader.read(
            self._paths.read_alignments_stats_path)
        realignment_stats = None
        primary_aligner_stats = None
        if self._args.realign:
            primary_aligner_stats = raw_stat_data_reader.read(
                self._paths.primary_read_aligner_stats_path)
            realignment_stats = raw_stat_data_reader.read(
                self._paths.read_realigner_stats_path)
        read_aligner_stats_table = ReadAlignerStatsTable(
            read_processing_stats, final_alignment_stats,
            primary_aligner_stats, realignment_stats, self._lib_names,
            self._paths.read_alignment_stats_table_path,
            self._args.paired_end)
        read_aligner_stats_table.write()
class Controller(object):
    """Manage the actions of the subcommands.

    The Controller takes care of providing the arguments like path
    names and the parallel processing of tasks.
    """

    def __init__(self, args):
        """Create an instance."""
        self._args = args
        self._paths = Paths(args.project_path)
        self._read_files = None
        self._ref_seq_files = None

    def create_project(self, version):
        """Create a new project."""
        sys.stdout.write(
            "   ___  _______   ___                 __  _\n"
            "  / _ \/ __/ _ | / _ \___ __ _  ___  / /_(_)__  ___\n"
            " / , _/ _// __ |/ // / -_)  ' \/ _ \/ __/ / _ \/ _ \\\n"
            "/_/|_/___/_/ |_/____/\__/_/_/_/ .__/\__/_/\___/_//_/\n"
            "                             / /\n"
            "====================================================\n"
            "========================================\n"
            "=======================\n"
            "==============\n\n"
            "[http://pythonhosted.org/READemption/]\n\n")
        project_creator = ProjectCreator()
        project_creator.create_root_folder(self._args.project_path)
        project_creator.create_subfolders(self._paths.required_folders())
        project_creator.create_version_file(self._paths.version_path, version)
        sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
            self._args.project_path))
        sys.stdout.write("Please copy read files into folder \"%s\" and "
                         "reference sequence files into folder \"%s\".\n" % (
                             self._paths.read_fasta_folder,
                             self._paths.ref_seq_folder))

    def align_reads(self):
        """Perform the alignment of the reads."""
        self._test_folder_existance(
            self._paths.required_read_alignment_folders())
        assert self._args.paired_end in [True, False]
        self._ref_seq_files = self._paths.get_ref_seq_files()
        self._paths.set_ref_seq_paths(self._ref_seq_files)
        self._test_align_file_existance()
        if not self._args.paired_end:
            # Single end reads
            self._read_files = self._paths.get_read_files()
            self._lib_names = self._paths.get_lib_names_single_end()
            self._paths.set_read_files_dep_file_lists_single_end(
                self._read_files, self._lib_names)
            if not self._args.realign:
                self._set_primary_aligner_paths_to_final_paths()
            self._prepare_reads_single_end()
            if self._args.star:
                self.align_SE_STAR()
            else:
                self._align_single_end_reads()
        else:
            # Paired end reads
            self._read_file_pairs = self._paths.get_read_file_pairs()
            self._lib_names = self._paths.get_lib_names_paired_end()
            self._paths.set_read_files_dep_file_lists_paired_end(
                self._read_file_pairs, self._lib_names)
            if not self._args.realign:
                self._set_primary_aligner_paths_to_final_paths()
            self._prepare_reads_paired_end()
            if self._args.star:
                self.align_PE_STAR()
            else:
                self._align_paired_end_reads()
        self._sam_to_bam(
            self._paths.primary_read_aligner_sam_paths,
            self._paths.primary_read_aligner_bam_prefix_paths,
            self._paths.primary_read_aligner_bam_paths)
        self._generate_read_alignment_stats(
            self._lib_names,
            self._paths.primary_read_aligner_bam_paths,
            self._paths.unaligned_reads_paths,
            self._paths.primary_read_aligner_stats_path)
        final_unaligned_reads_paths = self._paths.unaligned_reads_paths
        if self._args.realign:
            self._run_realigner_and_process_alignments()
            self._merge_bam_files()
            final_unaligned_reads_paths = (
                self._paths.realigned_unaligned_reads_paths)
        if self._args.crossalign_cleaning_str is not None:
            self._remove_crossaligned_reads()
        self._generate_read_alignment_stats(
            self._lib_names,
            self._paths.read_alignment_bam_paths,
            final_unaligned_reads_paths,
            self._paths.read_alignments_stats_path)
        self._write_alignment_stat_table()

    def align_SE_STAR(self):
        read_aligner = STAR_Align(self._args.STAR_bin)
        if self._file_needs_to_be_created(self._paths.index_path_star):
            read_aligner.build_index(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                " ".join([self._paths.ref_seq_folder + '/' + ref
                          for ref in self._paths.get_ref_seq_files()]),
                int(self._args.indexN))
        for read_path, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_paths,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.primary_read_aligner_bam_paths):
            if not self._file_needs_to_be_created(output_path):
                continue
            elif not self._file_needs_to_be_created(bam_path):
                continue
            read_aligner.align_reads(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                read_path,
                (self._paths.read_alignments_folder + '/' +
                 " ".join(self._paths.get_lib_names_single_end()) + '_'),
                (self._paths.annotation_folder + '/' +
                 " ".join(self._paths.get_annotation_files())),
                paired_end=False,
                include_annotation=False)
        self._paths.change_primary_aligned_sam_SE()
        if os.path.isfile(
                self._paths.read_alignments_folder + '/' +
                " ".join(str(lib_name) for lib_name
                         in self._paths.get_lib_names_single_end()) +
                '_Unmapped.out.mate1'):
            self._paths.change_unmapped_filename_SE()

    def align_PE_STAR(self):
        read_aligner = STAR_Align(self._args.STAR_bin)
        if self._file_needs_to_be_created(self._paths.index_path_star):
            read_aligner.build_index(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                " ".join([self._paths.ref_seq_folder + '/' + ref
                          for ref in self._paths.get_ref_seq_files()]),
                int(self._args.indexN))
        for read_path_pair, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_path_pairs,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.primary_read_aligner_bam_paths):
            if not self._file_needs_to_be_created(output_path):
                continue
            elif not self._file_needs_to_be_created(bam_path):
                continue
            read_aligner.align_reads(
                int(self._args.processes),
                self._paths.read_alignment_index_folder,
                read_path_pair,
                (self._paths.read_alignments_folder + '/' +
                 " ".join(self._paths.get_lib_names_paired_end()) + '_'),
                (self._paths.annotation_folder + '/' +
                 " ".join(self._paths.get_annotation_files())),
                paired_end=True,
                include_annotation=False)
        self._paths.change_primary_aligned_sam_PE()
        if os.path.isfile(
                self._paths.read_alignments_folder + '/' +
                " ".join(str(lib_name) for lib_name
                         in self._paths.get_lib_names_paired_end()) +
                '_Unmapped.out.mate1'):
            self._paths.change_unmapped_filename_PE()

    def _remove_crossaligned_reads(self):
        self._string_to_species_and_sequence_ids()
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for (bam_path, bam_with_crossmappings_path,
                 bam_cleaned_tmp_path, crossmapped_reads_path) in zip(
                    self._paths.read_alignment_bam_paths,
                    self._paths.read_alignment_bam_with_crossmappings_paths,
                    self._paths.read_alignment_bam_cross_cleaned_tmp_paths,
                    self._paths.crossmapped_reads_paths):
                jobs.append(executor.submit(
                    self._remove_crossaligned_reads_for_lib, bam_path,
                    bam_with_crossmappings_path, bam_cleaned_tmp_path,
                    crossmapped_reads_path))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)

    def _remove_crossaligned_reads_for_lib(
            self, bam_path, bam_with_crossmappings_path,
            bam_cleaned_tmp_path, crossmapped_reads_path):
        # Perform the removal of cross aligned reads
        cross_align_filter = CrossAlignFilter(
            bam_path, bam_cleaned_tmp_path, crossmapped_reads_path,
            self._species_and_sequence_ids)
        cross_align_filter.determine_crossmapped_reads()
        cross_align_filter.write_crossmapping_free_bam()
        # Rename the original mapping file that potentially
        # contains cross aligned reads
        os.rename(bam_path, bam_with_crossmappings_path)
        os.rename(bam_path + ".bai", bam_with_crossmappings_path + ".bai")
        # Move the cross aligned filtered file to the final mapping
        # path
        os.rename(bam_cleaned_tmp_path, bam_path)
        os.rename(bam_cleaned_tmp_path + ".bai", bam_path + ".bai")

    def _string_to_species_and_sequence_ids(self):
        self._species_and_sequence_ids = {}
        orgs_and_seq_ids_strs = self._args.crossalign_cleaning_str.split(";")
        if len(orgs_and_seq_ids_strs) < 2:
            self._write_err_msg_and_quit(
                "Error! Only one organism is defined for the cross align "
                "removal. This does not make sense.\nYou gave the "
                "following input:\n%s\n" %
                self._args.crossalign_cleaning_str)
        for org_and_seq_ids_str in orgs_and_seq_ids_strs:
            org, seq_ids_str = org_and_seq_ids_str.strip().split(":")
            seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")]
            if "" in seq_ids:
                seq_ids.remove("")
            if len(seq_ids) < 1:
                self._write_err_msg_and_quit(
                    "Error! No sequence ID was given for the species '%s'. "
                    "This does not make sense.\nYou gave the "
                    "following input:\n%s\n" % (
                        org, self._args.crossalign_cleaning_str))
            self._species_and_sequence_ids[org] = seq_ids

    def _set_primary_aligner_paths_to_final_paths(self):
        # If no remapping is performed, the paths of the final BAM
        # files are the paths of the primary mapper.
        self._paths.primary_read_aligner_bam_prefix_paths = (
            self._paths.read_alignment_bam_prefix_paths)
        self._paths.primary_read_aligner_bam_paths = (
            self._paths.read_alignment_bam_paths)
        self._paths.primary_read_aligner_stats_path = (
            self._paths.read_alignments_stats_path)

    def _merge_bam_files(self):
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for merged_bam, primary_aligner_bam, realigner_bam in zip(
                    self._paths.read_alignment_bam_paths,
                    self._paths.primary_read_aligner_bam_paths,
                    self._paths.read_realigner_bam_paths):
                bam_merger = BamMerger()
                jobs.append(executor.submit(
                    bam_merger.merge, merged_bam, primary_aligner_bam,
                    realigner_bam))
        self._check_job_completeness(jobs)
        if not self._args.keep_original_alignments:
            for bam_file_list in [
                    self._paths.primary_read_aligner_bam_paths,
                    self._paths.read_realigner_bam_paths]:
                for bam_file in bam_file_list:
                    os.remove(bam_file)
                    os.remove("%s.bai" % bam_file)

    def _run_realigner_and_process_alignments(self):
        # As the realigner needs a *sorted* SAM file
        self._generate_sorted_tmp_sam_file()
        self._realign_unmapped_reads()
        self._sam_to_bam(
            self._paths.read_realigner_sam_paths,
            self._paths.read_realigner_bam_prefixes_paths,
            self._paths.read_realigner_sam_paths)
        self._generate_read_alignment_stats(
            self._lib_names,
            self._paths.read_realigner_bam_paths,
            self._paths.realigned_unaligned_reads_paths,
            self._paths.read_realigner_stats_path)

    def _test_align_file_existance(self):
        """Test if the input files for the align subcommand exist."""
        if len(self._paths.get_read_files()) == 0:
            self._write_err_msg_and_quit("Error! No read libraries given!\n")
        if len(self._ref_seq_files) == 0:
            self._write_err_msg_and_quit(
                "Error! No reference sequence files given!\n")

    def _test_folder_existance(self, task_specific_folders):
        """Test the existence of the required folders."""
        for folder in (
                self._paths.required_base_folders() + task_specific_folders):
            if not os.path.exists(folder):
                self._write_err_msg_and_quit(
                    "Error! Folder '%s' does not exist! Is the given project "
                    "folder name correct?\n" % folder)

    def _file_needs_to_be_created(self, file_path, quiet=False):
        """Test if a file exists or needs to be created."""
        if not self._args.check_for_existing_files:
            return True
        if os.path.exists(file_path):
            if not quiet:
                sys.stderr.write(
                    "File %s exists. Skipping its generation.\n" % file_path)
            return False
        return True

    def _prepare_reads_single_end(self):
        """Manage the preparation of reads before the actual mappings."""
        read_files_and_jobs = {}
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_path, processed_read_path in zip(
                    self._lib_names, self._paths.read_paths,
                    self._paths.processed_read_paths):
                if not self._file_needs_to_be_created(processed_read_path):
                    continue
                read_processor = ReadProcessor(
                    poly_a_clipping=self._args.poly_a_clipping,
                    min_read_length=self._args.min_read_length,
                    fastq=self._args.fastq,
                    min_phred_score=self._args.min_phred_score,
                    adapter=self._args.adapter,
                    reverse_complement=self._args.reverse_complement)
                read_files_and_jobs[lib_name] = executor.submit(
                    read_processor.process_single_end, read_path,
                    processed_read_path)
        self._evaluet_job_and_generate_stat_file(read_files_and_jobs)

    def _prepare_reads_paired_end(self):
        read_files_and_jobs = {}
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_path_pair, processed_read_path_pair in zip(
                    self._lib_names, self._paths.read_path_pairs,
                    self._paths.processed_read_path_pairs):
                for processed_read_path in processed_read_path_pair:
                    if not self._file_needs_to_be_created(
                            processed_read_path):
                        continue
                    read_processor = ReadProcessor(
                        poly_a_clipping=False,
                        min_read_length=self._args.min_read_length,
                        fastq=self._args.fastq,
                        min_phred_score=self._args.min_phred_score,
                        adapter=self._args.adapter)
                    read_files_and_jobs[lib_name] = executor.submit(
                        read_processor.process_paired_end, read_path_pair,
                        processed_read_path_pair)
        self._evaluet_job_and_generate_stat_file(read_files_and_jobs)

    def _evaluet_job_and_generate_stat_file(self, read_files_and_jobs):
        raw_stat_data_writer = RawStatDataWriter(pretty=True)
        # Evaluate thread outcome
        self._check_job_completeness(read_files_and_jobs.values())
        if not self._file_needs_to_be_created(
                self._paths.read_processing_stats_path):
            return
        # Create a dict of the read file names and the processing
        # counting results
        read_files_and_stats = dict(
            [(lib_name, job.result())
             for lib_name, job in read_files_and_jobs.items()])
        raw_stat_data_writer.write(
            read_files_and_stats, self._paths.read_processing_stats_path)

    def _align_single_end_reads(self):
        """Manage the actual alignment of single end reads."""
        read_aligner = Segemehl(self._args.segemehl_bin, self._args.progress)
        if self._file_needs_to_be_created(self._paths.index_path):
            read_aligner.build_index(self._paths.ref_seq_paths,
                                     self._paths.index_path)
        for read_path, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_paths,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.read_alignment_bam_paths):
            if not self._file_needs_to_be_created(output_path):
                continue
            elif not self._file_needs_to_be_created(bam_path):
                continue
            read_aligner.run_alignment(
                read_path, self._paths.index_path,
                self._paths.ref_seq_paths, output_path,
                int(self._args.processes), nomatch_path,
                int(self._args.hit_strategy),
                int(self._args.segemehl_accuracy),
                float(self._args.segemehl_evalue), self._args.split,
                paired_end=False)

    def _align_paired_end_reads(self):
        """Manage the actual alignment of paired end reads."""
        read_aligner = Segemehl(self._args.segemehl_bin, self._args.progress)
        if self._file_needs_to_be_created(self._paths.index_path):
            read_aligner.build_index(self._paths.ref_seq_paths,
                                     self._paths.index_path)
        for read_path_pair, output_path, nomatch_path, bam_path in zip(
                self._paths.processed_read_path_pairs,
                self._paths.primary_read_aligner_sam_paths,
                self._paths.unaligned_reads_paths,
                self._paths.primary_read_aligner_bam_paths):
            if not self._file_needs_to_be_created(output_path):
                continue
            elif not self._file_needs_to_be_created(bam_path):
                continue
            read_aligner.run_alignment(
                read_path_pair, self._paths.index_path,
                self._paths.ref_seq_paths, output_path,
                int(self._args.processes), nomatch_path,
                int(self._args.hit_strategy),
                int(self._args.segemehl_accuracy),
                float(self._args.segemehl_evalue), self._args.split,
                paired_end=True)

    def _realign_unmapped_reads(self):
        read_realigner = ReadRealigner(self._args.lack_bin,
                                       self._args.progress)
        for (query_fasta_path, query_sam_path, realignment_sam_path,
             unaligned_reads_path) in zip(
                self._paths.unaligned_reads_paths,
                self._paths.read_realigner_tmp_sam_paths,
                self._paths.read_realigner_sam_paths,
                self._paths.realigned_unaligned_reads_paths):
            read_realigner.run_alignment(
                query_fasta_path, query_sam_path, self._paths.ref_seq_paths,
                realignment_sam_path, unaligned_reads_path,
                int(self._args.processes),
                int(self._args.segemehl_accuracy))
            os.remove(query_sam_path)

    def _sam_to_bam(self, sam_paths, bam_prefixes_paths, bam_paths):
        """Manage the conversion of mapped reads from SAM to BAM format."""
        sam_to_bam_converter = SamToBamConverter()
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for sam_path, bam_prefix_path, bam_path in zip(
                    sam_paths, bam_prefixes_paths, bam_paths):
                if not self._file_needs_to_be_created(bam_path):
                    continue
                jobs.append(executor.submit(
                    sam_to_bam_converter.sam_to_bam, sam_path,
                    bam_prefix_path))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)

    def _generate_sorted_tmp_sam_file(self):
        sam_to_bam_converter = SamToBamConverter()
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for bam_path, sam_path in zip(
                    self._paths.primary_read_aligner_bam_paths,
                    self._paths.read_realigner_tmp_sam_paths):
                jobs.append(executor.submit(
                    sam_to_bam_converter.bam_to_sam, bam_path, sam_path))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)

    def _generate_read_alignment_stats(
            self, lib_names, result_bam_paths, unaligned_reads_paths,
            output_stats_path):
        """Manage the generation of alignment statistics."""
        raw_stat_data_writer = RawStatDataWriter(pretty=True)
        read_files_and_jobs = {}
        if not self._file_needs_to_be_created(output_stats_path):
            return
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for (lib_name, read_alignment_bam_path,
                 unaligned_reads_path) in zip(
                    lib_names, result_bam_paths, unaligned_reads_paths):
                read_aligner_stats = ReadAlignerStats()
                read_files_and_jobs[lib_name] = executor.submit(
                    read_aligner_stats.count, read_alignment_bam_path,
                    unaligned_reads_path)
        # Evaluate thread outcome
        self._check_job_completeness(read_files_and_jobs.values())
        read_files_and_stats = dict(
            [(lib_name, job.result())
             for lib_name, job in read_files_and_jobs.items()])
        raw_stat_data_writer.write(read_files_and_stats, output_stats_path)

    def _write_alignment_stat_table(self):
        """Manage the creation of the mapping statistic output table."""
        raw_stat_data_reader = RawStatDataReader()
        read_processing_stats = raw_stat_data_reader.read(
            self._paths.read_processing_stats_path)
        final_alignment_stats = raw_stat_data_reader.read(
            self._paths.read_alignments_stats_path)
        realignment_stats = None
        primary_aligner_stats = None
        if self._args.realign:
            primary_aligner_stats = raw_stat_data_reader.read(
                self._paths.primary_read_aligner_stats_path)
            realignment_stats = raw_stat_data_reader.read(
                self._paths.read_realigner_stats_path)
        read_aligner_stats_table = ReadAlignerStatsTable(
            read_processing_stats, final_alignment_stats,
            primary_aligner_stats, realignment_stats, self._lib_names,
            self._paths.read_alignment_stats_table_path,
            self._args.paired_end)
        read_aligner_stats_table.write()

    def _ref_ids_to_file(self, ref_seq_paths):
        """Translate the reference IDs to file paths."""
        ref_ids_to_file = {}
        fasta_parser = FastaParser()
        for ref_seq_path in ref_seq_paths:
            ref_seq_file = os.path.basename(ref_seq_path)
            with open(ref_seq_path) as ref_seq_fh:
                ref_seq_id = fasta_parser.header_id(
                    fasta_parser.single_entry_file_header(ref_seq_fh))
                ref_ids_to_file[ref_seq_id] = ref_seq_file
        return ref_ids_to_file

    def create_coverage_files(self):
        """Create coverage files based on the read alignments.

        The coverages are calculated per replicon and the results are
        written to the output file. This might be slower, but if all
        coverages were determined at once the data structure would
        become too large when working with large reference sequences.
        """
        self._test_folder_existance(self._paths.required_coverage_folders())
        raw_stat_data_reader = RawStatDataReader()
        alignment_stats = [raw_stat_data_reader.read(
            self._paths.read_alignments_stats_path)]
        lib_names = list(alignment_stats[0].keys())
        was_paired_end_alignment = self._was_paired_end_alignment(lib_names)
        if not was_paired_end_alignment:
            self._paths.set_read_files_dep_file_lists_single_end(
                self._paths.get_read_files(), lib_names)
        else:
            self._paths.set_read_files_dep_file_lists_paired_end(
                self._paths.get_read_files(), lib_names)
        # Get the number of aligned or uniquely aligned reads
        if not self._args.normalize_by_uniquely:
            aligned_counting = "no_of_aligned_reads"
        else:
            aligned_counting = "no_of_uniquely_aligned_reads"
        read_files_aligned_read_freq = dict([
            (read_file, round(attributes["stats_total"][aligned_counting]))
            for read_file, attributes in alignment_stats[0].items()])
        min_no_of_aligned_reads = float(min(
            read_files_aligned_read_freq.values()))
        # Run the generation of coverage in parallel
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, bam_path in zip(
                    lib_names, self._paths.read_alignment_bam_paths):
                no_of_aligned_reads = float(
                    read_files_aligned_read_freq[lib_name])
                jobs.append(executor.submit(
                    self._create_coverage_files_for_lib, lib_name, bam_path,
                    no_of_aligned_reads, min_no_of_aligned_reads))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)

    def _all_coverage_file_exist(
            self, lib_name, strands, no_of_aligned_reads,
            min_no_of_aligned_reads):
        """Test the existence of all coverage files of a library."""
        files = []
        for strand in strands:
            files.append(self._paths.wiggle_file_raw_path(lib_name, strand))
            files.append(self._paths.wiggle_file_tnoar_norm_min_path(
                lib_name, strand, multi=min_no_of_aligned_reads,
                div=no_of_aligned_reads))
            files.append(self._paths.wiggle_file_tnoar_norm_mil_path(
                lib_name, strand, multi=1000000, div=no_of_aligned_reads))
        if not any([self._file_needs_to_be_created(file, quiet=True)
                    for file in files]):
            sys.stderr.write(
                "The files %s exist. Skipping their generation.\n" %
                ", ".join(files))
            return True
        return False

    def _create_coverage_files_for_lib(
            self, lib_name, bam_path, no_of_aligned_reads,
            min_no_of_aligned_reads):
        """Perform the coverage calculation for a given library."""
        if not self._args.non_strand_specific:
            strands = ["forward", "reverse"]
        else:
            strands = ["forward_and_reverse"]
        if self._all_coverage_file_exist(
                lib_name, strands, no_of_aligned_reads,
                min_no_of_aligned_reads):
            return
        read_count_splitting = True
        if self._args.skip_read_count_splitting:
            read_count_splitting = False
        coverage_calculator = CoverageCalculator(
            read_count_splitting=read_count_splitting,
            uniquely_aligned_only=self._args.unique_only,
            coverage_style=self._args.coverage_style,
            clip_length=self._args.clip_length,
            non_strand_specific=self._args.non_strand_specific)
        (coverage_writers_raw, coverage_writers_tnoar_min_norm,
         coverage_writers_tnoar_mil_norm) = self._wiggle_writers(
            lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads)
        for ref_seq, coverages in coverage_calculator.ref_seq_and_coverages(
                bam_path):
            for strand in strands:
                coverage_writers_raw[strand].write_replicons_coverages(
                    ref_seq, coverages[strand])
                coverage_writers_tnoar_min_norm[
                    strand].write_replicons_coverages(
                        ref_seq, coverages[strand],
                        factor=min_no_of_aligned_reads / no_of_aligned_reads)
                coverage_writers_tnoar_mil_norm[
                    strand].write_replicons_coverages(
                        ref_seq, coverages[strand],
                        factor=1000000 / no_of_aligned_reads)
        for strand in strands:
            coverage_writers_raw[strand].close_file()

    def _wiggle_writers(self, lib_name, strands, no_of_aligned_reads,
                        min_no_of_aligned_reads):
        """Write the calculated coverages to wiggle files."""
        coverage_writers_raw = dict([
            (strand, WiggleWriter(
                "%s_%s" % (lib_name, strand),
                open(self._paths.wiggle_file_raw_path(lib_name, strand),
                     "w")))
            for strand in strands])
        coverage_writers_tnoar_min_norm = dict([
            (strand, WiggleWriter(
                "%s_%s" % (lib_name, strand),
                open(self._paths.wiggle_file_tnoar_norm_min_path(
                    lib_name, strand, multi=min_no_of_aligned_reads,
                    div=no_of_aligned_reads), "w")))
            for strand in strands])
        coverage_writers_tnoar_mil_norm = dict([
            (strand, WiggleWriter(
                "%s_%s" % (lib_name, strand),
                open(self._paths.wiggle_file_tnoar_norm_mil_path(
                    lib_name, strand, multi=1000000,
                    div=no_of_aligned_reads), "w")))
            for strand in strands])
        return (coverage_writers_raw, coverage_writers_tnoar_min_norm,
                coverage_writers_tnoar_mil_norm)

    def _check_job_completeness(self, jobs):
        """Check the completeness of each job in a list."""
        for job in concurrent.futures.as_completed(jobs):
            if job.exception():
                raise job.exception()

    def quantify_gene_wise(self):
        """Manage the counting of aligned reads per gene."""
        self._test_folder_existance(
            self._paths.required_gene_quanti_folders())
        norm_by_alignment_freq = True
        norm_by_overlap_freq = True
        if self._args.no_count_split_by_alignment_no:
            norm_by_alignment_freq = False
        if self._args.no_count_splitting_by_gene_no:
            norm_by_overlap_freq = False
        raw_stat_data_reader = RawStatDataReader()
        alignment_stats = [raw_stat_data_reader.read(
            self._paths.read_alignments_stats_path)]
        lib_names = sorted(list(alignment_stats[0].keys()))
        annotation_files = self._paths.get_annotation_files()
        self._paths.set_annotation_paths(annotation_files)
        was_paired_end_alignment = self._was_paired_end_alignment(lib_names)
        if not was_paired_end_alignment:
            self._paths.set_read_files_dep_file_lists_single_end(
                self._paths.get_read_files(), lib_names)
        else:
            self._paths.set_read_files_dep_file_lists_paired_end(
                self._paths.get_read_files(), lib_names)
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, read_alignment_path in zip(
                    lib_names, self._paths.read_alignment_bam_paths):
                jobs.append(executor.submit(
                    self._quantify_gene_wise, lib_name, read_alignment_path,
                    norm_by_alignment_freq, norm_by_overlap_freq,
                    annotation_files))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)
        self._gene_quanti_create_overview(
            annotation_files, self._paths.annotation_paths, lib_names)

    def _was_paired_end_alignment(self, lib_names):
        """Check if the mapping was done in paired- or single-end mode."""
        if len(lib_names) * 2 == len(self._paths.get_read_files()):
            return True
        return False

    def _quantify_gene_wise(
            self, lib_name, read_alignment_path, norm_by_alignment_freq,
            norm_by_overlap_freq, annotation_files):
        """Perform the gene wise quantification for a given library."""
        gene_quanti_paths = [
            self._paths.gene_quanti_path(lib_name, annotation_file)
            for annotation_file in annotation_files]
        # Check if all output files for this library exist - if so,
        # skip their creation
        if not any([self._file_needs_to_be_created(
                gene_quanti_path, quiet=True)
                for gene_quanti_path in gene_quanti_paths]):
            sys.stderr.write(
                "The file(s) %s exist(s). Skipping their/its generation.\n" %
                ", ".join(gene_quanti_paths))
            return
        gene_wise_quantification = GeneWiseQuantification(
            min_overlap=self._args.min_overlap,
            read_region=self._args.read_region,
            clip_length=self._args.clip_length,
            norm_by_alignment_freq=norm_by_alignment_freq,
            norm_by_overlap_freq=norm_by_overlap_freq,
            allowed_features_str=self._args.allowed_features,
            skip_antisense=self._args.skip_antisense,
            unique_only=self._args.unique_only)
        gene_wise_quantification.calc_overlaps_per_alignment(
            read_alignment_path, self._paths.annotation_paths)
        for annotation_file, annotation_path in zip(
                annotation_files, self._paths.annotation_paths):
            gene_wise_quantification.quantify(
                read_alignment_path, annotation_path,
                self._paths.gene_quanti_path(lib_name, annotation_file),
                self._args.pseudocounts)

    def _gene_quanti_create_overview(
            self, annotation_files, annotation_paths, lib_names):
        """Create an overview table of all gene quantifications for all
        libs."""
        strand_specific = True
        if self._args.non_strand_specific:
            strand_specific = False
        gene_wise_overview = GeneWiseOverview(
            allowed_features_str=self._args.allowed_features,
            skip_antisense=self._args.skip_antisense,
            strand_specific=strand_specific)
        path_and_name_combos = {}
        for annotation_file, annotation_path in zip(
                annotation_files, annotation_paths):
            path_and_name_combos[annotation_path] = []
            for read_file in lib_names:
                path_and_name_combos[annotation_path].append(
                    [read_file, self._paths.gene_quanti_path(
                        read_file, annotation_file)])
        if self._file_needs_to_be_created(
                self._paths.gene_wise_quanti_combined_path):
            gene_wise_overview.create_overview_raw_countings(
                path_and_name_combos, lib_names,
                self._paths.gene_wise_quanti_combined_path)
        if self._file_needs_to_be_created(
                self._paths.gene_wise_quanti_combined_rpkm_path):
            gene_wise_overview.create_overview_rpkm(
                path_and_name_combos, lib_names,
                self._paths.gene_wise_quanti_combined_rpkm_path,
                self._libs_and_total_num_of_aligned_reads())
        if self._file_needs_to_be_created(
                self._paths.gene_wise_quanti_combined_tnoar_path):
            gene_wise_overview.create_overview_norm_by_tnoar(
                path_and_name_combos, lib_names,
                self._paths.gene_wise_quanti_combined_tnoar_path,
                self._libs_and_total_num_of_aligned_reads())

    def _libs_and_total_num_of_aligned_reads(self):
        """Read the total number of aligned reads per library."""
        with open(self._paths
                  .read_alignments_stats_path) as read_aligner_stats_fh:
            read_aligner_stats = json.loads(read_aligner_stats_fh.read())
        return dict([(lib, values["stats_total"]["no_of_aligned_reads"])
                     for lib, values in read_aligner_stats.items()])

    def _libs_and_total_num_of_uniquely_aligned_reads(self):
        """Read the total number of uniquely aligned reads per library."""
        with open(self._paths
                  .read_alignments_stats_path) as read_aligner_stats_fh:
            read_aligner_stats = json.loads(read_aligner_stats_fh.read())
        return dict([
            (lib, values["stats_total"]["no_of_uniquely_aligned_reads"])
            for lib, values in read_aligner_stats.items()])

    def compare_with_deseq(self):
        """Manage the pairwise expression comparison with DESeq."""
        self._test_folder_existance(self._paths.required_deseq_folders())
        arg_libs = [self._paths._clean_file_name(lib)
                    for lib in self._args.libs.split(",")]
        conditions = self._args.conditions.split(",")
        self._check_deseq_args(arg_libs, conditions)
        deseq_runner = DESeqRunner(
            arg_libs, conditions, self._paths.deseq_raw_folder,
            self._paths.deseq_extended_folder, self._paths.deseq_script_path,
            self._paths.deseq_pca_heatmap_path,
            self._paths.gene_wise_quanti_combined_path,
            self._paths.deseq_tmp_session_info_script,
            self._paths.deseq_session_info,
            self._args.cooks_cutoff_off)
        deseq_runner.create_deseq_script_file()
        deseq_runner.write_session_info_file()
        deseq_runner.run_deseq()
        deseq_runner.merge_counting_files_with_results()

    def _check_deseq_args(self, arg_libs, conditions):
        """Test if the given arguments are sufficient."""
        if len(arg_libs) != len(conditions):
            self._write_err_msg_and_quit(
                "Error - The read library file list and condition list must "
                "have the same number of elements. You entered \n%s "
                "(= %s elements)\nand \n%s (= %s elements).\n" % (
                    self._args.libs, len(arg_libs), self._args.conditions,
                    len(conditions)))
        raw_stat_data_reader = RawStatDataReader()
        alignment_stats = [raw_stat_data_reader.read(
            self._paths.read_alignments_stats_path)]
        lib_names = list(alignment_stats[0].keys())
        if len(lib_names) != len(arg_libs):
            self._write_err_msg_and_quit(
                "The number of read libraries is lower or higher than "
                "expected. The following read libs are available: %s\nThe "
                "following read list string is suggested: \"%s\"\n" % (
                    ", ".join(lib_names), ",".join(lib_names)))
        for lib in lib_names:
            if lib not in arg_libs:
                self._write_err_msg_and_quit(
                    "The library \"%s\" is not present in your list of "
                    "libraries. Please add it.\n" % (lib))

    def _write_err_msg_and_quit(self, msg):
        """Write an error message and close the program gracefully."""
        sys.stderr.write(msg)
        sys.exit(1)

    def viz_align(self):
        """Generate plots based on the read processing and mapping."""
        from reademptionlib.vizalign import AlignViz
        align_viz = AlignViz(
            self._paths.get_lib_names_single_end()
            if not self._args.paired_end
            else self._paths.get_lib_names_paired_end(),
            self._paths.read_processing_stats_path,
            self._paths.read_alignments_stats_path)
        align_viz.read_stat_files()
        align_viz.plot_input_read_length(
            self._paths.viz_align_input_read_length_plot_path)
        align_viz.plot_processed_read_length(
            self._paths.viz_align_processed_reads_length_plot_path)

    def viz_gene_quanti(self):
        """Generate plots based on the gene-wise read countings."""
        from reademptionlib.vizgenequanti import GeneQuantiViz
        gene_quanti_viz = GeneQuantiViz(
            self._paths.gene_wise_quanti_combined_path,
            self._paths.get_lib_names_single_end()
            if not self._args.paired_end
            else self._paths.get_lib_names_paired_end())
        gene_quanti_viz.parse_input_table()
        gene_quanti_viz.plot_correlations(
            self._paths.viz_gene_quanti_scatter_plot_path)
        gene_quanti_viz.plot_annotation_class_quantification(
            self._paths.viz_gene_quanti_rna_classes_plot_path)

    def viz_deseq(self):
        """Generate plots based on the DESeq analysis."""
        from reademptionlib.vizdeseq import DESeqViz
        deseq_path_template = (
            self._paths.deseq_raw_folder + "/deseq_comp_%s_vs_%s.csv")
        deseq_viz = DESeqViz(
            self._paths.deseq_script_path, deseq_path_template,
            max_pvalue=self._args.max_pvalue)
        deseq_viz.create_scatter_plots(
            self._paths.viz_deseq_scatter_plot_path)
        deseq_viz.create_volcano_plots(
            self._paths.viz_deseq_volcano_plot_path,
            self._paths.viz_deseq_volcano_plot_adj_path)

    def viz_align_2(self):
        from reademptionlib.vizalign2 import AlignViz2
        align_viz2 = AlignViz2()
        if self._args.paired_end:
            lib_names = self._paths.get_lib_names_paired_end()
        else:
            lib_names = self._paths.get_lib_names_single_end()
        align_viz2.alignment_stats(
            self._paths.read_alignments_stats_path,
            str(self._paths.viz_align_base_folder) + '/' + "_".join(
                lib_names + ['out_json_alignment']),
            str(self._paths.viz_align_base_folder) + '/' + "_".join(
                lib_names + ['BAM_stats_alignment']))
        align_viz2.process_stats(
            self._paths.read_processing_stats_path,
            str(self._paths.viz_align_base_folder) + '/' + "_".join(
                lib_names + ['out_json_process']),
            str(self._paths.viz_align_base_folder) + '/' + "_".join(
                lib_names + ['BAM_stats_process']))

    def viz_align_TK(self):
        """Generate plots based on the read processing and mapping."""
        from reademptionlib.vizalign2 import AlignViz2
        align_viz_tk = AlignViz2()
        if self._args.input_align:
            align_viz_tk.alignment_stats(
                str(self._args.input_align),
                str(self._args.output_align),
                str(self._args.output_align))
        if self._args.input_process:
            align_viz_tk.process_stats(
                str(self._args.input_process),
                str(self._args.output_process),
                str(self._args.output_process))
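# Sketch of the lib/condition pairing that _check_deseq_args above
# expects: both are comma-separated strings of equal length, one
# condition per library, passed via the libs and conditions arguments.
# The library and condition names below are hypothetical.
def _demo_deseq_lib_condition_pairing():
    libs = "wt_rep_1,wt_rep_2,mut_rep_1,mut_rep_2"
    conditions = "wild_type,wild_type,mutant,mutant"
    assert len(libs.split(",")) == len(conditions.split(","))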
class Controller(object): """Manage the actions of the subcommands. The Controller takes care of providing the arguments like path names and the parallel processing of tasks. """ def __init__(self, args): """Create an instance.""" self._args = args self._paths = Paths(args.project_path) self._read_files = None self._ref_seq_files = None def create_project(self, version): """Create a new project.""" sys.stdout.write( " ___ _______ ___ __ _\n" " / _ \\/ __/ _ | / _ \\___ __ _ ___ / /_(_)__ ___\n" " / , _/ _// __ |/ // / -_) ' \\/ _ \\/ __/ / _ \\/ _ \\\n" "/_/|_/___/_/ |_/____/\\__/_/_/_/ .__/\\__/_/\\___/_//_/\n" " / /\n" "====================================================\n" "========================================\n" "=======================\n" "==============\n\n" "[https://reademption.readthedocs.io/en/latest/]\n\n") project_creator = ProjectCreator() project_creator.create_root_folder(self._args.project_path) project_creator.create_subfolders(self._paths.required_folders()) project_creator.create_version_file(self._paths.version_path, version) sys.stdout.write('Created folder "%s" and required subfolders.\n' % (self._args.project_path)) sys.stdout.write( 'Please copy read files into folder "%s" and ' 'reference sequences files into folder "%s".\n' % (self._paths.read_fasta_folder, self._paths.ref_seq_folder)) def align_reads(self): """Perform the alignment of the reads.""" self._args.realign = False self._test_folder_existance( self._paths.required_read_alignment_folders()) assert self._args.paired_end in [True, False] self._ref_seq_files = self._paths.get_ref_seq_files() self._paths.set_ref_seq_paths(self._ref_seq_files) self._test_align_file_existance() if not self._args.paired_end: # Single end reads self._read_files = self._paths.get_read_files() self._lib_names = self._paths.get_lib_names_single_end() self._paths.set_read_files_dep_file_lists_single_end( self._read_files, self._lib_names) if not self._args.realign: self._set_primary_aligner_paths_to_final_paths() self._prepare_reads_single_end() self._align_single_end_reads() else: # Paired end reads self._read_file_pairs = self._paths.get_read_file_pairs() self._lib_names = self._paths.get_lib_names_paired_end() self._paths.set_read_files_dep_file_lists_paired_end( self._read_file_pairs, self._lib_names) if not self._args.realign: self._set_primary_aligner_paths_to_final_paths() self._prepare_reads_paired_end() self._align_paired_end_reads() # self._sam_to_bam( # self._paths.primary_read_aligner_sam_paths, # self._paths.primary_read_aligner_bam_prefix_paths, # self._paths.primary_read_aligner_bam_paths) self._generate_read_alignment_stats( self._lib_names, self._paths.primary_read_aligner_bam_paths, self._paths.unaligned_reads_paths, self._paths.primary_read_aligner_stats_path, ) final_unaligned_reads_paths = self._paths.unaligned_reads_paths if self._args.realign: self._run_realigner_and_process_alignments() self._merge_bam_files() final_unaligned_reads_paths = ( self._paths.realigned_unaligned_reads_paths) if self._args.crossalign_cleaning_str is not None: self._remove_crossaligned_reads() self._generate_read_alignment_stats( self._lib_names, self._paths.read_alignment_bam_paths, final_unaligned_reads_paths, self._paths.read_alignments_stats_path, ) self._write_alignment_stat_table() def _remove_crossaligned_reads(self): self._string_to_species_and_sequence_ids() jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for ( bam_path, bam_with_crossmappings_path, 
bam_cleaned_tmp_path, crossmapped_reads_path, ) in zip( self._paths.read_alignment_bam_paths, self._paths.read_alignment_bam_with_crossmappings_paths, self._paths.read_alignment_bam_cross_cleaned_tmp_paths, self._paths.crossmapped_reads_paths, ): jobs.append( executor.submit( self._remove_crossaligned_reads_for_lib, bam_path, bam_with_crossmappings_path, bam_cleaned_tmp_path, crossmapped_reads_path, )) # Evaluate thread outcome self._check_job_completeness(jobs) def _remove_crossaligned_reads_for_lib( self, bam_path, bam_with_crossmappings_path, bam_cleaned_tmp_path, crossmapped_reads_path, ): # Perform the removal or cross aligned reads cross_align_filter = CrossAlignFilter( bam_path, bam_cleaned_tmp_path, crossmapped_reads_path, self._species_and_sequence_ids, ) cross_align_filter.determine_crossmapped_reads() cross_align_filter.write_crossmapping_free_bam() # Rename the original mapping file that potentially # contains cross aligned reads os.rename(bam_path, bam_with_crossmappings_path) os.rename(bam_path + ".bai", bam_with_crossmappings_path + ".bai") # Move the cross aligned filtered file to the final mapping # path os.rename(bam_cleaned_tmp_path, bam_path) os.rename(bam_cleaned_tmp_path + ".bai", bam_path + ".bai") def _string_to_species_and_sequence_ids(self): self._species_and_sequence_ids = {} orgs_and_seq_ids_strs = self._args.crossalign_cleaning_str.split(";") if len(orgs_and_seq_ids_strs) < 2: self._write_err_msg_and_quit( "Error! Only one organism is defined for the cross align " "removal. This does not make sense.\nYou gave the " "following input:\n%s\n" % self._args.crossalign_cleaning_str) for org_and_seq_ids_str in orgs_and_seq_ids_strs: org, seq_ids_str = org_and_seq_ids_str.strip().split(":") seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")] if "" in seq_ids: seq_ids.remove("") if len(seq_ids) < 1: self._write_err_msg_and_quit( "Error! No sequence ID was given for the species '%s'. 
" "This does not make sense.\nYou gave the " "following input:\n%s\n" % (org, self._args.crossalign_cleaning_str)) self._species_and_sequence_ids[org] = seq_ids def _set_primary_aligner_paths_to_final_paths(self): # If no remapping is performed the paths of the final bam files # is the paths of the primary mapper self._paths.primary_read_aligner_bam_prefix_paths = ( self._paths.read_alignment_bam_prefix_paths) self._paths.primary_read_aligner_bam_paths = ( self._paths.read_alignment_bam_paths) self._paths.primary_read_aligner_stats_path = ( self._paths.read_alignments_stats_path) def _merge_bam_files(self): jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for merged_bam, primary_aligner_bam, realigner_bam in zip( self._paths.read_alignment_bam_paths, self._paths.primary_read_aligner_bam_paths, self._paths.read_realigner_bam_paths, ): bam_merger = BamMerger() jobs.append( executor.submit( bam_merger.merge, merged_bam, primary_aligner_bam, realigner_bam, )) self._check_job_completeness(jobs) if not self._args.keep_original_alignments: for bam_file_list in [ self._paths.primary_read_aligner_bam_paths, self._paths.read_realigner_bam_paths, ]: for bam_file in bam_file_list: os.remove(bam_file) os.remove("%s.bai" % bam_file) def _run_realigner_and_process_alignments(self): # As the realigner needs a *sorted* SAM file self._generate_sorted_tmp_sam_file() self._realign_unmapped_reads() # self._sam_to_bam( # self._paths.read_realigner_sam_paths, # self._paths.read_realigner_bam_prefixes_paths, # self._paths.read_realigner_sam_paths) self._generate_read_alignment_stats( self._lib_names, self._paths.read_realigner_bam_paths, self._paths.realigned_unaligned_reads_paths, self._paths.read_realigner_stats_path, ) def _test_align_file_existance(self): """Test if the input file for the the align subcommand exist.""" if len(self._paths.get_read_files()) == 0: self._write_err_msg_and_quit("Error! No read libraries given!\n") if len(self._ref_seq_files) == 0: self._write_err_msg_and_quit( "Error! No reference sequence files given!\n") def _test_folder_existance(self, task_specific_folders): """Test the existance of required folders.""" for folder in (self._paths.required_base_folders() + task_specific_folders): if not os.path.exists(folder): self._write_err_msg_and_quit( "Error! Folder '%s' does not exist! Is the given project " "folder name correct?\n" % folder) def _file_needs_to_be_created(self, file_path, quiet=False): """Test if a file exists of need to be created.""" if not self._args.check_for_existing_files: return True if os.path.exists(file_path): if not quiet: sys.stderr.write("File %s exists. 
Skipping its generation.\n" % file_path) return False return True def _prepare_reads_single_end(self): """Manage the preparation of reads before the actual mapping.""" read_files_and_jobs = {} with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for lib_name, read_path, processed_read_path in zip( self._lib_names, self._paths.read_paths, self._paths.processed_read_paths, ): if not self._file_needs_to_be_created(processed_read_path): continue read_processor = ReadProcessor( poly_a_clipping=self._args.poly_a_clipping, min_read_length=self._args.min_read_length, fastq=self._args.fastq, min_phred_score=self._args.min_phred_score, adapter=self._args.adapter, reverse_complement=self._args.reverse_complement, ) read_files_and_jobs[lib_name] = executor.submit( read_processor.process_single_end, read_path, processed_read_path, ) self._evaluate_job_and_generate_stat_file(read_files_and_jobs) def _prepare_reads_paired_end(self): """Manage the preparation of paired-end reads before the actual mapping.""" read_files_and_jobs = {} with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for lib_name, read_path_pair, processed_read_path_pair in zip( self._lib_names, self._paths.read_path_pairs, self._paths.processed_read_path_pairs, ): for processed_read_path in processed_read_path_pair: if not self._file_needs_to_be_created(processed_read_path): continue read_processor = ReadProcessor( poly_a_clipping=False, min_read_length=self._args.min_read_length, fastq=self._args.fastq, min_phred_score=self._args.min_phred_score, adapter=self._args.adapter, reverse_complement=self._args.reverse_complement, ) read_files_and_jobs[lib_name] = executor.submit( read_processor.process_paired_end, read_path_pair, processed_read_path_pair, ) self._evaluate_job_and_generate_stat_file(read_files_and_jobs) def _evaluate_job_and_generate_stat_file(self, read_files_and_jobs): """Collect the job results and write the read processing statistics file.""" raw_stat_data_writer = RawStatDataWriter(pretty=True) # Evaluate thread outcome self._check_job_completeness(read_files_and_jobs.values()) if not self._file_needs_to_be_created( self._paths.read_processing_stats_path): return # Create a dict of the read file names and the processing # counting results read_files_and_stats = dict([ (lib_name, job.result()) for lib_name, job in read_files_and_jobs.items() ]) raw_stat_data_writer.write(read_files_and_stats, self._paths.read_processing_stats_path) def _align_single_end_reads(self): """Manage the actual alignment of single end reads.""" read_aligner = ReadAligner(self._args.segemehl_bin, self._args.progress) if self._file_needs_to_be_created(self._paths.index_path): read_aligner.build_index(self._paths.ref_seq_paths, self._paths.index_path) for read_path, output_path, nomatch_path, bam_path in zip( self._paths.processed_read_paths, self._paths.primary_read_aligner_bam_paths, self._paths.unaligned_reads_paths, self._paths.read_alignment_bam_paths, ): if not self._file_needs_to_be_created(output_path): continue elif not self._file_needs_to_be_created(bam_path): continue read_aligner.run_alignment( read_path, self._paths.index_path, self._paths.ref_seq_paths, output_path, nomatch_path, int(self._args.processes), int(self._args.segemehl_accuracy), float(self._args.segemehl_evalue), self._args.split, paired_end=False, ) def _align_paired_end_reads(self): """Manage the actual alignment of paired end reads.""" read_aligner = ReadAligner(self._args.segemehl_bin, self._args.progress) if self._file_needs_to_be_created(self._paths.index_path): read_aligner.build_index(self._paths.ref_seq_paths, self._paths.index_path) for
read_path_pair, output_path, nomatch_path in zip( self._paths.processed_read_path_pairs, self._paths.primary_read_aligner_bam_paths, self._paths.unaligned_reads_paths, ): if not self._file_needs_to_be_created(output_path): continue # elif not self._file_needs_to_be_created(bam_path): # continue read_aligner.run_alignment( read_path_pair, self._paths.index_path, self._paths.ref_seq_paths, output_path, nomatch_path, int(self._args.processes), int(self._args.segemehl_accuracy), float(self._args.segemehl_evalue), self._args.split, paired_end=True, ) def _realign_unmapped_reads(self): read_realigner = ReadRealigner(self._args.lack_bin, self._args.progress) for ( query_fasta_path, query_sam_path, realignment_sam_path, unaligned_reads_path, ) in zip( self._paths.unaligned_reads_paths, self._paths.read_realigner_tmp_sam_paths, self._paths.read_realigner_sam_paths, self._paths.realigned_unaligned_reads_paths, ): read_realigner.run_alignment( query_fasta_path, query_sam_path, self._paths.ref_seq_paths, realignment_sam_path, unaligned_reads_path, int(self._args.processes), int(self._args.segemehl_accuracy), ) os.remove(query_sam_path) def _sam_to_bam(self, sam_paths, bam_prefixes_paths, bam_paths): """Manage the conversion of mapped read from SAM to BAM format.""" sam_to_bam_converter = SamToBamConverter() jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for sam_path, bam_prefix_path, bam_path in zip( sam_paths, bam_prefixes_paths, bam_paths): if not self._file_needs_to_be_created(bam_path): continue jobs.append( executor.submit( sam_to_bam_converter.sam_to_bam, sam_path, bam_prefix_path, )) # Evaluate thread outcome self._check_job_completeness(jobs) def _generate_sorted_tmp_sam_file(self): sam_to_bam_converter = SamToBamConverter() jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for bam_path, sam_path in zip( self._paths.primary_read_aligner_bam_paths, self._paths.read_realigner_tmp_sam_paths, ): jobs.append( executor.submit(sam_to_bam_converter.bam_to_sam, bam_path, sam_path)) # Evaluate thread outcome self._check_job_completeness(jobs) def _generate_read_alignment_stats( self, lib_names, result_bam_paths, unaligned_reads_paths, output_stats_path, ): """Manage the generation of alingment statistics.""" raw_stat_data_writer = RawStatDataWriter(pretty=True) read_files_and_jobs = {} if not self._file_needs_to_be_created(output_stats_path): return with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for ( lib_name, read_alignment_bam_path, unaligned_reads_path, ) in zip(lib_names, result_bam_paths, unaligned_reads_paths): read_aligner_stats = ReadAlignerStats() read_files_and_jobs[lib_name] = executor.submit( read_aligner_stats.count, read_alignment_bam_path, unaligned_reads_path, ) # Evaluate thread outcome self._check_job_completeness(read_files_and_jobs.values()) read_files_and_stats = dict([ (lib_name, job.result()) for lib_name, job in read_files_and_jobs.items() ]) raw_stat_data_writer.write(read_files_and_stats, output_stats_path) def _write_alignment_stat_table(self): """Manage the creation of the mapping statistic output table.""" raw_stat_data_reader = RawStatDataReader() read_processing_stats = raw_stat_data_reader.read( self._paths.read_processing_stats_path) final_alignment_stats = raw_stat_data_reader.read( self._paths.read_alignments_stats_path) realignment_stats = None primary_aligner_stats = None if self._args.realign: primary_aligner_stats 
= raw_stat_data_reader.read( self._paths.primary_read_aligner_stats_path) realignment_stats = raw_stat_data_reader.read( self._paths.read_realigner_stats_path) read_aligner_stats_table = ReadAlignerStatsTable( read_processing_stats, final_alignment_stats, primary_aligner_stats, realignment_stats, self._lib_names, self._paths.read_alignment_stats_table_path, self._args.paired_end, ) read_aligner_stats_table.write() def _ref_ids_to_file(self, ref_seq_paths): """Translate the reference ID to file paths.""" ref_ids_to_file = {} fasta_parser = FastaParser() for ref_seq_path in ref_seq_paths: ref_seq_file = os.path.basename(ref_seq_path) with open(ref_seq_path) as ref_seq_fh: ref_seq_id = fasta_parser.header_id( fasta_parser.single_entry_file_header(ref_seq_fh)) ref_ids_to_file[ref_seq_id] = ref_seq_file return ref_ids_to_file def create_coverage_files(self): """Create coverage files based on the read alignments. The coverages are calculated per replicon and the results are written to the output file. This might be slower but if all coverages are determined at once the data structure will become too large when working with large reference sequences. """ self._test_folder_existance(self._paths.required_coverage_folders()) raw_stat_data_reader = RawStatDataReader() alignment_stats = [ raw_stat_data_reader.read(self._paths.read_alignments_stats_path) ] lib_names = list(alignment_stats[0].keys()) was_paired_end_alignment = self._was_paired_end_alignment(lib_names) if not was_paired_end_alignment: self._paths.set_read_files_dep_file_lists_single_end( self._paths.get_read_files(), lib_names) else: self._paths.set_read_files_dep_file_lists_paired_end( self._paths.get_read_files(), lib_names) # Get number of aligned or number of uniquely aligned reads if not self._args.normalize_by_uniquely: aligned_counting = "no_of_aligned_reads" else: aligned_counting = "no_of_uniquely_aligned_reads" read_files_aligned_read_freq = dict([ (read_file, round(attributes["stats_total"][aligned_counting])) for read_file, attributes in alignment_stats[0].items() ]) min_no_of_aligned_reads = float( min(read_files_aligned_read_freq.values())) # Run the generation of coverage in parallel jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for lib_name, bam_path in zip( lib_names, self._paths.read_alignment_bam_paths): no_of_aligned_reads = float( read_files_aligned_read_freq[lib_name]) jobs.append( executor.submit( self._create_coverage_files_for_lib, lib_name, bam_path, no_of_aligned_reads, min_no_of_aligned_reads, )) # Evaluate thread outcome self._check_job_completeness(jobs) def _all_coverage_file_exist(self, lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): """Test the existance of all coverage files of a library""" files = [] for strand in strands: files.append(self._paths.wiggle_file_raw_path(lib_name, strand)) files.append( self._paths.wiggle_file_tnoar_norm_min_path( lib_name, strand, multi=min_no_of_aligned_reads, div=no_of_aligned_reads, )) files.append( self._paths.wiggle_file_tnoar_norm_mil_path( lib_name, strand, multi=1000000, div=no_of_aligned_reads)) if not any([ self._file_needs_to_be_created(file, quiet=True) for file in files ]): sys.stderr.write( "The files %s exists. 
Skipping their generation.\n" % ", ".join(files)) return True return False def _create_coverage_files_for_lib(self, lib_name, bam_path, no_of_aligned_reads, min_no_of_aligned_reads): """Perform the coverage calculation for a given library.""" if not self._args.non_strand_specific: strands = ["forward", "reverse"] else: strands = ["forward_and_reverse"] if self._all_coverage_file_exist(lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): return read_count_splitting = True if self._args.skip_read_count_splitting: read_count_splitting = False coverage_calculator = CoverageCalculator( read_count_splitting=read_count_splitting, uniquely_aligned_only=self._args.unique_only, coverage_style=self._args.coverage_style, clip_length=self._args.clip_length, non_strand_specific=self._args.non_strand_specific, ) ( coverage_writers_raw, coverage_writers_tnoar_min_norm, coverage_writers_tnoar_mil_norm, ) = self._wiggle_writers(lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads) for ref_seq, coverages in coverage_calculator.ref_seq_and_coverages( bam_path): for strand in strands: coverage_writers_raw[strand].write_replicons_coverages( ref_seq, coverages[strand]) coverage_writers_tnoar_min_norm[ strand].write_replicons_coverages( ref_seq, coverages[strand], factor=min_no_of_aligned_reads / no_of_aligned_reads, ) coverage_writers_tnoar_mil_norm[ strand].write_replicons_coverages( ref_seq, coverages[strand], factor=1000000 / no_of_aligned_reads, ) for strand in strands: coverage_writers_raw[strand].close_file() def _wiggle_writers(self, lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): """Write the calculated coverages to wiggle files.""" coverage_writers_raw = dict([( strand, WiggleWriter( "%s_%s" % (lib_name, strand), open( self._paths.wiggle_file_raw_path(lib_name, strand), "w", ), ), ) for strand in strands]) coverage_writers_tnoar_min_norm = dict([( strand, WiggleWriter( "%s_%s" % (lib_name, strand), open( self._paths.wiggle_file_tnoar_norm_min_path( lib_name, strand, multi=min_no_of_aligned_reads, div=no_of_aligned_reads, ), "w", ), ), ) for strand in strands]) coverage_writers_tnoar_mil_norm = dict([( strand, WiggleWriter( "%s_%s" % (lib_name, strand), open( self._paths.wiggle_file_tnoar_norm_mil_path( lib_name, strand, multi=1000000, div=no_of_aligned_reads, ), "w", ), ), ) for strand in strands]) return ( coverage_writers_raw, coverage_writers_tnoar_min_norm, coverage_writers_tnoar_mil_norm, ) def _check_job_completeness(self, jobs): """Check the completness of each job in a list""" for job in concurrent.futures.as_completed(jobs): if job.exception(): raise (job.exception()) def quantify_gene_wise(self): """Manage the counting of aligned reads per gene.""" self._test_folder_existance(self._paths.required_gene_quanti_folders()) norm_by_alignment_freq = True norm_by_overlap_freq = True if self._args.no_count_split_by_alignment_no: norm_by_alignment_freq = False if self._args.no_count_splitting_by_gene_no: norm_by_overlap_freq = False raw_stat_data_reader = RawStatDataReader() alignment_stats = [ raw_stat_data_reader.read(self._paths.read_alignments_stats_path) ] lib_names = sorted(list(alignment_stats[0].keys())) annotation_files = self._paths.get_annotation_files() self._paths.set_annotation_paths(annotation_files) was_paired_end_alignment = self._was_paired_end_alignment(lib_names) if not was_paired_end_alignment: self._paths.set_read_files_dep_file_lists_single_end( self._paths.get_read_files(), lib_names) else: 
self._paths.set_read_files_dep_file_lists_paired_end( self._paths.get_read_files(), lib_names) jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for lib_name, read_alignment_path in zip( lib_names, self._paths.read_alignment_bam_paths): jobs.append( executor.submit( self._quantify_gene_wise, lib_name, read_alignment_path, norm_by_alignment_freq, norm_by_overlap_freq, annotation_files, )) # Evaluate thread outcome self._check_job_completeness(jobs) self._gene_quanti_create_overview(annotation_files, self._paths.annotation_paths, lib_names) def _was_paired_end_alignment(self, lib_names): """Check if the mapping was done in paired- or single-end mode""" if len(lib_names) * 2 == len(self._paths.get_read_files()): return True return False def _quantify_gene_wise( self, lib_name, read_alignment_path, norm_by_alignment_freq, norm_by_overlap_freq, annotation_files, ): """Perform the gene wise quantification for a given library.""" gene_quanti_paths = [ self._paths.gene_quanti_path(lib_name, annotation_file) for annotation_file in annotation_files ] # Check if all output files for this library exist - if so # skip their creation if not any([ self._file_needs_to_be_created(gene_quanti_path, quiet=True) for gene_quanti_path in gene_quanti_paths ]): sys.stderr.write( "The file(s) %s exist(s). Skipping their/its generation.\n" % ", ".join(gene_quanti_paths)) return strand_specific = True if self._args.non_strand_specific: strand_specific = False gene_wise_quantification = GeneWiseQuantification( min_overlap=self._args.min_overlap, read_region=self._args.read_region, clip_length=self._args.clip_length, norm_by_alignment_freq=norm_by_alignment_freq, norm_by_overlap_freq=norm_by_overlap_freq, allowed_features_str=self._args.allowed_features, add_antisense=self._args.add_antisense, antisense_only=self._args.antisense_only, strand_specific=strand_specific, unique_only=self._args.unique_only, ) if norm_by_overlap_freq: gene_wise_quantification.calc_overlaps_per_alignment( read_alignment_path, self._paths.annotation_paths) for annotation_file, annotation_path in zip( annotation_files, self._paths.annotation_paths): gene_wise_quantification.quantify( read_alignment_path, annotation_path, self._paths.gene_quanti_path(lib_name, annotation_file), self._args.pseudocounts, ) def _gene_quanti_create_overview(self, annotation_files, annotation_paths, lib_names): """Create an overview table of all gene quantification for all libs.""" strand_specific = True if self._args.non_strand_specific: strand_specific = False gene_wise_overview = GeneWiseOverview( allowed_features_str=self._args.allowed_features, add_antisense=self._args.add_antisense, antisense_only=self._args.antisense_only, strand_specific=strand_specific, ) path_and_name_combos = {} for annotation_file, annotation_path in zip(annotation_files, annotation_paths): path_and_name_combos[annotation_path] = [] for read_file in lib_names: path_and_name_combos[annotation_path].append([ read_file, self._paths.gene_quanti_path(read_file, annotation_file), ]) if self._file_needs_to_be_created( self._paths.gene_wise_quanti_combined_path): gene_wise_overview.create_overview_raw_countings( path_and_name_combos, lib_names, self._paths.gene_wise_quanti_combined_path, ) if self._file_needs_to_be_created( self._paths.gene_wise_quanti_combined_rpkm_path): gene_wise_overview.create_overview_rpkm( path_and_name_combos, lib_names, self._paths.gene_wise_quanti_combined_rpkm_path, self._libs_and_total_num_of_aligned_reads(), ) if 
self._file_needs_to_be_created( self._paths.gene_wise_quanti_combined_tnoar_path): gene_wise_overview.create_overview_norm_by_tnoar( path_and_name_combos, lib_names, self._paths.gene_wise_quanti_combined_tnoar_path, self._libs_and_total_num_of_aligned_reads(), ) if self._file_needs_to_be_created( self._paths.gene_wise_quanti_combined_tpm_path): gene_wise_overview.create_overview_tpm( self._paths.gene_wise_quanti_combined_path, self._paths.gene_wise_quanti_combined_tpm_path, ) def _libs_and_total_num_of_aligned_reads(self): """Read the total number of reads per library.""" with open(self._paths.read_alignments_stats_path ) as read_aligner_stats_fh: read_aligner_stats = json.loads(read_aligner_stats_fh.read()) return dict([(lib, values["stats_total"]["no_of_aligned_reads"]) for lib, values in read_aligner_stats.items()]) def _libs_and_total_num_of_uniquely_aligned_reads(self): """Read the total number of reads per library.""" with open(self._paths.read_alignments_stats_path ) as read_aligner_stats_fh: read_aligner_stats = json.loads(read_aligner_stats_fh.read()) return dict([(lib, values["stats_total"]["no_of_uniquely_aligned_reads"]) for lib, values in read_aligner_stats.items()]) def compare_with_deseq(self): """Manage the pairwise expression comparison with DESeq.""" self._test_folder_existance(self._paths.required_deseq_folders()) arg_libs = [ self._paths._clean_file_name(lib) for lib in self._args.libs.split(",") ] conditions = self._args.conditions.split(",") self._check_deseq_args(arg_libs, conditions) deseq_runner = DESeqRunner( arg_libs, conditions, self._paths.deseq_raw_folder, self._paths.deseq_extended_folder, self._paths.deseq_script_path, self._paths.deseq_pca_heatmap_path, self._paths.gene_wise_quanti_combined_path, self._paths.deseq_tmp_session_info_script, self._paths.deseq_session_info, self._args.fc_shrinkage_off, self._args.cooks_cutoff_off, ) deseq_runner.create_deseq_script_file() deseq_runner.write_session_info_file() deseq_runner.run_deseq() deseq_runner.merge_counting_files_with_results() def _check_deseq_args(self, arg_libs, conditions): """Test if the given arguments are sufficient.""" if len(arg_libs) != len(conditions): self._write_err_msg_and_quit( "Error - The read library file list and condition list must " "have the same number of elements. You entered \n%s " "(= %s elements)\nand \n%s (= %s elements).\n" % ( self._args.libs, len(arg_libs), self._args.conditions, len(conditions), )) raw_stat_data_reader = RawStatDataReader() alignment_stats = [ raw_stat_data_reader.read(self._paths.read_alignments_stats_path) ] lib_names = list(alignment_stats[0].keys()) if len(lib_names) != len(arg_libs): self._write_err_msg_and_quit( "The number of read libraries is lower or higher than " "expected. The following read libs are available: %s\nThe " 'following read list string is suggested: "%s"\n' % (", ".join(lib_names), ",".join(lib_names))) for lib in lib_names: if lib not in arg_libs: self._write_err_msg_and_quit( 'The library "%s" is not present in your list of ' "libraries. 
Please add it.\n" % (lib)) def _write_err_msg_and_quit(self, msg): """Write error message and close the program gracefully.""" sys.stderr.write(msg) sys.exit(1) def viz_align(self): """Generate plots based on the read processing and mapping""" from reademptionlib.vizalign import AlignViz align_viz = AlignViz( self._paths.get_lib_names_single_end() if not self._args.paired_end else self._paths.get_lib_names_paired_end(), self._paths.read_processing_stats_path, self._paths.read_alignments_stats_path, ) align_viz.read_stat_files() align_viz.plot_input_read_length( self._paths.viz_align_input_read_length_plot_path) align_viz.plot_processed_read_length( self._paths.viz_align_processed_reads_length_plot_path) def viz_gene_quanti(self): """Generate plots based on the gene-wise read countings""" from reademptionlib.vizgenequanti import GeneQuantiViz gene_quanti_viz = GeneQuantiViz( self._paths.gene_wise_quanti_combined_path, self._paths.get_lib_names_single_end() if not self._args.paired_end else self._paths.get_lib_names_paired_end(), ) gene_quanti_viz.parse_input_table() gene_quanti_viz.plot_correlations( self._paths.viz_gene_quanti_scatter_plot_path) gene_quanti_viz.plot_annotation_class_quantification( self._paths.viz_gene_quanti_rna_classes_plot_path) def viz_deseq(self): """Generate plots based on the DESeq analysis""" from reademptionlib.vizdeseq import DESeqViz deseq_path_template = (self._paths.deseq_raw_folder + "/deseq_comp_%s_vs_%s.csv") deseq_viz = DESeqViz( self._paths.deseq_script_path, deseq_path_template, max_pvalue=self._args.max_pvalue, ) deseq_viz.create_scatter_plots(self._paths.viz_deseq_scatter_plot_path) deseq_viz.create_volcano_plots( self._paths.viz_deseq_volcano_plot_path, self._paths.viz_deseq_volcano_plot_adj_path, )
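# The Controller's _string_to_species_and_sequence_ids (above) expects the
# cross-align cleaning string to list organisms separated by ";", each followed
# by ":" and a comma-separated list of sequence IDs. A stand-alone sketch of
# that parsing, mirroring the logic above; the example organism and sequence
# names are illustrative only.
def parse_crossalign_cleaning_str(cleaning_str):
    """Parse 'org1:seq1,seq2;org2:seq3' into {org: [seq_ids]}."""
    species_and_seq_ids = {}
    for org_and_seq_ids_str in cleaning_str.split(";"):
        org, seq_ids_str = org_and_seq_ids_str.strip().split(":")
        seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")
                   if seq_id.strip()]
        species_and_seq_ids[org] = seq_ids
    return species_and_seq_ids

# Example: parse_crossalign_cleaning_str("host:chr1,chr2;pathogen:NC_1")
# returns {"host": ["chr1", "chr2"], "pathogen": ["NC_1"]}.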
class RunDeseq(object): def __init__(self, args): self._args = args self._paths = Paths(args) self._helpers = Helpers(args) def compare_with_deseq(self): """Manage the pairwise expression comparison with DESeq.""" self._helpers.test_folder_existance( self._paths.required_deseq_folders()) arg_libs = [ self._paths._clean_file_name(lib) for lib in self._args.libs.split(",") ] conditions = self._args.conditions.split(",") self._check_deseq_args(arg_libs, conditions) deseq_runner = DESeqRunner( arg_libs, conditions, self._paths.deseq_raw_folder, self._paths.deseq_extended_folder, self._paths.deseq_script_path, self._paths.deseq_pca_heatmap_path, self._paths.gene_wise_quanti_combined_path, self._paths.deseq_tmp_session_info_script, self._paths.deseq_session_info, self._args.cooks_cutoff_off) deseq_runner.create_deseq_script_file() deseq_runner.write_session_info_file() deseq_runner.run_deseq() deseq_runner.merge_counting_files_with_results() self._viz_deseq() deseq_runner.create_final_output_files() def _check_deseq_args(self, arg_libs, conditions): """Test if the given arguments are sufficient.""" if len(arg_libs) != len(conditions): self._helpers.write_err_msg_and_quit( "Error - The read library file list and condition list must " "have the same number of elements. You entered \n%s " "(= %s elements)\nand \n%s (= %s elements).\n" % (self._args.libs, len(arg_libs), self._args.conditions, len(conditions))) raw_stat_data_reader = RawStatDataReader() alignment_stats = [ raw_stat_data_reader.read(self._paths.read_alignments_stats_path) ] lib_names = list(alignment_stats[0].keys()) if len(lib_names) != len(arg_libs): self._helpers.write_err_msg_and_quit( "The number of read libraries is lower or higher than " "expected. The following read libs are available: %s\nThe " "following read list string is suggested: \"%s\"\n" % (", ".join(lib_names), ",".join(lib_names))) for lib in lib_names: if lib not in arg_libs: self._helpers.write_err_msg_and_quit( "The library \"%s\" is not present in your list of " "libraries. Please add it.\n" % (lib)) def _viz_deseq(self): """Generate plots based on the DESeq analysis.""" from reademptionlib.vizdeseq import DESeqViz # De-duplicate the conditions while keeping their input order; a plain # set() would randomize which condition ends up first. The plotting # below assumes exactly two distinct conditions. conditions = list(dict.fromkeys(self._args.conditions.split(','))) comparison_path_template_1 = "{}/deseq_comp_{}_vs_{}_with_annotation_and_countings.csv".format( self._paths.deseq_extended_folder, conditions[0], conditions[1]) comparison_1 = "{}_vs_{}".format(conditions[0], conditions[1]) deseq_viz = DESeqViz(comparison_path_template_1, self._paths.viz_deseq_base_folder, self._args.padj_cutoff, comparison_1, self._args.alpha, self._args.color_sig, self._args.color_non_sig, self._args.shape, self._args.glyph_size) deseq_viz.read_and_modificate_input() comparison_path_template_2 = "{}/deseq_comp_{}_vs_{}_with_annotation_and_countings.csv".format( self._paths.deseq_extended_folder, conditions[1], conditions[0]) comparison_2 = "{}_vs_{}".format(conditions[1], conditions[0]) deseq_viz = DESeqViz(comparison_path_template_2, self._paths.viz_deseq_base_folder, self._args.padj_cutoff, comparison_2, self._args.alpha, self._args.color_sig, self._args.color_non_sig, self._args.shape, self._args.glyph_size) deseq_viz.read_and_modificate_input()
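# Note on the condition handling above: dict.fromkeys() de-duplicates while
# preserving the order in which conditions first appear, so the
# "A_vs_B"/"B_vs_A" comparison file names stay reproducible between runs.
# Illustrative values:
conditions = list(dict.fromkeys("ctrl,ctrl,treated,treated".split(",")))
# conditions == ["ctrl", "treated"], always in input order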
def __init__(self, args): """Create an instance.""" self._args = args self._paths = Paths(args) self._helpers = Helpers(args)
class CalculateCoverage(object): def __init__(self, args): self._args = args self._paths = Paths(args) self._helpers = Helpers(args) def create_coverage_files(self): """Create coverage files based on the read alignments. The coverages are calculated per replicon and the results are written to the output file. This might be slower but if all coverages are detmined at once the data structure will become too large when working with large reference sequences. """ self._helpers.test_folder_existance( self._paths.required_coverage_folders()) raw_stat_data_reader = RawStatDataReader() alignment_stats = [ raw_stat_data_reader.read(self._paths.read_alignments_stats_path) ] lib_names = list(alignment_stats[0].keys()) was_paired_end_alignment = self._helpers.was_paired_end_alignment( lib_names) if not was_paired_end_alignment: self._paths.set_read_files_dep_file_lists_single_end( self._paths.get_read_files(), lib_names) else: self._paths.set_read_files_dep_file_lists_paired_end( self._paths.get_read_files(), lib_names) # Get number of aligned or number of uniquely aligned reads if not self._args.normalize_by_uniquely: aligned_counting = "no_of_aligned_reads" else: aligned_counting = "no_of_uniquely_aligned_reads" read_files_aligned_read_freq = dict([ (read_file, round(attributes["stats_total"][aligned_counting])) for read_file, attributes in alignment_stats[0].items() ]) min_no_of_aligned_reads = float( min(read_files_aligned_read_freq.values())) # Run the generation of coverage in parallel jobs = [] with concurrent.futures.ProcessPoolExecutor( max_workers=self._args.processes) as executor: for lib_name, bam_path in zip( lib_names, self._paths.read_alignment_bam_paths): no_of_aligned_reads = float( read_files_aligned_read_freq[lib_name]) jobs.append( executor.submit(self._create_coverage_files_for_lib, lib_name, bam_path, no_of_aligned_reads, min_no_of_aligned_reads)) # Evaluate thread outcome self._helpers.check_job_completeness(jobs) def _create_coverage_files_for_lib(self, lib_name, bam_path, no_of_aligned_reads, min_no_of_aligned_reads): """Perform the coverage calculation for a given library.""" if not self._args.non_strand_specific: strands = ["forward", "reverse"] else: strands = ["forward_and_reverse"] if self._all_coverage_file_exist(lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): return read_count_splitting = True if self._args.skip_read_count_splitting: read_count_splitting = False coverage_calculator = CoverageCalculator( read_count_splitting=read_count_splitting, uniquely_aligned_only=self._args.unique_only, coverage_style=self._args.coverage_style, clip_length=self._args.clip_length, non_strand_specific=self._args.non_strand_specific) (coverage_writers_raw, coverage_writers_tnoar_min_norm, coverage_writers_tnoar_mil_norm) = self._wiggle_writers( lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads) for ref_seq, coverages in coverage_calculator.ref_seq_and_coverages( bam_path): for strand in strands: coverage_writers_raw[strand].write_replicons_coverages( ref_seq, coverages[strand]) coverage_writers_tnoar_min_norm[ strand].write_replicons_coverages( ref_seq, coverages[strand], factor=min_no_of_aligned_reads / no_of_aligned_reads) coverage_writers_tnoar_mil_norm[ strand].write_replicons_coverages(ref_seq, coverages[strand], factor=1000000 / no_of_aligned_reads) for strand in strands: coverage_writers_raw[strand].close_file() def _all_coverage_file_exist(self, lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): """Test the existance of all 
coverage files of a library.""" files = [] for strand in strands: files.append(self._paths.wiggle_file_raw_path(lib_name, strand)) files.append( self._paths.wiggle_file_tnoar_norm_min_path( lib_name, strand, multi=min_no_of_aligned_reads, div=no_of_aligned_reads)) files.append( self._paths.wiggle_file_tnoar_norm_mil_path( lib_name, strand, multi=1000000, div=no_of_aligned_reads)) if not any([ self._helpers.file_needs_to_be_created(file, quiet=True) for file in files ]): sys.stderr.write( "The files %s exist. Skipping their generation.\n" % ", ".join(files)) return True return False def _wiggle_writers(self, lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): """Create the wiggle writers the calculated coverages are written with.""" coverage_writers_raw = dict([ (strand, WiggleWriter( "%s_%s" % (lib_name, strand), open(self._paths.wiggle_file_raw_path(lib_name, strand), "w"))) for strand in strands ]) coverage_writers_tnoar_min_norm = dict([ (strand, WiggleWriter( "%s_%s" % (lib_name, strand), open( self._paths.wiggle_file_tnoar_norm_min_path( lib_name, strand, multi=min_no_of_aligned_reads, div=no_of_aligned_reads), "w"))) for strand in strands ]) coverage_writers_tnoar_mil_norm = dict([ (strand, WiggleWriter( "%s_%s" % (lib_name, strand), open( self._paths.wiggle_file_tnoar_norm_mil_path( lib_name, strand, multi=1000000, div=no_of_aligned_reads), "w"))) for strand in strands ]) return (coverage_writers_raw, coverage_writers_tnoar_min_norm, coverage_writers_tnoar_mil_norm)
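# The wiggle writers above emit three coverage flavors: raw, scaled to the
# smallest library ("tnoar_min_norm"), and scaled to reads per million
# ("tnoar_mil_norm"), where TNOAR is the total number of aligned reads.
# A sketch of the two scaling factors with illustrative read counts:
no_of_aligned_reads = 2000000.0     # reads aligned in the current library
min_no_of_aligned_reads = 500000.0  # smallest library in the project
factor_min = min_no_of_aligned_reads / no_of_aligned_reads  # 0.25
factor_mil = 1000000 / no_of_aligned_reads                  # 0.5
# Each raw coverage value is multiplied by the factor, making libraries of
# different sequencing depths comparable.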
class TestPaths(unittest.TestCase): test_folder = "/tmp/test" test_files = ["foo.fa", "bar.fa"] test_lib_names = ["foo", "bar"] def setUp(self): self.paths = Paths(base_path=self.test_folder) self.folder_names = [ self.paths.input_folder, self.paths.output_folder, self.paths.align_report_folder, self.paths.raw_stat_data_folder, self.paths.read_fasta_folder, self.paths.ref_seq_folder, self.paths.annotation_folder, self.paths.read_alignment_index_folder, self.paths.read_alignments_folder, self.paths.processed_reads_folder, self.paths.unaligned_reads_folder, self.paths.coverage_raw_folder, self.paths.coverage_tnoar_min_norm_folder, self.paths.coverage_tnoar_mil_norm_folder, self.paths.gene_quanti_base_folder, self.paths.gene_wise_quanti_combined_path ] self.static_files = [ self.paths.read_processing_stats_path, self.paths.read_alignments_stats_path, self.paths.read_file_stats, self.paths.read_alignment_stats_table_path, self.paths.ref_seq_file_stats, self.paths.index_path ] def tearDown(self): self._remove_folder_if_exists(self.test_folder) def test_set_folder_names(self): self.paths._set_folder_names() for folder_name in self.folder_names: assert (folder_name != '') self.assertEqual(self.folder_names.count(folder_name), 1) def test_set_folder_names_with_base_path(self): self.paths._set_folder_names() for folder_name in self.folder_names: assert (folder_name != '') self.assertEqual(self.folder_names.count(folder_name), 1) def test_set_files(self): self.paths._set_static_files() for file_name in self.static_files: assert (file_name != '') self.assertEqual(self.static_files.count(file_name), 1) def test_get_sorted_folder_content(self): self._create_folder_with_files(self.test_folder, self.test_lib_names) self.assertEqual( self.paths._get_sorted_folder_content(self.test_folder), sorted(self.test_lib_names)) self._remove_folder_if_exists(self.test_folder) def test_required_folders(self): self.assertEqual(len(self.paths.required_folders()), 25) def test_get_read_files(self): self.paths.read_fasta_folder = self.test_folder self._create_folder_with_files(self.test_folder, self.test_lib_names) self.assertEqual(self.paths.get_read_files(), sorted(self.test_lib_names)) self._remove_folder_if_exists(self.test_folder) def test_get_ref_seq_files(self): self.paths.ref_seq_folder = self.test_folder self._create_folder_with_files(self.test_folder, self.test_lib_names) self.assertEqual(self.paths.get_ref_seq_files(), sorted(self.test_lib_names)) self._remove_folder_if_exists(self.test_folder) def test_get_annotation_files(self): self.paths.annotation_folder = self.test_folder self._create_folder_with_files(self.test_folder, self.test_lib_names) self.assertEqual(self.paths.get_annotation_files(), sorted(self.test_lib_names)) self._remove_folder_if_exists(self.test_folder) def test_set_read_files_dep_file_lists(self): self.paths.set_read_files_dep_file_lists_single_end( self.test_files, self.test_lib_names) for path_list in [ self.paths.read_paths, self.paths.processed_read_paths, self.paths.primary_read_aligner_bam_paths, self.paths.unaligned_reads_paths ]: assert (isinstance(path_list, list)) def test_path_list_without_appendix(self): self.assertEqual( self.paths._path_list(self.test_folder, self.test_lib_names), ['/tmp/test/foo', '/tmp/test/bar']) def test_path_list_with_appendix(self): self.assertEqual( self.paths._path_list(self.test_folder, self.test_lib_names, appendix=".boing"), ['/tmp/test/foo.boing', '/tmp/test/bar.boing']) def test_set_ref_seq_paths(self): self.paths.set_ref_seq_paths(self.test_files) self.assertEqual(self.paths.ref_seq_paths, [ "%s/%s" % (self.paths.ref_seq_folder, file) for file in self.test_files ]) def _create_folder_with_files(self, folder, file_names): os.mkdir(folder) for file_name in file_names: open("%s/%s" % (folder, file_name), "w").close() def _remove_folder_if_exists(self, folder): if os.path.exists(folder): shutil.rmtree(folder)
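# The TestPaths suite above can be run with the standard unittest runner,
# e.g. "python -m unittest test_paths" (the module name is an assumption
# about the file layout). Adding the usual entry point also allows
# executing the file directly:
if __name__ == "__main__":
    unittest.main()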