Example #1
0
 def _check_deseq_args(self, arg_libs, conditions):
     """Validate the DESeq library/condition arguments.

     Quits with an error message if the two lists differ in length,
     if the number of given libraries does not match the alignment
     statistics, or if a known library is missing from the given list.
     """
     if len(arg_libs) != len(conditions):
         self._write_err_msg_and_quit(
             "Error - The read library file list and condition list must "
             "have the same number of elements. You entered \n%s "
             "(= %s elements)\nand \n%s (= %s elements).\n"
             % (self._args.libs, len(arg_libs),
                self._args.conditions, len(conditions)))
     stats_reader = RawStatDataReader()
     stats = [stats_reader.read(self._paths.read_alignments_stats_path)]
     known_libs = list(stats[0].keys())
     if len(known_libs) != len(arg_libs):
         self._write_err_msg_and_quit(
             "The number of read libraries is lower or higher than "
             "expected. The following read libs are available: %s\nThe "
             'following read list string is suggested: "%s"\n'
             % (", ".join(known_libs), ",".join(known_libs)))
     for lib in known_libs:
         if lib in arg_libs:
             continue
         self._write_err_msg_and_quit(
             'The library "%s" is not present in your list of '
             "libraries. Please add it.\n" % (lib))
Example #2
0
    def create_coverage_files(self):
        """Create coverage files based on the read alignments.

        The coverages are calculated per replicon and the results are
        written to the output file. This might be slower but if all
        coverages are determined at once the data structure will become
        too large when working with large reference sequences.

        """
        self._test_folder_existance(self._paths.required_coverage_folders())
        raw_stat_data_reader = RawStatDataReader()
        alignment_stats = [
            raw_stat_data_reader.read(self._paths.read_alignments_stats_path)
        ]
        lib_names = list(alignment_stats[0].keys())
        was_paired_end_alignment = self._was_paired_end_alignment(lib_names)
        # The per-library file lists differ between single-end and
        # paired-end alignments.
        if not was_paired_end_alignment:
            self._paths.set_read_files_dep_file_lists_single_end(
                self._paths.get_read_files(), lib_names)
        else:
            self._paths.set_read_files_dep_file_lists_paired_end(
                self._paths.get_read_files(), lib_names)
        # Get number of aligned or number of uniquely aligned reads
        if not self._args.normalize_by_uniquely:
            aligned_counting = "no_of_aligned_reads"
        else:
            aligned_counting = "no_of_uniquely_aligned_reads"
        # Dict comprehension instead of dict([...]) — same result without
        # building an intermediate list of pairs (ruff C404).
        read_files_aligned_read_freq = {
            read_file: round(attributes["stats_total"][aligned_counting])
            for read_file, attributes in alignment_stats[0].items()
        }
        min_no_of_aligned_reads = float(
            min(read_files_aligned_read_freq.values()))
        # Run the generation of coverage in parallel
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, bam_path in zip(
                    lib_names, self._paths.read_alignment_bam_paths):
                no_of_aligned_reads = float(
                    read_files_aligned_read_freq[lib_name])
                jobs.append(
                    executor.submit(
                        self._create_coverage_files_for_lib,
                        lib_name,
                        bam_path,
                        no_of_aligned_reads,
                        min_no_of_aligned_reads,
                    ))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)
Example #3
0
    def create_coverage_files(self):
        """Create coverage files based on the read alignments.

        The coverages are calculated per replicon and the results are
        written to the output file. This might be slower but if all
        coverages are determined at once the data structure will become
        too large when working with large reference sequences.

        """
        self._test_folder_existance(self._paths.required_coverage_folders())
        raw_stat_data_reader = RawStatDataReader()
        alignment_stats = [raw_stat_data_reader.read(
            self._paths.read_alignments_stats_path)]
        lib_names = list(alignment_stats[0].keys())
        was_paired_end_alignment = self._was_paired_end_alignment(lib_names)
        # The per-library file lists differ between single-end and
        # paired-end alignments.
        if not was_paired_end_alignment:
            self._paths.set_read_files_dep_file_lists_single_end(
                self._paths.get_read_files(), lib_names)
        else:
            self._paths.set_read_files_dep_file_lists_paired_end(
                self._paths.get_read_files(), lib_names)
        # Get number of aligned or number of uniquely aligned reads
        if not self._args.normalize_by_uniquely:
            aligned_counting = "no_of_aligned_reads"
        else:
            aligned_counting = "no_of_uniquely_aligned_reads"
        # Dict comprehension instead of dict([...]) — same result without
        # building an intermediate list of pairs (ruff C404).
        read_files_aligned_read_freq = {
            read_file: round(attributes["stats_total"][aligned_counting])
            for read_file, attributes in alignment_stats[0].items()}
        min_no_of_aligned_reads = float(min(
            read_files_aligned_read_freq.values()))
        # Run the generation of coverage in parallel
        jobs = []
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self._args.processes) as executor:
            for lib_name, bam_path in zip(
                    lib_names, self._paths.read_alignment_bam_paths):
                no_of_aligned_reads = float(
                    read_files_aligned_read_freq[lib_name])
                jobs.append(executor.submit(
                    self._create_coverage_files_for_lib,
                    lib_name, bam_path, no_of_aligned_reads,
                    min_no_of_aligned_reads))
        # Evaluate thread outcome
        self._check_job_completeness(jobs)
Example #4
0
 def quantify_gene_wise(self):
     """Manage the counting of aligned reads per gene."""
     self._test_folder_existance(self._paths.required_gene_quanti_folders())
     # Both normalizations are enabled unless explicitly switched off.
     norm_by_alignment_freq = not self._args.no_count_split_by_alignment_no
     norm_by_overlap_freq = not self._args.no_count_splitting_by_gene_no
     stats_reader = RawStatDataReader()
     alignment_stats = [
         stats_reader.read(self._paths.read_alignments_stats_path)]
     lib_names = sorted(alignment_stats[0].keys())
     annotation_files = self._paths.get_annotation_files()
     self._paths.set_annotation_paths(annotation_files)
     if self._was_paired_end_alignment(lib_names):
         self._paths.set_read_files_dep_file_lists_paired_end(
             self._paths.get_read_files(), lib_names)
     else:
         self._paths.set_read_files_dep_file_lists_single_end(
             self._paths.get_read_files(), lib_names)
     jobs = []
     with concurrent.futures.ProcessPoolExecutor(
             max_workers=self._args.processes) as executor:
         for lib_name, read_alignment_path in zip(
                 lib_names, self._paths.read_alignment_bam_paths):
             jobs.append(executor.submit(
                 self._quantify_gene_wise, lib_name, read_alignment_path,
                 norm_by_alignment_freq, norm_by_overlap_freq,
                 annotation_files))
     # Wait for the workers and surface any raised exception.
     self._check_job_completeness(jobs)
     self._gene_quanti_create_overview(
         annotation_files, self._paths.annotation_paths, lib_names)
Example #5
0
 def _write_alignment_stat_table(self):
     """Manage the creation of the mapping statistic output table."""
     reader = RawStatDataReader()
     processing_stats = reader.read(self._paths.read_processing_stats_path)
     final_stats = reader.read(self._paths.read_alignments_stats_path)
     # Primary-aligner and realignment statistics exist only when the
     # realignment step was requested.
     primary_aligner_stats = None
     realignment_stats = None
     if self._args.realign:
         primary_aligner_stats = reader.read(
             self._paths.primary_read_aligner_stats_path)
         realignment_stats = reader.read(
             self._paths.read_realigner_stats_path)
     table = ReadAlignerStatsTable(
         processing_stats, final_stats, primary_aligner_stats,
         realignment_stats, self._lib_names,
         self._paths.read_alignment_stats_table_path,
         self._args.paired_end)
     table.write()
Example #6
0
 def _check_deseq_args(self, arg_libs, conditions):
     """Validate the DESeq library/condition arguments.

     Quits with an error message if the two lists differ in length,
     if the number of given libraries does not match the alignment
     statistics, or if a known library is missing from the given list.
     """
     if len(arg_libs) != len(conditions):
         self._write_err_msg_and_quit(
             "Error - The read library file list and condition list must "
             "have the same number of elements. You entered \n%s "
             "(= %s elements)\nand \n%s (= %s elements).\n"
             % (self._args.libs, len(arg_libs),
                self._args.conditions, len(conditions)))
     stats_reader = RawStatDataReader()
     stats = [stats_reader.read(self._paths.read_alignments_stats_path)]
     known_libs = list(stats[0].keys())
     if len(known_libs) != len(arg_libs):
         self._write_err_msg_and_quit(
             "The number of read libraries is lower or higher than "
             "expected. The following read libs are available: %s\nThe "
             "following read list string is suggested: \"%s\"\n"
             % (", ".join(known_libs), ",".join(known_libs)))
     for lib in known_libs:
         if lib in arg_libs:
             continue
         self._write_err_msg_and_quit(
             "The library \"%s\" is not present in your list of "
             "libraries. Please add it.\n" % (lib))
Example #7
0
 def quantify_gene_wise(self):
     """Manage the counting of aligned reads per gene."""
     self._test_folder_existance(
         self._paths.required_gene_quanti_folders())
     # Both normalizations are enabled unless explicitly switched off.
     norm_by_alignment_freq = not self._args.no_count_split_by_alignment_no
     norm_by_overlap_freq = not self._args.no_count_splitting_by_gene_no
     stats_reader = RawStatDataReader()
     alignment_stats = [stats_reader.read(
         self._paths.read_alignments_stats_path)]
     lib_names = sorted(alignment_stats[0].keys())
     annotation_files = self._paths.get_annotation_files()
     self._paths.set_annotation_paths(annotation_files)
     if self._was_paired_end_alignment(lib_names):
         self._paths.set_read_files_dep_file_lists_paired_end(
             self._paths.get_read_files(), lib_names)
     else:
         self._paths.set_read_files_dep_file_lists_single_end(
             self._paths.get_read_files(), lib_names)
     jobs = []
     with concurrent.futures.ProcessPoolExecutor(
             max_workers=self._args.processes) as executor:
         for lib_name, read_alignment_path in zip(
                 lib_names, self._paths.read_alignment_bam_paths):
             jobs.append(
                 executor.submit(
                     self._quantify_gene_wise,
                     lib_name,
                     read_alignment_path,
                     norm_by_alignment_freq,
                     norm_by_overlap_freq,
                     annotation_files,
                 ))
     # Wait for the workers and surface any raised exception.
     self._check_job_completeness(jobs)
     self._gene_quanti_create_overview(
         annotation_files, self._paths.annotation_paths, lib_names)
Example #8
0
 def _write_alignment_stat_table(self):
     """Manage the creation of the mapping statistic output table."""
     reader = RawStatDataReader()
     processing_stats = reader.read(self._paths.read_processing_stats_path)
     final_stats = reader.read(self._paths.read_alignments_stats_path)
     # Primary-aligner and realignment statistics exist only when the
     # realignment step was requested.
     primary_aligner_stats = None
     realignment_stats = None
     if self._args.realign:
         primary_aligner_stats = reader.read(
             self._paths.primary_read_aligner_stats_path)
         realignment_stats = reader.read(
             self._paths.read_realigner_stats_path)
     table = ReadAlignerStatsTable(
         processing_stats, final_stats, primary_aligner_stats,
         realignment_stats, self._lib_names,
         self._paths.read_alignment_stats_table_path,
         self._args.paired_end)
     table.write()