Esempio n. 1
0
    def __init__(self):
        """Read the parsed CLI arguments and derive the working directories.

        Sets the input/refdata file names and chunk id from the namespace
        returned by ``self._parse_args()``, then computes:

        * ``mapped_reads_directory`` - the directory holding the input file;
        * ``_output_directory`` - that directory with its last path
          component removed;
        * ``logs_directory`` / ``statistics_directory`` - ``Logs/`` and
          ``Statistics/`` below ``_output_directory``.

        Finally calls ``self._create_dirs()``.
        """
        args = self._parse_args()
        self.input_file_name = args.input
        self.refdata_file_name = args.refdata
        self.chunk_id = args.chunk

        # Absolute directory of the input file, reused for both paths below.
        input_dir = os.path.dirname(os.path.abspath(self.input_file_name))
        self.mapped_reads_directory = Utilities.ends_with_slash(input_dir)

        # Drop the last "/"-separated component to get the parent directory.
        parent_dir = "/".join(input_dir.split("/")[:-1])
        self._output_directory = Utilities.ends_with_slash(parent_dir)

        self.logs_directory = "{}Logs/".format(self._output_directory)
        self.statistics_directory = "{}Statistics/".format(
            self._output_directory)
        self._create_dirs()
Esempio n. 2
0
 def prepare_nfasta_for_indexing(input_file: str,
                                 output_dir: str,
                                 preserve_headers: bool = False,
                                 chop: bool = False,
                                 chunk_length: int = int(3.6 * 10**9)):
     """Write the FASTA file(s) and annotation table for a reference nFASTA.

     :param input_file: path of the nFASTA file to parse.
     :param output_dir: directory for the produced files; created if missing.
     :param preserve_headers: when false, ``_fix_headers()`` is applied to
         the parsed array before anything is written.
     :param chop: allow splitting the array with ``_chop_sequences`` when
         its total length reaches ``chunk_length``.
     :param chunk_length: threshold compared against
         ``get_total_length()`` and passed on to ``_chop_sequences``.
     :return: dict mapping ``"sequence_<n>"`` (n counted from 1) to a dict
         with the keys ``"reference_nfasta"`` (written FASTA path) and
         ``"annotation"`` (annotation table path, shared by all entries).
     """
     sequences = FASTAArray.parse(Utilities.load_string(input_file))
     if not preserve_headers:
         sequences._fix_headers()

     output_dir = Utilities.ends_with_slash(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     file_mask = output_dir + Utilities.filename_only(input_file)

     # The annotation is dumped once, before any splitting takes place.
     annotation_file = "{}_annotation.tsv".format(file_mask)
     sequences.dump_annotation(annotation_file)

     if chop and sequences.get_total_length() >= chunk_length:
         print("Too large reference nFASTA file: '{}'. Splitting sequences".
               format(input_file))
         chunks = sequences._chop_sequences(chunk_length)
         fasta_files = {
             "{a}_{i}.fasta".format(a=file_mask, i=key): chunks[key]
             for key in chunks
         }
     else:
         fasta_files = {"{}.fasta".format(file_mask): sequences}

     refdatas_dict = {}
     for number, (fasta_file, chunk) in enumerate(fasta_files.items(),
                                                  start=1):
         chunk.dump_fastas(fasta_file)
         refdatas_dict["sequence_{}".format(number)] = {
             "reference_nfasta": fasta_file,
             "annotation": annotation_file
         }
     print("FASTA files created: {}".format(len(fasta_files)))
     return refdatas_dict
Esempio n. 3
0
 def fill_dict(nfasta_file: str):
     """Return the index/annotation paths derived from an nFASTA file path.

     Every value is the file's resolved directory joined with its
     ``Utilities.filename_only`` name, followed by a fixed suffix.

     :param nfasta_file: path to the reference nFASTA file.
     :return: dict with the keys ``ebwt_mask``, ``bt2_mask``, ``fai``,
         ``genome`` and ``annotation``, in that order.
     """
     directory = os.path.dirname(os.path.realpath(nfasta_file))
     mask = (Utilities.ends_with_slash(directory)
             + Utilities.filename_only(nfasta_file))
     # (key, suffix) pairs; the order defines the key order of the result.
     suffixes = (
         ("ebwt_mask", "_colorspace"),
         ("bt2_mask", "_bowtie2"),
         ("fai", "_samtools.fai"),
         ("genome", "_samtools.genome"),
         ("annotation", "_annotation.tsv"),
     )
     return {key: "{}{}".format(mask, suffix) for key, suffix in suffixes}
 def __init__(self):
     """Store the parsed CLI options and create the output directory.

     The namespace returned by ``self.parse_args()`` is kept on
     ``self._namespace``; its ``size`` value is multiplied by 10**9 and
     truncated to an int to obtain ``chunk_length``.
     """
     args = self.parse_args()
     self._namespace = args
     self.input_nfasta = args.input
     self.preserve_headers_bool = args.preserve_headers
     self.not_large_index_bool = args.not_large_index
     # ``size`` is scaled by 10**9 before being stored.
     self.chunk_length = int(args.size * 10**9)
     self.output_dir = Utilities.ends_with_slash(args.output)
     os.makedirs(self.output_dir, exist_ok=True)
Esempio n. 5
0
 def __init__(self):
     """Store the parsed CLI options and choose the output file name.

     ``target_length`` is ``CoveragesVerifier.get_wc_l(genome) + 1``
     (presumably a line count of the genome file - confirm against
     ``get_wc_l``).

     When no output path is supplied (empty or ``None``), a default of
     ``<dirname(prefix)>/sampledata/<Utilities.get_time()>.sampledata``
     is used.
     """
     args = self._parse_args()
     self._namespace = args
     self.sampledata = args.input
     self.target_length = CoveragesVerifier.get_wc_l(args.genome) + 1
     self.prefix = args.prefix
     self.suffix = args.suffix
     self.debugging_bool = args.debug
     self.output = args.output
     # Truthiness check instead of len(): also covers ``None``, which
     # would make ``len()`` raise TypeError.
     if not self.output:
         prefix_dir = Utilities.ends_with_slash(
             os.path.dirname(self.prefix))
         self.output = "{}sampledata/{}.sampledata".format(
             prefix_dir, Utilities.get_time())
Esempio n. 6
0
 def __init__(self, parsed_dictionary: dict):
     """Populate the reference-data attributes from a parsed dictionary.

     :param parsed_dictionary: must contain the keys ``reference_nfasta``,
         ``ebwt_mask``, ``bt2_mask``, ``fai``, ``genome`` and
         ``annotation``; ``alias`` is optional.
     :raises KeyError: if any required key is missing.
     """
     self._nfasta = parsed_dictionary["reference_nfasta"]
     # A missing or empty alias falls back to the nFASTA file name.
     self.db_name = (parsed_dictionary.get("alias")
                     or Utilities.filename_only(self._nfasta))
     nfasta_dir = os.path.dirname(os.path.realpath(self._nfasta))
     self._reference_mask = (Utilities.ends_with_slash(nfasta_dir)
                             + self.db_name)
     # Index masks and auxiliary files are taken from the dictionary as is.
     self.bowtie_index_mask = parsed_dictionary["ebwt_mask"]
     self.bowtie2_index_mask = parsed_dictionary["bt2_mask"]
     self.samtools_index_file = parsed_dictionary["fai"]
     self.bedtools_genome_file = parsed_dictionary["genome"]
     self.annotation_file = parsed_dictionary["annotation"]
Esempio n. 7
0
    def compile(input_file: str,
                output_dir: str,
                preserve_headers: bool = False,
                chop: bool = False,
                chunk_length: int = int(3.6 * 10**9)):
        """Prepare, index and describe a reference nFASTA file.

        Delegates FASTA/annotation creation to
        ``FASTAArray.prepare_nfasta_for_indexing``, calls
        ``RefDataLine.index()`` for every produced FASTA file and writes
        the collected paths to ``<output_dir><input name>_refdata.json``.

        :param input_file: path of the reference nFASTA file.
        :param output_dir: directory for all outputs; created if missing.
        :param preserve_headers: forwarded to the preparation step.
        :param chop: forwarded to the preparation step.
        :param chunk_length: forwarded to the preparation step.
        :return: path of the written JSON file.
        """
        import json
        from modules.FASTAArray import FASTAArray
        from modules.RefDataLine import RefDataLine

        output_dir = Utilities.ends_with_slash(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        prepared = FASTAArray.prepare_nfasta_for_indexing(
            input_file=input_file,
            output_dir=output_dir,
            preserve_headers=preserve_headers,
            chop=chop,
            chunk_length=chunk_length)

        output_dict = {}
        for sequence_id, annotation_dict in prepared.items():
            nfasta_file = annotation_dict.get("reference_nfasta")
            if not nfasta_file:
                # Entries without a FASTA path are not indexed.
                continue
            # Later updates win: values from the preparation step override
            # the defaults computed by fill_dict.
            indexing_dict = dict(alias=Utilities.filename_only(nfasta_file))
            indexing_dict.update(RefDataLine.fill_dict(nfasta_file))
            indexing_dict.update(annotation_dict)
            print("Processing nFASTA: '{}'".format(nfasta_file))
            RefDataLine(indexing_dict).index()
            output_dict[sequence_id] = indexing_dict

        target_dir = Utilities.ends_with_slash(output_dir)
        base_name = Utilities.filename_only(input_file)
        output_file = "{a}{b}_refdata.json".format(a=target_dir, b=base_name)
        serialized = json.dumps(output_dict, sort_keys=False, indent=4) + "\n"
        Utilities.dump_string(string=serialized, file=output_file)
        print("Created reference data linker: '{}'".format(output_file))
        return output_file
Esempio n. 8
0
 def __init__(self):
     """Store the parsed CLI options and create the output directories.

     Reads the namespace returned by ``self._parse_args()``, converts the
     ``threads`` option with ``self._parse_threads_number`` and makes sure
     both the output directory and its ``Logs/`` subdirectory exist.
     """
     namespace = self._parse_args()
     self.sampledata_file_name = namespace.input
     self.refdata_file_name = namespace.refdata
     self.input_mask = namespace.mask
     # *_output_mask are attributes of RefDataLine class
     self.threads_number = self._parse_threads_number(namespace.threads)
     self.no_coverage_bool = namespace.no_coverage
     self.output_dir = Utilities.ends_with_slash(namespace.output)
     self.logs_directory = "{}Logs/".format(self.output_dir)
     # Plain loop: the calls are made for their side effect only, so no
     # throwaway list of ``None`` results is built.
     for directory in (self.output_dir, self.logs_directory):
         os.makedirs(directory, exist_ok=True)
Esempio n. 9
0
 def __init__(self, sampledata: SampleDataLine, refdata: RefDataLine,
              input_mask: str, output_dir: str):
     """Derive every file name used for one sample/reference pair.

     Creates the ``Unmapped_reads/``, ``Mapped_reads/``, ``Statistics/``
     and ``Logs/`` directories under ``output_dir``, copies the index and
     annotation paths from ``refdata`` and builds the output and log file
     names from the sample name, ``input_mask`` and ``refdata.db_name``.

     :param sampledata: provides ``name`` and ``raw_reads_files_list``.
     :param refdata: provides the index masks, auxiliary file paths and
         ``db_name``.
     :param input_mask: prefix joined with ``db_name`` to form the masks.
     :param output_dir: root directory for all produced files.
     """
     # Output directories
     output_dir = Utilities.ends_with_slash(output_dir)
     unmapped_dir, mapped_dir, statistics_dir, logs_dir = [
         "{}{}/".format(output_dir, name)
         for name in ("Unmapped_reads", "Mapped_reads", "Statistics", "Logs")
     ]
     for directory in (unmapped_dir, mapped_dir, statistics_dir, logs_dir):
         os.makedirs(directory, exist_ok=True)

     # Reference data
     self.bowtie_index_mask = refdata.bowtie_index_mask
     self.bowtie2_index_mask = refdata.bowtie2_index_mask
     self.samtools_index_file = refdata.samtools_index_file
     self.bedtools_genome_file = refdata.bedtools_genome_file
     self.annotation_file = refdata.annotation_file

     # Output masks; surrounding underscores are stripped so that an empty
     # component leaves no dangling separator at either end.
     mapped_mask = "{}_{}".format(input_mask, refdata.db_name).strip("_")
     unmapped_mask = "{}_no_{}".format(input_mask,
                                       refdata.db_name).strip("_")

     # Sample data
     sample_name = sampledata.name
     self.raw_reads_files_list = sampledata.raw_reads_files_list
     # The extension is whatever follows the last dot of the first file.
     extension = self.raw_reads_files_list[0].split(".")[-1]
     self.raw_reads_file_extension = extension

     def sample_mask(directory, tag):
         # "<directory><sample name>_<tag>", shared by all outputs below.
         return "{a}{b}_{c}".format(a=directory, b=sample_name, c=tag)

     # Unmapped reads: one combined file plus one file per mate (1 and 2).
     unmapped_file_mask = sample_mask(unmapped_dir, unmapped_mask)
     self.unmapped_reads_file_name = "{a}.{b}".format(a=unmapped_file_mask,
                                                      b=extension)
     self.pairwise_unmapped_reads_files_list = [
         "{a}.{i}.{b}".format(a=unmapped_file_mask, i=mate, b=extension)
         for mate in [1, 2]
     ]

     # Mapped reads
     mapped_file_mask = sample_mask(mapped_dir, mapped_mask)
     self.mapped_reads_file_name = "{}.sam".format(mapped_file_mask)
     self.samtools_converted_file_name = "{}.bam".format(mapped_file_mask)
     self.samtools_sorted_file_name = "{}_sorted.bam".format(
         mapped_file_mask)
     self.samtools_index_file_name = "{}.bai".format(
         self.samtools_sorted_file_name)

     # Statistics
     statistics_file_mask = sample_mask(statistics_dir, mapped_mask)
     self.samtools_idxstats_file_name = "{}_idxstats.txt".format(
         statistics_file_mask)
     self.samtools_stats_file_name = "{}_sam_stats.txt".format(
         statistics_file_mask)
     self.bedtools_histogram_file_name = "{}_genomeCoverageBed.txt".format(
         statistics_file_mask)
     self.stacked_coverage_file_name = "{}_pos_bp.txt".format(
         statistics_file_mask)
     self.final_coverage_file_name = "{}_coverage.tsv".format(
         statistics_file_mask)

     # Logs
     logs_file_mask = sample_mask(logs_dir, mapped_mask)
     self.aligner_log_file_name = "{}_aligner.log".format(logs_file_mask)
     self.samtools_converted_log_file_name = "{}_samtools_sort.log".format(
         logs_file_mask)
     self.samtools_index_log_file_name = "{}_samtools_index.log".format(
         logs_file_mask)
     self.samtools_idxstats_log_file_name = "{}_samtools_idxstats.log".format(
         logs_file_mask)
     self.samtools_stats_log_file_name = "{}_samtools_stats.log".format(
         logs_file_mask)
     self.genomeCoverageBed_log_file_name = "{}_genomeCoverageBed.log".format(
         logs_file_mask)
Esempio n. 10
0
    def __init__(self, refdata: RefDataLine):
        """Build output paths rooted at the directory of the input file.

        The root is the resolved directory of
        ``mainInitializer.input_file_name``; ``Mapped_reads/``,
        ``Statistics/`` and ``Logs/`` are created below it. Reference file
        paths and ``db_name`` are taken from ``refdata``.

        NOTE(review): ``sampledata``, ``unmapped_reads_directory`` and
        ``mapped_output_mask`` are used below but never bound inside this
        method; unless they exist at module scope these lines raise
        ``NameError``. This looks like code copied from a sibling
        initializer - confirm against the full file.
        """
        # Output directories
        output_dir = Utilities.ends_with_slash(
            os.path.dirname(os.path.realpath(mainInitializer.input_file_name)))
        mapped_reads_directory = "{}Mapped_reads/".format(output_dir)
        statistics_directory = "{}Statistics/".format(output_dir)
        logs_directory = "{}Logs/".format(output_dir)
        for path in [
                mapped_reads_directory, statistics_directory, logs_directory
        ]:
            os.makedirs(path, exist_ok=True)
        # Reference data
        self.samtools_index_file = refdata.samtools_index_file
        self.bedtools_genome_file = refdata.bedtools_genome_file
        self.annotation_file = refdata.annotation_file
        # Output files
        # The sample name is the input file name minus "_sorted.bam", or
        # minus its last extension otherwise. Any directory part of
        # input_file_name is kept.
        # NOTE(review): the "." in the pattern is an unescaped regex
        # wildcard; harmless here because of the endswith() guard.
        if mainInitializer.input_file_name.endswith("_sorted.bam"):
            sample_name = re.sub("_sorted.bam$", "",
                                 mainInitializer.input_file_name)
        else:
            sample_name = ".".join(
                mainInitializer.input_file_name.split(".")[:-1])
        mapped_reads_file_mask = "{a}{b}".format(a=mapped_reads_directory,
                                                 b=sample_name)
        # NOTE(review): the three attributes below are assigned again
        # further down from a different mask, so these values do not
        # survive if the method runs to completion.
        self.mapped_reads_file_name = "{}.sam".format(mapped_reads_file_mask)
        self.samtools_converted_file_name = "{}.bam".format(
            mapped_reads_file_mask)
        self.samtools_sorted_file_name = "{}_sorted.bam".format(
            mapped_reads_file_mask)

        self._mapped_output_mask = refdata.db_name
        unmapped_output_mask = "_".join(["no", self._mapped_output_mask])
        # Sample data
        # NOTE(review): `sampledata` is not a parameter or local of this
        # method; this also replaces the sample_name computed above.
        sample_name = sampledata.name
        self.raw_reads_files_list = sampledata.raw_reads_files_list
        # Extension is the text after the last dot of the first reads file.
        self.raw_reads_file_extension = self.raw_reads_files_list[0].split(
            ".")[-1]
        # NOTE(review): `unmapped_reads_directory` is not defined in this
        # method.
        unmapped_reads_file_mask = "{a}{b}_{c}".format(
            a=unmapped_reads_directory, b=sample_name, c=unmapped_output_mask)
        self.unmapped_reads_file_name = "{a}.{b}".format(
            a=unmapped_reads_file_mask, b=self.raw_reads_file_extension)
        # One unmapped-reads file name per index 1 and 2.
        self.pairwise_unmapped_reads_files_list = [
            "{a}.{i}.{b}".format(a=unmapped_reads_file_mask,
                                 i=i,
                                 b=self.raw_reads_file_extension)
            for i in [1, 2]
        ]
        # NOTE(review): `mapped_output_mask` is not defined in this method;
        # the sibling masks below use a directory variable in this position
        # - possibly `mapped_reads_directory` was intended; confirm.
        mapped_reads_file_mask = "{a}{b}_{c}".format(
            a=mapped_output_mask, b=sample_name, c=self._mapped_output_mask)
        self.mapped_reads_file_name = "{}.sam".format(mapped_reads_file_mask)
        self.samtools_converted_file_name = "{}.bam".format(
            mapped_reads_file_mask)
        self.samtools_sorted_file_name = "{}_sorted.bam".format(
            mapped_reads_file_mask)
        self.samtools_index_file_name = "{}.bai".format(
            self.samtools_sorted_file_name)
        # Statistics outputs: "<Statistics dir><sample name>_<db_name>" plus
        # a per-file suffix.
        statistics_file_mask = "{a}{b}_{c}".format(a=statistics_directory,
                                                   b=sample_name,
                                                   c=self._mapped_output_mask)
        self.samtools_idxstats_file_name = "{}_idxstats.txt".format(
            statistics_file_mask)
        self.samtools_stats_file_name = "{}_sam_stats.txt".format(
            statistics_file_mask)
        self.bedtools_histogram_file_name = "{}_genomeCoverageBed.txt".format(
            statistics_file_mask)
        self.stacked_coverage_file_name = "{}_pos_bp.txt".format(
            statistics_file_mask)
        self.final_coverage_file_name = "{}_coverage.tsv".format(
            statistics_file_mask)
        # Log outputs: "<Logs dir><sample name>_<db_name>" plus a per-file
        # suffix.
        logs_file_mask = "{a}{b}_{c}".format(a=logs_directory,
                                             b=sample_name,
                                             c=self._mapped_output_mask)
        self.aligner_log_file_name = "{}_aligner.log".format(logs_file_mask)
        self.samtools_converted_log_file_name = "{}_sam2bam.log".format(
            logs_file_mask)
        self.samtools_index_log_file_name = "{}_index_bam.log".format(
            logs_file_mask)