Exemple #1
0
    def _anonymize_data(self, list_of_output_gsa, file_path_output_gsa_pooled):
        """
        Anonymize reads and assemblies.

        For every sample the reads are anonymized (self._anonymize_reads) and a read
        gold standard mapping is written with GoldStandardFileFormat.gs_read_mapping.
        If self._phase_gsa is set, every per-sample assembly is anonymized
        (self._anonymize_gsa); if self._phase_pooled_gsa is set, the pooled assembly is
        anonymized (self._anonymize_pooled_gsa). Each assembly gets a contig gold
        standard mapping written with GoldStandardFileFormat.gs_contig_mapping.

        With self._phase_compress set, nothing is moved: anonymized files and their
        mappings are queued in self._list_tuple_archive_files as
        (source path, final path + ".gz") tuples. Otherwise mappings are written
        straight to their final path and the anonymized files are moved there.

        @param list_of_output_gsa: List of file paths of assemblies
        @type list_of_output_gsa: list[str|unicode]
        @param file_path_output_gsa_pooled: file paths of assembly from all samples
        @type file_path_output_gsa_pooled: str | unicode

        @rtype: None
        """
        folder_handler = self._project_file_folder_handler

        def mapping_write_path(prefix, file_path_final):
            # Where a mapping is written: a fresh temporary file when results get
            # archived later, otherwise directly the final location.
            if not self._phase_compress:
                return file_path_final
            # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
            # returns a name and is deprecated because of the resulting race.
            with tempfile.NamedTemporaryFile(
                    dir=folder_handler.get_tmp_wd(), prefix=prefix, delete=False) as tmp_handle:
                return tmp_handle.name

        def publish(file_path_data, file_path_data_out, file_path_mapping, file_path_mapping_out):
            # Queue data and mapping for compression, or move the data into place.
            # Without compression the mapping already sits at its final location.
            if self._phase_compress:
                self._list_tuple_archive_files.append((file_path_data, file_path_data_out + ".gz"))
                self._list_tuple_archive_files.append((file_path_mapping, file_path_mapping_out + ".gz"))
            else:
                shutil.move(file_path_data, file_path_data_out)

        gs_mapping = GoldStandardFileFormat(
            column_name_gid=self._column_name_genome_id,
            column_name_ncbi=self._column_name_ncbi,
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose
        )
        file_path_metadata = folder_handler.get_genome_metadata_file_path()
        file_path_genome_locations = folder_handler.get_genome_location_file_path()
        paired_end = self._read_simulator_type in ("art", "wgsim")

        # anonymize the reads of every sample
        for sample_index in range(self._number_of_samples):
            sample_id = str(sample_index)
            file_path_anonymous_reads_tmp, file_path_anonymous_mapping_tmp = self._anonymize_reads(
                folder_handler.get_reads_dir(True, sample_id),
                "S{}R".format(sample_index),
                paired_end)
            file_path_anonymous_reads_out = folder_handler.get_anonymous_reads_file_path(sample_id)
            file_path_anonymous_gs_mapping_out = folder_handler.get_anonymous_reads_map_file_path(sample_id)
            file_path_anonymous_gs_mapping = mapping_write_path(
                "anonymous_gs_mapping", file_path_anonymous_gs_mapping_out)
            with open(file_path_anonymous_gs_mapping, 'w') as stream_output:
                gs_mapping.gs_read_mapping(
                    file_path_genome_locations, file_path_metadata, file_path_anonymous_mapping_tmp, stream_output
                )
            publish(
                file_path_anonymous_reads_tmp, file_path_anonymous_reads_out,
                file_path_anonymous_gs_mapping, file_path_anonymous_gs_mapping_out)

        if not self._phase_gsa and not self._phase_pooled_gsa:
            return

        samtools = SamtoolsWrapper(
            file_path_samtools=self._executable_samtools,
            max_processes=self._max_processors,
            tmp_dir=folder_handler.get_tmp_wd(),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug
            )

        # anonymize the assembly of every sample
        if self._phase_gsa:
            for sample_index in range(self._number_of_samples):
                sample_id = str(sample_index)
                file_path_output_anonymous_gsa, file_path_anonymous_mapping_tmp = self._anonymize_gsa(
                    list_of_output_gsa[sample_index],
                    "S{}C".format(sample_index))
                file_path_output_anonymous_gsa_out = folder_handler.get_anonymous_gsa_file_path(sample_id)
                file_path_anonymous_gsa_mapping_out = folder_handler.get_anonymous_gsa_map_file_path(sample_id)
                file_path_anonymous_gsa_mapping = mapping_write_path(
                    "anonymous_gsa_mapping", file_path_anonymous_gsa_mapping_out)

                # a sample's contigs are mapped using the bam files of that sample only
                list_file_paths_read_positions = [
                    samtools.read_start_positions_from_dir_of_bam(folder_handler.get_bam_dir(sample_id))
                    ]
                with open(file_path_anonymous_gsa_mapping, 'w') as stream_output:
                    gs_mapping.gs_contig_mapping(
                        file_path_genome_locations, file_path_metadata, file_path_anonymous_mapping_tmp,
                        list_file_paths_read_positions, stream_output
                    )
                publish(
                    file_path_output_anonymous_gsa, file_path_output_anonymous_gsa_out,
                    file_path_anonymous_gsa_mapping, file_path_anonymous_gsa_mapping_out)

        # anonymize the assembly pooled from all samples
        if self._phase_pooled_gsa:
            file_path_output_anonymous, file_path_anonymous_mapping_tmp = self._anonymize_pooled_gsa(
                file_path_output_gsa_pooled,
                "PC")
            file_path_output_anonymous_out = folder_handler.get_anonymous_gsa_pooled_file_path()
            file_path_anonymous_gsa_mapping_out = folder_handler.get_anonymous_gsa_pooled_map_file_path()
            file_path_anonymous_gsa_mapping = mapping_write_path(
                "anonymous_gsa_pooled_mapping", file_path_anonymous_gsa_mapping_out)

            # the pooled contigs are mapped using the bam files of all samples
            list_file_paths_read_positions = [
                samtools.read_start_positions_from_dir_of_bam(folder_handler.get_bam_dir(str(sample_index)))
                for sample_index in range(self._number_of_samples)
                ]
            with open(file_path_anonymous_gsa_mapping, 'w') as stream_output:
                gs_mapping.gs_contig_mapping(
                    file_path_genome_locations, file_path_metadata, file_path_anonymous_mapping_tmp,
                    list_file_paths_read_positions, stream_output
                )
            publish(
                file_path_output_anonymous, file_path_output_anonymous_out,
                file_path_anonymous_gsa_mapping, file_path_anonymous_gsa_mapping_out)
Exemple #2
0
    def _create_binning_gs(self, list_of_output_gsa):
        """
        Create binning gold standard without anonymization first

        For every sample two tab separated mapping files are written:
         - a read mapping with one row per read name found in the read start
           positions extracted from the sample's bam directory
           (columns: read id, genome id, tax id, read id),
         - a contig mapping with one row per FASTA header of the sample's assembly
           (columns: header, genome id, tax id, length).
        Assembly headers are expected to look like
        "<sequence id>_from_<start>_to_<end>[_...]"; the length is end - start + 1.

        With self._phase_compress set, each mapping is written to a temporary file and
        queued in self._list_tuple_archive_files as (temporary path, final path + ".gz").
        Otherwise each mapping is written directly to its final path.

        @param list_of_output_gsa: List of file paths of assemblies
        @type list_of_output_gsa: list[str|unicode]

        @rtype: None
        """
        folder_handler = self._project_file_folder_handler

        def mapping_write_path(prefix, file_path_final):
            # Where a mapping is written: a fresh temporary file when results get
            # archived later, otherwise directly the final location.
            if not self._phase_compress:
                return file_path_final
            # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
            # returns a name and is deprecated because of the resulting race.
            with tempfile.NamedTemporaryFile(
                    dir=folder_handler.get_tmp_wd(), prefix=prefix, delete=False) as tmp_handle:
                return tmp_handle.name

        gff = GoldStandardFileFormat(logfile=self._logfile, verbose=self._verbose)
        file_path_metadata = folder_handler.get_genome_metadata_file_path()
        file_path_genome_locations = folder_handler.get_genome_location_file_path()
        dict_sequence_to_genome_id = gff.get_dict_sequence_to_genome_id(file_path_genome_locations)
        dict_genome_id_to_tax_id = gff.get_dict_genome_id_to_tax_id(file_path_metadata)

        # one wrapper serves all samples, its arguments do not depend on the sample
        samtools = SamtoolsWrapper(
            file_path_samtools=self._executable_samtools,
            max_processes=self._max_processors,
            tmp_dir=folder_handler.get_tmp_wd(),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug
            )

        for sample_index in range(self._number_of_samples):
            sample_id = str(sample_index)
            file_path_reads_map_out = folder_handler.get_anonymous_reads_map_file_path(sample_id)
            file_path_gsa_map_out = folder_handler.get_anonymous_gsa_map_file_path(sample_id)

            # read-based binning
            list_file_paths_read_positions = [
                samtools.read_start_positions_from_dir_of_bam(folder_handler.get_bam_dir(sample_id))
                ]
            dict_original_seq_pos = gff.get_dict_sequence_name_to_positions(list_file_paths_read_positions)
            file_path_gs_mapping = mapping_write_path("gs_mapping", file_path_reads_map_out)
            with open(file_path_gs_mapping, 'w') as stream_output:
                row_format = "{aid}\t{gid}\t{tid}\t{sid}\n"
                stream_output.write('#' + row_format.format(
                    aid="anonymous_read_id",
                    gid="genome_id",
                    tid="tax_id",
                    sid="read_id"))
                for read in dict_original_seq_pos:
                    read_name = read.strip()
                    # read id: everything up to the first blank
                    seq_id = read_name.split(' ')[0]
                    # id of the source sequence: everything up to the first '-'
                    gen_id = read_name.split('-')[0]
                    genome_id = dict_sequence_to_genome_id[gen_id]
                    tax_id = dict_genome_id_to_tax_id[genome_id]
                    # nothing is anonymized, so the read id fills both id columns
                    stream_output.write(row_format.format(
                        aid=seq_id,
                        gid=genome_id,
                        tid=tax_id,
                        sid=seq_id))
            if self._phase_compress:
                self._list_tuple_archive_files.append(
                    (file_path_gs_mapping, file_path_reads_map_out + ".gz"))

            # contig-based binning
            file_path_gsa_mapping = mapping_write_path("anonymous_gsa_mapping", file_path_gsa_map_out)
            with open(list_of_output_gsa[sample_index], 'r') as gs:
                with open(file_path_gsa_mapping, 'w') as stream_output:
                    row_format = "{name}\t{genome_id}\t{tax_id}\t{length}\n"
                    stream_output.write("@@SEQUENCEID\tBINID\tTAXID\t_LENGTH\n")
                    for line in gs:
                        # only FASTA header lines describe a contig
                        if not line.startswith(">"):
                            continue
                        seq_id = line[1:].strip()
                        # header layout: <sequence id>_from_<start>_to_<end>[_...]
                        sequence_id, position_info = seq_id.rsplit("_from_", 1)
                        pos_start = int(position_info.split("_", 1)[0])
                        pos_end = int(position_info.split("_to_", 1)[1].split("_", 1)[0])

                        genome_id = dict_sequence_to_genome_id[sequence_id]
                        tax_id = dict_genome_id_to_tax_id[genome_id]
                        stream_output.write(row_format.format(
                            name=seq_id,
                            genome_id=genome_id,
                            tax_id=tax_id,
                            length=str(pos_end - pos_start + 1)))
            # Without compression the mapping was written to its final location and
            # must stay there; moving it would leave the mapping path empty.
            if self._phase_compress:
                self._list_tuple_archive_files.append(
                    (file_path_gsa_mapping, file_path_gsa_map_out + ".gz"))