Python GoldStandardFileFormat Exemples

Langage de programmation: Python

Espace de noms/Paquet : scripts.GoldStandardFileFormat.goldstandardfileformat

Exemples sur hotexamples.com : 2

Python GoldStandardFileFormat - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de scripts.GoldStandardFileFormat.goldstandardfileformat.GoldStandardFileFormat extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher / Masquer

GoldStandardFileFormat(2)

get_dict_genome_id_to_tax_id(1)

get_dict_sequence_name_to_positions(1)

get_dict_sequence_to_genome_id(1)

gs_contig_mapping(1)

gs_read_mapping(1)

Méthodes fréquemment utilisées

GoldStandardFileFormat (2)

get_dict_genome_id_to_tax_id (1)

get_dict_sequence_name_to_positions (1)

get_dict_sequence_to_genome_id (1)

gs_contig_mapping (1)

gs_read_mapping (1)

Exemple #1

0

Afficher le fichier

def _anonymize_data(self, list_of_output_gsa, file_path_output_gsa_pooled):
    """
    Anonymize reads and assemblies.

    For every sample the reads are anonymized and a read gold standard
    mapping is written. If self._phase_gsa is set, each per-sample assembly
    is anonymized and a contig mapping is written; if self._phase_pooled_gsa
    is set, the same is done for the pooled assembly using the read
    positions of all samples.

    With self._phase_compress set, nothing is moved to its destination here:
    the mapping is written to a temporary file and (source, destination + ".gz")
    tuples are appended to self._list_tuple_archive_files instead.
    Otherwise the mapping is written directly to its destination and the
    anonymized sequence file is moved there.

    @param list_of_output_gsa: List of file paths of assemblies
    @type list_of_output_gsa: list[str|unicode]
    @param file_path_output_gsa_pooled: file paths of assembly from all samples
    @type file_path_output_gsa_pooled: str | unicode

    @rtype: None
    """
    folder_handler = self._project_file_folder_handler
    archive_files = self._list_tuple_archive_files

    def mapping_target(file_path_mapping_out, tmp_prefix):
        # Where the mapping is written: its final location, or a temporary
        # file that is queued for archiving afterwards.
        if not self._phase_compress:
            return file_path_mapping_out
        # tempfile.mktemp() only returns a name and is open to races;
        # create the file atomically and hand back its path instead.
        with tempfile.NamedTemporaryFile(
                dir=folder_handler.get_tmp_wd(), prefix=tmp_prefix, delete=False) as handle:
            return handle.name

    def publish(file_path_sequences, file_path_sequences_out, file_path_mapping, file_path_mapping_out):
        # Queue both files for archiving, or move the sequence file into place.
        # Without compression the mapping was already written to its destination.
        if self._phase_compress:
            archive_files.append((file_path_sequences, file_path_sequences_out + ".gz"))
            archive_files.append((file_path_mapping, file_path_mapping_out + ".gz"))
        else:
            shutil.move(file_path_sequences, file_path_sequences_out)

    gs_mapping = GoldStandardFileFormat(
        column_name_gid=self._column_name_genome_id,
        column_name_ncbi=self._column_name_ncbi,
        separator=self._separator,
        logfile=self._logfile,
        verbose=self._verbose
        )

    file_path_metadata = folder_handler.get_genome_metadata_file_path()
    file_path_genome_locations = folder_handler.get_genome_location_file_path()
    directories_fastq_dir_in = [
        folder_handler.get_reads_dir(True, str(sample_index))
        for sample_index in range(self._number_of_samples)]
    paired_end = self._read_simulator_type in ("art", "wgsim")

    # --- reads, one mapping per sample ---
    for sample_index in range(self._number_of_samples):
        sample_id = str(sample_index)
        file_path_anonymous_reads_tmp, file_path_anonymous_mapping_tmp = self._anonymize_reads(
            directories_fastq_dir_in[sample_index],
            "S{}R".format(sample_index),
            paired_end)
        file_path_anonymous_reads_out = folder_handler.get_anonymous_reads_file_path(sample_id)
        file_path_anonymous_gs_mapping_out = folder_handler.get_anonymous_reads_map_file_path(sample_id)
        file_path_anonymous_gs_mapping = mapping_target(
            file_path_anonymous_gs_mapping_out, "anonymous_gs_mapping")

        with open(file_path_anonymous_gs_mapping, 'w') as stream_output:
            gs_mapping.gs_read_mapping(
                file_path_genome_locations,
                file_path_metadata,
                file_path_anonymous_mapping_tmp,
                stream_output
            )
        publish(
            file_path_anonymous_reads_tmp, file_path_anonymous_reads_out,
            file_path_anonymous_gs_mapping, file_path_anonymous_gs_mapping_out)

    if not self._phase_gsa and not self._phase_pooled_gsa:
        return

    samtools = SamtoolsWrapper(
        file_path_samtools=self._executable_samtools,
        max_processes=self._max_processors,
        tmp_dir=folder_handler.get_tmp_wd(),
        logfile=self._logfile,
        verbose=self._verbose,
        debug=self._debug
        )

    # --- per-sample gold standard assemblies ---
    if self._phase_gsa:
        for sample_index in range(self._number_of_samples):
            sample_id = str(sample_index)
            file_path_output_anonymous_gsa, file_path_anonymous_mapping_tmp = self._anonymize_gsa(
                list_of_output_gsa[sample_index],
                "S{}C".format(sample_index))
            file_path_output_anonymous_gsa_out = folder_handler.get_anonymous_gsa_file_path(sample_id)
            file_path_anonymous_gsa_mapping_out = folder_handler.get_anonymous_gsa_map_file_path(sample_id)
            file_path_anonymous_gsa_mapping = mapping_target(
                file_path_anonymous_gsa_mapping_out, "anonymous_gsa_mapping")

            # Only the read positions of this sample are relevant here.
            list_file_paths_read_positions = [
                samtools.read_start_positions_from_dir_of_bam(folder_handler.get_bam_dir(sample_id))
            ]

            with open(file_path_anonymous_gsa_mapping, 'w') as stream_output:
                gs_mapping.gs_contig_mapping(
                    file_path_genome_locations,
                    file_path_metadata,
                    file_path_anonymous_mapping_tmp,
                    list_file_paths_read_positions,
                    stream_output
                )
            publish(
                file_path_output_anonymous_gsa, file_path_output_anonymous_gsa_out,
                file_path_anonymous_gsa_mapping, file_path_anonymous_gsa_mapping_out)

    # --- pooled gold standard assembly ---
    if self._phase_pooled_gsa:
        file_path_output_anonymous, file_path_anonymous_mapping_tmp = self._anonymize_pooled_gsa(
            file_path_output_gsa_pooled, "PC")
        file_path_output_anonymous_out = folder_handler.get_anonymous_gsa_pooled_file_path()
        file_path_anonymous_gsa_mapping_out = folder_handler.get_anonymous_gsa_pooled_map_file_path()
        file_path_anonymous_gsa_mapping = mapping_target(
            file_path_anonymous_gsa_mapping_out, "anonymous_gsa_pooled_mapping")

        # The pooled assembly draws on the read positions of every sample.
        list_file_paths_read_positions = [
            samtools.read_start_positions_from_dir_of_bam(folder_handler.get_bam_dir(str(sample_index)))
            for sample_index in range(self._number_of_samples)
        ]

        with open(file_path_anonymous_gsa_mapping, 'w') as stream_output:
            gs_mapping.gs_contig_mapping(
                file_path_genome_locations,
                file_path_metadata,
                file_path_anonymous_mapping_tmp,
                list_file_paths_read_positions,
                stream_output
            )
        publish(
            file_path_output_anonymous, file_path_output_anonymous_out,
            file_path_anonymous_gsa_mapping, file_path_anonymous_gsa_mapping_out)

Exemple #2

0

Afficher le fichier

def _create_binning_gs(self, list_of_output_gsa):
    """
    Create binning gold standard without anonymization first

    For every sample two tab separated mapping files are written:
    a read mapping (read id, genome id, tax id, read id) built from the read
    names found in the sample's BAM directory, and a contig mapping
    (sequence id, genome id, tax id, length) built from the FASTA headers of
    the sample's gold standard assembly.

    Contig headers are expected to look like
    "<sequence_id>_from_<start>_to_<end>[_...]"; a header without "_from_"
    or "_to_" raises IndexError, an unknown sequence id raises KeyError.

    @param list_of_output_gsa: List of file paths of assemblies
    @type list_of_output_gsa: list[str|unicode]

    @rtype: None
    """
    folder_handler = self._project_file_folder_handler

    def mapping_target(file_path_mapping_out, tmp_prefix):
        # Where a mapping is written: its final location, or a temporary
        # file that is queued for archiving afterwards.
        if not self._phase_compress:
            return file_path_mapping_out
        # tempfile.mktemp() only returns a name and is open to races;
        # create the file atomically and hand back its path instead.
        with tempfile.NamedTemporaryFile(
                dir=folder_handler.get_tmp_wd(), prefix=tmp_prefix, delete=False) as handle:
            return handle.name

    gff = GoldStandardFileFormat(logfile=self._logfile, verbose=self._verbose)

    file_path_metadata = folder_handler.get_genome_metadata_file_path()
    file_path_genome_locations = folder_handler.get_genome_location_file_path()
    dict_sequence_to_genome_id = gff.get_dict_sequence_to_genome_id(file_path_genome_locations)
    dict_genome_id_to_tax_id = gff.get_dict_genome_id_to_tax_id(file_path_metadata)

    # All constructor arguments are loop invariant, so one wrapper serves every sample.
    samtools = SamtoolsWrapper(
        file_path_samtools=self._executable_samtools,
        max_processes=self._max_processors,
        tmp_dir=folder_handler.get_tmp_wd(),
        logfile=self._logfile,
        verbose=self._verbose,
        debug=self._debug
        )

    read_row_format = "{aid}\t{gid}\t{tid}\t{sid}\n"
    contig_row_format = "{name}\t{genome_id}\t{tax_id}\t{length}\n"

    for sample_index in range(self._number_of_samples):
        sample_id = str(sample_index)

        # --- read-based binning ---
        file_path_reads_map_out = folder_handler.get_anonymous_reads_map_file_path(sample_id)
        file_path_gs_mapping = mapping_target(file_path_reads_map_out, "gs_mapping")

        list_file_paths_read_positions = [
            samtools.read_start_positions_from_dir_of_bam(folder_handler.get_bam_dir(sample_id))
        ]
        dict_original_seq_pos = gff.get_dict_sequence_name_to_positions(list_file_paths_read_positions)

        with open(file_path_gs_mapping, 'w') as stream_output:
            stream_output.write('#' + read_row_format.format(
                aid="anonymous_read_id",
                gid="genome_id",
                tid="tax_id",
                sid="read_id"))
            for read in dict_original_seq_pos:
                read_name = read.strip()
                seq_id = read_name.split(' ')[0]
                # The text before the first '-' is used as the key into the
                # sequence -> genome id table.
                gen_id = read_name.split('-')[0]
                genome_id = dict_sequence_to_genome_id[gen_id]
                tax_id = dict_genome_id_to_tax_id[genome_id]
                # No anonymization here: the read id is used for both id columns.
                stream_output.write(read_row_format.format(
                    aid=seq_id,
                    gid=genome_id,
                    tid=tax_id,
                    sid=seq_id,
                ))
        if self._phase_compress:
            self._list_tuple_archive_files.append(
                (file_path_gs_mapping, file_path_reads_map_out + ".gz"))

        # --- contig-based binning ---
        file_path_gsa_map_out = folder_handler.get_anonymous_gsa_map_file_path(sample_id)
        file_path_gsa_mapping = mapping_target(file_path_gsa_map_out, "anonymous_gsa_mapping")
        file_path_output_anonymous_gsa_out = folder_handler.get_anonymous_gsa_file_path(sample_id)

        with open(list_of_output_gsa[sample_index], 'r') as gs, \
                open(file_path_gsa_mapping, 'w') as stream_output:
            stream_output.write("@@SEQUENCEID\tBINID\tTAXID\t_LENGTH\n")
            for line in gs:
                # Only FASTA header lines carry the information needed.
                if not line.startswith(">"):
                    continue
                seq_id = line[1:].strip()
                seq_info = seq_id.rsplit("_from_", 1)
                sequence_id = seq_info[0]
                pos_start = int(seq_info[1].split("_", 1)[0])
                pos_end = int(seq_info[1].split("_to_", 1)[1].split("_", 1)[0])
                genome_id = dict_sequence_to_genome_id[sequence_id]
                tax_id = dict_genome_id_to_tax_id[genome_id]
                stream_output.write(contig_row_format.format(
                    name=seq_id,
                    genome_id=genome_id,
                    tax_id=tax_id,
                    # start and end are both inclusive
                    length=str(pos_end - pos_start + 1)
                ))

        if self._phase_compress:
            # NOTE(review): unlike the read mapping above, the destination has
            # no ".gz" suffix - confirm whether that is intended.
            self._list_tuple_archive_files.append(
                (file_path_gsa_mapping, file_path_gsa_map_out))
        else:
            # NOTE(review): this moves the contig *mapping* onto the path
            # returned by get_anonymous_gsa_file_path() - confirm this is the
            # intended destination; behavior kept as is.
            shutil.move(file_path_gsa_mapping, file_path_output_anonymous_gsa_out)