def _read_genome_location_file(self, file_path):
    """
    Read the file that maps genome ids to the file paths of genomes.

    Every row of the file is unpacked into two values, a genome id and the
    path of the genome file, so a row with any other number of columns
    raises a ValueError from the unpacking.

    @attention: Validation relies on 'assert' statements, which Python
        removes when run with optimisation (-O).

    @param file_path: Path of the table mapping genome ids to genome file paths
    @type file_path: str | unicode

    @return: Dictionary of genome id to file path
    @rtype: dict[str|unicode, str|unicode]

    @raise AssertionError: If 'file_path' or a listed genome file is rejected
        by self.validate_file, if a genome id or a path is empty, or if a
        genome id occurs more than once
    """
    self._logger.info('Reading genome location file')
    assert self.validate_file(file_path)

    dict_id_file_path = {}
    metadata_table = MetadataTable(
        logfile=self._logfile, verbose=self._verbose, separator=self._separator)
    iterator_distributions = metadata_table.parse_file(file_path, as_list=True)
    for genome_id, file_path_genome in iterator_distributions:
        assert genome_id != '', "Invalid genomid: '{}'".format(genome_id)
        # Report the offending path together with its genome id; the previous
        # messages printed the genome id where the path was announced.
        assert file_path_genome != '', "Invalid file path for genome '{}': '{}'".format(
            genome_id, file_path_genome)
        assert self.validate_file(file_path_genome), "Invalid file path for genome '{}': '{}'".format(
            genome_id, file_path_genome)

        # check uniqueness
        assert genome_id not in dict_id_file_path, "Genome '{}' not unique in the genome location file!".format(
            genome_id)
        dict_id_file_path[genome_id] = file_path_genome
    return dict_id_file_path
def write_taxonomic_profile_from_abundance_files(self, metadata_table, list_of_file_paths, directory_output, sample_id=""):
    """
    Write one taxonomic profile file per relative abundance file.

    The output file name is built from self._filename_taxonomic_profile,
    filling its 'sample_index' placeholder with the position of the
    abundance file in 'list_of_file_paths'.

    @param metadata_table: Contains metadata of all communities
    @type metadata_table: MetadataTable
    @param list_of_file_paths: List of abundance file paths
    @type list_of_file_paths: list[str | unicode]
    @param directory_output: Profiles are written in this directory
    @type directory_output: str | unicode
    @param sample_id: Identifier of a sample
    @type sample_id: str | unicode

    @return: Nothing
    @rtype: None
    """
    abundance_parser = MetadataTable(logfile=self._logfile, verbose=self._verbose)
    for sample_index, path_abundance in enumerate(list_of_file_paths):
        # Parsed without column names; the rows are handed on unchanged.
        abundance_rows = abundance_parser.parse_file(path_abundance, column_names=False)

        profile_name = self._filename_taxonomic_profile.format(sample_index=sample_index)
        profile_path = os.path.join(directory_output, profile_name)

        with open(profile_path, 'w') as profile_stream:
            self.write_taxonomic_profile(abundance_rows, profile_stream, metadata_table, sample_id)
def _read_distribution_file(self, file_path):
    """
    Read the abundance distribution of a sample and normalise it to sum up to 1.

    Every row of the file is unpacked into two values, a genome id and its
    abundance. An input without any rows yields an empty dictionary.

    @attention: Validation relies on 'assert' statements, which Python
        removes when run with optimisation (-O).

    @param file_path: Path of the table mapping genome ids to abundances
    @type file_path: str | unicode

    @return: Dictionary of genome id to relative abundance
    @rtype: dict[str|unicode, float]

    @raise AssertionError: If 'file_path' is rejected by self.validate_file,
        if a genome id or an abundance is empty, if an abundance is rejected
        by self.validate_number, or if a genome id occurs more than once
    @raise ValueError: If an abundance can not be converted to a float
    @raise ZeroDivisionError: If the abundances of the file sum up to zero
    """
    self._logger.info('Reading distribution file')
    assert self.validate_file(file_path)

    dict_id_abundance = {}
    metadata_table = MetadataTable(
        logfile=self._logfile, verbose=self._verbose, separator=self._separator)
    iterator_distributions = metadata_table.parse_file(file_path, as_list=True)
    abundance_sum = 0.
    for genome_id, abundance in iterator_distributions:
        assert genome_id != '', "Invalid genom id: '{}'".format(genome_id)
        # Name the genome and show the offending value; the previous messages
        # printed the genome id where the abundance was announced.
        assert abundance != '', "Invalid abundance for genome '{}': '{}'".format(genome_id, abundance)
        abundance = float(abundance)
        # NOTE(review): 'zero=True' presumably lets an abundance of 0 pass - confirm in validate_number.
        assert self.validate_number(abundance, zero=True), "Invalid abundance for genome '{}': '{}'".format(
            genome_id, abundance)
        assert genome_id not in dict_id_abundance, "Genome '{}' not unique in the distribution file!".format(genome_id)
        dict_id_abundance[genome_id] = abundance
        abundance_sum += abundance

    # Normalising divides by the sum. Fail with a message that names the file
    # instead of a bare "float division by zero"; the exception type is kept.
    if dict_id_abundance and abundance_sum == 0:
        raise ZeroDivisionError(
            "Abundances in '{}' sum up to zero, distribution can not be normalised".format(file_path))

    # normalise to 1
    return {
        genome_id: abundance / abundance_sum
        for genome_id, abundance in dict_id_abundance.items()}
def merge_communities(self, list_of_communities, list_of_comunity_distribution_file_paths, index_sample, file_path_output):
    """
    Combine distributions of communities and adjust them according to their ratio.

    A first pass over every distribution file sums the abundances of the
    requested sample per community and checks that genome ids are unique
    across all communities. From these totals a factor
    'ratio / (community total / sample total)' is derived per community.
    The distribution files are then parsed again and handed, together with
    the factors, to self._write_joined_community, which writes to
    'file_path_output'.

    @param list_of_communities: List of community inputs
    @type list_of_communities: list[Community]
    @param list_of_comunity_distribution_file_paths: List of distributions
    @type list_of_comunity_distribution_file_paths: list[str | unicode]
    @param index_sample: Index of sample
    @type index_sample: int | long
    @param file_path_output: Sample distribution file path
    @type file_path_output: str | unicode

    @return: Nothing
    @rtype: None

    @raise ValueError: If a genome id occurs more than once
    @raise ZeroDivisionError: If the sample or one of its communities has a
        total abundance of zero
    """
    assert isinstance(list_of_communities, list)
    for community in list_of_communities:
        assert isinstance(community, Community)

    # read communities and adapt to ratio
    list_of_community_total_abundance = [0] * len(list_of_communities)
    genomes = set()
    metadata_table_community = MetadataTable(logfile=self._logfile, verbose=self._verbose)

    for index_community, file_path in enumerate(list_of_comunity_distribution_file_paths):
        community_distribution = metadata_table_community.parse_file(file_path, column_names=False)
        try:
            for row in community_distribution:
                genome_id = row[0]
                if genome_id in genomes:
                    raise ValueError("Genome id '{}' not unique".format(genome_id))
                genomes.add(genome_id)
                # Column 0 holds the genome id, so sample i is read from column i + 1.
                abundance = row[index_sample + 1]
                list_of_community_total_abundance[index_community] += float(abundance)
        finally:
            # Close the parser even when a row is rejected, so it is not left
            # open on the error path.
            community_distribution.close()

    sample_total_abundance = sum(list_of_community_total_abundance)

    list_of_community_factor = [0.0] * len(list_of_communities)
    for index_community, file_path in enumerate(list_of_comunity_distribution_file_paths):
        ratio = float(list_of_communities[index_community].ratio)
        community_total_abundance = float(list_of_community_total_abundance[index_community])
        # Both divisions below fail on zero. Raise the same exception type as
        # before, but say which sample or community caused it.
        if sample_total_abundance == 0:
            raise ZeroDivisionError(
                "Total abundance of sample {} is zero".format(index_sample))
        current_proportion_in_sample = community_total_abundance / float(sample_total_abundance)
        if current_proportion_in_sample == 0:
            raise ZeroDivisionError(
                "Community '{}' has no abundance in sample {}".format(file_path, index_sample))
        list_of_community_factor[index_community] = ratio / current_proportion_in_sample

    # join communities
    # NOTE(review): these parsers are not closed here; presumably
    # _write_joined_community consumes or closes them - confirm.
    communities = [
        metadata_table_community.parse_file(file_path, column_names=False)
        for file_path in list_of_comunity_distribution_file_paths]

    with open(file_path_output, 'w') as stream_output:
        self._write_joined_community(communities, list_of_community_factor, index_sample, stream_output)