Esempio n. 1
0
    def get_coverage_and_detection_dict(self, bin_id):
        """Populate coverage/detection state on ``self`` for the bin ``bin_id``.

        A ``summarizer.Bin`` is built from ``self.summary`` and ``bin_id`` and
        used to set:

        * ``self.coverage_values_per_nt`` -- the result of
          ``get_coverage_values_per_nucleotide`` for the bin's
          ``split_coverage_values_per_nt_dict`` and ``self.samples``;
        * ``self.total_length`` -- the bin's ``total_length``.

        Finally the bin's ``gene_coverages`` and ``gene_detection`` are handed
        to ``self.init_coverage_and_detection_dataframes``.

        Nothing is returned; all results are stored on the instance.
        """
        bin_summary = summarizer.Bin(self.summary, bin_id)

        per_nt_coverage_dict = bin_summary.split_coverage_values_per_nt_dict
        self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
            per_nt_coverage_dict,
            self.samples,
        )

        # total length as reported by the bin summary (described upstream as
        # the total length of all contigs)
        self.total_length = bin_summary.total_length

        gene_coverages = bin_summary.gene_coverages
        gene_detection = bin_summary.gene_detection
        self.init_coverage_and_detection_dataframes(gene_coverages, gene_detection)
Esempio n. 2
0
    def get_gene_presence_in_the_environment_dict(self):
        """Classify each gene of each internal genome as 'ECG', 'EAG' or 'NA'.

        For every profile database path in
        ``self.unique_profile_db_path_to_internal_genome_name`` a summary object
        is obtained, and for every internal genome associated with that path:

        1. The genome's per-metagenome detection values are read from
           ``summary.collection_profile``. If none of them exceeds
           ``self.min_detection``, every gene of the genome is labelled 'NA'.
        2. Otherwise the "mean_coverage" values of each gene are summed over
           all entries returned for that gene (samples, per the variable
           naming), and the median of those sums is taken across genes.
        3. A gene whose summed coverage is below
           ``median * self.fraction_of_median_coverage`` is labelled 'EAG';
           all remaining genes are labelled 'ECG'.

        Returns:
            dict: ``{genome_name: {gene_caller_id: 'ECG' | 'EAG' | 'NA'}}``,
            where ``genome_name`` is the ``'bin_id'`` entry of the genome's
            description.

        Raises:
            ConfigError: if ``self.fraction_of_median_coverage`` or
                ``self.min_detection`` is not a ``float``.
        """
        if not isinstance(self.fraction_of_median_coverage, float):
            raise ConfigError("Fraction of median coverage must be of type `float`.")

        if not isinstance(self.min_detection, float):
            raise ConfigError("Minimum detection must be of type `float`")

        self.run.info('Fraction of median coverage for core genes', self.fraction_of_median_coverage)
        # NOTE(review): 'at last one' looks like a typo for 'at least one'; the
        # label is left untouched in case it is used as a key downstream -- confirm.
        self.run.info('Min detection of a genome in at last one metagenome', self.min_detection)

        self.progress.new('Working on gene presence/absence')
        self.progress.update('...')

        gene_presence_in_the_environment_dict = {}
        for profile_db_path in self.unique_profile_db_path_to_internal_genome_name:
            self.progress.update('Collection info from profile db at %s ...' % (profile_db_path))
            summary = self.get_summary_object_for_profile_db(profile_db_path)

            for internal_genome_name in self.unique_profile_db_path_to_internal_genome_name[profile_db_path]:
                genome_description = self.descriptions.genomes[internal_genome_name]
                genome_name = genome_description['bin_id']

                self.progress.update('Working on genome %s in profile db %s ...' % (internal_genome_name, profile_db_path))

                # for each genome, first we will see whether it is detected in at least one metagenome.
                # `any` stops at the first metagenome that clears the threshold.
                detection_across_metagenomes = summary.collection_profile[genome_name]['detection']
                not_enough_detection = not any(
                    detection_across_metagenomes[m] > self.min_detection
                    for m in detection_across_metagenomes
                )

                gene_classes = gene_presence_in_the_environment_dict[genome_name] = {}
                split_names_of_interest = self.descriptions.get_split_names_of_interest_for_internal_genome(genome_description)

                genome_bin_summary = summarizer.Bin(summary, genome_name, split_names_of_interest)
                gene_coverages_across_samples = utils.get_values_of_gene_level_coverage_stats_as_dict(genome_bin_summary.gene_level_coverage_stats_dict, "mean_coverage")

                # at this point we have all the genes in the genome bin. what we need is to characterize their detection. first,
                # summarize the coverage of each gene in all samples:
                sum_gene_coverages_across_samples = {
                    gene_callers_id: sum(coverages.values())
                    for gene_callers_id, coverages in gene_coverages_across_samples.items()
                }

                # now we will identify the median coverage, and from it the cutoff below which a gene is
                # classified as 'EAG'. the cutoff is the same for every gene, so compute it once.
                median_coverage_across_samples = numpy.median(list(sum_gene_coverages_across_samples.values()))
                min_coverage_for_ecg = median_coverage_across_samples * self.fraction_of_median_coverage

                # now we will decide whether a gene found in this genome is also found in the environment, and store that
                # information into `gene_presence_in_the_environment_dict`, and move on to the next stage.
                for gene_caller_id, summed_coverage in sum_gene_coverages_across_samples.items():
                    if not_enough_detection:
                        _class = 'NA'
                    elif summed_coverage < min_coverage_for_ecg:
                        _class = 'EAG'
                    else:
                        _class = 'ECG'

                    gene_classes[gene_caller_id] = _class

        self.progress.end()

        return gene_presence_in_the_environment_dict
Esempio n. 3
0
 def get_coverage_and_detection_dict(self, bin_id):
     """Load gene coverage and detection tables for the bin ``bin_id``.

     A ``summarizer.Bin`` is built from ``self.summary`` and ``bin_id``. Its
     ``gene_coverages`` and ``gene_detection`` dicts are converted to float
     DataFrames with one row per outer dict key (``orient='index'``), and the
     columns listed in ``self.samples_to_exclude`` are dropped from both.

     Sets on ``self``:
         gene_coverages: coverage DataFrame after dropping excluded columns.
         Ng: number of rows in ``gene_coverages``.
         gene_detections: detection DataFrame after dropping excluded columns.
         samples: set of the remaining column labels of ``gene_coverages``.

     ``DataFrame.drop`` is called with its default error handling, so a label
     in ``self.samples_to_exclude`` that is not a column raises ``KeyError``.

     Nothing is returned.
     """
     _bin = summarizer.Bin(self.summary, bin_id)

     self.gene_coverages = pd.DataFrame.from_dict(_bin.gene_coverages,
                                                  orient='index',
                                                  dtype=float)
     self.gene_coverages.drop(self.samples_to_exclude, axis=1, inplace=True)
     self.Ng = len(self.gene_coverages.index)

     self.gene_detections = pd.DataFrame.from_dict(_bin.gene_detection,
                                                   orient='index',
                                                   dtype=float)
     self.gene_detections.drop(self.samples_to_exclude,
                               axis=1,
                               inplace=True)

     # the sample set is taken from the coverage table only
     self.samples = set(self.gene_coverages.columns.values)
Esempio n. 4
0
 def get_coverage_and_detection_dict(self, bin_id):
     """Store the gene coverage/detection dicts and sample set for ``bin_id``.

     A ``summarizer.Bin`` is built from ``self.summary`` and ``bin_id``; its
     ``gene_coverages`` and ``gene_detection`` are stored unchanged on
     ``self.gene_coverages`` and ``self.gene_detection``.

     ``self.samples`` is set to the keys of the first value in
     ``self.gene_coverages``. When ``self.gene_coverages`` is empty,
     ``self.samples`` becomes an empty set.

     Nothing is returned.
     """
     _bin = summarizer.Bin(self.summary, bin_id)
     self.gene_coverages = _bin.gene_coverages
     self.gene_detection = _bin.gene_detection

     # The `{}` default keeps an empty `gene_coverages` from raising a bare
     # StopIteration out of `next()`; the sample set is then simply empty.
     first_gene_coverages = next(iter(self.gene_coverages.values()), {})
     self.samples = set(first_gene_coverages.keys())