Ejemplo n.º 1
0
    def __init__(self, args, run=run, progress=progress):
        """Store the user arguments, validate them and load the gene data.

        `args` is any object whose `__dict__` holds the program arguments;
        arguments that are absent resolve to None. After `sanity_check`, the
        gene coverage and detection data are read either from text files (no
        profile database given) or from the profile database.
        """
        self.run = run
        self.progress = progress

        # absent arguments resolve to None rather than raising
        arg = vars(args).get

        # where the data comes from and where the results go
        self.gene_coverages_data_file_path = arg('data_file')
        self.gene_detection_data_file_path = arg('gene_detection_data_file')
        self.profile_db_path = arg('profile_db')
        self.output_file_prefix = arg('output_file_prefix')

        # numeric parameters supplied by the user
        self.alpha = arg('alpha')
        self.beta = arg('beta')
        self.gamma = arg('gamma')
        self.eta = arg('eta')
        self.zeta = arg('zeta')

        # optional extras and collection / bin selection
        self.additional_layers_to_append = arg('additional_layers_to_append')
        self.samples_information_to_append = arg('samples_information_to_append')
        self.collection_name = arg('collection_name')
        self.bin_id = arg('bin_id')
        self.bin_ids_file_path = arg('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = arg(
            'store_gene_detections_and_gene_coverages_tables')

        # containers that are filled in later
        self.number_of_positive_samples = None
        self.gene_coverages = {}
        self.gene_detection = {}
        self.samples = {}
        self.positive_samples = {}
        self.negative_samples = {}
        self.gene_class_information = {}
        self.samples_information = {}
        self.profile_db = {}

        self.sanity_check()

        if self.profile_db_path is not None:
            # the profile database is the data source: ask for the gene
            # coverages to be initialized along with it
            args.init_gene_coverages = True
            if not self.collection_name:
                self.profile_db = ProfileSuperclass(args)
                self.profile_db.init_gene_coverages_and_detection_dicts()
                self.gene_coverages = self.profile_db.gene_coverages_dict
                self.gene_detection = self.profile_db.gene_detection_dict
                # the sample names are the keys of any per-gene entry; the
                # first one is used
                first_gene_entry = next(iter(self.gene_coverages.values()))
                self.samples = set(first_gene_entry.keys())
            else:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
        else:
            self.get_data_from_txt_file()
Ejemplo n.º 2
0
    def load_from_anvio_files(self, args):
        """Prepare this instance from a contigs database and a profile database.

        NOTE(review): this method uses Python 2 syntax (`raise X, "msg"` and the
        bare `print` statement).

        The steps are, in order: initialize `ProfileSuperclass` with `args`, load
        split sequences, populate the collection sources, fill a few `p_meta`
        entries, create the states table, load the views, and then apply the
        user-requested options (additional view file, view, tree, title, state).

        If `self.show_views` or `self.show_states` is set, the available items are
        reported and the process exits via `sys.exit()`.

        Raises ConfigError when no contigs database path is set, when the last
        column of the additional view file is not '__parent__', or when the
        requested view or state is not available.
        """
        if not self.contigs_db_path:
            raise ConfigError, "Anvi'o needs the contigs database to make sense of this run."

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
        # all the split sequences since only now we know the min_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

        # the output directory is the directory of the profile database, anchored
        # at the current working directory
        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        self.load_views()
        self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
            for view in self.views:
                # the default view gets a different `mc` value than the others
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            print
            sys.exit()

        # same for the available states: show them and exit.
        if self.show_states:
            run.warning('', header = 'Available states (%d)' % len(self.states_table.states), lc = 'green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' % self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split."

            # one str converter, then floats, then a final str converter.
            # NOTE(review): this list is one element longer than
            # `additional_view_columns`; presumably the helper above omits the
            # first column of the file -- confirm against `utils`.
            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if not self.view in self.views:
                raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings 

        # a user-provided tree is registered under the file's base name up to
        # its first dot. If there are no clusterings yet it also becomes the
        # default one, otherwise it is added to the existing ones.
        # NOTE(review): the file objects opened below are never explicitly closed.
        if self.tree:
            entry_id = os.path.basename(self.tree).split('.')[0]
            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = entry_id
                self.p_meta['available_clusterings'] = [entry_id]
                self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
            else:
                self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()}
                run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

        # set title
        # (the first branch is a no-op self-assignment; without a user title,
        # the title is the sample id with dashes and underscores turned into spaces)
        if self.title:
            self.title = self.title
        else:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_data_available:
            # a SUMMARY.cp file next to the profile database adds an extra hint
            # to the warning below
            summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
            self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

        # a requested state must be one of the states stored in the states table
        if self.state:
            if not self.state in self.states_table.states:
                raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                          available states by running this program with --show-states flag." % self.state               
Ejemplo n.º 3
0
    def __init__(self, args, run=run, progress=progress):
        """Store the user arguments, resolve the sample filters and load the data.

        `args` is any object whose `__dict__` holds the program arguments;
        arguments that are absent resolve to None. `run` and `progress` are the
        terminal reporting objects and are stored on the instance.

        Raises ConfigError when a file of samples to include or exclude yields
        no sample names. `sanity_check` is called as well and performs its own
        validation of the arguments.
        """
        self.run = run
        self.progress = progress

        # absent arguments resolve to None rather than raising
        A = vars(args).get
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.exclude_samples = A('exclude_samples')
        self.include_samples = A('include_samples')

        # Placeholders that are filled in later. Real empty DataFrames are used
        # here: `pd.DataFrame.empty` is the class-level property object, not a
        # frame, and does not support any DataFrame operation.
        self.profile_db = {}
        self.coverage_values_per_nt = {}
        self.gene_coverages = pd.DataFrame()
        self.gene_detections = pd.DataFrame()
        self.samples = {}
        self.sample_detection_information_was_initiated = False
        self.positive_samples = []
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame()
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame()
        self.samples_detection_information = pd.DataFrame()
        self.gene_presence_absence_in_samples = pd.DataFrame()
        self.gene_coverages_filtered = pd.DataFrame()
        self.additional_description = ''
        self.total_length = None
        self.samples_coverage_stats_dicts_was_initiated = False
        self.samples_coverage_stats_dicts = pd.DataFrame()
        self.non_outlier_indices = {}

        def read_sample_names(file_path):
            # One sample name per line, taken from the first TAB-separated
            # column. Plain text mode already gives universal newlines (the
            # 'U' flag was removed in Python 3.11), and the `with` block
            # closes the file.
            filesnpaths.is_file_exists(file_path)
            with open(file_path) as sample_names_file:
                return {line.split('\t')[0].strip() for line in sample_names_file}

        if self.exclude_samples:
            self.samples_to_exclude = read_sample_names(self.exclude_samples)

            if not self.samples_to_exclude:
                raise ConfigError(
                    "You asked to exclude samples, but provided an empty list."
                )

            self.run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        if self.include_samples:
            self.samples_to_include = read_sample_names(self.include_samples)

            if not self.samples_to_include:
                raise ConfigError(
                    "You provided an empty list of samples to include.")

            self.run.info(
                'Including Samples',
                'The following samples will be included: %s' %
                self.samples_to_include,
            )
        else:
            self.samples_to_include = set()

        # run sanity check on all input arguments
        self.sanity_check()

        if self.profile_db_path is None:
            # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
            pass
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                # with a collection only the samples are resolved here; the
                # per-bin data is not loaded in this method
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
                self.init_samples(self.summary.p_meta['samples'])
            else:
                self.profile_db = ProfileSuperclass(args)
                self.init_samples(self.profile_db.p_meta['samples'])
                self.profile_db.init_split_coverage_values_per_nt_dict()
                self.profile_db.init_gene_level_coverage_stats_dicts()
                self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                    self.profile_db.split_coverage_values_per_nt_dict,
                    self.samples)

                # comply with the new design and get gene_coverages and gene_detection dicts from
                # gene_level_coverage_stats_dict.
                gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts()

                self.init_coverage_and_detection_dataframes(
                    gene_coverages, gene_detection)

                # getting the total length of all contigs
                self.total_length = self.profile_db.p_meta['total_length']
Ejemplo n.º 4
0
class MetagenomeCentricGeneClassifier:
    def __init__(self, args, run=run, progress=progress):
        """Store the user arguments, resolve the sample filters and load the data.

        `args` is any object whose `__dict__` holds the program arguments;
        arguments that are absent resolve to None. `run` and `progress` are the
        terminal reporting objects and are stored on the instance.

        Raises ConfigError when a file of samples to include or exclude yields
        no sample names. `sanity_check` is called as well and performs its own
        validation of the arguments.
        """
        self.run = run
        self.progress = progress

        # absent arguments resolve to None rather than raising
        A = vars(args).get
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.exclude_samples = A('exclude_samples')
        self.include_samples = A('include_samples')

        # Placeholders that are filled in later. Real empty DataFrames are used
        # here: `pd.DataFrame.empty` is the class-level property object, not a
        # frame, and does not support any DataFrame operation.
        self.profile_db = {}
        self.coverage_values_per_nt = {}
        self.gene_coverages = pd.DataFrame()
        self.gene_detections = pd.DataFrame()
        self.samples = {}
        self.sample_detection_information_was_initiated = False
        self.positive_samples = []
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame()
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame()
        self.samples_detection_information = pd.DataFrame()
        self.gene_presence_absence_in_samples = pd.DataFrame()
        self.gene_coverages_filtered = pd.DataFrame()
        self.additional_description = ''
        self.total_length = None
        self.samples_coverage_stats_dicts_was_initiated = False
        self.samples_coverage_stats_dicts = pd.DataFrame()
        self.non_outlier_indices = {}

        def read_sample_names(file_path):
            # One sample name per line, taken from the first TAB-separated
            # column. Plain text mode already gives universal newlines (the
            # 'U' flag was removed in Python 3.11), and the `with` block
            # closes the file.
            filesnpaths.is_file_exists(file_path)
            with open(file_path) as sample_names_file:
                return {line.split('\t')[0].strip() for line in sample_names_file}

        if self.exclude_samples:
            self.samples_to_exclude = read_sample_names(self.exclude_samples)

            if not self.samples_to_exclude:
                raise ConfigError(
                    "You asked to exclude samples, but provided an empty list."
                )

            self.run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        if self.include_samples:
            self.samples_to_include = read_sample_names(self.include_samples)

            if not self.samples_to_include:
                raise ConfigError(
                    "You provided an empty list of samples to include.")

            self.run.info(
                'Including Samples',
                'The following samples will be included: %s' %
                self.samples_to_include,
            )
        else:
            self.samples_to_include = set()

        # run sanity check on all input arguments
        self.sanity_check()

        if self.profile_db_path is None:
            # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
            pass
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                # with a collection only the samples are resolved here; the
                # per-bin data is loaded by `get_coverage_and_detection_dict`
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
                self.init_samples(self.summary.p_meta['samples'])
            else:
                self.profile_db = ProfileSuperclass(args)
                self.init_samples(self.profile_db.p_meta['samples'])
                self.profile_db.init_split_coverage_values_per_nt_dict()
                self.profile_db.init_gene_level_coverage_stats_dicts()
                self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                    self.profile_db.split_coverage_values_per_nt_dict,
                    self.samples)

                # comply with the new design and get gene_coverages and gene_detection dicts from
                # gene_level_coverage_stats_dict.
                gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts()

                self.init_coverage_and_detection_dataframes(
                    gene_coverages, gene_detection)

                # getting the total length of all contigs
                self.total_length = self.profile_db.p_meta['total_length']

    def get_gene_coverages_and_gene_detection_dicts(self):
        gene_coverages = {}
        gene_detection = {}

        A = lambda x: self.profile_db.gene_level_coverage_stats_dict[
            gene_callers_id][sample_name][x]

        gene_caller_ids = list(
            self.profile_db.gene_level_coverage_stats_dict.keys())

        # populate gene coverage and detection dictionaries
        if self.profile_db.gene_level_coverage_stats_dict:
            for gene_callers_id in gene_caller_ids:
                gene_coverages[gene_callers_id], gene_detection[
                    gene_callers_id] = {}, {}

                for sample_name in self.profile_db.p_meta['samples']:
                    gene_coverages[gene_callers_id][sample_name] = A(
                        'mean_coverage')
                    gene_detection[gene_callers_id][sample_name] = A(
                        'detection')

        return gene_coverages, gene_detection

    def check_if_valid_portion_value(self, arg_name, arg_value):
        """Make sure `arg_value` is a non-zero portion.

        A valid value is greater than zero and at most 1; anything else raises
        a ConfigError that names the argument (`arg_name`) and the value.
        """
        is_not_positive = arg_value <= 0
        if is_not_positive or arg_value > 1:
            message = "%s value must be greater than zero and a max of 1, the value you supplied %s" % (
                arg_name, arg_value)
            raise ConfigError(message)

    def sanity_check(self):
        """Basic sanity check for class inputs.

        Raises ConfigError when:
          - neither or both of a profile database and a gene coverage data
            file were provided,
          - no output file prefix was provided,
          - alpha is not a float, or is outside of [0, 0.5),
          - a collection name was given without a profile database,
          - both a list of samples to include and to exclude were given.

        The output file is also checked with
        `filesnpaths.is_output_file_writable`.
        """

        if self.profile_db_path is None and self.gene_coverages_data_file_path is None:
            raise ConfigError(
                "You must provide either a profile.db or a gene coverage data file"
            )

        if self.profile_db_path and self.gene_coverages_data_file_path:
            raise ConfigError(
                "You provided both a profile database and a gene coverage data file, you "
                "must provide only one or the other (hint: if you have a profile database, "
                "then use that)")

        # checking output file. Without this guard a missing prefix would fail
        # on the string concatenation below with an unhelpful TypeError.
        if self.output_file_prefix is None:
            raise ConfigError("You must provide an output file prefix.")

        filesnpaths.is_output_file_writable(self.output_file_prefix +
                                            '-additional-layers.txt',
                                            ok_if_exists=False)

        # checking alpha
        if not isinstance(self.alpha, float):
            raise ConfigError("alpha value must be a type float.")
        # alpha must be a min of 0 and smaller than 0.5
        if self.alpha < 0 or self.alpha >= 0.5:
            raise ConfigError(
                "alpha must be a minimum of 0 and smaller than 0.5")

        # collections only exist in profile databases
        if self.collection_name and not self.profile_db_path:
            raise ConfigError(
                "You specified a collection name %s, but you provided a gene coverage data file. "
                "Collections are only available when working with a profile database."
                % self.collection_name)

        if self.exclude_samples and self.include_samples:
            raise ConfigError(
                "You cannot use both --include-samples and --exclude-samples! Please choose one."
            )

    def init_samples(self, samples_list):
        """ Create the set of samples according to user input and store it in self.samples"""
        samples = set(samples_list) - self.samples_to_exclude
        if self.include_samples:
            samples_to_include_that_are_not_there = self.samples_to_include - samples
            if samples_to_include_that_are_not_there:
                raise ConfigError(
                    "You requested to include some samples that are not in the profile database. Here are the samples in the profile database: %s. \
                                And here are the samples you requested, and that are not there: %s"
                    % (samples, samples_to_include_that_are_not_there))
            samples = self.samples_to_include
        self.samples = samples

    def init_coverage_and_detection_dataframes(self, gene_coverages_dict,
                                               gene_detection_dict):
        """ Populate the following: self.gene_coverages, self.Ng, self.gene_detections.

            Notice that this function could get as input either an object of ProfileSuperclass or of summarizer.Bin
        """
        self.gene_coverages = pd.DataFrame.from_dict(gene_coverages_dict,
                                                     orient='index',
                                                     dtype=float)
        self.Ng = len(self.gene_coverages.index)
        self.gene_detections = pd.DataFrame.from_dict(gene_detection_dict,
                                                      orient='index',
                                                      dtype=float)

        if self.include_samples or self.exclude_samples:
            # Only include samples that the user want
            self.gene_coverages = self.gene_coverages[list(self.samples)]
            self.gene_detections = self.gene_detections[list(self.samples)]

    def init_sample_detection_information(self):
        """ Determine  positive, negative, and ambiguous samples with the genome detection information
        (--alpha, --genome-detection-uncertainty)
        """

        # FIXME: some of the following variables are never used.
        MCG_samples_information_table_name = 'MCG_classifier_samples_information'
        MCG_samples_information_table_structure = [
            'samples', 'presence', 'detection',
            'number_of_taxon_specific_core_detected'
        ]
        MCG_samples_information_table_types = ['str', 'bool', 'int', 'int']

        # create an empty dataframe
        samples_information = pd.DataFrame(
            index=self.samples,
            columns=MCG_samples_information_table_structure[1:])
        positive_samples = []
        negative_samples = []

        self.progress.new("Setting presence/absence in samples")
        num_samples, counter = len(self.samples), 1
        detection = {}
        for sample in self.samples:
            if num_samples > 100 and counter % 100 == 0:
                self.progress.update('%d of %d samples...' %
                                     (counter, num_samples))
            print("total length for %s is %s" % (sample, self.total_length))
            print(
                "the length of the vector: %s" %
                len(self.coverage_values_per_nt[sample])
            )  # FIXME: after testing this module, delete this line. it is only here to make sure that anvio is not lying to us.
            print(
                "number of nucleotide positions with non zero coverage in %s is %s "
                % (sample, np.count_nonzero(
                    self.coverage_values_per_nt[sample])))
            detection[sample] = np.count_nonzero(
                self.coverage_values_per_nt[sample]) / self.total_length
            if detection[sample] >= 0.5 + self.alpha:
                positive_samples.append(sample)
                samples_information['presence'][sample] = True
            elif detection[sample] <= 0.5 - self.alpha:
                negative_samples.append(sample)
                samples_information['presence'][sample] = False
            else:
                samples_information['presence'][sample] = None
            samples_information['detection'][sample] = detection[sample]
            counter += 1
        self.progress.end()

        self.positive_samples = positive_samples
        self.number_of_positive_samples = len(self.positive_samples)
        self.negative_samples = negative_samples
        self.samples_detection_information = samples_information
        self.run.warning('The number of positive samples is %s' %
                         self.number_of_positive_samples)
        self.run.warning('The number of negative samples is %s' %
                         len(self.negative_samples))
        self.sample_detection_information_was_initiated = True

    def init_samples_coverage_stats_dict(self):
        """Populate `self.samples_coverage_stats_dicts` and `self.non_outlier_indices`.

        For every positive sample, `get_non_outliers_information` is run on the
        per-nucleotide coverage values of the sample. The indices it returns
        are stored in `self.non_outlier_indices[sample]` and the statistics in
        the row of the sample in `self.samples_coverage_stats_dicts`.
        """
        if not self.sample_detection_information_was_initiated:
            # the positive samples must be known before going through them
            # (this used to reference the method without calling it)
            self.init_sample_detection_information()

        self.samples_coverage_stats_dicts = pd.DataFrame(
            index=list(self.samples),
            columns=columns_for_samples_coverage_stats_dict)

        # only the positive samples are processed, so progress is measured
        # against their number
        num_positive_samples = len(self.positive_samples)
        self.progress.new(
            "Finding nucleotide positions in samples with outlier coverage values"
        )
        for counter, sample in enumerate(self.positive_samples, start=1):
            if num_positive_samples > 100 and counter % 100 == 0:
                self.progress.update('%d of %d samples...' %
                                     (counter, num_positive_samples))

            # get the non-zero non-outlier information
            non_outlier_indices, coverage_stats = get_non_outliers_information(
                self.coverage_values_per_nt[sample])
            self.non_outlier_indices[sample] = non_outlier_indices
            self.samples_coverage_stats_dicts.loc[sample, ] = coverage_stats

            self.run.info_single(
                'The mean and std of non-outliers in sample %s are: %s, %s respectively'
                % (sample,
                   self.samples_coverage_stats_dicts.loc[
                       sample, 'non_outlier_mean_coverage'],
                   self.samples_coverage_stats_dicts.loc[
                       sample, 'non_outlier_coverage_std']))
            number_of_non_outliers = len(non_outlier_indices)
            self.run.info_single(
                'The number of non-outliers is %s of %s (%.2f%%)' %
                (number_of_non_outliers, self.total_length,
                 100.0 * number_of_non_outliers / self.total_length))
        self.progress.end()

        # NOTE(review): `self.samples_coverage_stats_dicts_was_initiated` is
        # deliberately left untouched. `plot_TS` checks it, so setting it here
        # would stop the statistics from being recomputed after new coverage
        # data is loaded (e.g. by `get_coverage_and_detection_dict`).

    def plot_TS(self):
        """Save two plots per positive sample into a PDF file.

        The first plot shows the sorted per-nucleotide coverage values
        (outliers in red, non-outliers in blue). The second one is a histogram
        of the coverage values of the non-outliers. The files are written to
        the directory `<output_file_prefix>-TS-plots/`, one PDF per sample.
        """
        if not self.samples_coverage_stats_dicts_was_initiated:
            self.init_samples_coverage_stats_dict()

        additional_description = ''
        if self.additional_description:
            additional_description = '-' + self.additional_description

        plot_dir = self.output_file_prefix + '-TS-plots' + '/'
        os.makedirs(plot_dir, exist_ok=True)
        self.progress.new(
            'Saving figures of taxon specific distributions to pdf')
        number_of_finished = 0
        for sample in self.positive_samples:
            coverages_pdf_output = plot_dir + sample + additional_description + '-coverages.pdf'
            v = self.coverage_values_per_nt[sample]
            non_outlier_indices = self.non_outlier_indices[sample]
            non_outlier_values = v[non_outlier_indices]
            non_outlier_mean = self.samples_coverage_stats_dicts[
                'non_outlier_mean_coverage'][sample]
            non_outlier_std = self.samples_coverage_stats_dicts[
                'non_outlier_coverage_std'][sample]

            # Using argsort so we can use the non-outlier indices
            sorting_indices = np.argsort(v)
            # we need the reverse of the sorting of the indices to create the
            # x axis for the non-outliers
            reverse_sorted_indices = np.zeros(len(sorting_indices))
            reverse_sorted_indices[sorting_indices] = range(
                len(reverse_sorted_indices))

            pdf_output_file = PdfPages(coverages_pdf_output)
            try:
                # plotting the ordered coverage values (per nucleotide)
                # the non-outliers are plotted in blue
                # the outlier values are plotted in red
                fig = plt.figure()
                ax = fig.add_subplot(111, rasterized=True)
                # these are methods and must be called; assigning to them
                # replaces the method and leaves the axes without labels
                ax.set_xlabel('Nucleotide Number (ordered)')
                # NOTE(review): the label text is kept as it was written, but
                # the plotted values are the coverages, not their square --
                # confirm the intended label.
                ax.set_ylabel(r'$Nucleotide Coverage^2$')
                # FIXME: this shouldn't be in the loop (only here because I need to fix the mock data)
                x1 = range(len(v))
                x2 = reverse_sorted_indices[non_outlier_indices]
                # plot all in red
                ax.semilogy(x1, v[sorting_indices], 'r.', rasterized=True)
                # plot on top the non-outliers in blue
                ax.semilogy(x2, non_outlier_values, 'b.', rasterized=True)
                fig.suptitle("%s - sorted coverage values with outliers" % sample)
                plt.savefig(pdf_output_file, format='pdf')
                plt.close()

                # plotting a histogram of the non-outliers
                # This would allow to see if they resemble a normal distribution
                hist_range = (min(non_outlier_values), max(non_outlier_values))
                # computing the number of bins so that the width of a bin is ~1/4 of the standard deviation
                # FIXME: need to make it so the bins are only of integers (so the smallest bin is of width 1
                # and that bins are integers)
                coverage_span = hist_range[1] - hist_range[0]
                if non_outlier_std > 0 and coverage_span > 0:
                    number_of_hist_bins = int(
                        np.ceil(coverage_span / (non_outlier_std / 4)))
                else:
                    # all values are identical (or the std is unusable): a
                    # single bin avoids a division by zero and a zero bin count
                    number_of_hist_bins = 1

                fig = plt.figure()
                ax = fig.add_subplot(111, rasterized=True)
                ax.set_xlabel('Coverage')
                ax.hist(non_outlier_values,
                        bins=number_of_hist_bins,
                        range=hist_range,
                        rasterized=True)
                fig.suptitle("%s - histogram of non-outliers" % sample)
                # adding the mean and std of the non-outliers as text to the plot
                # (backslashes are escaped: '\m' and '\s' are not valid escapes)
                text_for_hist = '$\\mu = %d$\n $\\sigma = %d$' % (
                    non_outlier_mean, non_outlier_std)
                ax.text(0.8,
                        0.9,
                        text_for_hist,
                        ha='center',
                        va='center',
                        transform=ax.transAxes)
                plt.savefig(pdf_output_file, format='pdf')
                plt.close()
            finally:
                # close the pdf file, also when plotting fails
                pdf_output_file.close()

            number_of_finished += 1
            self.progress.update(
                "Finished %d of %d" %
                (number_of_finished, self.number_of_positive_samples))
        self.progress.end()

    def get_gene_classes(self):
        """Entry point of the classification: compute the class information of every gene.

        Replaces `self.gene_class_information` with an empty data frame that has a
        single 'gene_class' column and shares its index with `self.gene_coverages`,
        then calls `init_sample_detection_information` followed by `plot_TS`.
        """
        # A brand new table is created on every call; this matters when the
        # algorithm is run on a list of bins, one bin after the other.
        gene_ids = self.gene_coverages.index
        self.gene_class_information = pd.DataFrame(columns=['gene_class'],
                                                   index=gene_ids)

        # set the presence/absence values for samples
        self.init_sample_detection_information()

        # per the original note, this step finds the taxon-specific genes for
        # each sample (plot_TS is defined outside this view)
        self.plot_TS()

    def get_coverage_and_detection_dict(self, bin_id):
        """Load the coverage and detection data of a single bin into this object.

        Builds a `summarizer.Bin` for `bin_id` out of `self.summary` and stores:
          - `self.coverage_values_per_nt`: result of
            `get_coverage_values_per_nucleotide` for the bin and `self.samples`
          - `self.total_length`: the bin's `total_length`
        It then forwards the bin's gene coverages and gene detection tables to
        `init_coverage_and_detection_dataframes`.
        """
        bin_summary = summarizer.Bin(self.summary, bin_id)

        per_nt_coverage = get_coverage_values_per_nucleotide(
            bin_summary.split_coverage_values_per_nt_dict, self.samples)
        self.coverage_values_per_nt = per_nt_coverage

        # total length of all contigs (as reported by the bin summary)
        self.total_length = bin_summary.total_length

        self.init_coverage_and_detection_dataframes(
            bin_summary.gene_coverages, bin_summary.gene_detection)

    def classify(self):
        """Classify genes, one bin at a time when a collection is given.

        Without `self.collection_name` the classification runs once on the entire
        detection table. With a collection, the bins to process are chosen in this
        order of precedence:
          1. the names listed (one per line) in `self.bin_ids_file_path`
          2. the single bin `self.bin_id`
          3. every bin of the collection (`self.summary.bin_ids`)

        Raises:
            ConfigError: if a requested bin is not part of the collection.
        """
        if not self.collection_name:
            # No collection provided so running on the entire detection table
            self.get_gene_classes()
            return

        bin_names_in_collection = self.summary.bin_ids

        if self.bin_ids_file_path:
            filesnpaths.is_file_exists(self.bin_ids_file_path)

            # The context manager guarantees the file handle is closed. Blank
            # lines are skipped so that they are not reported as missing bins
            # with an empty name.
            with open(self.bin_ids_file_path) as bin_ids_file:
                stripped_lines = (line.strip() for line in bin_ids_file)
                bin_names_of_interest = [name for name in stripped_lines if name]

            missing_bins = [
                b for b in bin_names_of_interest
                if b not in bin_names_in_collection
            ]
            if missing_bins:
                raise ConfigError(
                    "Some bin names you declared do not appear to be in the collection %s. \
                                        These are the bins that are missing: %s, these are the bins that are \
                                        actually in your collection: %s" %
                    (self.collection_name, missing_bins,
                     bin_names_in_collection))
        elif self.bin_id:
            if self.bin_id not in bin_names_in_collection:
                raise ConfigError(
                    "The bin you declared, %s, does not appear to be in the collection %s."
                    % (self.bin_id, self.collection_name))
            bin_names_of_interest = [self.bin_id]
        else:
            bin_names_of_interest = bin_names_in_collection

        for bin_id in bin_names_of_interest:
            self.run.info_single('Classifying genes in bin: %s' % bin_id)
            self.get_coverage_and_detection_dict(bin_id)
            self.additional_description = bin_id
            self.get_gene_classes()
            # NOTE: storing the results per bin is currently disabled:
            #self.save_gene_class_information_in_additional_layers(bin_id)
            #self.save_samples_information(bin_id)
Ejemplo n.º 5
0
    def __init__(self, args, external_clustering = None):
        """Gather and validate everything an interactive session needs.

        Options are read from `args` (anything missing from `args.__dict__` is
        treated as None). The annotation superclass is always initialized; when a
        profile database path is given the profile superclass is initialized too and
        the data is loaded through `load_from_profile_database`, otherwise through
        `load_from_files`. Finally the leaf order of the default clustering tree
        becomes `self.split_names_ordered`, the views are pruned accordingly, and
        the metadata is converted to JSON.

        `external_clustering`, when given, must provide the keys 'clusterings' and
        'default_clustering'; it overrides the clusterings found in the project.

        Raises:
            ConfigError: if a profile database is given without an annotation
                database, or if no hierarchical clustering is available.
        """
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        # `in` works on both Python 2 and 3 (dict.has_key is Python 2 only)
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.state = A('state')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_metadata_path = A('additional_metadata')
        self.additional_view_path = A('additional_view')
        self.profile_db_path = A('profile_db')
        self.annotation_db_path = A('annotation_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.metadata = A('metadata')
        self.tree = A('tree')
        # NOTE: this replaces the 'Unknown Project' placeholder set above, possibly with None
        self.title = A('title')
        self.summary_index = A('summary_index')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.skip_check_names = A('skip_check_names')

        self.split_names_ordered = None
        self.splits_summary_index = {}
        self.additional_metadata = None
        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        AnnotationSuperclass.__init__(self, self.args)

        if self.annotation_db_path:
            self.completeness = completeness.Completeness(self.annotation_db_path)
            self.collections.populate_sources_dict(self.annotation_db_path, anvio.__annotation__version__)
        else:
            self.completeness = None

        if self.annotation_db_path and self.profile_db_path:
            # make sure we are not dealing with apples and oranges here.
            is_annotation_and_profile_dbs_compatible(self.annotation_db_path, self.profile_db_path)

        # resolves a file name relative to the output directory of the project
        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if self.profile_db_path:
            if not self.annotation_db_path:
                raise ConfigError("Anvi'o needs the annotation database to make sense of this run.")

            ProfileSuperclass.__init__(self, args)

            # this is a weird place to do it, but we are going to ask AnnotationSuperclass function to load
            # all the split sequences since only now we know the min_contig_length that was used to profile
            # this stuff
            self.init_split_sequences(self.p_meta['min_contig_length'])

            self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

            self.load_from_profile_database(args)
        else:
            self.load_from_files(args)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            # materialize the keys so the value is a plain list on Python 3 as well
            self.p_meta['available_clusterings'] = list(self.clusterings.keys())
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped automatically by the platform. Please\
                                    read the help menu for anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial.")

        tree = Tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'], format = 1)

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = [n.name for n in tree.get_leaves()]

        # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the annotation database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Annotation DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer = self.split_hmm_layers)

        if self.additional_metadata_path:
            filesnpaths.is_file_tab_delimited(self.additional_metadata_path)
            self.additional_metadata = self.additional_metadata_path

        self.check_names_consistency()
        self.convert_metadata_into_json()
Ejemplo n.º 6
0
    def load_full_mode(self, args):
        """Prepare the session from a contigs database and a profile database.

        Initializes the profile superclass, loads split sequences, collections,
        the states table and the views (a single 'blank_view' for blank profiles),
        optionally adds a user supplied view and tree, picks the default view and
        the title, and warns when auxiliary data is missing.

        When `self.show_views` or `self.show_states` is set, the available items
        are listed and the process exits through `sys.exit()`.

        Raises:
            ConfigError: if the contigs or profile database path is missing, if the
                additional view does not end with a '__parent__' column, or if the
                requested view / state is not available.
        """
        if not self.contigs_db_path:
            raise ConfigError("Anvi'o needs the contigs database to make sense of this run (or maybe you\
                                should use the `--manual` flag if that's what your intention).")

        if not self.profile_db_path:
            raise ConfigError("So you want to run anvi'o in full mode, but without a profile database?\
                                Well. This does not make any sense.")

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
        # all the split sequences since only now we know the min_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_collections_dict(self.profile_db_path,
                                                   anvio.__profile__version__)

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(
            os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path,
                                            anvio.__profile__version__)

        # load views from the profile database
        if self.p_meta['blank']:
            # a blank profile has no data; give every split a single dummy value
            blank_dict = {}
            for split_name in self.splits_basic_info:
                blank_dict[split_name] = {'blank_view': 0}

            self.views['blank_view'] = {
                'header': ['blank_view'],
                'dict': blank_dict
            }
            self.default_view = 'blank_view'

        else:
            self.load_views()
            self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('',
                        header='Available views (%d)' % len(self.views),
                        lc='green')
            for view in self.views:
                run.info(
                    view,
                    'Via "%s" table' % self.views[view]['table_name'],
                    lc='crimson',
                    mc='green' if view == self.default_view else 'crimson')
            # prints an empty line both as a Python 2 statement and as a Python 3 call
            print('')
            sys.exit()

        if self.show_states:
            run.warning('',
                        header='Available states (%d)' %
                        len(self.states_table.states),
                        lc='green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' %
                         self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print('')
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(
                self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split.")

            # one str, then floats, then a trailing str (the '__parent__' column)
            column_mapping = [
                str
            ] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {
                'table_name':
                'NA',
                'header':
                additional_view_columns,
                'dict':
                utils.get_TAB_delimited_file_as_dictionary(
                    self.additional_view_path, column_mapping=column_mapping)
            }

        # if the user specifies a view, set it as default:
        if self.view:
            if self.view not in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings

        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
                self.tree)

            # read the user supplied tree once, and make sure the handle gets closed
            with open(os.path.abspath(self.tree)) as tree_file:
                newick = tree_file.read()

            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = clustering_id
                self.p_meta['available_clusterings'] = [clustering_id]
                self.p_meta['clusterings'] = {
                    clustering_id: {
                        'newick': newick
                    }
                }
                run.info(
                    'Additional Tree',
                    "Splits will be organized based on '%s'." % clustering_id)
            else:
                self.p_meta['clusterings'][clustering_id] = {
                    'newick': newick
                }
                run.info(
                    'Additional Tree',
                    "'%s' has been added to available trees." % clustering_id)

        # set title: a user supplied title is kept, otherwise derive one from the sample id
        if not self.title:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace(
                '_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_profile_data_available:
            summary_cp_available = os.path.exists(
                os.path.join(os.path.dirname(self.profile_db_path),
                             'SUMMARY.cp'))
            self.run.warning(
                "Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" %
                ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."
                 ))

        if self.state_autoload:
            if self.state_autoload not in self.states_table.states:
                raise ConfigError("The requested state ('%s') is not available for this run. Please see\
                                          available states by running this program with --show-states flag." % self.state_autoload)
Ejemplo n.º 7
0
class AlonsClassifier:
    def __init__(self, args, run=run, progress=progress):
        """Read the classifier settings from `args` and load the coverage data.

        Any option that is not present in `args.__dict__` is stored as None. After
        `sanity_check` passes, the data comes from one of three places:
          - no profile database: the TAB-delimited file(s), via `get_data_from_txt_file`
          - profile database and a collection name: a `ProfileSummarizer` is
            initialized and kept in `self.summary` (bins are loaded later)
          - profile database only: gene coverages / detection dicts of the profile
            database; the sample names are taken from the first gene's entry
        """
        self.run = run
        self.progress = progress

        # missing arguments resolve to None
        get_arg = args.__dict__.get

        # input / output locations
        self.gene_coverages_data_file_path = get_arg('data_file')
        self.gene_detection_data_file_path = get_arg('gene_detection_data_file')
        self.profile_db_path = get_arg('profile_db')
        self.output_file_prefix = get_arg('output_file_prefix')

        # thresholds of the algorithm
        self.alpha = get_arg('alpha')
        self.beta = get_arg('beta')
        self.gamma = get_arg('gamma')
        self.eta = get_arg('eta')
        self.zeta = get_arg('zeta')

        self.additional_layers_to_append = get_arg('additional_layers_to_append')
        self.samples_information_to_append = get_arg('samples_information_to_append')
        self.number_of_positive_samples = None

        # collection / bin selection
        self.collection_name = get_arg('collection_name')
        self.bin_id = get_arg('bin_id')
        self.bin_ids_file_path = get_arg('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = get_arg(
            'store_gene_detections_and_gene_coverages_tables')

        # containers that get filled while the data is loaded and classified
        self.gene_coverages = {}
        self.gene_detection = {}
        self.samples = {}
        self.positive_samples = {}
        self.negative_samples = {}
        self.gene_class_information = {}
        self.samples_information = {}
        self.profile_db = {}

        self.sanity_check()

        if self.profile_db_path is None:
            self.get_data_from_txt_file()
            return

        # load sample list and gene_coverage_dict from the merged profile db
        args.init_gene_coverages = True
        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
            return

        self.profile_db = ProfileSuperclass(args)
        self.profile_db.init_gene_coverages_and_detection_dicts()
        self.gene_coverages = self.profile_db.gene_coverages_dict
        self.gene_detection = self.profile_db.gene_detection_dict

        # every gene entry is keyed by sample, so the first one yields the sample names
        first_gene_entry = next(iter(self.gene_coverages.values()))
        self.samples = set(first_gene_entry.keys())

    def sanity_check(self):
        """Basic sanity check for class inputs.

        Verifies that exactly one data source (profile database or gene coverage
        matrix file) was given, that the additional layers output file can be
        written, that alpha, beta, gamma and eta have acceptable types and ranges,
        and that a collection name is only used together with a profile database.

        Raises:
            ConfigError: on the first check that fails.
        """
        if self.profile_db_path is None and self.gene_coverages_data_file_path is None:
            raise ConfigError(
                "You must provide either a profile.db or a gene coverage matrix data file"
            )

        if self.profile_db_path and self.gene_coverages_data_file_path:
            raise ConfigError(
                "You provided both a profile database and a gene coverage matrix data file, you \
            must provide only one or the other (hint: if you have a profile database, then use that)"
            )

        # checking output file
        if self.output_file_prefix is None:
            # without this guard the concatenation below fails with a TypeError
            raise ConfigError("You must provide an output file prefix.")
        filesnpaths.is_output_file_writable(self.output_file_prefix +
                                            '-additional-layers.txt',
                                            ok_if_exists=False)
        # checking alpha
        if not isinstance(self.alpha, float):
            raise ConfigError("alpha value must be a type float.")
        if self.alpha <= 0 or self.alpha > 1:
            raise ConfigError(
                "alpha value must be greater than zero and a max of 1, the value you supplied %s"
                % self.alpha)

        # Checking beta
        if not isinstance(self.beta, float):
            raise ConfigError("beta value must be a type float.")
        if self.beta <= 0:
            raise ConfigError(
                "beta value must be greater than zero, the value you supplied %s"
                % self.beta)

        # Checking gamma
        if not isinstance(self.gamma, float):
            raise ConfigError("Gamma value must be a type float.")
        if self.gamma <= 0:
            raise ConfigError(
                "Gamma value must be greater than zero, the value you supplied %s"
                % self.gamma)

        # Checking eta. Integers stay acceptable (they were never rejected
        # before); anything non-numeric, e.g. a missing value, now produces a
        # ConfigError instead of a TypeError from the comparison below.
        if not isinstance(self.eta, (int, float)):
            raise ConfigError("eta value must be a number.")
        if self.eta <= 0 or self.eta > 1:
            raise ConfigError(
                "eta value must be greater than zero and a max of 1, the value you supplied %s"
                % self.eta)

        if self.collection_name:
            if not self.profile_db_path:
                raise ConfigError(
                    "You specified a collection name %s, but you provided a gene coverage matrix data file \
                 collections are only available when working with a profile database."
                    % self.collection_name)

    def get_data_from_txt_file(self):
        """Read the coverage (and optionally detection) tables from TAB-delimited files.

        Fills `self.samples` with the column names of the coverage file and
        `self.gene_coverages` with its content. When a gene detection file was
        supplied it is loaded into `self.gene_detection` and checked against the
        coverage table: every gene id of the coverage table, and every sample of
        its first entry, must also exist in the detection table (the reverse is
        not checked).

        Raises:
            ConfigError: if the two tables are not compatible.
        """
        coverages_path = self.gene_coverages_data_file_path
        detection_path = self.gene_detection_data_file_path

        self.samples = set(utils.get_columns_of_TAB_delim_file(coverages_path))

        # one int converter followed by one float converter per sample
        self.gene_coverages = utils.get_TAB_delimited_file_as_dictionary(
            coverages_path,
            column_mapping=[int] + [float] * len(self.samples))

        # nothing else to do unless a gene_detection file was also supplied
        if not detection_path:
            return

        self.gene_detection = utils.get_TAB_delimited_file_as_dictionary(
            detection_path,
            column_mapping=[int] + [float] * len(self.samples))

        # gene ids: gene_detection has to contain everything gene_coverages has
        for gene_id in self.gene_coverages:
            if gene_id in self.gene_detection:
                continue
            raise ConfigError(
                "Your tables are not compatible. For example gene_id %s is in %s, but not in %s"
                % (gene_id, coverages_path, detection_path))

        # sample ids: compare the first entry of each table
        first_detection_entry = next(iter(self.gene_detection.values()))
        gene_detection_sample_list = first_detection_entry.keys()
        first_coverage_entry = next(iter(self.gene_coverages.values()))
        for sample_id in first_coverage_entry.keys():
            if sample_id in gene_detection_sample_list:
                continue
            raise ConfigError(
                "Your tables are not compatible. For example sample_id %s is in %s, but not in %s"
                % (sample_id, coverages_path, detection_path))

    def apply_func_to_genes_in_sample(self, func, list_of_genes=None):
        """Apply `func` to the coverage values of a set of genes, sample by sample.

        For every sample in `self.samples`, `func` is called once with the list of
        coverage values of the genes in `list_of_genes` (in that order) and is
        expected to accept a list. When `list_of_genes` is None or empty, all the
        genes in `self.gene_coverages` are used.

        Returns a dictionary that maps each sample id to the value `func` returned.
        """
        if not list_of_genes:
            list_of_genes = self.gene_coverages.keys()

        return {
            sample_id: func([
                self.gene_coverages[gene_id][sample_id]
                for gene_id in list_of_genes
            ])
            for sample_id in self.samples
        }

    def get_mean_coverage_in_samples(self, list_of_genes=None):
        """Return the mean coverage of `list_of_genes` in every sample.

        The result is a dictionary keyed by sample id, produced by
        `apply_func_to_genes_in_sample` with `np.mean`; `list_of_genes` is passed
        on unchanged (so None selects all genes there).

        NOTE: when `self.samples` is empty the integer 0 is returned instead of a
        dictionary.
        """
        if self.samples:
            return self.apply_func_to_genes_in_sample(np.mean, list_of_genes)

        # if all samples don't contain the genome then return 0 for mean value
        return 0

    def get_std_in_samples(self, list_of_genes=None):
        """Return the standard deviation of the coverage of `list_of_genes` in every sample.

        The result is the dictionary (keyed by sample id) that
        `apply_func_to_genes_in_sample` builds with `np.std`; `list_of_genes` is
        passed on unchanged (so None selects all genes there).
        """
        return self.apply_func_to_genes_in_sample(np.std, list_of_genes)

    def get_detection_of_genes(self, mean_coverage_in_samples, std_in_samples):
        """Decide, for every gene and every sample, whether the gene is detected.

        Returns a dictionary of dictionaries. For each gene id the inner
        dictionary holds one entry per sample plus two summary entries:
          - 'number_of_detections': number of samples the gene is detected in
          - 'detected_in_non_positive_samples': number of negative samples in which
            the coverage criterion alone detects the gene

        Coverage criterion, as implemented:
          - samples listed in `self.negative_samples` (and not in
            `self.positive_samples`): coverage > max(0, mean + gamma * std)
          - every other sample (positive or ambiguous):
            coverage > max(0, mean - gamma * std)
        where mean and std are the per-sample values passed in. When
        `self.gene_detection` is available, the gene detection value of the
        sample must also exceed `self.zeta`.

        A single notice is printed when at least one gene has non-zero coverage
        below mean - gamma * std in some sample.
        """
        detection_of_genes = {}
        covered_but_below_lower_bound = False

        for gene_id in self.gene_coverages:
            coverage_per_sample = self.gene_coverages[gene_id]
            gene_record = {
                'number_of_detections': 0,
                'detected_in_non_positive_samples': False,
            }
            detection_of_genes[gene_id] = gene_record

            for sample in self.samples:
                coverage = coverage_per_sample[sample]
                sample_mean = mean_coverage_in_samples[sample]
                lower_bound = sample_mean - self.gamma * std_in_samples[sample]

                is_negative_sample = (sample not in self.positive_samples
                                      and sample in self.negative_samples)
                if is_negative_sample:
                    # stricter threshold: the coverage has to be clearly above
                    # the mean for the gene to count as detected
                    upper_bound = sample_mean + self.gamma * std_in_samples[sample]
                    detected = coverage > max(0, upper_bound)
                    gene_record['detected_in_non_positive_samples'] += detected
                else:
                    # positive samples, and ambiguous ones (neither negative nor positive)
                    detected = coverage > max(0, lower_bound)

                if self.gene_detection:
                    # gene detection (previously known as "percent covered") is
                    # used as an additional requirement when it is available
                    gene_detection_above_threshold = \
                        self.gene_detection[gene_id][sample] > self.zeta
                    detected = detected * gene_detection_above_threshold

                gene_record[sample] = detected
                gene_record['number_of_detections'] += detected

                if coverage > 0 and coverage < lower_bound:
                    covered_but_below_lower_bound = True

        if covered_but_below_lower_bound:
            print(
                'some genes in some samples were marked as not detected due to the detection criteria'
            )

        return detection_of_genes

    def get_samples_information(self,
                                detection_of_genes,
                                alpha,
                                genes_to_consider=None):
        """Determine in which samples the reference genome is detected.

        For every sample in `self.samples` the genes of `genes_to_consider` that
        are detected in it (according to `detection_of_genes`) are counted, and
        the sample is labelled by comparing that count to the number of genes
        considered (n):
          - count > alpha * n        -> 'detection' is True  (positive sample)
          - count > 0.5 * alpha * n  -> 'detection' is None  (neither positive nor negative)
          - otherwise                -> 'detection' is False (negative sample)

        When `genes_to_consider` is None or empty, all genes in
        `detection_of_genes` are considered.

        Nothing is returned. The per-sample dictionaries (keys 'detection' and
        'number_of_detected_genes') are stored in `self.samples_information`, and
        the number of positive samples in `self.number_of_positive_samples`,
        which is also reported through `self.run.warning`.
        """
        if not genes_to_consider:
            # if no list of genes is supplied then considering all genes
            genes_to_consider = detection_of_genes.keys()

        # the thresholds do not depend on the sample, so compute them only once
        number_of_genes = len(genes_to_consider)
        positive_threshold = alpha * number_of_genes
        ambiguous_threshold = 0.5 * alpha * number_of_genes

        samples_information = {}
        self.number_of_positive_samples = 0
        for sample_id in self.samples:
            number_of_detected_genes_in_sample = sum(
                1 for gene_id in genes_to_consider
                if detection_of_genes[gene_id][sample_id])

            # There should be a better way to do this, but for now, if there is an intermediate value of genes that is
            # detected then the sample would be considered neither positive nor negative
            if number_of_detected_genes_in_sample > positive_threshold:
                detection = True
            elif number_of_detected_genes_in_sample > ambiguous_threshold:
                detection = None
            else:
                detection = False

            if detection:
                self.number_of_positive_samples += 1

            samples_information[sample_id] = {
                'detection': detection,
                'number_of_detected_genes': number_of_detected_genes_in_sample,
            }

        self.samples_information = samples_information
        self.run.warning('The number of positive samples is: %s ' %
                         self.number_of_positive_samples)

    def get_adjusted_std_for_gene_id(self, gene_id, mean_coverage_in_samples,
                                     detection_of_genes):
        """Return the adjusted standard deviation and adjusted mean of one gene.

        Only positive samples (`self.positive_samples`) in which the gene is
        detected are taken into account. In each of them the coverage of the gene
        is divided by the mean coverage of the sample; the standard deviation and
        the mean of these ratios are returned as the tuple
        (adjusted_std, adjusted_mean).

        When the gene is not detected in any positive sample the sentinel pair
        (-1, 0) is returned; callers treat -1 as "coverage consistency undefined".
        """
        # Note: a gene detected in a sample where the genome is NOT detected is a
        # good sign that it is a TNS gene, yet the adjusted std keeps its original
        # definition and looks at positive samples only.
        positive_samples_with_gene = [
            sample_id for sample_id in self.positive_samples
            if detection_of_genes[gene_id][sample_id]
        ]

        if not positive_samples_with_gene:
            # If no positive samples contain the gene then we want the coverage consistency to be None
            return -1, 0

        coverage_of_gene = self.gene_coverages[gene_id]
        adjusted_gene_coverages = []
        for sample_id in positive_samples_with_gene:
            ratio = coverage_of_gene[sample_id] / mean_coverage_in_samples[sample_id]
            adjusted_gene_coverages.append(ratio)

        adjusted_mean = np.mean(adjusted_gene_coverages)
        adjusted_std = np.std(adjusted_gene_coverages)
        return adjusted_std, adjusted_mean

    def get_adjusted_stds(self, mean_coverage_in_samples, detection_of_genes):
        """Compute the adjusted standard deviation and adjusted mean of every gene.

        Returns a tuple of two dictionaries keyed by gene id,
        (adjusted_std, adjusted_mean), filled with the values that
        `get_adjusted_std_for_gene_id` returns for each gene in
        `self.gene_coverages`.

        Raises:
            ConfigError: if `self.number_of_positive_samples` is zero (or None).
        """
        # Make sure that there is at least one positive sample
        if not self.number_of_positive_samples:
            raise ConfigError(
                "It seems that the reference genome is not detected in any of your samples. \
            You might consider changing some of the parameters (for example --min-gene-detection, --min-portion-of-TSC \
            or --core-min-detection).")

        adjusted_std, adjusted_mean = {}, {}
        for gene_id in self.gene_coverages:
            std_of_gene, mean_of_gene = self.get_adjusted_std_for_gene_id(
                gene_id, mean_coverage_in_samples, detection_of_genes)
            adjusted_std[gene_id] = std_of_gene
            adjusted_mean[gene_id] = mean_of_gene

        return adjusted_std, adjusted_mean

    def get_gene_specificity(self, detection_of_genes):
        """Label every gene according to the kind of samples it is detected in.

        Returns a dictionary keyed by gene id whose values are:
          - False: the gene is detected in at least one negative sample
          - None:  the gene is not detected in any sample
          - True:  the gene is only detected in positive samples
        """
        gene_specificity = {}
        for gene_id, record in detection_of_genes.items():
            if record['detected_in_non_positive_samples']:
                specificity = False
            elif record['number_of_detections'] == 0:
                specificity = None
            else:
                specificity = True
            gene_specificity[gene_id] = specificity
        return gene_specificity

    def get_coverage_consistency(self, adjusted_stds, detection_of_genes,
                                 beta):
        """Flag each gene whose adjusted standard deviation is below `beta`.

        For every gene id in `adjusted_stds` the result is:
          * None  - the gene has at most one detection, or its adjusted
                    standard deviation is the -1 sentinel
          * True  - the adjusted standard deviation is smaller than `beta`
          * False - otherwise
        """
        consistency_by_gene = {}

        for gene_id, gene_std in adjusted_stds.items():
            is_undefined = (
                detection_of_genes[gene_id]['number_of_detections'] <= 1
                or gene_std == -1)
            # bool() keeps the stored value a plain True/False
            consistency_by_gene[gene_id] = (None if is_undefined else
                                            bool(gene_std < beta))
        return consistency_by_gene

    def get_taxon_specificity(self, coverage_consistency, gene_specificity):
        """Combine coverage consistency and gene specificity into one label.

        For every gene id in `coverage_consistency` the result is:
          * None  - either input is None for that gene
          * 'TS'  - both inputs are truthy
          * 'TNS' - otherwise
        """
        labels = {}
        for gene_id, is_consistent in coverage_consistency.items():
            if is_consistent is None or gene_specificity[gene_id] is None:
                label = None
            elif is_consistent and gene_specificity[gene_id]:
                label = 'TS'
            else:
                label = 'TNS'
            labels[gene_id] = label
        return labels

    def get_loss_function_value(self, taxon_specificity, adjusted_stds, beta):
        """Sum the per-gene penalties into a single loss value.

        A gene labelled 'TS' contributes its adjusted standard deviation; any
        other gene contributes `beta`.
        """
        total_loss = 0
        for gene_id, label in taxon_specificity.items():
            # Notice: the adjusted std also covers samples in which the genome is not detected (a gene that is
            # detected although the genome is not may well not be taxon-specific)
            total_loss += adjusted_stds[gene_id] if label == 'TS' else beta
        return total_loss

    def get_number_of_detections_for_gene(self, detection_of_genes, gene_id,
                                          samples):
        """Add up the detection values of `gene_id` over the given `samples`.

        Returns 0 when `samples` is empty.
        """
        total = 0
        for sample_id in samples:
            total += detection_of_genes[gene_id][sample_id]
        return total

    def get_core_accessory_info(self, detection_of_genes, gene_id, eta):
        """Classify a single gene as 'core' or 'accessory'.

        Only `self.positive_samples` are taken into account.

        Returns:
            'None' (the string) when the gene has no detections at all,
            'accessory' when it is detected in fewer than `eta` times the
            number of positive samples, and 'core' otherwise.
        """
        if detection_of_genes[gene_id]['number_of_detections'] == 0:
            return 'None'

        detections_in_positives = self.get_number_of_detections_for_gene(
            detection_of_genes, gene_id, self.positive_samples)
        required_detections = eta * len(self.positive_samples)

        if detections_in_positives < required_detections:
            return 'accessory'
        return 'core'

    def get_gene_class(self, taxon_specificity, core_or_accessory):
        """Merge the two per-gene labels into the final gene class.

        Args:
            taxon_specificity: 'TS', 'TNS' or None.
            core_or_accessory: 'core', 'accessory' or the string 'None'.

        Returns:
            'None' (the string) when either label is undefined, otherwise one
            of 'TSC', 'TSA', 'TNC' or 'TNA'.

        An unrecognised label is reported on stdout and terminates the
        program through `exit(1)`; the taxon-specificity label is validated
        first.
        """
        if taxon_specificity is None or core_or_accessory == 'None':
            return 'None'

        gene_classes = {
            ('TS', 'core'): 'TSC',
            ('TS', 'accessory'): 'TSA',
            ('TNS', 'core'): 'TNC',
            ('TNS', 'accessory'): 'TNA',
        }

        if taxon_specificity not in ('TS', 'TNS'):
            print('%s is not valid. Value should be \'TS\' or \'TNS\'' %
                  taxon_specificity)
            exit(1)
        elif core_or_accessory not in ('core', 'accessory'):
            print(
                '%s is not valid. Value should be \'core\' or \'accessory\''
                % core_or_accessory)
            exit(1)
        else:
            return gene_classes[(taxon_specificity, core_or_accessory)]

    def report_gene_class_information(self):
        """Report how many genes fall in each class and how many samples are positive.

        Everything is sent to `self.run.info`: one line per gene class, then
        the number of samples whose 'detection' entry equals True.
        """
        def count_matching(entries, field, value):
            """Number of items in `entries` whose `field` equals `value`."""
            return sum(1 for entry_id in entries
                       if entries[entry_id][field] == value)

        for gene_class in ('TSC', 'TSA', 'TNC', 'TNA', 'None'):
            number_of_genes = count_matching(self.gene_class_information,
                                             'gene_class', gene_class)
            self.run.info('Num class %s' % gene_class, number_of_genes)

        # TODO: report the number of negative samples and the number of NA samples
        number_of_positive_samples = count_matching(self.samples_information,
                                                    'detection', True)
        self.run.info('Num samples in which the genome is detected',
                      number_of_positive_samples,
                      mc='green')

    def get_gene_classes(self):
        """Classify every gene in `self.gene_coverages` and store the result.

        Nothing is returned. The per-gene results are written to
        `self.gene_class_information` (one dictionary per gene id), and
        `self.positive_samples`, `self.negative_samples`, `self.adjusted_stds`
        and `self.adjusted_mean` are updated along the way.

        The classification starts from the assumption that all genes are
        taxon-specific core ('TSC') genes and all samples are positive, and is
        then recomputed using only the genes that came out as 'TSC'.
        """
        TSC_genes = set(self.gene_coverages.keys())
        converged = False
        loss = None
        self.gene_class_information = {}
        # Initializing all the samples to be positive
        self.positive_samples = self.samples
        self.negative_samples = {}
        # placeholders; both are overwritten by get_adjusted_stds() inside the loop
        self.adjusted_stds = 0
        self.adjusted_mean = dict.fromkeys(TSC_genes, 1)

        # NOTE: `converged` is set as soon as a previous loss value exists, so this loop always runs exactly
        # two passes: one with all genes, and one with the genes classified as TSC in the first pass.
        while not converged:
            # mean of coverage of all TS genes in each sample
            mean_coverage_of_TS_in_samples = self.get_mean_coverage_in_samples(
                TSC_genes)

            # Get the standard deviation of the taxon-specific genes in a sample
            # TODO: right now, single copy, and multi-copy genes would be treated identically. Hence, multi-copy genes
            # would skew both the mean and the std of the taxon-specific genes.
            std_of_TS_in_samples = self.get_std_in_samples(TSC_genes)
            detection_of_genes = self.get_detection_of_genes(
                mean_coverage_of_TS_in_samples, std_of_TS_in_samples)
            # presumably refreshes self.samples_information, which is read right below -- TODO confirm
            self.get_samples_information(detection_of_genes, self.alpha,
                                         TSC_genes)
            # positive samples: 'detection' is truthy
            self.positive_samples = {
                sample_id
                for sample_id in self.samples
                if self.samples_information[sample_id]['detection']
            }
            # negative samples: 'detection' equals False (a None value makes the sample neither positive nor negative)
            self.negative_samples = {
                sample_id
                for sample_id in self.samples
                if self.samples_information[sample_id]['detection'] == False
            }
            self.adjusted_stds, self.adjusted_mean = self.get_adjusted_stds(
                mean_coverage_of_TS_in_samples, detection_of_genes)
            coverage_consistency = self.get_coverage_consistency(
                self.adjusted_stds, detection_of_genes, self.beta)
            gene_specificity = self.get_gene_specificity(detection_of_genes)
            taxon_specificity = self.get_taxon_specificity(
                coverage_consistency, gene_specificity)
            new_loss = self.get_loss_function_value(taxon_specificity,
                                                    self.adjusted_stds,
                                                    self.beta)
            # NOTE(review): epsilon is computed but not used anywhere in this method
            epsilon = 2 * self.beta

            # stop after the pass that follows the first computed loss value
            if loss is not None:
                converged = True
            loss = new_loss

            self.run.warning('current value of loss function: %s ' % loss)

            for gene_id in self.gene_coverages:
                # setup a dict for gene id:
                g = {}

                g['gene_specificity'] = gene_specificity[gene_id]
                g['gene_coverage_consistency'] = coverage_consistency[gene_id]
                g['number_of_detections'] = detection_of_genes[gene_id][
                    'number_of_detections']
                g['core_or_accessory'] = self.get_core_accessory_info(
                    detection_of_genes, gene_id, self.eta)
                g['gene_class'] = self.get_gene_class(
                    taxon_specificity[gene_id], g['core_or_accessory'])
                g['adjusted_stds'] = self.adjusted_stds[gene_id]
                g['adjusted_mean'] = self.adjusted_mean[gene_id]

                # counting the number of positive samples that contain the gene
                g['detection_in_positive_samples'] = len([
                    sample_id for sample_id in self.positive_samples
                    if detection_of_genes[gene_id][sample_id]
                ])

                # Getting the portion of positive samples that contain the gene
                # (the conditional also avoids dividing by zero when there are no positive samples)
                g['portion_detected'] = g[
                    'detection_in_positive_samples'] / len(
                        self.positive_samples
                    ) if g['detection_in_positive_samples'] else 0

                self.gene_class_information[gene_id] = g

            # the next pass (and the final call below) only considers the genes classified as TSC
            TSC_genes = {
                gene_id
                for gene_id in self.gene_class_information
                if self.gene_class_information[gene_id]['gene_class'] == 'TSC'
            }

            self.report_gene_class_information()

        # one last update of the samples information, based on the final set of TSC genes
        self.get_samples_information(detection_of_genes,
                                     self.alpha,
                                     genes_to_consider=TSC_genes)

    def get_specificity_from_class_id(self, class_id):
        """Translate a numeric class id into its taxon-specificity label.

        Args:
            class_id: an integer, or any value `int()` can convert.

        Returns:
            'None' for class 0, 'TS' for classes 1-3 and 'TNS' for classes 4-5.

        Raises:
            ConfigError: when `class_id` cannot be converted to an integer or
                is not one of the known class ids.
        """
        try:
            class_id = int(class_id)
        except (TypeError, ValueError, OverflowError):
            raise ConfigError(
                "Classes must be of type integer. You sent this: %s" %
                (class_id, ))

        classes = {0: 'None', 1: 'TS', 2: 'TS', 3: 'TS', 4: 'TNS', 5: 'TNS'}

        try:
            # a dictionary lookup; the dictionary itself is not callable
            return classes[class_id]
        except KeyError:
            # the keys are integers and must be converted before joining
            raise ConfigError(
                "The class id '%d' is not a valid one. Try one of these: '%s'"
                % (class_id, ', '.join(str(known_id) for known_id in classes)))

    def save_gene_class_information_in_additional_layers(
            self, additional_description=''):
        """Write the gene class information to a TAB-delimited layers file.

        When `self.additional_layers_to_append` is set, the gene class
        information is appended to the content of that file and the columns of
        that file are added after the gene class columns.

        The output is named
        '<output_file_prefix>[-<additional_description>]-additional-layers.txt'.
        """
        layers_to_append = self.additional_layers_to_append
        if layers_to_append:
            extra_column_titles = utils.get_columns_of_TAB_delim_file(
                layers_to_append)
            layers_dict = utils.get_TAB_delimited_file_as_dictionary(
                layers_to_append,
                dict_to_append=self.gene_class_information,
                assign_none_for_missing=True,
                column_mapping=[int] + [str] * len(extra_column_titles))
        else:
            extra_column_titles = []
            layers_dict = self.gene_class_information

        if additional_description:
            additional_description = '-' + additional_description

        output_path = ''.join([
            self.output_file_prefix + additional_description,
            '-additional-layers.txt'
        ])
        gene_class_columns = [
            'gene_callers_id', 'gene_class', 'number_of_detections',
            'portion_detected', 'gene_specificity',
            'gene_coverage_consistency', 'core_or_accessory', 'adjusted_mean',
            'adjusted_stds'
        ]

        utils.store_dict_as_TAB_delimited_file(
            layers_dict,
            output_path,
            headers=gene_class_columns + extra_column_titles)

    def save_samples_information(self, additional_description=''):
        """Write the samples information to a TAB-delimited file.

        When `self.samples_information_to_append` is set, the samples
        information is appended to the content of that file and the header is
        taken from it; otherwise the header is taken from the keys of the
        first entry of `self.samples_information`.

        The output is named
        '<output_file_prefix>[-<additional_description>]-samples-information.txt'.
        """
        information_to_append = self.samples_information_to_append
        if information_to_append:
            column_titles = utils.get_columns_of_TAB_delim_file(
                information_to_append)
            column_mapping = [str] * (len(column_titles) + 2)
            self.run.warning(self.samples_information)
            output_dict = utils.get_TAB_delimited_file_as_dictionary(
                information_to_append,
                dict_to_append=self.samples_information,
                assign_none_for_missing=True,
                column_mapping=column_mapping)
        else:
            first_sample = next(iter(self.samples_information))
            column_titles = list(self.samples_information[first_sample])
            output_dict = self.samples_information

        if additional_description:
            additional_description = '-' + additional_description

        output_path = ''.join([
            self.output_file_prefix + additional_description,
            '-samples-information.txt'
        ])
        utils.store_dict_as_TAB_delimited_file(output_dict,
                                               output_path,
                                               headers=['samples'] +
                                               column_titles)

    def save_gene_detection_and_coverage(self, additional_description=''):
        """Write the gene coverages and gene detections to TAB-delimited files.

        The files are named '<prefix>-gene-coverages.txt' and
        '<prefix>-gene-detections.txt', where <prefix> is
        `self.output_file_prefix`, followed by '-<additional_description>'
        when a description is given.
        """
        prefix = self.output_file_prefix
        if additional_description:
            prefix = prefix + '-' + additional_description

        coverages_path = prefix + '-gene-coverages.txt'
        detections_path = prefix + '-gene-detections.txt'

        store = utils.store_dict_as_TAB_delimited_file
        store(self.gene_coverages, coverages_path)
        store(self.gene_detection, detections_path)

    def get_coverage_and_detection_dict(self, bin_id):
        """Load the gene coverages, gene detections and samples of one bin.

        The data comes from a `summarizer.Bin` built on `self.summary`. The
        samples are read from the first gene in the coverage dictionary.
        """
        bin_summary = summarizer.Bin(self.summary, bin_id)
        self.gene_coverages = bin_summary.gene_coverages
        self.gene_detection = bin_summary.gene_detection

        coverages_of_first_gene = next(iter(self.gene_coverages.values()))
        self.samples = set(coverages_of_first_gene.keys())

    def classify(self):
        """Run the classification and write the output files.

        Without a collection, the classification runs once on the data that
        is already loaded. With a collection, it runs once per bin of
        interest and the bin id is added to the output file names. The bins
        of interest are, in order of precedence: the names listed in
        `self.bin_ids_file_path` (one per line), the single `self.bin_id`, or
        all the bins in the collection.

        Raises:
            ConfigError: when a requested bin is not in the collection.
        """
        if not self.collection_name:
            # No collection provided so running on the entire detection table
            self.get_gene_classes()
            self.save_gene_class_information_in_additional_layers()
            self.save_samples_information()
            if self.store_gene_detections_and_gene_coverages_tables:
                self.save_gene_detection_and_coverage()
            return

        bin_names_in_collection = self.summary.bin_ids
        if self.bin_ids_file_path:
            filesnpaths.is_file_exists(self.bin_ids_file_path)
            # the context manager makes sure the file is closed once it is read
            with open(self.bin_ids_file_path) as bin_ids_file:
                bin_names_of_interest = [line.strip() for line in bin_ids_file]

            missing_bins = [
                b for b in bin_names_of_interest
                if b not in bin_names_in_collection
            ]
            if missing_bins:
                raise ConfigError(
                    "Some bin names you declared do not appear to be in the collection %s. \
                                        These are the bins that are missing: %s, these are the bins that are \
                                        actually in your collection: %s" %
                    (self.collection_name, missing_bins,
                     bin_names_in_collection))
        elif self.bin_id:
            if self.bin_id not in bin_names_in_collection:
                raise ConfigError("The bin you declared, %s, does not appear to be in the collection %s." \
                                  % (self.bin_id, self.collection_name))
            bin_names_of_interest = [self.bin_id]
        else:
            bin_names_of_interest = bin_names_in_collection

        for bin_id in bin_names_of_interest:
            self.run.info_single('Classifying genes in bin: %s' % bin_id)
            self.get_coverage_and_detection_dict(bin_id)
            self.get_gene_classes()
            self.save_gene_class_information_in_additional_layers(bin_id)
            self.save_samples_information(bin_id)
            if self.store_gene_detections_and_gene_coverages_tables:
                self.save_gene_detection_and_coverage(bin_id)
Ejemplo n.º 8
0
class MetagenomeCentricGeneClassifier:
    def __init__(self, args, run=run, progress=progress):
        """Read the arguments, validate them and load the profile database.

        Args:
            args: an argparse-like namespace; attributes that are missing are
                treated as None.
            run: the object used for reporting messages.
            progress: the object used for reporting progress.

        When a profile database is given without a collection name, the gene
        coverages, gene detections and per-nucleotide coverage values are
        loaded right away. With a collection name only the summary and the
        samples are initiated here.

        Raises:
            ConfigError: when the file of samples to include or to exclude
                yields no sample names (`sanity_check` raises it for other
                invalid arguments).
        """
        self.run = run
        self.progress = progress

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.exclude_samples = A('exclude_samples')
        self.include_samples = A('include_samples')
        self.profile_db = {}
        self.coverage_values_per_nt = {}
        # NOTE(review): `pd.DataFrame.empty` is the class-level property object, not an empty dataframe. It is
        # kept as-is because it only serves as a placeholder until the real dataframes are assigned.
        self.gene_coverages = pd.DataFrame.empty
        self.gene_detections = pd.DataFrame.empty
        self.samples = {}
        self.sample_detection_information_was_initiated = False
        self.positive_samples = []
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame.empty
        self.number_of_negative_samples = None
        self.gene_class_df = pd.DataFrame.empty
        self.samples_detection_information = pd.DataFrame.empty
        self.gene_presence_absence_in_samples_initiated = False
        self.gene_presence_absence_in_samples = pd.DataFrame.empty
        self.gene_coverages_filtered = pd.DataFrame.empty
        self.additional_description = ''
        self.total_length = None
        self.samples_coverage_stats_dicts_was_initiated = False
        self.samples_coverage_stats_dicts = pd.DataFrame.empty
        self.non_outlier_indices = {}
        self.gene_coverage_stats_dict_of_dfs_initiated = False
        self.gene_coverage_stats_dict_of_dfs = {}
        self.gene_coverage_consistency_dict = {}
        self.gene_coverage_consistency_dict_initiated = False

        def read_sample_names(file_path):
            """Return the set of names in the first TAB-delimited column of `file_path`."""
            # check that there is a file like this
            filesnpaths.is_file_exists(file_path)
            # plain 'r' mode: the 'U' flag is rejected by Python 3.11+, and text mode already handles all
            # newline conventions. The context manager closes the file once it is read.
            with open(file_path, 'r') as sample_names_file:
                return set(
                    line.split('\t')[0].strip() for line in sample_names_file)

        if self.exclude_samples:
            self.samples_to_exclude = read_sample_names(self.exclude_samples)

            if not self.samples_to_exclude:
                raise ConfigError(
                    "You asked to exclude samples, but provided an empty list."
                )

            self.run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set([])

        if self.include_samples:
            self.samples_to_include = read_sample_names(self.include_samples)

            if not self.samples_to_include:
                raise ConfigError(
                    "You provided an empty list of samples to include.")

            self.run.info(
                'Including Samples',
                'The following samples will be included: %s' %
                self.samples_to_include,
            )
        else:
            self.samples_to_include = set([])

        # run sanity check on all input arguments
        self.sanity_check()

        if self.profile_db_path is None:
            # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
            pass
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
                self.init_samples(self.summary.p_meta['samples'])
            else:
                self.profile_db = ProfileSuperclass(args)
                self.init_samples(self.profile_db.p_meta['samples'])
                self.profile_db.init_split_coverage_values_per_nt_dict()
                self.profile_db.init_gene_level_coverage_stats_dicts(
                    outliers_threshold=2.5, populate_nt_level_coverage=True)
                self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                    self.profile_db.split_coverage_values_per_nt_dict,
                    self.samples)

                # comply with the new design and get the gene_coverages and gene_detection dicts from
                # gene_level_coverage_stats_dict.
                gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts(
                )

                self.init_coverage_and_detection_dataframes(
                    gene_coverages, gene_detection)

                # getting the total length of all contigs
                self.total_length = self.profile_db.p_meta['total_length']

    def get_gene_coverages_and_gene_detection_dicts(self):
        """Extract the mean coverage and the detection of every gene in every sample.

        The values are read from
        `self.profile_db.gene_level_coverage_stats_dict`, for the samples
        listed in `self.profile_db.p_meta['samples']`.

        Returns:
            Two dictionaries of the form {gene_callers_id: {sample_name:
            value}}: the 'mean_coverage' values first, the 'detection' values
            second. Both are empty when there are no gene level statistics.
        """
        stats = self.profile_db.gene_level_coverage_stats_dict

        gene_coverages, gene_detection = {}, {}
        for gene_callers_id in list(stats.keys()):
            stats_of_gene = stats[gene_callers_id]
            coverage_per_sample, detection_per_sample = {}, {}

            for sample_name in self.profile_db.p_meta['samples']:
                stats_in_sample = stats_of_gene[sample_name]
                coverage_per_sample[sample_name] = stats_in_sample[
                    'mean_coverage']
                detection_per_sample[sample_name] = stats_in_sample[
                    'detection']

            gene_coverages[gene_callers_id] = coverage_per_sample
            gene_detection[gene_callers_id] = detection_per_sample

        return gene_coverages, gene_detection

    def check_if_valid_portion_value(self, arg_name, arg_value):
        """Make sure `arg_value` is a non-zero portion, i.e. within (0, 1].

        Raises:
            ConfigError: when the value is zero or less, or greater than 1.
        """
        is_out_of_range = arg_value <= 0 or arg_value > 1
        if not is_out_of_range:
            return

        raise ConfigError(
            "%s value must be greater than zero and a max of 1, the value you supplied %s"
            % (arg_name, arg_value))

    def sanity_check(self):
        """Validate the combination of input arguments.

        Raises:
            ConfigError: when neither or both of the profile database and the
                gene coverage data file are given, when alpha is not a float
                within [0, 0.5), when a collection name is given without a
                profile database, or when samples are both included and
                excluded.

        The output file is also checked through
        `filesnpaths.is_output_file_writable`.
        """
        profile_db_path = self.profile_db_path
        coverages_data_file_path = self.gene_coverages_data_file_path

        if profile_db_path is None and coverages_data_file_path is None:
            raise ConfigError(
                "You must provide either a profile.db or a gene coverage self.gene_coverages_filtered data file"
            )

        if profile_db_path and coverages_data_file_path:
            raise ConfigError(
                "You provided both a profile database and a gene coverage self.gene_coverages_filtered data file, you \
            must provide only one or the other (hint: if you have a profile database, the use that"
            )

        # the output file must be writable and must not exist yet
        layers_file_path = self.output_file_prefix + '-additional-layers.txt'
        filesnpaths.is_output_file_writable(layers_file_path,
                                            ok_if_exists=False)

        # alpha has to be a float ...
        alpha = self.alpha
        if not isinstance(alpha, float):
            raise ConfigError("alpha value must be a type float.")
        # ... with a minimum of 0 and smaller than 0.5
        if alpha < 0 or alpha >= 0.5:
            raise ConfigError(
                "alpha must be a minimum of 0 and smaller than 0.5")

        if self.collection_name and not self.profile_db_path:
            raise ConfigError(
                "You specified a collection name %s, but you provided a gene coverage self.gene_coverages_filtered data file \
                 collections are only available when working with a profile database."
                % self.collection_name)

        if self.exclude_samples and self.include_samples:
            raise ConfigError(
                "You cannot use both --include-samples and --exclude-samples! Please choose one."
            )

    def init_samples(self, samples_list):
        """Store in `self.samples` the set of samples the user wants to work with.

        Without samples to include, these are the samples of `samples_list`
        minus the excluded ones. With samples to include, these are exactly
        the samples to include.

        Raises:
            ConfigError: when a sample to include is not among the available
                samples.
        """
        available_samples = set(samples_list) - self.samples_to_exclude

        if not self.include_samples:
            self.samples = available_samples
            return

        unknown_samples = self.samples_to_include - available_samples
        if unknown_samples:
            raise ConfigError(
                "You requested to include some samples that are not in the profile database. Here are the samples in the profile database: %s. \
                                And here are the samples you requested, and that are not there: %s"
                % (available_samples, unknown_samples))

        self.samples = self.samples_to_include

    def init_coverage_and_detection_dataframes(self, gene_coverages_dict,
                                               gene_detection_dict):
        """Populate `self.gene_coverages`, `self.Ng` and `self.gene_detections`.

        Both dictionaries are expected in the form {gene: {sample: value}}
        and are turned into float dataframes with one row per gene.
        `self.Ng` is the number of genes in the coverage dataframe. When the
        user asked to include or exclude samples, only the columns of
        `self.samples` are kept.
        """
        def as_float_dataframe(values_per_gene):
            """One row per gene, one column per sample."""
            return pd.DataFrame.from_dict(values_per_gene,
                                          orient='index',
                                          dtype=float)

        self.gene_coverages = as_float_dataframe(gene_coverages_dict)
        self.Ng = self.gene_coverages.shape[0]
        self.gene_detections = as_float_dataframe(gene_detection_dict)

        if not (self.include_samples or self.exclude_samples):
            return

        # Only keep the samples that the user wants
        wanted_samples = list(self.samples)
        self.gene_coverages = self.gene_coverages[wanted_samples]
        self.gene_detections = self.gene_detections[wanted_samples]

    def init_sample_detection_information(self):
        """Determine the positive and negative samples from the genome detection.

        The detection of the genome in a sample is the number of nucleotide
        positions with non-zero coverage divided by `self.total_length`. The
        presence of the genome is obtained by handing that detection and
        `self.alpha` to `get_presence_absence_information`.

        Populates `self.positive_samples`, `self.number_of_positive_samples`,
        `self.negative_samples` and `self.samples_detection_information` (a
        dataframe indexed by sample), and marks the sample detection
        information as initiated.
        """
        # 'samples' labels the index; the remaining names are the columns of the dataframe
        MCG_samples_information_table_structure = [
            'samples', 'presence', 'detection',
            'number_of_taxon_specific_core_detected'
        ]

        # create an empty dataframe
        samples_information = pd.DataFrame(
            index=self.samples,
            columns=MCG_samples_information_table_structure[1:])
        positive_samples = []
        negative_samples = []

        self.progress.new("Setting presence/absence in samples")
        self.progress.update('...')
        num_samples = len(self.samples)
        for counter, sample in enumerate(self.samples, start=1):
            if num_samples > 100 and counter % 100 == 0:
                self.progress.update('%d of %d samples...' %
                                     (counter, num_samples))

            coverage_values = self.coverage_values_per_nt[sample]
            number_of_covered_positions = np.count_nonzero(coverage_values)

            # FIXME: after testing this module, delete these lines. they are only here to make sure that anvio is
            # not lying to us.
            print("total length for %s is %s" % (sample, self.total_length))
            print("the length of the vector: %s" % len(coverage_values))
            print(
                "number of nucleotide positions with non zero coverage in %s is %s "
                % (sample, number_of_covered_positions))

            detection = number_of_covered_positions / self.total_length
            presence = get_presence_absence_information(detection, self.alpha)

            # .loc writes into the dataframe itself; a chained `df[column][row] = value` assignment may only
            # update a temporary copy
            samples_information.loc[sample, 'presence'] = presence
            samples_information.loc[sample, 'detection'] = detection

            if presence:
                positive_samples.append(sample)
            elif presence == False:
                # a presence that is neither truthy nor equal to False leaves the sample in neither list
                negative_samples.append(sample)
        self.progress.end()

        self.positive_samples = positive_samples
        self.number_of_positive_samples = len(self.positive_samples)
        self.negative_samples = negative_samples
        self.samples_detection_information = samples_information
        self.run.warning('The number of positive samples is %s' %
                         self.number_of_positive_samples)
        self.run.warning('The number of negative samples is %s' %
                         len(self.negative_samples))
        self.sample_detection_information_was_initiated = True

    def init_samples_coverage_stats_dict(self):
        """Populate the coverage statistics of the positive samples.

        For every positive sample, `get_non_outliers_information` is given the
        per-nucleotide coverage values of the sample; its first result is
        stored in `self.non_outlier_indices[sample]` and its second in the
        sample's row of `self.samples_coverage_stats_dicts`. The sample
        detection information is initiated first when that was not yet done.
        """
        if not self.sample_detection_information_was_initiated:
            self.init_sample_detection_information()

        self.samples_coverage_stats_dicts = pd.DataFrame(
            index=self.samples,
            columns=columns_for_samples_coverage_stats_dict)

        # only the positive samples are processed, so they define the total for the progress report
        num_samples = len(self.positive_samples)
        self.progress.new(
            "Finding nucleotide positions in samples with outlier coverage values"
        )
        self.progress.update('...')
        # enumerate keeps the counter in step with the loop, so the progress report actually advances
        for counter, sample in enumerate(self.positive_samples, start=1):
            if num_samples > 100 and counter % 100 == 0:
                self.progress.update('%d of %d samples...' %
                                     (counter, num_samples))

            # get the non-outlier information
            (self.non_outlier_indices[sample],
             self.samples_coverage_stats_dicts.loc[sample, ]
             ) = get_non_outliers_information(
                 self.coverage_values_per_nt[sample])

            self.run.info_single(
                'The mean and std of non-outliers in sample %s are: %s, %s respectively'
                %
                (sample,
                 self.samples_coverage_stats_dicts['non_outlier_mean_coverage']
                 [sample],
                 self.samples_coverage_stats_dicts['non_outlier_coverage_std']
                 [sample]))
            number_of_non_outliers = len(self.non_outlier_indices[sample])
            self.run.info_single(
                'The number of non-outliers is %s of %s (%.2f%%)' %
                (number_of_non_outliers, self.total_length,
                 100.0 * number_of_non_outliers / self.total_length))
        self.progress.end()
        # NOTE(review): self.samples_coverage_stats_dicts_was_initiated is not updated here, so callers that
        # check that flag will run this method again -- confirm whether that is intended.

    def plot_TS(self):
        """Save per-sample nucleotide coverage plots to PDF files.

        One PDF is written per positive sample into the
        ``<output_file_prefix>-TS-plots/`` directory (created if needed).
        Each PDF holds two pages:

        1. the sorted per-nucleotide coverage values on a logarithmic y axis,
           all values in red with the non-outliers drawn on top in blue;
        2. a histogram of the non-outlier coverage values, annotated with
           their mean and standard deviation.
        """
        if not self.samples_coverage_stats_dicts_was_initiated:
            self.init_samples_coverage_stats_dict()

        # When set (classify() sets it to the bin id), the description becomes
        # part of every file name.
        additional_description = ''
        if self.additional_description:
            additional_description = '-' + self.additional_description

        plot_dir = self.output_file_prefix + '-TS-plots' + '/'
        os.makedirs(plot_dir, exist_ok=True)
        self.progress.new(
            'Saving figures of taxon specific distributions to pdf')
        self.progress.update('...')
        for number_of_finished, sample in enumerate(self.positive_samples,
                                                    start=1):
            coverages_pdf_output = (plot_dir + sample +
                                    additional_description + '-coverages.pdf')
            v = self.coverage_values_per_nt[sample]
            non_outlier_indices = self.non_outlier_indices[sample]
            non_outlier_values = v[non_outlier_indices]
            non_outlier_mean = self.samples_coverage_stats_dicts[
                'non_outlier_mean_coverage'][sample]
            non_outlier_std = self.samples_coverage_stats_dicts[
                'non_outlier_coverage_std'][sample]

            # Using argsort so we can use the non-outlier indices
            sorting_indices = np.argsort(v)
            # The inverse permutation gives the position of every nucleotide
            # on the sorted x axis; it is needed to place the non-outliers.
            reverse_sorted_indices = np.zeros(len(sorting_indices))
            reverse_sorted_indices[sorting_indices] = range(
                len(reverse_sorted_indices))

            # The context manager closes the PDF even if plotting fails.
            with PdfPages(coverages_pdf_output) as pdf_output_file:
                # Page 1: the ordered coverage values (per nucleotide).
                fig = plt.figure()
                ax = fig.add_subplot(111, rasterized=True)
                ax.set_xlabel('Nucleotide Number (ordered)')
                ax.set_ylabel(r'$Nucleotide Coverage^2$')
                # FIXME: this shouldn't be in the loop (only here because I
                # need to fix the mock data)
                x1 = range(len(v))
                x2 = reverse_sorted_indices[non_outlier_indices]
                # plot all in red
                ax.semilogy(x1, v[sorting_indices], 'r.', rasterized=True)
                # plot on top the non-outliers in blue
                ax.semilogy(x2, non_outlier_values, 'b.', rasterized=True)
                fig.suptitle("%s - sorted coverage values with outliers" %
                             sample)
                fig.savefig(pdf_output_file, format='pdf')
                plt.close(fig)

                # Page 2: a histogram of the non-outliers. This would allow
                # to see if they resemble a normal distribution.
                hist_range = (min(non_outlier_values),
                              max(non_outlier_values))
                # The width of a bin is ~1/4 of the standard deviation.
                # FIXME: need to make it so the bins are only of integers (so
                # the smallest bin is of width 1 and that bins are integers)
                spread = hist_range[1] - hist_range[0]
                if non_outlier_std > 0 and spread > 0:
                    number_of_hist_bins = int(
                        np.ceil(spread / (non_outlier_std / 4)))
                else:
                    # All values identical (or std undefined): a single bin
                    # avoids a nan/zero bin count.
                    number_of_hist_bins = 1
                fig = plt.figure()
                ax = fig.add_subplot(111, rasterized=True)
                ax.set_xlabel('Coverage')
                ax.hist(non_outlier_values,
                        number_of_hist_bins,
                        hist_range,
                        rasterized=True)
                fig.suptitle("%s - histogram of non-outliers" % sample)
                # adding the mean and std of the non-outliers as text
                text_for_hist = '$\\mu = %d$\n $\\sigma = %d$' % (
                    non_outlier_mean, non_outlier_std)
                ax.text(0.8,
                        0.9,
                        text_for_hist,
                        ha='center',
                        va='center',
                        transform=ax.transAxes)
                fig.savefig(pdf_output_file, format='pdf')
                plt.close(fig)

            self.progress.update(
                "Finished %d of %d" %
                (number_of_finished, self.number_of_positive_samples))
        self.progress.end()

    def init_gene_presence_absence_in_samples(self):
        """Build the gene-by-sample presence/absence table.

        Fills ``self.gene_presence_absence_in_samples`` (rows: gene caller
        ids, columns: samples) with the value returned by
        ``get_presence_absence_information`` for the gene's 'detection' value
        in that sample and ``self.alpha``, then sets
        ``self.gene_presence_absence_in_samples_initiated`` to True.
        """
        coverage_stats = self.profile_db.gene_level_coverage_stats_dict
        gene_callers_id = list(coverage_stats.keys())
        self.gene_presence_absence_in_samples = pd.DataFrame(
            index=gene_callers_id, columns=self.samples)

        num_samples = len(self.samples)
        self.progress.new('Computing gene presence/absence in samples')
        self.progress.update('...')
        # enumerate() keeps the counter in step with the loop so the periodic
        # progress message is actually reached.
        for counter, sample in enumerate(self.samples, start=1):
            if num_samples > 100 and counter % 100 == 0:
                self.progress.update('%d of %d samples...' %
                                     (counter, num_samples))
            for gene_id in gene_callers_id:
                self.gene_presence_absence_in_samples.loc[
                    gene_id, sample] = get_presence_absence_information(
                        coverage_stats[gene_id][sample]['detection'],
                        self.alpha)
        self.gene_presence_absence_in_samples_initiated = True
        self.progress.end()

    def init_gene_coverage_stats_dict_of_dfs(self):
        """Convert gene_level_coverage_stats_dict to a dict of pandas dataframes.

        Each gene's entry of ``self.profile_db.gene_level_coverage_stats_dict``
        is turned into a dataframe (outer keys become the index) and stored in
        ``self.gene_coverage_stats_dict_of_dfs`` under the gene id.

        The reason to do this is that this way the gene parameters across
        samples can be used as numpy arrays. For example this allows using the
        gene non-outlier mean coverage across samples as an array in order to
        perform regression (see init_gene_coverage_consistency_information
        for an example of usage).
        """
        coverage_stats_per_gene = self.profile_db.gene_level_coverage_stats_dict
        num_genes = len(coverage_stats_per_gene)
        self.progress.new(
            "Initializing gene coverage stats dictionary of dataframes")
        self.progress.update('...')
        # enumerate() keeps the counter in step with the loop so the periodic
        # progress message is actually reached.
        for counter, (gene_id, coverage_stats) in enumerate(
                coverage_stats_per_gene.items(), start=1):
            if num_genes > 100 and counter % 100 == 0:
                self.progress.update('%d of %d genes...' %
                                     (counter, num_genes))
            self.gene_coverage_stats_dict_of_dfs[
                gene_id] = pd.DataFrame.from_dict(coverage_stats,
                                                  orient='index')
        self.gene_coverage_stats_dict_of_dfs_initiated = True
        self.progress.end()

    def init_gene_coverage_consistency_information(self):
        """Perform orthogonal distance regression for each gene to determine coverage consistency.

        The question that we are trying to ask is:
            Does the non-outlier nt coverage of the gene in samples correlate
            with the non-outlier nt coverage of the genome in samples?

        For each gene, the regression (a line through the origin,
        ``y = slope * x``) uses only the samples in which the gene is marked
        present in ``self.gene_presence_absence_in_samples``; genes present
        in fewer than two samples get no entry.

        Results are stored in ``self.gene_coverage_consistency_dict[gene_id]``
        under the keys 'slope', 'slope_std', 'slope_precision' (slope_std
        divided by slope), 'R_squered' and 'converged'.
        """
        if not self.samples_coverage_stats_dicts_was_initiated:
            self.init_samples_coverage_stats_dict()

        if not self.gene_presence_absence_in_samples_initiated:
            self.init_gene_presence_absence_in_samples()

        if not self.gene_coverage_stats_dict_of_dfs_initiated:
            self.init_gene_coverage_stats_dict_of_dfs()

        def _model(B, c):
            # single-parameter linear model without an intercept
            return B[0] * c

        self.progress.new("Computing coverage consistency for all genes.")
        self.progress.update('...')
        gene_ids = list(self.profile_db.gene_level_coverage_stats_dict.keys())
        num_genes = len(gene_ids)
        # enumerate() keeps the counter in step with the loop so the periodic
        # progress message is actually reached.
        for counter, gene_id in enumerate(gene_ids, start=1):
            if num_genes > 100 and counter % 100 == 0:
                self.progress.update('%d of %d genes...' %
                                     (counter, num_genes))

            # samples in which the gene is present; the explicit comparison
            # with True keeps any non-boolean markers out of the selection
            presence = self.gene_presence_absence_in_samples.loc[gene_id]
            _samples = presence[presence == True].index  # noqa: E712
            if len(_samples) <= 1:
                # a regression needs at least two points
                continue

            # mean and std of non-outlier nt in each sample
            x = self.samples_coverage_stats_dicts.loc[
                _samples, 'non_outlier_mean_coverage']
            std_x = self.samples_coverage_stats_dicts.loc[
                _samples, 'non_outlier_coverage_std']
            # mean and std of non-outlier nt in the gene (in each sample)
            gene_stats = self.gene_coverage_stats_dict_of_dfs[gene_id]
            y = gene_stats.loc[_samples, 'non_outlier_mean_coverage']
            std_y = gene_stats.loc[_samples, 'non_outlier_coverage_std']

            # performing the regression using ODR
            _data = odr.RealData(list(x.values), list(y.values),
                                 list(std_x.values), list(std_y.values))
            _odr = odr.ODR(_data, odr.Model(_model), beta0=[3])
            odr_output = _odr.run()
            slope = odr_output.beta[0]
            slope_std = odr_output.sd_beta[0]

            # R squared: 1 - (residual sum of squares / total sum of squares)
            residual_ss = np.sum((slope * x.values - y.values)**2)
            total_ss = np.sum((y.values - np.mean(y.values))**2)
            R_squered = 1 - residual_ss / total_ss

            self.gene_coverage_consistency_dict[gene_id] = {
                'slope': slope,
                'slope_std': slope_std,
                'slope_precision': slope_std / slope,
                'R_squered': R_squered,
                # only this stop reason is treated as convergence
                'converged':
                odr_output.stopreason[0] == 'Sum of squares convergence',
            }

        self.gene_coverage_consistency_dict_initiated = True
        self.progress.end()

    def get_gene_specificity(self, gene_id):
        """Tell whether a gene is specific to the positive samples.

        Returns True when ``self.gene_class_df`` records more than one
        occurrence of the gene in positive samples and none in negative
        samples, False otherwise.

        Ambiguous occurrences are not counted as anything. This means that a
        gene occurring ambiguously in a negative sample can still be counted
        as "specific", and a gene occurring only ambiguously in positive
        samples is considered "non-specific".
        """
        # NOTE(review): the threshold is "> 1", so a gene seen in exactly one
        # positive sample is reported as non-specific -- confirm this is
        # intended.
        # TODO: if there are no occurences of the gene at all, then we should maybe return None instead of False
        positive_hits = self.gene_class_df.loc[
            gene_id, 'occurence_in_positive_samples']
        if not positive_hits > 1:
            return False

        negative_hits = self.gene_class_df.loc[
            gene_id, 'occurence_in_negative_samples']
        return bool(negative_hits == 0)

    def get_gene_coverage_consistency(self, gene_id):
        """Tell whether the gene's coverage is consistent across positive samples.

        Returns:
            None if the gene does not occur in any positive sample (there is
            no classification); True if it occurs in exactly one positive
            sample; otherwise True when the regression stored in
            ``self.gene_coverage_consistency_dict`` converged and its slope
            precision is below 0.5, and False when it did not converge or the
            precision is not below the threshold.
        """
        # TODO: make sure coverage_consistency_dict has been initiated
        occurence_in_positive_samples = self.gene_class_df.loc[
            gene_id, 'occurence_in_positive_samples']
        if occurence_in_positive_samples == 0:
            # if the gene doesn't occur in positive samples then there is no
            # classification
            return None
        if occurence_in_positive_samples == 1:
            # if the gene occurs only in one positive sample then return True.
            # XXX: we might prefer to return None, we should consider this in
            # the future.
            return True

        consistency = self.gene_coverage_consistency_dict[gene_id]
        if not consistency['converged']:
            # The regression didn't converge so the coverage is probably not
            # consistent.
            return False

        # FIXME: this is where we use an arbitrary threshold again :-(
        # slope_precision is slope_std / slope, so a SMALLER value means a
        # more accurate fit: the coverage is consistent when it is below the
        # threshold.
        # NOTE(review): a negative slope also yields a value below the
        # threshold -- confirm whether abs() is wanted here.
        return consistency['slope_precision'] < 0.5

    def determine_if_gene_is_core(self, gene_id, gene_specificity):
        """Return True for a core gene, False for an accessory gene.

        A gene that is specific to positive samples is core when its number
        of occurrences in positive samples equals the number of positive
        samples. Any other gene is core when its number of occurrences in
        positive and negative samples equals the number of positive plus
        negative samples.
        Ambiguous occurrences of a gene are not considered (i.e. they are the
        same as absence).
        """
        number_of_positive = len(self.positive_samples)

        if gene_specificity:
            # the gene has to occur in all positive samples
            column = 'occurence_in_positive_samples'
            required_occurences = number_of_positive
        else:
            # the gene has to occur in all positive AND all negative samples
            column = 'occurence_in_positive_and_negative_samples'
            required_occurences = number_of_positive + len(
                self.negative_samples)

        return self.gene_class_df.loc[gene_id, column] == required_occurences

    def init_gene_class_df(self):
        """Build ``self.gene_class_df`` with the class information per gene.

        For every gene the dataframe gets the number of occurrences in
        positive samples, in negative samples and in both, followed by the
        'specificity', 'coverage_consistency', 'core' and 'MCG_class' values.

        This table could later be used to produce an additional-layer text
        file for visualization.
        """
        # TODO: make sure gene presence absence was calculated
        if not self.gene_coverage_consistency_dict_initiated:
            self.init_gene_coverage_consistency_information()

        positive_column = 'occurence_in_positive_samples'
        negative_column = 'occurence_in_negative_samples'
        combined_column = 'occurence_in_positive_and_negative_samples'

        # XXX: only negative and positive samples are used here; ambiguous
        # samples are ignored as if they were never there. This is not ideal,
        # but is easy to do.
        gene_ids = self.profile_db.gene_level_coverage_stats_dict.keys()
        self.gene_class_df = pd.DataFrame(index=list(gene_ids))
        for gene_id in gene_ids:
            # number of occurrences in positive, then in negative samples
            for column, sample_group in ((positive_column,
                                          self.positive_samples),
                                         (negative_column,
                                          self.negative_samples)):
                self.gene_class_df.loc[gene_id, column] = np.sum(
                    self.gene_presence_absence_in_samples.loc[gene_id,
                                                              sample_group])

            # the combined count is read back from the two cells just written
            positive_count = self.gene_class_df.loc[gene_id, positive_column]
            negative_count = self.gene_class_df.loc[gene_id, negative_column]
            self.gene_class_df.loc[
                gene_id, combined_column] = positive_count + negative_count

            # The helpers below read the occurrence columns filled in above.
            specificity = self.get_gene_specificity(gene_id)
            consistency = self.get_gene_coverage_consistency(gene_id)
            # determine core accessory
            is_core = self.determine_if_gene_is_core(gene_id, specificity)

            self.gene_class_df.loc[gene_id, 'specificity'] = specificity
            self.gene_class_df.loc[gene_id,
                                   'coverage_consistency'] = consistency
            self.gene_class_df.loc[gene_id, 'core'] = is_core
            self.gene_class_df.loc[gene_id, 'MCG_class'] = get_class_string(
                specificity, consistency, is_core)

    def get_gene_classes(self):
        """Main process of this class: compute the class information for each gene.

        Runs, in order: the per-sample nucleotide-level coverage plots, the
        per-gene coverage consistency plots, and the creation of
        ``gene_class_df``.
        """
        pipeline = (
            self.plot_TS,  # plots of nucleotide-level coverage per sample
            self.gen_gene_consistency_plots,  # consistency plots per gene
            self.init_gene_class_df,  # create the gene_class_df
        )
        for step in pipeline:
            step()

    def gen_gene_consistency_plots(self):
        """Generate and save the gene consistency plot of every gene.

        Makes sure the coverage consistency information exists, then creates
        an ``MCGPlots`` object per gene and calls its ``plot`` method.
        """
        if not self.gene_coverage_consistency_dict_initiated:
            self.init_gene_coverage_consistency_information()

        gene_ids = list(self.profile_db.gene_level_coverage_stats_dict.keys())
        num_genes = len(gene_ids)
        # The instance's own run/progress objects are used throughout, so a
        # caller-supplied reporter is honoured here as well.
        self.progress.new('Plotting gene consistency information')
        self.progress.update('...')
        # enumerate() keeps the counter in step with the loop so the periodic
        # progress message is actually reached.
        for counter, gene_id in enumerate(gene_ids, start=1):
            if num_genes > 100 and counter % 100 == 0:
                self.progress.update('%d of %d genes...' %
                                     (counter, num_genes))
            p = MCGPlots(self,
                         gene_id,
                         run=self.run,
                         progress=self.progress)
            p.plot()

        self.progress.end()

    def get_coverage_and_detection_dict(self, bin_id):
        """Load the coverage and detection data of one bin into this object.

        Sets ``self.coverage_values_per_nt`` and ``self.total_length`` from
        the bin's summary and passes the bin's gene coverages and gene
        detection on to ``init_coverage_and_detection_dataframes``.
        """
        bin_summary = summarizer.Bin(self.summary, bin_id)

        per_nt_coverages = get_coverage_values_per_nucleotide(
            bin_summary.split_coverage_values_per_nt_dict, self.samples)
        self.coverage_values_per_nt = per_nt_coverages

        # the bin's total length (per the original note: of all contigs)
        self.total_length = bin_summary.total_length

        gene_coverages = bin_summary.gene_coverages
        gene_detection = bin_summary.gene_detection
        self.init_coverage_and_detection_dataframes(gene_coverages,
                                                    gene_detection)

    def classify(self):
        """Run the gene classification.

        Without a collection name the classification runs once on the whole
        table. With a collection it runs once per bin of interest: the bins
        listed in the bin ids file (one name per line, blank lines ignored),
        else the single declared bin id, else every bin in the collection.

        Raises:
            ConfigError: if a requested bin is not part of the collection.
        """
        if not self.collection_name:
            # No collection provided so running on the entire detection table
            self.get_gene_classes()
            return

        bin_names_in_collection = self.summary.bin_ids
        if self.bin_ids_file_path:
            filesnpaths.is_file_exists(self.bin_ids_file_path)
            # The context manager closes the file; blank lines are skipped so
            # that they are not reported as missing bins.
            with open(self.bin_ids_file_path) as bin_ids_file:
                bin_names_of_interest = [
                    line.strip() for line in bin_ids_file if line.strip()
                ]

            missing_bins = [
                b for b in bin_names_of_interest
                if b not in bin_names_in_collection
            ]
            if missing_bins:
                raise ConfigError(
                    "Some bin names you declared do not appear to be in the "
                    "collection %s. These are the bins that are missing: %s, "
                    "these are the bins that are actually in your "
                    "collection: %s" % (self.collection_name, missing_bins,
                                        bin_names_in_collection))
        elif self.bin_id:
            if self.bin_id not in bin_names_in_collection:
                raise ConfigError(
                    "The bin you declared, %s, does not appear to be in the collection %s."
                    % (self.bin_id, self.collection_name))
            bin_names_of_interest = [self.bin_id]
        else:
            bin_names_of_interest = bin_names_in_collection

        for bin_id in bin_names_of_interest:
            self.run.info_single('Classifying genes in bin: %s' % bin_id)
            self.get_coverage_and_detection_dict(bin_id)
            # used by the plotting code to tag the output file names
            self.additional_description = bin_id
            self.get_gene_classes()
            #self.save_gene_class_information_in_additional_layers(bin_id)
            #self.save_samples_information(bin_id)
Ejemplo n.º 9
0
    def __init__(self, args, run=run, progress=progress):
        """Collect the classifier settings from ``args`` and load the input data.

        Args:
            args: namespace-like object; every setting is looked up in
                ``args.__dict__`` and stored as None when it is missing.
            run: object used for run messages (its ``info`` method is called
                here).
            progress: progress reporter stored on the instance.

        After the attributes are set, ``sanity_check`` is called and the gene
        coverage/detection tables are loaded either from the text file(s) or
        from the profile database. When a collection name is given, only the
        profile summary is initialized here.
        """
        self.run = run
        self.progress = progress

        # Settings that were not supplied are stored as None.
        A = args.__dict__.get
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.beta = A('beta')
        self.gamma = A('gamma')
        self.eta = A('eta')
        self.zeta = A('zeta')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = A(
            'store_gene_detections_and_gene_coverages_tables')
        self.exclude_samples = A('exclude_samples')

        # Empty placeholders until the real tables are computed.
        # (``pd.DataFrame.empty`` is a property object, not an empty frame.)
        self.gene_coverages = pd.DataFrame()
        self.gene_detections = pd.DataFrame()
        self.samples = {}
        self.positive_samples = pd.DataFrame()
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame()
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame()
        self.samples_information = pd.DataFrame()
        self.profile_db = {}
        self.gene_presence_absence_in_samples = pd.DataFrame()
        self.gene_coverages_filtered = pd.DataFrame()

        # The exclusion file lists one sample per line; only the first
        # TAB-separated field is used.
        if self.exclude_samples:
            filesnpaths.is_file_exists(self.exclude_samples)
            # Plain text mode already gives universal newlines; the 'U' flag
            # is rejected by Python 3.11+.
            with open(self.exclude_samples) as exclude_file:
                sample_names = [
                    line.split('\t')[0].strip() for line in exclude_file
                ]
            # blank lines would otherwise yield an empty sample name
            self.samples_to_exclude = {name for name in sample_names if name}
            self.run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        self.sanity_check()
        if self.profile_db_path is None:
            self.get_data_from_txt_file()
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
            else:
                excluded = list(self.samples_to_exclude)
                self.profile_db = ProfileSuperclass(args)
                self.profile_db.init_gene_coverages_and_detection_dicts()
                self.gene_coverages = pd.DataFrame.from_dict(
                    self.profile_db.gene_coverages_dict,
                    orient='index',
                    dtype=float)
                self.gene_coverages.drop(excluded, axis=1, inplace=True)
                self.Ng = len(self.gene_coverages.index)
                self.gene_detections = pd.DataFrame.from_dict(
                    self.profile_db.gene_detection_dict,
                    orient='index',
                    dtype=float)
                self.gene_detections.drop(excluded, axis=1, inplace=True)
                self.samples = set(self.gene_coverages.columns)
Ejemplo n.º 10
0
class AlonsClassifier:
    def __init__(self, args, run=run, progress=progress):
        """Collect the classifier settings from ``args`` and load the input data.

        Args:
            args: namespace-like object; every setting is looked up in
                ``args.__dict__`` and stored as None when it is missing.
            run: object used for run messages (its ``info`` method is called
                here).
            progress: progress reporter stored on the instance.

        After the attributes are set, ``sanity_check`` is called and the gene
        coverage/detection tables are loaded either from the text file(s) or
        from the profile database. When a collection name is given, only the
        profile summary is initialized here.
        """
        self.run = run
        self.progress = progress

        # Settings that were not supplied are stored as None.
        A = args.__dict__.get
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.beta = A('beta')
        self.gamma = A('gamma')
        self.eta = A('eta')
        self.zeta = A('zeta')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = A(
            'store_gene_detections_and_gene_coverages_tables')
        self.exclude_samples = A('exclude_samples')

        # Empty placeholders until the real tables are computed.
        # (``pd.DataFrame.empty`` is a property object, not an empty frame.)
        self.gene_coverages = pd.DataFrame()
        self.gene_detections = pd.DataFrame()
        self.samples = {}
        self.positive_samples = pd.DataFrame()
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame()
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame()
        self.samples_information = pd.DataFrame()
        self.profile_db = {}
        self.gene_presence_absence_in_samples = pd.DataFrame()
        self.gene_coverages_filtered = pd.DataFrame()

        # The exclusion file lists one sample per line; only the first
        # TAB-separated field is used.
        if self.exclude_samples:
            filesnpaths.is_file_exists(self.exclude_samples)
            # Plain text mode already gives universal newlines; the 'U' flag
            # is rejected by Python 3.11+.
            with open(self.exclude_samples) as exclude_file:
                sample_names = [
                    line.split('\t')[0].strip() for line in exclude_file
                ]
            # blank lines would otherwise yield an empty sample name
            self.samples_to_exclude = {name for name in sample_names if name}
            self.run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        self.sanity_check()
        if self.profile_db_path is None:
            self.get_data_from_txt_file()
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
            else:
                excluded = list(self.samples_to_exclude)
                self.profile_db = ProfileSuperclass(args)
                self.profile_db.init_gene_coverages_and_detection_dicts()
                self.gene_coverages = pd.DataFrame.from_dict(
                    self.profile_db.gene_coverages_dict,
                    orient='index',
                    dtype=float)
                self.gene_coverages.drop(excluded, axis=1, inplace=True)
                self.Ng = len(self.gene_coverages.index)
                self.gene_detections = pd.DataFrame.from_dict(
                    self.profile_db.gene_detection_dict,
                    orient='index',
                    dtype=float)
                self.gene_detections.drop(excluded, axis=1, inplace=True)
                self.samples = set(self.gene_coverages.columns)

    def check_if_valid_portion_value(self, arg_name, arg_value):
        """Verify that an argument is a valid non-zero portion.

        A valid value is greater than zero and at most 1.

        Args:
            arg_name: name of the argument, used in the error message.
            arg_value: the value to check.

        Raises:
            ConfigError: if the value is not within that range.
        """
        is_too_small = arg_value <= 0
        if is_too_small or arg_value > 1:
            message = (
                "%s value must be greater than zero and a max of 1, the value you supplied %s"
                % (arg_name, arg_value))
            raise ConfigError(message)

    def sanity_check(self):
        """Basic sanity check for class inputs.

        Raises:
            ConfigError: if neither or both of the profile database and the
                gene coverage data file were given; if no output file prefix
                was given; if alpha, beta or gamma is not a float; if alpha,
                beta, gamma or eta is not greater than zero and at most 1; if
                beta is greater than alpha; if eta is missing; or if a
                collection name was given without a profile database.
        """
        if self.profile_db_path is None and self.gene_coverages_data_file_path is None:
            raise ConfigError(
                "You must provide either a profile.db or a gene coverage "
                "data file")

        if self.profile_db_path and self.gene_coverages_data_file_path:
            raise ConfigError(
                "You provided both a profile database and a gene coverage "
                "data file, you must provide only one or the other (hint: if "
                "you have a profile database, then use that)")

        # checking output file
        if self.output_file_prefix is None:
            # without this the concatenation below fails with a TypeError
            raise ConfigError("You must provide an output file prefix.")
        filesnpaths.is_output_file_writable(self.output_file_prefix +
                                            '-additional-layers.txt',
                                            ok_if_exists=False)
        # checking alpha
        if not isinstance(self.alpha, float):
            raise ConfigError("alpha value must be a type float.")
        self.check_if_valid_portion_value("alpha", self.alpha)

        # Checking beta
        if not isinstance(self.beta, float):
            raise ConfigError("beta value must be a type float.")
        self.check_if_valid_portion_value("beta", self.beta)
        if self.beta > self.alpha:
            raise ConfigError(
                "beta value must be smaller than alpha value. The beta value "
                "you specified is %s while the alpha value is %s" %
                (self.beta, self.alpha))

        # Checking gamma
        if not isinstance(self.gamma, float):
            raise ConfigError("Gamma value must be a type float.")
        self.check_if_valid_portion_value("gamma", self.gamma)

        # Checking eta (its type is not restricted, but it must be present
        # for the range comparison to work)
        if self.eta is None:
            raise ConfigError("eta value must be provided.")
        self.check_if_valid_portion_value("eta", self.eta)

        if self.collection_name and not self.profile_db_path:
            raise ConfigError(
                "You specified a collection name %s, but you provided a gene "
                "coverage data file. Collections are only available when "
                "working with a profile database." % self.collection_name)

    def get_data_from_txt_file(self):
        """Read the gene coverages (and optionally detections) from TAB-delimited files.

        The first column of each file is used as the index (gene ids) and the
        first row as the header (sample ids). Samples listed in
        ``self.samples_to_exclude`` are dropped from both tables.

        Populates ``self.gene_coverages``, ``self.Ng`` (number of genes) and
        ``self.samples``; when a gene detection file was supplied, also
        ``self.gene_detections``.

        Raises:
            ConfigError: if a gene or a sample of the coverage table is
                missing from the detection table.
        """
        excluded = list(self.samples_to_exclude)

        self.gene_coverages = pd.read_table(self.gene_coverages_data_file_path,
                                            sep='\t',
                                            header=0,
                                            index_col=0)
        self.gene_coverages.drop(excluded, axis=1, inplace=True)
        self.Ng = len(self.gene_coverages.index)
        self.samples = set(self.gene_coverages.columns.values)

        # checking if a gene_detection file was also supplied
        if not self.gene_detections_data_file_path:
            return

        # The detection table has its own file; it must not be read from the
        # coverage file.
        self.gene_detections = pd.read_table(
            self.gene_detections_data_file_path,
            sep='\t',
            header=0,
            index_col=0)
        self.gene_detections.drop(excluded, axis=1, inplace=True)

        # making sure that the tables are compatible, notice we're only
        # checking if gene_detection contains everything that's in
        # gene_coverages (and not vice versa)
        gene_ids = self.gene_coverages.index
        missing_genes = gene_ids[~gene_ids.isin(self.gene_detections.index)]
        if len(missing_genes):
            raise ConfigError(
                "Your tables are not compatible. For example gene_id %s is in %s, but not in %s"
                % (missing_genes[0], self.gene_coverages_data_file_path,
                   self.gene_detections_data_file_path))

        sample_ids = self.gene_coverages.columns
        missing_samples = sample_ids[~sample_ids.isin(
            self.gene_detections.columns)]
        if len(missing_samples):
            raise ConfigError(
                "Your tables are not compatible. For example sample_id %s is in %s, but not in %s"
                % (missing_samples[0], self.gene_coverages_data_file_path,
                   self.gene_detections_data_file_path))

    def set_gene_presence_absence_in_samples(self):
        """Determine the presence/absence of genes from the gene detection threshold.

        A gene counts as present in a sample when its detection value is
        greater than ``self.zeta``.

        The following attributes are populated:
            self.gene_presence_absence_in_samples - boolean table of presence
            self.gene_coverages_filtered - copy of the gene coverages in which
                the cells of absent genes are set to 0
        """
        is_present = self.gene_detections > self.zeta
        self.gene_presence_absence_in_samples = is_present

        # work on a copy so that the original coverages stay untouched
        filtered_coverages = self.gene_coverages.copy()
        self.gene_coverages_filtered = filtered_coverages
        filtered_coverages[~is_present] = 0

    def set_sample_detection_information(self):
        """Split the samples into positive, negative and ambiguous groups.

        For every sample the fraction of detected genes is computed as the
        number of genes flagged in self.gene_presence_absence_in_samples
        divided by self.Ng, and compared against two thresholds:
            positive samples: fraction greater than self.alpha
                (--genome-presence-threshold)
            negative samples: fraction smaller than self.beta
                (--genome-absence-threhold)
            ambiguous samples: all other samples

        Populates:
            self.positive_samples - set of the positive sample ids
            self.negative_samples - set of the negative sample ids
            self.samples_information - DataFrame indexed by sample id with
                the columns 'presence' (True / False / NaN for positive /
                negative / ambiguous samples), 'number_of_detected_genes'
                and 'number_of_taxon_specific_core_detected' (the latter
                is left empty here)
            self.number_of_positive_samples, self.number_of_negative_samples
        """
        samples_information_columns = [
            'presence', 'number_of_detected_genes',
            'number_of_taxon_specific_core_detected'
        ]
        # pandas refuses a set as an index; sorting also makes the row order
        # of the saved samples-information file reproducible
        samples_information = pd.DataFrame(
            index=sorted(self.samples), columns=samples_information_columns)

        # Compute the number of detected genes per sample
        number_of_genes_in_samples = self.gene_presence_absence_in_samples.sum(
            axis=0)
        # .loc is used instead of chained indexing, which may write to a
        # temporary copy and leave the table unchanged
        for sample in self.samples:
            samples_information.loc[
                sample,
                'number_of_detected_genes'] = number_of_genes_in_samples[sample]

        fraction_of_genes_detected = number_of_genes_in_samples / self.Ng
        self.positive_samples = set(number_of_genes_in_samples[
            fraction_of_genes_detected > self.alpha].index)
        # negative samples fall BELOW the absence threshold; subtracting the
        # positive ones keeps the groups disjoint even if beta > alpha
        self.negative_samples = set(number_of_genes_in_samples[
            fraction_of_genes_detected < self.beta].index) - self.positive_samples

        for sample in self.positive_samples:
            samples_information.loc[sample, 'presence'] = True
        for sample in self.negative_samples:
            samples_information.loc[sample, 'presence'] = False
        self.samples_information = samples_information

        self.number_of_positive_samples = len(self.positive_samples)
        self.number_of_negative_samples = len(self.negative_samples)
        self.run.warning('The number of positive samples is: %d ' %
                         self.number_of_positive_samples)
        self.run.warning('The number of negative samples is: %d ' %
                         self.number_of_negative_samples)
        self.run.warning('The number of ambiuous samples is: %d ' %
                         (len(self.samples - self.negative_samples -
                              self.positive_samples)))

    def get_taxon_specific_genes_in_samples(self, additional_description=''):
        """Find the genes whose coverage is a non-outlier in most positive samples.

        For every positive sample, the filtered coverages of that sample are
        passed to get_non_outliers() and the result is stored as one column
        of a per-gene/per-sample table. The coverages and the flags are also
        handed to plot_outliers(), which is given the PDF file
        '<output_file_prefix><additional_description>-coverages.pdf'.

        A gene is reported when the sum of its non-outlier flags across the
        positive samples is at least self.number_of_positive_samples *
        self.eta (--core-min-detection).

        NOTE(review): earlier notes on this method described a sliding-window
        search over the sorted coverages, assuming at least 25% of the genes
        are single copy and taxon specific. The code delegates the decision
        to get_non_outliers() (commented as an interquartile-range test) -
        confirm which of the two is intended.

        Parameters
        ----------
        additional_description : str
            Appended verbatim to self.output_file_prefix when naming the PDF.

        Returns
        -------
        pandas Index with the ids of the selected genes; empty when there
        are no positive samples.
        """
        coverages_pdf_output = self.output_file_prefix + additional_description + '-coverages.pdf'
        coverages_pdf_output_file = PdfPages(coverages_pdf_output)
        # one column per sample; the columns of non-positive samples stay NaN
        # and are therefore skipped by sum() below
        non_outliers = pd.DataFrame().reindex_like(self.gene_coverages)
        try:
            for sample in self.positive_samples:
                # a vector of the coverages for the sample
                coverages = self.gene_coverages_filtered[sample].copy()
                # set coverage to zero if not above gene detection threshold
                coverages[self.gene_detections[sample] < self.zeta] = 0
                non_outliers[sample] = get_non_outliers(coverages)
                plot_outliers(coverages_pdf_output_file, coverages,
                              non_outliers[sample], sample)
        finally:
            # close the PDF even if a sample fails, so the handle is not leaked
            coverages_pdf_output_file.close()

        if not self.positive_samples:
            # without positive samples the threshold below would be 0 and
            # every gene would pass it; there is no evidence for any gene
            return non_outliers.index[:0]

        # finding the genes that are non-outliers in the majority of samples
        # the required majority is defined by the user defined argument self.eta (--core-min-detection)
        required_number_of_samples = self.number_of_positive_samples * self.eta
        non_outliers_all = non_outliers[non_outliers.sum(
            axis=1) >= required_number_of_samples].index
        return non_outliers_all

    def get_gene_specificity(self, detection_of_genes):
        """Map every gene to a specificity flag based on where it was detected.

        For each gene id in detection_of_genes the result is:
            False - 'detected_in_non_positive_samples' is truthy
            None  - otherwise, when 'number_of_detections' equals 0
            True  - otherwise (the gene was detected, and not in any
                    non-positive sample)

        Returns a dictionary {gene_id: True / False / None}.
        """
        def specificity_of(gene_record):
            # detection outside of the positive samples takes precedence
            if gene_record['detected_in_non_positive_samples']:
                return False
            if gene_record['number_of_detections'] == 0:
                return None
            return True

        return {
            gene_id: specificity_of(detection_of_genes[gene_id])
            for gene_id in detection_of_genes
        }

    def get_coverage_consistency(self, adjusted_stds, detection_of_genes,
                                 beta):
        """Decide for each gene whether its coverage is consistent.

        For every gene id in adjusted_stds the result is:
            None  - the gene has at most one detection
                    ('number_of_detections' <= 1), or its adjusted
                    standard deviation equals -1
            True  - the adjusted standard deviation is smaller than beta
            False - otherwise

        Returns a dictionary {gene_id: True / False / None}.
        """
        def consistency_of(gene_id):
            if detection_of_genes[gene_id]['number_of_detections'] <= 1:
                return None
            adjusted_std = adjusted_stds[gene_id]
            if adjusted_std == -1:
                return None
            return True if adjusted_std < beta else False

        return {gene_id: consistency_of(gene_id) for gene_id in adjusted_stds}

    def get_taxon_specificity(self, coverage_consistency, gene_specificity):
        """Combine coverage consistency and gene specificity into one label.

        For every gene id in coverage_consistency the result is:
            None  - either of the two inputs is None for the gene
            'TS'  - both inputs are truthy
            'TNS' - otherwise

        Returns a dictionary {gene_id: 'TS' / 'TNS' / None}.
        """
        taxon_specificity = {}
        for gene_id in coverage_consistency:
            is_consistent = coverage_consistency[gene_id]
            if is_consistent is None:
                # gene_specificity is not consulted at all in this case
                taxon_specificity[gene_id] = None
                continue

            is_specific = gene_specificity[gene_id]
            if is_specific is None:
                label = None
            else:
                label = 'TS' if (is_consistent and is_specific) else 'TNS'
            taxon_specificity[gene_id] = label
        return taxon_specificity

    def get_number_of_detections_for_gene(self, detection_of_genes, gene_id,
                                          samples):
        """Add up the detection values of one gene over the given samples.

        Returns the sum of detection_of_genes[gene_id][sample_id] for every
        sample_id in samples (0 when samples is empty).
        """
        return sum(detection_of_genes[gene_id][sample_id]
                   for sample_id in samples)

    def get_core_accessory_info(self, detection_of_genes, gene_id, eta):
        """Classify one gene as 'core' or 'accessory' using the positive samples.

        Returns:
            'None'      - the gene has no detections at all
                          ('number_of_detections' equals 0)
            'accessory' - its detections within self.positive_samples add up
                          to less than eta * len(self.positive_samples)
            'core'      - otherwise
        """
        if detection_of_genes[gene_id]['number_of_detections'] == 0:
            return 'None'

        detections_in_positive_samples = self.get_number_of_detections_for_gene(
            detection_of_genes, gene_id, self.positive_samples)
        required_detections = eta * len(self.positive_samples)

        if detections_in_positive_samples < required_detections:
            return 'accessory'
        return 'core'

    def get_gene_class(self, taxon_specificity, core_or_accessory):
        """Translate taxon specificity and core/accessory status into a gene class.

        Parameters
        ----------
        taxon_specificity : 'TS', 'TNS' or None
        core_or_accessory : 'core', 'accessory' or the string 'None'

        Returns
        -------
        'None' when taxon_specificity is None or core_or_accessory is 'None',
        otherwise one of:
            'TSC' ('TS', 'core'),  'TSA' ('TS', 'accessory'),
            'TNC' ('TNS', 'core'), 'TNA' ('TNS', 'accessory')

        Raises
        ------
        ConfigError
            When either argument holds any other value. An invalid
            taxon_specificity is reported before an invalid
            core_or_accessory.
        """
        if taxon_specificity is None or core_or_accessory == 'None':
            return 'None'

        gene_classes = {
            ('TS', 'core'): 'TSC',
            ('TS', 'accessory'): 'TSA',
            ('TNS', 'core'): 'TNC',
            ('TNS', 'accessory'): 'TNA',
        }

        # raise like the rest of this class instead of print() + exit(), so
        # the caller decides how the error is reported
        if taxon_specificity not in ('TS', 'TNS'):
            raise ConfigError(
                '%s is not valid. Value should be \'TS\' or \'TNS\'' %
                taxon_specificity)
        if core_or_accessory not in ('core', 'accessory'):
            raise ConfigError(
                '%s is not valid. Value should be \'core\' or \'accessory\''
                % core_or_accessory)

        return gene_classes[(taxon_specificity, core_or_accessory)]

    def report_gene_class_information(self):
        """Report the number of genes per class and the number of positive samples.

        Sends to self.run.info() the number of genes in each of the classes
        'TSC', 'TSA', 'TNC', 'TNA' and 'None' (read from the 'gene_class'
        field of self.gene_class_information), followed by the number of
        samples in which the genome is detected.

        Both tables may be pandas DataFrames (as built by get_gene_classes()
        and set_sample_detection_information()) or plain dictionaries of
        dictionaries (the initial state of these attributes). Sample
        detection is read from the 'presence' column of a DataFrame, and
        from the 'detection' field of a dictionary.
        """
        def count(table, field, value):
            # iterating a DataFrame yields its column names, so the
            # dictionary-style lookup below cannot be used for DataFrames
            if isinstance(table, pd.DataFrame):
                return int((table[field] == value).sum())
            return len([key for key in table if table[key][field] == value])

        for gene_class in ['TSC', 'TSA', 'TNC', 'TNA', 'None']:
            self.run.info(
                'Num class %s' % gene_class,
                count(self.gene_class_information, 'gene_class', gene_class))

        if isinstance(self.samples_information, pd.DataFrame):
            detection_field = 'presence'
        else:
            detection_field = 'detection'

        # TODO: report the number of negative samples and the number of NA samples
        self.run.info('Num samples in which the genome is detected',
                      count(self.samples_information, detection_field, True),
                      mc='green')

    def get_gene_classes(self):
        """Classify every gene and determine in which samples the genome is detected.

        Runs, in this order:
            1. set_gene_presence_absence_in_samples() - gene presence/absence
               per sample according to the gene detection threshold
            2. set_sample_detection_information() - positive, negative and
               ambiguous samples
            3. get_taxon_specific_genes_in_samples() - ids of the genes that
               are non-outliers in most positive samples

        Populates self.gene_class_information, a DataFrame indexed like
        self.gene_coverages with a single 'gene_class' column: genes returned
        by step 3 get the class of ('TS', 'core'), all other genes get the
        class of (None, None).
        """
        # need to start a new gene_class_information table
        # this is necessary when the algorithm runs on a list of bins
        self.gene_class_information = pd.DataFrame(
            index=self.gene_coverages.index, columns=['gene_class'])
        # use a gene detection threshold to determine gene presence/absence in samples
        self.set_gene_presence_absence_in_samples()
        # use a gene presence threshold classify samples as positive, negative or ambiguous
        self.set_sample_detection_information()
        non_outliers_all = self.get_taxon_specific_genes_in_samples()

        # the two possible labels do not depend on the gene
        taxon_specific_core_class = self.get_gene_class('TS', 'core')
        unclassified_class = self.get_gene_class(None, None)

        # set the gene classes; .loc writes into the table itself, whereas
        # chained indexing may write to a temporary copy
        for gene_id in self.gene_coverages_filtered.index:
            if gene_id in non_outliers_all:
                gene_class = taxon_specific_core_class
            else:
                gene_class = unclassified_class
            self.gene_class_information.loc[gene_id, 'gene_class'] = gene_class

    def get_specificity_from_class_id(self, class_id):
        """Return the taxon specificity label that belongs to a class id.

        Parameters
        ----------
        class_id : int, or anything int() can convert (e.g. the string '3')

        Returns
        -------
        'None' for class 0, 'TS' for classes 1-3 and 'TNS' for classes 4-5.

        Raises
        ------
        ConfigError
            When class_id cannot be converted to an integer, or is not one
            of the known class ids.
        """
        try:
            class_id = int(class_id)
        except (TypeError, ValueError, OverflowError):
            raise ConfigError(
                "Classes must be of type integer. You sent this: %s" %
                (class_id, ))

        classes = {0: 'None', 1: 'TS', 2: 'TS', 3: 'TS', 4: 'TNS', 5: 'TNS'}

        if class_id not in classes:
            # the keys are integers and must be converted before joining
            raise ConfigError(
                "The class id '%d' is not a valid one. Try one of these: '%s'"
                % (class_id, ', '.join(str(known_id) for known_id in classes)))

        return classes[class_id]

    def save_gene_class_information_in_additional_layers(
            self, additional_description=''):
        """Write the gene classes to a TAB-delimited additional-layers file.

        When self.additional_layers_to_append is set, that file is read with
        its first column as the index and outer-joined with
        self.gene_class_information, so the output holds the user's layers
        followed by the gene classes. Otherwise only the gene classes are
        written.

        The output file is
        '<output_file_prefix>[-<additional_description>]-additional-layers.txt'
        and its index column is labelled 'gene_callers_id'.

        Raises
        ------
        ConfigError
            When the join fails with a ValueError, e.g. because the user's
            file already has a column named like one of the gene class
            columns.
        """
        if not self.additional_layers_to_append:
            additional_layers_df = self.gene_class_information
        else:
            # the first column has to become the index, otherwise the join
            # below cannot line the rows up with the gene ids
            user_layers_df = pd.read_table(self.additional_layers_to_append,
                                           sep='\t',
                                           header=0,
                                           index_col=0)
            try:
                # concatenating the gene_class_information with the user
                # provided additional layers; join() returns a new table
                additional_layers_df = user_layers_df.join(
                    self.gene_class_information, how='outer')
            except ValueError as e:
                raise ConfigError(
                    "Something went wrong. This is what we know: %s. This could be happening because \
                you have columns in your --additional-layers file with the following title: %s"
                    % (e, self.gene_class_information.columns.tolist())) from e

        if additional_description:
            additional_description = '-' + additional_description
        additional_layers_file_name = self.output_file_prefix + additional_description + '-additional-layers.txt'
        additional_layers_df.to_csv(additional_layers_file_name,
                                    sep='\t',
                                    index_label='gene_callers_id')

    def save_samples_information(self, additional_description=''):
        """Write the samples information to a TAB-delimited file.

        When self.samples_information_to_append is set, that file is read
        with its first column as the index and outer-joined with
        self.samples_information, so the output holds the user's columns
        followed by the ones computed here. Otherwise only
        self.samples_information is written.

        The output file is
        '<output_file_prefix>[-<additional_description>]-samples-information.txt'
        and its index column is labelled 'samples'.

        Raises
        ------
        ConfigError
            When the join fails with a ValueError, e.g. because the user's
            file already has a column named like one of the computed ones.
        """
        if not self.samples_information_to_append:
            samples_information_df = self.samples_information
        else:
            # the first column has to become the index, otherwise the join
            # below cannot line the rows up with the sample ids
            user_samples_information_df = pd.read_table(
                self.samples_information_to_append,
                sep='\t',
                header=0,
                index_col=0)
            try:
                # concatenating the samples_information with the user
                # provided file; join() returns a new table
                samples_information_df = user_samples_information_df.join(
                    self.samples_information, how='outer')
            except ValueError as e:
                raise ConfigError(
                    "Something went wrong. This is what we know: %s. This could be happening because "
                    "you have columns in your samples information file with the following title: %s"
                    % (e, self.samples_information.columns.tolist())) from e

        if additional_description:
            additional_description = '-' + additional_description

        samples_information_file_name = self.output_file_prefix + additional_description + '-samples-information.txt'
        samples_information_df.to_csv(samples_information_file_name,
                                      sep='\t',
                                      index_label='samples')

    def save_gene_detection_and_coverage(self, additional_description=''):
        """Write the gene coverage and gene detection tables as TAB-delimited files.

        The files are named '<prefix>-gene-coverages.txt' and
        '<prefix>-gene-detections.txt', where <prefix> is
        self.output_file_prefix, followed by '-<additional_description>'
        when a description is given. The index column of both files is
        labelled 'gene_callers_id'.
        """
        prefix = self.output_file_prefix
        if additional_description:
            prefix = prefix + '-' + additional_description

        # both tables are written with the same layout
        table_format = dict(sep='\t', index_label='gene_callers_id')
        self.gene_coverages.to_csv(prefix + '-gene-coverages.txt',
                                   **table_format)
        self.gene_detections.to_csv(prefix + '-gene-detections.txt',
                                    **table_format)

    def get_coverage_and_detection_dict(self, bin_id):
        """Load the gene coverages and gene detections of one bin.

        A summarizer.Bin is created for bin_id from self.summary, and its
        gene_coverages and gene_detection dictionaries are converted to
        float DataFrames in which every top-level key becomes a row
        (presumably gene ids as rows and samples as columns - verify against
        summarizer.Bin). The columns listed in self.samples_to_exclude are
        dropped from both tables.

        Populates:
            self.gene_coverages, self.gene_detections - the two tables
            self.Ng - number of rows of the coverage table
            self.samples - set of the column labels of the coverage table
        """
        _bin = summarizer.Bin(self.summary, bin_id)

        self.gene_coverages = pd.DataFrame.from_dict(_bin.gene_coverages,
                                                     orient='index',
                                                     dtype=float)
        self.gene_coverages.drop(self.samples_to_exclude, axis=1, inplace=True)
        self.Ng = len(self.gene_coverages.index)

        self.gene_detections = pd.DataFrame.from_dict(_bin.gene_detection,
                                                      orient='index',
                                                      dtype=float)
        self.gene_detections.drop(self.samples_to_exclude,
                                  axis=1,
                                  inplace=True)

        self.samples = set(self.gene_coverages.columns.values)

    def classify(self):
        """Run the gene classification and write the result files.

        Without a collection (self.collection_name is empty) the
        classification runs once on the tables already loaded, and the
        output files carry no description.

        With a collection, the classification runs once per bin, using the
        bin id as the description of the output files. The bins are, in
        order of precedence:
            1. the bin names listed (one per line, blank lines ignored) in
               self.bin_ids_file_path
            2. the single bin self.bin_id
            3. every bin in self.summary.bin_ids

        The gene detection and coverage tables are written as well when
        self.store_gene_detections_and_gene_coverages_tables is set.

        Raises
        ------
        ConfigError
            When a requested bin is not part of the collection.
        """
        if not self.collection_name:
            # No collection provided so running on the entire detection table
            self.get_gene_classes()
            self.save_gene_class_information_in_additional_layers()
            self.save_samples_information()
            if self.store_gene_detections_and_gene_coverages_tables:
                self.save_gene_detection_and_coverage()
            return

        bin_names_in_collection = self.summary.bin_ids
        if self.bin_ids_file_path:
            filesnpaths.is_file_exists(self.bin_ids_file_path)
            # the context manager closes the file; blank lines would
            # otherwise be reported as a missing bin with an empty name
            with open(self.bin_ids_file_path) as bin_ids_file:
                bin_names_of_interest = [
                    line.strip() for line in bin_ids_file if line.strip()
                ]

            missing_bins = [
                b for b in bin_names_of_interest
                if b not in bin_names_in_collection
            ]
            if missing_bins:
                raise ConfigError(
                    "Some bin names you declared do not appear to be in the collection %s. \
                                        These are the bins that are missing: %s, these are the bins that are \
                                        actually in your collection: %s" %
                    (self.collection_name, missing_bins,
                     bin_names_in_collection))
        elif self.bin_id:
            if self.bin_id not in bin_names_in_collection:
                raise ConfigError("The bin you declared, %s, does not appear to be in the collection %s." \
                                  % (self.bin_id, self.collection_name))
            bin_names_of_interest = [self.bin_id]
        else:
            bin_names_of_interest = bin_names_in_collection

        # NOTE(review): get_gene_classes() takes no description, so the
        # coverages PDF it triggers gets the same file name for every bin
        # and is overwritten on each iteration - confirm whether a per-bin
        # PDF is wanted.
        for bin_id in bin_names_of_interest:
            self.run.info_single('Classifying genes in bin: %s' % bin_id)
            self.get_coverage_and_detection_dict(bin_id)
            self.get_gene_classes()
            self.save_gene_class_information_in_additional_layers(bin_id)
            self.save_samples_information(bin_id)
            if self.store_gene_detections_and_gene_coverages_tables:
                self.save_gene_detection_and_coverage(bin_id)