Example #1
    def gen_view_data_tables_from_atomic_data(self):
        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]
        auxiliary_fields = [f for f in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(f)]

        # setting standard view table structure and types
        view_table_structure = ['contig'] + self.sample_ids_found_in_input_dbs + auxiliary_fields
        view_table_types = ['text'] + ['numeric'] * len(self.sample_ids_found_in_input_dbs) + ['text']

        # generate a dictionary for normalized coverage of each contig across samples per target
        self.normalized_coverages = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.normalized_coverages[target][split_name] = {}
                for input_profile_db_path in self.profile_dbs_info_dict:
                    self.normalized_coverages[target][split_name][input_profile_db_path] = self.get_normalized_coverage_of_split(target, input_profile_db_path, split_name)

        # generate a dictionary for max normalized ratio of each contig across samples per target
        self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.max_normalized_ratios[target][split_name] = self.get_max_normalized_ratio_of_split(target, split_name)

        self.progress.new('Generating view data tables')
        for target in ['contigs', 'splits']:
            for essential_field in essential_fields:
                self.progress.update('Processing %s for %s ...' % (essential_field, target))

                data_dict = {}
                for split_name in self.split_names:
                    data_dict[split_name] = {'__parent__': self.split_parents[split_name]}

                    for input_profile_db_path in self.profile_dbs_info_dict:
                        sample_id = self.profile_dbs_info_dict[input_profile_db_path]['sample_id']
                        if essential_field == 'normalized_coverage':
                            data_dict[split_name][sample_id] = self.normalized_coverages[target][split_name][input_profile_db_path]
                        elif essential_field == 'max_normalized_ratio':
                            data_dict[split_name][sample_id] = self.max_normalized_ratios[target][split_name][input_profile_db_path]
                        elif essential_field == 'relative_abundance':
                            data_dict[split_name][sample_id] = self.get_relative_abundance_of_split(target, input_profile_db_path, split_name)
                        else:
                            data_dict[split_name][sample_id] = self.atomic_data_for_each_run[target][input_profile_db_path][split_name][essential_field]

                # time to store the data for this view in the profile database
                table_name = '_'.join([essential_field, target])
                TablesForViews(self.merged_profile_db_path).create_new_view(
                                                data_dict=data_dict,
                                                table_name=table_name,
                                                table_structure=view_table_structure,
                                                table_types=view_table_types,
                                                view_name=essential_field if target == 'splits' else None)

        # if SNVs were not profiled, remove all entries from variability tables:
        if not self.SNVs_profiled:
            TablesForViews(self.merged_profile_db_path).remove(view_name='variability', table_names_to_blank=['variability_splits', 'variability_contigs'])

        self.progress.end()
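
The heavy lifting in this method is assembling data_dict: one entry per split, carrying the split's parent contig plus one numeric value per sample, in the column order declared by view_table_structure. A minimal sketch of that shape, with hypothetical sample names and values:

    # hypothetical sample names and values; the real dict is keyed by the
    # sample ids found in the input profile databases
    view_table_structure = ['contig', 'sample_01', 'sample_02', '__parent__']

    data_dict = {
        'contig_892_split_00001': {
            '__parent__': 'contig_892',  # auxiliary field, stored as text
            'sample_01': 4.21,           # one numeric value per sample
            'sample_02': 0.87,
        },
    }

    # each stored row is the split name followed by the remaining columns:
    for split_name in data_dict:
        row = tuple([split_name] + [data_dict[split_name][h] for h in view_table_structure[1:]])
        print(row)  # ('contig_892_split_00001', 4.21, 0.87, 'contig_892')

This is also how Example #5 below builds its db_entries by hand before inserting them with raw SQL.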
Example #2
    def populate_layers_additional_data_and_layer_orders(self):
        self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

        # initialize views.
        args = argparse.Namespace(profile_db = self.merged_profile_db_path)
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out layer orders dictionary
        layer_orders_data_dict = {}
        failed_attempts = []
        self.progress.new('Working on layer orders')
        for essential_field in essential_fields:
            self.progress.update('recovering order for "%s"' % (essential_field))
            try:
                data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                      distance=self.distance,
                                                                      linkage=self.linkage,
                                                                      transpose=True)

                layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
            except:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(layer_orders_data_dict):
            self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your\
                              samples based on the view data, however, it failed :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to store layers order information for those view items\
                              the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        self.progress.new('Working on layer additional data')
        self.progress.update('...')

        layer_additional_data_dict = {}
        for sample_name in self.sample_ids_found_in_input_dbs:
            layer_additional_data_dict[sample_name] = {}

        # figure out num reads mapped per sample:
        for sample_name in self.sample_ids_found_in_input_dbs:
            layer_additional_data_dict[sample_name]['num_mapped_reads'] = self.total_reads_mapped_per_sample[sample_name]

        self.progress.end()

        TableForLayerOrders(args).add(layer_orders_data_dict)
        TableForLayerAdditionalData(args).add(layer_additional_data_dict, ['num_mapped_reads'])
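
clustering.get_newick_tree_data_for_dict() is anvi'o's helper for hierarchically clustering a view and returning the result as a newick string; transpose=True asks it to cluster the samples (columns) rather than the items (rows). A self-contained sketch of the same idea with scipy; newick_for_samples is an illustration, not anvi'o's implementation, and it maps the distance/linkage parameters onto scipy's metric/method:

    import numpy as np
    from scipy.cluster.hierarchy import linkage, to_tree

    def newick_for_samples(view_dict, distance='euclidean', linkage_method='ward'):
        """Cluster the samples (columns) of an {item: {sample: value}} dict
        and return a newick string, roughly what transpose=True achieves."""
        items = sorted(view_dict)
        samples = sorted(view_dict[items[0]])
        # transpose the view: one row per sample, one column per item
        matrix = np.array([[view_dict[item][sample] for item in items] for sample in samples])
        root = to_tree(linkage(matrix, method=linkage_method, metric=distance))

        def to_newick(node):
            if node.is_leaf():
                return samples[node.id]
            return '(%s,%s):%.4f' % (to_newick(node.left), to_newick(node.right), node.dist)

        return to_newick(root) + ';'

    # hypothetical two-split, three-sample view:
    view_dict = {'split_001': {'s1': 1.0, 's2': 5.0, 's3': 1.1},
                 'split_002': {'s1': 2.0, 's2': 9.0, 's3': 2.2}}
    print(newick_for_samples(view_dict))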
Example #3
    def populate_misc_data_tables(self):
        self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

        # initialize views.
        args = argparse.Namespace(profile_db = self.merged_profile_db_path)
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out layer orders dictionary
        layer_orders_data_dict = {}
        failed_attempts = []
        self.progress.new('Working on layer orders')
        for essential_field in essential_fields:
            self.progress.update('recovering order for "%s"' % (essential_field))
            try:
                data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                      distance=self.distance,
                                                                      linkage=self.linkage,
                                                                      transpose=True)

                layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
            except:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(layer_orders_data_dict):
            self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your\
                              samples based on the view data, however, it failed :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to store layers order information for those view items\
                              the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # add the layer orders quietly
        TableForLayerOrders(args, r=terminal.Run(verbose=False)).add(layer_orders_data_dict)
        self.run.warning(None, header="Layer orders added", lc='cyan')
        for layer_order in layer_orders_data_dict:
            self.run.info_single(layer_order, mc='cyan')

        # done with layer orders. let's add our layer additional data and call it a day.
        for data_group_name in self.layer_additional_data_dict:
            args.target_data_group = data_group_name
            TableForLayerAdditionalData(args, r=terminal.Run(verbose=False)).add(self.layer_additional_data_dict[data_group_name],
                                                                                 list(self.layer_additional_data_keys[data_group_name]))

        self.run.warning(None, header="Data groups added", lc='cyan')
        for data_group in self.layer_additional_data_dict:
            self.run.info_single('%s (w/%d items)' % (data_group, len(self.layer_additional_data_keys[data_group])), mc='cyan')
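
The closing loop assumes layer additional data was already collected per data group into two parallel dicts: values keyed by group and then by sample, plus the list of data keys per group. A minimal sketch of those shapes, with a hypothetical 'default' group; the real dicts are populated elsewhere in the merger class:

    layer_additional_data_dict = {
        'default': {
            'sample_01': {'num_mapped_reads': 1820441},
            'sample_02': {'num_mapped_reads': 2041210},
        },
    }

    layer_additional_data_keys = {
        'default': ['num_mapped_reads'],
    }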
Example #4
    def gen_samples_db_for_the_merged_profile(self):
        """Geenrate a samples db for the merged profile.

           We use the ProfileSuperclass to load all the views we added into the meged profile,
           and generate clusterings of samples for each view to generate a default samples database."""

        self.run.info_single("SAMPLES.db stuff...",
                             nl_before=1,
                             nl_after=1,
                             mc="blue")

        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

        class Args:
            pass

        args = Args()
        args.profile_db = self.merged_profile_db_path

        # initialize views.
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out sample orders dictionary
        sample_orders = {}
        failed_attempts = []
        self.progress.new('Working on SAMPLES.db')
        for essential_field in essential_fields:
            self.progress.update('recovering samples order for "%s"' %
                                 (essential_field))
            try:
                sample_orders[essential_field] = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                                          distance=self.distance,
                                                                                          linkage=self.linkage,
                                                                                          transpose=True)
            except:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(sample_orders):
            self.run.warning("This may or may not be important: anvi'o attempted to generate a samples\
                              database for this merged profile, however, all attempts to cluster samples\
                              based on view data available in the merged profile failed. No samples db\
                              for you :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to generate a samples db with clusterings it generated\
                              using the view data that worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # generate the samples order file
        samples_order_file_path = filesnpaths.get_temp_file_path()
        samples_order_file = open(samples_order_file_path, 'w')
        samples_order_file.write('attributes\tbasic\tnewick\n')
        for sample_order in sample_orders:
            samples_order_file.write('%s\t%s\t%s\n' % (sample_order, '', sample_orders[sample_order]))
        samples_order_file.close()

        # figure out samples information stuff
        samples_information = {}
        headers = []
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name] = {}

        self.progress.new('Working on SAMPLES.db')
        self.progress.update('...')

        # figure out num reads mapped per sample:
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name]['num_mapped_reads'] = self.total_reads_mapped_per_sample[sample_name]

        self.progress.end()

        # generate the samples information file
        samples_information_file_path = filesnpaths.get_temp_file_path()
        utils.store_dict_as_TAB_delimited_file(samples_information, samples_information_file_path, headers=headers)

        # generate the samples database
        samples_db = dbops.SamplesInformationDatabase(self.samples_db_path, quiet=False)
        samples_db.create(samples_order_path=samples_order_file_path, samples_information_path=samples_information_file_path)

        os.remove(samples_order_file_path)
        os.remove(samples_information_file_path)

        self.run.info('Samples database', self.samples_db_path)
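
The samples order file written above is a three-column TAB-delimited table with the header attributes/basic/newick, an empty 'basic' column, and one newick string per view. A minimal standalone sketch of the same format, using a hypothetical orders dict and the standard-library tempfile module in place of filesnpaths.get_temp_file_path():

    import os
    import tempfile

    # hypothetical orders dict mirroring `sample_orders` above: view name -> newick
    sample_orders = {'mean_coverage': '((s1,s3),s2);'}

    fd, samples_order_file_path = tempfile.mkstemp(suffix='.txt')
    with os.fdopen(fd, 'w') as samples_order_file:
        samples_order_file.write('attributes\tbasic\tnewick\n')
        for sample_order in sample_orders:
            samples_order_file.write('%s\t%s\t%s\n' % (sample_order, '', sample_orders[sample_order]))

    # ... hand the path to the database, then clean up, as the method above does
    os.remove(samples_order_file_path)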
Example #5
    def gen_view_data_tables_from_atomic_data(self):
        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]
        auxiliary_fields = [f for f in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(f)]

        views_table = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__, progress=self.progress)

        # setting standard view table structure and types
        view_table_structure = ['contig'] + self.merged_sample_ids + auxiliary_fields
        view_table_types = ['text'] + ['numeric'] * len(self.merged_sample_ids) + ['text']

        # generate a dictionary for normalized coverage of each contig across samples per target
        self.normalized_coverages = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.normalized_coverages[target][split_name] = {}
                for sample_id in self.merged_sample_ids:
                    self.normalized_coverages[target][split_name][sample_id] = self.get_normalized_coverage_of_split(target, sample_id, split_name)

        # generate a dictionary for max normalized ratio of each contig across samples per target
        self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.max_normalized_ratios[target][split_name] = self.get_max_normalized_ratio_of_split(target, split_name)

        self.progress.new('Generating view data tables')
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        for target in ['contigs', 'splits']:
            for essential_field in essential_fields:
                self.progress.update('Processing %s for %s ...' % (essential_field, target))

                target_table = '_'.join([essential_field, target])

                m = {}
                for split_name in self.split_names:
                    m[split_name] = {'__parent__': self.split_parents[split_name]}

                    for sample_id in self.merged_sample_ids:
                        if essential_field == 'normalized_coverage':
                            m[split_name][sample_id] = self.normalized_coverages[target][split_name][sample_id]
                        elif essential_field == 'max_normalized_ratio':
                            m[split_name][sample_id] = self.max_normalized_ratios[target][split_name][sample_id]
                        elif essential_field == 'relative_abundance':
                            m[split_name][sample_id] = self.get_relative_abundance_of_split(target, sample_id, split_name)
                        else:
                            m[split_name][sample_id] = self.atomic_data_for_each_run[target][sample_id][split_name][essential_field]

                # variable 'm' for the essential field is now ready to be its own table:
                profile_db.db.create_table(target_table, view_table_structure, view_table_types)
                db_entries = [tuple([split_name] + [m[split_name][h] for h in view_table_structure[1:]]) for split_name in self.split_names]
                profile_db.db._exec_many('''INSERT INTO %s VALUES (%s)''' % (target_table, ','.join(['?'] * len(view_table_structure))), db_entries)

                if target == 'splits':
                    views_table.append(essential_field, target_table)

        profile_db.disconnect()
        self.progress.end()

        # store views in the database
        views_table.store()
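
Unlike Example #1, which goes through the TablesForViews abstraction, this older variant writes view tables with raw SQL: create_table followed by a bulk insert with '?' placeholders via _exec_many. A minimal sketch of that pattern with the standard-library sqlite3 module and hypothetical table and column names:

    import sqlite3

    # hypothetical structure/types mirroring view_table_structure above
    structure = ['contig', 'sample_01', 'sample_02', '__parent__']
    types = ['text', 'numeric', 'numeric', 'text']
    target_table = 'mean_coverage_splits'

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE %s (%s)' % (target_table,
                 ', '.join('%s %s' % (c, t) for c, t in zip(structure, types))))

    # one tuple per split, columns in the order declared by `structure`
    db_entries = [('split_001', 4.2, 0.8, 'contig_892'),
                  ('split_002', 1.1, 2.3, 'contig_892')]
    conn.executemany('INSERT INTO %s VALUES (%s)' % (target_table, ','.join(['?'] * len(structure))), db_entries)
    conn.commit()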