예제 #1
0
파일: merger.py 프로젝트: paczian/anvio
    def gen_view_data_tables_from_atomic_data(self):
        """Create one view table per essential field and target, then store the views.

        For every target ('contigs' and 'splits') and every essential field in
        ``self.atomic_data_fields``, a table named ``'<field>_<target>'`` is
        created in the profile database at ``self.profile_db_path``.  Each
        table holds one row per entry of ``self.split_names``: the split name,
        the field value for every id in ``self.merged_sample_ids``, and the
        auxiliary column(s).  Only the tables generated for the 'splits'
        target are appended to the views table.

        Side effects: (re)assigns ``self.normalized_coverages`` and
        ``self.max_normalized_ratios``, writes to the profile database, and
        reports through ``self.progress``.
        """
        targets = ['contigs', 'splits']

        essential_fields = [
            field for field in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(field)
        ]
        auxiliary_fields = [
            field for field in self.atomic_data_fields
            if constants.IS_AUXILIARY_FIELD(field)
        ]

        views_table = dbops.TableForViews(self.profile_db_path,
                                          anvio.__profile__version__,
                                          progress=self.progress)

        # Standard view table layout: the row name, one numeric column per
        # sample, then the auxiliary column(s) as text.  The type list is
        # derived from the same pieces as the column list so the two cannot
        # drift apart if the number of auxiliary fields is ever not exactly one.
        view_table_structure = (['contig'] + self.merged_sample_ids +
                                auxiliary_fields)
        view_table_types = (['text'] +
                            ['numeric'] * len(self.merged_sample_ids) +
                            ['text'] * len(auxiliary_fields))
        value_columns = view_table_structure[1:]

        # Normalized coverage of each split across samples, per target.  The
        # attribute is assigned before it is filled, and this pass completes
        # before the next one starts, exactly as in the original ordering.
        self.normalized_coverages = {'contigs': {}, 'splits': {}}
        for target in targets:
            for split_name in self.split_names:
                per_sample = {}
                self.normalized_coverages[target][split_name] = per_sample
                for sample_id in self.merged_sample_ids:
                    per_sample[sample_id] = \
                        self.get_normalized_coverage_of_split(
                            target, sample_id, split_name)

        # Max normalized ratio of each split across samples, per target.  The
        # helper's return value is indexed by sample id further below.
        self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
        for target in targets:
            for split_name in self.split_names:
                self.max_normalized_ratios[target][split_name] = \
                    self.get_max_normalized_ratio_of_split(target, split_name)

        self.progress.new('Generating view data tables')
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)

        # One '?' per column; identical for every table, so build it once.
        placeholders = ','.join(['?'] * len(view_table_structure))

        for target in targets:
            for essential_field in essential_fields:
                self.progress.update('Processing %s for %s ...' %
                                     (essential_field, target))

                target_table = '_'.join([essential_field, target])

                db_entries = []
                for split_name in self.split_names:
                    # NOTE: '__parent__' is the only auxiliary value a row
                    # carries; any other auxiliary field name would need its
                    # value added here before it can be looked up below.
                    row = {'__parent__': self.split_parents[split_name]}

                    for sample_id in self.merged_sample_ids:
                        if essential_field == 'normalized_coverage':
                            value = self.normalized_coverages[target][
                                split_name][sample_id]
                        elif essential_field == 'max_normalized_ratio':
                            value = self.max_normalized_ratios[target][
                                split_name][sample_id]
                        elif essential_field == 'relative_abundance':
                            value = self.get_relative_abundance_of_split(
                                target, sample_id, split_name)
                        else:
                            # Every other essential field is read straight
                            # from the per-run atomic data.
                            value = self.atomic_data_for_each_run[target][
                                sample_id][split_name][essential_field]
                        row[sample_id] = value

                    db_entries.append(
                        tuple([split_name] +
                              [row[column] for column in value_columns]))

                # The rows for this essential field are ready to become
                # their own table.
                profile_db.db.create_table(target_table, view_table_structure,
                                           view_table_types)
                profile_db.db._exec_many(
                    '''INSERT INTO %s VALUES (%s)''' %
                    (target_table, placeholders), db_entries)

                if target == 'splits':
                    views_table.append(essential_field, target_table)

        profile_db.disconnect()
        self.progress.end()

        # store views in the database
        views_table.store()
예제 #2
0
    def _run(self):
        """Profile a single sample and populate its profile database.

        Checks the arguments, sets the sample id, initializes directories
        and databases, and records the run parameters through
        ``self.run.info``.  The profile is then either built from the input
        file or loaded from a serialized profile.  After that the derived
        tables and the atomic data are stored, the 'single' view is
        registered, contigs are optionally clustered, and the run info is
        written before shutting down.
        """
        self.check_args()
        self.set_sample_id()
        self.init_dirs_and_dbs()

        # Record the run parameters, one entry at a time and in a fixed order.
        report = self.run.info
        report('anvio', anvio.__version__)
        report('profiler_version', anvio.__profile__version__)
        report('sample_id', self.sample_id)
        report('profile_db', self.profile_db_path, display_only=True)
        report('contigs_db', bool(self.contigs_db_path))
        report('contigs_db_hash', self.a_meta['contigs_db_hash'])
        report('cmd_line', utils.get_cmd_line())
        report('merged', False)
        report('split_length', self.a_meta['split_length'])
        report('min_contig_length', self.min_contig_length)
        report('min_mean_coverage', self.min_mean_coverage)
        report('clustering_performed', self.contigs_shall_be_clustered)
        report('min_coverage_for_variability',
               self.min_coverage_for_variability)
        report('skip_SNV_profiling', self.skip_SNV_profiling)
        report('skip_AA_frequencies', self.skip_AA_frequencies)
        report('report_variability_full', self.report_variability_full)
        report('gene_coverages_computed', self.a_meta['genes_are_called'])

        # The full-blown profiling path only runs when there is an input
        # file; otherwise we are dealing with an already summarized profile
        # and simply load it.
        if not self.input_file_path:
            self.init_serialized_profile()
        else:
            self.init_profile_from_BAM()
            self.profile()
            if self.gen_serialized_profile:
                self.store_profile()

        self.generate_variabile_positions_table()
        self.profile_AA_frequencies()
        self.generate_gene_coverages_table()
        self.store_split_coverages()

        # Persist the atomic data for contigs and splits.
        db_handle = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        self.atomic_contig_split_data.store_atomic_data_for_contigs_and_splits(
            self.sample_id,
            self.contigs,
            db_handle.db,
        )
        db_handle.disconnect()

        # A single PROFILE database has exactly one view, which is already
        # the default one; register it in the database.
        views = dbops.TableForViews(self.profile_db_path,
                                    anvio.__profile__version__)
        views.append('single', 'atomic_data_splits')
        views.store()

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        runinfo_path = self.generate_output_destination('RUNINFO.cp')
        report('runinfo', runinfo_path)
        self.run.store_info_dict(runinfo_path,
                                 strip_prefix=self.output_directory)

        self.bam.close()
        self.run.quit()