Beispiel #1
0
    def _run(self):
        """Run a single (non-merged) profile from start to finish.

        Validates arguments, sets the sample id, prepares the output
        directory and profile database, logs run metadata, profiles the
        input (from a BAM file, or from a previously serialized profile),
        stores the resulting tables in the profile database, optionally
        clusters contigs, and serializes the run info before quitting.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        # log every parameter of this run so the run info fully documents
        # how this profile database was generated:
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path, display_only = True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability', self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('skip_AA_frequencies', self.skip_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)
        self.run.info('gene_coverages_computed', self.a_meta['genes_are_called'])

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            if self.gen_serialized_profile:
                self.store_profile()
        else:
            self.init_serialized_profile()

        # generate and store the variability / gene coverage / split coverage
        # tables from the profiling results:
        self.generate_variabile_positions_table()
        self.profile_AA_frequencies()
        self.generate_gene_coverages_table()
        self.store_split_coverages()

        # here we store atomic data for contigs and splits into the database:
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        self.atomic_contig_split_data.store_atomic_data_for_contigs_and_splits(self.sample_id, self.contigs, profile_db.db)
        profile_db.disconnect()

        # the only view for the single PROFILE database is ready, and already
        # set as the default view. store the info in the db:
        views_table = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__)
        views_table.append('single', 'atomic_data_splits')
        views_table.store()

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        # serialize the run info dictionary next to the profile so downstream
        # programs can recover how this run was configured:
        runinfo_serialized = self.generate_output_destination('RUNINFO.cp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized, strip_prefix = self.output_directory)

        self.bam.close()
        self.run.quit()
Beispiel #2
0
    def merge(self):
        """Merge multiple single profile databases into one merged PROFILE.db.

        Sanity-checks the inputs, creates the output directory and the merged
        profile database, copies run-level metadata from the first input run,
        merges split coverages and (if profiled) SNV/SCV tables, generates the
        view data tables, clusters contigs when requested, and populates the
        misc data tables.
        """
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(
            self.output_directory,
            delete_if_exists=self.overwrite_output_destinations)

        self.run.log_file_path = os.path.join(self.output_directory,
                                              'RUNLOG.txt')

        # set database paths
        self.merged_profile_db_path = os.path.join(self.output_directory,
                                                   'PROFILE.db')
        self.database_paths['PROFILE.db'] = os.path.abspath(
            self.merged_profile_db_path)

        profile_db = dbops.ProfileDatabase(self.merged_profile_db_path)

        # every input run carries the same profiling parameters (checked in
        # sanity_check — TODO confirm), so read them from the first one:
        C = lambda x: list(self.profile_dbs_info_dict.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.max_contig_length = C('max_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.SCVs_profiled = C('SCVs_profiled')
        self.SNVs_profiled = C('SNVs_profiled')
        self.total_length = C('total_length')

        # too many splits make the hierarchical clustering intractable; skip
        # it unless the user explicitly enforces it:
        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            # fix: the warning said "Becasue" instead of "Because"
            self.run.warning("Because you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.num_splits))

        self.total_reads_mapped_per_sample = dict([
            (s, self.layer_additional_data_dict['default'][s]
             ['total_reads_mapped'])
            for s in self.layer_additional_data_dict['default']
        ])

        sample_ids_list = ', '.join(sorted(self.sample_ids_found_in_input_dbs))
        total_reads_mapped_list = ', '.join([
            str(self.total_reads_mapped_per_sample[sample_id])
            for sample_id in self.sample_ids_found_in_input_dbs
        ])

        # we run this now because we change default flags in this function
        # depending on the number of reads characterized within each single profile.
        self.set_normalization_multiplier()

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': sample_ids_list,
            'total_reads_mapped': total_reads_mapped_list,
            'merged': True,
            'blank': False,
            'items_ordered': False,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'max_contig_length': self.max_contig_length,
            'SNVs_profiled': self.SNVs_profiled,
            'SCVs_profiled': self.SCVs_profiled,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.contigs_db_hash,
            'description': self.description if self.description else '_No description is provided_',
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables(
        )

        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info(
            'description', 'Found (%d characters)' %
            len(self.description) if self.description else None)
        self.run.info('profile_db', self.merged_profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('num_runs_processed',
                      len(self.sample_ids_found_in_input_dbs))
        self.run.info('merged_sample_ids', sample_ids_list)
        self.run.info("Common layer additional data keys",
                      ', '.join(self.layer_additional_data_keys))
        self.run.info('total_reads_mapped', total_reads_mapped_list)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.merge_split_coverage_data()

        # variability tables are only merged when the single profiles were
        # generated with the corresponding profiling enabled:
        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.progress.update('...')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning(
                "SNVs were not profiled, variable nt positions tables will be empty in the merged profile database."
            )

        if self.SCVs_profiled:
            self.progress.new('Merging variable codons tables')
            self.progress.update('...')
            self.merge_variable_codons_tables()
            self.progress.end()
        else:
            self.run.warning(
                "Codon frequencies were not profiled, hence, these tables will be empty in the merged profile database."
            )

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        self.populate_misc_data_tables()

        self.run.info_single('Happy ☘ ', nl_before=1, nl_after=1)

        self.run.quit()
Beispiel #3
0
    def _run(self):
        """Run a single (blank, BAM-based, or serialized) profile end to end.

        Validates arguments, prepares the output directory and profile
        database, logs run metadata, profiles the input, stores variability /
        gene coverage / split coverage tables, creates the atomic data views,
        optionally clusters contigs, and serializes the run info.

        Raises:
            ConfigError: if neither a blank profile, an input BAM, nor a
                serialized profile was provided.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_AA_frequencies', self.profile_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)
        self.run.info('gene_coverages_computed',
                      self.a_meta['genes_are_called'])

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.blank:
            self.init_mock_profile()
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            if self.gen_serialized_profile:
                self.store_profile()
        elif self.serialized_profile_path:
            self.init_serialized_profile()
        else:
            # fix: this used the Python 2-only `raise E, msg` form, which is
            # a SyntaxError under Python 3
            raise ConfigError("What are you doing? :( Whatever it is, anvi'o will have none of it.")

        self.generate_variabile_nts_table()
        self.generate_variabile_aas_table()
        self.generate_gene_coverages_table()
        self.store_split_coverages()

        # creating views in the database for atomic data we gathered during the profiling. Meren, please note
        # that the first entry has a view_id, and the second one does not have one. I know you will look at this
        # and be utterly confused 2 months from now. Please go read the description given in the dbops.py for the
        # function create_new_view defined in the class TablesForViews.
        view_data_splits, view_data_contigs = self.atomic_data.get_data(
            self.sample_id, self.contigs)
        dbops.TablesForViews(self.profile_db_path,
                             anvio.__profile__version__).create_new_view(
                                 data_dict=view_data_splits,
                                 table_name='atomic_data_splits',
                                 table_structure=t.atomic_data_table_structure,
                                 table_types=t.atomic_data_table_types,
                                 view_name='single')

        # fix: the contigs view must be built from `view_data_contigs`; the
        # original passed `view_data_splits` here and left the contigs data
        # entirely unused.
        dbops.TablesForViews(self.profile_db_path,
                             anvio.__profile__version__).create_new_view(
                                 data_dict=view_data_contigs,
                                 table_name='atomic_data_contigs',
                                 table_structure=t.atomic_data_table_structure,
                                 table_types=t.atomic_data_table_types,
                                 view_name=None)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        # serialize the run info so downstream programs can recover how this
        # run was configured:
        runinfo_serialized = self.generate_output_destination('RUNINFO.cp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # the BAM handle only exists when we profiled from a BAM file:
        if self.bam:
            self.bam.close()

        self.run.quit()
Beispiel #4
0
    def merge(self):
        """Merge multiple single profiles into one merged profile database.

        Sanity-checks the inputs, creates the output directory and merged
        PROFILE.db, copies run-level metadata from the first input run, merges
        gene coverage / split coverage / variable position tables, generates
        the view data tables, clusters contigs when requested, stores the run
        info, and optionally runs CONCOCT binning.
        """
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists = self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # all input runs presumably share the same profiling parameters
        # (TODO confirm sanity_check enforces this), so read them from the
        # first one. fix: `dict.values()` returns a non-subscriptable view
        # under Python 3, so it must be materialized before indexing.
        template = list(self.input_runinfo_dicts.values())[0]
        self.contigs_db_hash = template['contigs_db_hash']
        self.min_contig_length = template['min_contig_length']
        self.num_contigs = template['num_contigs']
        self.num_splits = template['num_splits']
        self.min_coverage_for_variability = template['min_coverage_for_variability']
        self.total_length = template['total_length']
        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': ','.join(self.merged_sample_ids),
                       'merged': True,
                       'contigs_clustered': not self.skip_hierarchical_clustering,
                       'default_view': 'mean_coverage',
                       'min_contig_length': self.min_contig_length,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'num_contigs': self.num_contigs,
                       'num_splits': self.num_splits,
                       'total_length': self.total_length,
                       'contigs_db_hash': self.contigs_db_hash}
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed', not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        self.progress.new('Merging variable positions tables')
        self.merge_variable_positions_tables()
        self.progress.end()

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized, strip_prefix = self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
Beispiel #5
0
    def _run(self):
        """Run a single (blank or BAM-based) profile end to end.

        Validates arguments, prepares the output directory and profile
        database, logs run metadata, profiles the input (or creates a blank
        view), updates layer additional data, and optionally clusters contigs.

        Raises:
            ConfigError: if neither a blank profile nor an input BAM file
                was provided.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info(
            'description', 'Found (%d characters)' %
            len(self.description) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_SCVs', self.profile_SCVs)
        self.run.info('report_variability_full', self.report_variability_full)

        # fix: the warning said "an restart" instead of "and restart"
        self.run.warning("Your minimum contig length is set to %s base pairs. So anvi'o will not take into\
                          consideration anything below that. If you need to kill this and restart your\
                          analysis with another minimum contig length value, feel free to press CTRL+C." \
                                                % (pp(self.min_contig_length)))

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict: each split maps to a row
            # of Nones, one per atomic data field. fix: the previous
            # `[dict(...)] * n` form made every split share one dict object;
            # build an independent dict per split instead.
            null_fields = t.atomic_data_table_structure[1:]
            view_data_splits = {split_name: {field: None for field in null_fields}
                                for split_name in self.split_names}
            TablesForViews(self.profile_db_path).remove(
                'single', table_names_to_blank=['atomic_data_splits'])
            TablesForViews(self.profile_db_path).create_new_view(
                data_dict=view_data_splits,
                table_name='atomic_data_splits',
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError(
                "What are you doing? :( Whatever it is, anvi'o will have none of it."
            )

        # update layer additional data table content
        if self.layer_additional_data:
            layer_additional_data_table = TableForLayerAdditionalData(
                argparse.Namespace(profile_db=self.profile_db_path))
            layer_additional_data_table.add(
                {self.sample_id: self.layer_additional_data},
                self.layer_additional_keys)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        # the BAM handle only exists when we profiled from a BAM file:
        if self.bam:
            self.bam.close()

        self.run.quit()
Beispiel #6
0
    def _run(self):
        """Run a single (blank or BAM-based) profile end to end.

        Validates arguments, prepares the output directory and profile
        database, logs run metadata, profiles the input (or creates a blank
        view), and optionally clusters contigs.

        Raises:
            ConfigError: if neither a blank profile nor an input BAM file
                was provided.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info(
            'description', 'Found (%d characters)' %
            len(self.description) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_AA_frequencies', self.profile_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict: each split maps to a row
            # of Nones, one per atomic data field. fix: the previous
            # `[dict(...)] * n` form made every split share one dict object;
            # build an independent dict per split instead.
            null_fields = t.atomic_data_table_structure[1:]
            view_data_splits = {split_name: {field: None for field in null_fields}
                                for split_name in self.split_names}
            dbops.TablesForViews(self.profile_db_path).remove(
                'single', table_names_to_blank=['atomic_data_splits'])
            dbops.TablesForViews(self.profile_db_path).create_new_view(
                data_dict=view_data_splits,
                table_name='atomic_data_splits',
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError(
                "What are you doing? :( Whatever it is, anvi'o will have none of it."
            )

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        # the BAM handle only exists when we profiled from a BAM file:
        if self.bam:
            self.bam.close()

        self.run.quit()
Beispiel #7
0
    def merge(self):
        """Merge multiple single profiles into one merged profile database.

        Sanity-checks the inputs, creates the output directory and merged
        PROFILE.db, copies run-level metadata from the first input run, merges
        gene coverage / split coverage / variable position tables, generates
        the view data tables, clusters contigs when requested, stores the run
        info, and optionally runs CONCOCT binning.
        """
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(
            self.output_directory,
            delete_if_exists=self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory,
                                            'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # all input runs presumably share the same profiling parameters
        # (TODO confirm sanity_check enforces this), so read them from the
        # first one. fix: `dict.values()` returns a non-subscriptable view
        # under Python 3, so it must be materialized before indexing.
        template = list(self.input_runinfo_dicts.values())[0]
        self.contigs_db_hash = template['contigs_db_hash']
        self.min_contig_length = template['min_contig_length']
        self.num_contigs = template['num_contigs']
        self.num_splits = template['num_splits']
        self.min_coverage_for_variability = template['min_coverage_for_variability']
        self.total_length = template['total_length']
        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': ','.join(self.merged_sample_ids),
            'merged': True,
            'contigs_clustered': not self.skip_hierarchical_clustering,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'contigs_db_hash': self.contigs_db_hash
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables(
        )
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        self.progress.new('Merging variable positions tables')
        self.merge_variable_positions_tables()
        self.progress.end()

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
Beispiel #8
0
    def _run(self):
        """Run a single (blank or BAM-based) profile end to end.

        Validates arguments, prepares the output directory and profile
        database, logs run metadata, profiles the input (or creates a blank
        view), updates layer additional data, and optionally clusters contigs.

        Raises:
            ConfigError: if neither a blank profile nor an input BAM file
                was provided.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', 'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('max_contig_length', self.max_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability', self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_SCVs', self.profile_SCVs)
        self.run.info('report_variability_full', self.report_variability_full)

        # fix: the warning said "an restart" instead of "and restart"
        self.run.warning("Your minimum contig length is set to %s base pairs. So anvi'o will not take into\
                          consideration anything below that. If you need to kill this and restart your\
                          analysis with another minimum contig length value, feel free to press CTRL+C." \
                                                % (pp(self.min_contig_length)))

        # sys.maxsize is the "no limit" sentinel for max_contig_length:
        if self.max_contig_length < sys.maxsize:
            self.run.warning("Your maximum contig length is set to %s base pairs. Which means anvi'o will remove\
            any contigs that are longer than this value." % pp(self.max_contig_length))

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict: each split maps to a row
            # of Nones, one per atomic data field. fix: the previous
            # `[dict(...)] * n` form made every split share one dict object;
            # build an independent dict per split instead.
            null_fields = t.atomic_data_table_structure[1:]
            view_data_splits = {split_name: {field: None for field in null_fields}
                                for split_name in self.split_names}
            TablesForViews(self.profile_db_path).remove('single', table_names_to_blank=['atomic_data_splits'])
            TablesForViews(self.profile_db_path).create_new_view(
                                           data_dict=view_data_splits,
                                           table_name='atomic_data_splits',
                                           table_structure=t.atomic_data_table_structure,
                                           table_types=t.atomic_data_table_types,
                                           view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError("What are you doing? :( Whatever it is, anvi'o will have none of it.")

        # update layer additional data table content
        if self.layer_additional_data:
            layer_additional_data_table = TableForLayerAdditionalData(argparse.Namespace(profile_db=self.profile_db_path), r=self.run, p=self.progress)
            layer_additional_data_table.add({self.sample_id: self.layer_additional_data}, self.layer_additional_keys)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        # the BAM handle only exists when we profiled from a BAM file:
        if self.bam:
            self.bam.close()

        self.run.info_single('Happy 😇', nl_before=1, nl_after=1)

        self.run.quit()
Beispiel #9
0
    def _run(self):
        """Profile a single sample into an anvi'o single profile database.

        Either profiles a BAM file from scratch (optionally serializing the
        result) or loads a previously serialized profile, then populates the
        variability, AA frequency, gene coverage and split coverage tables,
        stores the atomic data for contigs and splits, registers the default
        'single' view, and finally clusters contigs if requested.
        """
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        # log the run parameters so they end up in the run log:
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('skip_AA_frequencies', self.skip_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)
        self.run.info('gene_coverages_computed',
                      self.a_meta['genes_are_called'])

        # this is kinda important. we do not run full-blown profile function
        # if we are dealing with a summarized profile...
        if self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            if self.gen_serialized_profile:
                self.store_profile()
        else:
            self.init_serialized_profile()

        # NOTE(review): 'variabile' is a typo carried over from the method's
        # definition elsewhere in the codebase; the call must match it.
        self.generate_variabile_positions_table()
        self.profile_AA_frequencies()
        self.generate_gene_coverages_table()
        self.store_split_coverages()

        # here we store atomic data for contigs and splits into the database:
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        self.atomic_contig_split_data.store_atomic_data_for_contigs_and_splits(
            self.sample_id, self.contigs, profile_db.db)
        profile_db.disconnect()

        # the only view for the single PROFILE database is ready, and already
        # set as the default view. store the info in the db:
        views_table = dbops.TableForViews(self.profile_db_path,
                                          anvio.__profile__version__)
        views_table.append('single', 'atomic_data_splits')
        views_table.store()

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        runinfo_serialized = self.generate_output_destination('RUNINFO.cp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # only close the BAM handle if one was actually opened: in the
        # serialized-profile code path no BAM file is read, and an
        # unconditional close() would fail there (the sibling implementation
        # in this file guards the same way).
        if self.bam:
            self.bam.close()

        self.run.quit()
Beispiel #10
0
    def merge(self):
        """Merge multiple single profile databases into one merged PROFILE.db.

        Runs sanity checks, creates the output directory and the merged
        profile database, copies run parameters from the first input profile
        (all inputs are expected to share them), merges split coverage and
        variability data, generates the view data tables, clusters splits
        (unless skipped), optionally runs CONCOCT binning, and populates the
        misc data tables.
        """
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(self.output_directory, delete_if_exists=self.overwrite_output_destinations)

        self.run.log_file_path = os.path.join(self.output_directory, 'RUNLOG.txt')

        # set database paths
        self.merged_profile_db_path = os.path.join(self.output_directory, 'PROFILE.db')
        self.database_paths['PROFILE.db'] = os.path.abspath(self.merged_profile_db_path)

        profile_db = dbops.ProfileDatabase(self.merged_profile_db_path)

        # all input profiles passed the sanity check, so the first one can
        # speak for all of them when it comes to shared run parameters:
        C = lambda x: list(self.profile_dbs_info_dict.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.max_contig_length = C('max_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.SCVs_profiled = C('SCVs_profiled')
        self.SNVs_profiled = C('SNVs_profiled')
        self.total_length = C('total_length')

        # hierarchical clustering becomes impractical past a soft limit of
        # splits; silently skip it unless the user explicitly enforced it:
        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            self.run.warning("Because you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.num_splits))

        self.total_reads_mapped_per_sample = dict([(s, self.layer_additional_data_dict['default'][s]['total_reads_mapped']) for s in self.layer_additional_data_dict['default']])

        sample_ids_list = ', '.join(sorted(self.sample_ids_found_in_input_dbs))
        total_reads_mapped_list = ', '.join([str(self.total_reads_mapped_per_sample[sample_id]) for sample_id in self.sample_ids_found_in_input_dbs])

        # we run this now because we change default flags in this function
        # depending on the number of reads characterized within each single profile.
        self.set_normalization_multiplier()

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': sample_ids_list,
                       'total_reads_mapped': total_reads_mapped_list,
                       'merged': True,
                       'blank': False,
                       'items_ordered': False,
                       'default_view': 'mean_coverage',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': self.SNVs_profiled,
                       'SCVs_profiled': self.SCVs_profiled,
                       'num_contigs': self.num_contigs,
                       'num_splits': self.num_splits,
                       'total_length': self.total_length,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.contigs_db_hash,
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()

        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', 'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.merged_profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('num_runs_processed', len(self.sample_ids_found_in_input_dbs))
        self.run.info('merged_sample_ids', sample_ids_list)
        self.run.info("Common layer additional data keys", ', '.join(self.layer_additional_data_keys))
        self.run.info('total_reads_mapped', total_reads_mapped_list)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('clustering_performed', not self.skip_hierarchical_clustering)

        self.merge_split_coverage_data()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.progress.update('...')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning("SNVs were not profiled, variable nt positions tables will be empty in the merged profile database.")

        if self.SCVs_profiled:
            self.progress.new('Merging variable codons tables')
            self.progress.update('...')
            self.merge_variable_codons_tables()
            self.progress.end()
        else:
            self.run.warning("Codon frequencies were not profiled, hence, these tables will be empty in the merged profile database.")

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.populate_misc_data_tables()

        self.run.info_single('Happy ☘ ', nl_before=1, nl_after=1)

        self.run.quit()
Beispiel #11
0
    def _run(self):
        """Profile a single sample and persist the results.

        Orchestrates the full single-profile workflow: argument checks,
        directory/database initialization, profiling (from a BAM file or a
        previously serialized profile), metadata storage, registration of the
        default 'single' view, optional clustering, and serialization of the
        run info.
        """
        self.check_args()

        self.set_sample_id()

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        self.init_dirs_and_dbs()

        # we will set up things here so the information in the annotation_db
        # can be utilized directly from within the contigs for loop. contig to
        # gene associations will be stored in self.genes_in_contigs dictionary
        # for fast access.
        if self.annotation_db_path:
            self.populate_genes_in_contigs_dict()

        # report the run parameters so they end up in the log:
        run_parameters = [
            ("anvio", anvio.__version__, {}),
            ("profiler_version", anvio.__profile__version__, {}),
            ("sample_id", self.sample_id, {}),
            ("profile_db", self.profile_db_path, {"display_only": True}),
            ("annotation_db", True if self.annotation_db_path else False, {}),
            ("annotation_hash", self.annotation_hash, {}),
            ("cmd_line", utils.get_cmd_line(), {}),
            ("merged", False, {}),
            ("split_length", self.split_length, {}),
            ("min_contig_length", self.min_contig_length, {}),
            ("min_mean_coverage", self.min_mean_coverage, {}),
            ("clustering_performed", self.contigs_shall_be_clustered, {}),
            ("min_coverage_for_variability", self.min_coverage_for_variability, {}),
            ("report_variability_full", self.report_variability_full, {}),
        ]
        for key, value, extra in run_parameters:
            self.run.info(key, value, **extra)

        # this is kinda important. we do not run full-blown profile function
        # if we are dealing with a summarized profile...
        if self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            self.store_profile()
        else:
            self.init_serialized_profile()
        # both code paths summarize the profile per split afterwards:
        self.store_summarized_profile_for_each_split()

        self.generate_variabile_positions_table()
        self.generate_gene_coverages_table()

        # here we store both metadata and TNF information into the database:
        db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        self.metadata.store_metadata_for_contigs_and_splits(self.sample_id, self.contigs, db.db)
        db.disconnect()

        # the only view for the single PROFILE database is ready, and already
        # set as the default view. store the info in the db:
        views = dbops.TableForViews(self.profile_db_path, anvio.__profile__version__)
        views.append("single", "metadata_splits")
        views.store()

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        serialized_path = self.generate_output_destination("RUNINFO.cp")
        self.run.info("runinfo", serialized_path)
        self.run.store_info_dict(serialized_path, strip_prefix=self.output_directory)

        self.run.quit()
Beispiel #12
0
    def merge(self):
        """Merge multiple single profile runs into one merged PROFILE.db.

        Runs sanity checks, creates the output directory and the merged
        profile database, copies run parameters from the first input run
        (all inputs are expected to share them), merges gene coverage, split
        coverage and variability data, generates the view data tables,
        clusters splits (unless skipped), optionally runs CONCOCT binning,
        and serializes the run info.
        """
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(
            self.output_directory,
            delete_if_exists=self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory,
                                            'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # all input runs passed the sanity check, so the first one can speak
        # for all of them. NOTE: dict views are not subscriptable in Python 3,
        # so the values() view must be materialized into a list first:
        C = lambda x: list(self.input_runinfo_dicts.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.total_reads_mapped = C('total_reads_mapped')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.gene_coverages_computed = C('gene_coverages_computed')
        self.AA_frequencies_profiled = C('profile_AA_frequencies')
        self.SNVs_profiled = not C('skip_SNV_profiling')
        self.total_length = C('total_length')

        # hierarchical clustering becomes impractical past a soft limit of
        # splits; silently skip it unless the user explicitly enforced it:
        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            # report the actual number of splits that will be clustered, not
            # the soft limit (matches the sibling implementation in this file):
            self.run.warning("Because you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.num_splits))

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': ','.join(self.merged_sample_ids),
            'merged': True,
            'blank': False,
            'contigs_clustered': not self.skip_hierarchical_clustering,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'SNVs_profiled': self.SNVs_profiled,
            'AA_frequencies_profiled': self.AA_frequencies_profiled,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'total_reads_mapped': self.total_reads_mapped,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.contigs_db_hash,
            'gene_coverages_computed': self.gene_coverages_computed
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables(
        )
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning(
                "SNVs were not profiled, variable nt positions tables will be empty in the merged profile database."
            )

        if self.AA_frequencies_profiled:
            self.progress.new('Merging variable AAs tables')
            self.merge_variable_aas_tables()
            self.progress.end()
        else:
            self.run.warning(
                "AA frequencies were not profiled, these tables will be empty in the merged profile database."
            )

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()