Example #1
    def merge_split_summaries(self):
        merged_summary_index = {}
        merged_summary_index_path = os.path.join(self.output_directory, 'SUMMARY.cp')
        summary_dir = filesnpaths.gen_output_directory(os.path.join(self.output_directory, 'SUMMARY'), delete_if_exists=True)


        # read all index files per run into a dict here, so the access is easier from within
        # the for loop below
        run_sum_indices = {}
        for runinfo in self.input_runinfo_dicts.values():
            run_sum_indices[runinfo['sample_id']] = dictio.read_serialized_object(runinfo['profile_summary_index'])

        for i, split_name in enumerate(self.split_names):
            self.progress.update('merging summaries for split %d of %d' % (i + 1, len(self.split_names)))

            merged_summary = {}
            for runinfo in self.input_runinfo_dicts.values(): 
                run_split_summary = dictio.read_serialized_object(os.path.join(runinfo['input_dir'], run_sum_indices[runinfo['sample_id']][split_name]))
                merged_summary[runinfo['sample_id']] = run_split_summary[runinfo['sample_id']]

            # note: 'runinfo' here is simply the last run from the loop above; the
            # summary basename for a given split is expected to be the same across runs
            merged_split_summary_path = os.path.join(summary_dir, os.path.basename(run_sum_indices[runinfo['sample_id']][split_name]))
            dictio.write_serialized_object(merged_summary, merged_split_summary_path)
            merged_summary_index[split_name] = merged_split_summary_path

        self.progress.update('Serializing merged split summary index ...')
        dictio.write_serialized_object(dictio.strip_prefix_from_dict_values(merged_summary_index, self.output_directory),
                                       merged_summary_index_path)

        return summary_dir, merged_summary_index_path
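
All of these listings lean on anvio's dictio helpers, whose implementation is not shown here. A minimal sketch of what the three functions used above plausibly look like, assuming serialized objects are gzip-compressed pickles and that stored paths are made relative by stripping a directory prefix (the gzip/pickle details and the exact prefix handling are assumptions, not anvi'o's confirmed code):

    import gzip
    import os
    import pickle

    def write_serialized_object(obj, output_file_path):
        # assumption: objects are stored as gzip-compressed pickles
        with gzip.open(output_file_path, 'wb') as output_file:
            pickle.dump(obj, output_file)

    def read_serialized_object(input_file_path):
        # assumption: mirrors the writer above
        with gzip.open(input_file_path, 'rb') as input_file:
            return pickle.load(input_file)

    def strip_prefix_from_dict_values(dictionary, prefix):
        # assumption: turns absolute paths stored as values into paths
        # relative to 'prefix' (the output directory in Example #1)
        return {key: value[len(prefix):].lstrip(os.sep) for key, value in dictionary.items()}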
Example #2
    def list_contigs(self):
        # restore default SIGPIPE handling so that piping the output into
        # tools like `head` does not end with a broken pipe traceback:
        import signal
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

        if self.input_file_path:
            self.progress.new('Init')
            self.progress.update('Reading BAM File')
            self.bam = pysam.Samfile(self.input_file_path, 'rb')
            self.progress.end()

            self.contig_names = self.bam.references
            self.contig_lengths = self.bam.lengths

            utils.check_contig_names(self.contig_names)

            for tpl in sorted(zip(self.contig_lengths, self.contig_names),
                              reverse=True):
                print('%-40s %s' % (tpl[1], pp(int(tpl[0]))))

        else:
            self.progress.new('Init')
            self.progress.update('Reading serialized profile')
            self.contigs = dictio.read_serialized_object(
                self.serialized_profile_path)
            self.progress.end()

            self.run.info('profile_loaded_from', self.serialized_profile_path)
            self.run.info('num_contigs', pp(len(self.contigs)))

            for tpl in sorted([(int(self.contigs[contig].length), contig)
                               for contig in self.contigs]):
                print('%-40s %s' % (tpl[1], pp(int(tpl[0]))))
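
pp is used in every listing to pretty-print numbers. A reasonable stand-in, assuming it only adds thousands separators:

    def pp(number):
        # assumption: pp renders integers with thousands separators,
        # e.g. pp(1234567) -> '1,234,567'
        return '{:,}'.format(int(number))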
Example #3
File: cogs.py Project: ppflrs/anvio
    def init_p_id_to_cog_id_dict(self):
        self.progress.new('Initializing COGs Data')
        self.progress.update('Reading NCBI Protein ID to COG id converter ...')

        self.p_id_to_cog_id = dictio.read_serialized_object(self.essential_files['PID-TO-CID.cPickle'])

        self.progress.end()
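
Once loaded, the converter is presumably a plain dict keyed by NCBI protein IDs. A toy illustration (the ID is made up, and whether values are single COG IDs or lists depends on the actual data):

    # hypothetical shape of the unpickled object:
    p_id_to_cog_id = {'1234567': ['COG0001']}

    print(p_id_to_cog_id.get('1234567'))  # ['COG0001']
    print(p_id_to_cog_id.get('9999999'))  # None: no COG assignment for this protein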
Example #4
    def init(self):
        self.progress.new('Initializing COGs Data')
        self.progress.update('Reading COG functions ...')
        self.cogs = utils.get_TAB_delimited_file_as_dictionary(
            self.essential_files['COG.txt'],
            no_header=True,
            column_names=['COG', 'categories', 'annotation'])

        self.progress.update('Reading COG categories ...')
        self.categories = utils.get_TAB_delimited_file_as_dictionary(
            self.essential_files['CATEGORIES.txt'],
            no_header=True,
            column_names=['category', 'description'])

        self.progress.update('Reading missing COG IDs ...')
        self.missing_cogs = dictio.read_serialized_object(
            self.essential_files['MISSING_COG_IDs.cPickle'])

        self.progress.end()

        for cog in self.cogs:
            self.cogs[cog]['categories'] = [
                c.strip() for c in self.cogs[cog]['categories'].split(',')
            ]

        for cat in self.categories:
            self.categories[cat] = self.categories[cat]['description']

        self.initialized = True
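
The post-processing loop at the end is easy to miss: the raw 'categories' field arrives as a single comma-separated string and is split into a list. A self-contained illustration with invented values:

    # toy values standing in for one parsed COG.txt row:
    cogs = {'COG0001': {'categories': 'H, E', 'annotation': 'Glutamate-1-semialdehyde aminotransferase'}}

    # the same post-processing step as at the end of init():
    for cog in cogs:
        cogs[cog]['categories'] = [c.strip() for c in cogs[cog]['categories'].split(',')]

    assert cogs['COG0001']['categories'] == ['H', 'E']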
Example #5
    def init_serialized_profile(self):
        self.progress.new('Init')
        self.progress.update('Reading serialized profile')
        self.contigs = dictio.read_serialized_object(self.serialized_profile_path)
        self.progress.end()

        self.run.info('profile_loaded_from', self.serialized_profile_path)
        self.run.info('num_contigs', pp(len(self.contigs)))

        if self.contig_names_of_interest:
            contigs_to_discard = set()
            for contig in self.contigs:
                if contig not in self.contig_names_of_interest:
                    contigs_to_discard.add(contig)

            if len(contigs_to_discard):
                for contig in contigs_to_discard:
                    self.contigs.pop(contig)
            self.run.info('num_contigs_selected_for_analysis', pp(len(self.contigs)))

        self.check_contigs()

        # it brings good karma to let the user know what the hell is wrong with their data:
        self.check_contigs_without_any_gene_calls(self.contigs.keys())

        contigs_to_discard = set()
        for contig in self.contigs.values():
            if contig.length < self.min_contig_length:
                contigs_to_discard.add(contig.name)
        if len(contigs_to_discard):
            for contig in contigs_to_discard:
                self.contigs.pop(contig)
            self.run.info('contigs_raw_longer_than_M', len(self.contigs))

        self.check_contigs()
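
The two discard loops can be expressed more compactly. An equivalent sketch as a standalone helper (the helper name is ours, not anvio's; behavior matches the loops above):

    def filter_contigs(contigs, names_of_interest=None, min_length=0):
        # 'contigs' maps names to objects carrying a .length attribute,
        # as in the example above
        if names_of_interest:
            contigs = {name: c for name, c in contigs.items() if name in names_of_interest}
        return {name: c for name, c in contigs.items() if c.length >= min_length}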
Example #6
    def init(self):
        self.progress.new('Initializing COGs Data')
        self.progress.update('Reading COG functions ...')

        if self.COG_version == 'COG14':
            self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'],
                                                                   no_header=True,
                                                                   column_names=['COG', 'categories', 'annotation'])
        elif self.COG_version == 'COG20':
            self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'],
                                                                   no_header=True,
                                                                   column_names=['COG', 'categories', 'annotation', 'pathway'])
        else:
            raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                              "parsing of a new generation of COG files.")

        self.progress.update('Reading COG categories ...')
        self.categories = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['CATEGORIES.txt'], no_header=True, column_names=['category', 'description'])

        self.progress.update('Reading missing COG IDs ...')
        self.missing_cogs = dictio.read_serialized_object(self.essential_files['MISSING_COG_IDs.cPickle'])

        self.progress.end()

        for cog in self.cogs:
            self.cogs[cog]['categories'] = [c.strip() for c in self.cogs[cog]['categories'].split(',')]

        for cat in self.categories:
            self.categories[cat] = self.categories[cat]['description']

        self.initialized = True
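
The only difference between the COG14 and COG20 branches is the extra trailing 'pathway' column. A small dispatch sketch making that explicit (the helper is ours; the column lists are taken verbatim from init() above):

    def columns_for(cog_version):
        if cog_version == 'COG14':
            return ['COG', 'categories', 'annotation']
        elif cog_version == 'COG20':
            return ['COG', 'categories', 'annotation', 'pathway']
        raise ValueError("unknown COG version: %s" % cog_version)

    assert columns_for('COG20')[-1] == 'pathway'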
Example #7
    def load_from_files(self, args):
        if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
            raise ConfigError("If you do not have a RUNINFO dict, you must declare each of "
                              "'-f', '-m', '-t' and '-o' parameters. Please see '--help' for "
                              "more detailed information on them.")

        if self.view:
            raise ConfigError("You can't use the '-v' parameter when this program is not called with a RUNINFO.cp")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show when there is no RUNINFO.cp :/")

        metadata_path = os.path.abspath(self.metadata)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
        self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
        self.p_meta['views'] = {}
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']
        self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

        self.default_view = self.p_meta['default_view']

        if self.summary_index:
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
            self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

        # sanity of the metadata
        filesnpaths.is_file_tab_delimited(metadata_path)
        metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
        if metadata_columns[0] != "contig":
            raise ConfigError("The first row of the first column of the metadata file must "
                              "say 'contig', which is not the case for your metadata file "
                              "('%s'). Please make sure this is a properly formatted metadata "
                              "file." % metadata_path)

        # store metadata as view:
        self.views[self.default_view] = {'header': metadata_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # setup a mock splits_basic_info dict
        self.splits_basic_info = {}
        for split_id in self.split_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # reminder: this is being stored in the output dir provided as a commandline parameter:
        self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

        filesnpaths.gen_output_directory(self.p_meta['output_dir'])
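
The sanity check above insists that the first header cell of the metadata file reads 'contig'. A minimal file that would pass it (the file name and the columns after the first are arbitrary examples):

    # write a minimal metadata file that passes the 'contig' header check:
    with open('metadata.txt', 'w') as f:
        f.write('contig\tmean_coverage\tgc_content\n')
        f.write('split_001\t12.5\t0.42\n')
        f.write('split_002\t8.1\t0.61\n')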
Example #8
    def read_runinfo_dict(self, path):
        runinfo = dictio.read_serialized_object(path)
        sample_id = runinfo['sample_id']

        if sample_id not in self.merged_sample_ids:
            self.merged_sample_ids.append(sample_id)
        self.input_runinfo_dicts[sample_id] = runinfo

        input_dir = os.path.dirname(os.path.abspath(path))
        runinfo['input_dir'] = input_dir
        runinfo['profile_db'] = os.path.join(input_dir, 'PROFILE.db')

        return sample_id, runinfo
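
A usage sketch: feeding several RUNINFO paths through read_runinfo_dict to populate the merger's bookkeeping ('merger' and the paths are placeholders, not anvi'o API):

    # hypothetical call site:
    for runinfo_path in ['sample_01/RUNINFO.cp', 'sample_02/RUNINFO.cp']:
        sample_id, runinfo = merger.read_runinfo_dict(runinfo_path)
        print('%s: profile db at %s' % (sample_id, runinfo['profile_db']))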
Example #9
def migrate(db_path):
    if db_path is None:
        raise ConfigError("No profile database is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is 14
    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != '14':
        raise ConfigError(
            "Version of this profile database is not 14 (hence, this script cannot really do anything)."
        )

    is_merged = profile_db.get_meta_value('merged')

    progress.new("Trying to upgrade the %s profile database" %
                 'merged' if is_merged else 'single')

    # update the runinfo.cp
    input_dir = os.path.dirname(os.path.abspath(db_path))
    P = lambda x: os.path.join(input_dir, x)
    E = lambda x: os.path.exists(x)

    runinfo_path = P('RUNINFO.cp') if E(P('RUNINFO.cp')) else None
    if not runinfo_path:
        runinfo_path = P('RUNINFO.mcp') if E(P('RUNINFO.mcp')) else None

    if runinfo_path:
        runinfo = dictio.read_serialized_object(runinfo_path)
        if 'blank' not in runinfo:
            runinfo['blank'] = False

            dictio.write_serialized_object(runinfo, runinfo_path)

    # add the new value
    profile_db.set_meta_value('blank', False)

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version('15')

    # bye
    profile_db.disconnect()
    progress.end()

    run.info_single("Database successfully upgraded to version 15!",
                    nl_after=1,
                    nl_before=1,
                    mc='green')
Example #10
    def load_from_profile_database(self, args):
        if self.p_meta['version'] != anvio.__profile__version__:
            raise ConfigError("The profile database has a version number that differs from the version that is valid "
                              "for this codebase (the profile database is at '%s', and the codebase is at '%s'). Very "
                              "unfortunately, you need to re-profile and re-merge this project using the current anvi'o :("
                              % (self.p_meta['version'], anvio.__profile__version__))

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        self.load_views()
        self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header='Available views (%d)' % len(self.views), lc='green')
            for view in self.views:
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            print()
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if additional_view_columns[-1] != '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper "
                                  "parent information for each split.")

            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping=column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if self.view not in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see "
                                  "available views by running this program with the --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings 

        if self.tree:
            entry_id = os.path.basename(self.tree).split('.')[0]
            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = entry_id
                self.p_meta['available_clusterings'] = [entry_id]
                self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
            else:
                self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()}
                run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

        # is summary being overwritten?
        if self.summary_index:
            run.info('Warning', "The default summary index in RUNINFO is being overridden by '%s'." % self.summary_index)
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)

        if os.path.exists(self.P('SUMMARY.cp')):
            self.splits_summary_index = dictio.read_serialized_object(self.P('SUMMARY.cp'))
        else:
            self.splits_summary_index = None
            run.warning("SUMMARY.cp is missing for your run. Anvi'o will continue working (well, at least\
                         it will attempt to do it), but things may behave badly with the absence of\
                         SUMMARY.cp (first and foremost, you will not be able to inspect individual\
                         contigs through any of the interactive interfaces). Please investigate it\
                         if you were not expecting this.")

        # set title
        if self.title:
            self.title = self.title + ' (%s)' % self.default_view
        else:
            self.title = self.p_meta['sample_id'] + ' (%s)' % self.default_view
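
For reference, the additional view file validated in load_from_profile_database is TAB-delimited: first column split names, middle columns parsed as float, and a mandatory trailing '__parent__' column. A toy file that satisfies those checks (all values invented):

    # a toy additional view file:
    with open('additional_view.txt', 'w') as f:
        f.write('split\tmy_metric\t__parent__\n')
        f.write('split_001\t0.75\tcontig_001\n')
        f.write('split_002\t0.12\tcontig_001\n')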