Example #1
    def merge_split_summaries(self):
        merged_summary_index = {}
        merged_summary_index_path = os.path.join(self.output_directory, 'SUMMARY.cp')
        summary_dir = filesnpaths.gen_output_directory(os.path.join(self.output_directory, 'SUMMARY'), delete_if_exists=True)


        # read all index files per run into a dict here, so the access is easier from within
        # the for loop below
        run_sum_indices = {}
        for runinfo in self.input_runinfo_dicts.values():
            run_sum_indices[runinfo['sample_id']] = dictio.read_serialized_object(runinfo['profile_summary_index'])

        for i, split_name in enumerate(self.split_names):
            self.progress.update('Merging summaries for split %d of %d' % (i + 1, len(self.split_names)))

            merged_summary = {}
            for runinfo in self.input_runinfo_dicts.values():
                run_split_summary = dictio.read_serialized_object(os.path.join(runinfo['input_dir'], run_sum_indices[runinfo['sample_id']][split_name]))
                merged_summary[runinfo['sample_id']] = run_split_summary[runinfo['sample_id']]

            # `runinfo` here is whichever run came last in the loop above; its index is
            # used only to recover the basename of this split's summary file
            merged_split_summary_path = os.path.join(summary_dir, os.path.basename(run_sum_indices[runinfo['sample_id']][split_name]))
            dictio.write_serialized_object(merged_summary, merged_split_summary_path)
            merged_summary_index[split_name] = merged_split_summary_path

        self.progress.update('Serializing merged split summary index ...')
        dictio.write_serialized_object(dictio.strip_prefix_from_dict_values(merged_summary_index, self.output_directory),\
                                           merged_summary_index_path)

        return summary_dir, merged_summary_index_path
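
All of the examples on this page revolve around a few small helpers from anvi'o's `dictio` module. Their implementation is not shown here, but judging from how they are called above, a minimal sketch could look like the following (the gzip-compressed pickle format and the function bodies are assumptions, not anvi'o's verbatim code):

import gzip
import pickle

def write_serialized_object(obj, output_file_path):
    # serialize an arbitrary Python object to disk (a sketch; the actual
    # on-disk format anvi'o uses may differ)
    with gzip.open(output_file_path, 'wb') as output_file:
        pickle.dump(obj, output_file)

def read_serialized_object(input_file_path):
    # inverse of write_serialized_object()
    with gzip.open(input_file_path, 'rb') as input_file:
        return pickle.load(input_file)

def strip_prefix_from_dict_values(input_dict, prefix):
    # drop a leading path prefix (e.g. the output directory) from string
    # values, so a serialized index stores paths relative to its own location
    for key in input_dict:
        value = input_dict[key]
        if isinstance(value, str) and value.startswith(prefix):
            input_dict[key] = value[len(prefix):].lstrip('/')
    return input_dict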
Example #2
    def store_profile(self):
        output_file = self.generate_output_destination('PROFILE.cp')
        self.progress.new('Storing Profile')
        self.progress.update('Serializing information for %s contigs ...' % pp(len(self.contigs)))
        dictio.write_serialized_object(self.contigs, output_file)
        self.progress.end()
        self.run.info('profile_dict', output_file)
Example #3
    def store_summarized_profile_for_each_split(self):
        summary_index = {}
        summary_index_output_path = self.generate_output_destination("SUMMARY.cp")
        summary_dir = self.generate_output_destination("SUMMARY", directory=True)
        self.progress.new("Storing summary files")

        counter = 1

        for contig in self.contigs:
            self.progress.update("working on contig %s of %s" % (pp(counter), pp(len(self.contigs))))
            for split in self.contigs[contig].splits:
                split_summary_path = self.generate_output_destination(os.path.join(summary_dir, "%.6d.cp" % counter))
                dictio.write_serialized_object(
                    {
                        self.sample_id: {
                            "coverage": split.coverage.c,
                            "variability": split.auxiliary.v,
                            "competing_nucleotides": split.auxiliary.competing_nucleotides,
                        }
                    },
                    split_summary_path,
                )
                summary_index[split.name] = split_summary_path
                counter += 1

        self.progress.end()
        self.run.info("profile_summary_dir", summary_dir)
        dictio.write_serialized_object(
            dictio.strip_prefix_from_dict_values(summary_index, self.output_directory), summary_index_output_path
        )
        self.run.info("profile_summary_index", summary_index_output_path)
Example #4
    def store_info_dict(self, destination, strip_prefix=None):
        if strip_prefix:
            # mostly to get rid of output_dir prefix in output file names.
            # surprisingly enough, this is the best place to do it. live
            # and learn :/
            self.info_dict = dictio.strip_prefix_from_dict_values(self.info_dict, strip_prefix)

        dictio.write_serialized_object(self.info_dict, destination)
Example #5
    def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
        num_lines_in_file = filesnpaths.get_num_lines_in_file(input_file_path)

        def raise_error(line_num, line_content, fields, e):
            raise ConfigError(f"Bad news :( While parsing a COG input file, anvi'o encountered an error (which said: [{e}]) "
                              f"while processing line {line_num} in your file, which looked like this: '{line_content.strip()}'. "
                              f"The fields in that line looked like this: {fields}. Sadly, this has been a long-standing and "
                              f"very annoying issue that anvi'o developers were unable to reproduce. But we recently learned "
                              f"that the issue is likely due to your internet speed (https://github.com/merenlab/anvio/issues/1738). "
                              f"Slower connections lead to broken connections with the NCBI servers, and leave you with an "
                              f"unfinished file :/ The only working solution so far is to try again with a faster internet "
                              f"connection.")

        progress.new('Formatting protein ids to COG ids file', progress_total_items=num_lines_in_file)

        p_id_to_cog_id = {}

        line_counter = 0
        for line in open(input_file_path).readlines():
            line_counter += 1

            if line_counter % 500 == 0:
                self.progress.increment(line_counter)
                progress.update(f"{line_counter * 100 / num_lines_in_file:.2f}%")

            fields = line.strip('\n').split(',')

            # `p_id` should look just like the FASTA ids, and its location has changed between
            # 2014 release and 2020 release.
            if self.COG_version == 'COG14':
                try:
                    p_id = fields[0]
                    COG = fields[6]
                except Exception as e:
                    raise_error(line_counter, line, fields, e)
            elif self.COG_version == 'COG20':
                try:
                    p_id = fields[2].replace('.', '_')
                    COG = fields[6]
                except Exception as e:
                    raise_error(line_counter, line, fields, e)
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")

            self.cogs_found_in_proteins_fasta.add(COG)

            if p_id in p_id_to_cog_id:
                if COG not in p_id_to_cog_id[p_id]:
                    p_id_to_cog_id[p_id].append(COG)
            else:
                p_id_to_cog_id[p_id] = [COG]

        progress.update("Serializing the data dictionary for future use (a.k.a, very pro stuff).")
        dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

        progress.end()
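
For reference, here are the two CSV layouts the parser above expects. The sample lines are made up for illustration; only the field positions come from the code:

# COG14-style line: protein id in field 0, COG id in field 6
fields = '158333741,1,1,1,1,1,COG0001'.split(',')
assert (fields[0], fields[6]) == ('158333741', 'COG0001')

# COG20-style line: protein id in field 2 (with '.' mapped to '_'), COG id in field 6
fields = 'GC1.1,1,WP_011672216.1,1,1,1,COG0001'.split(',')
assert (fields[2].replace('.', '_'), fields[6]) == ('WP_011672216_1', 'COG0001')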
Example #6
File: cogs.py Project: ppflrs/anvio
    def generate_missing_cog_ids_file(self):
        missing_cog_ids = self.cogs_found_in_proteins_fasta.difference(self.cogs_found_in_cog_names_file)

        if len(missing_cog_ids):
            self.run.warning("%d of %d COG IDs that appear in the list of orthology domains file (which links protein IDs "
                             "to COG names) are missing from the COG names file (which links COG IDs to function names and "
                             "categories). Because clearly even the files that are distributed together should not be expected "
                             "to be fully compatible. Anvi'o thanks everyone for their contributions."
                             % (len(missing_cog_ids), len(self.cogs_found_in_proteins_fasta)))

        dictio.write_serialized_object(missing_cog_ids, J(self.COG_data_dir, 'MISSING_COG_IDs.cPickle'))
Example #7
def migrate(db_path):
    if db_path is None:
        raise ConfigError("No profile database is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is 14
    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != '14':
        raise ConfigError(
            "Version of this profile database is not 14 (hence, this script cannot really do anything)."
        )

    is_merged = profile_db.get_meta_value('merged')

    progress.new("Trying to upgrade the %s profile database"
                 % ('merged' if is_merged else 'single'))

    # update the runinfo.cp
    input_dir = os.path.dirname(os.path.abspath(db_path))
    P = lambda x: os.path.join(input_dir, x)
    E = lambda x: os.path.exists(x)

    # prefer RUNINFO.mcp (merged runs) when present, otherwise fall back to RUNINFO.cp
    runinfo_path = P('RUNINFO.cp') if E(P('RUNINFO.cp')) else None
    runinfo_path = P('RUNINFO.mcp') if E(P('RUNINFO.mcp')) else runinfo_path

    if runinfo_path:
        runinfo = dictio.read_serialized_object(runinfo_path)
        if 'blank' not in runinfo:
            runinfo['blank'] = False

            dictio.write_serialized_object(runinfo, runinfo_path)

    # add the new value
    profile_db.set_meta_value('blank', False)

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version('15')

    # bye
    profile_db.disconnect()
    progress.end()

    run.info_single("Database successfully upgraded to version 15!",
                    nl_after=1,
                    nl_before=1,
                    mc='green')
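
A migration function like this is usually driven by a tiny command-line entry point; a minimal sketch (the argument name and scaffolding are assumptions, not anvi'o's exact script):

if __name__ == '__main__':
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='Upgrade a profile database from version 14 to 15')
    parser.add_argument('profile_db', metavar='PROFILE_DB', help='path to the profile database to upgrade')
    args = parser.parse_args()

    try:
        migrate(args.profile_db)
    except ConfigError as e:
        print(e)
        sys.exit(-1)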
Example #8
    def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
        num_lines_in_file = filesnpaths.get_num_lines_in_file(input_file_path)

        progress.new('Formatting protein ids to COG ids file', progress_total_items=num_lines_in_file)

        p_id_to_cog_id = {}

        line_counter = 0
        for line in open(input_file_path).readlines():
            line_counter += 1

            if line_counter % 500 == 0:
                self.progress.increment(line_counter)
                progress.update(f"{line_counter * 100 / num_lines_in_file:.2f}%")

            fields = line.strip('\n').split(',')

            # `p_id` should look just like the FASTA ids, and its location has changed between
            # 2014 release and 2020 release.
            if self.COG_version == 'COG14':
                p_id = fields[0]
                COG = fields[6]
            elif self.COG_version == 'COG20':
                p_id = fields[2].replace('.', '_')
                COG = fields[6]
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")

            self.cogs_found_in_proteins_fasta.add(COG)

            if p_id in p_id_to_cog_id:
                if COG not in p_id_to_cog_id[p_id]:
                    p_id_to_cog_id[p_id].append(COG)
            else:
                p_id_to_cog_id[p_id] = [COG]

        progress.update("Serializing the data dictionary for future use (a.k.a, very pro stuff).")
        dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

        progress.end()
Example #9
File: cogs.py Project: ppflrs/anvio
    def format_p_id_to_cog_id_cPickle(self, input_file_path, output_file_path):
        progress.new('Formatting protein ids to COG ids file')
        progress.update('...')

        p_id_to_cog_id = {}

        for line in open(input_file_path).readlines():
            fields = line.strip('\n').split(',')
            p_id = fields[0]
            COG = fields[6]

            self.cogs_found_in_proteins_fasta.add(COG)

            if p_id in p_id_to_cog_id:
                p_id_to_cog_id[p_id].append(COG)
            else:
                p_id_to_cog_id[p_id] = [COG]

        dictio.write_serialized_object(p_id_to_cog_id, output_file_path)

        progress.end()