def load_from_files(self, args):
    # Bootstrap an interactive run directly from user files when no RUNINFO dict
    # exists: a splits FASTA (-f), a TAB-delimited metadata file (-m), a newick
    # tree (-t) and an output directory (-o) are all mandatory in this mode.
    if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
        raise ConfigError, "If you do not have a RUNINFO dict, you must declare each of\
                            '-f', '-m', '-t' and '-o' parameters. Please see '--help' for\
                            more detailed information on them."

    # views and states come from a RUNINFO.cp, so these flags make no sense here:
    if self.view:
        raise ConfigError, "You can't use '-v' parameter when this program is not called with a RUNINFO.cp"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show when there is no RUNINFO.cp :/"

    metadata_path = os.path.abspath(self.metadata)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
    self.p_meta['views'] = {}
    # a single mock view/clustering is set up since there is no profile database:
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']
    self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

    self.default_view = self.p_meta['default_view']

    # an optional serialized summary index may be attached for inspection purposes
    if self.summary_index:
        self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
        self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

    # sanity of the metadata: must be TAB-delimited, and its first column header
    # must literally be 'contig'
    filesnpaths.is_file_tab_delimited(metadata_path)
    metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
    if not metadata_columns[0] == "contig":
        raise ConfigError, "The first row of the first column of the metadata file must\
                            say 'contig', which is not the case for your metadata file\
                            ('%s'). Please make sure this is a properly formatted metadata\
                            file." % (metadata_path)

    # store metadata as view:
    self.views[self.default_view] = {'header': metadata_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict (length and GC content come straight
    # from the FASTA sequences; every name in the metadata is assumed to be in
    # the FASTA — a missing one would raise a KeyError here)
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                            'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # reminder: this is being stored in the output dir provided as a commandline parameter:
    self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

    # NOTE(review): this is a no-op; presumably a placeholder mirroring sibling
    # loaders that derive a default title in the else branch
    if self.title:
        self.title = self.title

    filesnpaths.gen_output_directory(self.p_meta['output_dir'])
def populate_from_file(self, additional_data_file_path, skip_check_names=None):
    """Read a TAB-delimited additional data file and route it to the proper table.

    Layer order targets go through `OrderDataBaseClass.add`; everything else is
    handed to `AdditionalDataBaseClass.add` along with the data keys.
    """
    keys = utils.get_columns_of_TAB_delim_file(additional_data_file_path)
    entries = utils.get_TAB_delimited_file_as_dictionary(additional_data_file_path)

    # a file without a single data column is useless here
    if not len(keys):
        raise ConfigError("There is something wrong with the additional data file for %s at %s.\
                           It does not seem to have any additional keys for data :/" \
                                            % (self.target, additional_data_file_path))

    if self.target == 'layer_orders':
        OrderDataBaseClass.add(self, entries, skip_check_names)
        return

    AdditionalDataBaseClass.add(self, entries, keys, skip_check_names)
def get(engine, run=run):
    """Load every substitution scoring matrix (SSM) shipped for `engine`.

    Scans the `<this module's dir>/<engine>` directory for `*.txt` files, checks
    that each is a square matrix over the items the engine expects (amino acids,
    nucleotides, or codons), and returns a dict mapping matrix id to its parsed
    contents. Raises ConfigError for unknown engines or malformed matrices.
    """
    if engine not in engines:
        raise ConfigError("Anvi'o was about to populate the SSMs, but it does not know about the engine '%s'." % engine)

    matrices = {}
    engine_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), engine)
    matrix_file_paths = [p for p in glob.glob(os.path.join(engine_dir, '*')) if p.endswith('.txt')]

    for matrix_path in matrix_file_paths:
        matrix_id = os.path.basename(matrix_path).split('.txt')[0]

        # row names (first column, minus its header) and column names must match
        matrix_rows = u.get_column_data_from_TAB_delim_file(matrix_path, column_indices=[0])[0][1:]
        matrix_columns = u.get_columns_of_TAB_delim_file(matrix_path, include_first_column=False)

        if sorted(matrix_columns) != sorted(matrix_rows):
            raise ConfigError("Anvi'o found a substitution scoring matrix named '%s'. However, it doesn't look like\
                               a nicely done matrix. Substitution scoring matrices must contain the same row and column\
                               names (i.e., a square matrix that is equal to its transpose). Well. This one does not :/" \
                                            % (os.path.basename(matrix_path)))

        # the alphabet the matrix is allowed to describe depends on the engine
        if engine == 'AA':
            expected_items = set(list(constants.amino_acids))
        elif engine == 'NT':
            expected_items = set(list(constants.nucleotides))
        elif engine == 'CDN':
            expected_items = set(list(constants.codons))

        unexpected_items_in_matrix = [item for item in matrix_columns if item not in expected_items]
        if len(unexpected_items_in_matrix):
            raise ConfigError("It seems you have a poorly done substitution scoring matrix named '%s' in the data directory.\
                               Anvi'o expects an %s substitution matrix to describe one or more of these %d guys: '%s'. But\
                               the matrix %s had stuff anvi'o is not familiar with: '%s'." % \
                                        (matrix_id, engine, len(expected_items), ', '.join(expected_items),
                                         matrix_id, ', '.join(unexpected_items_in_matrix)))

        # first column holds item names (str), the rest are scores (float)
        matrices[matrix_id] = u.get_TAB_delimited_file_as_dictionary(matrix_path,
                                                                     column_mapping=[str] + [float] * len(expected_items))

    if len(matrices):
        run.warning('%d matri%s been loaded: "%s".' % \
                            (len(matrices), 'ces have' if len(matrices) > 1 else 'x has', ', '.join(list(matrices.keys()))),
                    header='%s substitution scoring matrices' % engine, lc="green")

    return matrices
def convert_view_data_into_json(self):
    '''This function's name must change to something more meaningful.'''

    # optional 'additional layers' come from a TAB-delimited file; when absent,
    # the dict stays None but the header list stays empty, so the step (6)
    # comprehension below never touches the None dict
    additional_layers_dict, additional_layer_headers = None, []
    if self.additional_layers_path:
        additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(self.additional_layers_path)
        additional_layer_headers = utils.get_columns_of_TAB_delim_file(self.additional_layers_path)

    for view in self.views:
        # here we will populate runinfo['views'] with json objects.
        view_dict = self.views[view]['dict']
        view_headers = self.views[view]['header']

        json_object = []

        # NOTE: the numbered steps below for the header row and the per-split
        # rows must stay in lockstep — both sides append columns in the same
        # (1)..(8) order.

        # (1) set the header line with the first entry:
        json_header = ['contigs']

        # (2) add taxonomy, if exitsts:
        if len(self.splits_taxonomy_dict):
            json_header.extend(['taxonomy'])

        # (3) then add split summaries from contigs db, if exists
        if len(self.genes_in_splits_summary_dict):
            json_header.extend(self.genes_in_splits_summary_headers[1:])

        # (4) then add length and GC content
        basic_info_headers = ['length', 'gc_content']
        json_header.extend(basic_info_headers)

        # (5) then add the view!
        json_header.extend(view_headers)

        # (6) then add 'additional' headers as the outer ring:
        if additional_layer_headers:
            json_header.extend(additional_layer_headers)

        # (7) finally add hmm search results
        if self.hmm_searches_header:
            json_header.extend([tpl[0] for tpl in self.hmm_searches_header])

        # (8) and finalize it (yay):
        json_object.append(json_header)

        for split_name in view_dict:
            # (1)
            json_entry = [split_name]

            # (2)
            if self.splits_taxonomy_dict:
                json_entry.extend([self.splits_taxonomy_dict[split_name]['t_species']])

            # (3)
            if self.genes_in_splits_summary_dict:
                json_entry.extend([self.genes_in_splits_summary_dict[split_name][header] for header in self.genes_in_splits_summary_headers[1:]])

            # (4)
            json_entry.extend([self.splits_basic_info[split_name][header] for header in basic_info_headers])

            # (5) adding essential data for the view
            json_entry.extend([view_dict[split_name][header] for header in view_headers])

            # (6) adding additional layers (None for splits absent from the file)
            json_entry.extend([additional_layers_dict[split_name][header] if additional_layers_dict.has_key(split_name) else None for header in additional_layer_headers])

            # (7) adding hmm stuff: either the raw per-split entries, or just
            # their counts, depending on the split_hmm_layers flag
            if self.hmm_searches_dict:
                if self.split_hmm_layers:
                    json_entry.extend([self.hmm_searches_dict[split_name][header] if self.hmm_searches_dict.has_key(split_name) else None for header in [tpl[0] for tpl in self.hmm_searches_header]])
                else:
                    json_entry.extend([len(self.hmm_searches_dict[split_name][header]) if self.hmm_searches_dict.has_key(split_name) else 0 for header in [tpl[1] for tpl in self.hmm_searches_header]])

            # (8) send it along!
            json_object.append(json_entry)

        # the view entry is replaced in place by its JSON-ready table
        self.views[view] = json_object
def load_from_anvio_files(self, args):
    # Load a full anvi'o run from a profile database plus a contigs database,
    # populating views, clusterings, states, collections and the display title.
    if not self.contigs_db_path:
        raise ConfigError, "Anvi'o needs the contigs database to make sense of this run."

    ProfileSuperclass.__init__(self, args)

    # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
    # all the split sequences since only now we know the mun_contig_length that was used to profile
    # this stuff
    self.init_split_sequences(self.p_meta['min_contig_length'])

    self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

    self.p_meta['self_path'] = self.profile_db_path
    self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # load views from the profile database
    self.load_views()
    self.default_view = self.p_meta['default_view']

    # if the user wants to see available views, show them and exit.
    if self.show_views:
        run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
        for view in self.views:
            run.info(view,
                     'Via "%s" table' % self.views[view]['table_name'],
                     lc='crimson',
                     mc='green' if view == self.default_view else 'crimson')
        print
        sys.exit()

    # same deal for states: list them and bail out.
    if self.show_states:
        run.warning('', header = 'Available states (%d)' % len(self.states_table.states), lc = 'green')
        for state in self.states_table.states:
            run.info(state,
                     'Last modified %s' % self.states_table.states[state]['last_modified'],
                     lc='crimson',
                     mc='crimson')
        print
        sys.exit()

    # if the user has an additional view data, load it up into the self.views dict.
    if self.additional_view_path:
        filesnpaths.is_file_tab_delimited(self.additional_view_path)
        additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

        if not additional_view_columns[-1] == '__parent__':
            raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                parent information for each split."

        # first column is the split name (str), last is '__parent__' (str),
        # everything in between is numerical view data (float)
        column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

        self.views['user_view'] = {'table_name': 'NA',
                                   'header': additional_view_columns,
                                   'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

    # if the user specifies a view, set it as default:
    if self.view:
        if not self.view in self.views:
            raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                available views by running this program with --show-views flag." % self.view

        self.default_view = self.view

    self.p_meta['clusterings'] = self.clusterings

    # an extra newick tree given on the command line either becomes the only
    # clustering (when the db has none) or is added alongside existing ones
    if self.tree:
        entry_id = os.path.basename(self.tree).split('.')[0]
        if not self.p_meta['clusterings']:
            self.p_meta['default_clustering'] = entry_id
            self.p_meta['available_clusterings'] = [entry_id]
            self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}}
            run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
        else:
            self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()}
            run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

    # set title (fall back to a prettified sample id)
    if self.title:
        self.title = self.title
    else:
        self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

    # do we have auxiliary data available?
    if not self.auxiliary_data_available:
        summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))

        self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                          certain operations (i.e., the inspect menu in the interactive interface will\
                          not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                          a SUMMARY.cp file in your work directory, which means you are working with an\
                          outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                          by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

    if self.state:
        if not self.state in self.states_table.states:
            raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                available states by running this program with --show-states flag." % self.state
def load_from_user_files(self, args):
    # Ad hoc ('--manual-mode') loading: no contigs database allowed; a FASTA
    # (-f), a view data file (-d) and a tree (-t) are required, plus a profile
    # database path that will be created empty if it does not exist yet.
    if self.contigs_db_path:
        raise ConfigError, "When you want to use the interactive interface in an ad hoc manner, you must\
                            not use a contigs database."

    if not self.profile_db_path:
        raise ConfigError, "Even when you want to use the interactive interface in an ad hoc manner by\
                            using the '--manual-mode' flag, you still need to declare a profile database.\
                            The profile database in this mode only used to read or store the 'state' of\
                            the display for visualization purposes. You DO NOT need to point to an already\
                            existing database, as anvi'o will generate an empty one for your if there is no\
                            profile database."

    if (not self.fasta_file) or (not self.view_data_path) or (not self.tree):
        raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                            each of '-f', '-d', and '-t' parameters. Please see the help menu for more info."

    # views/states only exist for real profile runs:
    if self.view:
        raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                            in manual mode"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show in manual mode :/"

    if self.show_states:
        raise ConfigError, "Sorry, there are no states to show in manual mode :/"

    view_data_path = os.path.abspath(self.view_data_path)
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    # a single mock view/clustering backed by the user files:
    self.p_meta['default_view'] = 'single'
    self.p_meta['default_clustering'] = 'default'
    self.p_meta['available_clusterings'] = ['default']
    self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

    self.default_view = self.p_meta['default_view']

    # sanity of the view data: TAB-delimited, with 'contig' as the first header
    filesnpaths.is_file_tab_delimited(view_data_path)
    view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
    if not view_data_columns[0] == "contig":
        raise ConfigError, "The first row of the first column of the view data file must\
                            say 'contig', which is not the case for your view data file\
                            ('%s'). Please make sure this is a properly formatted view data\
                            file." % (view_data_path)

    # load view data as the default view:
    self.views[self.default_view] = {'header': view_data_columns[1:],
                                     'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
    self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

    # setup a mock splits_basic_info dict
    self.splits_basic_info = {}
    for split_id in self.split_names_ordered:
        self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                            'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # create a new, empty profile database for ad hoc operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile',
                           'merged': True,
                           'contigs_db_hash': None,
                           'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): no-op; presumably a placeholder mirroring sibling loaders
    # that derive a default title in the else branch
    if self.title:
        self.title = self.title
def load_from_profile_database(self, args): if self.p_meta['version'] != anvio.__profile__version__: raise ConfigError, "The profile database has a version number that differs from the version that is valid\ for this codebase (the profile database is at '%s', and the codebase is at '%s'). Very\ unfortunately, you need to re-profile and re-merge this project using the current anvi'o :(" self.p_meta['self_path'] = self.profile_db_path self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path)) # create an instance of states table self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__) # load views from the profile database self.load_views() self.default_view = self.p_meta['default_view'] # if the user wants to see available views, show them and exit. if self.show_views: run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green') for view in self.views: run.info(view, 'Via "%s" table' % self.views[view]['table_name'], lc='crimson', mc='green' if view == self.default_view else 'crimson') print sys.exit() # if the user has an additional view data, load it up into the self.views dict. if self.additional_view_path: filesnpaths.is_file_tab_delimited(self.additional_view_path) additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path) if not additional_view_columns[-1] == '__parent__': raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\ parent information for each split." column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str] self.views['user_view'] = {'table_name': 'NA', 'header': additional_view_columns, 'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)} # if the user specifies a view, set it as default: if self.view: if not self.view in self.views: raise ConfigError, "The requested view ('%s') is not available for this run. 
Please see\ available views by running this program with --show-views flag." % self.view self.default_view = self.view self.p_meta['clusterings'] = self.clusterings if self.tree: entry_id = os.path.basename(self.tree).split('.')[0] if not self.p_meta['clusterings']: self.p_meta['default_clustering'] = entry_id self.p_meta['available_clusterings'] = [entry_id] self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}} run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id) else: self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()} run.info('Additional Tree', "'%s' has been added to available trees." % entry_id) # is summary being overwritten? if self.summary_index: run.info('Warning', "The default summary index in RUNINFO is being overriden by '%s'." % self.summary_index) self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index) if os.path.exists(self.P('SUMMARY.cp')): self.splits_summary_index = dictio.read_serialized_object(self.P('SUMMARY.cp')) else: self.splits_summary_index = None run.warning("SUMMARY.cp is missing for your run. Anvi'o will continue working (well, at least\ it will attempt to do it), but things may behave badly with the absence of\ SUMMARY.cp (first and foremost, you will not be able to inspect individual\ contigs through any of the interactive interfaces). Please investigate it\ if you were not expecting this.") # set title if self.title: self.title = self.title + ' (%s)' % self.default_view else: self.title = self.p_meta['sample_id'] + ' (%s)' % self.default_view
def load_manual_mode(self, args):
    # Manual-mode loading: only a newick tree is mandatory; view data and a
    # FASTA of splits are optional. A profile database path is required, but
    # an empty one is created if it does not exist.
    if self.contigs_db_path:
        raise ConfigError, "When you want to use the interactive interface in manual mode, you must\
                            not use a contigs database."

    if not self.profile_db_path:
        raise ConfigError, "Even when you want to use the interactive interface in manual mode, you need\
                            to declare a profile database. The profile database in this mode only used to\
                            read or store the 'state' of the display for visualization purposes. You DO\
                            NOT need to point to an already existing database, as anvi'o will generate\
                            an empty one for your if there is no profile database."

    if not self.tree:
        raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                            at least the tree file. Please see the documentation for help."

    # views/states only exist for real profile runs:
    if self.view:
        raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                            in manual mode"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show in manual mode :/"

    if self.show_states:
        raise ConfigError, "Sorry, there are no states to show in manual mode :/"

    filesnpaths.is_file_exists(self.tree)
    tree = filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    # clustering id carries the tree file's name with unknown distance/linkage:
    clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
    self.p_meta['default_clustering'] = clustering_id
    self.p_meta['available_clusterings'] = [clustering_id]
    self.p_meta['clusterings'] = {clustering_id: {'newick': ''.join([l.strip() for l in open(os.path.abspath(self.tree)).readlines()])}}

    self.default_view = self.p_meta['default_view']

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError, "The first row of the first column of the view data file must\
                                say 'contig', which is not the case for your view data file\
                                ('%s'). Please make sure this is a properly formatted view data\
                                file." % (view_data_path)

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    else:
        # no view data is provided... it is only the tree we have. we will creaet a mock 'view data dict'
        # here using what is in the tree.
        names_in_the_tree = [n.name for n in tree.get_leaves()]

        ad_hoc_dict = {}
        for item in names_in_the_tree:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {'header': ['names'],
                                         'dict': ad_hoc_dict}

    self.split_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # every name in the view data must have a sequence in the FASTA
        names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError, 'Some of the names in your view data does not have corresponding entries in the\
                                FASTA file you provided. Here is an example to one of those %d names that occur\
                                in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop())

        # setup a mock splits_basic_info dict
        for split_id in self.split_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile',
                           'merged': True,
                           'contigs_db_hash': None,
                           'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

    # NOTE(review): no-op; presumably a placeholder mirroring sibling loaders
    # that derive a default title in the else branch
    if self.title:
        self.title = self.title
def use_external_gene_calls_to_populate_genes_in_contigs_table(
        self,
        input_file_path,
        gene_calls_dict=None,
        ignore_internal_stop_codons=False,
        skip_predict_frame=False,
        skip_amino_acid_sequences=False):
    """Add genes to the contigs database.

    Primary input is either an `input_file_path` for external gene calls, or an
    external `gene_calls_dict` dictionary object.

    Parameters
    ==========
    input_file_path : str
        Path to file with one of the following structures.

        Option 1:

            gene_callers_id contig          start  stop  direction  partial  call_type  source    version
            0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3
            1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3

        Option 2:

            gene_callers_id contig          start  stop  direction  partial  call_type  source    version  aa_sequence
            0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3   MSKKIYFTEYSKVNRLQTISNFTGSA
            1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3   MVNVDYHGLIAGAGSGKTKVLTSRIAHIIK

    gene_calls_dict : dict, None
        Alternative to `input_file_path`. If provided, entries will be APPENDED to the database.
        So you need to make sure gene caller ids in your dict does not overlap with the ones in
        the database. Should look like:

            {
                "1": {
                    "contig": "contig_name",
                    "start": 20,
                    "stop": 1544,
                    "direction": "f",
                    "partial": 0,
                    "call_type": 1,
                    "source": "source_name",
                    "version": "unknown",
                    "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"
                },

                "2": {
                  (...)
                },

            (...)
            }

        All entries are required except "aa_sequence", which is optional. If provided, it should
        be present for ALL entries, even if it is an empty string. It's presence will be used to
        populate `gene_amino_acid_sequences`.

    ignore_internal_stop_codons : bool, False
        If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
        this is suppressed and the stop codon is replaced with the character `X`.

    skip_predict_frame : bool, False
        If True, ConfigError will be raised if a gene is not divisible by 3. If False, anvi'o
        predicts the most likley open reading frame and trims the start/stop of the gene call to
        reflect this change so that the gene *is* divisible by 3. This flag allows the retention
        of amino acid sequences even if genes are not divisible by 3, or when it is flagged as
        partial.

    skip_amino_acid_sequences : bool, False
        Should the gene_amino_acid_sequences table be populated? This may be useful if genes
        that are not translated are being added, such as ribosomal RNA genes, etc.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    append_to_the_db = False

    gene_calls_found = False

    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError(
                "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                 but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        # enforce the all-or-nothing 'aa_sequence' rule documented above
        has_aa_seq = lambda x: True if 'aa_sequence' in x else False
        num_with_aa_seqs = sum([
            has_aa_seq(gene_call)
            for gene_call in gene_calls_dict.values()
        ])
        num_gene_calls = len(gene_calls_dict)
        if num_with_aa_seqs != 0 and num_with_aa_seqs != num_gene_calls:
            raise ConfigError(
                "The gene_calls_dict passed to use_external_gene_calls_to_populate_genes_in_contigs_table "
                "has %d entries with 'aa_sequence' and %d without. Either 0 or all (%d) should have "
                "'aa_sequence'" % (num_with_aa_seqs, num_gene_calls - num_with_aa_seqs, num_gene_calls))

        if not len(gene_calls_dict):
            # but it is empty ... silly user.
            self.run.info_single(
                "'Use external gene calls' function found an empty gene calls dict, returning "
                "prematurely and assuming you know what's up. If you don't, stop here and try to "
                "identify what decisions you've made might have led you to this weird point your "
                "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                "done great so far.",
                nl_before=1,
                nl_after=1)
            return

    # exactly one of the two input channels must be used
    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError(
            "You must provide either an input file, or an gene calls dict to process external "
            "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
            "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        # no dict was given: read the gene calls from the input file instead
        expected_fields = t.genes_in_contigs_table_structure
        column_mapping = [int, str, int, int, str, int, int, str, str]

        if 'aa_sequence' in utils.get_columns_of_TAB_delim_file(
                input_file_path):
            expected_fields = t.genes_in_contigs_table_structure + [
                'aa_sequence'
            ]
            column_mapping.append(
                lambda x: '' if x is None else str(x))  # str(None) is 'None', amazingly

        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_path,
            expected_fields=expected_fields,
            only_expected_fields=True,
            column_mapping=column_mapping)

        if not len(gene_calls_dict):
            raise ConfigError(
                "You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                "this.")

        self.run.info(
            "External gene calls",
            "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences or create a blank dictionary
    if skip_amino_acid_sequences:
        amino_acid_sequences = dict([(g, '') for g in gene_calls_dict])
    else:
        gene_calls_dict, amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            gene_calls_dict,
            ignore_internal_stop_codons=ignore_internal_stop_codons,
            skip_predict_frame=skip_predict_frame,
        )

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict,
                                         amino_acid_sequences,
                                         append_to_the_db=append_to_the_db)
def predict_from_TAB_delimited_file(self, file_path):
    """Load a TAB-delimited file and run `self.predict` on its contents.

    The first column is read as str (identifiers), every remaining column
    as float.
    """
    column_names = utils.get_columns_of_TAB_delim_file(file_path)
    mapping = [str] + [float] * len(column_names)
    entries = utils.get_TAB_delimited_file_as_dictionary(file_path, column_mapping=mapping)
    return self.predict(entries)
def parse_pubs_txt(self):
    """Parse the publications file into `self.pubs_dict`, keyed by year.

    Reads the optional info file at `self.pubs_info_file_path` into `self.info`,
    then parses `self.pubs_file_path` (TAB-delimited, indexed by 'doi'). Author
    strings are expected as 'Last, First; Last, First; ...'; a trailing '*' on
    the first-name part marks a co-first author, a trailing '+' a co-senior
    author.

    Raises
    ======
    ConfigError
        If any mandatory column is absent from the publications file.
    """
    # optional side file with extra per-publication info
    if os.path.exists(self.pubs_info_file_path):
        self.info = u.get_TAB_delimited_file_as_dictionary(
            self.pubs_info_file_path)

    pubs_header = u.get_columns_of_TAB_delim_file(
        self.pubs_file_path, include_first_column=True)
    headers_expected = [
        'Authors', 'Title', 'Publication', 'Volume', 'Number', 'Pages', 'Year',
        'doi'
    ]
    # BUGFIX: report the mandatory headers that are *absent from the file*.
    # The previous comprehension listed the file's extra headers instead
    # (`h for h in pubs_header if h not in headers_expected`), so a file
    # missing a mandatory column sailed through unnoticed.
    missing_headers = [h for h in headers_expected if h not in pubs_header]
    if len(missing_headers):
        raise ConfigError(
            "Sorry, the pubs.txt seems to be missing some of the headers that are mandatory. Each of \
             the columns in the following list must be present in this file: %s (hint: yours do not have\
             the following: %s)." %
            (', '.join(headers_expected), ', '.join(missing_headers)))

    self.pubs_txt = u.get_TAB_delimited_file_as_dictionary(
        self.pubs_file_path, indexing_field=pubs_header.index('doi'))

    for doi in self.pubs_txt:
        authors = []
        co_first_authors = []
        co_senior_authors = []
        p = self.pubs_txt[doi]

        for author in [_.strip() for _ in p['Authors'].split(';')]:
            if not len(author):
                continue

            # 'Last, First' -> 'Last FM' (initials only for the first names)
            author_last_name, author_first_name_raw = [
                _.strip() for _ in author.split(',')
            ]
            author_first_name = ''.join(
                [n[0] for n in author_first_name_raw.split()])
            author_final_name = '%s %s' % (author_last_name,
                                           author_first_name)

            # trailing markers on the raw first name flag authorship roles
            if author_first_name_raw.endswith('*'):
                co_first_authors.append(author_final_name)
            elif author_first_name_raw.endswith('+'):
                co_senior_authors.append(author_final_name)

            authors.append(author_final_name)

        # issue string: 'Volume(Number):Pages' or 'Volume:Pages' when no Number
        if p['Number']:
            issue = '%s(%s):%s' % (p['Volume'], p['Number'], p['Pages'])
        else:
            issue = '%s:%s' % (p['Volume'], p['Pages'])

        year = p['Year'].strip()
        pub_entry = {
            'authors': authors,
            'title': p['Title'],
            'journal': p['Publication'],
            'issue': issue,
            'doi': doi,
            'year': year,
            'co_first_authors': co_first_authors,
            'co_senior_authors': co_senior_authors
        }

        if year not in self.pubs_dict:
            self.pubs_dict[year] = [pub_entry]
        else:
            self.pubs_dict[year].append(pub_entry)
def convert_view_data_into_json(self):
    '''This function's name must change to something more meaningful.

       Converts every entry in `self.views` from its {'header': [...], 'dict': {...}}
       form into a flat JSON-style table (a list of rows, the first row being the
       header). Columns are assembled ring by ring for the interactive display:
       (1) item name, (2) taxonomy, (3) gene summaries from the contigs db,
       (4) length/GC content, (5) the view data itself, (6) additional layers,
       (7) HMM hits, (8) done. NOTE: this REPLACES the original dicts in
       self.views in place.
    '''
    additional_layers_dict, additional_layers_headers = self.additional_layers_dict, self.additional_layers_headers
    if self.additional_layers_path:
        # user-provided additional layers file is merged on top of whatever
        # additional layers we already have in memory
        additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(self.additional_layers_path, dict_to_append=additional_layers_dict, assign_none_for_missing=True)
        additional_layers_headers = additional_layers_headers + utils.get_columns_of_TAB_delim_file(self.additional_layers_path)

    for view in self.views:
        # here we will populate runinfo['views'] with json objects.
        view_dict = self.views[view]['dict']
        view_headers = self.views[view]['header']

        json_object = []

        # (1) set the header line with the first entry:
        json_header = ['contigs']

        # (2) add taxonomy, if exists:
        if len(self.splits_taxonomy_dict):
            json_header.extend(['taxonomy'])

        # (3) then add split summaries from contigs db, if exists
        if len(self.genes_in_splits_summary_dict):
            json_header.extend(self.genes_in_splits_summary_headers[1:])

        # (4) then add length and GC content IF we have sequences available
        # NOTE: basic_info_headers is intentionally defined only here; the per-row
        # code below reuses it under the same `self.splits_basic_info` guard.
        if self.splits_basic_info:
            basic_info_headers = ['length', 'gc_content']
            json_header.extend(basic_info_headers)

        # (5) then add the view!
        json_header.extend(view_headers)

        # (6) then add 'additional' headers as the outer ring:
        if additional_layers_headers:
            json_header.extend(additional_layers_headers)

        # (7) finally add hmm search results
        if self.hmm_searches_header:
            json_header.extend([tpl[0] for tpl in self.hmm_searches_header])

        # (8) and finalize it (yay):
        json_object.append(json_header)

        for split_name in view_dict:
            # (1)
            json_entry = [split_name]

            # (2)
            if self.splits_taxonomy_dict:
                if split_name in self.splits_taxonomy_dict:
                    json_entry.extend([self.splits_taxonomy_dict[split_name]])
                else:
                    json_entry.extend([None])

            # (3)
            if self.genes_in_splits_summary_dict:
                json_entry.extend([self.genes_in_splits_summary_dict[split_name][header] for header in self.genes_in_splits_summary_headers[1:]])

            # (4)
            if self.splits_basic_info:
                json_entry.extend([self.splits_basic_info[split_name][header] for header in basic_info_headers])

            # (5) adding essential data for the view
            json_entry.extend([view_dict[split_name][header] for header in view_headers])

            # (6) adding additional layers
            json_entry.extend([additional_layers_dict[split_name][header] if split_name in additional_layers_dict else None for header in additional_layers_headers])

            # (7) adding hmm stuff
            # NOTE(review): the two branches index hmm_searches_header tuples
            # differently (tpl[0] vs tpl[1]) — presumably raw values vs hit
            # counts keyed by a different tuple slot; confirm against the code
            # that builds self.hmm_searches_header before touching this.
            if self.hmm_searches_dict:
                if self.split_hmm_layers:
                    json_entry.extend([self.hmm_searches_dict[split_name][header] if split_name in self.hmm_searches_dict else None for header in [tpl[0] for tpl in self.hmm_searches_header]])
                else:
                    json_entry.extend([len(self.hmm_searches_dict[split_name][header]) if split_name in self.hmm_searches_dict else 0 for header in [tpl[1] for tpl in self.hmm_searches_header]])

            # (8) send it along!
            json_object.append(json_entry)

        self.views[view] = json_object
def load_full_mode(self, args):
    """Initialize the interactive display for a full anvi'o run.

       Requires both a contigs database and a profile database. Populates
       self.p_meta, self.views, self.states_table, collections, clusterings,
       and the display title, honoring --show-views / --show-states (which
       print and exit), an optional additional view file, an optional
       additional tree, and --state-autoload.
    """
    if not self.contigs_db_path:
        raise ConfigError, "Anvi'o needs the contigs database to make sense of this run (or maybe you\
                            should use the `--manual` flag if that's what your intention)."

    if not self.profile_db_path:
        raise ConfigError, "So you want to run anvi'o in full mode, but without a profile database?\
                            Well. This does not make any sense."

    if not args.skip_init_functions:
        self.init_functions()

    ProfileSuperclass.__init__(self, args)

    # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
    # all the split sequences since only now we know the min_contig_length that was used to profile
    # this stuff
    self.init_split_sequences(self.p_meta['min_contig_length'])

    self.collections.populate_collections_dict(self.profile_db_path)

    self.p_meta['self_path'] = self.profile_db_path
    self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path)

    # load views from the profile database; a 'blank' profile gets a single
    # all-zeros mock view instead
    if self.p_meta['blank']:
        blank_dict = {}
        for split_name in self.splits_basic_info:
            blank_dict[split_name] = {'blank_view': 0}

        self.views['blank_view'] = {'header': ['blank_view'], 'dict': blank_dict}
        self.default_view = 'blank_view'
    else:
        self.load_views()
        self.default_view = self.p_meta['default_view']

    # if the user wants to see available views, show them and exit.
    if self.show_views:
        run.warning('', header='Available views (%d)' % len(self.views), lc='green')
        for view in self.views:
            run.info(view, 'Via "%s" table' % self.views[view]['table_name'], lc='crimson', mc='green' if view == self.default_view else 'crimson')
        print
        sys.exit()

    if self.show_states:
        run.warning('', header='Available states (%d)' % len(self.states_table.states), lc='green')
        for state in self.states_table.states:
            run.info(state, 'Last modified %s' % self.states_table.states[state]['last_modified'], lc='crimson', mc='crimson')
        print
        sys.exit()

    # if the user has an additional view data, load it up into the self.views dict.
    if self.additional_view_path:
        filesnpaths.is_file_tab_delimited(self.additional_view_path)
        additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

        if not additional_view_columns[-1] == '__parent__':
            raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                parent information for each split."

        # first column is the item name (str), last is '__parent__' (str),
        # everything in between is numerical layer data (float)
        column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]
        self.views['user_view'] = {'table_name': 'NA',
                                   'header': additional_view_columns,
                                   'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping=column_mapping)}

    # if the user specifies a view, set it as default:
    if self.view:
        if not self.view in self.views:
            raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                available views by running this program with --show-views flag." % self.view

        self.default_view = self.view

    self.p_meta['clusterings'] = self.clusterings

    # an additional tree given on the command line either becomes the default
    # clustering (if there were none) or is added to the available ones
    if self.tree:
        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
        if not self.p_meta['clusterings']:
            self.p_meta['default_clustering'] = clustering_id
            self.p_meta['available_clusterings'] = [clustering_id]
            self.p_meta['clusterings'] = {clustering_id: {'newick': open(os.path.abspath(self.tree)).read()}}
            run.info('Additional Tree', "Splits will be organized based on '%s'." % clustering_id)
        else:
            self.p_meta['clusterings'][clustering_id] = {'newick': open(os.path.abspath(self.tree)).read()}
            run.info('Additional Tree', "'%s' has been added to available trees." % clustering_id)

    # set title (NOTE(review): the first branch is a no-op kept so the else
    # only fires when no title was given by the user)
    if self.title:
        self.title = self.title
    else:
        self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

    # do we have auxiliary data available?
    if not self.auxiliary_profile_data_available:
        summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
        self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                          certain operations (i.e., the inspect menu in the interactive interface will\
                          not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                          a SUMMARY.cp file in your work directory, which means you are working with an\
                          outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                          by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

    if self.state_autoload:
        if not self.state_autoload in self.states_table.states:
            raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                available states by running this program with --show-states flag." % self.state_autoload
def load_manual_mode(self, args):
    """Initialize the interactive display in manual mode (no contigs db).

       Requires a tree file and a profile database path (the latter is only
       used to store display state and may not exist yet — an empty one is
       created). View data and a FASTA file of item sequences are optional;
       without view data, a mock single-column view is built from the tree
       leaves.
    """
    if self.contigs_db_path:
        raise ConfigError, "When you want to use the interactive interface in manual mode, you must\
                            not use a contigs database."

    # if the user is using an existing profile database, we need to make sure that it is not associated
    # with a contigs database, since it would mean that it is a full anvi'o profile database and should
    # not be included in manual operations.
    if os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        if profile_db.meta['contigs_db_hash']:
            raise ConfigError, "Well. It seems the profile database is associated with a contigs database,\
                                which means using it in manual mode is not the best way to use it. Probably\
                                what you wanted to do is to let the manual mode create a new profile database\
                                for you. Simply type in a new profile database path (it can be a file name\
                                that doesn't exist)."

    if not self.profile_db_path:
        raise ConfigError, "Even when you want to use the interactive interface in manual mode, you need\
                            to declare a profile database. The profile database in this mode only used to\
                            read or store the 'state' of the display for visualization purposes. You DO\
                            NOT need to point to an already existing database, as anvi'o will generate\
                            an empty one for your if there is no profile database."

    if not self.tree:
        raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                            at least the tree file. Please see the documentation for help."

    if self.view:
        raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                            in manual mode"

    if self.show_views:
        raise ConfigError, "Sorry, there are no views to show in manual mode :/"

    if self.show_states:
        raise ConfigError, "Sorry, there are no states to show in manual mode :/"

    filesnpaths.is_file_exists(self.tree)
    tree = filesnpaths.is_proper_newick(self.tree)

    view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
    self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
    self.p_meta['output_dir'] = None
    self.p_meta['views'] = {}
    self.p_meta['merged'] = True
    self.p_meta['default_view'] = 'single'
    # the tree file name becomes the clustering id; distance/linkage are unknown
    clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
    self.p_meta['default_clustering'] = clustering_id
    self.p_meta['available_clusterings'] = [clustering_id]
    # newick is stored with all line breaks/whitespace stripped out
    self.p_meta['clusterings'] = {clustering_id: {'newick': ''.join([l.strip() for l in open(os.path.abspath(self.tree)).readlines()])}}

    self.default_view = self.p_meta['default_view']

    if self.view_data_path:
        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError, "The first row of the first column of the view data file must\
                                say 'contig', which is not the case for your view data file\
                                ('%s'). Please make sure this is a properly formatted view data\
                                file." % (view_data_path)

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
    else:
        # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
        # here using what is in the tree.
        names_in_the_tree = [n.name for n in tree.get_leaves()]

        ad_hoc_dict = {}
        for item in names_in_the_tree:
            ad_hoc_dict[item] = {'names': item}

        self.views[self.default_view] = {'header': ['names'], 'dict': ad_hoc_dict}

    self.displayed_item_names_ordered = self.views[self.default_view]['dict'].keys()

    # we assume that the sample names are the header of the view data, so we might as well set it up:
    self.p_meta['samples'] = self.views[self.default_view]['header']

    # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
    # otherwise we will leave them empty
    self.splits_basic_info = {}
    self.split_sequences = None
    if self.p_meta['splits_fasta']:
        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # every displayed item must have a sequence in the FASTA file
        names_missing_in_FASTA = set(self.displayed_item_names_ordered) - set(self.split_sequences.keys())
        num_names_missing_in_FASTA = len(names_missing_in_FASTA)
        if num_names_missing_in_FASTA:
            raise ConfigError, 'Some of the names in your view data does not have corresponding entries in the\
                                FASTA file you provided. Here is an example to one of those %d names that occur\
                                in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop())

        # setup a mock splits_basic_info dict
        for split_id in self.displayed_item_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

    # create a new, empty profile database for manual operations
    if not os.path.exists(self.profile_db_path):
        profile_db = ProfileDatabase(self.profile_db_path)
        profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

    # create an instance of states table
    self.states_table = TablesForStates(self.profile_db_path)

    # also populate collections, if there are any
    self.collections.populate_collections_dict(self.profile_db_path)

    # NOTE(review): this branch is a no-op; unlike load_full_mode there is no
    # else clause deriving a default title here
    if self.title:
        self.title = self.title
def predict_from_TAB_delimited_file(self, file_path):
    """Parse a TAB-delimited file (string key column followed by float
    data columns) into a dictionary and return `self.predict` over it."""
    cols = utils.get_columns_of_TAB_delim_file(file_path)

    mapping = [str]
    mapping.extend([float] * len(cols))

    return self.predict(utils.get_TAB_delimited_file_as_dictionary(file_path, column_mapping=mapping))
def append_dict_to_file(self, dict_to_append, file_handle):
    """This function adds a TAB-delimited dictionary to the end of the file.

       If the file is empty, it writes the header as well as adding the dictionary contents.
       Otherwise, it checks that the dictionary contains the same keys as the header and appends
       the dictionary contents to the end of the file.

       NOTE: on the append path this method mutates `self.key_header`, `self.headers`,
       and (if unset) `self.keys_order` to match what is already in the file.

       Parameters
       ==========
       dict_to_append : dictionary
           Holds the data you want to add to the end of the file. Keys should be headers of the file.
       file_handle : a file object
           Pointer to the file, opened in append mode. The calling function should take care of the
           open() and pass the handle here
    """
    # local import to avoid a circular dependency at module load time
    import anvio.utils as utils

    if is_file_empty(self.path):
        # fresh file: delegate header + body writing entirely to utils
        utils.store_dict_as_TAB_delimited_file(dict_to_append, None, headers=self.headers, file_obj=file_handle, \
                                               key_header=self.key_header, keys_order=self.keys_order, \
                                               header_item_conversion_dict=self.header_item_conversion_dict, \
                                               do_not_close_file_obj=True)
    else:
        # if dictionary is empty, just return
        if not dict_to_append:
            return

        file_headers = utils.get_columns_of_TAB_delim_file(self.path, include_first_column=True)
        # assumes all inner dicts share the same keys; we only inspect the first
        inner_dict_keys = list(dict_to_append.values())[0].keys()

        # figure out if the first column holds the keys of the outer dictionary or one of the inner dictionary keys
        if file_headers[0] in inner_dict_keys:
            self.key_header = None
            self.headers = file_headers
        else:
            self.key_header = file_headers[0]
            self.headers = file_headers[1:]

        # check that the inner dictionary has the file headers we need
        missing_headers = [h for h in self.headers if h not in inner_dict_keys]
        if len(missing_headers):
            if anvio.DEBUG:
                if len(missing_headers) > 10:
                    raise FilesNPathsError(f"Some headers from the file (n={len(missing_headers)}) are not in your dictionary :/ "
                                           f"Here are the first ten of them: {missing_headers[:10].__str__()}")
                else:
                    raise FilesNPathsError(f"Some headers from the file are not in your dictionary :/ Here they are: {missing_headers.__str__()}")
            else:
                raise FilesNPathsError("Some headers from the file are not in your dictionary :/ Use `--debug` to see where this "
                                       "error is coming from the codebase with a list of example keys that are missing.")

        # check that any requested outer dictionary keys are present
        if not self.keys_order:
            # no explicit order requested: write rows in sorted key order
            self.keys_order = sorted(dict_to_append.keys())
        else:
            missing_keys = [k for k in self.keys_order if k not in dict_to_append]
            if len(missing_keys):
                if anvio.DEBUG:
                    if len(missing_keys) > 10:
                        raise FilesNPathsError(f"Some keys (n={len(missing_keys)}) are not in your dictionary :/ Here are "
                                               f"the first ten of them: {missing_keys[:10].__str__()}")
                    else:
                        raise FilesNPathsError(f"Some keys are not in your dictionary :/ Here they are: {missing_keys.__str__()}")
                else:
                    raise FilesNPathsError("Some keys are not in your dictionary :/ Use `--debug` to see where this "
                                           "error is coming from the codebase with a list of example keys that are "
                                           "missing.")

        # dict looks okay, append it to file
        for k in self.keys_order:
            if self.key_header:  # first column is key of outer dict
                line = [str(k)]
            else:  # do not put the key of outer dict in the first column
                line = []

            for header in self.headers:
                try:
                    val = dict_to_append[k][header]
                except KeyError:
                    raise FilesNPathsError(f"Header '{header}' is not found in the dict for key '{k}':/")
                except TypeError:
                    raise FilesNPathsError("Your dictionary is not properly formatted to be exported "
                                           f"as a TAB-delimited file :/ You ask for '{header}', but it is not "
                                           "even a key in the dictionary")

                # None values are written as empty cells
                line.append(str(val) if not isinstance(val, type(None)) else '')

            if anvio.AS_MARKDOWN:
                file_handle.write(f"|{'|'.join(map(str, line))}|\n")
            else:
                file_handle.write('%s\n' % '\t'.join(line))