Python get_column_data_from_TAB_delim_fileの例、anvio.utils.get_column_data_from_TAB_delim_file Pythonの例

コード例 #1

0

ファイルを表示

    def load_genomes(self):
        """Resolve the genomes of interest and read them from the genomes storage.

        `self.genome_names_to_focus` may hold either a path to a file or a
        comma-separated string of genome names. When it is set, it is replaced
        by a list of names before the storage is opened.

        Sets `self.genomes_storage`, `self.genomes`,
        `self.external_genome_names`, `self.internal_genome_names` and
        `self.hash_to_genome_name`.
        """
        requested = self.genome_names_to_focus

        if requested:
            if filesnpaths.is_file_exists(requested, dont_raise=True):
                # the value is a file: take its first column as the list of names
                columns = utils.get_column_data_from_TAB_delim_file(
                    requested, column_indices=[0], expected_number_of_fields=1)
                self.genome_names_to_focus = columns[0]
            else:
                # otherwise treat the value as comma-separated genome names
                self.genome_names_to_focus = [
                    name.strip() for name in requested.split(',')
                ]

            self.run.warning(
                "A subset of genome names is found, and anvi'o will focus only on to those."
            )

        self.genomes_storage = GenomeStorage(
            self.genomes_storage_path,
            storage_hash=None,
            genome_names_to_focus=self.genome_names_to_focus)
        self.genomes = self.genomes_storage.get_genomes_dict()

        # split the genome names by their 'external_genome' flag in one pass
        external_names, internal_names = [], []
        for name in self.genomes:
            if self.genomes[name]['external_genome']:
                external_names.append(name)
            else:
                internal_names.append(name)
        self.external_genome_names = external_names
        self.internal_genome_names = internal_names

        # reverse lookup: genome hash -> genome name
        self.hash_to_genome_name = {}
        for name in self.genomes:
            genome_hash = self.genomes[name]['genome_hash']
            self.hash_to_genome_name[genome_hash] = name

コード例 #2

0

ファイルを表示

ファイル: genomedescriptions.py プロジェクト: isaacfink21/anvio

    def names_check(self):
        """Make sure the names listed in the metagenomes file are unique.

        Reads the first column of `self.input_file_for_metagenomes` and skips
        its first entry (presumably the header line -- confirm against the
        file format).

        Raises:
            ConfigError: if any name occurs more than once.
        """
        names = utils.get_column_data_from_TAB_delim_file(
            self.input_file_for_metagenomes, [0])[0][1:]

        # a set collapses duplicates, so a size mismatch means a repeated name
        if len(names) != len(set(names)):
            raise ConfigError(
                "Each entry in your metagenomes file must be unique :/")

コード例 #3

0

ファイルを表示

ファイル: __init__.py プロジェクト: yinx843/anvio

def get(engine, run=run):
    """Load the substitution scoring matrices (SSMs) available for `engine`.

    Matrices are read from the files ending with '.txt' inside the directory
    named after `engine` that sits next to this module.

    Parameters
    ----------
    engine : str
        Must be listed in `engines`. The expected matrix alphabet is known
        for 'AA', 'NT' and 'CDN'.
    run : object
        Its `warning` method is used to report which matrices were loaded.

    Returns
    -------
    dict
        Maps each matrix id (file name up to '.txt') to the value returned by
        `u.get_TAB_delimited_file_as_dictionary` for that file. Empty when no
        matrix file is found.

    Raises
    ------
    ConfigError
        If `engine` is unknown, if a matrix does not have identical row and
        column names, if it contains items outside the engine's alphabet, or
        if the engine has no known alphabet.
    """
    data = {}

    if engine not in engines:
        raise ConfigError(
            "Anvi'o was about to populate the SSMs, but it does not know about the engine '%s'."
            % engine)

    # The alphabet depends only on the engine, so it is resolved once here
    # rather than once per matrix.
    if engine == 'AA':
        expected_items = set(constants.amino_acids)
    elif engine == 'NT':
        expected_items = set(constants.nucleotides)
    elif engine == 'CDN':
        expected_items = set(constants.codons)
    else:
        # an engine listed in `engines` that this function has no alphabet for;
        # reported below as a ConfigError instead of an unbound local
        expected_items = None

    dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), engine)
    # sorted so that matrices are loaded in the same order on every file system
    substitution_matrix_paths = sorted(
        s for s in glob.glob(os.path.join(dir_path, '*')) if s.endswith('.txt'))

    for matrix_path in substitution_matrix_paths:
        matrix_id = os.path.basename(matrix_path).split('.txt')[0]

        # row names: first column without its first entry; column names:
        # header without the first column
        matrix_rows = u.get_column_data_from_TAB_delim_file(
            matrix_path, column_indices=[0])[0][1:]
        matrix_columns = u.get_columns_of_TAB_delim_file(
            matrix_path, include_first_column=False)

        if sorted(matrix_columns) != sorted(matrix_rows):
            raise ConfigError("Anvi'o found a substitution scoring matrix named '%s'. However, it doesn't look like "
                              "a nicely done matrix. Substitution scoring matrices must contain the same row and column "
                              "names (i.e., a square matrix that is equal to its transpose). Well. This one does not :/" \
                                                    % (os.path.basename(matrix_path)))

        if expected_items is None:
            raise ConfigError(
                "Anvi'o does not know which items a substitution scoring matrix for the engine '%s' "
                "is supposed to describe." % engine)

        unexpected_items_in_matrix = [
            item for item in matrix_columns if item not in expected_items
        ]
        if unexpected_items_in_matrix:
            # the alphabet is sorted so the message is stable between runs
            raise ConfigError("It seems you have a poorly done substitution scoring matrix named '%s' in the data directory. "
                              "Anvi'o expects an %s substitution matrix to describe one or more of these %d guys: '%s'. But "
                              "the matrix %s had stuff anvi'o is not familiar with: '%s'." % \
                                            (matrix_id, engine, len(expected_items), ', '.join(sorted(expected_items)),
                                             matrix_id, ', '.join(unexpected_items_in_matrix)))

        # One converter per column actually present in the file: `str` for the
        # row-name column, `float` for every score column. A matrix may cover
        # only part of the alphabet, so the mapping follows the file's columns
        # rather than the size of the full alphabet.
        matrix_data = u.get_TAB_delimited_file_as_dictionary(
            matrix_path, column_mapping=[str] + [float] * len(matrix_columns))
        data[matrix_id] = matrix_data

    if data:
        run.warning('%d matri%s been loaded: "%s".' % \
                                    (len(data),
                                     'ces have' if len(data) > 1 else 'x has',
                                     ', '.join(data)),
                    header='%s substitution scoring matrices' % engine,
                    lc="green")

    return data

コード例 #4

0

ファイルを表示

    def init_genomes_data_storage(self):
        """Open an existing genomes storage and read the genomes of interest from it.

        Reads 'genomes_storage' and 'genome_names' from `self.args` (missing
        entries become None). Raises ConfigError when no storage path is given.
        """

        known_args = self.args.__dict__
        self.storage_path = known_args.get('genomes_storage')
        self.genome_names_to_focus = known_args.get('genome_names')

        if not self.storage_path:
            raise ConfigError(
                "Anvi'o genomes storage is speaking. Someone called the init function,\
                               yet there is nothing to initialize since genome storage path variable\
                               (args.genomes_storage) is None. If you are an end user, please make sure\
                               you provide the genomes storage paramater to whatever program you were\
                               running. If you are a developer, you probably already figured what is\
                               wrong. If you are a cat, you need to send us an e-mail immediately."
            )

        # the genome names to focus on can be a file, or a comma-separated string
        requested = self.genome_names_to_focus
        if requested:
            if filesnpaths.is_file_exists(requested, dont_raise=True):
                columns = utils.get_column_data_from_TAB_delim_file(
                    requested, column_indices=[0], expected_number_of_fields=1)
                self.genome_names_to_focus = columns[0]
            else:
                self.genome_names_to_focus = [
                    name.strip() for name in requested.split(',')
                ]

            self.run.warning(
                "A subset of genome names is found, and anvi'o will focus only on to those."
            )

        filesnpaths.is_proper_genomes_storage_file(self.storage_path)

        storage = auxiliarydataops.GenomesDataStorage(
            self.storage_path,
            db_hash=None,
            genome_names_to_focus=self.genome_names_to_focus,
            ignore_hash=True)
        self.genomes_storage = storage
        self.genomes_storage_hash = self.genomes_storage.get_storage_hash()

        self.genomes = self.genomes_storage.get_genomes_dict()

        # partition genome names by their 'external_genome' flag
        self.external_genome_names = [
            name for name in self.genomes
            if self.genomes[name]['external_genome']
        ]
        self.internal_genome_names = [
            name for name in self.genomes
            if not self.genomes[name]['external_genome']
        ]

        # `self.hash_to_genome_name` is not created here; it must already exist
        for name in self.genomes:
            genome_hash = self.genomes[name]['genome_hash']
            self.hash_to_genome_name[genome_hash] = name

コード例 #5

0

ファイルを表示

ファイル: __init__.py プロジェクト: AstrobioMike/anvio

def get(engine, run=run):
    """Load the substitution scoring matrices (SSMs) available for `engine`.

    Matrices are read from the files ending with '.txt' inside the directory
    named after `engine` that sits next to this module.

    Parameters
    ----------
    engine : str
        Must be listed in `engines`. The expected matrix alphabet is known
        for 'AA', 'NT' and 'CDN'.
    run : object
        Its `warning` method is used to report which matrices were loaded.

    Returns
    -------
    dict
        Maps each matrix id (file name up to '.txt') to the value returned by
        `u.get_TAB_delimited_file_as_dictionary` for that file. Empty when no
        matrix file is found.

    Raises
    ------
    ConfigError
        If `engine` is unknown, if a matrix does not have identical row and
        column names, if it contains items outside the engine's alphabet, or
        if the engine has no known alphabet.
    """
    data = {}

    if engine not in engines:
        raise ConfigError("Anvi'o was about to populate the SSMs, but it does not know about the engine '%s'." % engine)

    # The alphabet depends only on the engine, so it is resolved once here
    # rather than once per matrix.
    if engine == 'AA':
        expected_items = set(constants.amino_acids)
    elif engine == 'NT':
        expected_items = set(constants.nucleotides)
    elif engine == 'CDN':
        expected_items = set(constants.codons)
    else:
        # an engine listed in `engines` that this function has no alphabet for;
        # reported below as a ConfigError instead of an unbound local
        expected_items = None

    dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), engine)
    # sorted so that matrices are loaded in the same order on every file system
    substitution_matrix_paths = sorted(s for s in glob.glob(os.path.join(dir_path, '*')) if s.endswith('.txt'))

    for matrix_path in substitution_matrix_paths:
        matrix_id = os.path.basename(matrix_path).split('.txt')[0]

        # row names: first column without its first entry; column names:
        # header without the first column
        matrix_rows = u.get_column_data_from_TAB_delim_file(matrix_path, column_indices=[0])[0][1:]
        matrix_columns = u.get_columns_of_TAB_delim_file(matrix_path, include_first_column=False)

        if sorted(matrix_columns) != sorted(matrix_rows):
            raise ConfigError("Anvi'o found a substitution scoring matrix named '%s'. However, it doesn't look like\
                               a nicely done matrix. Substitution scoring matrices must contain the same row and column\
                               names (i.e., a square matrix that is equal to its transpose). Well. This one does not :/" \
                                                    % (os.path.basename(matrix_path)))

        if expected_items is None:
            raise ConfigError("Anvi'o does not know which items a substitution scoring matrix for the engine '%s' "
                              "is supposed to describe." % engine)

        unexpected_items_in_matrix = [item for item in matrix_columns if item not in expected_items]
        if unexpected_items_in_matrix:
            # the alphabet is sorted so the message is stable between runs
            raise ConfigError("It seems you have a poorly done substitution scoring matrix named '%s' in the data directory.\
                               Anvi'o expects an %s substitution matrix to describe one or more of these %d guys: '%s'. But\
                               the matrix %s had stuff anvi'o is not familiar with: '%s'." % \
                                            (matrix_id, engine, len(expected_items), ', '.join(sorted(expected_items)),
                                             matrix_id, ', '.join(unexpected_items_in_matrix)))

        # One converter per column actually present in the file: `str` for the
        # row-name column, `float` for every score column. A matrix may cover
        # only part of the alphabet, so the mapping follows the file's columns
        # rather than the size of the full alphabet.
        matrix_data = u.get_TAB_delimited_file_as_dictionary(matrix_path, column_mapping=[str] + [float] * len(matrix_columns))
        data[matrix_id] = matrix_data

    if data:
        run.warning('%d matri%s been loaded: "%s".' % \
                                    (len(data),
                                     'ces have' if len(data) > 1 else 'x has',
                                     ', '.join(data)),
                    header='%s substitution scoring matrices' % engine,
                    lc="green")

    return data

コード例 #6

0

ファイルを表示

ファイル: panops.py プロジェクト: meren/anvio

    def load_genomes(self):
        """Load the genomes of interest from the genomes storage.

        `self.genome_names_to_focus` is either a file path or a string of
        comma-separated genome names; when set it is turned into a list of
        names, which is then handed to `GenomeStorage`.
        """
        focus = self.genome_names_to_focus

        if focus:
            focus_is_a_file = filesnpaths.is_file_exists(focus, dont_raise=True)

            if focus_is_a_file:
                first_column = utils.get_column_data_from_TAB_delim_file(
                    focus,
                    column_indices=[0],
                    expected_number_of_fields=1,
                )[0]
                self.genome_names_to_focus = first_column
            else:
                self.genome_names_to_focus = [part.strip() for part in focus.split(',')]

            self.run.warning("A subset of genome names is found, and anvi'o will focus only on to those.")

        self.genomes_storage = GenomeStorage(
            self.genomes_storage_path,
            storage_hash=None,
            genome_names_to_focus=self.genome_names_to_focus,
        )
        self.genomes = self.genomes_storage.get_genomes_dict()

        # external and internal genomes are told apart by the 'external_genome' flag
        self.external_genome_names = [
            name for name in self.genomes if self.genomes[name]['external_genome']
        ]
        self.internal_genome_names = [
            name for name in self.genomes if not self.genomes[name]['external_genome']
        ]

        # reverse lookup from genome hash to genome name
        self.hash_to_genome_name = {}
        for name in self.genomes:
            genome_hash = self.genomes[name]['genome_hash']
            self.hash_to_genome_name[genome_hash] = name

コード例 #7

0

ファイルを表示

    def load_manual_mode(self, args):
        """Prepare the interactive interface for 'manual mode'.

        Manual mode works from a tree file and/or a view data file instead of
        a contigs database. The method validates the combination of inputs,
        fills `self.p_meta` and `self.views`, optionally reads a FASTA file
        into `self.split_sequences` / `self.splits_basic_info`, and creates an
        empty profile database at `self.profile_db_path` if none exists there.

        Parameters:
            args: not used by this method itself.

        Raises:
            ConfigError: on an invalid combination of inputs (contigs database
                given, profile database path missing, profile database tied to
                a contigs database, neither tree nor view data, `--view`,
                `show_views` or `show_states` requested), or when displayed
                item names are missing from the FASTA file.
        """
        if self.contigs_db_path:
            raise ConfigError(
                "When you want to use the interactive interface in manual mode, you must\
                                not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError(
                "Even when you want to use the interactive interface in manual mode, you need\
                                to provide a profile database path. But you DO NOT need an already existing\
                                profile database, since anvi'o will generate an empty one for you. The profile\
                                database in this mode only used to read or store the 'state' of the display\
                                for visualization purposes, or to allow you to create and store collections."
            )

        # if the user is using an existing profile database, we need to make sure that it is not associated
        # with a contigs database, since it would mean that it is a full anvi'o profile database and should
        # not be included in manual operations.
        if filesnpaths.is_file_exists(self.profile_db_path, dont_raise=True):
            profile_db = ProfileDatabase(self.profile_db_path)
            if profile_db.meta['contigs_db_hash']:
                raise ConfigError(
                    "Well. It seems the profile database is associated with a contigs database,\
                                    which means using it in manual mode is not the best way to use it. Probably\
                                    what you wanted to do is to let the manual mode create a new profile database\
                                    for you. Simply type in a new profile database path (it can be a file name\
                                    that doesn't exist).")

        if not self.tree and not self.view_data_path:
            raise ConfigError(
                "You must be joking Mr. Feynman. No tree file, and no data file? What is it that\
                               anvi'o supposed to visualize? :(")

        if not self.tree:
            self.run.warning(
                "You haven't declared a tree file. Anvi'o will do its best to come up with an\
                              organization of your items.")

        if self.view:
            raise ConfigError(
                "You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError(
                "Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError(
                "Sorry, there are no states to show in manual mode :/")

        if self.tree:
            filesnpaths.is_file_exists(self.tree)
            # read the tree as a single line; the context manager makes sure the
            # file handle is closed once the text has been read
            with open(os.path.abspath(self.tree)) as tree_file:
                newick_tree_text = ''.join(line.strip() for line in tree_file)
            item_names = utils.get_names_order_from_newick_tree(
                newick_tree_text)
        else:
            # no tree: item names are the first column of the view data,
            # without its first entry
            item_names = utils.get_column_data_from_TAB_delim_file(
                self.view_data_path, column_indices=[0])[0][1:]

        # try to convert item names into integer values for proper sorting later. it's OK if it does
        # not work, but only conversion failures are tolerated -- anything else should surface.
        try:
            item_names = [int(n) for n in item_names]
        except (ValueError, TypeError):
            pass

        view_data_path = os.path.abspath(
            self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(
            self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'
        self.default_view = self.p_meta['default_view']

        # set some default organizations of data.
        # NOTE(review): the key containing '(reverse)' holds the ascending order and the plain
        # 'Alphabetical' key holds the descending one -- confirm this is intentional.
        self.p_meta['clusterings'] = {
            'Alphabetical_(reverse):none:none': {
                'basic': sorted(item_names)
            },
            'Alphabetical:none:none': {
                'basic': sorted(item_names, reverse=True)
            }
        }
        self.p_meta['available_clusterings'] = [
            'Alphabetical_(reverse):none:none', 'Alphabetical:none:none'
        ]
        self.p_meta['default_clustering'] = self.p_meta[
            'available_clusterings'][0]

        # if we have a tree, let's make arrangements for it: it becomes the default clustering
        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
                self.tree)
            self.p_meta['default_clustering'] = clustering_id
            self.p_meta['available_clusterings'].append(clustering_id)
            self.p_meta['clusterings'][clustering_id] = {
                'newick': newick_tree_text
            }

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(
                view_data_path, include_first_column=True)

            # load view data as the default view:
            self.views[self.default_view] = {
                'header': view_data_columns[1:],
                'dict':
                utils.get_TAB_delimited_file_as_dictionary(view_data_path)
            }
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            ad_hoc_dict = {}
            for item in item_names:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {
                'header': ['names'],
                'dict': ad_hoc_dict
            }

        self.displayed_item_names_ordered = list(
            self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(
                self.p_meta['splits_fasta'])

            # every displayed item must have a sequence in the FASTA file
            names_missing_in_FASTA = set(
                self.displayed_item_names_ordered) - set(
                    self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError(
                    'Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"'
                    %
                    (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.displayed_item_names_ordered:
                self.splits_basic_info[split_id] = {
                    'length':
                    len(self.split_sequences[split_id]),
                    'gc_content':
                    utils.get_GC_content_for_sequence(
                        self.split_sequences[split_id])
                }

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({
                'db_type': 'profile',
                'merged': True,
                'contigs_db_hash': None,
                'samples': ','.join(self.p_meta['samples'])
            })

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path)

        # read description from self table, if it is not available get_description function will return placeholder text
        self.p_meta['description'] = get_description_in_db(
            self.profile_db_path)

        # NOTE(review): this self-assignment looks like a no-op; kept as-is in case `title`
        # is backed by a property -- confirm and remove if not.
        if self.title:
            self.title = self.title

コード例 #8

0

ファイルを表示

ファイル: synteny.py プロジェクト: shiyi-pan/anvio

    def __init__(self,
                 args,
                 run=terminal.Run(),
                 progress=terminal.Progress(),
                 skip_sanity_check=False):
        """Read the settings from `args`, open the databases, and run `sanity_check`.

        `sanity_check` is skipped when `skip_sanity_check` is true. If
        `list_annotation_sources` is set in `args`, the available annotation
        sources are printed and the process exits.
        """

        self.args = args
        self.run = run
        self.progress = progress

        def arg(name):
            # value stored in `args` under `name`, or None when it is absent
            return args.__dict__.get(name)

        # Parse arguments
        self.annotation_source = arg('annotation_source')
        self.window_range = arg('ngram_window_range') or "2:3"
        self.is_in_unknowns_mode = arg('analyze_unknown_functions')
        self.output_file = arg('output_file')
        self.skip_init_functions = arg('skip_init_functions')
        self.genome_names_to_focus = arg('genome_names')
        self.ngram_source = arg("ngram_source")

        self.annotation_source_dict = {}

        self.pan_db_path = arg('pan_db')

        if self.annotation_source and self.pan_db_path:
            self.annotation_sources = [self.annotation_source, 'gene_clusters']

        if self.pan_db_path:
            self.pan_db = PanDatabase(self.pan_db_path)

            self.p_meta = self.pan_db.meta

            if 'creation_date' in self.p_meta:
                self.p_meta['creation_date'] = utils.get_time_to_date(
                    self.p_meta['creation_date'])
            else:
                self.p_meta['creation_date'] = 'unknown'

            # external names first, then internal ones; empty entries are dropped
            listed_names = (self.p_meta['external_genome_names'].split(',') +
                            self.p_meta['internal_genome_names'].split(','))
            self.p_meta['genome_names'] = sorted(
                entry.strip() for entry in listed_names if entry)
            self.p_meta['num_genomes'] = len(self.p_meta['genome_names'])
            self.genome_names = self.p_meta['genome_names']
            self.gene_clusters_gene_alignments_available = self.p_meta[
                'gene_alignments_computed']
        else:
            self.pan_db = None

        self.genomes_storage_path = arg('genomes_storage')

        # with a pan database, the storage is opened with the hash and genome
        # names recorded in the pan database; without one, with neither
        if self.pan_db:
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                self.p_meta['genomes_storage_hash'],
                genome_names_to_focus=self.p_meta['genome_names'],
                skip_init_functions=self.skip_init_functions,
                run=self.run,
                progress=self.progress)
        else:
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                skip_init_functions=self.skip_init_functions,
                run=self.run,
                progress=self.progress)

        # list-annotation-resources
        self.list_annotation_sources = arg('list_annotation_sources')
        function_calls = self.genomes_storage.db.get_table_as_dataframe(
            'gene_function_calls')
        self.gene_function_source_set = function_calls.source.unique()
        if self.list_annotation_sources:
            self.run.info('Available functional annotation sources',
                          ', '.join(self.gene_function_source_set))
            sys.exit()

        # This houses the ngrams' data
        self.ngram_attributes_list = []

        # Focus on a specific set of genomes
        requested = self.genome_names_to_focus
        if requested:
            if filesnpaths.is_file_exists(requested, dont_raise=True):
                columns = utils.get_column_data_from_TAB_delim_file(
                    requested, column_indices=[0], expected_number_of_fields=1)
                self.genome_names_to_focus = columns[0]
            else:
                self.genome_names_to_focus = [
                    entry.strip() for entry in requested.split(',')
                ]

            self.run.warning(
                "A subset of genome names is found, and anvi'o will focus only on to those."
            )

            # reopen the storage restricted to the genomes of interest
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                storage_hash=None,
                genome_names_to_focus=self.genome_names_to_focus)
            self.genomes = self.genomes_storage.get_genomes_dict()

            self.external_genome_names = [
                name for name in self.genomes
                if self.genomes[name]['external_genome']
            ]
            self.internal_genome_names = [
                name for name in self.genomes
                if not self.genomes[name]['external_genome']
            ]

            self.hash_to_genome_name = {}
            for name in self.genomes:
                genome_hash = self.genomes[name]['genome_hash']
                self.hash_to_genome_name[genome_hash] = name

            # number of genomes in genome-storage
            self.num_contigs_in_external_genomes_with_genes = len(self.genomes)

        # number of genomes in genome-storage
        if self.genome_names_to_focus:
            genome_count = len(self.genome_names_to_focus)
        else:
            genome_count = len(self.genomes_storage.get_all_genome_names())
        self.num_contigs_in_external_genomes_with_genes = genome_count

        if not skip_sanity_check:
            self.sanity_check()

        # unless we are in debug mode, let's keep things quiet.
        self.run_object = terminal.Run() if anvio.DEBUG else terminal.Run(
            verbose=False)