Code Example #1
    def process_range(self, input_bam_paths, contig_name, start, end):
        if end <= start:
            raise ConfigError(
                "The end of range cannot be equal or smaller than the start of it :/"
            )

        data = []

        for input_bam_path in input_bam_paths:
            bam_file_object = BAMFileObject(input_bam_path).get()

            sample_id = filesnpaths.get_name_from_file_path(input_bam_path)

            self.run.warning('',
                             header="Working on '%s'" % sample_id,
                             lc='cyan')

            self.run.info('input_bam_path', input_bam_path)
            self.run.info('sample_id', sample_id)
            self.run.info('total_reads_mapped',
                          pp(int(bam_file_object.mapped)))
            self.run.info('num_contigs_in_bam',
                          pp(len(bam_file_object.references)))

            self.progress.new('Processing "%s" in "%s"' %
                              (contig_name, input_bam_path))
            self.progress.update('Analyzing positions stretching %d nts ...' %
                                 (end - start))

            read_ids = set([])

            for pileupcolumn in bam_file_object.pileup(contig_name, start,
                                                       end):
                for pileupread in pileupcolumn.pileups:
                    if not pileupread.is_del:
                        L = LinkMerDatum(sample_id, pileupread.alignment.qname,
                                         pileupread.alignment.is_read1)
                        L.contig_name = contig_name
                        L.pos_in_contig = pileupcolumn.pos
                        L.pos_in_read = pileupread.query_position
                        L.base = pileupread.alignment.seq[
                            pileupread.query_position]
                        L.reverse = pileupread.alignment.is_reverse
                        L.sequence = pileupread.alignment.query

                        if not L.read_unique_id in read_ids:
                            data.append(L)
                            read_ids.add(L.read_unique_id)

            self.progress.end()
            bam_file_object.close()

        self.run.info(
            'data',
            '%d reads identified mapping between positions %d and %d in "%s"' %
            (len(data), start, end, contig_name))

        self.data.extend(data)
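The heavy lifting in `process_range` is pysam's pileup API, which anvi'o wraps in `BAMFileObject`. As a minimal, self-contained sketch of the same loop (assuming pysam is installed and a coordinate-sorted, indexed BAM file exists at the hypothetical path `sample.bam`):

import pysam  # third-party library anvi'o builds on for BAM access

def bases_in_range(bam_path, contig, start, end):
    """Collect one (read_name, reference_pos, base) per read covering [start, end)."""
    seen_read_ids = set()
    data = []
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        for column in bam.pileup(contig, start, end):
            for pileupread in column.pileups:
                if pileupread.is_del or pileupread.is_refskip:
                    continue  # no aligned base at this column for this read
                read = pileupread.alignment
                read_id = (read.query_name, read.is_read1)  # analogous to L.read_unique_id above
                if read_id in seen_read_ids:
                    continue  # keep only the first covered position per read, as above
                seen_read_ids.add(read_id)
                base = read.query_sequence[pileupread.query_position]
                data.append((read.query_name, column.reference_pos, base))
    return data

# hypothetical usage:
# for name, pos, base in bases_in_range('sample.bam', 'contig_1', 100, 150):
#     print(name, pos, base)

Note that the sketch uses pysam's newer attribute names (`query_sequence`, `reference_pos`) where the example above uses the older aliases (`seq`, `pos`).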
Code Example #2
File: panops.py Project: SydneyHardeman/anvio
    def run_diamond(self, unique_proteins_fasta_path, unique_proteins_names_dict):
        diamond = Diamond(unique_proteins_fasta_path, run=self.run, progress=self.progress,
                          num_threads=self.num_threads, overwrite_output_destinations=self.overwrite_output_destinations)

        diamond.names_dict = unique_proteins_names_dict
        diamond.target_db_path = self.get_output_file_path(filesnpaths.get_name_from_file_path(unique_proteins_fasta_path))
        diamond.search_output_path = self.get_output_file_path('diamond-search-results')
        diamond.tabular_output_path = self.get_output_file_path('diamond-search-results.txt')

        diamond.sensitive = self.sensitive

        return diamond.get_blastall_results()
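`Diamond.get_blastall_results()` drives the DIAMOND binary behind the scenes. A rough sketch of the makedb-then-blastp flow written against DIAMOND's command line directly (the output names mirror the attributes set above; the all-vs-all search of the input FASTA against itself is an assumption):

import subprocess

def run_diamond_search(fasta_path, num_threads=1, sensitive=False):
    """Build a DIAMOND database from a protein FASTA, then search the FASTA against it."""
    # format the database (DIAMOND appends the .dmnd extension itself)
    subprocess.run(['diamond', 'makedb', '--in', fasta_path, '--db', 'proteins'],
                   check=True)

    search_cmd = ['diamond', 'blastp',
                  '--query', fasta_path,
                  '--db', 'proteins',
                  '--out', 'diamond-search-results.txt',
                  '--outfmt', '6',              # BLAST tabular output
                  '--threads', str(num_threads)]
    if sensitive:
        search_cmd.append('--sensitive')     # corresponds to `diamond.sensitive` above

    subprocess.run(search_cmd, check=True)
    return 'diamond-search-results.txt'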
Code Example #3
File: panops.py Project: ascendo/anvio
    def run_diamond(self, unique_proteins_fasta_path, unique_proteins_names_dict):
        diamond = Diamond(unique_proteins_fasta_path, run=self.run, progress=self.progress,
                          num_threads=self.num_threads, overwrite_output_destinations=self.overwrite_output_destinations)

        diamond.names_dict = unique_proteins_names_dict
        diamond.target_db_path = self.get_output_file_path(filesnpaths.get_name_from_file_path(unique_proteins_fasta_path))
        diamond.search_output_path = self.get_output_file_path('diamond-search-results')
        diamond.tabular_output_path = self.get_output_file_path('diamond-search-results.txt')

        diamond.sensitive = self.sensitive

        return diamond.get_blastall_results()
Code Example #4
File: panops.py Project: SydneyHardeman/anvio
    def run_blast(self, unique_proteins_fasta_path, unique_proteins_names_dict):
        self.run.warning("You elected to use NCBI's blastp for protein search. Running blastp will be significantly\
                          slower than DIAMOND (although, anvi'o developers are convinced that you *are*\
                          doing the right thing, so, kudos to you).")
        blast = BLAST(unique_proteins_fasta_path, run=self.run, progress=self.progress,
                          num_threads=self.num_threads, overwrite_output_destinations=self.overwrite_output_destinations)

        blast.names_dict = unique_proteins_names_dict
        blast.log_file_path = self.log_file_path
        blast.target_db_path = self.get_output_file_path(filesnpaths.get_name_from_file_path(unique_proteins_fasta_path))
        blast.search_output_path = self.get_output_file_path('blast-search-results.txt')

        return blast.get_blastall_results()
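`BLAST.get_blastall_results()` similarly wraps NCBI BLAST+. Under the same assumptions (hypothetical file names, binaries on PATH), the two steps look roughly like this:

import subprocess

def run_blastp_search(fasta_path, num_threads=1):
    """Build a protein BLAST database from a FASTA, then search the FASTA against it."""
    # makeblastdb names the database files after the input FASTA by default
    subprocess.run(['makeblastdb', '-in', fasta_path, '-dbtype', 'prot'],
                   check=True)

    subprocess.run(['blastp',
                    '-query', fasta_path,
                    '-db', fasta_path,
                    '-out', 'blast-search-results.txt',
                    '-outfmt', '6',              # tabular, comparable to DIAMOND's output
                    '-num_threads', str(num_threads)],
                   check=True)
    return 'blast-search-results.txt'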
Code Example #5
File: panops.py Project: ascendo/anvio
    def run_blast(self, unique_proteins_fasta_path, unique_proteins_names_dict):
        self.run.warning("You elected to use NCBI's blastp for protein search. Running blastp will be significantly\
                          slower than DIAMOND (although, anvi'o developers are convinced that you *are*\
                          doing the right thing, so, kudos to you).")
        blast = BLAST(unique_proteins_fasta_path, run=self.run, progress=self.progress,
                          num_threads=self.num_threads, overwrite_output_destinations=self.overwrite_output_destinations)

        blast.names_dict = unique_proteins_names_dict
        blast.log_file_path = self.log_file_path
        blast.target_db_path = self.get_output_file_path(filesnpaths.get_name_from_file_path(unique_proteins_fasta_path))
        blast.search_output_path = self.get_output_file_path('blast-search-results.txt')

        return blast.get_blastall_results()
Code Example #6
File: bamops.py Project: meren/anvio
    def process_range(self, input_bam_paths, contig_name, start, end):
        if end <= start:
            raise ConfigError("The end of range cannot be equal or smaller than the start of it :/")

        data = []

        for input_bam_path in input_bam_paths:
            bam_file_object = BAMFileObject(input_bam_path).get()

            sample_id = filesnpaths.get_name_from_file_path(input_bam_path)

            self.run.warning('', header="Working on '%s'" % sample_id, lc='cyan')

            self.run.info('input_bam_path', input_bam_path)
            self.run.info('sample_id', sample_id)
            self.run.info('total_reads_mapped', pp(int(bam_file_object.mapped)))
            self.run.info('num_contigs_in_bam', pp(len(bam_file_object.references)))

            self.progress.new('Processing "%s" in "%s"' % (contig_name, input_bam_path))
            self.progress.update('Analyzing positions stretching %d nts ...' % (end - start))

            read_ids = set([])

            for pileupcolumn in bam_file_object.pileup(contig_name, start, end):
                for pileupread in pileupcolumn.pileups:
                    if not pileupread.is_del:
                        L = LinkMerDatum(sample_id, pileupread.alignment.qname, pileupread.alignment.is_read1)
                        L.contig_name = contig_name
                        L.pos_in_contig = pileupcolumn.pos
                        L.pos_in_read = pileupread.query_position
                        L.base = pileupread.alignment.seq[pileupread.query_position]
                        L.reverse = pileupread.alignment.is_reverse
                        L.sequence = pileupread.alignment.query

                        if not L.read_unique_id in read_ids:
                            data.append(L)
                            read_ids.add(L.read_unique_id)

            self.progress.end()
            bam_file_object.close()

        self.run.info('data', '%d reads identified mapping between positions %d and %d in "%s"' % (len(data), start, end, contig_name))

        self.data.extend(data)
Code Example #7
    def process(self):
        self.sanity_check()

        self.linkmers = LinkMersData(self.run, self.progress)

        for input_file in self.input_file_paths:
            sample_id = filesnpaths.get_name_from_file_path(input_file)
            bam_file_object = BAMFileObject(input_file).get()

            self.run.info('input_bam_path', input_file)
            self.run.info('sample_id', sample_id)
            self.run.info('total_reads_mapped', pp(int(bam_file_object.mapped)))
            self.run.info('num_contigs_in_bam', pp(len(bam_file_object.references)))

            for request_id, contig_name, positions in self.contig_and_position_requests_list:
                self.linkmers.append(bam_file_object, sample_id, request_id, contig_name, positions, self.only_complete_links)

            bam_file_object.close()

        return self.linkmers.data
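The loop expects `self.contig_and_position_requests_list` to carry one `(request_id, contig_name, positions)` tuple per request. A hypothetical illustration of that structure (identifiers invented for the sketch):

# each request: an id, a contig name, and the 0-based positions of interest on it
contig_and_position_requests_list = [
    (1, 'contig_001', set([250, 251, 252])),
    (2, 'contig_014', set([1200])),
]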
Code Example #8
    def __init__(self, config_file_path, input_directory=None, db_paths={}, row_ids_of_interest=[], r=run, p=progress):
        self.run = r
        self.progress = p

        self.input_directory = input_directory or os.path.abspath(os.getcwd())
        self.config_file_path = config_file_path

        # `row_ids_of_interest` gives an opportunity to filter out irrelevant entries quickly
        # while vectors are being obtained from each matrix described in the config file.
        # to see why it is important in the context of anvi'o, see
        # https://github.com/meren/anvio/issues/100
        self.row_ids_of_interest = set(row_ids_of_interest)

        # these are the database files that may be referenced from within the config files
        # with !DATABASE.db::table notation. If a database entry has an exclamation mark,
        # it will be searched for in the db_paths dict to associate it with the relative
        # path that is only known to the client
        self.db_paths = db_paths

        # read the config
        filesnpaths.is_file_exists(self.config_file_path)
        config = ConfigParser.ConfigParser()
        config.read(self.config_file_path)

        # this will keep the actual paths for each matrix:
        self.matrix_paths = {}
        self.set_default_paths(config)

        self.check_for_db_requests(config)

        # and sanity check.
        self.sanity_check(config)

        if self.get_option(config, 'general', 'output_file', str):
            self.output_file_name = self.get_option(config, 'general', 'output_file', str)
            self.output_file_path = os.path.join(self.input_directory, self.output_file_name)
        else:
            self.output_file_name = None
            self.output_file_path = None

        self.name = self.get_option(config, 'general', 'name', str) or filesnpaths.get_name_from_file_path(self.config_file_path)
        self.distance = self.get_option(config, 'general', 'distance', str)
        self.linkage = self.get_option(config, 'general', 'linkage', str)

        self.num_components = self.get_option(config, 'general', 'num_components', int)
        self.seed = self.get_option(config, 'general', 'seed', int)
        self.master = None

        self.matrices_dict = {}
        self.matrices = []
        for section in self.get_other_sections(config):
            alias, matrix = section.split()

            self.matrices.append(alias)

            m = {}
            columns_to_use = self.get_option(config, section, 'columns_to_use', str)
            m['alias'] = alias
            m['matrix'] = matrix
            m['columns_to_use'] = [c.strip() for c in columns_to_use.split(',')] if columns_to_use else None
            m['ratio'] = self.get_option(config, section, 'ratio', int)
            m['path'] = self.matrix_paths[alias]
            m['normalize'] = False if self.get_option(config, section, 'normalize', str) == 'False' else True
            m['log'] = True if self.get_option(config, section, 'log', str) == 'True' else False
            # next two variables are necessary to follow the order of vectors
            m['id_to_sample'], m['sample_to_id'], m['cols'], m['vectors'] = get_vectors(m['path'], m['columns_to_use'], self.row_ids_of_interest)
            self.matrices_dict[alias] = m

        # make sure all matrices have identical rows:
        if len(set([m['id_to_sample'].values().__str__() for m in self.matrices_dict.values()])) > 1:
            master_rows, master_matrix = sorted([(len(self.matrices_dict[m]['id_to_sample']), self.matrices_dict[m]['id_to_sample'].values(), m)\
                                                            for m in self.matrices_dict])[0][1:]
            self.master = master_matrix
            self.master_rows = master_rows
            # the smallest matrix is 'master_matrix', and the rows it has is master_rows. so every other matrix
            # must match that, or we will throw a tantrum.
            for matrix in [m for m in self.matrices if m != master_matrix]:
                m = self.matrices_dict[matrix]

                # get reduced set of vectors from rows that match `master_rows`:
                m['id_to_sample'], m['sample_to_id'], m['cols'], m['vectors'] = get_vectors(m['path'], m['columns_to_use'], master_rows)

                if len(m['vectors']) != len(master_rows):
                    raise ConfigError, 'The content of rows differed between input matrices. So I tried to\
                                        match all other matrices to the matrix with the smallest number of\
                                        rows (which was "%s"). However, not all other matrices contained\
                                        the small set of rows.' % (master_matrix)
        else:
            self.master_rows = sorted(self.matrices_dict[self.matrices[0]]['sample_to_id'].keys())

        self.num_matrices = len(self.matrices)
        self.multiple_matrices = self.num_matrices > 1
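The constructor above implies a config layout with a `[general]` section plus one `alias matrix` section per input matrix (the section name is split on whitespace into `alias` and `matrix`). A hypothetical config that would satisfy it, read here with the Python 3 spelling of the same module:

import configparser  # the `ConfigParser` module in the Python 2 code above

example_config = """
[general]
output_file = clustering.txt
name = my_clustering
distance = euclidean
linkage = ward
num_components = 16
seed = 1

[ratio1 tnf-matrix]
columns_to_use = c1, c2, c3
ratio = 3
normalize = True
log = False
"""

config = configparser.ConfigParser()
config.read_string(example_config)
print(config.sections())              # ['general', 'ratio1 tnf-matrix']
print(config.get('general', 'name'))  # my_clustering

All option names come from the `get_option` calls above; the section name and values are made up for the sketch.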
Code Example #9
    def get_short_reads_for_splits_dict(self):
        if not self.initialized:
            raise ConfigError('The `GetReadsFromBAM` class is not initialized :/ Ad hoc use of this class is\
                               OK, but in that case you should set `self.initialized` to True, and provide\
                               the split names of interest manually.')

        if not len(self.split_names_of_interest):
            raise ConfigError("The split names of interest set is empty. This should have never happened. Good\
                               job.")

        short_reads_for_splits_dict = {}
        if self.split_R1_and_R2:
            short_reads_for_splits_dict['R1'] = {}
            short_reads_for_splits_dict['R2'] = {}
            short_reads_for_splits_dict['UNPAIRED'] = {}
        else:
            short_reads_for_splits_dict['all'] = {}

        self.progress.new('Accessing reads')
        self.progress.update('Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
        contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update('Computing start/stops positions of interest in %d contigs ...' % (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_order = list(contigs_involved[contig_id].keys())
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = contigs_involved[contig_id][sequential_block[0]]
                last_split = contigs_involved[contig_id][sequential_block[-1]]

                contig_start_stops.append((contig_id,
                                           splits_basic_info[first_split]['start'],
                                           splits_basic_info[last_split]['end']),)

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested in, and get those short reads
        # that map to regions of interest:
        for bam_file_path in self.input_bam_files:
            bam_file_name = filesnpaths.get_name_from_file_path(bam_file_path)

            bam_file_object = BAMFileObject(bam_file_path).get()

            self.progress.update('Creating a dictionary of matching short reads in %s ...' % bam_file_name)

            '''here's what's available in the read objects below:

            ['aend', 'alen', 'aligned_pairs', 'bin', 'blocks', 'cigar', 'cigarstring', 'cigartuples', 'compare',
             'flag', 'get_aligned_pairs', 'get_blocks', 'get_overlap', 'get_reference_positions', 'get_tag',
             'get_tags', 'has_tag', 'infer_query_length', 'inferred_length', 'is_duplicate', 'is_paired',
             'is_proper_pair', 'is_qcfail', 'is_read1', 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary',
             'is_unmapped', 'isize', 'mapping_quality', 'mapq', 'mate_is_reverse', 'mate_is_unmapped', 'mpos', 'mrnm',
             'next_reference_id', 'next_reference_start', 'opt', 'overlap', 'pnext', 'pos', 'positions', 'qend',
             'qlen', 'qname', 'qqual', 'qstart', 'qual', 'query', 'query_alignment_end', 'query_alignment_length',
             'query_alignment_qualities', 'query_alignment_sequence', 'query_alignment_start', 'query_length',
             'query_name', 'query_qualities', 'query_sequence', 'reference_end', 'reference_id', 'reference_length',
             'reference_start', 'rlen', 'rname', 'rnext', 'seq', 'setTag', 'set_tag', 'set_tags', 'tags',
             'template_length', 'tid', 'tlen']'''

            has_unknown_mate = {}
            if self.split_R1_and_R2:
                for contig_id, start, stop in contig_start_stops:
                    for read in bam_file_object.fetch(contig_id, start, stop):

                        defline = '_'.join([contig_id, str(start), str(stop), read.query_name, bam_file_name])

                        if not read.is_paired:
                            short_reads_for_splits_dict['UNPAIRED'][defline] = read.query_sequence

                        elif defline in has_unknown_mate:
                            # `read`'s mate has already been read. so assign the read and the mate
                            # to their respective 'R1' and 'R2' dictionaries, then remove the mate
                            # from has_unknown_mate since its mate is now known.
                            read_DIRECTION = 'R1' if read.is_read1 else 'R2'
                            mate_DIRECTION = 'R2' if read_DIRECTION == 'R1' else 'R1'
                            short_reads_for_splits_dict[mate_DIRECTION][defline] = has_unknown_mate[defline]
                            short_reads_for_splits_dict[read_DIRECTION][defline] = read.query_sequence
                            del has_unknown_mate[defline]

                        else:
                            has_unknown_mate[defline] = read.query_sequence
                short_reads_for_splits_dict['UNPAIRED'].update(has_unknown_mate)
            else:
                for contig_id, start, stop in contig_start_stops:
                    for read in bam_file_object.fetch(contig_id, start, stop):
                        short_reads_for_splits_dict['all']['_'.join([contig_id, str(start), str(stop), read.query_name, bam_file_name])] = read.query_sequence
            bam_file_object.close()

        self.progress.end()

        return short_reads_for_splits_dict
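The R1/R2 bookkeeping above can be isolated into a small, self-contained function: a read waits in `has_unknown_mate` until its partner shows up, and whatever is left at the end is treated as unpaired. A sketch with invented read tuples:

def split_reads_by_direction(reads):
    """reads: iterable of (name, is_paired, is_read1, sequence) tuples."""
    result = {'R1': {}, 'R2': {}, 'UNPAIRED': {}}
    has_unknown_mate = {}

    for name, is_paired, is_read1, seq in reads:
        if not is_paired:
            result['UNPAIRED'][name] = seq
        elif name in has_unknown_mate:
            # the mate arrived first; file both under their respective directions
            direction = 'R1' if is_read1 else 'R2'
            mate_direction = 'R2' if direction == 'R1' else 'R1'
            result[direction][name] = seq
            result[mate_direction][name] = has_unknown_mate.pop(name)
        else:
            has_unknown_mate[name] = seq

    # reads whose mates never mapped into the region end up unpaired
    result['UNPAIRED'].update(has_unknown_mate)
    return result

reads = [('read_A', True, True, 'ATG'), ('read_A', True, False, 'CCT'),
         ('read_B', True, True, 'GGA'), ('read_C', False, False, 'TTC')]
print(sorted(split_reads_by_direction(reads)['R1']))  # ['read_A']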
Code Example #10
File: interactive.py Project: psaxcode/anvio
    def load_full_mode(self, args):
        if not self.contigs_db_path:
            raise ConfigError, "Anvi'o needs the contigs database to make sense of this run (or maybe you\
                                should use the `--manual` flag if that's what your intention)."

        if not self.profile_db_path:
            raise ConfigError, "So you want to run anvi'o in full mode, but without a profile database?\
                                Well. This does not make any sense."

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask the ContigsSuperclass function to load
        # all the split sequences, since only now do we know the min_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_collections_dict(self.profile_db_path,
                                                   anvio.__profile__version__)

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(
            os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path,
                                            anvio.__profile__version__)

        # load views from the profile database
        if self.p_meta['blank']:
            blank_dict = {}
            for split_name in self.splits_basic_info:
                blank_dict[split_name] = {'blank_view': 0}

            self.views['blank_view'] = {
                'header': ['blank_view'],
                'dict': blank_dict
            }
            self.default_view = 'blank_view'

        else:
            self.load_views()
            self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('',
                        header='Available views (%d)' % len(self.views),
                        lc='green')
            for view in self.views:
                run.info(
                    view,
                    'Via "%s" table' % self.views[view]['table_name'],
                    lc='crimson',
                    mc='green' if view == self.default_view else 'crimson')
            print
            sys.exit()

        if self.show_states:
            run.warning('',
                        header='Available states (%d)' %
                        len(self.states_table.states),
                        lc='green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' %
                         self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print
            sys.exit()

        # if the user has additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(
                self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split."

            column_mapping = [
                str
            ] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {
                'table_name':
                'NA',
                'header':
                additional_view_columns,
                'dict':
                utils.get_TAB_delimited_file_as_dictionary(
                    self.additional_view_path, column_mapping=column_mapping)
            }

        # if the user specifies a view, set it as default:
        if self.view:
            if not self.view in self.views:
                raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings

        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
                self.tree)
            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = clustering_id
                self.p_meta['available_clusterings'] = [clustering_id]
                self.p_meta['clusterings'] = {
                    clustering_id: {
                        'newick': open(os.path.abspath(self.tree)).read()
                    }
                }
                run.info(
                    'Additional Tree',
                    "Splits will be organized based on '%s'." % clustering_id)
            else:
                self.p_meta['clusterings'][clustering_id] = {
                    'newick': open(os.path.abspath(self.tree)).read()
                }
                run.info(
                    'Additional Tree',
                    "'%s' has been added to available trees." % clustering_id)

        # set title
        if self.title:
            self.title = self.title
        else:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace(
                '_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_profile_data_available:
            summary_cp_available = os.path.exists(
                os.path.join(os.path.dirname(self.profile_db_path),
                             'SUMMARY.cp'))
            self.run.warning(
                "Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" %
                ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."
                 ))

        if self.state_autoload:
            if not self.state_autoload in self.states_table.states:
                raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                          available states by running this program with --show-states flag." % self.state_autoload
Code Example #11
File: interactive.py Project: ascendo/anvio
    def load_full_mode(self, args):
        if not self.contigs_db_path:
            raise ConfigError, "Anvi'o needs the contigs database to make sense of this run (or maybe you\
                                should use the `--manual` flag if that's what your intention)."

        if not self.profile_db_path:
            raise ConfigError, "So you want to run anvi'o in full mode, but without a profile database?\
                                Well. This does not make any sense."

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask the ContigsSuperclass function to load
        # all the split sequences, since only now do we know the min_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        if self.p_meta['blank']:
            blank_dict = {}
            for split_name in self.splits_basic_info:
                blank_dict[split_name] = {'blank_view': 0}

            self.views['blank_view'] = {'header': ['blank_view'],
                                        'dict': blank_dict}
            self.default_view = 'blank_view'

        else:
            self.load_views()
            self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header='Available views (%d)' % len(self.views), lc='green')
            for view in self.views:
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            print
            sys.exit()

        if self.show_states:
            run.warning('', header='Available states (%d)' % len(self.states_table.states), lc='green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' % self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print
            sys.exit()

        # if the user has additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError, "The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split."

            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping=column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if not self.view in self.views:
                raise ConfigError, "The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings

        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = clustering_id
                self.p_meta['available_clusterings'] = [clustering_id]
                self.p_meta['clusterings'] = {clustering_id: {'newick': open(os.path.abspath(self.tree)).read()}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % clustering_id)
            else:
                self.p_meta['clusterings'][clustering_id] = {'newick': open(os.path.abspath(self.tree)).read()}
                run.info('Additional Tree', "'%s' has been added to available trees." % clustering_id)

        # set title
        if self.title:
            self.title = self.title
        else:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_profile_data_available:
            summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
            self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

        if self.state:
            if not self.state in self.states_table.states:
                raise ConfigError, "The requested state ('%s') is not available for this run. Please see\
                                          available states by running this program with --show-states flag." % self.state
Code Example #12
File: interactive.py Project: ascendo/anvio
    def load_manual_mode(self, args):
        if self.contigs_db_path:
            raise ConfigError, "When you want to use the interactive interface in manual mode, you must\
                                not use a contigs database."

        if not self.profile_db_path:
            raise ConfigError, "Even when you want to use the interactive interface in manual mode, you need\
                                to declare a profile database. The profile database in this mode only used to\
                                read or store the 'state' of the display for visualization purposes. You DO\
                                NOT need to point to an already existing database, as anvi'o will generate\
                                an empty one for your if there is no profile database."

        if not self.tree:
            raise ConfigError, "When you are running the interactive interface in manual mode, you must declare\
                                at least the tree file. Please see the documentation for help."

        if self.view:
            raise ConfigError, "You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode"

        if self.show_views:
            raise ConfigError, "Sorry, there are no views to show in manual mode :/"

        if self.show_states:
            raise ConfigError, "Sorry, there are no states to show in manual mode :/"

        filesnpaths.is_file_exists(self.tree)
        tree = filesnpaths.is_proper_newick(self.tree)

        view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'

        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
        self.p_meta['default_clustering'] = clustering_id
        self.p_meta['available_clusterings'] = [clustering_id]
        self.p_meta['clusterings'] = {clustering_id: {'newick': ''.join([l.strip() for l in open(os.path.abspath(self.tree)).readlines()])}}

        self.default_view = self.p_meta['default_view']

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
            if not view_data_columns[0] == "contig":
                raise ConfigError, "The first row of the first column of the view data file must\
                                    say 'contig', which is not the case for your view data file\
                                    ('%s'). Please make sure this is a properly formatted view data\
                                    file." % (view_data_path)

            # load view data as the default view:
            self.views[self.default_view] = {'header': view_data_columns[1:],
                                             'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            names_in_the_tree = [n.name for n in tree.get_leaves()]

            ad_hoc_dict = {}
            for item in names_in_the_tree:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {'header': ['names'],
                                             'dict': ad_hoc_dict}

        self.split_names_ordered = self.views[self.default_view]['dict'].keys()

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError, 'Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop())

            # setup a mock splits_basic_info dict
            for split_id in self.split_names_ordered:
                self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                    'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

        if self.title:
            self.title = self.title
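`filesnpaths.is_proper_newick` hands back a parsed tree, and its leaves drive the mock view data. The same leaf extraction can be sketched with ete3 (assumed here as the underlying tree parser; the newick string is made up):

from ete3 import Tree  # assumption: ete3 provides the `get_leaves` API used above

newick = '((split_001:0.1,split_002:0.2):0.05,split_003:0.3);'
tree = Tree(newick)

names_in_the_tree = [leaf.name for leaf in tree.get_leaves()]
print(names_in_the_tree)  # ['split_001', 'split_002', 'split_003']

# the mock 'view data dict' then simply echoes each leaf name back:
ad_hoc_dict = {name: {'names': name} for name in names_in_the_tree}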
Code Example #13
File: bamops.py Project: ascendo/anvio
    def get_short_reads_for_splits_dict(self):
        short_reads_for_splits_dict = {}

        self.progress.new('Accessing reads')
        self.progress.update('Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
        contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update('Computing start/stops positions of interest in %d contigs ...' % (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_order = contigs_involved[contig_id].keys()
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = contigs_involved[contig_id][sequential_block[0]]
                last_split = contigs_involved[contig_id][sequential_block[-1]]

                contig_start_stops.append((contig_id,
                                           splits_basic_info[first_split]['start'],
                                           splits_basic_info[last_split]['end']),)

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested in, and get those short reads
        # that map to regions of interest:
        for bam_file_path in self.input_bam_files:
            bam_file_name = filesnpaths.get_name_from_file_path(bam_file_path)

            bam_file_object = BAMFileObject(bam_file_path).get()

            self.progress.update('Creating a dictionary of matching short reads in %s ...' % bam_file_name)

            for contig_id, start, stop in contig_start_stops:
                for entry in bam_file_object.fetch(contig_id, start, stop):
                    '''
                    here's what's available in the entry object:

                    ['aend', 'alen', 'aligned_pairs', 'bin', 'blocks', 'cigar', 'cigarstring', 'cigartuples', 'compare',
                     'flag', 'get_aligned_pairs', 'get_blocks', 'get_overlap', 'get_reference_positions', 'get_tag',
                     'get_tags', 'has_tag', 'infer_query_length', 'inferred_length', 'is_duplicate', 'is_paired',
                     'is_proper_pair', 'is_qcfail', 'is_read1', 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary',
                     'is_unmapped', 'isize', 'mapping_quality', 'mapq', 'mate_is_reverse', 'mate_is_unmapped', 'mpos', 'mrnm',
                     'next_reference_id', 'next_reference_start', 'opt', 'overlap', 'pnext', 'pos', 'positions', 'qend',
                     'qlen', 'qname', 'qqual', 'qstart', 'qual', 'query', 'query_alignment_end', 'query_alignment_length',
                     'query_alignment_qualities', 'query_alignment_sequence', 'query_alignment_start', 'query_length',
                     'query_name', 'query_qualities', 'query_sequence', 'reference_end', 'reference_id', 'reference_length',
                     'reference_start', 'rlen', 'rname', 'rnext', 'seq', 'setTag', 'set_tag', 'set_tags', 'tags', 'template_length', 'tid', 'tlen']'''

                    # we are only doing this for 'single reads', but I think this has to take into account the paired-end case as well.
                    short_reads_for_splits_dict['_'.join([contig_id, str(start), str(stop), entry.query_name, bam_file_name])] = entry.query_sequence

            bam_file_object.close()

        self.progress.end()

        return short_reads_for_splits_dict
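Since the function returns a flat `{defline: sequence}` dictionary, writing it out as FASTA takes only a few lines (the output path is hypothetical):

def store_short_reads_as_fasta(short_reads_dict, output_path='short_reads.fa'):
    """Write {defline: sequence} pairs in FASTA format."""
    with open(output_path, 'w') as output:
        for defline in sorted(short_reads_dict):
            output.write('>%s\n%s\n' % (defline, short_reads_dict[defline]))
    return output_path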
Code Example #14
    def load_manual_mode(self, args):
        if self.contigs_db_path:
            raise ConfigError(
                "When you want to use the interactive interface in manual mode, you must\
                                not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError(
                "Even when you want to use the interactive interface in manual mode, you need\
                                to provide a profile database path. But you DO NOT need an already existing\
                                profile database, since anvi'o will generate an empty one for you. The profile\
                                database in this mode only used to read or store the 'state' of the display\
                                for visualization purposes, or to allow you to create and store collections."
            )

        # if the user is using an existing profile database, we need to make sure that it is not associated
        # with a contigs database, since it would mean that it is a full anvi'o profile database and should
        # not be included in manual operations.
        if filesnpaths.is_file_exists(self.profile_db_path, dont_raise=True):
            profile_db = ProfileDatabase(self.profile_db_path)
            if profile_db.meta['contigs_db_hash']:
                raise ConfigError(
                    "Well. It seems the profile database is associated with a contigs database,\
                                    which means using it in manual mode is not the best way to use it. Probably\
                                    what you wanted to do is to let the manual mode create a new profile database\
                                    for you. Simply type in a new profile database path (it can be a file name\
                                    that doesn't exist).")

        if not self.tree and not self.view_data_path:
            raise ConfigError(
                "You must be joking Mr. Feynman. No tree file, and no data file? What is it that\
                               anvi'o supposed to visualize? :(")

        if not self.tree:
            self.run.warning(
                "You haven't declared a tree file. Anvi'o will do its best to come up with an\
                              organization of your items.")

        if self.view:
            raise ConfigError(
                "You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError(
                "Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError(
                "Sorry, there are no states to show in manual mode :/")

        if self.tree:
            filesnpaths.is_file_exists(self.tree)
            newick_tree_text = ''.join([
                l.strip()
                for l in open(os.path.abspath(self.tree)).readlines()
            ])
            item_names = utils.get_names_order_from_newick_tree(
                newick_tree_text)
        else:
            item_names = utils.get_column_data_from_TAB_delim_file(
                self.view_data_path, column_indices=[0])[0][1:]

        # try to convert item names into integer values for proper sorting later. it's OK if it does
        # not work.
        try:
            item_names = [int(n) for n in item_names]
        except:
            pass

        view_data_path = os.path.abspath(
            self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(
            self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'
        self.default_view = self.p_meta['default_view']

        # set some default organizations of data:
        self.p_meta['clusterings'] = {
            'Alphabetical_(reverse):none:none': {
                'basic': sorted(item_names)
            },
            'Alphabetical:none:none': {
                'basic': sorted(item_names, reverse=True)
            }
        }
        self.p_meta['available_clusterings'] = [
            'Alphabetical_(reverse):none:none', 'Alphabetical:none:none'
        ]
        self.p_meta['default_clustering'] = self.p_meta[
            'available_clusterings'][0]

        # if we have a tree, let's make arrangements for it:
        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
                self.tree)
            self.p_meta['default_clustering'] = clustering_id
            self.p_meta['available_clusterings'].append(clustering_id)
            self.p_meta['clusterings'][clustering_id] = {
                'newick': newick_tree_text
            }

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(
                view_data_path, include_first_column=True)

            # load view data as the default view:
            self.views[self.default_view] = {
                'header': view_data_columns[1:],
                'dict':
                utils.get_TAB_delimited_file_as_dictionary(view_data_path)
            }
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            ad_hoc_dict = {}
            for item in item_names:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {
                'header': ['names'],
                'dict': ad_hoc_dict
            }

        self.displayed_item_names_ordered = list(
            self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(
                self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(
                self.displayed_item_names_ordered) - set(
                    self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError(
                    'Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"'
                    %
                    (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.displayed_item_names_ordered:
                self.splits_basic_info[split_id] = {
                    'length':
                    len(self.split_sequences[split_id]),
                    'gc_content':
                    utils.get_GC_content_for_sequence(
                        self.split_sequences[split_id])
                }

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({
                'db_type': 'profile',
                'merged': True,
                'contigs_db_hash': None,
                'samples': ','.join(self.p_meta['samples'])
            })

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path)

        # read description from self table, if it is not available get_description function will return placeholder text
        self.p_meta['description'] = get_description_in_db(
            self.profile_db_path)

        if self.title:
            self.title = self.title
Code Example #15
    def get_short_reads_for_splits_dict(self):
        short_reads_for_splits_dict = {}

        self.progress.new('Accessing reads')
        self.progress.update(
            'Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        splits_basic_info = contigs_db.db.get_table_as_dict(
            t.splits_info_table_name)
        contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(
            self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update(
            'Computing start/stops positions of interest in %d contigs ...' %
            (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_order = contigs_involved[contig_id].keys()
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(
                splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = contigs_involved[contig_id][sequential_block[0]]
                last_split = contigs_involved[contig_id][sequential_block[-1]]

                contig_start_stops.append(
                    (contig_id, splits_basic_info[first_split]['start'],
                     splits_basic_info[last_split]['end']), )

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested in, and get those short reads
        # that map to regions of interest:
        for bam_file_path in self.input_bam_files:
            bam_file_name = filesnpaths.get_name_from_file_path(bam_file_path)

            bam_file_object = BAMFileObject(bam_file_path).get()

            self.progress.update(
                'Creating a dictionary of matching short reads in %s ...' %
                bam_file_name)

            for contig_id, start, stop in contig_start_stops:
                for entry in bam_file_object.fetch(contig_id, start, stop):
                    '''
                    here's what's available in the entry object:

                    ['aend', 'alen', 'aligned_pairs', 'bin', 'blocks', 'cigar', 'cigarstring', 'cigartuples', 'compare',
                     'flag', 'get_aligned_pairs', 'get_blocks', 'get_overlap', 'get_reference_positions', 'get_tag',
                     'get_tags', 'has_tag', 'infer_query_length', 'inferred_length', 'is_duplicate', 'is_paired',
                     'is_proper_pair', 'is_qcfail', 'is_read1', 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary',
                     'is_unmapped', 'isize', 'mapping_quality', 'mapq', 'mate_is_reverse', 'mate_is_unmapped', 'mpos', 'mrnm',
                     'next_reference_id', 'next_reference_start', 'opt', 'overlap', 'pnext', 'pos', 'positions', 'qend',
                     'qlen', 'qname', 'qqual', 'qstart', 'qual', 'query', 'query_alignment_end', 'query_alignment_length',
                     'query_alignment_qualities', 'query_alignment_sequence', 'query_alignment_start', 'query_length',
                     'query_name', 'query_qualities', 'query_sequence', 'reference_end', 'reference_id', 'reference_length',
                     'reference_start', 'rlen', 'rname', 'rnext', 'seq', 'setTag', 'set_tag', 'set_tags', 'tags', 'template_length', 'tid', 'tlen']'''

                    # we are only doing this for 'single reads', but I think this has to take into account the paired-end case as well.
                    short_reads_for_splits_dict['_'.join([
                        contig_id,
                        str(start),
                        str(stop), entry.query_name, bam_file_name
                    ])] = entry.query_sequence

            bam_file_object.close()

        self.progress.end()

        return short_reads_for_splits_dict
Code Example #16
    def __init__(self,
                 config_file_path,
                 input_directory=None,
                 db_paths={},
                 row_ids_of_interest=[],
                 r=run,
                 p=progress):
        self.run = r
        self.progress = p

        self.input_directory = input_directory or os.path.abspath(os.getcwd())
        self.config_file_path = config_file_path

        # `row_ids_of_interest` gives an opportunity to filter out irrelevant entries quickly
        # while vectors are being obtained from each matrix described in the config file.
        # to see why it is important in the context of anvi'o, see
        # https://github.com/meren/anvio/issues/100
        self.row_ids_of_interest = set(row_ids_of_interest)

        # these are the database files that may be referenced from within the config files
        # with !DATABASE.db::table notation. If a database entry has an exclamation mark,
        # it will be searched for in the db_paths dict to associate it with the relative
        # path that is only known to the client
        self.db_paths = db_paths

        # read the config
        filesnpaths.is_file_exists(self.config_file_path)
        config = configparser.ConfigParser()
        config.read(self.config_file_path)

        # this will keep the actual paths for each matrix:
        self.matrix_paths = {}
        self.set_default_paths(config)

        self.check_for_db_requests(config)

        # and sanity check.
        self.sanity_check(config)

        if self.get_option(config, 'general', 'output_file', str):
            self.output_file_name = self.get_option(config, 'general',
                                                    'output_file', str)
            self.output_file_path = os.path.join(self.input_directory,
                                                 self.output_file_name)
        else:
            self.output_file_name = None
            self.output_file_path = None

        self.name = self.get_option(
            config, 'general', 'name',
            str) or filesnpaths.get_name_from_file_path(self.config_file_path)
        self.distance = self.get_option(config, 'general', 'distance', str)
        self.linkage = self.get_option(config, 'general', 'linkage', str)

        self.num_components = self.get_option(config, 'general',
                                              'num_components', int)
        self.seed = self.get_option(config, 'general', 'seed', int)
        self.master = None

        self.matrices_dict = {}
        self.matrices = []
        for section in self.get_other_sections(config):
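            # each remaining section name is expected to consist of two whitespace-separated
            # tokens: an alias, and the matrix it points to (e.g. 'ratio1 view_data.txt';
            # names here are made up for illustration):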
            alias, matrix = section.split()

            self.matrices.append(alias)

            m = {}
            columns_to_use = self.get_option(config, section, 'columns_to_use',
                                             str)
            m['alias'] = alias
            m['matrix'] = matrix
            m['columns_to_use'] = [
                c.strip() for c in columns_to_use.split(',')
            ] if columns_to_use else None
            m['ratio'] = self.get_option(config, section, 'ratio', int)
            m['path'] = self.matrix_paths[alias]
            m['normalize'] = self.get_option(config, section, 'normalize', str) != 'False'
            m['log'] = self.get_option(config, section, 'log', str) == 'True'
            # id_to_sample and sample_to_id are necessary to follow the order of vectors:
            m['id_to_sample'], m['sample_to_id'], m['cols'], m['vectors'] = \
                get_vectors(m['path'], m['columns_to_use'], self.row_ids_of_interest)
            self.matrices_dict[alias] = m

        # make sure all matrices have identical rows:
        if len(set([str(list(m['id_to_sample'].values())) for m in self.matrices_dict.values()])) > 1:
            # rows differ across matrices. pick the matrix with the smallest number of rows
            # as the 'master', and remember its rows:
            master_matrix = min(self.matrices_dict, key=lambda a: len(self.matrices_dict[a]['id_to_sample']))
            master_rows = list(self.matrices_dict[master_matrix]['id_to_sample'].values())
            self.master = master_matrix
            self.master_rows = master_rows
            # every other matrix must match the rows of the master, or we will throw a tantrum.
            for matrix in [m for m in self.matrices if m != master_matrix]:
                m = self.matrices_dict[matrix]

                # get a reduced set of vectors from rows that match `master_rows`:
                m['id_to_sample'], m['sample_to_id'], m['cols'], m['vectors'] = \
                    get_vectors(m['path'], m['columns_to_use'], master_rows)

                if len(m['vectors']) != len(master_rows):
                    raise ConfigError("The content of rows differed between input matrices. So I tried to "
                                      "match all other matrices to the matrix with the smallest number of "
                                      "rows (which was '%s'). However, not all other matrices contained "
                                      "that smaller set of rows." % master_matrix)
        else:
            self.master_rows = sorted(
                self.matrices_dict[self.matrices[0]]['sample_to_id'].keys())

        self.num_matrices = len(self.matrices)
        self.multiple_matrices = self.num_matrices > 1
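The constructor above describes an INI-style config file: a [general] section, plus one section per input matrix whose name is an 'alias matrix' pair (hence the section.split() call), where a matrix can also be a table in an anvi'o database via the !DATABASE.db::table notation resolved through db_paths. Here is a hypothetical config sketching the options the code looks for; every file, table, and value below is made up for illustration:

[general]
output_file = MERGED.txt
name = my_merged_data
distance = euclidean
linkage = ward
num_components = 16
seed = 1

[ratio1 view_data.txt]
columns_to_use = sample_01, sample_02
ratio = 1
normalize = True
log = False

[ratio2 !CONTIGS.db::gene_coverages]
ratio = 1
normalize = False
log = True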
Code example #17
0
File: interactive.py Project: psaxcode/anvio
    def load_manual_mode(self, args):
        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in manual mode, you must "
                              "not use a contigs database.")

        # if the user is using an existing profile database, we need to make sure that it is not associated
        # with a contigs database, since it would mean that it is a full anvi'o profile database and should
        # not be included in manual operations.
        if os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            if profile_db.meta['contigs_db_hash']:
                raise ConfigError("Well. It seems the profile database is associated with a contigs database, "
                                  "which means using it in manual mode is not the best way to use it. Probably "
                                  "what you wanted to do is to let the manual mode create a new profile database "
                                  "for you. Simply type in a new profile database path (it can be a file name "
                                  "that doesn't exist).")

        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in manual mode, you need "
                              "to declare a profile database. The profile database in this mode is only used "
                              "to read or store the 'state' of the display for visualization purposes. You DO "
                              "NOT need to point to an already existing database, as anvi'o will generate "
                              "an empty one for you if there is no profile database.")

        if not self.tree:
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare "
                              "at least the tree file. Please see the documentation for help.")

        if self.view:
            raise ConfigError("You can't use the '--view' parameter when you are running the interactive "
                              "interface in manual mode.")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")

        filesnpaths.is_file_exists(self.tree)
        tree = filesnpaths.is_proper_newick(self.tree)

        view_data_path = os.path.abspath(
            self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(
            self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'

        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
            self.tree)
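        # note: a clustering id follows anvi'o's 'name:distance:linkage' convention; distance
        # and linkage are 'unknown' here, since the tree comes from the user rather than from
        # anvi'o's own clustering.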
        self.p_meta['default_clustering'] = clustering_id
        self.p_meta['available_clusterings'] = [clustering_id]
        self.p_meta['clusterings'] = {
            clustering_id: {
                'newick':
                ''.join([
                    l.strip()
                    for l in open(os.path.abspath(self.tree)).readlines()
                ])
            }
        }

        self.default_view = self.p_meta['default_view']

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(
                view_data_path, include_first_column=True)
            if not view_data_columns[0] == "contig":
                raise ConfigError("The first row of the first column of the view data file must "
                                  "say 'contig', which is not the case for your view data file "
                                  "('%s'). Please make sure this is a properly formatted view data "
                                  "file." % view_data_path)

            # load view data as the default view:
            self.views[self.default_view] = {
                'header': view_data_columns[1:],
                'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)
            }
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            names_in_the_tree = [n.name for n in tree.get_leaves()]

            ad_hoc_dict = {}
            for item in names_in_the_tree:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {
                'header': ['names'],
                'dict': ad_hoc_dict
            }

        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(
                self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(self.split_names_ordered) - set(
                self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError("Some of the names in your view data do not have corresponding entries in the "
                                  "FASTA file you provided. Here is an example of one of those %d names that occur "
                                  "in your data file, but not in the FASTA file: '%s'" % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.split_names_ordered:
                self.splits_basic_info[split_id] = {
                    'length': len(self.split_sequences[split_id]),
                    'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])
                }

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({
                'db_type': 'profile',
                'merged': True,
                'contigs_db_hash': None,
                'samples': ','.join(self.p_meta['samples'])
            })

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path,
                                            anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path,
                                                   anvio.__profile__version__)

        if self.title:
            self.title = self.title
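Manual mode therefore needs, at minimum, a newick tree; if a view data file is given as well, its first column header must literally be 'contig', and its rows must match the leaf names of the tree. Below is a minimal sketch that generates a matching pair of input files; all file, item, and sample names are made up for illustration:

# generate a toy newick tree and a matching TAB-delimited view data file for
# anvi'o's manual mode. names and paths here are hypothetical.
items = ['split_01', 'split_02', 'split_03']

with open('tree.txt', 'w') as f:
    f.write('((%s,%s),%s);\n' % tuple(items))

with open('view_data.txt', 'w') as f:
    f.write('contig\tsample_01\tsample_02\n')  # the first header cell must be 'contig'
    for i, item in enumerate(items):
        f.write('%s\t%d\t%d\n' % (item, i + 1, (i + 1) * 10))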