Ejemplo n.º 1
0
    def sanity_check(self):
        """Validate the inputs and stage copies of them in the output directory.

        Falls back to the default distance metric / linkage method when none
        is set and asks `clustering` whether the pair is compatible.  The view
        data file is copied unconditionally; the tree, additional view data
        and samples info files are copied only when their path attribute is
        set.  Each copied attribute is re-pointed at the staged copy, and
        `self.sanity_checked` is set to True at the end.
        """
        self.distance = self.distance or constants.distance_metric_default
        self.linkage = self.linkage or constants.linkage_method_default

        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        filesnpaths.is_file_tab_delimited(self.view_data_path)
        if self.tree_file_path:
            filesnpaths.is_proper_newick(self.tree_file_path)

        self.check_output_directory()

        # the view data is always staged
        staged_view_data = self.get_output_file_path('view_data.txt')
        shutil.copyfile(self.view_data_path, staged_view_data)
        self.view_data_path = staged_view_data

        # the remaining inputs are staged only when they were provided
        optional_inputs = (
            ('tree_file_path', 'tree.txt'),
            ('additional_view_data_file_path', 'additional_view_data.txt'),
            ('samples_info_file_path', 'anvio_samples_info.txt'),
        )
        for attribute, staged_name in optional_inputs:
            source_path = getattr(self, attribute)
            if not source_path:
                continue

            staged_path = self.get_output_file_path(staged_name)
            shutil.copyfile(source_path, staged_path)
            setattr(self, attribute, staged_path)

        self.sanity_checked = True
Ejemplo n.º 2
0
    def __init__(self, args=None, p=progress, r=run):
        """Read the optional samples-information and input files named in `args`.

        Parameters:
            args: an object carrying its options as attributes (its `__dict__`
                is consulted), a plain dict, or None.  The entries looked up
                are `input_file`, `samples_information`,
                `max_num_unique_positions` and `output_file`; an entry that is
                missing or set to None is stored as None.
            p: progress object used while the input file is being read.
            r: run object used to print the summary lines.
        """
        self.args = args

        self.run = r
        self.progress = p

        self.samples = None
        self.samples_information_dict = None
        self.data = None

        # The previous default (`{}`) could never work because a dict has no
        # `__dict__`; accept None and plain dicts explicitly instead.
        if args is None:
            known_args = {}
        elif isinstance(args, dict):
            known_args = args
        else:
            known_args = vars(args)

        def A(key, transform):
            # an option that is present but unset (None) must not be pushed
            # through `int`, which would raise a TypeError
            value = known_args.get(key)
            return None if value is None else transform(value)

        null = lambda x: x
        self.input_file_path = A('input_file', null)
        self.samples_information_path = A('samples_information', null)
        self.max_num_unique_positions = A('max_num_unique_positions', int)
        self.output_file_path = A('output_file', null)

        filesnpaths.is_output_file_writable(self.output_file_path)

        if self.samples_information_path:
            filesnpaths.is_file_tab_delimited(self.samples_information_path)
            self.samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(self.samples_information_path)

            # number of attributes of the first sample; 0 when no sample was read.
            # (`.values()[0]` only works on Python 2 and fails on an empty dict.)
            first_sample = next(iter(self.samples_information_dict.values()), ())
            num_attributes = len(first_sample)

            self.run.info('samples_information', '%d attributes read for %d samples' % (num_attributes, len(self.samples_information_dict)))

        if self.input_file_path:
            filesnpaths.is_file_tab_delimited(self.input_file_path)
            self.progress.new('Reading the input file')
            self.progress.update('...')
            self.data = utils.get_TAB_delimited_file_as_dictionary(self.input_file_path)
            self.progress.end()

            self.run.info('input_file', '%d entries read' % len(self.data))
Ejemplo n.º 3
0
def get_newick_tree_data(observation_matrix_path, output_file_name = None, clustering_distance='euclidean',
                         clustering_method = 'complete', norm = 'l1', progress = progress):
    """Cluster the rows of a TAB-delimited matrix and return the tree in newick format.

    Parameters:
        observation_matrix_path: path of the TAB-delimited matrix to cluster.
        output_file_name: optional path; when given, the newick string
            (stripped, followed by a newline) is also written there.
        clustering_distance, clustering_method: forwarded, in that order, to
            `get_clustering_as_tree`.
        norm: forwarded to `get_normalized_vectors`.
        progress: progress object forwarded to the helpers.

    Returns:
        The newick string produced by `get_tree_object_in_newick`.

    Raises:
        ConfigError: if the directory of `output_file_name` is not writable.
    """
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    if output_file_name:
        output_file_name = os.path.abspath(output_file_name)
        output_directory = os.path.dirname(output_file_name)
        if not os.access(output_directory, os.W_OK):
            raise ConfigError("You do not have write permission for the output directory: '%s'" % output_directory)

    # only the id -> sample mapping and the vectors are needed here
    id_to_sample_dict, _, _, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path)

    vectors = np.array(vectors)

    # normalize vectors:
    vectors = get_normalized_vectors(vectors, norm=norm, progress=progress)

    tree = get_clustering_as_tree(vectors, clustering_distance, clustering_method, progress)
    newick = get_tree_object_in_newick(tree, id_to_sample_dict)

    if output_file_name:
        # the context manager guarantees the file is flushed and closed
        with open(output_file_name, 'w') as output_file:
            output_file.write(newick.strip() + '\n')

    return newick
Ejemplo n.º 4
0
    def populate_from_file(self,
                           additional_data_file_path,
                           skip_check_names=None):
        """Read a TAB-delimited additional data file and add its content for `self.target`.

        When `skip_check_names` is left as None and the database at
        `self.db_path` is a blank profile, name checks are skipped.  The data
        is handed to `OrderDataBaseClass.add` when the target is
        'layer_orders' and to `AdditionalDataBaseClass.add` otherwise.

        Raises:
            ConfigError: if no columns are found in the file.
        """
        # FIXME: blank anvi'o profiles do not know what items they have, hence the
        #        utils.get_all_item_names_from_the_database function eventually explodes
        #        if name checks are not skipped for them.
        if skip_check_names is None and utils.is_blank_profile(self.db_path):
            skip_check_names = True

        source_path = additional_data_file_path
        filesnpaths.is_file_tab_delimited(source_path)

        keys = utils.get_columns_of_TAB_delim_file(source_path)
        entries = utils.get_TAB_delimited_file_as_dictionary(source_path)

        if len(keys) == 0:
            raise ConfigError("There is something wrong with the additional data file for %s at %s.\
                               It does not seem to have any additional keys for data :/" \
                                            % (self.target, source_path))

        if self.target != 'layer_orders':
            AdditionalDataBaseClass.add(self, entries, keys, skip_check_names)
            return

        OrderDataBaseClass.add(self, entries, skip_check_names)
Ejemplo n.º 5
0
    def __init__(self, args = None):
        """Set up the object and, when `args` is given, read the contigs/positions file.

        With `args`, `args.input_file` must exist.  If `args.list_contigs` is
        set, `self.list_contigs()` is called and the process exits.  Otherwise
        `args.contigs_and_positions` is read as a two-column TAB-delimited
        file (contig name, comma-separated integer positions) into
        `self.contigs_and_positions` as `{contig_name: set(positions)}`.

        Raises:
            ConfigError: if the positions of a contig are not comma-separated
                integers.
        """
        self.args = args
        self.input_file_path = None
        self.contigs_and_positions = {}

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if args:
            filesnpaths.is_file_exists(args.input_file)
            self.input_file_path = args.input_file

            if args.list_contigs:
                self.list_contigs()
                sys.exit()

            filesnpaths.is_file_exists(args.contigs_and_positions)
            filesnpaths.is_file_tab_delimited(args.contigs_and_positions, expected_number_of_fields = 2)

            # the context manager closes the file even if a line is malformed
            with open(args.contigs_and_positions) as contigs_and_positions_file:
                for line in contigs_and_positions_file:
                    contig_name, positions = line.split('\t')

                    try:
                        # int() tolerates the trailing newline of the last field
                        positions = [int(pos) for pos in positions.split(',')]
                    except ValueError:
                        raise ConfigError('Positions for contig "%s" does not seem to be comma-separated integers...' % contig_name)

                    self.contigs_and_positions[contig_name] = set(positions)

        self.bam = None
        self.linkmers = None
        self.linkmers = None
Ejemplo n.º 6
0
def get_newick_tree_data(observation_matrix_path,
                         output_file_name=None,
                         clustering_distance='euclidean',
                         clustering_method='complete',
                         norm='l1',
                         progress=progress):
    """Cluster the rows of a TAB-delimited matrix and return the tree in newick format.

    Parameters:
        observation_matrix_path: path of the TAB-delimited matrix to cluster.
        output_file_name: optional path; when given, the newick string
            (stripped, followed by a newline) is also written there.
        clustering_distance, clustering_method: forwarded, in that order, to
            `get_clustering_as_tree`.
        norm: forwarded to `get_normalized_vectors`.
        progress: progress object forwarded to the helpers.

    Returns:
        The newick string produced by `get_tree_object_in_newick`.

    Raises:
        ConfigError: if the directory of `output_file_name` is not writable.
    """
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    if output_file_name:
        output_file_name = os.path.abspath(output_file_name)
        output_directory = os.path.dirname(output_file_name)
        if not os.access(output_directory, os.W_OK):
            raise ConfigError("You do not have write permission for the output directory: '%s'" % output_directory)

    # only the id -> sample mapping and the vectors are needed here
    id_to_sample_dict, _, _, vectors = utils.get_vectors_from_TAB_delim_matrix(
        observation_matrix_path)

    vectors = np.array(vectors)

    # normalize vectors:
    vectors = get_normalized_vectors(vectors, norm=norm, progress=progress)

    tree = get_clustering_as_tree(vectors, clustering_distance,
                                  clustering_method, progress)
    newick = get_tree_object_in_newick(tree, id_to_sample_dict)

    if output_file_name:
        # the context manager guarantees the file is flushed and closed
        with open(output_file_name, 'w') as output_file:
            output_file.write(newick.strip() + '\n')

    return newick
Ejemplo n.º 7
0
def get_newick_tree_data(observation_matrix_path,
                         output_file_name=None,
                         linkage=constants.linkage_method_default,
                         distance=constants.distance_metric_default,
                         norm='l1',
                         progress=progress,
                         transpose=False):
    """Cluster the rows of a TAB-delimited matrix and return the tree in newick format.

    Parameters:
        observation_matrix_path: path of the TAB-delimited matrix to cluster.
        output_file_name: optional path; when given, the newick string
            (stripped, followed by a newline) is also written there.
        linkage, distance: checked for compatibility first, then forwarded,
            in that order, to `get_clustering_as_tree`.
        norm: forwarded to `get_normalized_vectors`.
        progress: progress object forwarded to the helpers.
        transpose: forwarded to `utils.get_vectors_from_TAB_delim_matrix`.

    Returns:
        The newick string produced by `get_tree_object_in_newick`.

    Raises:
        ConfigError: if the directory of `output_file_name` is not writable.
    """

    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    if output_file_name:
        output_file_name = os.path.abspath(output_file_name)
        output_directory = os.path.dirname(output_file_name)
        if not os.access(output_directory, os.W_OK):
            raise ConfigError("You do not have write permission for the output directory: '%s'" % output_directory)

    # only the id -> sample mapping and the vectors are needed here
    id_to_sample_dict, _, _, vectors = utils.get_vectors_from_TAB_delim_matrix(
        observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    # normalize vectors:
    vectors = get_normalized_vectors(vectors, norm=norm, progress=progress)

    tree = get_clustering_as_tree(vectors, linkage, distance, progress)
    newick = get_tree_object_in_newick(tree, id_to_sample_dict)

    if output_file_name:
        # the context manager guarantees the file is flushed and closed
        with open(output_file_name, 'w') as output_file:
            output_file.write(newick.strip() + '\n')

    return newick
Ejemplo n.º 8
0
    def __init__(self, args = None):
        """Set up the object and, when `args` is given, read the contigs/positions file.

        With `args`, `args.input_file` must exist.  If `args.list_contigs` is
        set, `self.list_contigs()` is called and the process exits.  Otherwise
        `args.contigs_and_positions` is read as a two-column TAB-delimited
        file (contig name, comma-separated integer positions) into
        `self.contigs_and_positions` as `{contig_name: set(positions)}`.

        Raises:
            ConfigError: if the positions of a contig are not comma-separated
                integers.
        """
        self.args = args
        self.input_file_path = None
        self.contigs_and_positions = {}

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if args:
            filesnpaths.is_file_exists(args.input_file)
            self.input_file_path = args.input_file

            if args.list_contigs:
                self.list_contigs()
                sys.exit()

            filesnpaths.is_file_exists(args.contigs_and_positions)
            filesnpaths.is_file_tab_delimited(args.contigs_and_positions, expected_number_of_fields = 2)

            # the context manager closes the file even if a line is malformed
            with open(args.contigs_and_positions) as contigs_and_positions_file:
                for line in contigs_and_positions_file:
                    contig_name, positions = line.split('\t')

                    try:
                        # int() tolerates the trailing newline of the last field
                        positions = [int(pos) for pos in positions.split(',')]
                    except ValueError:
                        raise ConfigError('Positions for contig "%s" does not seem to be comma-separated integers...' % contig_name)

                    self.contigs_and_positions[contig_name] = set(positions)

        self.bam = None
        self.linkmers = None
Ejemplo n.º 9
0
    def sanity_check(self):
        """Validate the inputs and stage copies of them in the output directory.

        The view data file is checked and copied unconditionally; the tree
        (also checked for being proper newick), additional view data and
        samples info files are copied only when their path attribute is set.
        Each copied attribute is re-pointed at the staged copy, and
        `self.sanity_checked` is set to True at the end.
        """
        filesnpaths.is_file_tab_delimited(self.view_data_path)
        if self.tree_file_path:
            filesnpaths.is_proper_newick(self.tree_file_path)

        self.check_output_directory()

        # the view data is always staged
        staged_view_data = self.get_output_file_path('view_data.txt')
        shutil.copyfile(self.view_data_path, staged_view_data)
        self.view_data_path = staged_view_data

        # the remaining inputs are staged only when they were provided
        optional_inputs = (
            ('tree_file_path', 'tree.txt'),
            ('additional_view_data_file_path', 'additional_view_data.txt'),
            ('samples_info_file_path', 'anvio_samples_info.txt'),
        )
        for attribute, staged_name in optional_inputs:
            source_path = getattr(self, attribute)
            if not source_path:
                continue

            staged_path = self.get_output_file_path(staged_name)
            shutil.copyfile(source_path, staged_path)
            setattr(self, attribute, staged_path)

        self.sanity_checked = True
Ejemplo n.º 10
0
def create_newick_file_from_matrix_file(
        observation_matrix_path,
        output_file_path,
        linkage=constants.linkage_method_default,
        distance=constants.distance_metric_default,
        norm='l1',
        progress=progress,
        transpose=False,
        items_order_file_path=None):
    """Cluster a TAB-delimited matrix and write the resulting newick tree to a file.

    Parameters:
        observation_matrix_path: path of the TAB-delimited matrix to cluster.
        output_file_path: where the newick string (stripped, followed by a
            newline) is written.
        linkage, distance: checked for compatibility, then passed to
            `get_newick_from_matrix`.
        norm: passed to `get_newick_from_matrix`.
        progress: accepted for interface compatibility; not used here.
        transpose: forwarded to `utils.get_vectors_from_TAB_delim_matrix`.
        items_order_file_path: optional path; when given, the item names in
            the order reported by `utils.get_names_order_from_newick_tree`
            are written there, one per line.

    Returns:
        None.
    """
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    filesnpaths.is_output_file_writable(output_file_path)
    if items_order_file_path:
        filesnpaths.is_output_file_writable(items_order_file_path)

    # only the id -> sample mapping and the vectors are needed here
    id_to_sample_dict, _, _, vectors = utils.get_vectors_from_TAB_delim_matrix(
        observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm,
                                    id_to_sample_dict)

    # context managers guarantee both files are flushed and closed
    if output_file_path:
        with open(output_file_path, 'w') as newick_file:
            newick_file.write(newick.strip() + '\n')

    if items_order_file_path:
        names_in_order = utils.get_names_order_from_newick_tree(newick)
        with open(items_order_file_path, 'w') as items_order_file:
            items_order_file.write('\n'.join(names_in_order) + '\n')
Ejemplo n.º 11
0
    def sanity_check(self):
        """Validate the inputs and stage copies of them in the output directory.

        Falls back to the default distance metric / linkage method when none
        is set and asks `clustering` whether the pair is compatible.  The view
        data file is copied unconditionally; the tree, additional view data
        and samples info files are copied only when their path attribute is
        set.  Each copied attribute is re-pointed at the staged copy, and
        `self.sanity_checked` is set to True at the end.
        """
        self.distance = self.distance or constants.distance_metric_default
        self.linkage = self.linkage or constants.linkage_method_default

        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        filesnpaths.is_file_tab_delimited(self.view_data_path)
        if self.tree_file_path:
            filesnpaths.is_proper_newick(self.tree_file_path)

        self.check_output_directory()

        # the view data is always staged
        staged_view_data = self.get_output_file_path('view_data.txt')
        shutil.copyfile(self.view_data_path, staged_view_data)
        self.view_data_path = staged_view_data

        # the remaining inputs are staged only when they were provided
        optional_inputs = (
            ('tree_file_path', 'tree.txt'),
            ('additional_view_data_file_path', 'additional_view_data.txt'),
            ('samples_info_file_path', 'anvio_samples_info.txt'),
        )
        for attribute, staged_name in optional_inputs:
            source_path = getattr(self, attribute)
            if not source_path:
                continue

            staged_path = self.get_output_file_path(staged_name)
            shutil.copyfile(source_path, staged_path)
            setattr(self, attribute, staged_path)

        self.sanity_checked = True
Ejemplo n.º 12
0
    def parse_genes(self, comma_delimited_genes=None, genes_filepath=None):
        """Return the requested gene caller ids as a set of integers.

        Parameters:
            comma_delimited_genes: string of comma-separated gene caller ids;
                takes precedence over `genes_filepath` when both are given.
            genes_filepath: path of a single-column file with one gene caller
                id per line.

        Returns:
            A set of ints.  It is empty when neither argument is provided
            (previously this case failed with an unbound local variable).

        Raises:
            ConfigError: if any gene caller id is not an integer.
        """
        genes = set()

        if comma_delimited_genes:
            gene_caller_ids = set(x.strip() for x in comma_delimited_genes.split(','))
            for gene in gene_caller_ids:
                try:
                    genes.add(int(gene))
                except ValueError:
                    # only a failed conversion means a bad id; anything else
                    # (e.g. KeyboardInterrupt) must propagate untouched
                    raise ConfigError(
                        "Anvi'o does not like your gene caller id '%s'..." %
                        str(gene))

        elif genes_filepath:
            filesnpaths.is_file_tab_delimited(genes_filepath,
                                              expected_number_of_fields=1)

            try:
                # the context manager closes the file even when parsing fails
                with open(genes_filepath) as genes_file:
                    genes = set(int(line.strip()) for line in genes_file)
            except ValueError:
                raise ConfigError(
                    "Well. Anvi'o was working on your genes in `%s` ... and ... those gene IDs did not\
                                   look like anvi'o gene caller ids :/ Anvi'o is now sad."
                    % genes_filepath)

        return genes
Ejemplo n.º 13
0
def get_newick_tree_data(observation_matrix_path, output_file_name=None, linkage=constants.linkage_method_default,
                         distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False):
    """Cluster the rows of a TAB-delimited matrix and return the tree in newick format.

    Parameters:
        observation_matrix_path: path of the TAB-delimited matrix to cluster.
        output_file_name: optional path; when given, the newick string
            (stripped, followed by a newline) is also written there.
        linkage, distance: checked for compatibility first, then forwarded,
            in that order, to `get_clustering_as_tree`.
        norm: forwarded to `get_normalized_vectors`.
        progress: progress object forwarded to the helpers.
        transpose: forwarded to `utils.get_vectors_from_TAB_delim_matrix`.

    Returns:
        The newick string produced by `get_tree_object_in_newick`.

    Raises:
        ConfigError: if the directory of `output_file_name` is not writable.
    """

    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)

    if output_file_name:
        output_file_name = os.path.abspath(output_file_name)
        output_directory = os.path.dirname(output_file_name)
        if not os.access(output_directory, os.W_OK):
            raise ConfigError("You do not have write permission for the output directory: '%s'" % output_directory)

    # only the id -> sample mapping and the vectors are needed here
    id_to_sample_dict, _, _, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    # normalize vectors:
    vectors = get_normalized_vectors(vectors, norm=norm, progress=progress)

    tree = get_clustering_as_tree(vectors, linkage, distance, progress)
    newick = get_tree_object_in_newick(tree, id_to_sample_dict)

    if output_file_name:
        # the context manager guarantees the file is flushed and closed
        with open(output_file_name, 'w') as output_file:
            output_file.write(newick.strip() + '\n')

    return newick
Ejemplo n.º 14
0
def get_vectors_from_TAB_delim_matrix(file_path, cols_to_return=None, rows_to_return=None, transpose=False):
    """Read a TAB-delimited matrix into a list of float vectors, one per row.

    The first line is the header and the first field of every line is the
    row name.

    Parameters:
        file_path: path of the TAB-delimited matrix.
        cols_to_return: optional list of column names to keep, in that order.
            When empty or None, the columns for which `IS_ESSENTIAL_FIELD`
            is true are kept.
        rows_to_return: optional iterable of row names to keep; when empty
            or None every row is kept.
        transpose: when True, the file is first transposed into a temporary
            file, which is removed again before returning.

    Returns:
        A tuple `(id_to_sample_dict, sample_to_id_dict, columns, vectors)`:
        row index -> row name, its inverse, the kept column names, and the
        list of float vectors.

    Raises:
        ConfigError: if no column is left after the selection.
    """
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_tab_delimited(file_path)

    if transpose:
        transposed_file_path = filesnpaths.get_temp_file_path()
        transpose_tab_delimited_file(file_path, transposed_file_path)
        file_path = transposed_file_path

    rows_to_return = set(rows_to_return or ())
    vectors = []
    id_to_sample_dict = {}

    try:
        with open(file_path) as input_matrix:
            columns = input_matrix.readline().strip().split("\t")[1:]

            if cols_to_return:
                fields_of_interest = [columns.index(col) for col in cols_to_return]
            else:
                fields_of_interest = [f for f in range(0, len(columns)) if IS_ESSENTIAL_FIELD(columns[f])]

            # update columns:
            columns = [columns[i] for i in fields_of_interest]

            if not columns:
                # `cols_to_return` may be None here, so its length has to be guarded
                raise ConfigError("Only a subset (%d) of fields were requested by the caller, but none of them was found\
                            in the matrix (%s) :/" % (len(cols_to_return or ()), file_path))

            for line in input_matrix:
                cells = line.strip().split("\t")
                row_name = cells[0]
                if rows_to_return and row_name not in rows_to_return:
                    continue

                fields = cells[1:]

                # ids are assigned consecutively to the rows that are kept
                id_to_sample_dict[len(vectors)] = row_name
                # `fields_of_interest` cannot be empty past the check above
                vectors.append([float(fields[i]) for i in fields_of_interest])
    finally:
        if transpose:
            # remove clutter, also when reading failed
            os.remove(file_path)

    sample_to_id_dict = dict((v, k) for k, v in id_to_sample_dict.items())

    return id_to_sample_dict, sample_to_id_dict, columns, vectors
Ejemplo n.º 15
0
    def load_from_files(self, args):
        """Populate the object from individual files instead of a RUNINFO dict.

        Requires `self.fasta_file`, `self.metadata`, `self.tree` and
        `self.output_dir` to be set, and `self.view` / `self.show_views` to be
        unset.  Fills `self.p_meta`, registers the metadata file as the
        default view, loads the split sequences, builds a mock
        `self.splits_basic_info` (length and GC content per split) and
        finally creates the output directory.

        Parameters:
            args: not used by this method.

        Raises:
            ConfigError: if a required input is missing, if a view-related
                option is set, or if the first column header of the metadata
                file is not 'contig'.
        """
        if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
            raise ConfigError("If you do not have a RUNINFO dict, you must declare each of\
                                           '-f', '-m', '-t' and '-o' parameters. Please see '--help' for\
                                           more detailed information on them.")

        if self.view:
            raise ConfigError("You can't use '-v' parameter when this program is not called with a RUNINFO.cp")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show when there is no RUNINFO.cp :/")

        metadata_path = os.path.abspath(self.metadata)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
        self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
        self.p_meta['views'] = {}
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']

        # read the tree through a context manager so the handle is closed
        with open(os.path.abspath(self.tree)) as tree_file:
            newick = tree_file.read()
        self.p_meta['clusterings'] = {'default': {'newick': newick}}

        self.default_view = self.p_meta['default_view']

        if self.summary_index:
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
            self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

        # sanity of the metadata
        filesnpaths.is_file_tab_delimited(metadata_path)
        metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
        if not metadata_columns[0] == "contig":
            raise ConfigError("The first row of the first column of the metadata file must\
                                      say 'contig', which is not the case for your metadata file\
                                      ('%s'). Please make sure this is a properly formatted metadata\
                                      file." % (metadata_path))

        # store metadata as view:
        self.views[self.default_view] = {'header': metadata_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
        # materialize the keys so this is a list on Python 3 as well
        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # setup a mock splits_basic_info dict
        self.splits_basic_info = {}
        for split_id in self.split_names_ordered:
            sequence = self.split_sequences[split_id]
            self.splits_basic_info[split_id] = {'length': len(sequence),
                                                'gc_content': utils.get_GC_content_for_sequence(sequence)}

        # reminder: this is being stored in the output dir provided as a commandline parameter:
        self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

        filesnpaths.gen_output_directory(self.p_meta['output_dir'])
Ejemplo n.º 16
0
    def sanity_check(self):
        """Check the metadata file, the output directory and the output FASTA descriptor.

        The output directory is generated when it does not exist yet;
        otherwise it is checked for being writable.
        """
        filesnpaths.is_file_tab_delimited(self.metadata_file_path)

        output_directory = self.output_directory_path
        if not os.path.exists(output_directory):
            filesnpaths.gen_output_directory(output_directory)
        else:
            filesnpaths.is_output_dir_writable(output_directory)

        filesnpaths.is_output_file_writable(self.output_fasta_descriptor)
Ejemplo n.º 17
0
def get_vectors_from_TAB_delim_matrix(file_path, cols_to_return=None, rows_to_return = None, transpose = False):
    """Read a TAB-delimited matrix into a list of float vectors, one per row.

    The first line is the header and the first field of every line is the
    row name.

    Parameters:
        file_path: path of the TAB-delimited matrix.
        cols_to_return: optional list of column names to keep, in that order.
            When empty or None, the columns for which `IS_ESSENTIAL_FIELD`
            is true are kept.
        rows_to_return: optional iterable of row names to keep; when empty
            or None every row is kept.
        transpose: when True, the file is first transposed into a temporary
            file, which is removed again before returning.

    Returns:
        A tuple `(id_to_sample_dict, sample_to_id_dict, columns, vectors)`:
        row index -> row name, its inverse, the kept column names, and the
        list of float vectors.

    Raises:
        ConfigError: if no column is left after the selection.
    """
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_tab_delimited(file_path)

    if transpose:
        transposed_file_path = filesnpaths.get_temp_file_path()
        transpose_tab_delimited_file(file_path, transposed_file_path)
        file_path = transposed_file_path

    rows_to_return = set(rows_to_return or ())
    vectors = []
    id_to_sample_dict = {}

    try:
        # NOTE(review): 'rU' (universal newlines) is kept for Python 2; the 'U'
        # flag is rejected by Python 3.11+, so drop it when porting.
        with open(file_path, 'rU') as input_matrix:
            columns = input_matrix.readline().strip().split('\t')[1:]

            if cols_to_return:
                fields_of_interest = [columns.index(col) for col in cols_to_return]
            else:
                fields_of_interest = [f for f in range(0, len(columns)) if IS_ESSENTIAL_FIELD(columns[f])]

            # update columns:
            columns = [columns[i] for i in fields_of_interest]

            if not columns:
                # `cols_to_return` may be None here, so its length has to be guarded
                raise ConfigError("Only a subset (%d) of fields were requested by the caller, but none of them was found\
                            in the matrix (%s) :/" % (len(cols_to_return or ()), file_path))

            for line in input_matrix:
                cells = line.strip().split('\t')
                row_name = cells[0]
                if rows_to_return and row_name not in rows_to_return:
                    continue

                fields = cells[1:]

                # ids are assigned consecutively to the rows that are kept
                id_to_sample_dict[len(vectors)] = row_name
                # `fields_of_interest` cannot be empty past the check above
                vectors.append([float(fields[i]) for i in fields_of_interest])
    finally:
        if transpose:
            # remove clutter, also when reading failed
            os.remove(file_path)

    sample_to_id_dict = dict((v, k) for k, v in id_to_sample_dict.items())

    return id_to_sample_dict, sample_to_id_dict, columns, vectors
Ejemplo n.º 18
0
    def init(self):
        """Load the workflow configuration: samples, run switches, references and collections.

        After the parent `init`, reads samples_txt into a pandas DataFrame
        (it must contain a 'sample' column), pulls the individual parameters
        from the config, loads the optional references-for-removal and
        collections files, runs the remaining `init_*` steps and finally
        fills `self.profile_databases` with one PROFILE.db path per group.

        Raises:
            ConfigError: if samples_txt cannot be parsed or lacks a 'sample'
                column, or if anvi-summarize / anvi-split is requested
                without a collections_txt.
        """
        super().init()

        get_param = self.get_param_value_from_config

        # loading the samples.txt file
        self.samples_txt_file = get_param(['samples_txt'])
        filesnpaths.is_file_tab_delimited(self.samples_txt_file)
        try:
            # getting the samples information (names, [group], path to r1, path to r2) from samples.txt
            self.samples_information = pd.read_csv(self.samples_txt_file, sep='\t', index_col=False)
        except IndexError as e:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "This is what we know: '%s'" % (self.samples_txt_file, e))

        column_titles = list(self.samples_information.columns)
        if 'sample' not in column_titles:
            raise ConfigError("Looks like your samples_txt file, '%s', is not properly formatted. "
                              "We are not sure what's wrong, but we can't find a column with title 'sample'." % self.samples_txt_file)

        # get a list of the sample names
        self.sample_names = list(self.samples_information['sample'])
        self.run_metaspades = get_param(['metaspades', 'run'])
        self.use_scaffold_from_metaspades = get_param(['metaspades', 'use_scaffolds'])
        self.use_scaffold_from_idba_ud = get_param(['idba_ud', 'use_scaffolds'])
        # the explicit comparison with True is intentional: only a value equal
        # to True switches these steps on
        self.run_qc = get_param(['iu_filter_quality_minoche', 'run']) == True
        self.run_summary = get_param(['anvi_summarize', 'run']) == True
        self.run_split = get_param(['anvi_split', 'run']) == True
        self.references_mode = get_param('references_mode')
        self.fasta_txt_file = get_param('fasta_txt')
        self.profile_databases = {}

        self.references_for_removal_txt = get_param(['remove_short_reads_based_on_references',
                                                     'references_for_removal_txt'])
        if self.references_for_removal_txt:
            self.load_references_for_removal()

        self.collections_txt = get_param('collections_txt')
        if self.collections_txt:
            self.load_collections()
        elif self.run_summary:
            raise ConfigError('If you want to run anvi-summarize you must provide a collections_txt file')
        elif self.run_split:
            raise ConfigError('If you want to run anvi-split you must provide a collections_txt file')

        self.init_samples_txt()
        self.init_kraken()
        self.init_refereces_txt()

        # Set the PROFILE databases paths variable:
        for group in self.group_names:
            if self.group_sizes[group] > 1:
                profile_db_path = os.path.join(self.dirs_dict["MERGE_DIR"], group, "PROFILE.db")
            else:
                # we need to use the single profile if the group is of size 1.
                profile_dir = self.dirs_dict["PROFILE_DIR"]
                in_group = self.samples_information['group']==group
                first_sample = self.samples_information.loc[in_group,'sample'].values[0]
                profile_db_path = os.path.join(profile_dir, group, first_sample, "PROFILE.db")

            self.profile_databases[group] = profile_db_path
Ejemplo n.º 19
0
def transpose_tab_delimited_file(input_file_path, output_file_path):
    """Write the transpose of a TAB-delimited file and return the output path.

    Parameters:
        input_file_path: path of the TAB-delimited file to transpose.
        output_file_path: path the transposed content is written to.

    Returns:
        `output_file_path`.
    """
    filesnpaths.is_file_exists(input_file_path)
    filesnpaths.is_file_tab_delimited(input_file_path)
    filesnpaths.is_output_file_writable(output_file_path)

    # NOTE(review): 'rU' (universal newlines) is kept for Python 2; the 'U'
    # flag is rejected by Python 3.11+, so drop it when porting.
    with open(input_file_path, 'rU') as input_file:
        file_content = [line.strip('\n').split('\t') for line in input_file]

    # zip() stops at the shortest row, so ragged input is truncated to the
    # width of its shortest line
    with open(output_file_path, 'w') as output_file:
        for entry in zip(*file_content):
            output_file.write('\t'.join(entry) + '\n')

    return output_file_path
Ejemplo n.º 20
0
def transpose_tab_delimited_file(input_file_path, output_file_path):
    """Write the transpose of a TAB-delimited file and return the output path.

    Parameters:
        input_file_path: path of the TAB-delimited file to transpose.
        output_file_path: path the transposed content is written to.

    Returns:
        `output_file_path`.
    """
    filesnpaths.is_file_exists(input_file_path)
    filesnpaths.is_file_tab_delimited(input_file_path)
    filesnpaths.is_output_file_writable(output_file_path)

    # context managers close both handles even when an error occurs
    with open(input_file_path) as input_file:
        file_content = [line.strip('\n').split('\t') for line in input_file]

    # zip() stops at the shortest row, so ragged input is truncated to the
    # width of its shortest line
    with open(output_file_path, 'w') as output_file:
        for entry in zip(*file_content):
            output_file.write('\t'.join(entry) + '\n')

    return output_file_path
Ejemplo n.º 21
0
def create_newick_file_from_matrix_file(observation_matrix_path, output_file_name, linkage=constants.linkage_method_default,
                         distance=constants.distance_metric_default, norm='l1', progress=progress, transpose=False):
    """Cluster a TAB-delimited observation matrix and store the tree as newick.

    The matrix is read with `utils.get_vectors_from_TAB_delim_matrix`
    (optionally transposed), converted to a numpy array, and handed to
    `get_newick_from_matrix` together with the distance metric, linkage
    method and norm. The resulting newick string is written to
    `output_file_name` followed by a single newline.

    Nothing is returned. `progress` is accepted but not used in this function.
    """
    is_distance_and_linkage_compatible(distance, linkage)
    filesnpaths.is_file_exists(observation_matrix_path)
    filesnpaths.is_file_tab_delimited(observation_matrix_path)
    filesnpaths.is_output_file_writable(output_file_name)

    # only the id -> sample mapping and the vectors are needed here
    id_to_sample_dict, _, _, vectors = utils.get_vectors_from_TAB_delim_matrix(observation_matrix_path, transpose=transpose)

    vectors = np.array(vectors)

    newick = get_newick_from_matrix(vectors, distance, linkage, norm, id_to_sample_dict)

    if output_file_name:
        # use a context manager so the handle is flushed and closed deterministically
        with open(output_file_name, 'w') as output_file:
            output_file.write(newick.strip() + '\n')
Ejemplo n.º 22
0
    def get_internal_and_external_genomes_files(self):
        """Resolve and sanity check the genomes files named in the config.

        Reads the `internal_genomes`, `external_genomes` and `fasta_txt`
        parameters from the config and returns a dict with the keys
        `internal_genomes_file` and `external_genomes_file`. When only one of
        the two files is configured, both keys point to that one file.

        Raises:
            ConfigError: if neither genomes file is configured, or if a
                `fasta_txt` is configured without an `external_genomes` name.
        """
        internal_genomes_file = self.get_param_value_from_config(
            'internal_genomes')
        external_genomes_file = self.get_param_value_from_config(
            'external_genomes')

        if not internal_genomes_file and not external_genomes_file:
            raise ConfigError(
                'You must provide either an external genomes file or internal genomes file'
            )

        fasta_txt_file = self.get_param_value_from_config('fasta_txt')
        if fasta_txt_file:
            if not external_genomes_file:
                raise ConfigError(
                    "You provided a fasta_txt, but didn't specify a path for an external-genomes file. "
                    "If you wish to use external genomes, you must specify a name for the external-genomes "
                    "file, using the `external_genomes` parameter in your config file. Just to clarify: "
                    "the external genomes file DOESN'T HAVE TO EXIST. Anvi'o can create it for you by "
                    "using the information you supplied in the `fasta_txt` file, but you still must specify "
                    "a name for the external-genomes file. For example, you could use \"external_genomes\": \"external-genomes.txt\" "
                    "(but feel free to be creative with the naming of your external-genomes file)."
                )

            filesnpaths.is_file_tab_delimited(fasta_txt_file)

        # here we do a little trick to make sure the rule can expect either one or both:
        # each key deliberately starts out holding the *other* file, so that when only
        # one of the two is configured both keys still end up with a usable value. The
        # keys are overwritten with their own file below whenever that file is set.
        d = {
            "internal_genomes_file": external_genomes_file,
            "external_genomes_file": internal_genomes_file
        }

        if internal_genomes_file:
            filesnpaths.is_file_tab_delimited(internal_genomes_file)
            d['internal_genomes_file'] = internal_genomes_file

        if external_genomes_file:
            if not filesnpaths.is_file_exists(external_genomes_file,
                                              dont_raise=True):
                run.warning(
                    'There is no file %s. No worries, one will be created for you.'
                    % external_genomes_file)
            else:
                # the file already exists, so it must be a proper TAB-delimited file
                filesnpaths.is_file_tab_delimited(external_genomes_file)

            d['external_genomes_file'] = external_genomes_file

        return d
Ejemplo n.º 23
0
    def __init__(self, args=None):
        """Set up the instance and, when `args` is given, parse the requests.

        With `args`, the BAM input paths are checked and stored as absolute
        paths, and the two-column TAB-delimited `contigs_and_positions` file is
        parsed into `self.contig_and_position_requests_list` as tuples of
        `(request_id, contig_name, set_of_positions)`, where `request_id` is
        the 1-based line number. If `args.list_contigs` is set, contigs are
        listed and the process exits.

        Note that `self.only_complete_links` is only set when `args` is given.

        Raises:
            ConfigError: if the same input file is declared twice, or if the
                positions of a contig are not comma-separated integers.
        """
        self.args = args
        self.input_file_paths = []
        self.contig_and_position_requests_list = []

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if args:
            for input_file_path in args.input_files:
                filesnpaths.is_file_exists(input_file_path)

            self.input_file_paths = [
                os.path.abspath(p.strip()) for p in args.input_files
            ]

            if len(self.input_file_paths) != len(set(self.input_file_paths)):
                raise ConfigError(
                    "You can't declared the same BAM file twice :/")

            self.only_complete_links = args.only_complete_links

            if args.list_contigs:
                self.list_contigs()
                sys.exit()

            filesnpaths.is_file_exists(args.contigs_and_positions)
            filesnpaths.is_file_tab_delimited(args.contigs_and_positions,
                                              expected_number_of_fields=2)

            # the context manager closes the requests file even if parsing fails
            with open(args.contigs_and_positions) as requests_file:
                for request_id, line in enumerate(requests_file, start=1):
                    contig_name, positions = line.split('\t')

                    try:
                        # int() tolerates the trailing newline of the last field
                        positions = [int(pos) for pos in positions.split(',')]
                    except ValueError:
                        raise ConfigError(
                            'Positions for contig "%s" does not seem to be comma-separated integers...'
                            % contig_name)

                    self.contig_and_position_requests_list.append(
                        (request_id, contig_name, set(positions)))

        self.linkmers = None
Ejemplo n.º 24
0
    def get_genes_of_interest(self, genes_of_interest_path=None, gene_caller_ids=None):
        """Return the set of gene caller ids the user is interested in.

        The ids come either from `gene_caller_ids` (a comma-separated string)
        or from `genes_of_interest_path` (a single-column file with one id per
        line). If neither yields any ids, every gene found in the contigs
        database is returned and a warning is printed.

        As a side effect, `self.genes_in_contigs_database` is set to the gene
        caller ids known to the contigs database described by `self.args`.

        Raises:
            ConfigError: if the contigs database has no genes, if both sources
                are given at once, or if an id is not an integer.
        """
        genes_of_interest = None

        # identify the gene caller ids of all genes available
        self.genes_in_contigs_database = set(dbops.ContigsSuperclass(self.args).genes_in_splits.keys())

        if not self.genes_in_contigs_database:
            raise ConfigError("This contigs database does not contain any identified genes...")

        # settling genes of interest
        if genes_of_interest_path and gene_caller_ids:
            raise ConfigError("You can't provide a gene caller id from the command line, and a list of gene caller ids\
                               as a file at the same time, obviously.")

        if gene_caller_ids:
            gene_caller_ids = set([x.strip() for x in gene_caller_ids.split(',')])

            genes_of_interest = set()
            for gene in gene_caller_ids:
                try:
                    genes_of_interest.add(int(gene))
                except ValueError:
                    # int() on a string can only fail with ValueError; a bare except
                    # would also swallow KeyboardInterrupt and friends
                    raise ConfigError("Anvi'o does not like your gene caller id '%s'..." % str(gene))

        elif genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(genes_of_interest_path, expected_number_of_fields=1)

            try:
                with open(genes_of_interest_path) as genes_file:
                    genes_of_interest = set([int(s.strip()) for s in genes_file])
            except ValueError:
                raise ConfigError("Well. Anvi'o was working on your genes of interest ... and ... those gene IDs did not\
                                   look like anvi'o gene caller ids :/ Anvi'o is now sad.")

        if not genes_of_interest:
            # no genes of interest are specified. Assuming all, which could be innumerable--raise warning
            genes_of_interest = self.genes_in_contigs_database
            self.run.warning("You did not specify any genes of interest, so anvi'o will assume all of them are of interest.")

        return genes_of_interest
Ejemplo n.º 25
0
    def __init__(self, args = None):
        """Set up the instance and, when `args` is given, parse the requests.

        With `args`, the BAM input paths are checked and stored as absolute
        paths, and the two-column TAB-delimited `contigs_and_positions` file is
        parsed into `self.contig_and_position_requests_list` as tuples of
        `(request_id, contig_name, set_of_positions)`, where `request_id` is
        the 1-based line number. If `args.list_contigs` is set, contigs are
        listed and the process exits.

        Note that `self.only_complete_links` is only set when `args` is given.

        Raises:
            ConfigError: if the same input file is declared twice, or if the
                positions of a contig are not comma-separated integers.
        """
        self.args = args
        self.input_file_paths = []
        self.contig_and_position_requests_list = []

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if args:
            for input_file_path in args.input_files:
                filesnpaths.is_file_exists(input_file_path)

            self.input_file_paths = [os.path.abspath(p.strip()) for p in args.input_files]

            if len(self.input_file_paths) != len(set(self.input_file_paths)):
                # call-style raise works on both Python 2 and Python 3
                raise ConfigError("You can't declared the same BAM file twice :/")

            self.only_complete_links = args.only_complete_links

            if args.list_contigs:
                self.list_contigs()
                sys.exit()

            filesnpaths.is_file_exists(args.contigs_and_positions)
            filesnpaths.is_file_tab_delimited(args.contigs_and_positions, expected_number_of_fields = 2)

            # the context manager closes the requests file even if parsing fails
            with open(args.contigs_and_positions) as requests_file:
                for request_id, line in enumerate(requests_file, 1):
                    contig_name, positions = line.split('\t')

                    try:
                        # int() tolerates the trailing newline of the last field
                        positions = [int(pos) for pos in positions.split(',')]
                    except ValueError:
                        raise ConfigError('Positions for contig "%s" does not seem to be comma-separated integers...' % contig_name)

                    self.contig_and_position_requests_list.append((request_id, contig_name, set(positions)))

        self.linkmers = None
Ejemplo n.º 26
0
    def parse_genes(self, comma_delimited_genes=None, genes_filepath=None):
        """Return a set of integer gene caller ids from one of two sources.

        `comma_delimited_genes` (a comma-separated string) takes precedence;
        otherwise `genes_filepath` (a single-column file with one id per line)
        is read. When neither is given, an empty set is returned.

        Raises:
            ConfigError: if any id cannot be converted to an integer.
        """
        # start with an empty result so that a call without any source is well defined
        genes = set()

        if comma_delimited_genes:
            gene_caller_ids = set([x.strip() for x in comma_delimited_genes.split(',')])
            for gene in gene_caller_ids:
                try:
                    genes.add(int(gene))
                except ValueError:
                    raise ConfigError("Anvi'o does not like your gene caller id '%s'..." % str(gene))

        elif genes_filepath:
            filesnpaths.is_file_tab_delimited(genes_filepath, expected_number_of_fields=1)

            try:
                with open(genes_filepath) as genes_file:
                    genes = set([int(s.strip()) for s in genes_file])
            except ValueError:
                raise ConfigError("Well. Anvi'o was working on your genes in `%s` ... and ... those gene IDs did not\
                                   look like anvi'o gene caller ids :/ Anvi'o is now sad." % genes_filepath)

        return genes
Ejemplo n.º 27
0
    def __init__(self, args={}, p=progress, r=run):
        """Read the arguments and load the samples information and input file.

        `args` is expected to be a namespace-like object: attributes are looked
        up through `args.__dict__`, and any attribute that is not present is
        treated as None. `p` and `r` are the progress and run objects used for
        terminal output.

        If a samples information path is given it is loaded into
        `self.samples_information_dict`; if an input file path is given it is
        loaded into `self.variable_nts_table`.

        NOTE(review): the default `args={}` is a plain dict, which has no
        `__dict__`, so callers apparently always pass a namespace -- confirm.
        """
        self.args = args

        self.run = r
        self.progress = p

        self.samples = None
        self.samples_information_dict = None
        self.variable_nts_table = None

        # `in` replaces dict.has_key(), which no longer exists on Python 3
        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.input_file_path = A('input_file', null)
        self.samples_information_path = A('samples_information', null)
        self.max_num_unique_positions = A('max_num_unique_positions', int)
        self.output_file_path = A('output_file', null)

        filesnpaths.is_output_file_writable(self.output_file_path)

        if self.samples_information_path:
            filesnpaths.is_file_tab_delimited(self.samples_information_path)
            self.samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(
                self.samples_information_path)

            # peek at the first sample to learn how many attributes each one has;
            # works with dict views as well as lists, and an empty dict gives 0
            first_sample = next(iter(self.samples_information_dict.values()), {})
            num_attributes = len(first_sample)

            self.run.info(
                'samples_information', '%d attributes read for %d samples' %
                (num_attributes, len(self.samples_information_dict)))

        if self.input_file_path:
            filesnpaths.is_file_tab_delimited(self.input_file_path)
            self.progress.new('Reading the input file')
            self.progress.update('...')
            self.variable_nts_table = utils.get_TAB_delimited_file_as_dictionary(
                self.input_file_path)
            self.progress.end()

            self.run.info('input_file',
                          '%d entries read' % len(self.variable_nts_table))
Ejemplo n.º 28
0
    def sanity_check(self):
        """Validate the input files and copy them into the output directory.

        The view data must be TAB-delimited and the tree, when present, must
        be proper newick. After the output directory is checked, the view data
        and any optional files that are set are copied there, and the matching
        path attributes are updated to point at the copies. Finally
        `self.sanity_checked` is set to True.
        """
        filesnpaths.is_file_tab_delimited(self.view_data_path)
        if self.tree_file_path:
            filesnpaths.is_proper_newick(self.tree_file_path)

        self.check_output_directory()

        # the view data is mandatory, so it is always copied
        destination = self.get_output_file_path('view_data.txt')
        shutil.copyfile(self.view_data_path, destination)
        self.view_data_path = destination

        # optional inputs: (attribute holding the path, file name in the output directory)
        optional_inputs = (
            ('tree_file_path', 'tree.txt'),
            ('additional_view_data_file_path', 'additional_view_data.txt'),
        )

        for attribute, file_name in optional_inputs:
            current_path = getattr(self, attribute)
            if not current_path:
                continue

            destination = self.get_output_file_path(file_name)
            shutil.copyfile(current_path, destination)
            setattr(self, attribute, destination)

        self.sanity_checked = True
Ejemplo n.º 29
0
def get_TAB_delimited_file_as_dictionary(file_path, expected_fields = None, dict_to_append = None, column_names = None,\
                                        column_mapping = None, indexing_field = 0, separator = '\t', no_header = False,\
                                        ascii_only = False, only_expected_fields = False, assign_none_for_missing = False,\
                                        none_value = None):
    """Takes a file path, returns a dictionary.

    Each data line becomes one entry of the returned dict: the key is the
    value found in column `indexing_field` (or a generated `line__N__` name
    when `indexing_field` is -1), and the value is a dict mapping the other
    column names to the values of that line. Empty fields are stored as None.

    Parameters:
        file_path: path of the delimited file to read.
        expected_fields: list or set of column names that must be present.
        dict_to_append: if given (and non-empty), its entries are updated in
            place with the values read from the file and it is returned
            instead of a new dict.
        column_names: column names to use instead of the header of the file.
        column_mapping: sequence of callables, one per column, applied to the
            raw string value of each field.
        indexing_field: index of the column used as the key (-1 to generate one).
        separator: field separator.
        no_header: the file has no header line; column names are made up
            unless `column_names` is given.
        ascii_only: reject lines for which `is_ascii_only` is false.
        only_expected_fields: requires `expected_fields` to be set. Apart from
            that validation the flag is not acted upon in this function.
        assign_none_for_missing: when appending, use `none_value` for entries
            of `dict_to_append` that are missing from the file instead of
            raising.
        none_value: the value used by `assign_none_for_missing`.

    Raises:
        ConfigError: on any of the validation failures described above.
    """

    if expected_fields and not isinstance(expected_fields, list) and not isinstance(expected_fields, set):
        raise ConfigError("'expected_fields' variable must be a list (or a set).")

    # this guard was missing, which left the error below unreachable
    if only_expected_fields and not expected_fields:
        raise ConfigError("'only_expected_fields' variable guarantees that there are no more fields present\
                            in the input file but the ones requested with 'expected_fields' variable. If you\
                            need to use this flag, you must also be explicit abou twhat fields you expect to\
                            find in the file.")

    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_tab_delimited(file_path, separator = separator)

    # read everything up front so the file handle is closed no matter what happens below
    with open(file_path) as input_file:
        lines = input_file.readlines()

    # learn the number of fields from the first line (an empty file yields one empty field):
    first_line_fields = (lines[0] if lines else '').strip('\n').split(separator)
    num_fields = len(first_line_fields)

    # if there is no file header, make up a columns list:
    if no_header and not column_names:
        column_names = ['column_%05d' % i for i in range(0, num_fields)]

    if column_names:
        columns = column_names

        if num_fields != len(columns):
            raise ConfigError("Number of column names declared (%d) differs from the number of columns\
                                 found (%d) in the matrix ('%s') :/" % (len(columns), num_fields, file_path))

        # now we set the column names. if the file had its header, we must discard
        # the first line. so here we go:
        data_lines = lines if no_header else lines[1:]
    else:
        columns = first_line_fields
        data_lines = lines[1:]

    if expected_fields:
        for field in expected_fields:
            if field not in columns:
                raise ConfigError("The file '%s' does not contain the right type of header. It was expected\
                                    to have these: '%s', however it had these: '%s'" % (file_path,
                                                                                        ', '.join(expected_fields),
                                                                                        ', '.join(columns[1:])))

    d = {}
    line_counter = 0

    for line in data_lines:
        if ascii_only:
            if not is_ascii_only(line):
                raise ConfigError("The input file conitans non-ascii characters at line number %d. Those lines\
                                    either should be removed, or edited." % (line_counter + 2))

        # empty fields are represented as None
        line_fields = [field if field else None for field in line.strip('\n').split(separator)]

        if column_mapping:
            updated_line_fields = []
            for i in range(0, len(line_fields)):
                try:
                    updated_line_fields.append(column_mapping[i](line_fields[i]))
                except NameError:
                    raise ConfigError("Mapping function '%s' did not work on value '%s'. These functions can be native\
                                        Python functions, such as 'str', 'int', or 'float', or anonymous functions\
                                        defined using lambda notation." % (column_mapping[i], line_fields[i]))
                except TypeError:
                    raise ConfigError("Mapping function '%s' does not seem to be a proper Python function :/" % column_mapping[i])
                except ValueError:
                    raise ConfigError("Mapping funciton '%s' did not like the value '%s' in column number %d\
                                        of the input matrix '%s' :/" % (column_mapping[i], line_fields[i], i + 1, file_path))
            line_fields = updated_line_fields

        if indexing_field == -1:
            entry_name = 'line__%09d__' % line_counter
        else:
            entry_name = line_fields[indexing_field]

        d[entry_name] = {}

        for i in range(0, len(columns)):
            if i == indexing_field:
                continue
            d[entry_name][columns[i]] = line_fields[i]

        line_counter += 1

    # we have the dict, but we will not return it the way it is if its supposed to be appended to an
    # already existing dictionary.
    if dict_to_append:
        # we don't want to through keys in d each time we want to add stuff to 'dict_to_append', so we keep keys we
        # find in the first item in the dict in another variable. this is potentially very dangerous if not every
        # item in 'd' has identical set of keys. (if the file had no data lines, there are no keys at all.)
        keys = list(next(iter(d.values()), {}))

        for entry in dict_to_append:
            if entry not in d:
                # so dict to append is missing a key that is in the dict to be appended. if the user did not
                # ask us to add None for these entries via none_for_missing, we are going to make a noise,
                # otherwise we will tolerate it.
                if not assign_none_for_missing:
                    raise ConfigError("Appending entries to the already existing dictionary from file '%s' failed\
                                        as the entry %s does not appear to be in the file." % (file_path, entry))
                else:
                    for key in keys:
                        dict_to_append[entry][key] = none_value
            else:
                for key in keys:
                    dict_to_append[entry][key] = d[entry][key]

        return dict_to_append

    return d
Ejemplo n.º 30
0
    def init_commons(self):
        """Validate the user input and load the variability data to work on.

        In order, this method checks the output path, reads the samples and
        genes of interest (single-column files), makes sure the profile and
        contigs databases are declared and compatible, opens the auxiliary
        data file when quince mode is on, settles the splits of interest
        (from a collection/bin or from a file), and finally reads the variable
        nucleotide or amino acid table from the profile database into
        `self.data`, depending on `self.engine` ('NT' or 'AA').

        The progress display is ended before every error raised here so the
        error message is not printed on top of a live progress line.

        Raises:
            ConfigError: on any missing, inconsistent, or unusable input.
        """
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_tab_delimited(self.samples_of_interest_path, expected_number_of_fields=1)
            with open(self.samples_of_interest_path) as samples_file:
                self.samples_of_interest = set([s.strip() for s in samples_file])
        else:
            self.samples_of_interest = set([])

        self.progress.update('Any genes of interest?')
        if self.genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(self.genes_of_interest_path, expected_number_of_fields=1)
            try:
                with open(self.genes_of_interest_path) as genes_file:
                    self.genes_of_interest = set([int(s.strip()) for s in genes_file])
            except ValueError:
                self.progress.end()
                raise ConfigError("Well. Anvi'o was working on your genes of interest .. and ... those gene IDs did not\
                                    look like anvi'o gene caller ids :/ Anvi'o is sad now.")
        else:
            self.genes_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            self.progress.end()
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            self.progress.end()
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError("When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                                use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among otheres. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                                flat automatically for you, but then it is much better if you have full control and understaning\
                                of what is going on.")

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            # the auxiliary data file is expected to sit next to the profile database
            auxiliary_data_file_path = os.path.join(os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                self.progress.end()
                raise ConfigError("Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                                    However it wasn't found at '%s' :/" % auxiliary_data_file_path)
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(auxiliary_data_file_path, None, ignore_hash=True)


        self.progress.update('Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired)')
            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError("You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also have specified informaiton for split names?\
                                    This is confusing. You should choose one way or another :/")

            self.splits_of_interest = ccollections.GetSplitNamesInBins(self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go oldschool. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You either should give me\
                                        a file with split names you are interested in, or a collection id and\
                                        bin name so I can learn split names from the profile database.')
                filesnpaths.is_file_exists(self.splits_of_interest_path)
                with open(self.splits_of_interest_path) as splits_file:
                    self.splits_of_interest = set([c.strip().replace('\r', '') for c in splits_file])

        # the directory that holds the profile database.
        # NOTE(review): on POSIX the joined path already starts with '/', so this value begins
        # with '//' -- kept as is since other code may rely on the exact string; confirm.
        self.input_file_path = '/' + '/'.join(os.path.abspath(self.profile_db_path).split('/')[:-1])

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.sample_ids = profile_db.samples # we set this now, but we will overwrite it with args.samples_of_interest if necessary

        if not profile_db.meta['SNVs_profiled']:
            self.progress.end()
            raise ConfigError("Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!")

        if self.engine == 'NT':
            self.data = profile_db.db.get_table_as_dict(t.variable_nts_table_name)
        elif self.engine == 'AA':
            # AA specific stuff. first check whether things were profiled
            if not profile_db.meta['AA_frequencies_profiled']:
                self.progress.end()
                raise ConfigError("It seems AA frequencies were not characterized for this profile database.\
                                    There is nothing to report here for AAs!")

            # get the data.
            self.data = profile_db.db.get_table_as_dict(t.variable_aas_table_name)

            # append split_name information
            for e in self.data.values():
                e['split_name'] = self.gene_callers_id_to_split_name_dict[e['corresponding_gene_call']]
        else:
            self.progress.end()
            raise ConfigError("VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine)

        profile_db.disconnect()

        self.progress.end()
Ejemplo n.º 31
0
    def load_manual_mode(self, args):
        """Prepare the interactive interface for 'manual mode'.

        Manual mode works from a newick tree (mandatory), an optional view
        data file and an optional FASTA file, without a contigs database. The
        profile database path must be declared, but the file does not have to
        exist: an empty profile database is created when it is missing.

        This method fills `self.p_meta`, `self.views`, `self.default_view`,
        `self.split_names_ordered`, `self.splits_basic_info`,
        `self.split_sequences` and `self.states_table`, and populates the
        collections from the profile database.

        The `args` parameter is not used in this method.

        Raises:
            ConfigError: if the combination of parameters is not valid for
                manual mode, or if the input files are inconsistent.
        """
        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in manual mode, you must\
                                not use a contigs database.")

        # make sure a path was declared *before* touching the file system with it, so the user gets
        # this explanation rather than an error from os.path.exists()
        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in manual mode, you need\
                                to declare a profile database. The profile database in this mode only used to\
                                read or store the 'state' of the display for visualization purposes. You DO\
                                NOT need to point to an already existing database, as anvi'o will generate\
                                an empty one for your if there is no profile database.")

        # if the user is using an existing profile database, we need to make sure that it is not associated
        # with a contigs database, since it would mean that it is a full anvi'o profile database and should
        # not be included in manual operations.
        if os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            if profile_db.meta['contigs_db_hash']:
                raise ConfigError("Well. It seems the profile database is associated with a contigs database,\
                                    which means using it in manual mode is not the best way to use it. Probably\
                                    what you wanted to do is to let the manual mode create a new profile database\
                                    for you. Simply type in a new profile database path (it can be a file name\
                                    that doesn't exist).")

        if not self.tree:
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare\
                                at least the tree file. Please see the documentation for help.")

        if self.view:
            raise ConfigError("You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")

        filesnpaths.is_file_exists(self.tree)
        tree = filesnpaths.is_proper_newick(self.tree)

        view_data_path = os.path.abspath(
            self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(
            self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'

        # the tree file is stored as a single-line newick string
        with open(os.path.abspath(self.tree)) as tree_file:
            newick = ''.join([l.strip() for l in tree_file])

        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
            self.tree)
        self.p_meta['default_clustering'] = clustering_id
        self.p_meta['available_clusterings'] = [clustering_id]
        self.p_meta['clusterings'] = {clustering_id: {'newick': newick}}

        self.default_view = self.p_meta['default_view']

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(
                view_data_path, include_first_column=True)
            if not view_data_columns[0] == "contig":
                raise ConfigError("The first row of the first column of the view data file must\
                                    say 'contig', which is not the case for your view data file\
                                    ('%s'). Please make sure this is a properly formatted view data\
                                    file." % (view_data_path))

            # load view data as the default view:
            self.views[self.default_view] = {
                'header': view_data_columns[1:],
                'dict':
                utils.get_TAB_delimited_file_as_dictionary(view_data_path)
            }
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            names_in_the_tree = [n.name for n in tree.get_leaves()]

            ad_hoc_dict = {}
            for item in names_in_the_tree:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {
                'header': ['names'],
                'dict': ad_hoc_dict
            }

        # keep a real list rather than a dict view, so the names can be indexed and reused
        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(
                self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(self.split_names_ordered) - set(
                self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError('Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"' % (
                    num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.split_names_ordered:
                self.splits_basic_info[split_id] = {
                    'length':
                    len(self.split_sequences[split_id]),
                    'gc_content':
                    utils.get_GC_content_for_sequence(
                        self.split_sequences[split_id])
                }

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({
                'db_type': 'profile',
                'merged': True,
                'contigs_db_hash': None,
                'samples': ','.join(self.p_meta['samples'])
            })

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path,
                                            anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path,
                                                   anvio.__profile__version__)

        # NOTE(review): this self-assignment has no visible effect as written; kept in case
        # `title` is a property with a setter -- confirm and remove if not.
        if self.title:
            self.title = self.title
Ejemplo n.º 32
0
    def get_genes_of_interest(self,
                              genes_of_interest_path=None,
                              gene_caller_ids=None):
        """Resolve the set of gene caller ids the user is interested in.

        As a side effect, `self.genes_in_contigs_database` is set to the gene
        caller ids found via `dbops.ContigsSuperclass(self.args)`.

        Parameters
        ----------
        genes_of_interest_path : str, optional
            Path to a single-column file with one gene caller id per line.
        gene_caller_ids : str, optional
            Comma-separated gene caller ids.

        Returns
        -------
        set
            Gene caller ids as integers. If neither argument yields any id,
            every gene in the contigs database is returned (with a warning).

        Raises
        ------
        ConfigError
            If the contigs database has no genes, if both arguments are given
            at the same time, or if an id cannot be converted to an integer.
        """
        # identify the gene caller ids of all genes available
        self.genes_in_contigs_database = set(
            dbops.ContigsSuperclass(self.args).genes_in_splits.keys())

        if not self.genes_in_contigs_database:
            raise ConfigError(
                "This contigs database does not contain any identified genes..."
            )

        # the two ways of declaring genes of interest are mutually exclusive
        if genes_of_interest_path and gene_caller_ids:
            raise ConfigError(
                "You can't provide a gene caller id from the command line, and a list of gene caller ids\
                               as a file at the same time, obviously.")

        genes_of_interest = None

        if gene_caller_ids:
            genes_of_interest = set()
            for gene in set(x.strip() for x in gene_caller_ids.split(',')):
                try:
                    genes_of_interest.add(int(gene))
                except ValueError:
                    # only a failed int() conversion means a bad id; anything
                    # else (e.g. KeyboardInterrupt) must propagate untouched
                    raise ConfigError(
                        "Anvi'o does not like your gene caller id '%s'..." %
                        str(gene))

        elif genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(genes_of_interest_path,
                                              expected_number_of_fields=1)

            try:
                # the context manager makes sure the file handle is closed
                # even if one of the lines fails to convert
                with open(genes_of_interest_path) as genes_file:
                    genes_of_interest = set(
                        int(line.strip()) for line in genes_file)
            except ValueError:
                raise ConfigError(
                    "Well. Anvi'o was working on your genes of interest ... and ... those gene IDs did not\
                                   look like anvi'o gene caller ids :/ Anvi'o is now sad."
                )

        if not genes_of_interest:
            # no genes of interest are specified. Assuming all, which could be innumerable--raise warning
            genes_of_interest = self.genes_in_contigs_database
            self.run.warning(
                "You did not specify any genes of interest, so anvi'o will assume all of them are of interest."
            )

        return genes_of_interest
Ejemplo n.º 33
0
    def load_from_anvio_files(self, args):
        """Set up the interactive session from a profile database and its contigs database.

        Initializes the profile superclass, loads split sequences and views,
        optionally registers a user-provided additional view and an additional
        tree, and resolves the title, default view and requested state.

        If `self.show_views` or `self.show_states` is set, the available
        entries are listed and the process exits via `sys.exit()`.

        Parameters
        ----------
        args : object
            Arguments object that is passed on to `ProfileSuperclass.__init__`.

        Raises
        ------
        ConfigError
            If there is no contigs database path, if the additional view file
            does not end with a `__parent__` column, or if the requested view
            or state is not available.
        """
        if not self.contigs_db_path:
            raise ConfigError("Anvi'o needs the contigs database to make sense of this run.")

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
        # all the split sequences since only now we know the min_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        self.load_views()
        self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
            for view in self.views:
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            # emits a blank line under both Python 2 and Python 3
            print('')
            sys.exit()

        # same for the available states.
        if self.show_states:
            run.warning('', header = 'Available states (%d)' % len(self.states_table.states), lc = 'green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' % self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print('')
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if additional_view_columns[-1] != '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split.")

            # first column and the trailing '__parent__' column are read as strings,
            # every column in between as float
            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if self.view not in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings

        if self.tree:
            entry_id = os.path.basename(self.tree).split('.')[0]

            # read the tree once, and make sure the file handle is released
            with open(os.path.abspath(self.tree)) as tree_file:
                newick = tree_file.read()

            if not self.p_meta['clusterings']:
                # no clustering in the profile: the user tree becomes the default one
                self.p_meta['default_clustering'] = entry_id
                self.p_meta['available_clusterings'] = [entry_id]
                self.p_meta['clusterings'] = {entry_id: {'newick': newick}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
            else:
                self.p_meta['clusterings'][entry_id] = {'newick': newick}
                run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

        # fall back to a title derived from the sample id when the user did not provide one
        if not self.title:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_data_available:
            summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
            self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

        if self.state:
            if self.state not in self.states_table.states:
                raise ConfigError("The requested state ('%s') is not available for this run. Please see\
                                          available states by running this program with --show-states flag." % self.state)
Ejemplo n.º 34
0
    def __init__(self, args, external_clustering = None):
        """Prepare everything the interactive interface needs to display a run.

        Reads the relevant settings from `args`, initializes the contigs
        superclass, loads data either from user files (manual mode) or from
        anvi'o databases, determines the ordered split names from the default
        clustering tree, and finally converts the view data into JSON.

        Parameters
        ----------
        args : object
            Arguments object; settings are looked up in `args.__dict__`, and
            any setting that is missing there is treated as None.
        external_clustering : dict, optional
            When given, its 'clusterings' and 'default_clustering' entries
            replace whatever clustering information was loaded.

        Raises
        ------
        ConfigError
            If no clustering is available once loading is finished.
        """
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        # returns the value of an argument, or None if it was never declared
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        # NOTE: this replaces the 'Unknown Project' placeholder set above, with
        # None when no title was declared in args
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state = A('state')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')

        self.split_names_ordered = None
        self.additional_layers = None

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}


        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = completeness.Completeness(self.contigs_db_path)
            self.collections.populate_sources_dict(self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        # functions are initialized only when the flag is present in args and is falsy
        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        # resolves a file name against the output directory stored in p_meta
        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if self.manual_mode:
            self.load_from_user_files(args)
        else:
            self.load_from_anvio_files(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_from_user_files function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            self.p_meta['available_clusterings'] = self.clusterings.keys()
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped automatically by the platform. Please\
                                    read the help menu for anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial.")

        tree = Tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'], format = 1)

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = [n.name for n in tree.get_leaves()]

        # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer = self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
Ejemplo n.º 35
0
    def load_manual_mode(self, args):
        """Set up the interactive session from user-provided files (manual mode).

        Validates the combination of inputs, derives the item names from the
        tree (or from the first column of the view data when there is no
        tree), fills `self.p_meta` and `self.views`, optionally loads a FASTA
        file to build `self.split_sequences` and `self.splits_basic_info`,
        creates an empty profile database if none exists at
        `self.profile_db_path`, and initializes the states table and
        collections.

        Parameters
        ----------
        args : object
            Arguments object. It is not read here; all settings come from
            attributes already set on `self`.

        Raises
        ------
        ConfigError
            If a contigs database is declared, if no profile database path is
            given, if an existing profile database is associated with a
            contigs database, if neither a tree nor a view data file is
            given, if `--view`, `--show-views` or `--show-states` is used, or
            if displayed item names are missing from the FASTA file.
        """
        if self.contigs_db_path:
            raise ConfigError(
                "When you want to use the interactive interface in manual mode, you must\
                                not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError(
                "Even when you want to use the interactive interface in manual mode, you need\
                                to provide a profile database path. But you DO NOT need an already existing\
                                profile database, since anvi'o will generate an empty one for you. The profile\
                                database in this mode only used to read or store the 'state' of the display\
                                for visualization purposes, or to allow you to create and store collections."
            )

        # if the user is using an existing profile database, we need to make sure that it is not associated
        # with a contigs database, since it would mean that it is a full anvi'o profile database and should
        # not be included in manual operations.
        if filesnpaths.is_file_exists(self.profile_db_path, dont_raise=True):
            profile_db = ProfileDatabase(self.profile_db_path)
            if profile_db.meta['contigs_db_hash']:
                raise ConfigError(
                    "Well. It seems the profile database is associated with a contigs database,\
                                    which means using it in manual mode is not the best way to use it. Probably\
                                    what you wanted to do is to let the manual mode create a new profile database\
                                    for you. Simply type in a new profile database path (it can be a file name\
                                    that doesn't exist).")

        if not self.tree and not self.view_data_path:
            raise ConfigError(
                "You must be joking Mr. Feynman. No tree file, and no data file? What is it that\
                               anvi'o supposed to visualize? :(")

        if not self.tree:
            self.run.warning(
                "You haven't declared a tree file. Anvi'o will do its best to come up with an\
                              organization of your items.")

        if self.view:
            raise ConfigError(
                "You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError(
                "Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError(
                "Sorry, there are no states to show in manual mode :/")

        if self.tree:
            filesnpaths.is_file_exists(self.tree)
            # collapse the tree into a single line; the context manager makes sure the
            # file handle is released
            with open(os.path.abspath(self.tree)) as tree_file:
                newick_tree_text = ''.join(line.strip() for line in tree_file)
            item_names = utils.get_names_order_from_newick_tree(
                newick_tree_text)
        else:
            # no tree: item names are the first column of the view data, minus the header row
            item_names = utils.get_column_data_from_TAB_delim_file(
                self.view_data_path, column_indices=[0])[0][1:]

        # try to convert item names into integer values for proper sorting later. it's OK if it does
        # not work, in which case the names are left as they are.
        try:
            item_names = [int(n) for n in item_names]
        except (ValueError, TypeError):
            pass

        view_data_path = os.path.abspath(
            self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(
            self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'
        self.default_view = self.p_meta['default_view']

        # set some default organizations of data:
        # NOTE(review): the ascending sort is stored under the '(reverse)' label and the
        # descending one under the plain label -- presumably this compensates for the order
        # in which the interface draws items; confirm before changing.
        self.p_meta['clusterings'] = {
            'Alphabetical_(reverse):none:none': {
                'basic': sorted(item_names)
            },
            'Alphabetical:none:none': {
                'basic': sorted(item_names, reverse=True)
            }
        }
        self.p_meta['available_clusterings'] = [
            'Alphabetical_(reverse):none:none', 'Alphabetical:none:none'
        ]
        self.p_meta['default_clustering'] = self.p_meta[
            'available_clusterings'][0]

        # if we have a tree, let's make arrangements for it:
        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(
                self.tree)
            self.p_meta['default_clustering'] = clustering_id
            self.p_meta['available_clusterings'].append(clustering_id)
            self.p_meta['clusterings'][clustering_id] = {
                'newick': newick_tree_text
            }

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(
                view_data_path, include_first_column=True)

            # load view data as the default view:
            self.views[self.default_view] = {
                'header': view_data_columns[1:],
                'dict':
                utils.get_TAB_delimited_file_as_dictionary(view_data_path)
            }
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            ad_hoc_dict = {}
            for item in item_names:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {
                'header': ['names'],
                'dict': ad_hoc_dict
            }

        self.displayed_item_names_ordered = list(
            self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(
                self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(
                self.displayed_item_names_ordered) - set(
                    self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError(
                    'Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"'
                    %
                    (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.displayed_item_names_ordered:
                self.splits_basic_info[split_id] = {
                    'length':
                    len(self.split_sequences[split_id]),
                    'gc_content':
                    utils.get_GC_content_for_sequence(
                        self.split_sequences[split_id])
                }

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({
                'db_type': 'profile',
                'merged': True,
                'contigs_db_hash': None,
                'samples': ','.join(self.p_meta['samples'])
            })

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path)

        # read description from self table, if it is not available get_description function will return placeholder text
        self.p_meta['description'] = get_description_in_db(
            self.profile_db_path)
Ejemplo n.º 36
0
    def load_from_user_files(self, args):
        """Set up the interactive session from user-provided files (manual mode).

        Requires a FASTA file, a view data file and a tree file. Fills
        `self.p_meta`, `self.views`, `self.split_names_ordered`,
        `self.split_sequences` and `self.splits_basic_info`, creates an empty
        profile database if none exists at `self.profile_db_path`, and
        initializes the states table and collections.

        Parameters
        ----------
        args : object
            Arguments object. It is not read here; all settings come from
            attributes already set on `self`.

        Raises
        ------
        ConfigError
            If a contigs database is declared, if no profile database path is
            given, if any of the FASTA / view data / tree files is missing, if
            `--view`, `--show-views` or `--show-states` is used, if the view
            data header does not start with 'contig', or if names in the view
            data have no sequence in the FASTA file.
        """
        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in an ad hoc manner, you must\
                                not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in an ad hoc manner by\
                                using the '--manual-mode' flag, you still need to declare a profile database.\
                                The profile database in this mode only used to read or store the 'state' of\
                                the display for visualization purposes. You DO NOT need to point to an already\
                                existing database, as anvi'o will generate an empty one for your if there is no\
                                profile database.")

        if (not self.fasta_file) or (not self.view_data_path) or (not self.tree):
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare\
                                each of '-f', '-d', and '-t' parameters. Please see the help menu for more info.")

        if self.view:
            raise ConfigError("You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")

        # read the tree up front, making sure the file handle is released
        with open(os.path.abspath(self.tree)) as tree_file:
            newick = tree_file.read()

        view_data_path = os.path.abspath(self.view_data_path)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']
        self.p_meta['clusterings'] = {'default': {'newick': newick}}

        self.default_view = self.p_meta['default_view']

        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if view_data_columns[0] != "contig":
            raise ConfigError("The first row of the first column of the view data file must\
                                      say 'contig', which is not the case for your view data file\
                                      ('%s'). Please make sure this is a properly formatted view data\
                                      file." % (view_data_path))

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
        self.split_names_ordered = self.views[self.default_view]['dict'].keys()

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # every name in the view data needs a sequence; report the problem properly
        # instead of failing with a bare KeyError while building splits_basic_info
        names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
        if names_missing_in_FASTA:
            raise ConfigError("Some of the names in your view data do not have corresponding entries in the "
                              "FASTA file you provided. Here is an example to one of those %d names that occur "
                              "in your data file, but not in the FASTA file: \"%s\""
                              % (len(names_missing_in_FASTA), sorted(names_missing_in_FASTA)[0]))

        # setup a mock splits_basic_info dict
        self.splits_basic_info = {}
        for split_id in self.split_names_ordered:
            sequence = self.split_sequences[split_id]
            self.splits_basic_info[split_id] = {'length': len(sequence),
                                                'gc_content': utils.get_GC_content_for_sequence(sequence)}

        # create a new, empty profile database for ad hoc operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)
Ejemplo n.º 37
0
    def __init__(self, args, external_clustering=None):
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state = A('state')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError, "You can't anvi-interactive in manual mode with a collection name."

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy()

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path, manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            self.p_meta['available_clusterings'] = self.clusterings.keys()
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        if not self.state and 'default' in self.states_table.states:
            self.state = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError, "This merged profile database does not seem to have any hierarchical clustering\
                                    of splits that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped by anvi-merge because you had too many stplits\
                                    to get the clustering in a reasonable amount of time. Please read the help menu for\
                                    anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging"
            else:
                raise ConfigError, "This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial."

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = utils.get_names_order_from_newick_tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'])

        # now we knot what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
Ejemplo n.º 38
0
    def load_from_user_files(self, args):
        """Initialize the interface in ad hoc mode from user-provided files.

        Rejects argument combinations that do not apply to this mode, reads the
        newick tree (`self.tree`) and the TAB-delimited view data
        (`self.view_data_path`), fills `self.p_meta` and `self.views` with them,
        builds mock split information if a FASTA file is given, creates an empty
        profile database at `self.profile_db_path` if that path does not exist,
        and finally sets up the states table and the collections.

        `args` is not used in the body of this function.

        Raises ConfigError if a contigs database is declared, if the profile
        database path, the view data, or the tree is missing, if a view, the
        list of views, or the list of states is requested, if the header of the
        view data does not start with 'contig', or if the FASTA file lacks some
        of the names found in the view data.
        """

        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in an ad hoc manner, you must\
                                not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in an ad hoc manner by\
                                using the '--manual-mode' flag, you still need to declare a profile database.\
                                The profile database in this mode only used to read or store the 'state' of\
                                the display for visualization purposes. You DO NOT need to point to an already\
                                existing database, as anvi'o will generate an empty one for your if there is no\
                                profile database.")

        if (not self.view_data_path) or (not self.tree):
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare\
                                each of the '-d', and '-t' parameters. Please see the documentation for help.")

        if self.view:
            raise ConfigError("You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")

        filesnpaths.is_file_exists(self.tree)
        filesnpaths.is_proper_newick(self.tree)

        view_data_path = os.path.abspath(self.view_data_path)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']

        # read the tree through a context manager so the file handle is closed
        # right away instead of lingering until garbage collection
        with open(os.path.abspath(self.tree)) as tree_file:
            newick = tree_file.read()

        self.p_meta['clusterings'] = {'default': {'newick': newick}}

        self.default_view = self.p_meta['default_view']

        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError("The first row of the first column of the view data file must\
                                      say 'contig', which is not the case for your view data file\
                                      ('%s'). Please make sure this is a properly formatted view data\
                                      file." % (view_data_path))

        # load view data as the default view:
        self.views[self.default_view] = {
            'header': view_data_columns[1:],
            'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)
        }
        self.split_names_ordered = self.views[self.default_view]['dict'].keys()

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError('Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.split_names_ordered:
                sequence = self.split_sequences[split_id]
                self.splits_basic_info[split_id] = {
                    'length': len(sequence),
                    'gc_content': utils.get_GC_content_for_sequence(sequence)
                }

        # create a new, empty profile database for ad hoc operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({
                'db_type': 'profile',
                'merged': True,
                'contigs_db_hash': None,
                'samples': ','.join(self.p_meta['samples'])
            })

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)
Ejemplo n.º 39
0
def get_TAB_delimited_file_as_dictionary(file_path, expected_fields = None, dict_to_append = None, column_names = None,\
                                        column_mapping = None, indexing_field = 0, separator = '\t', no_header = False,\
                                        ascii_only = False, only_expected_fields = False, assign_none_for_missing = False,\
                                        none_value = None):
    """Takes a file path, returns a dictionary.

    Every data line of the file becomes one entry of the resulting dictionary. The
    entry is keyed by the value in the `indexing_field` column (or by a generated
    'line__%09d__' name when `indexing_field` is -1), and its value is a dictionary
    that maps each of the remaining column names to the matching field of the line.
    Empty fields are turned into None before any mapping function is applied.

    Parameters:
        file_path: path of the delimited file to read.
        expected_fields: list or set of column names that must be in the header.
        dict_to_append: if this is a non-empty dictionary, the values read from the
            file are copied into its already existing entries, and it is returned
            instead of the dictionary built from the file.
        column_names: column names to use instead of the ones in the file. Their
            number must match the number of fields in the first line of the file.
        column_mapping: sequence of callables; the i-th one converts the i-th field
            of every line.
        indexing_field: index of the column that provides the entry names, or -1 to
            generate entry names from line numbers.
        separator: field separator.
        no_header: the file has no header line. If `column_names` is not given
            either, names of the form 'column_%05d' are made up.
        ascii_only: raise if a line is rejected by `is_ascii_only`.
        only_expected_fields: requires `expected_fields` to be declared as well.
        assign_none_for_missing: when appending, fill the entries of
            `dict_to_append` that are missing from the file with `none_value`
            instead of raising.
        none_value: the filler used by `assign_none_for_missing`.

    Raises ConfigError for improper arguments, for headers that do not match the
    expectations, for lines that fail the ascii check or the column mapping, and
    for entries of `dict_to_append` that are missing from the file (unless
    `assign_none_for_missing` is set).
    """

    if expected_fields and not isinstance(expected_fields, (list, set)):
        raise ConfigError("'expected_fields' variable must be a list (or a set).")

    # NOTE(review): the flag is only validated here; nothing below rejects files that
    # have more columns than `expected_fields`. Confirm whether that check is expected.
    if only_expected_fields and not expected_fields:
        raise ConfigError("'only_expected_fields' variable guarantees that there are no more fields present\
                            in the input file but the ones requested with 'expected_fields' variable. If you\
                            need to use this flag, you must also be explicit abou twhat fields you expect to\
                            find in the file.")

    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_tab_delimited(file_path, separator = separator)

    with open(file_path, 'rU') as input_file:
        # learn the number of fields and reset the file:
        num_fields = len(input_file.readline().strip('\n').split(separator))
        input_file.seek(0)

        # if there is no file header, make up a columns list:
        if no_header and not column_names:
            column_names = ['column_%05d' % i for i in range(0, num_fields)]

        if column_names:
            columns = column_names

            if num_fields != len(columns):
                raise ConfigError("Number of column names declared (%d) differs from the number of columns\
                                 found (%d) in the matrix ('%s') :/" % (len(columns), num_fields, file_path))

            # now we set the column names. if the file had its header, we must discard
            # the first line. so here we go:
            if not no_header:
                input_file.readline()
        else:
            columns = input_file.readline().strip('\n').split(separator)

        # whatever is left in the file is data
        lines = input_file.readlines()

    if expected_fields:
        for field in expected_fields:
            if field not in columns:
                raise ConfigError("The file '%s' does not contain the right type of header. It was expected\
                                    to have these: '%s', however it had these: '%s'" % (file_path,
                                                                                        ', '.join(expected_fields),
                                                                                        ', '.join(columns[1:])))

    d = {}
    line_counter = 0

    for line in lines:
        if ascii_only:
            if not is_ascii_only(line):
                raise ConfigError("The input file conitans non-ascii characters at line number %d. Those lines\
                                    either should be removed, or edited." % (line_counter + 2))

        line_fields = [field if field else None for field in line.strip('\n').split(separator)]

        if column_mapping:
            updated_line_fields = []
            for i, value in enumerate(line_fields):
                try:
                    updated_line_fields.append(column_mapping[i](value))
                except NameError:
                    raise ConfigError("Mapping function '%s' did not work on value '%s'. These functions can be native\
                                        Python functions, such as 'str', 'int', or 'float', or anonymous functions\
                                        defined using lambda notation." % (column_mapping[i], value))
                except TypeError:
                    raise ConfigError("Mapping function '%s' does not seem to be a proper Python function :/" % column_mapping[i])
                except ValueError:
                    raise ConfigError("Mapping funciton '%s' did not like the value '%s' in column number %d\
                                        of the input matrix '%s' :/" % (column_mapping[i], value, i + 1, file_path))
            line_fields = updated_line_fields

        if indexing_field == -1:
            entry_name = 'line__%09d__' % line_counter
        else:
            entry_name = line_fields[indexing_field]

        d[entry_name] = {}

        for i, column in enumerate(columns):
            if i == indexing_field:
                continue
            d[entry_name][column] = line_fields[i]

        line_counter += 1

    # we have the dict, but we will not return it the way it is if its supposed to be appended to an
    # already existing dictionary.
    if dict_to_append:
        # every entry in 'd' is built from the same columns, so the keys to copy are simply the
        # columns other than the indexing one. this also works when the file had no data lines.
        keys = [column for i, column in enumerate(columns) if i != indexing_field]

        for entry in dict_to_append:
            if entry not in d:
                # so dict to append is missing a key that is in the dict to be appended. if the user did not
                # ask us to add None for these entries via none_for_missing, we are going to make a noise,
                # otherwise we will tolerate it.
                if not assign_none_for_missing:
                    raise ConfigError("Appending entries to the already existing dictionary from file '%s' failed\
                                        as the entry %s does not appear to be in the file." % (file_path, entry))
                else:
                    for key in keys:
                        dict_to_append[entry][key] = none_value
            else:
                for key in keys:
                    dict_to_append[entry][key] = d[entry][key]

        return dict_to_append

    return d
Ejemplo n.º 40
0
    def load_full_mode(self, args):
        """Initialize the interface from a contigs database and a profile database.

        Initializes ProfileSuperclass with `args`, loads split sequences,
        collections, the states table and the views (or a single 'blank_view'
        when the profile is flagged as blank in `self.p_meta`), then applies the
        optional user inputs: an additional view file, a default view, an extra
        newick tree, a title, and a state to load automatically.

        If `self.show_views` or `self.show_states` is set, the available items
        are reported and the process exits through `sys.exit()`.

        Raises ConfigError if either database path is missing, if the additional
        view file does not end with a '__parent__' column, or if the requested
        view or state is not available.
        """

        if not self.contigs_db_path:
            raise ConfigError("Anvi'o needs the contigs database to make sense of this run (or maybe you\
                                should use the `--manual` flag if that's what your intention).")

        if not self.profile_db_path:
            raise ConfigError("So you want to run anvi'o in full mode, but without a profile database?\
                                Well. This does not make any sense.")

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
        # all the split sequences since only now we know the mun_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        if self.p_meta['blank']:
            blank_dict = {}
            for split_name in self.splits_basic_info:
                blank_dict[split_name] = {'blank_view': 0}

            self.views['blank_view'] = {'header': ['blank_view'],
                                        'dict': blank_dict}
            self.default_view = 'blank_view'
        else:
            self.load_views()
            self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header='Available views (%d)' % len(self.views), lc='green')
            for view in self.views:
                # the 'blank_view' set up above has no 'table_name' key, hence the fallback
                run.info(view,
                         'Via "%s" table' % self.views[view].get('table_name', 'NA'),
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            # prints a single newline both as a statement and as a function call
            print('')
            sys.exit()

        if self.show_states:
            run.warning('', header='Available states (%d)' % len(self.states_table.states), lc='green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' % self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print('')
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split.")

            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {
                'table_name': 'NA',
                'header': additional_view_columns,
                'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping=column_mapping)
            }

        # if the user specifies a view, set it as default:
        if self.view:
            if not self.view in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings

        if self.tree:
            clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)

            # read the tree through a context manager so the file handle does not leak
            with open(os.path.abspath(self.tree)) as tree_file:
                newick = tree_file.read()

            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = clustering_id
                self.p_meta['available_clusterings'] = [clustering_id]
                self.p_meta['clusterings'] = {clustering_id: {'newick': newick}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % clustering_id)
            else:
                self.p_meta['clusterings'][clustering_id] = {'newick': newick}
                run.info('Additional Tree', "'%s' has been added to available trees." % clustering_id)

        # set title: a user-provided title is kept as is, otherwise it is derived from the sample id
        if not self.title:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_profile_data_available:
            summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
            self.run.warning(
                "Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" %
                ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."
                 ))

        if self.state_autoload:
            if not self.state_autoload in self.states_table.states:
                raise ConfigError("The requested state ('%s') is not available for this run. Please see\
                                          available states by running this program with --show-states flag." % self.state_autoload)
Ejemplo n.º 41
0
    def load_manual_mode(self, args):
        """Initialize the interface in manual mode from user-provided files.

        The newick tree (`self.tree`) is mandatory. If view data is provided
        (`self.view_data_path`) it becomes the default view; otherwise a mock
        view with a single 'names' column is generated from the leaves of the
        tree. A FASTA file, if given, is used to build mock split information.
        An empty profile database is created at `self.profile_db_path` when that
        path does not exist, and the states table and collections are set up
        from it.

        `args` is not used in the body of this function.

        Raises ConfigError if a contigs database is declared, if the profile
        database path or the tree is missing, if a view, the list of views, or
        the list of states is requested, if the header of the view data does not
        start with 'contig', or if the FASTA file lacks some of the names found
        in the view data.
        """

        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in manual mode, you must\
                                not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in manual mode, you need\
                                to declare a profile database. The profile database in this mode only used to\
                                read or store the 'state' of the display for visualization purposes. You DO\
                                NOT need to point to an already existing database, as anvi'o will generate\
                                an empty one for your if there is no profile database.")

        if not self.tree:
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare\
                                at least the tree file. Please see the documentation for help.")

        if self.view:
            raise ConfigError("You can't use '--view' parameter when you are running the interactive interface\
                                in manual mode")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")

        filesnpaths.is_file_exists(self.tree)
        tree = filesnpaths.is_proper_newick(self.tree)

        view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'

        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
        self.p_meta['default_clustering'] = clustering_id
        self.p_meta['available_clusterings'] = [clustering_id]

        # the newick is stored as a single line: every line of the tree file is stripped and
        # joined. the context manager makes sure the file handle is closed afterwards.
        with open(os.path.abspath(self.tree)) as tree_file:
            newick = ''.join([tree_line.strip() for tree_line in tree_file.readlines()])

        self.p_meta['clusterings'] = {clustering_id: {'newick': newick}}

        self.default_view = self.p_meta['default_view']

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
            if not view_data_columns[0] == "contig":
                raise ConfigError("The first row of the first column of the view data file must\
                                    say 'contig', which is not the case for your view data file\
                                    ('%s'). Please make sure this is a properly formatted view data\
                                    file." % (view_data_path))

            # load view data as the default view:
            self.views[self.default_view] = {'header': view_data_columns[1:],
                                             'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
        else:
            # no view data is provided... it is only the tree we have. we will create a mock 'view data dict'
            # here using what is in the tree.
            names_in_the_tree = [n.name for n in tree.get_leaves()]

            ad_hoc_dict = {}
            for item in names_in_the_tree:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {'header': ['names'],
                                             'dict': ad_hoc_dict}

        self.split_names_ordered = self.views[self.default_view]['dict'].keys()

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError('Some of the names in your view data does not have corresponding entries in the\
                                    FASTA file you provided. Here is an example to one of those %d names that occur\
                                    in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.split_names_ordered:
                self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                    'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)
Ejemplo n.º 42
0
    def load_from_profile_database(self, args):
        """Initialize the interface from the contents of a profile database.

        Checks the version recorded in `self.p_meta` against the version of the
        codebase, sets up the states table, loads the views, and then applies
        the optional user inputs: an additional view file, a default view, an
        extra newick tree, and a summary index. The SUMMARY.cp file is read from
        the output directory if it exists there, and the title is finalized with
        the name of the default view.

        If `self.show_views` is set, the available views are reported and the
        process exits through `sys.exit()`.

        `args` is not used in the body of this function.

        Raises ConfigError if the version of the profile database differs from
        the one of the codebase, if the additional view file does not end with a
        '__parent__' column, or if the requested view is not available.
        """

        if self.p_meta['version'] != anvio.__profile__version__:
            # the two versions being compared fill the two placeholders of the message
            raise ConfigError("The profile database has a version number that differs from the version that is valid\
                                for this codebase (the profile database is at '%s', and the codebase is at '%s'). Very\
                                unfortunately, you need to re-profile and re-merge this project using the current anvi'o :(" % (self.p_meta['version'], anvio.__profile__version__))

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        self.load_views()
        self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
            for view in self.views:
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            # prints a single newline both as a statement and as a function call
            print('')
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper\
                                    parent information for each split.")

            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if not self.view in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see\
                                          available views by running this program with --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings

        if self.tree:
            entry_id = os.path.basename(self.tree).split('.')[0]

            # read the tree through a context manager so the file handle does not leak
            with open(os.path.abspath(self.tree)) as tree_file:
                newick = tree_file.read()

            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = entry_id
                self.p_meta['available_clusterings'] = [entry_id]
                self.p_meta['clusterings'] = {entry_id: {'newick': newick}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
            else:
                self.p_meta['clusterings'][entry_id] = {'newick': newick}
                run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

        # is summary being overwritten?
        if self.summary_index:
            run.info('Warning', "The default summary index in RUNINFO is being overriden by '%s'." % self.summary_index)
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)

        if os.path.exists(self.P('SUMMARY.cp')):
            self.splits_summary_index = dictio.read_serialized_object(self.P('SUMMARY.cp'))
        else:
            self.splits_summary_index = None
            run.warning("SUMMARY.cp is missing for your run. Anvi'o will continue working (well, at least\
                         it will attempt to do it), but things may behave badly with the absence of\
                         SUMMARY.cp (first and foremost, you will not be able to inspect individual\
                         contigs through any of the interactive interfaces). Please investigate it\
                         if you were not expecting this.")

        # set title: the name of the default view is always appended
        if self.title:
            self.title = self.title + ' (%s)' % self.default_view
        else:
            self.title = self.p_meta['sample_id'] + ' (%s)' % self.default_view
Ejemplo n.º 43
0
    def init(self):
        """This function is called from within the snakefile to initialize parameters.

        Reads the workflow configuration through `get_param_value_from_config`
        and fills in the attributes the rest of the workflow relies on:

        - name -> contigs-db path and name -> BAM look-ups built from
          metagenomes.txt and/or external-genomes.txt (the name/path table is
          also written to `combined_genomes.txt` in the working directory),
        - HMM name -> source and HMM name -> path look-ups from the HMM list,
        - the sample names for mapping, read from samples.txt,
        - tree algorithm, clusterize and cluster-representative settings,
        - and finally `self.target_files`.

        Raises:
            ConfigError: if an input file is malformed, a name contains a
                character the workflow cannot handle, a required directive is
                missing, or an unknown option value is given.
        """

        super().init()
        #FIXME: Because 00_LOGS is hardcoded in the base class I need to reassign it
        self.dirs_dict.update({"LOGS_DIR": "ECOPHYLO_WORKFLOW/00_LOGS"})

        self.names_list = []
        self.names_dirs = []
        self.contigsDB_name_path_dict = {}
        self.contigsDB_name_dir_dict = {}
        self.contigsDB_name_bam_dict = {}

        # Load metagenomes.txt
        self.metagenomes = self.get_param_value_from_config(['metagenomes'])

        if self.metagenomes:
            filesnpaths.is_file_exists(self.metagenomes)
            try:
                self.metagenomes_df = pd.read_csv(self.metagenomes,
                                                  sep='\t',
                                                  index_col=False)
                self.metagenomes_name_list = self.metagenomes_df.name.to_list()
                for name in self.metagenomes_name_list:
                    if " " in name:
                        raise ConfigError(
                            "One of the names in your metagenomes.txt file contains spaces. "
                            "The EcoPhylo workflow will have a hard time with this, please "
                            "only use underscores in your metagenome names.")
                self.metagenomes_path_list = self.metagenomes_df.contigs_db_path.to_list()
                self.metagenomes_dirname_list = [
                    os.path.dirname(x) for x in self.metagenomes_path_list
                ]
                self.contigsDB_name_path_dict.update(
                    dict(zip(self.metagenomes_df.name,
                             self.metagenomes_df.contigs_db_path)))
                if 'bam' in self.metagenomes_df.columns:
                    self.contigsDB_name_bam_dict.update(
                        dict(zip(self.metagenomes_df.name,
                                 self.metagenomes_df.bam)))
                    self.metagenomes_profiles_list = self.metagenomes_df.bam.to_list()
                self.names_list.extend(self.metagenomes_name_list)

            except IndexError as e:
                # report the file path: the data frame attribute may not even
                # exist if reading the file is what failed
                raise ConfigError(
                    "The metagenomes.txt file, '%s', does not appear to be properly formatted. "
                    "This is the error from trying to load it: '%s'" %
                    (self.metagenomes, e))

        # Load external-genomes.txt
        self.external_genomes = self.get_param_value_from_config(
            ['external_genomes'])
        if self.external_genomes:
            filesnpaths.is_file_exists(self.external_genomes)
            try:
                self.external_genomes_df = pd.read_csv(self.external_genomes,
                                                       sep='\t',
                                                       index_col=False)
                self.external_genomes_names_list = self.external_genomes_df.name.to_list()
                for name in self.external_genomes_names_list:
                    if " " in name:
                        raise ConfigError(
                            "One of the names in your external-genomes.txt file contains spaces. "
                            "The EcoPhylo workflow will have a hard time with this, please "
                            "only use underscores in your genome names.")
                self.external_genomes_path_list = self.external_genomes_df.contigs_db_path.to_list()
                self.external_genomes_dirname_list = [
                    os.path.dirname(x) for x in self.external_genomes_path_list
                ]
                self.contigsDB_name_path_dict.update(
                    dict(zip(self.external_genomes_names_list,
                             self.external_genomes_path_list)))
                if 'bam' in self.external_genomes_df.columns:
                    self.contigsDB_name_bam_dict.update(
                        dict(zip(self.external_genomes_df.name,
                                 self.external_genomes_df.bam)))
                self.names_list.extend(self.external_genomes_names_list)

            except IndexError as e:
                # same reasoning as above: report the path, not the data frame
                raise ConfigError(
                    "The external-genomes.txt file, '%s', does not appear to be properly formatted. "
                    "This is the error from trying to load it: '%s'" %
                    (self.external_genomes, e))
        else:
            self.external_genomes_names_list = []

        # Concatenate metagenomes.txt and external-genomes.txt
        contigsDB_name_path_list = list(self.contigsDB_name_path_dict.items())
        contigsDB_name_path_df = pd.DataFrame(
            contigsDB_name_path_list, columns=['name', 'contigs_db_path'])
        combined_genomes_df_path = "combined_genomes.txt"

        contigsDB_name_path_df.to_csv(combined_genomes_df_path,
                                      sep="\t",
                                      index=False,
                                      header=True)

        # Make variables that tells whether we have metagenomes.txt, external-genomes.txt, or both
        # (self.mode is left untouched when neither file was provided)
        if self.metagenomes and self.external_genomes:
            self.mode = 'both'
        elif self.metagenomes:
            self.mode = 'metagenomes'
        elif self.external_genomes:
            self.mode = 'external_genomes'

        # Load HMM list
        self.hmm_list_path = self.get_param_value_from_config(['hmm_list'])
        if not self.hmm_list_path:
            # everything below needs the HMM look-ups, so fail with a clear
            # message here rather than with an AttributeError further down
            raise ConfigError(
                "Your config file does not include an `hmm_list` directive. The EcoPhylo "
                "workflow needs an HMM list to know which HMMs to work with, so please add one.")

        filesnpaths.is_file_exists(self.hmm_list_path)
        try:
            HMM_df = pd.read_csv(self.hmm_list_path,
                                 sep='\t',
                                 index_col=False)
            self.HMM_source_dict = dict(zip(HMM_df.name, HMM_df.source))
            self.HMM_path_dict = dict(zip(HMM_df.name, HMM_df.path))

        except IndexError as e:
            raise ConfigError(
                "The hmm_list.txt file, '%s', does not appear to be properly formatted. "
                "This is the error from trying to load it: '%s'" %
                (self.hmm_list_path, e))

        # the keys of HMM_source_dict are the HMM names given in the list
        if any("-" in s for s in self.HMM_source_dict):
            raise ConfigError(
                f'Please do not use "-" in your external HMM names in: '
                f"{self.hmm_list_path}. It will make our lives "
                f"easier with Snakemake wildcards :)")

        # FIXME: this line prints the list of HMM_sources to stdout and I don't want that
        self.internal_HMM_sources = list(anvio.data.hmm.sources.keys())

        # an entry whose *path* is "INTERNAL" must name one of the HMM sources
        # that ship with anvi'o
        for HMM, HMM_path in self.HMM_path_dict.items():
            if HMM_path == "INTERNAL":
                HMM_source = self.HMM_source_dict[HMM]
                if HMM_source not in self.internal_HMM_sources:
                    raise ConfigError(
                        f"{HMM_source} is not an 'INTERNAL' HMM source for anvi'o. "
                        f"Please double check {self.hmm_list_path} to see if you spelled it right or "
                        f"please checkout the default internal HMMs here: https://merenlab.org/software/anvio/help/7/artifacts/hmm-source/#default-hmm-sources"
                    )

        # Load samples.txt
        self.samples_txt_file = self.get_param_value_from_config(
            ['samples_txt'])

        if not self.samples_txt_file:
            if os.path.exists('samples.txt'):
                self.samples_txt_file = 'samples.txt'
            else:
                raise ConfigError(
                    "Ehem. Your config file does not include a `samples_txt` directive. "
                    "Anvi'o tried to assume that your `samples.txt` may be in your work "
                    "directory, but you don't seem to have a `samples.txt` file anywhere "
                    "around either. So please add a `samples.txt` directive.")

        filesnpaths.is_file_tab_delimited(self.samples_txt_file)

        try:
            # getting the samples information (names, [group], path to r1, path to r2) from samples.txt
            self.samples_information = pd.read_csv(self.samples_txt_file,
                                                   sep='\t',
                                                   index_col=False)
        except IndexError as e:
            raise ConfigError(
                "Looks like your samples_txt file, '%s', is not properly formatted. "
                "This is what we know: '%s'" % (self.samples_txt_file, e))
        if 'sample' not in list(self.samples_information.columns):
            raise ConfigError(
                "Looks like your samples_txt file, '%s', is not properly formatted. "
                "We are not sure what's wrong, but we can't find a column with title 'sample'."
                % self.samples_txt_file)

        self.sample_names_for_mapping_list = self.samples_information[
            'sample'].to_list()

        # Pick which tree algorithm
        self.run_iqtree = self.get_param_value_from_config(['iqtree', 'run'])
        self.run_fasttree = self.get_param_value_from_config(
            ['fasttree', 'run'])

        if not self.run_iqtree and not self.run_fasttree:
            raise ConfigError(
                "Please choose either iqtree or fasttree in your config file to run your phylogenetic tree."
            )

        # Decide to clusterize metagenomic workflow
        self.clusterize_metagenomics_workflow = self.get_param_value_from_config(
            ['run_metagenomics_workflow', 'clusterize'])
        self.metagenomics_workflow_HPC_string = self.get_param_value_from_config(
            ['run_metagenomics_workflow', 'cluster_submission_params'])

        # Pick clustering method
        self.cluster_representative_method = self.get_param_value_from_config(
            ['cluster_representative_method', 'method'])

        if self.cluster_representative_method not in [
                'mmseqs', 'cluster_rep_with_coverages'
        ]:
            raise ConfigError(
                f"anvi'o has never heard of this method to pick a cluster representative: {self.cluster_representative_method} "
                f"Please check your config file {self.config_file} and change cluster_representative_method to one of the following: 'mmseqs' and 'cluster_rep_with_coverages'"
            )

        # coverage-based representatives can only be picked when at least one
        # input table provided a 'bam' column
        if self.cluster_representative_method == 'cluster_rep_with_coverages' and len(
                self.contigsDB_name_bam_dict) == 0:
            raise ConfigError(
                f"The EcoPhylo workflow can't use the cluster representative method cluster_rep_with_coverages without BAM files..."
                f"Please edit your metagenomes.txt or external-genomes.txt and add BAM files."
            )

        self.target_files = self.get_target_files()
Ejemplo n.º 44
0
    def init_commons(self):
        """Validate the user input and load the variability data.

        Checks the output path, reads the optional samples-of-interest and
        genes-of-interest files, makes sure a profile and a contigs database
        were given and are compatible, resolves the splits of interest
        (either from a collection/bin or from a file / pre-filled attribute),
        and finally reads the variability table matching `self.engine`
        ('NT' or 'AA') from the profile database into `self.data`.

        Raises:
            ConfigError: when a required input is missing, inputs contradict
                each other, the requested data were not profiled, or the
                engine is unknown. The progress display is ended before each
                of these is raised.
        """
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_tab_delimited(self.samples_of_interest_path,
                                              expected_number_of_fields=1)
            # read inside a context manager so the handle is always closed
            with open(self.samples_of_interest_path) as samples_file:
                self.samples_of_interest = set(
                    [s.strip() for s in samples_file.readlines()])
        else:
            self.samples_of_interest = set([])

        self.progress.update('Any genes of interest?')
        if self.genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(self.genes_of_interest_path,
                                              expected_number_of_fields=1)
            try:
                with open(self.genes_of_interest_path) as genes_file:
                    self.genes_of_interest = set(
                        [int(s.strip()) for s in genes_file.readlines()])
            except ValueError:
                # at least one line could not be converted to an integer
                self.progress.end()
                raise ConfigError("Well. Anvi'o was working on your genes of interest .. and ... those gene IDs did not\
                                    look like anvi'o gene caller ids :/ Anvi'o is sad now.")

        else:
            self.genes_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            self.progress.end()
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            self.progress.end()
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError("When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                                use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among otheres. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                                flat automatically for you, but then it is much better if you have full control and understaning\
                                of what is going on.")

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            # the auxiliary data file is looked up next to the profile database
            auxiliary_data_file_path = os.path.join(
                os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                self.progress.end()
                raise ConfigError("Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                                    However it wasn't found at '%s' :/" % auxiliary_data_file_path)
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                auxiliary_data_file_path, None, ignore_hash=True)

        self.progress.update(
            'Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired)')

            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError("You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also have specified informaiton for split names?\
                                    This is confusing. You should choose one way or another :/")

            self.splits_of_interest = ccollections.GetSplitNamesInBins(
                self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go oldschool. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You either should give me\
                                        a file with split names you are interested in, or a collection id and\
                                        bin name so I can learn split names from the profile database.')

                filesnpaths.is_file_exists(self.splits_of_interest_path)
                with open(self.splits_of_interest_path) as splits_file:
                    self.splits_of_interest = set(
                        [c.strip().replace('\r', '') for c in splits_file.readlines()])

        # directory that holds the profile database (expression kept as-is so
        # the resulting string does not change)
        self.input_file_path = '/' + '/'.join(
            os.path.abspath(self.profile_db_path).split('/')[:-1])

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.sample_ids = profile_db.samples  # we set this now, but we will overwrite it with args.samples_of_interest if necessary

        if not profile_db.meta['SNVs_profiled']:
            self.progress.end()
            raise ConfigError("Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!")

        if self.engine == 'NT':
            self.data = profile_db.db.get_table_as_dict(
                t.variable_nts_table_name)
        elif self.engine == 'AA':
            # AA specific stuff. first check whether things were profiled
            if not profile_db.meta['AA_frequencies_profiled']:
                self.progress.end()
                raise ConfigError("It seems AA frequencies were not characterized for this profile database.\
                                    There is nothing to report here for AAs!")

            # get the data.
            self.data = profile_db.db.get_table_as_dict(
                t.variable_aas_table_name)

            # append split_name information
            for e in self.data.values():
                e['split_name'] = self.gene_callers_id_to_split_name_dict[
                    e['corresponding_gene_call']]
        else:
            self.progress.end()
            raise ConfigError("VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine)

        profile_db.disconnect()

        self.progress.end()
Ejemplo n.º 45
0
    def __init__(self, args, external_clustering=None):
        """Collect the user input and load everything the interface needs.

        Args:
            args: an object whose `__dict__` holds the command line
                parameters; any parameter that is absent is treated as None.
            external_clustering: optional mapping with the keys
                'clusterings' and 'default_clustering'. When given, it
                replaces the clusterings found while loading the data.

        Raises:
            ConfigError: if manual mode is combined with a collection name,
                or if no clustering is available once loading is done.
        """
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        # NOTE: this default is overwritten by A('title') a few lines below
        self.title = 'Unknown Project'

        # returns the value of a parameter, or None when it was not declared
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.taxonomic_level = A('taxonomic_level')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state_autoload = A('state_autoload')
        self.collection_autoload = A('collection_autoload')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError("You can't anvi-interactive in manual mode with a collection name.")

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy(self.taxonomic_level)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(
                self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = \
                samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = \
                samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(
                self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                    self.contigs_db_path)

        # resolves a file name relative to the output directory in p_meta
        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(
                self.profile_db_path,
                self.samples_information_db_path,
                manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            # materialize the names as a list so the value does not depend on
            # whether dict.keys() returns a list or a view
            self.p_meta['available_clusterings'] = list(self.clusterings.keys())
            self.p_meta['default_clustering'] = self.external_clustering[
                'default_clustering']

        # fall back to the entries named 'default' when nothing was requested
        if not self.state_autoload and 'default' in self.states_table.states:
            self.state_autoload = 'default'

        if not self.collection_autoload and 'default' in self.collections.collections_dict:
            self.collection_autoload = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                                    of splits that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped by anvi-merge because you had too many stplits\
                                    to get the clustering in a reasonable amount of time. Please read the help menu for\
                                    anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")

            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial.")

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        self.split_names_ordered = utils.get_names_order_from_newick_tree(
            self.p_meta['clusterings'][
                self.p_meta['default_clustering']]['newick'])

        # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(
                self.split_names_ordered,
                return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()