Example 1
    def generate_tree(self):
        pcs = set(request.forms.getall('pcs[]'))
        name = request.forms.get('name')
        skip_multiple_gene_calls = request.forms.get('skip_multiple_genes')
        program = request.forms.get('program')
        aligner = request.forms.get('aligner')
        store_tree = request.forms.get('store_tree')

        temp_fasta_file = filesnpaths.get_temp_file_path()
        temp_tree_file = filesnpaths.get_temp_file_path()
        tree_text = None

        try:
            self.interactive.write_sequences_in_PCs_for_phylogenomics(pc_names=pcs,
                                                                      output_file_path=temp_fasta_file,
                                                                      skip_multiple_gene_calls=skip_multiple_gene_calls,
                                                                      align_with=aligner)
            drivers.driver_modules['phylogeny'][program]().run_command(temp_fasta_file, temp_tree_file)
            with open(temp_tree_file, 'rb') as f:
                tree_text = f.read().decode()

            if store_tree:
                if not self.interactive.samples_information_db_path:
                    raise ConfigError("This project does not have samples db")

                samples_information_db = SamplesInformationDatabase(self.interactive.samples_information_db_path)
                samples_information_db.update(single_order_path=temp_tree_file, single_order_name=name)
                self.interactive.samples_order_dict[name] = {'newick': tree_text, 'basic': ''}
        except Exception as e:
            message = str(e.clear_text()) if 'clear_text' in dir(e) else str(e)
            return json.dumps({'status': 1, 'message': message})

        return json.dumps({'status': 0, 'tree': tree_text})
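Most examples on this page follow the same temporary-file lifecycle: request a path, let a producer write to it, read the result back, and remove the file. Here is a minimal, self-contained sketch of that pattern, with the standard library's tempfile standing in for the anvi'o-specific filesnpaths.get_temp_file_path(); the newick payload is made up:

import os
import tempfile

fd, temp_path = tempfile.mkstemp()   # stand-in for filesnpaths.get_temp_file_path()
os.close(fd)                         # only the path is needed; release the descriptor

try:
    with open(temp_path, 'w') as f:  # a producer (e.g. a tree builder) writes here ...
        f.write('(A,(B,C));\n')
    with open(temp_path) as f:       # ... and the caller reads the result back
        tree_text = f.read()
finally:
    os.remove(temp_path)             # clean up no matter what happened above

print(tree_text)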
Example 2
    def __init__(self,
                 args,
                 program_name='tRNAscan-SE',
                 run=None,
                 progress=None):
        self.program_name = program_name

        self.tested_versions = ['2.0.5']

        P = lambda p: os.path.abspath(os.path.expanduser(p))
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.num_threads = A('num_threads') or 1
        self.fasta_file_path = P(
            A('fasta_file')
            or filesnpaths.get_temp_file_path(just_the_path=True))
        self.output_file_path = P(
            A('output_file')
            or filesnpaths.get_temp_file_path(just_the_path=True))
        self.log_file_path = P(
            A('log_file')
            or filesnpaths.get_temp_file_path(just_the_path=True))
        self.cutoff_score = A('cutoff_score') or 20
        self.quiet = A('quiet')

        self.keep_output_file = A('output_file')
        self.keep_log_file = A('log_file')

        self.run = run or terminal.Run(verbose=(not self.quiet))
        self.progress = progress or terminal.Progress(not self.quiet)

        self.sanity_check()
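The P/A lambdas above are a recurring anvi'o idiom for unpacking an argparse-style namespace with per-argument fallbacks. A self-contained sketch of how the A helper behaves (the attribute names and defaults here are made up):

from types import SimpleNamespace

args = SimpleNamespace(num_threads=4, fasta_file=None)
A = lambda x: args.__dict__[x] if x in args.__dict__ else None

num_threads = A('num_threads') or 1        # -> 4: the attribute is present and truthy
fasta_file = A('fasta_file') or 'tmp.fa'   # -> 'tmp.fa': present but None, so the default wins
cutoff_score = A('cutoff_score') or 20     # -> 20: attribute missing entirely, A() returns None

One caveat baked into the idiom: any falsy value (0, '', False) is replaced by the default, which is fine for these particular arguments but worth knowing.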
Example 3
    def dry_run(self, workflow_graph_output_file_path_prefix='workflow'):
        """Not your regular dry run.

           The purpose of this function is to make sure there is a way to check for
           workflow program dependencies before the workflow is actually run. This way,
           if there is a `check_workflow_program_dependencies` call at the end of the
           snake file `get_workflow_snake_file_path(self.name)`, it can be called with
           a compiled snakemake `workflow` instance."""

        if self.this_workflow_is_inherited_by_another:
            return

        self.progress.new('Bleep bloop')
        self.progress.update('Quick dry run for an initial sanity check ...')
        args = ['snakemake', '--snakefile', get_workflow_snake_file_path(self.name), \
                '--configfile', self.config_file, '--dryrun', '--quiet']

        if self.save_workflow_graph:
            args.extend(['--dag'])

        log_file_path = filesnpaths.get_temp_file_path()
        u.run_command(args, log_file_path)
        self.progress.end()

        # here we're getting the graph info from the log file like a dirty hacker
        # we are (it still may be better to do it elsewhere more appropriate .. so
        # we can look more decent or whatever):
        if self.save_workflow_graph:
            lines = open(log_file_path, 'r').readlines()

            try:
                line_of_interest = [line_no for line_no in range(0, len(lines)) if lines[line_no].startswith('digraph')][0]
            except IndexError:
                raise ConfigError("Oh no. Anvi'o was trying to generate a DAG output for you, but something must have "
                                  "gone wrong in a step prior. Something tells anvi'o that if you take a look at the "
                                  "log file here, you may be able to figure it out: '%s'. Sorry!" % log_file_path)
            open(workflow_graph_output_file_path_prefix + '.dot', 'w').write(''.join(lines[line_of_interest:]))

            self.run.info('Workflow DOT file', workflow_graph_output_file_path_prefix + '.dot')

            if u.is_program_exists('dot', dont_raise=True):
                dot_log_file = filesnpaths.get_temp_file_path()
                u.run_command(['dot', '-Tpdf', workflow_graph_output_file_path_prefix + '.dot', '-o', workflow_graph_output_file_path_prefix + '.pdf'], dot_log_file)
                os.remove(dot_log_file)
                self.run.info('Workflow PDF file', workflow_graph_output_file_path_prefix + '.pdf')
            else:
                self.run.warning("Well, anvi'o was going to try to save a nice PDF file for your workflow "
                                 "graph, but clearly you don't have `dot` installed on your system. That's OK. You "
                                 "have your dot file now, and you can Google 'how to view dot file on [your operating "
                                 "system goes here]', and install necessary programs (like .. `dot`).")

        os.remove(log_file_path)
Example 4
    def dry_run(self, workflow_graph_output_file_path_prefix='workflow'):
        """Not your regular dry run.

           The purpose of this function is to make sure there is a way to check for
           workflow program dependencies before the workflow is actually run. This way,
           if there is a `check_workflow_program_dependencies` call at the end of the
           snake file `get_workflow_snake_file_path(self.name)`, it can be called with
           a compiled snakemake `workflow` instance."""

        if self.slave_mode:
            return

        self.progress.new('Bleep bloop')
        self.progress.update('Quick dry run for an initial sanity check ...')
        args = ['snakemake', '--snakefile', get_workflow_snake_file_path(self.name), \
                '--configfile', self.config_file, '--dryrun', '--quiet']

        if self.save_workflow_graph:
            args.extend(['--dag'])

        log_file_path = filesnpaths.get_temp_file_path()
        u.run_command(args, log_file_path)
        self.progress.end()

        # here we're getting the graph info from the log file like a dirty hacker
        # we are (it still may be better to do it elsewhere more appropriate .. so
        # we can look more decent or whatever):
        if self.save_workflow_graph:
            lines = open(log_file_path, 'r').readlines()

            try:
                line_of_interest = [line_no for line_no in range(0, len(lines)) if lines[line_no].startswith('digraph')][0]
            except IndexError:
                raise ConfigError("Oh no. Anvi'o was trying to generate a DAG output for you, but something must have\
                                   gone wrong in a step prior. Something tells anvi'o that if you take a look at the\
                                   log file here, you may be able to figure it out: '%s'. Sorry!" % log_file_path)
            open(workflow_graph_output_file_path_prefix + '.dot', 'w').write(''.join(lines[line_of_interest:]))

            self.run.info('Workflow DOT file', workflow_graph_output_file_path_prefix + '.dot')

            if u.is_program_exists('dot', dont_raise=True):
                dot_log_file = filesnpaths.get_temp_file_path()
                u.run_command(['dot', '-Tpng', workflow_graph_output_file_path_prefix + '.dot', '-o', workflow_graph_output_file_path_prefix + '.png'], dot_log_file)
                os.remove(dot_log_file)
                self.run.info('Workflow PNG file', workflow_graph_output_file_path_prefix + '.png')
            else:
                self.run.warning("Well, anvi'o was going to try to save a nice PNG file for your workflow\
                                  graph, but clearly you don't have `dot` installed on your system. That's OK. You\
                                  have your dot file now, and you can Google 'how to view dot file on [your operating\
                                  system goes here]', and install necessary programs (like .. `dot`).")

        os.remove(log_file_path)
Example 5
    def __init__(self,
                 program_name='fastANI',
                 args={},
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.run = run
        self.progress = progress
        self.program_name = program_name

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.kmer_size = A('fastani_kmer_size') or 16
        self.fragment_length = A('fragment_length') or 3000
        self.min_num_fragments = A('min_num_fragments') or 50
        self.num_threads = A('num_threads') or 1
        self.log_file_path = os.path.abspath(
            A('log_file') or filesnpaths.get_temp_file_path())
        self.quiet = A('quiet')

        self.check_programs()

        self.run.warning(
            "Anvi'o will use 'fastANI' by Jain et al. (DOI: 10.1038/s41467-018-07641-9) to compute ANI. "
            "If you publish your findings, please do not forget to properly credit their work.",
            lc='green',
            header="CITATION")
Example 6
    def fix_input_file(self, input_file_path):
        """This is sadly necessary because Kaiju output contains either three,
           or eight TAB-delimited columns ... not very parser friendly"""

        filesnpaths.is_file_exists(input_file_path)

        self.progress.new('Fixing the broken kaiju output')
        self.progress.update('...')

        corrected_temp_file_path = filesnpaths.get_temp_file_path()
        corrected_temp_file = open(corrected_temp_file_path, 'w')
        input_file = open(input_file_path, 'r')

        num_correct_lines = 0
        for line in input_file:
            if len(line.split('\t')) == 8:
                corrected_temp_file.write(line)
                num_correct_lines += 1

        input_file.close()
        corrected_temp_file.close()

        self.progress.end()

        if not num_correct_lines:
            os.remove(corrected_temp_file_path)
            raise ConfigError("Something must have been wrong with you input file. Not a single line in it\
                               matched to what the kaiju parsers expects (a proper file should have eight\
                               TAB-delimited columns).")

        return corrected_temp_file_path
Example 7
    def __init__(self,
                 args={},
                 run=terminal.Run(),
                 progress=terminal.Progress(),
                 program_name='average_nucleotide_identity.py'):
        self.run = run
        self.progress = progress
        self.program_name = program_name

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.num_threads = A('num_threads') or 1
        self.method = A('method') or 'ANIb'
        self.log_file_path = os.path.abspath(
            A('log_file') or filesnpaths.get_temp_file_path())

        self.check_programs()

        self.run.warning(
            "Anvi'o will use 'PyANI' by Pritchard et al. (DOI: 10.1039/C5AY02550H) to compute ANI. "
            "If you publish your findings, please do not forget to properly credit their work.",
            lc='green',
            header="CITATION")

        self.run.info('[PyANI] Num threads to use', self.num_threads)
        self.run.info('[PyANI] Alignment method', self.method)
        self.run.info('[PyANI] Log file path', self.log_file_path, nl_after=1)
Example 8
    def run_gene_caller(self, gene_caller='prodigal'):
        """Runs gene caller, and returns gene_calls_dict, and amino acid sequences."""
        remove_fasta_after_processing = False

        if not self.contigs_fasta:
            self.contigs_fasta = filesnpaths.get_temp_file_path()
            utils.export_sequences_from_contigs_db(
                self.contigs_db_path,
                output_file_path=self.contigs_fasta,
                run=self.run)
            remove_fasta_after_processing = True

        if self.debug:
            self.run.info_single(
                '--debug flag is [ON], which means temporary directories generated by '
                'this run will not be removed',
                nl_after=2)

        gene_caller_object = genecalling.GeneCaller(self.contigs_fasta,
                                                    gene_caller=gene_caller,
                                                    args=self.args,
                                                    debug=self.debug)

        gene_calls_dict, amino_acid_sequences = gene_caller_object.process()

        if not self.debug and remove_fasta_after_processing:
            os.remove(self.contigs_fasta)

        return gene_calls_dict, amino_acid_sequences
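The remove_fasta_after_processing flag above implements a simple ownership rule: the function deletes the FASTA file only if it created it. A dependency-free sketch of the same pattern (the processing step is a placeholder):

import os
import tempfile

def process_fasta(fasta_path=None):
    created_here = False
    if fasta_path is None:
        fd, fasta_path = tempfile.mkstemp(suffix='.fa')  # stand-in for get_temp_file_path()
        os.close(fd)
        created_here = True
    try:
        with open(fasta_path) as f:
            return f.read()                              # placeholder for the real work
    finally:
        if created_here:
            os.remove(fasta_path)                        # never delete a caller-owned file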
Example 9
    def check_for_db_requests(self, config):
        sections = self.get_other_sections(config)
        # look for requests from the database, create temporary tab delimited files:
        for section in sections:
            alias, matrix = section.split()
            if matrix.find('::') > -1:
                if matrix.startswith('!'):
                    database, table = matrix.split('::')
                    database = database[1:]

                    if database not in self.db_paths:
                        raise ConfigError("anvi'o could not recover the actual path of the database "
                                          "(!%s) referenced in the config file, because the database "
                                          "paths variable sent from the client does not have an entry "
                                          "for it :( There are two options. One is to get a db_paths "
                                          "dictionary sent to this class that contains a key for %s "
                                          "with the full path to the database as a value. Or the table "
                                          '"%s" can be exported to a TAB-delimited matrix and declared in '
                                          "the config file. If you are experimenting and stuck here, please "
                                          "see the documentation or send an e-mail to the developers."
                                          % (database, database, table))
                    database_path = self.db_paths[database]
                else:
                    database, table = matrix.split('::')
                    database_path = os.path.join(self.input_directory,
                                                 database)

                if not os.path.exists(database_path):
                    raise ConfigError('The database you requested (%s) is not in the input directory :/' % database)

                dbc = db.DB(database_path, None, ignore_version=True)

                if table not in dbc.get_table_names():
                    raise ConfigError('The table you requested (%s) does not seem to be in %s :/' % (table, database))

                table_rows = dbc.get_all_rows_from_table(table)

                if self.row_ids_of_interest:
                    table_rows = [
                        r for r in table_rows
                        if r[0] in self.row_ids_of_interest
                    ]

                if not len(table_rows):
                    raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This "
                                      "is not good. Here is the section that is not working for you: '%s' :/"
                                      % (table, section))

                tmp_file_path = filesnpaths.get_temp_file_path()
                table_structure = dbc.get_table_structure(table)
                columns_to_exclude = [
                    c for c in ['entry_id', 'sample_id']
                    if c in table_structure
                ]
                store_array(table_rows,
                            tmp_file_path,
                            table_structure,
                            exclude_columns=columns_to_exclude)
                self.matrix_paths[alias] = tmp_file_path
Example 10
    def __init__(self,
                 args={},
                 run=terminal.Run(),
                 progress=terminal.Progress(),
                 program_name='sourmash'):
        self.run = run
        self.progress = progress
        self.program_name = program_name
        self.check_program()

        self.results = {}

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.log_file_path = os.path.abspath(
            A('log_file') or filesnpaths.get_temp_file_path())
        self.num_threads = A('num_threads') or 1
        self.kmer_size = A('kmer_size') or 51
        self.scale = A('scale') or 1000

        self.run.warning(
            "Anvi'o will use 'sourmash' by Brown et al. (DOI: 10.21105/joss.00027) to compute kmer sequences and determine mash distances. "
            "If you publish your findings, please do not forget to properly credit their work.",
            lc='green',
            header="CITATION")

        if self.num_threads != 1:
            self.num_threads = 1
            self.run.warning(
                "Anvi'o speaking: sourmash currently doesn't support multithreading. "
                "Anvi'o will have to reduce your number of threads to one :(")

        self.run.info('[sourmash] Log file path',
                      self.log_file_path,
                      nl_after=1)
Example 11
    def format_protein_db(self, input_file_path, output_file_path):
        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        # poor man's uncompress
        temp_fasta_path = filesnpaths.get_temp_file_path()
        try:
            with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
                f_out.write(f_in.read())
        except Exception as e:
            progress.end()
            raise ConfigError(f"Something went wrong while decompressing the downloaded file :/ It is likely that "
                              f"the download failed and only part of the file was downloaded. If you would like to "
                              f"try again, please run the setup command with the flag `--reset`. Here is what the "
                              f"downstream library said: '{e}'.")

        progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            self.run.warning("DIAMOND does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")

        os.remove(temp_fasta_path)
Example 12
    def format_protein_db(self, input_file_path, output_file_path):
        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        # poor man's uncompress
        temp_fasta_path = filesnpaths.get_temp_file_path()
        with open(temp_fasta_path,
                  'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
            f_out.write(f_in.read())

        progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            self.run.warning(
                "Diamond does not seem to be installed on this system, so anvi'o is not going to "
                "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists(
                'makeblastdb', dont_raise=True) and utils.is_program_exists(
                    'blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            self.run.warning(
                "BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                "generate a search database for them to be used. Keep this in mind for later.")

        os.remove(temp_fasta_path)
Example 13
def get_vectors_from_TAB_delim_matrix(file_path, cols_to_return=None, rows_to_return=[], transpose=False):
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_tab_delimited(file_path)

    if transpose:
        transposed_file_path = filesnpaths.get_temp_file_path()
        transpose_tab_delimited_file(file_path, transposed_file_path)
        file_path = transposed_file_path

    rows_to_return = set(rows_to_return)
    vectors = []
    id_to_sample_dict = {}
    sample_to_id_dict = {}

    input_matrix = open(file_path)
    columns = input_matrix.readline().strip().split("\t")[1:]

    fields_of_interest = []
    if cols_to_return:
        fields_of_interest = [columns.index(col) for col in cols_to_return]
    else:
        fields_of_interest = [f for f in range(0, len(columns)) if IS_ESSENTIAL_FIELD(columns[f])]

    # update columns:
    columns = [columns[i] for i in fields_of_interest]

    if not len(columns):
        raise ConfigError("Only a subset (%d) of fields were requested by the caller, but none of them was found "
                          "in the matrix (%s) :/" % (len(cols_to_return), file_path))

    id_counter = 0
    for line in input_matrix.readlines():
        row_name = line.strip().split("\t")[0]
        if rows_to_return and row_name not in rows_to_return:
            continue
        id_to_sample_dict[id_counter] = row_name
        fields = line.strip().split("\t")[1:]

        if fields_of_interest:
            vector = [float(fields[i]) for i in fields_of_interest]
        else:
            vector = [float(f) for f in fields]

        vectors.append(vector)

        id_counter += 1

    input_matrix.close()

    if transpose:
        # remove clutter
        os.remove(file_path)

    sample_to_id_dict = dict([(v, k) for k, v in id_to_sample_dict.items()])

    return id_to_sample_dict, sample_to_id_dict, columns, vectors
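When transpose=True, the function above materializes the transposed matrix in a temporary file and deletes it when done. A dependency-free sketch of what such a transpose step boils down to (the file name and matrix contents are made up, and tempfile stands in for filesnpaths.get_temp_file_path()):

import os
import tempfile

def transpose_tab_delimited(in_path, out_path):
    with open(in_path) as f:
        rows = [line.rstrip('\n').split('\t') for line in f]
    with open(out_path, 'w') as f:
        for column in zip(*rows):        # columns of the input become rows of the output
            f.write('\t'.join(column) + '\n')

fd, tmp_path = tempfile.mkstemp()
os.close(fd)

with open('matrix.txt', 'w') as f:       # hypothetical input matrix
    f.write('key\ts1\ts2\nrow1\t1\t2\nrow2\t3\t4\n')

transpose_tab_delimited('matrix.txt', tmp_path)
with open(tmp_path) as f:
    print(f.read())                      # the header row is now 'key\trow1\trow2'
os.remove(tmp_path)                      # remove clutter, as the function above does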
Example 14
    def generate(self):
        d = {}

        log_file = filesnpaths.get_temp_file_path()
        num_all_programs = len(self.all_programs)
        for i in range(0, num_all_programs):
            program_path = self.all_programs[i]
            program_name = os.path.basename(program_path)

            if program_name in self.programs_to_skip:
                run.warning("Someone doesn't want %s to be in the output :/ Fine. Skipping." % (program_name))
                continue

            progress.new('Bleep bloop')
            progress.update('%s (%d of %d)' % (program_name, i+1, num_all_programs))

            output = utils.run_command_STDIN('%s --help' % (program_path), log_file, '').split('\n')

            if anvio.DEBUG:
                usage, description, params, output = parse_help_output(output)
            else:
                try:
                    usage, description, params, output = parse_help_output(output)
                except Exception as e:
                    progress.end()
                    run.warning("The program '%s' does not seem to have the expected help menu output. Skipping to the next.\
                                 For the curious, this was the error message: '%s'" % (program_name, str(e).strip()))
                    continue

            d[program_name] = {'usage': usage,
                               'description': description,
                               'params': params,
                               'tags': get_meta_information_from_file(program_path, '__tags__'),
                               'resources': get_meta_information_from_file(program_path, '__resources__')}

            progress.end()

        os.remove(log_file)

        # generate output
        program_names = sorted([p for p in d if not p.startswith('anvi-script-')])
        script_names = sorted([p for p in d if p.startswith('anvi-script-')])
        vignette = {'vignette': d,
                    'program_names': program_names,
                    'script_names': script_names,
                    'all_names': program_names + script_names,
                    'meta': {'summary_type': 'vignette',
                             'version': '\n'.join(['|%s|%s|' % (t[0], t[1]) for t in anvio.get_version_tuples()]),
                             'date': utils.get_date()}}

        if anvio.DEBUG:
            run.warning(None, 'THE OUTPUT DICT')
            import json
            print(json.dumps(d, indent=2))

        open(self.output_file_path, 'w').write(SummaryHTMLOutput(vignette, r=run, p=progress).render())

        run.info('Output file', os.path.abspath(self.output_file_path))
Example 15
    def process(self):
        """
        """
        # will be empty if all sources in self.residue_annotation_sources_info have "skip": True
        residue_annotation_methods = [info["method"] for _, info in self.residue_annotation_sources_info.items() if not info["skip"]]

        # which genes had structures and which did not. this information is added to the structure database self table
        has_structure = {True: [], False: []}

        num_genes_tried = 0
        num_genes_to_try = len(self.genes_of_interest)

        for corresponding_gene_call in self.genes_of_interest:
            # MODELLER outputs a lot of stuff into its working directory. A temporary directory is
            # made for each instance of MODELLER (i.e. each protein), and bits and pieces of this
            # directory are used in the creation of the structure database. If self.full_modeller_output is
            # provided, these directories and their contents are moved into self.full_modeller_output.
            self.args.directory = filesnpaths.get_temp_directory_path()
            self.args.target_fasta_path = filesnpaths.get_temp_file_path()

            # Export sequence
            dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path,
                                                      self.args.target_fasta_path,
                                                      set([corresponding_gene_call]),
                                                      quiet = True)

            # Model structure
            progress_title = 'Modelling gene ID %d; (%d of %d processed)' % (corresponding_gene_call, num_genes_tried, num_genes_to_try)
            modeller_out = self.run_modeller(corresponding_gene_call, progress_title)
            if modeller_out["structure_exists"]:
                self.run.info_single("Gene successfully modelled!", nl_after=1, mc="green")

            has_structure[modeller_out["structure_exists"]].append(str(corresponding_gene_call))

            # Annotate residues
            residue_info_dataframe = None
            if modeller_out["structure_exists"]:
                residue_info_dataframe = self.run_residue_annotation_for_gene(residue_annotation_methods,
                                                                              corresponding_gene_call,
                                                                              modeller_out["best_model_path"])
            # Append info to tables
            self.append_gene_info_to_tables(modeller_out, residue_info_dataframe)

            # Append metadata to self
            self.update_structure_database_meta_table(has_structure)

            if self.full_modeller_output:
                self.dump_results_to_full_output()

            num_genes_tried += 1

        if not has_structure[True]:
            raise ConfigError("Well this is really sad. No structures were modelled, so there is nothing to do. Bye :'(")

        self.structure_db.disconnect()
        self.run.info("Structure database", self.output_db_path)
Example 16
def get_vectors_from_TAB_delim_matrix(file_path, cols_to_return=None, rows_to_return=[], transpose=False):
    filesnpaths.is_file_exists(file_path)
    filesnpaths.is_file_tab_delimited(file_path)

    if transpose:
        transposed_file_path = filesnpaths.get_temp_file_path()
        transpose_tab_delimited_file(file_path, transposed_file_path)
        file_path = transposed_file_path

    rows_to_return = set(rows_to_return)
    vectors = []
    id_to_sample_dict = {}
    sample_to_id_dict = {}

    input_matrix = open(file_path, 'r')
    columns = input_matrix.readline().strip().split('\t')[1:]

    fields_of_interest = []
    if cols_to_return:
        fields_of_interest = [columns.index(col) for col in cols_to_return]
    else:
        fields_of_interest = [f for f in range(0, len(columns)) if IS_ESSENTIAL_FIELD(columns[f])]

    # update columns:
    columns = [columns[i] for i in fields_of_interest]

    if not len(columns):
        raise ConfigError("Only a subset (%d) of fields were requested by the caller, but none of them was found "
                          "in the matrix (%s) :/" % (len(cols_to_return), file_path))

    id_counter = 0
    for line in input_matrix.readlines():
        row_name = line.strip().split('\t')[0]
        if rows_to_return and row_name not in rows_to_return:
            continue
        id_to_sample_dict[id_counter] = row_name
        fields = line.strip().split('\t')[1:]

        if fields_of_interest:
            vector = [float(fields[i]) for i in fields_of_interest]
        else:
            vector = [float(f) for f in fields]

        vectors.append(vector)

        id_counter += 1

    input_matrix.close()

    if transpose:
        # remove clutter
        os.remove(file_path)

    sample_to_id_dict = dict([(v, k) for k, v in id_to_sample_dict.items()])

    return id_to_sample_dict, sample_to_id_dict, columns, vectors
Example 17
def get_newick_tree_data_for_dict(d, linkage=constants.linkage_method_default, distance=constants.distance_metric_default):
    is_distance_and_linkage_compatible(distance, linkage)

    matrix_file = filesnpaths.get_temp_file_path()
    utils.store_dict_as_TAB_delimited_file(d, matrix_file, ['items'] + list(d[next(iter(d))].keys()))

    newick = get_newick_tree_data(matrix_file, distance=distance, linkage=linkage)

    os.remove(matrix_file)
    return newick
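For context, this is roughly what such a helper computes under the hood: hierarchically cluster the dict's numeric vectors and render the result as a newick string. A self-contained SciPy sketch (the data is made up, and the euclidean/ward choices are only stand-ins for anvi'o's configured defaults):

import numpy as np
from scipy.cluster.hierarchy import linkage, to_tree
from scipy.spatial.distance import pdist

d = {'item1': {'a': 1.0, 'b': 0.0},
     'item2': {'a': 0.9, 'b': 0.1},
     'item3': {'a': 0.0, 'b': 1.0}}

items = list(d)
keys = list(d[items[0]])
matrix = np.array([[d[i][k] for k in keys] for i in items])

tree = to_tree(linkage(pdist(matrix, metric='euclidean'), method='ward'))

def to_newick(node, labels):
    # recursively render a SciPy ClusterNode as a (simplified) newick string
    if node.is_leaf():
        return labels[node.id]
    return '(%s,%s):%.4f' % (to_newick(node.left, labels),
                             to_newick(node.right, labels),
                             node.dist)

print(to_newick(tree, items) + ';')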
Example 18
    def process_single_order_data(self, single_order_path, single_order_name):
        """Just inject a single order into the `self.samples_order_dict`"""

        if not single_order_path:
            return

        if not single_order_name:
            raise SamplesError("You provided a file for a single order, but not a name for it. This is a no no :/")

        filesnpaths.is_file_plain_text(single_order_path)

        single_order_file_content = [l.strip('\n') for l in open(single_order_path, 'r').readlines()]

        if len(single_order_file_content) != 1:
            raise SamplesError("The single order file should contain a single line of information. It can't have nothing,\
                                it can't have too much. Just a single newick tree, or a comma-separated list of sample\
                                names.")

        _order = single_order_file_content.pop()

        # if you are reading this line, please brace yourself for possibly one of the silliest
        # bunch of lines in the anvi'o codebase. the reason we are doing this this way is quite
        # a long story, and deserves a FIXME, but in order to utilize the excellent function
        # in the filesnpaths module to check the contents of the samples order dict rigorously,
        # we need to have this information in a file. a better way could have been implementing
        # a filesnpaths.is_proper_samples_order_content function next to the currently available
        # filesnpaths.is_proper_samples_order_file (the latter would call the former with a dict
        # and it would be much more flexible), but we can't import utils from within filesnpaths.
        # without utils we don't have a get_TAB_delimited_file_as_dictionary function, and we are
        # definitely not going to implement it in two places :( recovering from a poor design by
        # doing something even poorer? couldn't we have fixed this once and for all instead of
        # writing this paragraph? well. just remember that you are thinking about a rhetorical
        # question in a comment section. so sometimes we do things that are not quite productive.
        temp_samples_order_file_path = filesnpaths.get_temp_file_path()
        temp_samples_order_file = open(temp_samples_order_file_path, 'w')
        temp_samples_order_file.write('\t'.join(['attributes', 'basic', 'newick']) + '\n')

        if filesnpaths.is_proper_newick(_order, dont_raise=True):
            temp_samples_order_file.write('\t'.join([single_order_name, '', _order]) + '\n')
            self.samples_order_dict[single_order_name] = {'newick': _order, 'basic': None}
        else:
            temp_samples_order_file.write('\t'.join([single_order_name, _order, '']) + '\n')
            self.samples_order_dict[single_order_name] = {'basic': _order, 'newick': None}

        temp_samples_order_file.close()

        sample_names_in_samples_order_file = filesnpaths.is_proper_samples_order_file(temp_samples_order_file_path)
        os.remove(temp_samples_order_file_path)

        if not self.sample_names_in_samples_information_file:
            self.sample_names_in_samples_order_file = sample_names_in_samples_order_file

        self.available_orders.add(single_order_name)

        self.run.info('Samples order', "A single order for '%s' is also loaded" % single_order_name, quiet=self.quiet)
Example 19
    def format_protein_db(self, input_file_path, output_file_path):
        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        # poor man's uncompress
        temp_fasta_path = filesnpaths.get_temp_file_path()
        with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
            f_out.write(f_in.read())

        progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            self.run.warning("Diamond does not seem to be installed on this system, so anvi'o is not going to\
                              generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to\
                              generate a search database for them to be used. Keep this in mind for later.")

        os.remove(temp_fasta_path)
Example 20
    def check_database(self):
        """
        Checks for the .bin version of the database. If it only finds the .pir version, it binarizes it.
        Sets the db filepath.
        """
        extensionless, extension = os.path.splitext(self.modeller_database)
        if extension not in [".bin", ".pir", ""]:
            raise ConfigError(
                "MODELLER :: The only possible database extensions are .bin and .pir"
            )

        bin_db_path = J(self.database_dir, extensionless + ".bin")
        pir_db_path = J(self.database_dir, extensionless + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        self.database_path = bin_db_path

        if bin_exists:
            return

        if not pir_exists and not bin_exists:
            self.progress.clear()
            self.run.warning(
                "Anvi'o looked in {} for a database with the name {} and with an extension "
                "of either .bin or .pir, but didn't find anything matching those "
                "criteria. Anvi'o will try and download the best database it knows of from "
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                "You can check out https://salilab.org/modeller/ for more info about the pdb_95 "
                "database".format(self.database_dir, self.modeller_database))

            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file(
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                db_download_path)
            utils.run_command(['gzip', '-d', db_download_path],
                              log_file_path=filesnpaths.get_temp_file_path())

            pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                          dont_raise=True)

        if pir_exists and not bin_exists:
            self.progress.clear()
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary format. Just FYI")
            self.run_binarize_database(pir_db_path, bin_db_path)
            return
Example 21
    def generate_tree(self):
        gene_clusters = set(request.forms.getall('gene_clusters[]'))
        name = request.forms.get('name')
        program = request.forms.get('program')
        aligner = request.forms.get('aligner')
        store_tree = request.forms.get('store_tree')

        temp_fasta_file = filesnpaths.get_temp_file_path()
        temp_tree_file = filesnpaths.get_temp_file_path()
        tree_text = None

        try:
            self.interactive.write_sequences_in_gene_clusters_for_phylogenomics(
                gene_cluster_names=gene_clusters,
                output_file_path=temp_fasta_file,
                align_with=aligner)
            drivers.driver_modules['phylogeny'][program]().run_command(
                temp_fasta_file, temp_tree_file)
            with open(temp_tree_file, 'rb') as f:
                tree_text = f.read().decode()

            if store_tree:
                TableForLayerOrders(self.interactive.args).add(
                    {name: {
                        'data_type': 'newick',
                        'data_value': tree_text
                    }})

                # TO DO: instead of injecting new newick tree, we can use TableForLayerOrders.get()
                self.interactive.layers_order_data_dict[name] = {
                    'newick': tree_text,
                    'basic': None
                }
        except Exception as e:
            message = str(e.clear_text()) if 'clear_text' in dir(e) else str(e)
            return json.dumps({'status': 1, 'message': message})

        return json.dumps({'status': 0, 'tree': tree_text})
Example 22
    def check_for_db_requests(self, config):
        sections = self.get_other_sections(config)
        # look for requests from the database, create temporary tab delimited files:
        for section in sections:
            alias, matrix = section.split()
            if matrix.find('::') > -1:
                if matrix.startswith('!'):
                    database, table = matrix.split('::')
                    database = database[1:]

                    if database not in self.db_paths:
                        raise ConfigError("anvi'o could not recover the actual path of the database "
                                          "(!%s) referenced in the config file, because the database "
                                          "paths variable sent from the client does not have an entry "
                                          "for it :( There are two options. One is to get a db_paths "
                                          "dictionary sent to this class that contains a key for %s "
                                          "with the full path to the database as a value. Or the table "
                                          '"%s" can be exported to a TAB-delimited matrix and declared in '
                                          "the config file. If you are experimenting and stuck here, please "
                                          "see the documentation or send an e-mail to the developers."
                                          % (database, database, table))
                    database_path = self.db_paths[database]
                else:
                    database, table = matrix.split('::')
                    database_path = os.path.join(self.input_directory, database)

                if not os.path.exists(database_path):
                    raise ConfigError('The database you requested (%s) is not in the input directory :/' % database)

                dbc = db.DB(database_path, None, ignore_version=True)

                if table not in dbc.get_table_names():
                    raise ConfigError('The table you requested (%s) does not seem to be in %s :/' % (table, database))

                table_rows = dbc.get_all_rows_from_table(table)

                if self.row_ids_of_interest:
                    table_rows = [r for r in table_rows if r[0] in self.row_ids_of_interest]

                if not len(table_rows):
                    raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This "
                                      "is not good. Here is the section that is not working for you: '%s' :/"
                                      % (table, section))

                tmp_file_path = filesnpaths.get_temp_file_path()
                table_structure = dbc.get_table_structure(table)
                columns_to_exclude = [c for c in ['entry_id', 'sample_id'] if c in table_structure]
                store_array(table_rows, tmp_file_path, table_structure, exclude_columns=columns_to_exclude)
                self.matrix_paths[alias] = tmp_file_path
Example 23
def get_newick_tree_data_for_dict(d,
                                  linkage=constants.linkage_method_default,
                                  distance=constants.distance_metric_default):
    is_distance_and_linkage_compatible(distance, linkage)

    matrix_file = filesnpaths.get_temp_file_path()
    utils.store_dict_as_TAB_delimited_file(d, matrix_file,
                                           ['items'] + list(d[next(iter(d))].keys()))

    newick = get_newick_tree_data(matrix_file,
                                  distance=distance,
                                  linkage=linkage)

    os.remove(matrix_file)
    return newick
Example 24
    def __init__(self, args={}, run=terminal.Run(), progress=terminal.Progress(), program_name='average_nucleotide_identity.py'):
        self.run = run
        self.progress = progress
        self.program_name = program_name

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.num_threads = A('num_threads') or 1
        self.method = A('method') or 'ANIb'
        self.log_file_path = os.path.abspath(A('log_file') or filesnpaths.get_temp_file_path())

        self.check_programs()

        self.run.warning("Anvi'o will use 'PyANI' by Pritchard et al. (DOI: 10.1039/C5AY02550H) to compute ANI. If you publish your findings, \
                            please do not forget to properly credit their work.", lc='green', header="CITATION")

        self.run.info('[PyANI] Num threads to use', self.num_threads)
        self.run.info('[PyANI] Alignment method', self.method)
        self.run.info('[PyANI] Log file path', self.log_file_path, nl_after=1)
Example 25
    def fix_input_file(self, input_file_path):
        """Select columns for anvio and remove duplicate rows"""
        self.progress.new('Making agnostos output anvio friendly')
        self.progress.update('...')

        temp_file_path = filesnpaths.get_temp_file_path()

        df = pd.read_csv(input_file_path, sep='\t')
        df = df[[
            "gene_callers_id", "cl_name", "contig", "gene_x_contig", "cl_size",
            "category", "pfam", "is.HQ", "is.LS", "lowest_rank",
            "lowest_level", "niche_breadth_sign"
        ]]
        df = df.drop_duplicates(subset=["gene_callers_id"])

        df.to_csv(temp_file_path, sep='\t', index=False, na_rep='NA')

        self.progress.end()

        return temp_file_path
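A toy round-trip showing what the column selection and drop_duplicates(subset=["gene_callers_id"]) calls above do; pandas keeps the first occurrence of each duplicated id by default. The frame below is made up:

import pandas as pd

df = pd.DataFrame({'gene_callers_id': [1, 1, 2],
                   'cl_name': ['cl_a', 'cl_a', 'cl_b'],
                   'unused_column': ['x', 'y', 'z']})   # a column the fixer would drop

df = df[['gene_callers_id', 'cl_name']]                 # keep only the needed columns
df = df.drop_duplicates(subset=['gene_callers_id'])     # rows with ids 1 (first) and 2 remain

print(df.to_csv(sep='\t', index=False, na_rep='NA'))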
Example 26
    def gen_hierarchical_clustering_of_gene_clusters(self):
        """Uses a clustering configuration to add hierarchical clustering of gene clusters into the pan db

        Note how this function cheats the system to create an enchanced clustering configuration:
        We want to use the clustering configurations for pan genomomic analyses to order
        gene clusters. however, we want to add something into the clustering configuraiton
        file, which depends on the number of genomes we have. this addition is 'num_genomes_gene_cluster_has_hits'
        data, which pulls together gene clusters that are distributed across genomes similarly based
        on this extra bit of inofrmation. becasue the clustering configurations framework in anvi'o
        does not allow us to have variable information in these recipes, we are going to generate one
        on the fly to have a more capable one."""

        if self.skip_hierarchical_clustering:
            return

        updated_clustering_configs = {}

        for config_name in constants.clustering_configs['pan']:
            config_path = constants.clustering_configs['pan'][config_name]

            # now we have the config path. we first get a temporary file path:
            enhanced_config_path = filesnpaths.get_temp_file_path()

            # setup the additional section based on the number of genomes we have:
            if config_name == 'presence-absence':
                additional_config_section="""\n[AdditionalData !PAN.db::item_additional_data]\ntable_form=dataframe\ncolumns_to_use = %s\nnormalize = False\n""" \
                                        % ','.join(['num_genomes_gene_cluster_has_hits'] * (int(round(len(self.genomes) / 2))))
            elif config_name == 'frequency':
                additional_config_section="""\n[AdditionalData !PAN.db::item_additional_data]\ntable_form=dataframe\ncolumns_to_use = %s\nnormalize = False\nlog=True\n""" \
                                        % ','.join(['num_genes_in_gene_cluster'] * (int(round(math.sqrt(len(self.genomes))))))

            # write the content down in to file at the new path:
            open(enhanced_config_path, 'w').write(
                open(config_path).read() + additional_config_section)

            # update the clustering configs:
            updated_clustering_configs[config_name] = enhanced_config_path

            dbops.do_hierarchical_clustering_of_items(self.pan_db_path, updated_clustering_configs, database_paths={'PAN.db': self.pan_db_path},\
                                                      input_directory=self.output_dir, default_clustering_config=constants.pan_default,\
                                                      distance=self.distance, linkage=self.linkage, run=self.run, progress=self.progress)
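The ','.join([...] * n) constructions above rely on a quiet trick: listing the same column several times in columns_to_use up-weights it when the clustering configuration is expanded into a feature matrix, and n grows with the number of genomes. A toy illustration (the genome count is made up):

genomes = ['genome_%d' % i for i in range(7)]     # hypothetical genome list
n = int(round(len(genomes) / 2))                  # 4 for 7 genomes
columns_to_use = ','.join(['num_genomes_gene_cluster_has_hits'] * n)
print(columns_to_use)                             # the column name, repeated 4 times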
Example 27
    def check_database(self):
        """
        Checks for the .bin version of the database. If it only finds the .pir version, it binarizes it.
        Sets the db filepath.
        """
        extensionless, extension = os.path.splitext(self.modeller_database)
        if extension not in [".bin",".pir",""]:
            raise ConfigError("MODELLER :: The only possible database extensions are .bin and .pir")

        bin_db_path = J(self.database_dir, extensionless+".bin")
        pir_db_path = J(self.database_dir, extensionless+".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

        self.database_path = bin_db_path

        if bin_exists:
            return

        if not pir_exists and not bin_exists:
            self.progress.clear()
            self.run.warning("Anvi'o looked in {} for a database with the name {} and with an extension \
                              of either .bin or .pir, but didn't find anything matching that \
                              criteria. We'll try and download the best database we know of from \
                              https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. \
                              You can checkout https://salilab.org/modeller/ for more info about the pdb_95 \
                              database".format(self.database_dir, self.modeller_database))

            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file("https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
            utils.run_command(['gzip', '-d', db_download_path], log_file_path=filesnpaths.get_temp_file_path())

            pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

        if pir_exists and not bin_exists:
            self.progress.clear()
            self.run.warning("Your database is not in binary format. That means accessing its contents is slower \
                              than it could be. Anvi'o is going to make a binary format. Just FYI")
            self.run_binarize_database(pir_db_path, bin_db_path)
            return
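
As an aside, the extension handling above leans on `os.path.splitext`, which only strips a recognized suffix. Here is a minimal stand-alone sketch (the database names are hypothetical examples) of how user input resolves into the candidate .bin/.pir paths:

import os

# How the extension check above resolves user input into candidate paths.
# The database names below are hypothetical examples.
for name in ("pdb_95", "pdb_95.pir", "pdb_95.bin"):
    extensionless, _ = os.path.splitext(name)
    print("%s -> %s / %s" % (name, extensionless + ".bin", extensionless + ".pir"))
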
Example #29
    def gen_hierarchical_clustering_of_gene_clusters(self):
        """Uses a clustering configuration to add hierarchical clustering of gene clusters into the pan db

        Note how this function cheats the system to create an enhanced clustering configuration:
        We want to use the clustering configurations for pan genomic analyses to order
        gene clusters. However, we want to add something into the clustering configuration
        file which depends on the number of genomes we have. This addition is the 'num_genomes_gene_cluster_has_hits'
        data, which pulls together gene clusters that are distributed across genomes similarly based
        on this extra bit of information. Because the clustering configurations framework in anvi'o
        does not allow us to have variable information in these recipes, we generate one
        on the fly to have a more capable one."""

        if self.skip_hierarchical_clustering:
            return

        updated_clustering_configs = {}

        for config_name in constants.clustering_configs['pan']:
            config_path = constants.clustering_configs['pan'][config_name]

            # now we have the config path. we first get a temporary file path:
            enhanced_config_path = filesnpaths.get_temp_file_path()

            # set up the additional section based on the number of genomes we have:
            if config_name == 'presence-absence':
                additional_config_section="""\n[AdditionalData !PAN.db::item_additional_data]\ntable_form=dataframe\ncolumns_to_use = %s\nnormalize = False\n""" \
                                        % ','.join(['num_genomes_gene_cluster_has_hits'] * (int(round(len(self.genomes) / 2))))
            elif config_name == 'frequency':
                additional_config_section="""\n[AdditionalData !PAN.db::item_additional_data]\ntable_form=dataframe\ncolumns_to_use = %s\nnormalize = False\nlog=True\n""" \
                                        % ','.join(['num_genes_in_gene_cluster'] * (int(round(math.sqrt(len(self.genomes))))))

            # write the content down into a file at the new path:
            with open(config_path) as template, open(enhanced_config_path, 'w') as enhanced_config:
                enhanced_config.write(template.read() + additional_config_section)

            # update the clustering configs:
            updated_clustering_configs[config_name] = enhanced_config_path

        # note that this call belongs outside the `for` loop above: it should run
        # once, after every updated config has been collected:
        dbops.do_hierarchical_clustering_of_items(self.pan_db_path, updated_clustering_configs, database_paths={'PAN.db': self.pan_db_path},
                                                  input_directory=self.output_dir, default_clustering_config=constants.pan_default,
                                                  distance=self.distance, linkage=self.linkage, run=self.run, progress=self.progress)
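
To make the trick concrete, here is a minimal stand-alone sketch of the column lists the code above generates. The column names come from the example; the genome count of 7 is a hypothetical value:

import math

num_genomes = 7  # hypothetical genome count

# presence-absence recipe: repeat the column round(N / 2) times
pa_columns = ','.join(['num_genomes_gene_cluster_has_hits'] * int(round(num_genomes / 2)))

# frequency recipe: repeat the column round(sqrt(N)) times
freq_columns = ','.join(['num_genes_in_gene_cluster'] * int(round(math.sqrt(num_genomes))))

print(pa_columns)    # four copies, comma-separated
print(freq_columns)  # three copies, comma-separated
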
Example #30
    def check_database(self):
        """Setup the database files

        Downloads the .pir file if it is missing
        Binarizes .pir file if .bin is missing
        Creates the .dmnd file if it is missing
        """

        bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
        pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        if bin_exists and pir_exists:
            # We good
            pass
        else:
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary format. Just FYI"
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        input_pir_path = J(self.database_dir, self.modeller_database + '.pir')
        fasta_path = J(self.database_dir, self.modeller_database + '.fa')
        dmnd_path = J(self.database_dir, self.modeller_database)

        command = [self.executable, script_name, input_pir_path, fasta_path]

        self.run_command(command, script_name=script_name, rename_log=False)

        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)

        while next(fasta):
            temp.write_id(fasta.id)
            temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))

        # close both handles before the move so all buffered output is flushed
        # (and so the move also works on platforms that lock open files):
        fasta.close()
        temp.close()
        shutil.move(temp.output_file_path, fasta_path)

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        os.remove(fasta_path)
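
The pass over the FASTA file above exists because DIAMOND should not see alignment characters in its input. A minimal stand-alone sketch of the same cleanup using plain file I/O instead of anvi'o's FASTA helpers (file paths are hypothetical):

# Strip alignment characters from a FASTA file before indexing it:
# '-' (gap) is removed and '.' is replaced with 'X', as in the example above.
def clean_fasta(in_path, out_path):
    with open(in_path) as infile, open(out_path, 'w') as outfile:
        for line in infile:
            if line.startswith('>'):
                outfile.write(line)
            else:
                outfile.write(line.replace('-', '').replace('.', 'X'))

# hypothetical paths:
clean_fasta('pdb_95.fa', 'pdb_95_clean.fa')
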
Example #31
    def store_hits_into_contigs_db(self):
        if not self.hits:
            self.run.warning(
                "COGs class has no hits to process. Returning empty handed.")
            return

        cogs_data = COGsData(self.args)
        cogs_data.init_p_id_to_cog_id_dict()

        functions_dict = {}
        self.__entry_id = 0

        def add_entry(gene_callers_id, source, accession, function, e_value):
            functions_dict[self.__entry_id] = {
                'gene_callers_id': int(gene_callers_id),
                'source': source,
                'accession': accession,
                'function': function,
                'e_value': float(e_value)
            }
            self.__entry_id += 1

        # let's keep track of hits that match to missing COGs
        hits_for_missing_cogs = 0
        missing_cogs_found = set([])

        # this list must live outside the loop, or each gene would wipe out the
        # report of the previous one:
        in_proteins_FASTA_not_in_cogs_CSV = []

        for gene_callers_id in self.hits:
            ncbi_protein_id = self.hits[gene_callers_id]['hit']

            if ncbi_protein_id not in cogs_data.p_id_to_cog_id:
                in_proteins_FASTA_not_in_cogs_CSV.append(
                    (ncbi_protein_id, gene_callers_id))
                # without this `continue`, COG_ids would carry over from the
                # previous gene and mis-annotate this one:
                continue

            COG_ids = cogs_data.p_id_to_cog_id[ncbi_protein_id]

            annotations = []
            categories = set([])
            for COG_id in COG_ids:
                # is missing?
                if COG_id in cogs_data.missing_cogs:
                    missing_cogs_found.add(COG_id)
                    hits_for_missing_cogs += 1
                    continue

                # resolve categories
                for category in cogs_data.cogs[COG_id]['categories']:
                    categories.add(category)

                # append annotation
                annotations.append(cogs_data.cogs[COG_id]['annotation'])

            # all these shitty heuristics... If there are multiple COG ids or categories, separate them from each other by '!!!' so parsing
            # them later is possible. Am I embarrassed? Yes. Is there a better way of doing this efficiently? Absolutely. What time is it?
            # 9pm. Where am I? In the lab. Is it OK for me to let this slip away if it means for me to go home sooner? Yes, probably. Am I
            # gonna remember this crap in the code for the next two months at random times in the shower and feel bad about myself? F**k yes.
            add_entry(gene_callers_id, 'COG_FUNCTION', '!!!'.join(COG_ids),
                      '!!!'.join(annotations),
                      self.hits[gene_callers_id]['evalue'])
            add_entry(gene_callers_id, 'COG_CATEGORY', '!!!'.join(categories),
                      '!!!'.join(categories), 0.0)

        # store hits in contigs db.
        gene_function_calls_table = TableForGeneFunctions(
            self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if len(missing_cogs_found):
            self.run.warning(
                'Although your COGs are successfully added to the database, some of the COG IDs your genes hit '
                'were among the ones that were not described in the raw data. Here is the list of %d COG IDs that '
                'were hit %d times: %s.' %
                (len(missing_cogs_found), hits_for_missing_cogs,
                 ', '.join(missing_cogs_found)))

        if len(in_proteins_FASTA_not_in_cogs_CSV):
            # so some of the hits represented in the FASTA file from the NCBI were not put in the
            # CSV file from NCBI to associate them with COGs
            report_output_file_path = filesnpaths.get_temp_file_path()
            report_output = open(report_output_file_path, 'w')
            report_output.write('anvio_gene_callers_id\tNCBI_protein_id\n')

            for protein_id, gene_callers_id in in_proteins_FASTA_not_in_cogs_CSV:
                report_output.write('%s\t%s\n' % (gene_callers_id, protein_id))

            report_output.close()

            self.run.warning("This is important. %s hits for your genes that appeared in the proteins FASTA file from the NCBI had protein\
                              IDs that were not described in the CSV file from the NCBI that associates each protein ID with a COG function.\
                              That's OK if you don't care. But if you would like to take a look, anvi'o stored a report\
                              file for you at %s" \
                        % (len(in_proteins_FASTA_not_in_cogs_CSV), report_output_file_path))
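
Since multiple COG IDs and annotations are glued together with '!!!' above, anything consuming these rows has to split them back apart. A minimal sketch (the entry below mirrors the shape of a `functions_dict` row, with hypothetical values):

# Undoing the '!!!' concatenation used above (values are hypothetical):
entry = {'accession': 'COG0001!!!COG0002',
         'function': 'annotation one!!!annotation two'}

for accession, function in zip(entry['accession'].split('!!!'),
                               entry['function'].split('!!!')):
    print('%s -> %s' % (accession, function))
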
Example #32
    def store_hits_into_contigs_db(self):
        if not self.hits:
            self.run.warning("COGs class has no hits to process. Returning empty handed, but still adding COGs as "
                             "functional sources.")
            gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)

            if self.COG_version == 'COG14':
                gene_function_calls_table.add_empty_sources_to_functional_sources({'COG14_FUNCTION', 'COG14_CATEGORY'})
            elif self.COG_version == 'COG20':
                gene_function_calls_table.add_empty_sources_to_functional_sources({'COG20_FUNCTION', 'COG20_CATEGORY', 'COG20_PATHWAY'})
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")
            return

        cogs_data = COGsData(self.args)
        cogs_data.init_p_id_to_cog_id_dict()

        functions_dict = {}
        self.__entry_id = 0


        def add_entry(gene_callers_id, source, accession, function, e_value):
            functions_dict[self.__entry_id] = {'gene_callers_id': int(gene_callers_id),
                                               'source': source,
                                               'accession': accession,
                                               'function': function,
                                               'e_value': float(e_value)}
            self.__entry_id += 1

        # let's keep track of hits that match to missing COGs
        hits_for_missing_cogs = 0
        hits_for_missing_ncbi_protein_ids = 0

        missing_cogs_found = set([])
        missing_ncbi_protein_ids_found = set([])

        # this list must live outside the loop, or each gene would wipe out the
        # report of the previous one:
        in_proteins_FASTA_not_in_cogs_CSV = []

        for gene_callers_id in self.hits:
            ncbi_protein_id = self.hits[gene_callers_id]['hit']

            if ncbi_protein_id not in cogs_data.p_id_to_cog_id:
                in_proteins_FASTA_not_in_cogs_CSV.append((ncbi_protein_id, gene_callers_id))
                missing_ncbi_protein_ids_found.add(ncbi_protein_id)
                hits_for_missing_ncbi_protein_ids += 1
                continue

            COG_ids = cogs_data.p_id_to_cog_id[ncbi_protein_id]

            annotations = []
            categories = []
            pathways = []
            category_descriptions = []
            for COG_id in COG_ids:
                # is missing?
                if COG_id in cogs_data.missing_cogs:
                    missing_cogs_found.add(COG_id)
                    hits_for_missing_cogs += 1
                    continue

                # resolve categories
                for category in cogs_data.cogs[COG_id]['categories']:
                    categories.append(category)
                    category_descriptions.append(cogs_data.categories[category])

                # append annotation
                annotations.append(cogs_data.cogs[COG_id]['annotation'])

                if self.COG_version == 'COG14':
                    pass
                elif self.COG_version == 'COG20':
                    if cogs_data.cogs[COG_id]['pathway']:
                        pathways.append(cogs_data.cogs[COG_id]['pathway'])
                else:
                    raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                      "parsing of a new generation of COG files.")

            # all these shitty heuristics... If there are multiple COG ids or categories, separate them from each other by '!!!' so parsing
            # them later is possible. Am I embarrassed? Yes. Is there a better way of doing this efficiently? Absolutely. What time is it?
            # 9pm. Where am I? In the lab. Is it OK for me to let this slip away if it means for me to go home sooner? Yes, probably. Am I
            # gonna remember this crap in the code for the next two months at random times in the shower and feel bad about myself? F**k yes.
            if self.COG_version == 'COG14':
                add_entry(gene_callers_id, f'{self.COG_version}_FUNCTION', '!!!'.join(COG_ids), '!!!'.join(annotations), self.hits[gene_callers_id]['evalue'])
                add_entry(gene_callers_id, f'{self.COG_version}_CATEGORY', '!!!'.join(categories), '!!!'.join(category_descriptions), 0.0)
            elif self.COG_version == 'COG20':
                add_entry(gene_callers_id, f'{self.COG_version}_FUNCTION', '!!!'.join(COG_ids), '!!!'.join(annotations), self.hits[gene_callers_id]['evalue'])
                add_entry(gene_callers_id, f'{self.COG_version}_CATEGORY', '!!!'.join(categories), '!!!'.join(category_descriptions), 0.0)
                if len(pathways):
                    add_entry(gene_callers_id, f'{self.COG_version}_PATHWAY', '!!!'.join(COG_ids), '!!!'.join(pathways), self.hits[gene_callers_id]['evalue'])
            else:
                raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                                  "parsing of a new generation of COG files.")

        # store hits in contigs db.
        gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if len(missing_cogs_found):
            self.run.warning('Although your COGs are successfully added to the database, some of the COG IDs your genes hit '
                             'were among the ones that were not described in the raw data. Here is the list of %d COG IDs that '
                             'were hit %d times: %s.' % (len(missing_cogs_found), hits_for_missing_cogs, ', '.join(missing_cogs_found)))

        if len(missing_ncbi_protein_ids_found):
            self.run.warning("Well. Your COGs were successfully added to the database, but there were some garbage anvi'o brushed "
                             "off under the rug. There were %d genes in your database that hit %d protein IDs in NCBIs COGs database, "
                             "but since NCBI did not release what COGs they correspond to in the database they made available (that "
                             "helps us to resolve protein IDs to COG ids), we could not annotate those genes with functions. Anvi'o "
                             "apologizes on behalf of all computer scientists for half-done stuff we often force biologists to deal "
                             "with. If you want to do some Googling, these were the offending protein IDs: '%s'." % \
                                        (hits_for_missing_ncbi_protein_ids, len(missing_ncbi_protein_ids_found), ', '.join([str(s) for s in missing_ncbi_protein_ids_found])))

        if len(in_proteins_FASTA_not_in_cogs_CSV):
            # so some of the hits represented in the FASTA file from the NCBI were not put in the
            # CSV file from NCBI to associate them with COGs
            report_output_file_path = filesnpaths.get_temp_file_path()
            report_output = open(report_output_file_path, 'w')
            report_output.write('anvio_gene_callers_id\tNCBI_protein_id\n')

            for protein_id, gene_callers_id in in_proteins_FASTA_not_in_cogs_CSV:
                report_output.write('%s\t%s\n' % (gene_callers_id, protein_id))

            report_output.close()

            self.run.warning("This is important. %s hits for your genes that appeared in the proteins FASTA file from the NCBI had protein "
                             "IDs that were not described in the CSV file from the NCBI that associates each protein ID with a COG function. "
                             "That's OK if you don't care. But if you would like to take a look, anvi'o stored a report "
                             "file for you at %s" \
                        % (len(in_proteins_FASTA_not_in_cogs_CSV), report_output_file_path))
Example #33
    def check_for_db_requests(self, config):
        sections = self.get_other_sections(config)
        # look for requests from the database, create temporary tab delimited files:
        for section in sections:
            alias, matrix = section.split()
            if matrix.find('::') > -1:
                if matrix.startswith('!'):
                    database, table = matrix.split('::')
                    database = database[1:]

                    if database not in self.db_paths:
                        raise ConfigError("anvi'o could not recover the actual path of the database "
                                          "(!%s) referenced in the config file, because the database "
                                          "paths variable sent from the client does not have an entry "
                                          "for it :( There are two options. One is to get a db_paths "
                                          "dictionary sent to this class that contains a key for %s "
                                          "with the full path to the database as a value. Or the table "
                                          '"%s" can be exported to a TAB-delimited matrix and declared in '
                                          "the config file. If you are experimenting and stuck here, please "
                                          "see the documentation or send an e-mail to the developers."
                                                                                % (database, database, table))
                    database_path = self.db_paths[database]
                else:
                    database, table = matrix.split('::')
                    database_path = os.path.abspath(self.db_paths[database]) if database in self.db_paths else os.path.abspath(database)

                    # if its not there, let's try one more thing
                    if not os.path.exists(database_path):
                        database_path = os.path.abspath(os.path.join(self.input_directory, database))

                if not os.path.exists(database_path):
                    raise ConfigError("The database you requested (%s) is not where it was supposed to be ('%s') :/" % (database, database_path))

                dbc = db.DB(database_path, None, ignore_version=True)

                if table not in dbc.get_table_names():
                    raise ConfigError('The table you requested (%s) does not seem to be in %s :/' % (table, database))

                # here we know we are working with a database table that we have access to. however, anvi'o database
                # tables come in two forms: dataframe form, and matrix form. in dataframe form, we have key/value pairs rather
                # than MxN matrices where each N is a column for an attribute. while the latter is easier to export as a
                # matrix the clustering module can work with, the former requires extra attention. so here we need to first
                # figure out which form the table is in. why did this even become necessary? taking a look at this issue
                # may help: https://github.com/merenlab/anvio/issues/662
                table_form = None
                if config.has_option(section, 'table_form'):
                    table_form = config.get(section, 'table_form')

                table_rows = dbc.get_all_rows_from_table(table)

                if self.row_ids_of_interest:
                    if table_form == 'dataframe':
                        raise ConfigError("Oops .. anvi'o does not know how to deal with specific row ids of interest when a table\
                                           refernced from a clustering recipe is in dataframe form :(")
                    table_rows = [r for r in table_rows if r[0] in self.row_ids_of_interest]

                if not len(table_rows):
                    raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This "
                                      "is not good. Here is the section that is not working for you: '%s' :/"
                                                            % (table, section))

                tmp_file_path = filesnpaths.get_temp_file_path()

                # time to differentially store table contents.
                if table_form == 'dataframe':
                    args = argparse.Namespace(pan_or_profile_db=database_path, table_name=table)
                    # a distinct name here avoids shadowing `table`, which holds the table name string:
                    additional_data_table = TableForItemAdditionalData(args)
                    table_keys_list, table_data_dict = additional_data_table.get()
                    store_dict_as_TAB_delimited_file(table_data_dict, tmp_file_path)
                else:
                    table_structure = dbc.get_table_structure(table)
                    columns_to_exclude = [c for c in ['entry_id', 'sample_id'] if c in table_structure]
                    store_array(table_rows, tmp_file_path, table_structure, exclude_columns=columns_to_exclude)

                self.matrix_paths[alias] = tmp_file_path
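
For reference, the code path above is triggered by config sections whose name has the shape `<alias> <matrix>`, where a leading `!` on the matrix marks a database request. A minimal sketch of the parsing (the section name is a hypothetical example following the `[AdditionalData !PAN.db::item_additional_data]` pattern seen earlier):

# Parsing a '<alias> <matrix>' section name the way the code above does:
section = 'AdditionalData !PAN.db::item_additional_data'

alias, matrix = section.split()
if '::' in matrix and matrix.startswith('!'):
    database, table = matrix.split('::')
    database = database[1:]  # drop the '!' database-request marker
    print(alias, database, table)  # AdditionalData PAN.db item_additional_data
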
Example #34
    def upload_project(self):
        try:
            args = argparse.Namespace()
            args.user = request.forms.get('username')
            args.password = request.forms.get('password')
            args.api_url = anvio.D['api-url'][1]['default']
            args.project_name = request.forms.get('project_name')
            args.delete_if_exists = request.forms.get('delete_if_exists') == "true"

            view_name = request.forms.get('view')
            if view_name in self.interactive.views:
                view_path = filesnpaths.get_temp_file_path()
                utils.store_array_as_TAB_delimited_file(
                    self.interactive.views[view_name][1:], view_path,
                    self.interactive.views[view_name][0])
                args.view_data = view_path

            item_order_name = request.forms.get('ordering')
            if item_order_name in self.interactive.p_meta['item_orders']:
                ordering_path = filesnpaths.get_temp_file_path()
                items_order = self.interactive.p_meta['item_orders'][
                    item_order_name]

                f = open(ordering_path, 'w')
                if items_order['type'] == 'newick':
                    f.write(items_order['data'])
                    args.tree = ordering_path
                elif items_order['type'] == 'basic':
                    f.write("\n".join(items_order['data']))
                    args.items_order = ordering_path
                f.close()

            state_name = request.forms.get('state')
            if state_name in self.interactive.states_table.states:
                state_path = filesnpaths.get_temp_file_path()
                f = open(state_path, 'w')
                f.write(self.interactive.states_table.states[state_name]
                        ['content'])
                f.close()

                args.state = state_path

            if request.forms.get('include_description') == "true":
                description_path = filesnpaths.get_temp_file_path()
                f = open(description_path, 'w')
                f.write(self.interactive.p_meta['description'])
                f.close()

                args.description = description_path

            if request.forms.get('include_samples') == "true":
                if len(self.interactive.samples_order_dict):
                    samples_order_path = filesnpaths.get_temp_file_path()
                    utils.store_dict_as_TAB_delimited_file(
                        self.interactive.samples_order_dict,
                        samples_order_path,
                        headers=['attributes', 'basic', 'newick'])
                    args.samples_order_file = samples_order_path

                if len(self.interactive.samples_information_dict):
                    samples_info_path = filesnpaths.get_temp_file_path()
                    utils.store_dict_as_TAB_delimited_file(
                        self.interactive.samples_information_dict,
                        samples_info_path)
                    args.samples_information_file = samples_info_path

            collection_name = request.forms.get('collection')
            if collection_name in self.interactive.collections.collections_dict:
                collection_path_prefix = filesnpaths.get_temp_file_path()
                self.interactive.collections.export_collection(
                    collection_name, output_file_prefix=collection_path_prefix)

                args.bins = collection_path_prefix + '.txt'
                args.bins_info = collection_path_prefix + '-info.txt'

            server = AnviServerAPI(args)
            server.login()
            server.push()
            return json.dumps({'status': 0})
        except Exception as e:
            message = str(e.clear_text()) if hasattr(e,
                                                     'clear_text') else str(e)
            return json.dumps({'status': 1, 'message': message})
Example #35
    def gen_samples_db_for_the_merged_profile(self):
        """Geenrate a samples db for the merged profile.

           We use the ProfileSuperclass to load all the views we added into the meged profile,
           and generate clusterings of samples for each view to generate a default samples database."""

        self.run.info_single("SAMPLES.db stuff...",
                             nl_before=1,
                             nl_after=1,
                             mc="blue")

        essential_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(f)
        ]

        class Args:
            pass

        args = Args()
        args.profile_db = self.merged_profile_db_path

        # initialize views.
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out sample orders dictionary
        sample_orders = {}
        failed_attempts = []
        self.progress.new('Working on SAMPLES.db')
        for essential_field in essential_fields:
            self.progress.update('recovering samples order for "%s"' %
                                 (essential_field))
            try:
                sample_orders[essential_field] = \
                        clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                 distance=self.distance,
                                                                 linkage=self.linkage,
                                                                 transpose=True)
            except:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(sample_orders):
            self.run.warning(
                "This may or may not be important: anvi'o attempted to generate a samples "
                "database for this merged profile, however, all attempts to cluster samples "
                "based on view data available in the merged profile failed. No samples db "
                "for you :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data "
                             "available in the merged profile, clustering of some of the essential data "
                             "failed. It is likely not a very big deal, but you shall be the judge of it. "
                             "Anvi'o now proceeds to generate a samples db with clusterings it generated "
                             "using the view data that worked. Here is the list of stuff that failed: '%s'"
                             % (', '.join(failed_attempts)))

        # generate the samples order file
        samples_order_file_path = filesnpaths.get_temp_file_path()
        samples_order_file = open(samples_order_file_path, 'w')
        samples_order_file.write('attributes\tbasic\tnewick\n')
        for sample_order in sample_orders:
            samples_order_file.write(
                '%s\t%s\t%s\n' %
                (sample_order, '', sample_orders[sample_order]))
        samples_order_file.close()

        # figure out samples information stuff
        samples_information = {}
        headers = []
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name] = {}

        self.progress.new('Working on SAMPLES.db')
        self.progress.update('...')

        # figure out num reads mapped per sample:
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name][
                'num_mapped_reads'] = self.total_reads_mapped_per_sample[
                    sample_name]

        self.progress.end()
        # generate the samples information file
        samples_information_file_path = filesnpaths.get_temp_file_path()
        utils.store_dict_as_TAB_delimited_file(samples_information,
                                               samples_information_file_path,
                                               headers=headers)

        # generate the samples database
        samples_db = dbops.SamplesInformationDatabase(self.samples_db_path,
                                                      quiet=False)
        samples_db.create(
            samples_order_path=samples_order_file_path,
            samples_information_path=samples_information_file_path)

        os.remove(samples_order_file_path)
        os.remove(samples_information_file_path)

        self.run.info('Samples database', self.samples_db_path)
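
The samples order file written above is a three-column TAB-delimited file with one row per view. A minimal stand-alone sketch of the layout (view names and newick strings are hypothetical placeholders):

# The layout of the samples-order file written above (hypothetical data):
sample_orders = {'mean_coverage': '(s01:0.1,(s02:0.2,s03:0.3));',
                 'std_coverage': '(s02:0.2,(s01:0.1,s03:0.3));'}

with open('samples-order.txt', 'w') as f:
    f.write('attributes\tbasic\tnewick\n')
    for view_name, newick in sample_orders.items():
        # the 'basic' column stays empty when an order is a newick tree:
        f.write('%s\t%s\t%s\n' % (view_name, '', newick))
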
Example #36
    def store_hits_into_contigs_db(self):
        if not self.hits:
            self.run.warning("COGs class has no hits to process. Returning empty handed.")
            return

        cogs_data = COGsData(self.args)
        cogs_data.init_p_id_to_cog_id_dict()

        functions_dict = {}
        self.__entry_id = 0


        def add_entry(gene_callers_id, source, accession, function, e_value):
            functions_dict[self.__entry_id] = {'gene_callers_id': int(gene_callers_id),
                                               'source': source,
                                               'accession': accession,
                                               'function': function,
                                               'e_value': float(e_value)}
            self.__entry_id += 1

        # let's keep track of hits that match to missing COGs
        hits_for_missing_cogs = 0
        missing_cogs_found = set([])

        # this list must live outside the loop, or each gene would wipe out the
        # report of the previous one:
        in_proteins_FASTA_not_in_cogs_CSV = []

        for gene_callers_id in self.hits:
            ncbi_protein_id = self.hits[gene_callers_id]['hit']

            if ncbi_protein_id not in cogs_data.p_id_to_cog_id:
                in_proteins_FASTA_not_in_cogs_CSV.append((ncbi_protein_id, gene_callers_id))
                # without this `continue`, COG_ids would carry over from the
                # previous gene and mis-annotate this one:
                continue

            COG_ids = cogs_data.p_id_to_cog_id[ncbi_protein_id]

            annotations = []
            categories = set([])
            for COG_id in COG_ids:
                # is missing?
                if COG_id in cogs_data.missing_cogs:
                    missing_cogs_found.add(COG_id)
                    hits_for_missing_cogs += 1
                    continue

                # resolve categories
                for category in cogs_data.cogs[COG_id]['categories']:
                    categories.add(category)

                # append annotation
                annotations.append(cogs_data.cogs[COG_id]['annotation'])

            # all these shitty heuristics... If there are multiple COG ids or categories, separate them from each other by '!!!' so parsing
            # them later is possible. Am I embarrassed? Yes. Is there a better way of doing this efficiently? Absolutely. What time is it?
            # 9pm. Where am I? In the lab. Is it OK for me to let this slip away if it means for me to go home sooner? Yes, probably. Am I
            # gonna remember this crap in the code for the next two months at random times in the shower and feel bad about myself? F**k yes.
            add_entry(gene_callers_id, 'COG_FUNCTION', '!!!'.join(COG_ids), '!!!'.join(annotations), self.hits[gene_callers_id]['evalue'])
            add_entry(gene_callers_id, 'COG_CATEGORY', '!!!'.join(categories), '!!!'.join(categories), 0.0)

        # store hits in contigs db.
        gene_function_calls_table = TableForGeneFunctions(self.contigs_db_path, self.run, self.progress)
        gene_function_calls_table.create(functions_dict)

        if len(missing_cogs_found):
            self.run.warning('Although your COGs are successfully added to the database, some of the COG IDs your genes hit '
                             'were among the ones that were not described in the raw data. Here is the list of %d COG IDs that '
                             'were hit %d times: %s.' % (len(missing_cogs_found), hits_for_missing_cogs, ', '.join(missing_cogs_found)))

        if len(in_proteins_FASTA_not_in_cogs_CSV):
            # so some of the hits represented in the FASTA file from the NCBI were not put in the
            # CSV file from NCBI to associate them with COGs
            report_output_file_path = filesnpaths.get_temp_file_path()
            report_output = open(report_output_file_path, 'w')
            report_output.write('anvio_gene_callers_id\tNCBI_protein_id\n')

            for protein_id, gene_callers_id in in_proteins_FASTA_not_in_cogs_CSV:
                report_output.write('%s\t%s\n' % (gene_callers_id, protein_id))

            report_output.close()

            self.run.warning("This is important. %s hits for your genes that appeared in the proteins FASTA file from the NCBI had protein\
                              IDs that were not described in the CSV file from the NCBI that associates each protein ID with a COG function.\
                              That's OK if you don't care. But if you would like to take a look, anvi'o stored a report\
                              file for you at %s" \
                        % (len(in_proteins_FASTA_not_in_cogs_CSV), report_output_file_path))
Example #37
    def alignment_worker(input_queue, output_queue, gene_clusters_dict, genomes_storage, align_with, run):
        # Note for future changes: this worker should not write anything to gene_clusters_dict
        # or genomes_storage; changes will not be reflected in the main process or other processes.

        aligner = aligners.select(align_with, quiet=True)

        # this instance of Run is here because we don't want to create this over and over again
        # in the loop down below. there also is another run instance the worker gets to make sure
        # it can report its own messages .. don't be confused we-do-not-age-discriminate-here padawan.
        r = terminal.Run()
        r.verbose = False

        # Main process needs to kill this worker after it receives all tasks because of this infinite loop
        while True:
            gene_cluster_name = input_queue.get(True)

            if len(gene_clusters_dict[gene_cluster_name]) == 1:
                # this sequence is a singleton and does not need alignment
                output_queue.put(None)
                continue

            gene_sequences_in_gene_cluster = []

            for gene_entry in gene_clusters_dict[gene_cluster_name]:
                sequence = genomes_storage.get_gene_sequence(gene_entry['genome_name'], gene_entry['gene_caller_id'])
                gene_sequences_in_gene_cluster.append(('%s_%d' % (gene_entry['genome_name'], gene_entry['gene_caller_id']), sequence),)

            # sometimes alignments fail, and because pangenomic analyses can take forever,
            # everything goes into the trash bin. to prevent that, here we have a try/except
            # block with lots of warnings if something goes wrong.
            try:
                alignments = aligner(run=r).run_stdin(gene_sequences_in_gene_cluster)
            except:
                # realm of sad face. before we continue to spam the user with error messages,
                # we turn our gene sequences to alignments without alignments. this worker will
                # report raw, unaligned sequences for this gene cluster as if they were aligned
                # so things will continue working operationally, and it will be on the user to
                # make sure they went through their results carefully.
                alignments = dict(gene_sequences_in_gene_cluster)

                # constructing our #sad:
                if anvio.DEBUG:
                    temp_file_path = filesnpaths.get_temp_file_path(prefix='ANVIO_GC_%s' % (gene_cluster_name))
                    with open(temp_file_path, 'w') as output:
                        for tpl in gene_sequences_in_gene_cluster:
                            output.write('>%s\n%s\n' % (tpl[0], tpl[1]))
                    debug_info = "The %d sequences in gene cluster %s are stored in the temporary file '%s'" % \
                                        (len(gene_sequences_in_gene_cluster), gene_cluster_name, temp_file_path)
                else:
                    debug_info = "If you re-run your last command with a `--debug` flag, anvi'o will generate more\
                                  information for you about the contenets of this gene cluster (but if you are seeing\
                                  millions of these warnings, it may not be a good idea since with the `--debug` flag\
                                  anvi'o will generate a FASTA file in a temporary directory with the contents of the\
                                  gene cluster, and will not attempt to delete them later)."

                run.warning("VERY BAD NEWS. The alignment of seqeunces with '%s' in the gene cluster '%s' failed\
                             for some reason. Since the real answer to 'why' is too deep in the matrix, there is\
                             no reliable solution for anvi'o to find it for you, BUT THIS WILL AFFECT YOUR SCIENCE\
                             GOING FORWARD, SO YOU SHOULD CONSIDER ADDRESSING THIS ISSUE FIRST. %s" % \
                                                       (aligner.__name__, gene_cluster_name, debug_info), nl_before=1)


            output = {'name': gene_cluster_name, 'entry': copy.deepcopy(gene_clusters_dict[gene_cluster_name])}
            for gene_entry in output['entry']:
                gene_entry['alignment_summary'] = utils.summarize_alignment(alignments['%s_%d' % (gene_entry['genome_name'], gene_entry['gene_caller_id'])])

            output_queue.put(output)
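
A detail worth noting in the fallback above: `dict(gene_sequences_in_gene_cluster)` turns the list of `(name, sequence)` tuples into the same name-to-sequence mapping an aligner would have produced, just with unaligned sequences. A minimal sketch with hypothetical names and sequences:

# The failure fallback above, in isolation (hypothetical data):
gene_sequences_in_gene_cluster = [('G01_1', 'ATGAAA'), ('G02_7', 'ATGAAG')]

alignments = dict(gene_sequences_in_gene_cluster)
print(alignments['G01_1'])  # 'ATGAAA', reported as if it were an alignment
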
Example #38
    def process(self):
        """
        """
        # will be empty if all sources in self.annotation_sources_info have "skip": True
        residue_annotation_methods = [
            info["method"] for _, info in self.annotation_sources_info.items()
            if not info["skip"]
        ]

        # which genes had structures and which did not. this information is added to the structure database self table
        has_structure = {True: [], False: []}

        num_genes_tried = 0
        num_genes_to_try = len(self.genes_of_interest)

        for corresponding_gene_call in self.genes_of_interest:
            # MODELLER outputs a lot of stuff into its working directory. A temporary directory is
            # made for each instance of MODELLER (i.e. each protein), And bits and pieces of this
            # directory are used in the creation of the structure database. If self.full_modeller_output is
            # provided, these directories and their contents are moved into self.full_modeller_output.
            self.args.directory = filesnpaths.get_temp_directory_path()
            self.args.target_fasta_path = filesnpaths.get_temp_file_path()

            # Export sequence
            dbops.export_aa_sequences_from_contigs_db(
                self.contigs_db_path,
                self.args.target_fasta_path,
                set([corresponding_gene_call]),
                quiet=True)

            # Model structure
            progress_title = 'Modelling gene ID %d; (%d of %d processed)' % (
                corresponding_gene_call, num_genes_tried, num_genes_to_try)
            modeller_out = self.run_modeller(corresponding_gene_call,
                                             progress_title)
            if modeller_out["structure_exists"]:
                self.run.info_single("Gene successfully modelled!",
                                     nl_after=1,
                                     mc="green")

            has_structure[modeller_out["structure_exists"]].append(
                str(corresponding_gene_call))

            # Annotate residues
            residue_info_dataframe = None
            if modeller_out["structure_exists"]:
                residue_info_dataframe = self.run_residue_annotation_for_gene(
                    residue_annotation_methods, corresponding_gene_call,
                    modeller_out["best_model_path"])
            # Append info to tables
            self.append_gene_info_to_tables(modeller_out,
                                            residue_info_dataframe)

            # Append metadata to self
            self.update_structure_database_meta_table(has_structure)

            if self.full_modeller_output:
                self.dump_results_to_full_output()

            num_genes_tried += 1

        if not has_structure[True]:
            raise ConfigError(
                "Well this is really sad. No structures were modelled, so there is nothing to do. Bye :'("
            )

        self.structure_db.disconnect()
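
One small bookkeeping pattern above deserves a note: `has_structure` buckets gene IDs (stored as strings) by whether modelling succeeded, so the structure database's self table can record both lists. A minimal sketch with hypothetical gene IDs:

# The has_structure bookkeeping above, in isolation (hypothetical gene IDs):
has_structure = {True: [], False: []}

for gene_id, modelled in [(1248, True), (1301, False), (1412, True)]:
    has_structure[modelled].append(str(gene_id))

print(has_structure)  # {True: ['1248', '1412'], False: ['1301']}
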