Ejemplo n.º 1
0
    def format_protein_db(self, input_file_path, output_file_path):
        """Decompress the raw protein FASTA and build DIAMOND / BLAST search databases.

        Parameters
        ==========
        input_file_path : str
            Path to the gzip-compressed protein FASTA file.
        output_file_path : str
            Unused here; kept for interface compatibility with the other
            formatting callbacks.

        Skips (with a warning) whichever search tool is not installed.
        """
        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        # poor man's uncompress
        temp_fasta_path = filesnpaths.get_temp_file_path()
        with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
            f_out.write(f_in.read())

        progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            # BUGFIX: the original used a backslash line-continuation inside the
            # string literal, which embedded a long run of indentation spaces
            # into the message shown to the user. Adjacent-string concatenation
            # keeps the message clean.
            self.run.warning("Diamond does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            # BUGFIX: same backslash-continuation whitespace defect as above.
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")

        os.remove(temp_fasta_path)
Ejemplo n.º 2
0
    def format_protein_db(self, input_file_path, output_file_path):
        """Inflate the downloaded protein FASTA and build search databases from it.

        Creates a DIAMOND database under `DB_DIAMOND` and a BLAST database under
        `DB_BLAST` in `self.COG_data_dir`, skipping (with a warning) whichever
        tool is not installed. `output_file_path` is unused; it is kept for
        interface compatibility with the other formatting callbacks.
        """
        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        # inflate the gzip archive into a scratch FASTA file by hand
        scratch_fasta = filesnpaths.get_temp_file_path()
        try:
            with gzip.open(input_file_path, 'rb') as gz_stream, open(scratch_fasta, 'wb') as fasta_stream:
                fasta_stream.write(gz_stream.read())
        except Exception as e:
            progress.end()
            raise ConfigError(f"Something went wrong while decompressing the downloaded file :/ It is likely that "
                              f"the download failed and only part of the file was downloaded. If you would like to "
                              f"try again, please run the setup command with the flag `--reset`. Here is what the "
                              f"downstream library said: '{e}'.")

        progress.end()

        if not utils.is_program_exists('diamond', dont_raise=True):
            self.run.warning("DIAMOND does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")
        else:
            diamond_dir = J(self.COG_data_dir, 'DB_DIAMOND')
            if os.path.exists(diamond_dir):
                shutil.rmtree(diamond_dir)
            os.mkdir(diamond_dir)

            diamond_db_path = J(diamond_dir, 'COG')
            diamond_log = J(diamond_dir, 'log.txt')

            self.run.info('Diamond log', diamond_log)

            diamond = Diamond(scratch_fasta)
            diamond.run.log_file_path = diamond_log
            diamond.num_threads = self.num_threads
            diamond.makedb(diamond_db_path)

        blast_tools_available = (utils.is_program_exists('makeblastdb', dont_raise=True)
                                 and utils.is_program_exists('blastp', dont_raise=True))
        if not blast_tools_available:
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")
        else:
            blast_dir = J(self.COG_data_dir, 'DB_BLAST')
            if os.path.exists(blast_dir):
                shutil.rmtree(blast_dir)
            os.mkdir(blast_dir)

            blast_db_path = J(blast_dir, 'COG')
            blast_log = J(blast_dir, 'log.txt')

            self.run.info('BLAST log', blast_log)

            blast = BLAST(scratch_fasta)
            blast.run.log_file_path = blast_log
            blast.num_threads = self.num_threads
            blast.makedb(os.path.join(blast_db_path, 'COG.fa'))

        os.remove(scratch_fasta)
Ejemplo n.º 3
0
    def run_blast(self, unique_AA_sequences_fasta_path, unique_AA_sequences_names_dict):
        """Run NCBI blastp over the unique amino acid sequences.

        Parameters
        ==========
        unique_AA_sequences_fasta_path : str
            FASTA file of unique amino acid sequences to search.
        unique_AA_sequences_names_dict : dict
            Mapping used by the BLAST driver to resolve unique sequence names.

        Returns whatever `BLAST.get_blast_results()` produces.
        """
        # BUGFIX: the warning used backslash line-continuations inside the string
        # literal, which embedded long runs of indentation whitespace into the
        # message shown to the user. Adjacent-string concatenation keeps it clean.
        self.run.warning("You elected to use NCBI's blastp for amino acid sequence search. Running blastp will "
                         "be significantly slower than DIAMOND (although, anvi'o developers are convinced that "
                         "you *are* doing the right thing, so, kudos to you).")
        blast = BLAST(unique_AA_sequences_fasta_path, run=self.run, progress=self.progress,
                      num_threads=self.num_threads, overwrite_output_destinations=self.overwrite_output_destinations)

        blast.names_dict = unique_AA_sequences_names_dict
        blast.log_file_path = self.log_file_path
        blast.search_output_path = self.get_output_file_path('blast-search-results.txt')

        return blast.get_blast_results()
Ejemplo n.º 4
0
    def format_protein_db(self, input_file_path, output_file_path):
        """Decompress the raw protein FASTA and build DIAMOND / BLAST search databases.

        Parameters
        ==========
        input_file_path : str
            Path to the gzip-compressed protein FASTA file.
        output_file_path : str
            Unused here; kept for interface compatibility with the other
            formatting callbacks.

        Skips (with a warning) whichever search tool is not installed.
        """
        progress.new('Formatting raw files')
        progress.update('Decompressing protein sequences')

        # poor man's uncompress
        temp_fasta_path = filesnpaths.get_temp_file_path()
        with open(temp_fasta_path, 'wb') as f_out, gzip.open(input_file_path, 'rb') as f_in:
            f_out.write(f_in.read())

        progress.end()

        if utils.is_program_exists('diamond', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_DIAMOND')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('Diamond log', log_file_path)

            diamond = Diamond(temp_fasta_path)
            diamond.num_threads = self.num_threads
            diamond.run.log_file_path = log_file_path
            diamond.makedb(output_db_path)
        else:
            # BUGFIX: the original used a backslash line-continuation inside the
            # string literal, which embedded a long run of indentation spaces
            # into the message shown to the user. Adjacent-string concatenation
            # keeps the message clean.
            self.run.warning("Diamond does not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for it. Remember this when/if things go South.")

        if utils.is_program_exists('makeblastdb', dont_raise=True) and utils.is_program_exists('blastp', dont_raise=True):
            output_dir = J(self.COG_data_dir, 'DB_BLAST')
            if os.path.exists(output_dir):
                shutil.rmtree(output_dir)

            os.mkdir(output_dir)

            output_db_path = J(output_dir, 'COG')
            log_file_path = J(output_dir, 'log.txt')

            self.run.info('BLAST log', log_file_path)

            blast = BLAST(temp_fasta_path)
            blast.run.log_file_path = log_file_path
            blast.num_threads = self.num_threads
            blast.makedb(os.path.join(output_db_path, 'COG.fa'))
        else:
            # BUGFIX: same backslash-continuation whitespace defect as above.
            self.run.warning("BLAST tools do not seem to be installed on this system, so anvi'o is not going to "
                             "generate a search database for them to be used. Keep this in mind for later.")

        os.remove(temp_fasta_path)
Ejemplo n.º 5
0
Archivo: cogs.py Proyecto: ppflrs/anvio
    def search_with_blastp(self, aa_sequences_file_path):
        """Search `aa_sequences_file_path` against the configured blastp target.

        Returns the path to the search results file under `self.temp_dir_path`.
        """
        # NOTE(review): the driver is constructed with `run=self.run`, so assigning
        # the log path on `self.run` below also steers the driver's logging —
        # presumably intentional; confirm against the BLAST class if logging
        # ever goes astray.
        driver = BLAST(aa_sequences_file_path, run=self.run, progress=self.progress, num_threads=self.num_threads)

        self.run.log_file_path = self.log_file_path or J(self.temp_dir_path, 'log.txt')

        driver.target_db_path = self.available_db_search_program_targets['blastp']
        driver.max_target_seqs = 1
        driver.search_output_path = J(self.temp_dir_path, 'blast-search-results.txt')

        driver.blastp()

        return driver.search_output_path
Ejemplo n.º 6
0
    def run_blast(self, unique_proteins_fasta_path, unique_proteins_names_dict):
        """Run NCBI blastp over the unique protein sequences.

        Parameters
        ==========
        unique_proteins_fasta_path : str
            FASTA file of unique protein sequences to search.
        unique_proteins_names_dict : dict
            Mapping used by the BLAST driver to resolve unique sequence names.

        Returns whatever `BLAST.get_blastall_results()` produces.
        """
        # BUGFIX: the warning used backslash line-continuations inside the string
        # literal, which embedded long runs of indentation whitespace into the
        # message shown to the user. Adjacent-string concatenation keeps it clean.
        self.run.warning("You elected to use NCBI's blastp for protein search. Running blastp will be "
                         "significantly slower than DIAMOND (although, anvi'o developers are convinced "
                         "that you *are* doing the right thing, so, kudos to you).")
        blast = BLAST(unique_proteins_fasta_path, run=self.run, progress=self.progress,
                      num_threads=self.num_threads, overwrite_output_destinations=self.overwrite_output_destinations)

        blast.names_dict = unique_proteins_names_dict
        blast.log_file_path = self.log_file_path
        blast.target_db_path = self.get_output_file_path(filesnpaths.get_name_from_file_path(unique_proteins_fasta_path))
        blast.search_output_path = self.get_output_file_path('blast-search-results.txt')

        return blast.get_blastall_results()
Ejemplo n.º 7
0
    def search_with_ncbi_blast(self, aa_sequences_file_path):
        """Search `aa_sequences_file_path` with NCBI BLAST against the blastp target.

        Returns the path to the search results file under `self.temp_dir_path`.
        """
        # NOTE(review): the driver shares `self.run`, so assigning the log path on
        # `self.run` below also applies to the driver's logging — presumably
        # intentional; confirm against the BLAST class if logging ever misbehaves.
        driver = BLAST(aa_sequences_file_path, run=self.run, progress=self.progress, num_threads=self.num_threads)

        self.run.log_file_path = self.log_file_path or J(self.temp_dir_path, 'log.txt')

        driver.target_fasta = self.available_db_search_program_targets['blastp']
        driver.max_target_seqs = 1
        driver.search_output_path = J(self.temp_dir_path, 'blast-search-results.txt')

        driver.blast()

        return driver.search_output_path
Ejemplo n.º 8
0
    def run_blast(self, unique_AA_sequences_fasta_path, unique_AA_sequences_names_dict):
        """Run NCBI blastp over the unique amino acid sequences and return the results.

        Warns the user up front that blastp is much slower than DIAMOND, then
        delegates the search to the BLAST driver.
        """
        self.run.warning("You elected to use NCBI's blastp for amino acid sequence search. Running blastp will "
                         "be significantly slower than DIAMOND (although, anvi'o developers are convinced that "
                         "you *are* doing the right thing, so, kudos to you).")

        searcher = BLAST(unique_AA_sequences_fasta_path,
                         run=self.run,
                         progress=self.progress,
                         num_threads=self.num_threads,
                         overwrite_output_destinations=self.overwrite_output_destinations)

        searcher.log_file_path = self.log_file_path
        searcher.names_dict = unique_AA_sequences_names_dict
        searcher.search_output_path = self.get_output_file_path('blast-search-results.txt')

        return searcher.get_blast_results()
Ejemplo n.º 9
0
    def run_blast(self, unique_AA_sequences_fasta_path, unique_AA_sequences_names_dict):
        """Run NCBI blastp over the unique amino acid sequences and return the results.

        Warns the user about the speed/sensitivity trade-off versus DIAMOND, then
        delegates the search to the BLAST driver.
        """
        self.run.warning(
            "You elected to use NCBI's `blastp` for amino acid sequence search. Running blastp will "
            "be significantly slower than DIAMOND, but in some cases, slightly more sensitive. "
            "We are unsure about whether the slight increase in sensitivity may justify significant "
            "increase in run time, but you are the boss.",
            lc="cyan")

        searcher = BLAST(unique_AA_sequences_fasta_path,
                         run=self.run,
                         progress=self.progress,
                         num_threads=self.num_threads,
                         overwrite_output_destinations=self.overwrite_output_destinations)

        searcher.log_file_path = self.log_file_path
        searcher.names_dict = unique_AA_sequences_names_dict
        searcher.search_output_path = self.get_output_file_path('blast-search-results.txt')

        return searcher.get_blast_results()
Ejemplo n.º 10
0
    def create_search_databases(self):
        """Create BLAST search databases for every anticodon FASTA file.

        Works inside a temporary directory so that decompressing the shipped
        `.gz` FASTA files does not change their hashes (which would make them
        look modified in git), then moves the resulting database files next to
        their source FASTAs.

        Raises ConfigError when an expected FASTA copy is missing or when
        `makeblastdb` does not produce one of its output files.
        """

        self.progress.new("Creating search databases")
        self.progress.update("Removing any database that still exists in the output directory...")

        # IDIOM FIX: the original used side-effect-only list comprehensions here,
        # building throwaway lists of None just to call os.remove(). Plain loops
        # express the intent directly.
        for prefix in ['.nhr', '.nin', '.nsq']:
            for database_path in [s['db'] + prefix for s in self.ctx.anticodons.values()]:
                if os.path.exists(database_path):
                    os.remove(database_path)

        # compresssing and decompressing FASTA files changes their hash and make them look like
        # modified in git. to avoid that, we will do the database generation in a temporary directory.
        temp_dir = filesnpaths.get_temp_directory_path()

        self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
        # map each anticodon (the FASTA basename) to the path of its .gz copy
        # under temp_dir (shutil.copy returns the destination path)
        new_paths = {os.path.basename(fasta_path): shutil.copy((fasta_path + '.gz'),
                                                               os.path.join(temp_dir, os.path.basename(fasta_path) + '.gz'))
                     for fasta_path in [s['db'] for s in self.ctx.anticodons.values()]}

        missing_FASTA_files = [anticodon for anticodon in self.ctx.anticodons if not os.path.exists(new_paths[anticodon])]
        if len(missing_FASTA_files):
            raise ConfigError(
                "Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                "an anvi'o programmer working on this problem this very moment, please get in touch with one."
            )

        self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
        # replace each .gz path with the path of its decompressed counterpart
        new_paths = {anticodon: utils.gzip_decompress_file(gz_path, keep_original=False)
                     for anticodon, gz_path in new_paths.items()}

        for anticodon in self.ctx.anticodons:
            self.progress.update("Working on %s in %d threads" % (anticodon, self.num_threads))

            FASTA_file_path_for_anticodon = new_paths[anticodon]

            # create a BLAST search database for `FASTA_file_path_for_anticodon`
            blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                          run=run_quiet,
                          progress=progress_quiet,
                          num_threads=self.num_threads)
            blast.log_file_path = os.path.join(os.path.dirname(FASTA_file_path_for_anticodon), '%s.log' % anticodon)
            blast.makedb(dbtype='nucl')

            # make sure makeblastdb produced every expected file, then move each
            # next to the source FASTA for this anticodon
            for prefix in ['.nhr', '.nin', '.nsq']:
                if not os.path.exists(FASTA_file_path_for_anticodon + prefix):
                    raise ConfigError(
                        "Something went wrong and BLAST did not create the database file it was supposed to "
                        "for %s :(" % anticodon)
                else:
                    shutil.move(FASTA_file_path_for_anticodon + prefix,
                                os.path.dirname(self.ctx.anticodons[anticodon]['db']))

        shutil.rmtree(temp_dir)

        self.progress.end()
        self.run.info_single(
            "Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
            "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
            "caveats to it just like every other computational approach you use to make sense of complex 'omics "
            "data. To better understand those caveats you should read our online documentation a bit. If you see "
            "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
            "results from this workflow, thank to those who contributed to the GTDB.",
            nl_after=1,
            mc="green")
Ejemplo n.º 11
0
    def find(self,
             sequence,
             sequence_name="(a sequence does not have a name)",
             display_palindromes=False):
        """Find palindromes in a single sequence, and populate `self.palindromes`

        Runs a blastn search of the sequence against itself (minus strand only)
        and parses the XML output into `Palindrome` objects, filtering out
        reverse-complement duplicates, too-close hits, internal palindromes, and
        matches shorter than `self.min_palindrome_length`.

        The member function `process` may be a better one to call with an `args` object. See `anvi-search-palindromes`
        for example usage.

        Parameters
        ==========
        sequence : str
            The nucleotide sequence to search (case-insensitive).
        sequence_name : str
            Key under which results are stored in `self.palindromes`; must be unique.
        display_palindromes : bool
            When True, each accepted palindrome is displayed as it is found.

        Raises ConfigError if `sequence_name` was already processed.
        """

        if sequence_name in self.palindromes:
            raise ConfigError(
                f"The sequence '{sequence_name}' is already in `self.palindromes`."
            )
        else:
            self.palindromes[sequence_name] = []

        sequence = sequence.upper()
        sequence_length = len(sequence)

        if sequence_length < self.min_palindrome_length * 2 + self.min_distance:
            self.progress.reset()
            self.run.warning(
                f"The sequence '{sequence_name}', which is only {sequence_length} nts long, is too short "
                f"to find palindromes that are at least {self.min_palindrome_length} nts, with "
                f"{self.min_distance} nucleoties in between :/ Anvi'o will skip it."
            )
            # BUGFIX: the warning promises the sequence will be skipped, but the
            # original code fell through and ran BLAST on it anyway. Return here
            # to actually skip it (the empty result list is already in place).
            return

        # setup BLAST job
        BLAST_search_tmp_dir = filesnpaths.get_temp_directory_path()
        fasta_file_path = os.path.join(BLAST_search_tmp_dir, 'sequence.fa')
        log_file_path = os.path.join(BLAST_search_tmp_dir, 'blast-log.txt')
        results_file_path = os.path.join(BLAST_search_tmp_dir, 'hits.xml')
        with open(fasta_file_path, 'w') as fasta_file:
            fasta_file.write(f'>sequence\n{sequence}\n')

        # run blast
        blast = BLAST(fasta_file_path,
                      search_program='blastn',
                      run=run_quiet,
                      progress=progress_quiet)
        blast.evalue = 10
        blast.num_threads = self.num_threads
        # allow up to `max_num_mismatches` by relaxing the percent-identity floor
        blast.min_pct_id = 100 - self.max_num_mismatches
        blast.search_output_path = results_file_path
        blast.log_file_path = log_file_path
        blast.makedb(dbtype='nucl')

        if self.min_palindrome_length < 20 and len(
                sequence
        ) > 10000 and not self.user_is_warned_for_potential_performance_issues:
            self.progress.reset()
            self.run.warning(
                f"Please note, you are searching for palindromes that are as short as {self.min_palindrome_length} "
                f"in a sequence that is {pp(len(sequence))} nts long. If your palindrome search takes a VERY long time "
                f"you may want to go for longer palindromes by setting a different `--min-palindrome-length` parameter "
                f"and by increasing the BLAST word size using `--blast-word-size` parameter (please read the help menu first). "
                f"This part of the code does not know if you have many more seqeunces to search, but anvi'o will not "
                f"continue displaying this warning for additional seqeunces to minimize redundant informatio in your "
                f"log files (because despite the popular belief anvi'o can actually sometimes be like nice and all).",
                header="ONE-TIME PERFORMANCE WARNING")
            self.user_is_warned_for_potential_performance_issues = True

        # minus strand only: palindromes are matches between the sequence and
        # its own reverse complement
        blast.blast(outputfmt='5',
                    word_size=self.blast_word_size,
                    strand='minus')

        # parse the BLAST XML output
        root = ET.parse(blast.search_output_path).getroot()
        for query_sequence_xml in root.findall(
                'BlastOutput_iterations/Iteration'):
            for hit_xml in query_sequence_xml.findall('Iteration_hits/Hit'):

                for hsp_xml in hit_xml.findall('Hit_hsps/Hsp'):
                    p = Palindrome(run=self.run)

                    p.sequence_name = sequence_name
                    # BLAST coordinates are 1-based; convert starts to 0-based
                    p.first_start = int(
                        hsp_xml.find('Hsp_query-from').text) - 1
                    p.first_end = int(hsp_xml.find('Hsp_query-to').text)
                    p.first_sequence = hsp_xml.find('Hsp_qseq').text
                    # hit coordinates run backwards on the minus strand, hence to/from swap
                    p.second_start = int(hsp_xml.find('Hsp_hit-to').text) - 1
                    p.second_end = int(hsp_xml.find('Hsp_hit-from').text)
                    p.second_sequence = hsp_xml.find('Hsp_hseq').text
                    p.distance = p.second_start - p.first_start

                    # for each hit, there will be a copy of its reverse complement.
                    # the first half of the if statement below is to control for that
                    # and make sure we keep only one of them. the other half is to
                    # remove those that do not meet the minimum distance criterion.
                    if p.distance < 0 or p.distance < self.min_distance:
                        continue

                    # before we continue, we will test for a special case: internal palindromes
                    # within larger palindromes of 0 distance. IT DOES HAPPEN I PROM.
                    if p.distance == 0:
                        internal_palindrome = False
                        for _p in self.palindromes[sequence_name]:
                            if p.first_start > _p.first_start and p.first_start < _p.first_end:
                                internal_palindrome = True
                                break

                        if internal_palindrome:
                            continue

                    p.length = int(hsp_xml.find('Hsp_align-len').text)

                    if p.length < self.min_palindrome_length:
                        # buckle your seat belt Dorothy, 'cause Kansas is going bye-bye:
                        continue

                    p.num_gaps = int(hsp_xml.find('Hsp_gaps').text)
                    p.num_mismatches = int(
                        hsp_xml.find('Hsp_align-len').text) - int(
                            hsp_xml.find('Hsp_identity').text)
                    # midline: '|' where the two halves agree, 'x' where they differ
                    p.midline = ''.join([
                        '|'
                        if p.first_sequence[i] == p.second_sequence[i] else 'x'
                        for i in range(0, len(p.first_sequence))
                    ])

                    if p.num_mismatches > self.max_num_mismatches or p.num_gaps > 0:
                        # this is the crazy part: read the function docstring for `get_split_palindromes`.
                        # briefly, we conclude that there are too many mismatches in this match, we will
                        # try and see if there is anything we can salvage from it.
                        p_list = self.get_split_palindromes(
                            p, display_palindromes=display_palindromes)
                    else:
                        # there aren't too many mismatches, and the length checks out. we will continue
                        # processing this hit as a sole palindrome
                        p_list = [p]

                    for sp in p_list:
                        if anvio.DEBUG or display_palindromes or self.verbose:
                            self.progress.reset()
                            sp.display()

                        self.palindromes[sequence_name].append(sp)

        # clean after yourself
        if anvio.DEBUG:
            self.run.info("BLAST temporary dir kept",
                          BLAST_search_tmp_dir,
                          nl_before=1,
                          mc='red')
        else:
            # reaches `shutil` through the filesnpaths module's own import
            filesnpaths.shutil.rmtree(BLAST_search_tmp_dir)
Ejemplo n.º 12
0
    def create_search_databases(self):
        """Create BLAST search databases for every anticodon FASTA file.

        Works inside a temporary directory so that decompressing the shipped
        `.gz` FASTA files does not change their hashes (which would make them
        look modified in git), then moves every file `makeblastdb` produced next
        to its source FASTA.

        Raises ConfigError when an expected FASTA copy is missing or when
        `makeblastdb` produces no output files at all.
        """

        self.progress.new("Creating search databases")
        self.progress.update("Removing any database that still exists in the output directory...")

        # IDIOM FIX: the original used a side-effect-only list comprehension here,
        # building a throwaway list of None just to call os.remove(). A plain
        # loop expresses the intent directly. The .gz sources are kept.
        for anticodon_base_path in [b['db'] for b in self.ctx.anticodons.values()]:
            for stale_file in glob.glob(anticodon_base_path + '.*'):
                if not stale_file.endswith('.gz'):
                    os.remove(stale_file)

        # compresssing and decompressing FASTA files changes their hash and make them look like
        # modified in git. to avoid that, we will do the database generation in a temporary directory.
        temp_dir = filesnpaths.get_temp_directory_path()

        self.progress.update("Copying FASTA files to %s ..." % (temp_dir))
        # map each anticodon (the FASTA basename) to the path of its .gz copy
        # under temp_dir (shutil.copy returns the destination path)
        new_paths = {os.path.basename(fasta_path): shutil.copy((fasta_path + '.gz'),
                                                               os.path.join(temp_dir, os.path.basename(fasta_path) + '.gz'))
                     for fasta_path in [s['db'] for s in self.ctx.anticodons.values()]}

        missing_FASTA_files = [anticodon for anticodon in self.ctx.anticodons if not os.path.exists(new_paths[anticodon])]
        if len(missing_FASTA_files):
            raise ConfigError(
                "Weird news :( Anvi'o is missing some FASTA files that were supposed to be somewhere. Since this "
                "can't be your fault, it is not easy to advice what could be the solution to this. If you are not "
                "an anvi'o programmer working on this problem this very moment, please get in touch with one."
            )

        self.progress.update("Decompressing FASTA files in %s" % (temp_dir))
        # replace each .gz path with the path of its decompressed counterpart
        new_paths = {anticodon: utils.gzip_decompress_file(gz_path, keep_original=False)
                     for anticodon, gz_path in new_paths.items()}

        for anticodon in self.ctx.anticodons:
            self.progress.update("Working on %s in %d threads" % (anticodon, self.num_threads))

            FASTA_file_path_for_anticodon = new_paths[anticodon]

            # create a BLAST search database for `FASTA_file_path_for_anticodon`
            blast = BLAST(query_fasta=FASTA_file_path_for_anticodon,
                          run=run_quiet,
                          progress=progress_quiet,
                          num_threads=self.num_threads)
            blast.log_file_path = os.path.join(os.path.dirname(FASTA_file_path_for_anticodon), '%s.log' % anticodon)
            blast.makedb(dbtype='nucl')

            # IDIOM FIX: `[f for f in glob.glob(...)]` was a pointless copy of a
            # list glob.glob already returns.
            files_generated = glob.glob(FASTA_file_path_for_anticodon + '.*')
            if not len(files_generated):
                raise ConfigError(
                    f"Even though the process to generate BLAST database files for '{anticodon}' has officially ended, "
                    f"anvi'o is unable to find any files generated by BLAST in the temporary directory it was working "
                    f"with :( This is as confusing to anvi'o as it probably sounds to you. A likely explanation is that "
                    f"something went wrong with the `makeblastdb` step. Please go into the following directory, and run "
                    f"`makeblastdb -in AAA -dbtype nucl; ls AAA*` manually to see what happens: '{temp_dir}'."
                )
            else:
                for file_path in files_generated:
                    shutil.move(file_path,
                                os.path.dirname(self.ctx.anticodons[anticodon]['db']))

        shutil.rmtree(temp_dir)

        self.progress.end()
        self.run.info_single(
            "Every FASTA is now turned into a fancy search database. It means you are now allowed to run "
            "`anvi-run-trna-taxonomy` on anvi'o contigs databases. This workflow is very new, and there are "
            "caveats to it just like every other computational approach you use to make sense of complex 'omics "
            "data. To better understand those caveats you should read our online documentation a bit. If you see "
            "things that concerns you, please let anvi'o developers know. They love bad news. If you get good "
            "results from this workflow, thank to those who contributed to the GTDB.",
            nl_after=1,
            mc="green")