Ejemplo n.º 1
0
 def test__guess_sequence_type(self):
     """Check sequence-type guessing against a table of synthetic inputs."""
     reader = UnpackRawReads(None)
     # (expected type, input string) pairs, asserted in the order listed.
     cases = [
         ('aminoacid', 'P' * 10),
         ('aminoacid', 'P' * 10 + 'T' * 89),
         ('nucleotide', 'P' * 10 + 'T' * 90),
         # only look at the first 300bp
         ('nucleotide', 'A' * 300 + 'E' * 999),
         # lowercase
         ('nucleotide', 'a' * 10 + 'T' * 89),
     ]
     for expected, sequence in cases:
         self.assertEqual(
             expected, reader._guess_sequence_type_from_string(sequence))
Ejemplo n.º 2
0
 def test__guess_sequence_type(self):
     """Exercise ``_guess_sequence_type_from_string`` on synthetic strings."""
     guess = UnpackRawReads(None)._guess_sequence_type_from_string

     only_p = 'P' * 10
     self.assertEqual('aminoacid', guess(only_p))

     p_then_89_t = 'P' * 10 + 'T' * 89
     self.assertEqual('aminoacid', guess(p_then_89_t))

     p_then_90_t = 'P' * 10 + 'T' * 90
     self.assertEqual('nucleotide', guess(p_then_90_t))

     # only look at the first 300bp
     a_then_e = 'A' * 300 + 'E' * 999
     self.assertEqual('nucleotide', guess(a_then_e))

     # lowercase
     lower_a_then_t = 'a' * 10 + 'T' * 89
     self.assertEqual('nucleotide', guess(lower_a_then_t))
Ejemplo n.º 3
0
 def test_stars(self):
     """A run of 'P' followed by '*' is expected to be guessed as amino acid."""
     reader = UnpackRawReads(None)
     sequence = 'P' * 10 + "*"
     guessed = reader._guess_sequence_type_from_string(sequence)
     self.assertEqual('aminoacid', guessed)
Ejemplo n.º 4
0
    def generate_expand_search_database_from_contigs(self, contig_files,
                                                     output_database_file,
                                                     search_method):
        '''Search the contigs in contig_files using self.search_hmm_files, and
        build a new search database from the resulting ORF hits, writing it to
        output_database_file.

        Parameters
        ----------
        contig_files: list of str
            list of files to search
        output_database_file: str
            path to output file
        search_method: str
            self.HMM_SEARCH_METHOD (an HMM is built from the aligned hits) or
            self.DIAMOND_SEARCH_METHOD (the hits are concatenated with
            self.unaligned_sequence_database and a diamond database is made).

        Returns
        -------
        True if a database was built. False if the diamond method was
        requested without a diamond database / unaligned sequences, or if the
        HMM method found fewer than two matching ORFs.

        Raises
        ------
        Exception
            if search_method is not one of the two recognised methods.'''

        # Check the diamond prerequisites first so that nothing is set up or
        # searched when the request cannot be satisfied anyway.
        if search_method == self.DIAMOND_SEARCH_METHOD:
            if self.diamond_database is None or \
                    self.unaligned_sequence_database is None:
                logging.warning(
                    "Cannot expand_search continue with no diamond database or unaligned sequences."
                )
                return False

        ss = SequenceSearcher(self.search_hmm_files)
        seqio = SequenceIO()

        with tempfile.NamedTemporaryFile(
                prefix='graftm_expand_search_orfs') as orfs:
            logging.info("Finding expand_search hits in provided contigs..")
            for contig_file in contig_files:
                logging.debug("Finding expand_search hits in %s.." %
                              contig_file)
                unpack = UnpackRawReads(contig_file)

                with tempfile.NamedTemporaryFile(prefix='graftm_expand_search') as \
                                                        hit_reads_orfs_fasta:
                    # search and extract matching ORFs
                    with tempfile.NamedTemporaryFile(prefix='graftm_expand_search2') as \
                                                        hmmsearch_output_table:
                        with tempfile.NamedTemporaryFile(prefix='graftm_expand_search3') as \
                                                        hit_reads_fasta:
                            ss.search_and_extract_orfs_matching_protein_database(\
                                    unpack,
                                    search_method,
                                    self.maximum_range,
                                    self.threads,
                                    self.evalue,
                                    self.min_orf_length,
                                    None,
                                    (self.diamond_database if self.diamond_database else None),
                                    hmmsearch_output_table.name,
                                    hit_reads_fasta.name,
                                    hit_reads_orfs_fasta.name)
                    # Append this contig file's ORFs to the combined file.
                    # Read as bytes: the temporary file is opened in binary
                    # mode, and the handle is closed as soon as it is copied.
                    with open(hit_reads_orfs_fasta.name, 'rb') as hit_orfs:
                        shutil.copyfileobj(hit_orfs, orfs)

            # Now have a fasta file of ORFs. Flush so that the external
            # programs and readers below see the complete contents on disk.
            orfs.flush()

            with tempfile.NamedTemporaryFile(
                    prefix="graftm_expand_search_aln") as aln:

                if search_method == self.HMM_SEARCH_METHOD:

                    # Check that there is more than one sequence to align,
                    # otherwise mafft will fail to align, causing a crash when
                    # hmmbuild is run on an empty file.
                    if len(seqio.read_fasta_file(orfs.name)) <= 1:
                        logging.warning(
                            "Failed to find two or more matching ORFs in the expand_search contigs"
                        )
                        return False

                    # Run mafft to align them
                    cmd = "mafft --auto %s >%s" % (orfs.name, aln.name)
                    logging.info("Aligning expand_search hits..")
                    extern.run(cmd)

                    # Run hmmbuild to create an HMM
                    cmd = "hmmbuild --amino %s %s >/dev/null" % (
                        output_database_file, aln.name)
                    logging.info("Building HMM from expand_search hits..")

                    extern.run(cmd)

                elif search_method == self.DIAMOND_SEARCH_METHOD:

                    # Concatenate the new ORFs with the existing unaligned
                    # sequences (byte-for-byte, in that order).
                    with tempfile.NamedTemporaryFile(
                            prefix="concatenated_database") as databasefile:
                        for f in [orfs.name, self.unaligned_sequence_database]:
                            with open(f, 'rb') as source:
                                shutil.copyfileobj(source, databasefile)
                        databasefile.flush()

                        # Run diamond make to create a diamond database
                        cmd = "diamond makedb --in '%s' -d '%s'" % (
                            databasefile.name, output_database_file)
                        logging.info(
                            "Building a diamond database from expand_search hits.."
                        )
                        extern.run(cmd)

                else:
                    raise Exception("Search method not recognised: %s" %
                                    search_method)

                return True
Ejemplo n.º 5
0
 def test_stars(self):
     """Ten 'P' characters plus a trailing '*' should be guessed as 'aminoacid'."""
     with_star = "".join(['P' * 10, "*"])
     self.assertEqual(
         'aminoacid',
         UnpackRawReads(None)._guess_sequence_type_from_string(with_star))
Ejemplo n.º 6
0
    def graft(self):
        '''Run the Graft pipeline: search the input reads, optionally align
        the hits, assign taxonomy (pplacer or diamond) and write a summary.

        Reads its configuration from self.args, uses the helpers self.hk,
        self.ss and self.p, iterates over self.sequence_pair_list, and
        rebinds self.gmf as it goes.

        The method calls exit() when --search_only or
        --search_and_align_only is requested, when no hits are found in any
        file, or when no ORFs could be called.

        Raises
        ------
        Exception
            if a diamond-based search method is selected without a diamond
            database, if merge_reads is set but self.args has no 'reverse'
            attribute, or if the assignment method is not recognised.'''
        # The Graft pipeline:
        # Searches for reads using hmmer, and places them in phylogenetic
        # trees to derive a community structure.
        if self.args.graftm_package:
            gpkg = GraftMPackage.acquire(self.args.graftm_package)
        else:
            gpkg = None

        # NOTE: REVERSE_PIPE is captured before self.args.reverse may be reset
        # to None further down (diamond assignment).
        REVERSE_PIPE = bool(self.args.reverse)
        INTERLEAVED = bool(self.args.interleaved)
        base_list = []
        seqs_list = []
        search_results = []
        hit_read_count_list = []
        db_search_results = []

        if gpkg:
            maximum_range = gpkg.maximum_range()

            if self.args.search_diamond_file:
                self.args.search_method = self.hk.DIAMOND_SEARCH_METHOD
                diamond_db = self.args.search_diamond_file[0]
            else:
                diamond_db = gpkg.diamond_database_path()
                if self.args.search_method == self.hk.DIAMOND_SEARCH_METHOD:
                    if not diamond_db:
                        logging.error(
                            "%s search method selected, but no diamond database specified. \
                        Please either provide a gpkg to the --graftm_package flag, or a diamond \
                        database to the --search_diamond_file flag." %
                            self.args.search_method)
                        raise Exception()
        else:
            # Get the maximum range, if none exists, make one from the HMM profile
            if self.args.maximum_range:
                maximum_range = self.args.maximum_range
            else:
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    if not self.args.search_only:
                        maximum_range = self.hk.get_maximum_range(
                            self.args.aln_hmm_file)
                    else:
                        logging.debug(
                            "Running search only pipeline. maximum_range not configured."
                        )
                        maximum_range = None
                else:
                    logging.warning(
                        'Cannot determine maximum range when using %s pipeline and with no GraftM package specified'
                        % self.args.search_method)
                    logging.warning(
                        'Setting maximum_range to None (linked hits will not be detected)'
                    )
                    maximum_range = None
            if self.args.search_diamond_file:
                # NOTE(review): the gpkg branch above takes
                # search_diamond_file[0], here the value is used as-is --
                # confirm which form the argument parser actually provides.
                diamond_db = self.args.search_diamond_file
            else:
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    diamond_db = None
                else:
                    # Fail here, as the gpkg branch does, rather than leaving
                    # diamond_db unbound and crashing later with an
                    # UnboundLocalError.
                    message = (
                        "%s search method selected, but no gpkg or diamond database selected"
                        % self.args.search_method)
                    logging.error(message)
                    raise Exception(message)

        if self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
            if self.args.reverse:
                logging.warning(
                    "--reverse reads specified with --assignment_method diamond. Reverse reads will be ignored."
                )
                self.args.reverse = None

        # If merge reads is specified, check that there are reverse reads to merge with
        if self.args.merge_reads and not hasattr(self.args, 'reverse'):
            raise Exception("Programming error")

        # Set the output directory if not specified and create that directory
        logging.debug('Creating working directory: %s' %
                      self.args.output_directory)
        self.hk.make_working_directory(self.args.output_directory,
                                       self.args.force)

        # Set pipeline and evalue by checking HMM format
        if self.args.search_only:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                hmm_type, hmm_tc = self.hk.setpipe(
                    self.args.search_hmm_files[0])
                logging.debug("HMM type: %s Trusted Cutoff: %s" %
                              (hmm_type, hmm_tc))
        else:
            hmm_type, hmm_tc = self.hk.setpipe(self.args.aln_hmm_file)
            logging.debug("HMM type: %s Trusted Cutoff: %s" %
                          (hmm_type, hmm_tc))

        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
            setattr(self.args, 'type', hmm_type)
            if hmm_tc:
                setattr(self.args, 'evalue', '--cut_tc')
        else:
            # Any non-hmmsearch search method runs the amino acid pipeline.
            setattr(self.args, 'type', self.PIPELINE_AA)

        if self.args.filter_minimum is not None:
            filter_minimum = self.args.filter_minimum
        else:
            if self.args.type == self.PIPELINE_NT:
                filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_NUCLEOTIDE_PACKAGES
            else:
                filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_AMINO_ACID_PACKAGES

        # Generate expand_search database if required
        if self.args.expand_search_contigs:
            if self.args.graftm_package:
                pkg = GraftMPackage.acquire(self.args.graftm_package)
            else:
                pkg = None
            boots = ExpandSearcher(search_hmm_files=self.args.search_hmm_files,
                                   maximum_range=self.args.maximum_range,
                                   threads=self.args.threads,
                                   evalue=self.args.evalue,
                                   min_orf_length=self.args.min_orf_length,
                                   graftm_package=pkg)

            # this is a hack, it should really use GraftMFiles but that class isn't currently flexible enough
            new_database = (os.path.join(self.args.output_directory, "expand_search.hmm") \
                            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD \
                            else os.path.join(self.args.output_directory, "expand_search")
                            )

            # Only use the new database if one was actually generated.
            if boots.generate_expand_search_database_from_contigs(
                    self.args.expand_search_contigs, new_database,
                    self.args.search_method):
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    self.ss.search_hmm.append(new_database)
                else:
                    diamond_db = new_database

        first_search_method = self.args.search_method
        if self.args.decoy_database:
            decoy_filter = DecoyFilter(
                Diamond(diamond_db, threads=self.args.threads),
                Diamond(self.args.decoy_database, threads=self.args.threads))
            doing_decoy_search = True
        elif self.args.search_method == self.hk.HMMSEARCH_AND_DIAMOND_SEARCH_METHOD:
            decoy_filter = DecoyFilter(
                Diamond(diamond_db, threads=self.args.threads))
            doing_decoy_search = True
            # The initial search is done with hmmsearch; diamond is then
            # applied through the decoy filter.
            first_search_method = self.hk.HMMSEARCH_SEARCH_METHOD
        else:
            doing_decoy_search = False

        # For each pair (or single file passed to GraftM)
        logging.debug('Working with %i file(s)' % len(self.sequence_pair_list))
        for pair in self.sequence_pair_list:
            # Guess the sequence file type, if not already specified to GraftM
            unpack = UnpackRawReads(pair[0], self.args.input_sequence_type,
                                    INTERLEAVED)

            # Set the basename, and make an entry to the summary table.
            base = unpack.basename()
            pair_direction = ['forward', 'reverse']
            logging.info("Working on %s" % base)

            # Make the working base subdirectory
            self.hk.make_working_directory(
                os.path.join(self.args.output_directory, base),
                self.args.force)

            # for each of the paired end read files
            for read_file in pair:
                unpack = UnpackRawReads(read_file,
                                        self.args.input_sequence_type,
                                        INTERLEAVED)
                if read_file is None:
                    # placeholder for interleaved (second file is None)
                    continue

                if not os.path.isfile(read_file):  # Check file exists
                    logging.info('%s does not exist! Skipping this file..' %
                                 read_file)
                    continue

                # Set the output file_name
                if len(pair) == 2:
                    direction = 'interleaved' if pair[1] is None \
                                              else pair_direction.pop(0)
                    logging.info("Working on %s reads" % direction)
                    self.gmf = GraftMFiles(base, self.args.output_directory,
                                           direction)
                    self.hk.make_working_directory(
                        os.path.join(self.args.output_directory, base,
                                     direction), self.args.force)
                else:
                    direction = False
                    self.gmf = GraftMFiles(base, self.args.output_directory,
                                           direction)

                if self.args.type == self.PIPELINE_AA:
                    logging.debug("Running protein pipeline")
                    try:
                        search_time, (
                            result,
                            complement_information) = self.ss.aa_db_search(
                                self.gmf,
                                base,
                                unpack,
                                first_search_method,
                                maximum_range,
                                self.args.threads,
                                self.args.evalue,
                                self.args.min_orf_length,
                                self.args.restrict_read_length,
                                diamond_db,
                                self.args.diamond_performance_parameters,
                            )
                    except NoInputSequencesException as e:
                        logging.error(
                            "No sufficiently long open reading frames were found, indicating"
                            " either the input sequences are too short or the min orf length"
                            " cutoff is too high. Cannot continue sorry. Alternatively, there"
                            " is something amiss with the installation of OrfM. The specific"
                            " command that failed was: %s" % e.command)
                        exit(Run.NO_ORFS_EXITSTATUS)

                # Or the DNA pipeline
                elif self.args.type == self.PIPELINE_NT:
                    logging.debug("Running nucleotide pipeline")
                    search_time, (
                        result, complement_information) = self.ss.nt_db_search(
                            self.gmf, base, unpack, self.args.euk_check,
                            self.args.search_method, maximum_range,
                            self.args.threads, self.args.evalue)

                # A missing or empty hit fasta means nothing was detected.
                reads_detected = True
                if not result.hit_fasta() or os.path.getsize(
                        result.hit_fasta()) == 0:
                    logging.info('No reads found in %s' % base)
                    reads_detected = False

                if self.args.search_only:
                    db_search_results.append(result)
                    base_list.append(base)
                    continue

                # Filter out decoys if specified
                if reads_detected and doing_decoy_search:
                    # The temporary file is removed when the with-block exits;
                    # only its unique name is kept as the filter output path.
                    with tempfile.NamedTemporaryFile(prefix="graftm_decoy",
                                                     suffix='.fa') as f:
                        tmpname = f.name
                    any_remaining = decoy_filter.filter(
                        result.hit_fasta(), tmpname)
                    if any_remaining:
                        shutil.move(tmpname, result.hit_fasta())
                    else:
                        # No hits remain after decoy filtering.
                        os.remove(result.hit_fasta())
                        continue

                if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
                    logging.info(
                        'aligning reads to reference package database')
                    hit_aligned_reads = self.gmf.aligned_fasta_output_path(
                        base)

                    if reads_detected:
                        aln_time, aln_result = self.ss.align(
                            result.hit_fasta(), hit_aligned_reads,
                            complement_information, self.args.type,
                            filter_minimum)
                    else:
                        aln_time = 'n/a'
                    if not os.path.exists(
                            hit_aligned_reads
                    ):  # If all were filtered out, or there just was none..
                        with open(hit_aligned_reads, 'w') as f:
                            pass  # just touch the file, nothing else
                    seqs_list.append(hit_aligned_reads)

                db_search_results.append(result)
                base_list.append(base)
                search_results.append(result.search_result)
                hit_read_count_list.append(result.hit_count)

        # Write summary table
        srchtw = SearchTableWriter()
        srchtw.build_search_otu_table(
            [x.search_objects for x in db_search_results], base_list,
            self.gmf.search_otu_table())

        if self.args.search_only:
            logging.info(
                'Stopping before alignment and taxonomic assignment phase\n')
            exit(0)

        if self.args.merge_reads:  # not run when diamond is the assignment mode- enforced by argparse grokking
            logging.debug("Running merge reads output")
            if self.args.interleaved:
                fwd_seqs = seqs_list
                rev_seqs = []
            else:
                # Entries alternate forward/reverse, so even indices are the
                # forward reads and odd indices the reverse reads.
                base_list = base_list[0::2]
                fwd_seqs = seqs_list[0::2]
                rev_seqs = seqs_list[1::2]
            merged_output=[GraftMFiles(base, self.args.output_directory, False).aligned_fasta_output_path(base) \
                           for base in base_list]
            logging.debug("merged reads to %s", merged_output)
            self.ss.merge_forev_aln(fwd_seqs, rev_seqs, merged_output)
            seqs_list = merged_output
            REVERSE_PIPE = False

        elif REVERSE_PIPE:
            base_list = base_list[0::2]

        # Leave the pipeline if search and align only was specified
        if self.args.search_and_align_only:
            logging.info('Stopping before taxonomic assignment phase\n')
            exit(0)
        elif not any(base_list):
            logging.error(
                'No hits in any of the provided files. Cannot continue with no reads to assign taxonomy to.\n'
            )
            exit(0)
        self.gmf = GraftMFiles('', self.args.output_directory, False)

        if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
            clusterer = Clusterer()
            # Classification steps
            seqs_list = clusterer.cluster(seqs_list, REVERSE_PIPE)
            logging.info("Placing reads into phylogenetic tree")
            taxonomic_assignment_time, assignments = self.p.place(
                REVERSE_PIPE, seqs_list, self.args.resolve_placements,
                self.gmf, self.args, result.slash_endings,
                gpkg.taxtastic_taxonomy_path(), clusterer)
            assignments = clusterer.uncluster_annotations(
                assignments, REVERSE_PIPE)

        elif self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
            logging.info("Assigning taxonomy with diamond")
            taxonomic_assignment_time, assignments = self._assign_taxonomy_with_diamond(\
                        base_list,
                        db_search_results,
                        gpkg,
                        self.gmf,
                        self.args.diamond_performance_parameters)
            aln_time = 'n/a'
        else:
            # Report the attribute that was actually tested above.
            raise Exception("Unexpected assignment method encountered: %s" %
                            self.args.assignment_method)

        self.summarise(base_list, assignments, REVERSE_PIPE,
                       [search_time, aln_time, taxonomic_assignment_time],
                       hit_read_count_list, self.args.max_samples_for_krona)