Example #1
0
    def setattributes(self, args):
        '''Create the helper objects required by the chosen subcommand.

        Parameters
        ----------
        args: object
            parsed command line arguments. ``args.subparser_name`` selects
            the 'graft' setup; the 'create' setup is selected through
            ``self.args.subparser_name``.

        Returns
        -------
        nothing
        '''
        # NOTE(review): this method reads both `args` and `self.args`;
        # presumably they are the same object - confirm against the caller.
        self.hk = HouseKeeping()
        self.s = Stats_And_Summary()

        if args.subparser_name == 'graft':
            # The suite object is not used again in this branch.
            # NOTE(review): presumably constructing it checks that these
            # programs are available - confirm.
            programs = ExternalProgramSuite([
                'orfm', 'nhmmer', 'hmmsearch', 'mfqe', 'pplacer',
                'ktImportText', 'diamond'
            ])

            self.hk.set_attributes(self.args)
            self.hk.set_euk_hmm(self.args)
            if args.euk_check:
                # Search with the eukaryote HMM in addition to the others.
                self.args.search_hmm_files.append(self.args.euk_hmm_file)

            search_hmms = self.args.search_hmm_files
            if self.args.search_only:
                # No alignment HMM is handed over in search-only mode.
                aln_hmm = None
            else:
                aln_hmm = self.args.aln_hmm_file
            self.ss = SequenceSearcher(search_hmms, aln_hmm)

            self.sequence_pair_list = self.hk.parameter_checks(args)
            if hasattr(args, 'reference_package'):
                self.p = Pplacer(self.args.reference_package)

        elif self.args.subparser_name == "create":
            programs = ExternalProgramSuite(
                ['taxit', 'FastTreeMP', 'hmmalign', 'mafft'])
            self.create = Create(programs)
Example #2
0
    def _get_hmm_from_alignment(self, alignment, hmm_filename,
                                output_alignment_filename):
        '''Build an HMM from an alignment with hmmbuild, then write a
        corrected alignment of the same sequences.

        Parameters
        ----------
        alignment: str
            path to aligned proteins
        hmm_filename: str
            write the hmm to this file path
        output_alignment_filename: str
            write the output alignment to this file path

        Returns
        -------
        The pipeline type reported by self._pipe_type() for the new HMM.
        '''
        logging.info("Building HMM from alignment")

        hmmbuild_cmd = "hmmbuild -O /dev/stdout -o /dev/stderr '%s' '%s'" % (
            hmm_filename, alignment)

        fasta_tmp = tempfile.NamedTemporaryFile(suffix='.fasta',
                                                prefix='graftm',
                                                mode='w')
        with fasta_tmp as tempaln:
            stockholm_text = extern.run(hmmbuild_cmd)

            # Convert the captured Stockholm text into FASTA on disk.
            records = SeqIO.parse(StringIO(stockholm_text), 'stockholm')
            SeqIO.write(records, tempaln, 'fasta')
            # Flush so the file can be read back through its path below.
            tempaln.flush()

            pipeline_type, _ = self._pipe_type(hmm_filename)
            searcher = SequenceSearcher(hmm_filename)
            searcher.alignment_correcter([tempaln.name],
                                         output_alignment_filename)
            return pipeline_type
Example #3
0
    def test_merg_aln(self):
        forward_reads = '''>no_overlap
-------CGTATGCAACCTACCTT---------------------------------------
>overlap_all_match
-------CGTATGCAACCTACCTT---------------------------------------
>overlap_mismatch_in_reverse
-------CGTATGCAACCTACCTT---------------------------------------
>overlap_mismatch_in_forward
-------CGTATGCAACCTTCCTT---------------------------------------
>complete_overlap_all_match
-------CGTATGCAACCTACCTT---------------------------------------
>complete_overlap_mismatch
-------CGTATGCATCCTACCTT---------------------------------------'''
        reverse_reads = '''>no_overlap
---------------------------------------CGTATGCAACCTACCTT-------
>overlap_all_match
----------------CCTACCTTCAACCTACCTT----------------------------
>overlap_mismatch_in_reverse
----------------CCTTCCTTCAACCTACCTT----------------------------
>overlap_mismatch_in_forward
----------------CCTACCTTCAACCTACCTT----------------------------
>complete_overlap_all_match
-------CGTATGCAACCTACCTT---------------------------------------
>complete_overlap_mismatch
-------CGTTTGCAAGCTACCTT---------------------------------------'''
        expected_aln = '''>no_overlap
-------CGTATGCAACCTACCTT---------------CGTATGCAACCTACCTT-------
>overlap_all_match
-------CGTATGCAACCTACCTTCAACCTACCTT----------------------------
>overlap_mismatch_in_reverse
-------CGTATGCAACCTACCTTCAACCTACCTT----------------------------
>overlap_mismatch_in_forward
-------CGTATGCAACCTTCCTTCAACCTACCTT----------------------------
>complete_overlap_all_match
-------CGTATGCAACCTACCTT---------------------------------------
>complete_overlap_mismatch
-------CGTATGCATCCTACCTT---------------------------------------'''.split()
        with tempfile.NamedTemporaryFile(suffix='_forward.fa') as forward_file:
            with tempfile.NamedTemporaryFile(
                    suffix='_reverse.fa') as reverse_file:
                with tempfile.NamedTemporaryFile(suffix='.fa') as output_file:
                    forward_file.write(forward_reads.encode())
                    reverse_file.write(reverse_reads.encode())
                    forward_file.flush()
                    reverse_file.flush()
                    SequenceSearcher(None).merge_forev_aln([forward_file.name],
                                                           [reverse_file.name],
                                                           [output_file.name])
                    count = 0
                    for line in open(output_file.name):
                        self.assertEqual(expected_aln[count], line.strip())
                        count += 1
                    with open(output_file.name) as f:
                        self.assertEqual(count, len(f.readlines()))
Example #4
0
    def _get_hmm_from_alignment(self, alignment, hmm_filename,
                                output_alignment_filename):
        '''Build an HMM from an alignment by running hmmbuild through bash,
        then write a corrected alignment of the same sequences.

        A non-zero hmmbuild exit status is logged as an error, but
        processing continues with whatever was captured on stdout.

        Parameters
        ----------
        alignment: str
            path to aligned proteins
        hmm_filename: str
            write the hmm to this file path
        output_alignment_filename: str
            write the output alignment to this file path

        Returns
        -------
        The pipeline type reported by self._pipe_type() for the new HMM.
        '''
        logging.info("Building HMM from alignment")

        hmmbuild_cmd = "hmmbuild -O /dev/stdout -o /dev/stderr '%s' '%s'" % (
            hmm_filename, alignment)

        with tempfile.NamedTemporaryFile(prefix='graftm',
                                         suffix='.fasta') as fasta_tmp:
            hmmbuild = subprocess.Popen(["bash", "-c", hmmbuild_cmd],
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
            stdout_data, stderr_data = hmmbuild.communicate()

            stderr_report = "Got STDERR from hmmbuild: %s" % stderr_data
            logging.debug(stderr_report)
            if hmmbuild.returncode != 0:
                logging.error(
                    "hmmbuild exitstatus was non-zero, likely indicating an error of "
                    "some description")
                # Repeat the stderr text at error level so it is visible
                # without debug logging.
                logging.error(stderr_report)

            # Convert the captured Stockholm text into FASTA on disk.
            records = SeqIO.parse(StringIO(stdout_data), 'stockholm')
            SeqIO.write(records, fasta_tmp, 'fasta')
            # Flush so the file can be read back through its path below.
            fasta_tmp.flush()

            pipeline_type, _ = self._pipe_type(hmm_filename)
            searcher = SequenceSearcher(hmm_filename)
            searcher.alignment_correcter([fasta_tmp.name],
                                         output_alignment_filename)
            return pipeline_type
Example #5
0
    def _align_sequences_to_hmm(self, hmm_file, sequences_file,
                                output_alignment_file):
        '''Align protein sequences to an HMM and write the corrected
        alignment, so that it can be used for tree-making.

        The raw hmmalign output goes to a temporary file; only the result of
        alignment_correcter() is written to output_alignment_file.

        Parameters
        ----------
        hmm_file: str
            path to hmm file
        sequences_file: str
            path to file of unaligned protein sequences
        output_alignment_file: str
            write alignment to this file

        Returns
        -------
        nothing
        '''
        searcher = SequenceSearcher(hmm_file)
        raw_alignment = tempfile.NamedTemporaryFile(prefix='graftm',
                                                    suffix='.aln.fasta')
        with raw_alignment:
            raw_path = raw_alignment.name
            searcher.hmmalign_sequences(hmm_file, sequences_file, raw_path)
            searcher.alignment_correcter([raw_path], output_alignment_file)
Example #6
0
    def generate_expand_search_database_from_contigs(self, contig_files,
                                                     output_database_file,
                                                     search_method):
        '''Search the contigs in contig_files using self.search_hmm_files (or
        the diamond database), and build a new search database from the
        matching ORFs, writing it to output_database_file.

        For the hmmsearch method the hit ORFs are aligned with mafft and an
        HMM is built with hmmbuild. For the diamond method the hit ORFs are
        concatenated with self.unaligned_sequence_database and turned into a
        diamond database.

        Parameters
        ----------
        contig_files: list of str
            list of files to search
        output_database_file: str
            path to output file
        search_method: str
            "diamond" or "hmmsearch", to specify search method to use and what
            type of database to build.

        Returns
        -------
        True if genes were recovered, else False

        Raises
        ------
        Exception
            if search_method is not one of the recognised methods (raised
            only after the contigs have been searched).
        '''

        ss = SequenceSearcher(self.search_hmm_files)
        seqio = SequenceIO()
        if search_method == self.DIAMOND_SEARCH_METHOD:
            if (self.diamond_database is None
                    or self.unaligned_sequence_database is None):
                logging.warning(
                    "Cannot expand_search continue with no diamond database or unaligned sequences."
                )
                return False

        with tempfile.NamedTemporaryFile(
                prefix='graftm_expand_search_orfs') as orfs:
            logging.info("Finding expand_search hits in provided contigs..")
            for contig_file in contig_files:
                logging.debug("Finding expand_search hits in %s.." %
                              contig_file)
                unpack = UnpackRawReads(contig_file)

                with tempfile.NamedTemporaryFile(
                        prefix='graftm_expand_search') as hit_reads_orfs_fasta:
                    # search and extract matching ORFs
                    with tempfile.NamedTemporaryFile(
                            prefix='graftm_expand_search2') as hmmsearch_output_table, \
                            tempfile.NamedTemporaryFile(
                                prefix='graftm_expand_search3') as hit_reads_fasta:
                        ss.search_and_extract_orfs_matching_protein_database(
                            unpack,
                            search_method,
                            self.maximum_range,
                            self.threads,
                            self.evalue,
                            self.min_orf_length,
                            None,
                            (self.diamond_database if self.diamond_database else None),
                            hmmsearch_output_table.name,
                            hit_reads_fasta.name,
                            hit_reads_orfs_fasta.name)
                    # Append this contig file's hits to the combined ORF
                    # file. Read as bytes because the temporary file is
                    # opened in binary mode, and close the handle afterwards.
                    with open(hit_reads_orfs_fasta.name, 'rb') as hits:
                        shutil.copyfileobj(hits, orfs)

            # Now have a fasta file of ORFs. Flush it so that everything
            # below, which reads the file through its name, sees all of it.
            orfs.flush()

            with tempfile.NamedTemporaryFile(
                    prefix="graftm_expand_search_aln") as aln:

                if search_method == self.HMM_SEARCH_METHOD:

                    # Check that there is more than one sequence to align,
                    # otherwise mafft will fail to align, causing a crash
                    # when hmmbuild is run on an empty file.
                    if len(seqio.read_fasta_file(orfs.name)) <= 1:
                        logging.warning(
                            "Failed to find two or more matching ORFs in the expand_search contigs"
                        )
                        return False

                    # Run mafft to align them
                    cmd = "mafft --auto %s >%s" % (orfs.name, aln.name)
                    logging.info("Aligning expand_search hits..")
                    extern.run(cmd)

                    # Run hmmbuild to create an HMM
                    cmd = "hmmbuild --amino %s %s >/dev/null" % (
                        output_database_file, aln.name)
                    logging.info("Building HMM from expand_search hits..")

                    extern.run(cmd)

                elif search_method == self.DIAMOND_SEARCH_METHOD:

                    # Concatenate database with existing database
                    with tempfile.NamedTemporaryFile(
                            prefix="concatenated_database") as databasefile:
                        for path in (orfs.name,
                                     self.unaligned_sequence_database):
                            # Byte-wise copy into the binary temporary file.
                            with open(path, 'rb') as part:
                                shutil.copyfileobj(part, databasefile)
                        databasefile.flush()

                        # Run diamond make to create a diamond database
                        cmd = "diamond makedb --in '%s' -d '%s'" % (
                            databasefile.name, output_database_file)
                        logging.info(
                            "Building a diamond database from expand_search hits.."
                        )
                        extern.run(cmd)

                else:
                    raise Exception("Search method not recognised: %s" %
                                    search_method)

                return True