Example #1
    def test_hello_world(self):
        with tempdir.in_tempdir():
            with tempfile.NamedTemporaryFile() as fasta:
                with tempfile.NamedTemporaryFile() as tax:
                    fasta.write(Tests.extra_mcra_fasta)
                    fasta.flush()
                    tax.write(Tests.extra_mcra_taxonomy)
                    tax.flush()
                    prev_path = os.path.join(path_to_data,'mcrA.10seqs.gpkg')
                    cmd1 = "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s" %(
                        path_to_script,
                        prev_path,
                        fasta.name,
                        tax.name,
                        'updated.gpkg')
                    extern.run(cmd1)

                    prev = GraftMPackage.acquire(prev_path)
                    up = GraftMPackage.acquire('updated.gpkg')
                    prevhash = prev.taxonomy_hash()
                    taxhash = up.taxonomy_hash()
                    self.assertEqual(len(prevhash)+1,
                                     len(taxhash))
                    self.assertEqual(['mcrA','Euryarchaeota_mcrA','Methanofastidiosa'],
                                     taxhash['KYC55281.1'])
                    self.assertEqual(prevhash['638165755'],
                                     taxhash['638165755'])
                    seqio = SequenceIO()
                    self.assertEqual(
                        len(seqio.read_fasta_file(prev.unaligned_sequence_database_path()))+1,
                        len(seqio.read_fasta_file(up.unaligned_sequence_database_path())))
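The detail worth copying from this test is that each NamedTemporaryFile is flushed before its path is handed to the external `graftM update` command; without the flush, the subprocess can observe an empty file. A minimal stdlib sketch of the pattern (the FASTA payload is illustrative):

import subprocess
import tempfile

with tempfile.NamedTemporaryFile() as fasta:
    fasta.write(b">KYC55281.1\nMAGICHANDS\n")  # bytes: the default mode is 'w+b'
    fasta.flush()  # make the buffered bytes visible to other processes
    subprocess.run(["wc", "-c", fasta.name], check=True)  # another process reads by name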
Example #2
 def test_autodecorate(self):
     with tempdir.in_tempdir():
         with tempfile.NamedTemporaryFile() as fasta:
             fasta.write(Tests.extra_mcra_fasta)
             fasta.flush()
             
             prev_path = os.path.join(path_to_data,'mcrA.10seqs.gpkg')
             update = Update(prerequisites)
             update.update(
                 input_sequence_path = fasta.name,
                 input_graftm_package_path = prev_path,
                 output_graftm_package_path = 'updated.gpkg')
             prev = GraftMPackage.acquire(prev_path)
             up = GraftMPackage.acquire('updated.gpkg')
             prevhash = prev.taxonomy_hash()
             taxhash = up.taxonomy_hash()
             self.assertEqual(11, len(taxhash)) #hard-code 11 because of
                                                #https://github.com/geronimp/graftM/issues/204
             self.assertEqual(['mcrA','Euryarchaeota_mcrA', 'Methanomicrobia'],
                              taxhash['KYC55281.1'])
             
             self.assertEqual(prevhash['638165755'],
                              taxhash['638165755'])
             seqio = SequenceIO()
             self.assertEqual(
                 len(seqio.read_fasta_file(prev.unaligned_sequence_database_path()))+1,
                 len(seqio.read_fasta_file(up.unaligned_sequence_database_path())))
Example #3
    def _assign_taxonomy(self, extracted_reads, assignment_method):
        graftm_align_directory_base = os.path.join(self._working_directory,
                                                   'graftm_aligns')
        os.mkdir(graftm_align_directory_base)
        commands = []
        all_tmp_files = []
        # Run the commands one at a time, serially, so that the requested
        # thread count is respected, RAM use stays low (only one DB is loaded
        # at a time), and fewer files are open simultaneously, easing the
        # open-file count limit.
        for singlem_package, readsets in extracted_reads.each_package_wise():
            tmp_files = []
            for readset in readsets:
                if len(readset.sequences) > 0:
                    tmp = tempfile.NamedTemporaryFile(prefix='singlem.%s' %
                                                      readset.sample_name,
                                                      suffix=".fasta")
                    # Record the basename (minus .fasta) so that the graftm
                    # output file can be located later in the pipe stage.
                    tmpbase = os.path.basename(tmp.name[:-6])
                    readset.tmpfile_basename = tmpbase
                    seqio = SequenceIO()
                    seqio.write_fasta(readset.sequences, tmp)
                    tmp.flush()
                    tmp_files.append(tmp)

            if len(tmp_files) > 0:
                tmpnames = list([tg.name for tg in tmp_files])
                cmd = "%s "\
                      "--threads %i "\
                      "--forward %s "\
                      "--graftm_package %s "\
                      "--output_directory %s/%s "\
                      "--max_samples_for_krona 0 "\
                      "--assignment_method %s" % (
                          self._graftm_command_prefix(singlem_package.is_protein_package()),
                          self._num_threads,
                          ' '.join(tmpnames),
                          singlem_package.graftm_package_path(),
                          graftm_align_directory_base,
                          singlem_package.graftm_package_basename(),
                          assignment_method)
                commands.append(cmd)
                all_tmp_files.append(tmp_files)

        extern.run_many(commands, num_threads=1)
        for tmp_files in all_tmp_files:
            [t.close() for t in tmp_files]
        logging.info("Finished running taxonomic assignment with GraftM")
        return SingleMPipeTaxonomicAssignmentResult(
            graftm_align_directory_base)
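extern.run_many(commands, num_threads=1) executes the assembled command strings one at a time, for the reasons spelled out in the comment above. A rough stdlib equivalent of that serial runner, as a sketch:

import subprocess

def run_serially(commands):
    """Run shell command strings one after another, failing fast.

    A stand-in for extern.run_many(commands, num_threads=1): running
    serially means only one GraftM database is loaded at a time and each
    command gets its full --threads allocation.
    """
    for cmd in commands:
        subprocess.run(cmd, shell=True, check=True)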
Example #4
    def _align_sequences(self, input_sequences_path, output_alignment_path,
                         threads):
        '''Align sequences into alignment_file

        Parameters
        ----------
        input_sequences_path: str
            path to input sequences in fasta format
        output_alignment_path: str
            path to the output alignment file
        threads: str
            number of threads to use

        Returns
        -------
        Nothing
        '''
        logging.debug("Aligning sequences using mafft")
        cmd = "mafft --anysymbol --thread %s --auto /dev/stdin > %s" % (
            threads, output_alignment_path)
        inputs = []
        with open(input_sequences_path) as f:
            for name, seq, _ in SequenceIO().each(f):
                inputs.append('>%s' % name)
                # Do not include * characters in the HMM, as this means tree
                # insertion fails.
                inputs.append(seq.replace('*', ''))
        extern.run(cmd, stdin="\n".join(inputs))
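extern.run(cmd, stdin=...) pipes the concatenated FASTA records to mafft, which reads them via /dev/stdin. A stdlib sketch of the same call, assuming mafft is on PATH:

import subprocess

def align_with_mafft(fasta_text, output_alignment_path, threads=1):
    # Feed FASTA records to mafft over stdin, mirroring the
    # "mafft ... /dev/stdin > out" command built above.
    with open(output_alignment_path, 'w') as out:
        subprocess.run(
            ['mafft', '--anysymbol', '--thread', str(threads), '--auto',
             '/dev/stdin'],
            input=fasta_text, text=True, stdout=out, check=True)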
Example #5
 def test_hello_world(self):
     with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f1:
         with tempfile.NamedTemporaryFile(prefix='graftm_decoy_test') as f2:
             f1.write(self.eg1)
             f1.flush()
             extern.run("diamond makedb --in %s --db %s.dmnd" %\
                        (f1.name, f1.name))
             f2.write(self.eg1)
             f2.write(self.eg2)
             f2.flush()
             extern.run("diamond makedb --in %s --db %s.dmnd" %\
                        (f2.name, f2.name))
             with tempfile.NamedTemporaryFile(
                     prefix='graftm_decoy_test') as f3:
                 with tempfile.NamedTemporaryFile(
                         prefix='graftm_decoy_test') as f4:
                     f3.write(self.eg1)
                     f3.flush()
                     ret = DecoyFilter(Diamond(f2.name + ".dmnd"),
                                       Diamond(f1.name + ".dmnd")).filter(
                                           f1.name, f4.name)
                     self.assertEqual(True, ret)
                     seqs = SequenceIO().read_fasta_file(f4.name)
                     self.assertEqual(1, len(seqs))
                     self.assertEqual("PROKKA_03952", seqs[0].name)
             # clean up
             os.remove(f1.name + ".dmnd")
             os.remove(f2.name + ".dmnd")
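DecoyFilter is constructed from Diamond searchers over two databases, one that includes decoy sequences and one that does not, and only sequences whose best hit is not a decoy survive the filter. A pure-Python sketch of that best-hit decision rule (the tuple format is illustrative, not DecoyFilter's actual interface):

def non_decoy_queries(hits, decoy_ids):
    """hits: iterable of (query, subject, bitscore) tuples from a search
    against a database mixing real and decoy subjects. Keep each query
    whose best-scoring subject is not a decoy."""
    best = {}
    for query, subject, score in hits:
        if query not in best or score > best[query][1]:
            best[query] = (subject, score)
    return {q for q, (subject, _) in best.items() if subject not in decoy_ids}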
Example #6
    def extract_and_read(self, reads_to_extract, database_fasta_file):
        '''Extract the reads_to_extract from the database_fasta_file and return them.

        Parameters
        ----------
        reads_to_extract: Iterable of str
            IDs of reads to be extracted
        database_fasta_file: str
            path to the fasta file containing the reads

        Returns
        -------
        An array of graftm.sequence_io.Sequence objects'''
        cmd = "fxtract -XH -f /dev/stdin '%s'" % database_fasta_file

        process = subprocess.Popen(["bash", "-c", cmd],
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE)
        output, error = process.communicate('\n'.join(reads_to_extract))

        if process.returncode != 0:
            raise Exception(
                "Extraction command '%s' failed with exitstatus %i" %
                (cmd, process.returncode))

        seqs = []
        for name, seq, _ in SequenceIO().each(StringIO(output)):
            seqs.append(Sequence(name, seq))
        return seqs
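One porting note: communicate() is handed a str here. On Python 3 the pipe carries bytes unless a text mode is requested, so the call above would fail; a sketch of the same fxtract invocation written for Python 3:

import subprocess

def extract_reads(read_ids, database_fasta_file):
    # text=True gives str pipes, so '\n'.join(read_ids) can be passed
    # directly to fxtract's /dev/stdin name list, as in the code above.
    cmd = "fxtract -XH -f /dev/stdin '%s'" % database_fasta_file
    process = subprocess.Popen(['bash', '-c', cmd],
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               text=True)
    output, _ = process.communicate('\n'.join(read_ids))
    if process.returncode != 0:
        raise Exception("Extraction command '%s' failed with exitstatus %i" %
                        (cmd, process.returncode))
    return output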
Example #7
    def _test_package(self, package_path):
        '''Give a GraftM package a spin, and see if it works in reality with default
        parameters (i.e. pplacer). If it does not work, then raise an error.

        Parameters
        ----------
        package_path: str
            path to graftm_package to be tested
        '''
        pkg = GraftMPackage.acquire(package_path)
        with tempdir.TempDir() as graftM_graft_test_dir_name:
            # Take a subset of sequences for testing
            with tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as tf:
                seqio = SequenceIO()
                with open(pkg.unaligned_sequence_database_path()) as f:
                    seqio.write_fasta(
                        itertools.islice(seqio.each_sequence(f), 10), tf)
                tf.flush()
                cmd = "graftM graft --forward %s --graftm_package %s --output_directory %s --force" % (
                    tf.name, package_path, graftM_graft_test_dir_name)
                extern.run(cmd)
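The subset is taken with itertools.islice, so only the first ten records of a potentially large sequence database are ever parsed. The same lazy-subsetting idea with a plain generator ('db.fa' is a hypothetical input path):

import itertools

def fasta_records(lines):
    """Lazily yield (name, sequence) pairs from FASTA-formatted lines."""
    name, chunks = None, []
    for line in lines:
        if line.startswith('>'):
            if name is not None:
                yield name, ''.join(chunks)
            name, chunks = line[1:].strip(), []
        else:
            chunks.append(line.strip())
    if name is not None:
        yield name, ''.join(chunks)

with open('db.fa') as f:  # hypothetical path
    first_ten = list(itertools.islice(fasta_records(f), 10))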
Example #8
    def extract_and_read(self, reads_to_extract, database_fasta_file):
        '''Extract the reads_to_extract from the database_fasta_file and return them.

        Parameters
        ----------
        reads_to_extract: Iterable of str
            IDs of reads to be extracted
        database_fasta_file: str
            path to the fasta file containing the reads

        Returns
        -------
        An array of graftm.sequence_io.Sequence objects'''
        cmd = "mfqe --output-uncompressed --fasta-read-name-lists /dev/stdin --input-fasta '{}' --output-fasta-files /dev/stdout".format(
            database_fasta_file)

        # Retrieve each sequence exactly once so mfqe does not croak
        output = extern.run(cmd, stdin='\n'.join(set(reads_to_extract)))

        seqs = []
        for name, seq, _ in SequenceIO().each(StringIO(output)):
            seqs.append(Sequence(name, seq))
        return seqs
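The set() in the stdin payload is what keeps mfqe from seeing the same read name twice. Sorting the set as well would make the payload deterministic between runs, since set iteration order is not guaranteed; a tiny illustration:

reads_to_extract = ['readA', 'readB', 'readA']
# Each name exactly once (as mfqe requires), in a stable order.
stdin_payload = '\n'.join(sorted(set(reads_to_extract)))
assert stdin_payload == 'readA\nreadB'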
Example #9
    def __init__(self):
        self.clust = Deduplicator()
        self.seqio = SequenceIO()
        self.seq_library = {}

        self.orfm_regex = OrfM.regular_expression()
Example #10
class Clusterer:
    def __init__(self):
        self.clust = Deduplicator()
        self.seqio = SequenceIO()
        self.seq_library = {}

        self.orfm_regex = OrfM.regular_expression()

    def uncluster_annotations(self, input_annotations, reverse_pipe):
        '''
        Update the annotations hash provided by pplacer to include all
        representatives within each cluster

        Parameters
        ----------
        input_annotations : hash
            Classifications for each representative sequence of the clusters,
            each key being the sequence name and the value being the taxonomy
            string as a list.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being followed.

        Returns
        -------
        output_annotations : hash
            An updated version of the above, which includes all reads from
            each cluster
        '''
        output_annotations = {}
        for placed_alignment_file_path, clusters in self.seq_library.items():

            if reverse_pipe and placed_alignment_file_path.endswith(
                    "_reverse_clustered.fa"):
                continue
            placed_alignment_file = os.path.basename(
                placed_alignment_file_path)
            cluster_classifications = input_annotations[placed_alignment_file]

            if reverse_pipe:
                placed_alignment_base = placed_alignment_file.replace(
                    '_forward_clustered.fa', '')
            else:
                placed_alignment_base = placed_alignment_file.replace(
                    '_clustered.fa', '')
            output_annotations[placed_alignment_base] = {}
            for rep_read_name, rep_read_taxonomy in cluster_classifications.items(
            ):

                if reverse_pipe:
                    orfm_regex = OrfM.regular_expression()
                    clusters = {(orfm_regex.match(key).groups(0)[0]
                                 if orfm_regex.match(key) else key): item
                                for key, item in iter(clusters.items())}
                for read in clusters[rep_read_name]:
                    output_annotations[placed_alignment_base][
                        read.name] = rep_read_taxonomy

        return output_annotations

    def cluster(self, input_fasta_list, reverse_pipe):
        '''
        cluster - Clusters reads at 100% identity and writes them to file.
        The returned paths, one per input, point to FASTA files containing
        the cluster representatives, and replace the input_fasta paths
        downstream.

        Parameters
        ----------
        input_fasta_list : list
            list of strings, each a path to input fasta files to be clustered.
        reverse_pipe : bool
            True/False, whether the reverse reads pipeline is being followed.
        Returns
        -------
        output_fasta_list : list
            list of strings, each a path to the output fasta file to which
            clusters were written.
        '''
        output_fasta_list = []
        for input_fasta in input_fasta_list:
            output_path = input_fasta.replace('_hits.aln.fa', '_clustered.fa')
            cluster_dict = {}

            logging.debug('Clustering reads')
            if os.path.exists(input_fasta):
                reads = self.seqio.read_fasta_file(
                    input_fasta)  # Read in FASTA records
                logging.debug('Found %i reads' %
                              len(reads))  # Report number found
                clusters = self.clust.deduplicate(
                    reads)  # Cluster redundant sequences
                logging.debug('Clustered to %s groups' %
                              len(clusters))  # Report number of clusters
                logging.debug(
                    'Writing representative sequences of each cluster to: %s' %
                    output_path)  # Report the name of the file
            else:
                logging.debug("Found no reads to be clustered")
                clusters = []

            self.seqio.write_fasta_file(
                [x[0] for x in clusters], output_path
            )  # Choose the first sequence to write to file as representative (all the same anyway)
            for cluster in clusters:
                cluster_dict[cluster[
                    0].name] = cluster  # assign the cluster to the dictionary
            self.seq_library[output_path] = cluster_dict

            output_fasta_list.append(output_path)

        return output_fasta_list
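Deduplicator.deduplicate evidently returns groups of identical reads, with the first member of each group serving as the representative that cluster() writes out. A minimal sketch of such 100%-identity grouping (assuming read objects with .name and .seq attributes; the real class's internals may differ):

def deduplicate(reads):
    # Group reads whose sequences are identical; each group's first
    # member acts as the representative, as cluster() assumes above.
    groups = {}
    for read in reads:
        groups.setdefault(read.seq, []).append(read)
    return list(groups.values())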
Example #11
    def generate_expand_search_database_from_contigs(self, contig_files,
                                                     output_database_file,
                                                     search_method):
        '''Given a collection of search_hmm_files, search the contigs in 
        contig_files, and generate an HMM from the resulting hits, outputting
        it as output_database_file.
        
        Parameters
        ----------
        contig_files: list of str
            list of files to search
        output_database_file: str
            path to output file
        search_method: str
            "diamond" or "hmmsearch", to specify search method to use and what
            type of database to build. 
        
        Returns
        -------
        True if genes were recovered, else False'''

        ss = SequenceSearcher(self.search_hmm_files)
        seqio = SequenceIO()
        if search_method == self.DIAMOND_SEARCH_METHOD:
            if self.diamond_database is None or self.unaligned_sequence_database is None:
                logging.warning(
                    "Cannot continue with expand_search: no diamond database "
                    "or unaligned sequences are available.")
                return False

        with tempfile.NamedTemporaryFile(
                prefix='graftm_expand_search_orfs') as orfs:
            logging.info("Finding expand_search hits in provided contigs..")
            for contig_file in contig_files:
                logging.debug("Finding expand_search hits in %s.." %
                              contig_file)
                unpack = UnpackRawReads(contig_file)

                with tempfile.NamedTemporaryFile(prefix='graftm_expand_search') as \
                                                        hit_reads_orfs_fasta:
                    # search and extract matching ORFs
                    with tempfile.NamedTemporaryFile(prefix='graftm_expand_search2') as \
                                                        hmmsearch_output_table:
                        with tempfile.NamedTemporaryFile(prefix='graftm_expand_search3') as \
                                                        hit_reads_fasta:
                            ss.search_and_extract_orfs_matching_protein_database(\
                                    unpack,
                                    search_method,
                                    self.maximum_range,
                                    self.threads,
                                    self.evalue,
                                    self.min_orf_length,
                                    None,
                                    (self.diamond_database if self.diamond_database else None),
                                    hmmsearch_output_table.name,
                                    hit_reads_fasta.name,
                                    hit_reads_orfs_fasta.name)
                    # Append to the file
                    shutil.copyfileobj(open(hit_reads_orfs_fasta.name), orfs)

            # Now have a fasta file of ORFs.
            # Check to make sure the file is not zero-length
            orfs.flush()

            with tempfile.NamedTemporaryFile(
                    prefix="graftm_expand_search_aln") as aln:

                if search_method == self.HMM_SEARCH_METHOD:

                    # Check that there is more than one sequence to align,
                    # otherwise mafft will fail, causing a crash when
                    # hmmbuild is run on an empty alignment file.
                    if len(seqio.read_fasta_file(orfs.name)) <= 1:
                        logging.warn(
                            "Failed to find two or more matching ORFs in the expand_search contigs"
                        )
                        return False

                    # Run mafft to align them
                    cmd = "mafft --auto %s >%s" % (orfs.name, aln.name)
                    logging.info("Aligning expand_search hits..")
                    extern.run(cmd)

                    # Run hmmbuild to create an HMM
                    cmd = "hmmbuild --amino %s %s >/dev/null" % (
                        output_database_file, aln.name)
                    logging.info("Building HMM from expand_search hits..")

                    extern.run(cmd)

                elif search_method == self.DIAMOND_SEARCH_METHOD:

                    # Concatenate database with existing database
                    with tempfile.NamedTemporaryFile(
                            prefix="concatenated_database") as databasefile:
                        for f in [orfs.name, self.unaligned_sequence_database]:
                            for line in open(f):
                                databasefile.write(line)
                        databasefile.flush()

                        # Run diamond make to create a diamond database
                        cmd = "diamond makedb --in '%s' -d '%s'" % (
                            databasefile.name, output_database_file)
                        logging.info(
                            "Building a diamond database from expand_search hits.."
                        )
                        extern.run(cmd)

                else:
                    raise Exception("Search method not recognised: %s" %
                                    search_method)

                return True
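In the diamond branch, the new ORFs are concatenated with the existing unaligned sequence database line by line before diamond makedb runs. shutil.copyfileobj performs the same concatenation in buffered chunks; a sketch:

import shutil

def concatenate_fastas(paths, output_path):
    # Buffered equivalent of the per-line concatenation loop above;
    # the combined file is then handed to `diamond makedb --in`.
    with open(output_path, 'w') as out:
        for path in paths:
            with open(path) as f:
                shutil.copyfileobj(f, out)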
Example #12
    if args.debug:  # assumed guard: the source excerpt begins at the "else" below
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO
    logging.basicConfig(level=loglevel,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    # Read in taxonomy
    logging.info("Reading taxonomy..")
    gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy
    logging.info("Read in %i taxonomies" % len(gg))

    # Read in sequence
    logging.info("Reading sequences..")
    duplicates = set()
    sequences = {}
    for name, seq, _ in SequenceIO()._readfq(open(args.sequences)):
        if name in sequences:
            logging.error("Duplicate sequence name %s" % name)
            duplicates.add(name)
        else:
            sequences[name] = seq
    logging.warn("Found %i duplicated IDs" % len(duplicates))
    for dup in duplicates:
        del sequences[dup]
    logging.info("Read in %i sequences" % len(sequences))

    # Ensure that each sequence in the taxonomy has an associated sequence,
    # otherwise delete it
    tax_no_seq = set()
    for name, taxonomy in gg.items():
        if name not in sequences:
            tax_no_seq.add(name)
    # Assumed completion: the source excerpt breaks off above. Per the
    # comment, taxonomy entries without an associated sequence are dropped.
    for name in tax_no_seq:
        del gg[name]
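The reconciliation this script performs, dropping taxonomy entries that lack a sequence, can also be written with set arithmetic; a small self-contained illustration with made-up entries:

gg = {'seqA': ['mcrA'], 'seqB': ['mcrA']}   # taxonomy entries
sequences = {'seqA': 'MKV'}                 # parsed sequences
for name in set(gg) - set(sequences):       # taxonomy without a sequence
    del gg[name]
assert list(gg) == ['seqA']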
Example #13
    def main(self, **kwargs):
        alignment = kwargs.pop('alignment',None)
        sequences = kwargs.pop('sequences',None)
        taxonomy = kwargs.pop('taxonomy',None)
        rerooted_tree = kwargs.pop('rerooted_tree',None)
        unrooted_tree = kwargs.pop('unrooted_tree',None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files',None)
        min_aligned_percent = kwargs.pop('min_aligned_percent',0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force',False)
        graftm_package = kwargs.pop('graftm_package',False)
        dereplication_level = kwargs.pop('dereplication_level',False)
        threads = kwargs.pop('threads',5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0]
                      if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []
        tempfiles_to_close = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" % output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception("Cowardly refusing to overwrite gpkg to already existing %s" % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info("Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info("Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info("Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception("Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception("Found duplicate sequence name '%s' in sequences input file" % dup)
        output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
        tempfiles_to_close.append(output_alignment_fh)
        output_alignment = output_alignment_fh.name
        if user_hmm:
            align_hmm = user_hmm
        else:
            align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_align.hmm')
            tempfiles_to_close.append(align_hmm_fh)
            align_hmm = align_hmm_fh.name

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception("Found duplicate sequence name '%s' in alignment input file" % dup)
            ptype = self._get_hmm_from_alignment(alignment,
                                                 align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(sequences, alignment, user_hmm,
                                               align_hmm, output_alignment, threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                 min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn("One or more alignments do not span > %.2f %% of HMM" % (min_aligned_percent*100))
            for s in insufficiently_aligned_sequences:
                logging.warn("Insufficient alignment of %s, not including this sequence" % s)

            sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
            tempfiles_to_close.append(sequences2_fh)
            sequences2 = sequences2_fh.name
            num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                  sequences,
                                                                  sequences2)
            sequences = sequences2

            if alignment:
                alignment2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
                tempfiles_to_close.append(alignment2_fh)
                alignment2 = alignment2_fh.name
                num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                      alignment,
                                                                      alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning('''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) ''' % name)
                    removed_sequence_names.append(name)


            logging.info("After removing %i insufficiently aligned sequences, left with %i sequences" % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception("Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i" % num_sequences)
            else:
                logging.info("Reconstructing the alignment and HMM from remaining sequences")
                output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
                tempfiles_to_close.append(output_alignment_fh)
                output_alignment = output_alignment_fh.name
                if not user_hmm:
                    align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.hmm')
                    tempfiles_to_close.append(align_hmm_fh)
                    align_hmm = align_hmm_fh.name
                ptype, output_alignment= self._align_and_create_hmm(sequences, alignment, user_hmm,
                                                   align_hmm, output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                         min_aligned_percent)
        if not search_hmm_files:
            search_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_search.hmm')
            tempfiles_to_close.append(search_hmm_fh)
            search_hmm = search_hmm_fh.name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm, dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error("Unable to find sequence '%s' in the taxonomy definition" % s)
            raise Exception("All sequences must be assigned a taxonomy, cannot continue")


        logging.debug("Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base+"_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))

        # Get corresponding unaligned sequences
        filtered_names = []
        for duplicate_seqs in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
            for seq in duplicate_seqs:
                filtered_names.append(seq.name)
        sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
        tempfiles_to_close.append(sequences2_fh)
        sequences2 = sequences2_fh.name


        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype,
                                                  self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" % rerooted_tree)
                tre_file=rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" % rerooted_annotated_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise


            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                [removed_sequence_names.append(s.name) for s in group[1:]]
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(\
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree_log', prefix='graftm')
                tempfiles_to_close.append(log_file_tempfile)
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm')
                tempfiles_to_close.append(tre_file_tempfile)
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name, deduplicated_alignment_file,
                                                 tre_file, log_file, ptype, self.fasttree)

        # Create tax and seqinfo .csv files
        taxonomy_to_keep = [
            seq.name for seq in
            [x for x in [x[0] for x in deduplicated_arrays] if x]
        ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base+"_seqinfo.csv"
            tax = base+"_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info("Building seqinfo and taxonomy file from input annotated tree")
                taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info("Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            else:
                raise Exception("Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

            taxonomy_definition = {x:taxonomy_definition[x]
                                   for x in taxonomy_definition
                                   if x in taxonomy_to_keep}

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition,
                                                  tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else: raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm, diamondb,
                                      max_range, sequences, search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)
        for tf in tempfiles_to_close:
            tf.close()

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")
Example #14
    def main(self, **kwargs):
        alignment = kwargs.pop('alignment', None)
        sequences = kwargs.pop('sequences', None)
        taxonomy = kwargs.pop('taxonomy', None)
        rerooted_tree = kwargs.pop('rerooted_tree', None)
        unrooted_tree = kwargs.pop('unrooted_tree', None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files', None)
        min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force', False)
        graftm_package = kwargs.pop('graftm_package', False)
        dereplication_level = kwargs.pop('dereplication_level', False)
        threads = kwargs.pop('threads', 5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0] if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" %
                             output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception(
                    "Cowardly refusing to overwrite gpkg to already existing %s"
                    % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info(
                "Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info(
                "Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(
                taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info(
                "Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception(
                "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
            )

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception(
                "Found duplicate sequence name '%s' in sequences input file" %
                dup)
        output_alignment = tempfile.NamedTemporaryFile(prefix='graftm',
                                                       suffix='.aln.faa').name
        align_hmm = (user_hmm if user_hmm else tempfile.NamedTemporaryFile(
            prefix='graftm', suffix='_align.hmm').name)

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception(
                    "Found duplicate sequence name '%s' in alignment input file"
                    % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(
            open(output_alignment), min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn(
                "One or more alignments do not span > %.2f %% of HMM" %
                (min_aligned_percent * 100))
            for s in insufficiently_aligned_sequences:
                logging.warn(
                    "Insufficient alignment of %s, not including this sequence"
                    % s)

            _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                _, alignment2 = tempfile.mkstemp(prefix='graftm',
                                                 suffix='.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            '''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) '''
                            % name)
                    removed_sequence_names.append(name)

            logging.info(
                "After removing %i insufficiently aligned sequences, left with %i sequences"
                % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception(
                    "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                    % num_sequences)
            else:
                logging.info(
                    "Reconstructing the alignment and HMM from remaining sequences"
                )
                output_alignment = tempfile.NamedTemporaryFile(
                    prefix='graftm', suffix='.aln.faa').name
                if not user_hmm:
                    align_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                            suffix='.hmm').name
                ptype, output_alignment = self._align_and_create_hmm(
                    sequences, alignment, user_hmm, align_hmm,
                    output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(
                    open(output_alignment), min_aligned_percent)
        if not search_hmm_files:
            search_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                     suffix='_search.hmm').name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error(
                    "Unable to find sequence '%s' in the taxonomy definition" %
                    s)
            raise Exception(
                "All sequences must be assigned a taxonomy, cannot continue")

        logging.debug(
            "Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays,
                                                   taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))
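        # Illustrative structure (hypothetical names): each element of
        # deduplicated_arrays is a list whose first entry is the
        # representative sequence and whose remainder are its exact
        # duplicates, e.g. [[seqA, seqA_dup1], [seqB]], so group[0] is kept
        # and group[1:] are discarded below.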

        # Get corresponding unaligned sequences
        filtered_names = []
        for group in deduplicated_arrays:
            for seq in group[1:]:
                filtered_names.append(seq.name)
        _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')

        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise Exception(
                    "Programming error: expected a rerooted, rerooted annotated, or unrooted tree"
                )

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                for s in group[1:]:
                    removed_sequence_names.append(s.name)
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Sanity-check that the tree tip names now match the alignment, so
            # any mismatch is reported clearly here rather than failing
            # cryptically later.
            cleaner.match_alignment_and_tree_sequence_ids(
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree_log', prefix='graftm')
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree', prefix='graftm')
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree',
                                                 prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name,
                                                 deduplicated_alignment_file,
                                                 tre_file, log_file, ptype,
                                                 self.fasttree)
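                # (Assumption: the log file is required because pplacer
                # reference packages need tree statistics; when the user did
                # not supply one, it is presumably regenerated by re-evaluating
                # the fixed topology with FastTree.)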

        # Create tax and seqinfo .csv files
        taxonomy_to_keep = [
            group[0].name for group in deduplicated_arrays if group[0]
        ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info(
                    "Building seqinfo and taxonomy files from the input annotated tree"
                )
                taxonomy_definition = TaxonomyExtractor(
                ).taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info(
                    "Building seqinfo and taxonomy files from the input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(
                    taxonomy).taxonomy
            else:
                raise Exception(
                    "Programming error: taxonomy must be supplied, e.g. via --taxonomy or --rerooted_annotated_tree"
                )

            taxonomy_definition = {
                x: taxonomy_definition[x]
                for x in taxonomy_definition if x in taxonomy_to_keep
            }

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
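                # diamond makedb appends '.dmnd' to the basename given via -d,
                # hence the database path constructed below.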
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
        else:
            diamondb = None

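        # (Assumption: _define_range estimates the maximum span a single hit
        # to this gene can cover; graft presumably uses this to decide whether
        # nearby hits belong to the same gene.)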
        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path,
                                      refpkg,
                                      align_hmm,
                                      diamondb,
                                      max_range,
                                      sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")
Ejemplo n.º 20
0
    def _assign_taxonomy_with_diamond(self, base_list, db_search_results,
                                      graftm_package, graftm_files,
                                      diamond_performance_parameters):
        '''Run diamond to assign taxonomy

        Parameters
        ----------
        base_list: list of str
            list of sequence block names
        db_search_results: list of DBSearchResult
            the result of running hmmsearches
        graftm_package: GraftMPackage object
            Diamond is run against this database
        graftm_files: GraftMFiles object
            Result files are written here
        diamond_performance_parameters : str
            extra args for DIAMOND

        Returns
        -------
        dict of base_list entry to dict of read names to taxonomies. Reads
            recovered by the initial search but not hit by DIAMOND are
            assigned ['Root'] only.
        '''
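        # Illustrative return value (hypothetical read and taxon names):
        #   {'sample1': {'read_1': ['Root', 'k__Archaea', 'p__Euryarchaeota'],
        #                'read_2': ['Root']}}  # read_2 had no DIAMOND hit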
        runner = Diamond(graftm_package.diamond_database_path(),
                         self.args.threads, self.args.evalue)
        taxonomy_definition = Getaxnseq().read_taxtastic_taxonomy_and_seqinfo(
            open(graftm_package.taxtastic_taxonomy_path()),
            open(graftm_package.taxtastic_seqinfo_path()))
        results = {}

        # For each of the search results,
        for i, search_result in enumerate(db_search_results):
            if search_result.hit_fasta() is None:
                sequence_id_to_taxonomy = {}
            else:
                sequence_id_to_hit = {}
                # Run diamond
                logging.debug("Running diamond on %s" %
                              search_result.hit_fasta())
                daa_basename = graftm_files.diamond_assignment_output_basename(
                    base_list[i])
                diamond_result = runner.run(
                    search_result.hit_fasta(),
                    UnpackRawReads.PROTEIN_SEQUENCE_TYPE,
                    daa_file_basename=daa_basename,
                    extra_args=diamond_performance_parameters)
                for res in diamond_result.each([
                        SequenceSearchResult.QUERY_ID_FIELD,
                        SequenceSearchResult.HIT_ID_FIELD
                ]):
                    if res[0] in sequence_id_to_hit:
                        # Conflicting hits for the same query are unexpected
                        if sequence_id_to_hit[res[0]] != res[1]:
                            raise Exception(
                                "DIAMOND unexpectedly returned two different hits for query sequence %s"
                                % res[0])
                    else:
                        sequence_id_to_hit[res[0]] = res[1]

                # Extract the taxonomy of each read's best hit, and record
                # reads that had no hit
                sequence_id_to_taxonomy = {}
                for record in SequenceIO().read_fasta_file(
                        search_result.hit_fasta()):
                    name = record.name
                    if name in sequence_id_to_hit:
                        # Prepend 'Root' to match the pplacer assignment convention
                        sequence_id_to_taxonomy[name] = [
                            'Root'
                        ] + taxonomy_definition[sequence_id_to_hit[name]]
                    else:
                        # The read was recovered by the initial search (e.g.
                        # hmmsearch) but DIAMOND found no hit, so only 'Root'
                        # can be assigned
                        sequence_id_to_taxonomy[name] = ['Root']

            results[base_list[i]] = sequence_id_to_taxonomy
        return results