Example #1
0
 def setUp(self):
     """Prepare a fresh Deduplicator for every test.

     The instance is stored as ``self.deduplicator`` and also under the
     shorter alias ``self.d``; both names refer to the same object.
     """
     unittest.TestCase.setUp(self)
     self.deduplicator = self.d = Deduplicator()
Example #2
0
    def __init__(self):
        """Set up the collaborators and per-instance state.

        Attributes set here:
            clust: a Deduplicator instance.
            seqio: a SequenceIO instance.
            seq_library: an initially empty dict.
            orfm_regex: whatever ``OrfM.regular_expression()`` returns.
        """
        deduplicator = Deduplicator()
        self.clust = deduplicator
        sequence_io = SequenceIO()
        self.seqio = sequence_io
        self.seq_library = {}
        orfm_pattern = OrfM.regular_expression()
        self.orfm_regex = orfm_pattern
Example #3
0
    def main(self, **kwargs):
        alignment = kwargs.pop('alignment', None)
        sequences = kwargs.pop('sequences', None)
        taxonomy = kwargs.pop('taxonomy', None)
        rerooted_tree = kwargs.pop('rerooted_tree', None)
        unrooted_tree = kwargs.pop('unrooted_tree', None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files', None)
        min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force', False)
        graftm_package = kwargs.pop('graftm_package', False)
        dereplication_level = kwargs.pop('dereplication_level', False)
        threads = kwargs.pop('threads', 5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0] if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" %
                             output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception(
                    "Cowardly refusing to overwrite gpkg to already existing %s"
                    % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info(
                "Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info(
                "Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(
                taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info(
                "Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception(
                "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
            )

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception(
                "Found duplicate sequence name '%s' in sequences input file" %
                dup)
        output_alignment = tempfile.NamedTemporaryFile(prefix='graftm',
                                                       suffix='.aln.faa').name
        align_hmm = (user_hmm if user_hmm else tempfile.NamedTemporaryFile(
            prefix='graftm', suffix='_align.hmm').name)

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception(
                    "Found duplicate sequence name '%s' in alignment input file"
                    % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(
            open(output_alignment), min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn(
                "One or more alignments do not span > %.2f %% of HMM" %
                (min_aligned_percent * 100))
            for s in insufficiently_aligned_sequences:
                logging.warn(
                    "Insufficient alignment of %s, not including this sequence"
                    % s)

            _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                _, alignment2 = tempfile.mkstemp(prefix='graftm',
                                                 suffix='.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            '''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) '''
                            % name)
                    removed_sequence_names.append(name)

            logging.info(
                "After removing %i insufficiently aligned sequences, left with %i sequences"
                % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception(
                    "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                    % num_sequences)
            else:
                logging.info(
                    "Reconstructing the alignment and HMM from remaining sequences"
                )
                output_alignment = tempfile.NamedTemporaryFile(
                    prefix='graftm', suffix='.aln.faa').name
                if not user_hmm:
                    align_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                            suffix='.hmm').name
                ptype, output_alignment = self._align_and_create_hmm(
                    sequences, alignment, user_hmm, align_hmm,
                    output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(
                    open(output_alignment), min_aligned_percent)
        if not search_hmm_files:
            search_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                     suffix='_search.hmm').name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error(
                    "Unable to find sequence '%s' in the taxonomy definition" %
                    s)
            raise Exception(
                "All sequences must be assigned a taxonomy, cannot continue")

        logging.debug(
            "Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays,
                                                   taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))

        # Get corresponding unaligned sequences
        filtered_names = []
        for list in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
            for seq in list:
                filtered_names.append(seq.name)
        _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')

        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                [removed_sequence_names.append(s.name) for s in group[1:]]
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(\
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree_log', prefix='graftm')
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree', prefix='graftm')
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree',
                                                 prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name,
                                                 deduplicated_alignment_file,
                                                 tre_file, log_file, ptype,
                                                 self.fasttree)

        # Create tax and seqinfo .csv files
        taxonomy_to_keep = [
            seq.name
            for seq in [x for x in [x[0] for x in deduplicated_arrays] if x]
        ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info(
                    "Building seqinfo and taxonomy file from input annotated tree"
                )
                taxonomy_definition = TaxonomyExtractor(
                ).taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info(
                    "Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(
                    taxonomy).taxonomy
            else:
                raise Exception(
                    "Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
                )

            taxonomy_definition = {
                x: taxonomy_definition[x]
                for x in taxonomy_definition if x in taxonomy_to_keep
            }

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path,
                                      refpkg,
                                      align_hmm,
                                      diamondb,
                                      max_range,
                                      sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")