Python Deduplicator.Deduplicator Examples

Programming Language: Python

Namespace/Package Name: graftm.deduplicator

Class/Type: Deduplicator

Method/Function: Deduplicator

Examples at hotexamples.com: 3

Python Deduplicator.Deduplicator - 3 examples found. These are the top-rated real-world Python examples of graftm.deduplicator.Deduplicator.Deduplicator, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Deduplicator(3)

deduplicate(2)

lca_taxonomy(1)

Example #1

Show file

File: test_deduplicator.py Project: xvazquezc/graftM

 def setUp(self):
     """Create a fresh Deduplicator for the test, also reachable as ``self.d``."""
     unittest.TestCase.setUp(self)
     dedup = Deduplicator()
     # Both attributes refer to the same instance; ``d`` is a short alias.
     self.deduplicator = dedup
     self.d = dedup

Example #2

Show file

    def __init__(self):
        """Set up the helper objects and an empty sequence library."""
        # Helper used for clustering/deduplicating sequences.
        self.clust = Deduplicator()
        # Helper used for sequence file I/O.
        self.seqio = SequenceIO()
        # Starts empty; filled in elsewhere.
        self.seq_library = dict()
        # Regular expression supplied by OrfM.
        self.orfm_regex = OrfM.regular_expression()

Example #3

Show file

    def main(self, **kwargs):
        alignment = kwargs.pop('alignment', None)
        sequences = kwargs.pop('sequences', None)
        taxonomy = kwargs.pop('taxonomy', None)
        rerooted_tree = kwargs.pop('rerooted_tree', None)
        unrooted_tree = kwargs.pop('unrooted_tree', None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files', None)
        min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force', False)
        graftm_package = kwargs.pop('graftm_package', False)
        dereplication_level = kwargs.pop('dereplication_level', False)
        threads = kwargs.pop('threads', 5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0] if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" %
                             output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception(
                    "Cowardly refusing to overwrite gpkg to already existing %s"
                    % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info(
                "Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info(
                "Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(
                taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info(
                "Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception(
                "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
            )

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception(
                "Found duplicate sequence name '%s' in sequences input file" %
                dup)
        output_alignment = tempfile.NamedTemporaryFile(prefix='graftm',
                                                       suffix='.aln.faa').name
        align_hmm = (user_hmm if user_hmm else tempfile.NamedTemporaryFile(
            prefix='graftm', suffix='_align.hmm').name)

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception(
                    "Found duplicate sequence name '%s' in alignment input file"
                    % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(
            open(output_alignment), min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn(
                "One or more alignments do not span > %.2f %% of HMM" %
                (min_aligned_percent * 100))
            for s in insufficiently_aligned_sequences:
                logging.warn(
                    "Insufficient alignment of %s, not including this sequence"
                    % s)

            _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                _, alignment2 = tempfile.mkstemp(prefix='graftm',
                                                 suffix='.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            '''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) '''
                            % name)
                    removed_sequence_names.append(name)

            logging.info(
                "After removing %i insufficiently aligned sequences, left with %i sequences"
                % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception(
                    "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                    % num_sequences)
            else:
                logging.info(
                    "Reconstructing the alignment and HMM from remaining sequences"
                )
                output_alignment = tempfile.NamedTemporaryFile(
                    prefix='graftm', suffix='.aln.faa').name
                if not user_hmm:
                    align_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                            suffix='.hmm').name
                ptype, output_alignment = self._align_and_create_hmm(
                    sequences, alignment, user_hmm, align_hmm,
                    output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(
                    open(output_alignment), min_aligned_percent)
        if not search_hmm_files:
            search_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                     suffix='_search.hmm').name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error(
                    "Unable to find sequence '%s' in the taxonomy definition" %
                    s)
            raise Exception(
                "All sequences must be assigned a taxonomy, cannot continue")

        logging.debug(
            "Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays,
                                                   taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))

        # Get corresponding unaligned sequences
        filtered_names = []
        for list in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
            for seq in list:
                filtered_names.append(seq.name)
        _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')

        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                [removed_sequence_names.append(s.name) for s in group[1:]]
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(\
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree_log', prefix='graftm')
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree', prefix='graftm')
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree',
                                                 prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name,
                                                 deduplicated_alignment_file,
                                                 tre_file, log_file, ptype,
                                                 self.fasttree)

        # Create tax and seqinfo .csv files
        taxonomy_to_keep = [
            seq.name
            for seq in [x for x in [x[0] for x in deduplicated_arrays] if x]
        ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info(
                    "Building seqinfo and taxonomy file from input annotated tree"
                )
                taxonomy_definition = TaxonomyExtractor(
                ).taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info(
                    "Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(
                    taxonomy).taxonomy
            else:
                raise Exception(
                    "Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
                )

            taxonomy_definition = {
                x: taxonomy_definition[x]
                for x in taxonomy_definition if x in taxonomy_to_keep
            }

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path,
                                      refpkg,
                                      align_hmm,
                                      diamondb,
                                      max_range,
                                      sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")