Python Getaxnseq Examples

Programming Language: Python

Namespace/Package Name: graftm.getaxnseq

Class/Type: Getaxnseq

Examples at hotexamples.com: 14

Python Getaxnseq - 14 examples found. These are the top-rated real-world Python examples of graftm.getaxnseq.Getaxnseq extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

Getaxnseq(11)

read_taxtastic_taxonomy_and_seqinfo(3)

write_taxonomy_and_seqinfo_files(2)

Example #1

Show file

 def taxonomy_hash(self):
     '''Return the taxonomy of this package as a hash of name: taxonomy,
     each taxonomy being an array of strings. The hash is parsed from the
     taxtastic taxonomy and seqinfo files of the package.'''
     parser = Getaxnseq()
     # A single with-statement opens the taxonomy file first and the seqinfo
     # file second, and closes both once parsing has returned.
     with open(self.taxtastic_taxonomy_path()) as taxonomy_io, \
             open(self.taxtastic_seqinfo_path()) as seqinfo_io:
         parsed = parser.read_taxtastic_taxonomy_and_seqinfo(
             taxonomy_io, seqinfo_io)
         return parsed

Example #2

Show file

 def test_hello_world(self):
     '''Write taxonomy and seqinfo files for two sequences, one of which
     has no taxonomy at all, and check the contents of both files.'''
     taxonomy = {'seq1': ['k__me', 'p__you'], 'seq2': []}
     seqinfo_rows = [['seqname', 'tax_id'],
                     ['seq2', 'Root'],
                     ['seq1', 'p__you']]
     taxonomy_lines = [
         "tax_id,parent_id,rank,tax_name,root,rank_0,rank_1",
         "Root,Root,root,Root,Root,,",
         "k__me,Root,rank_0,k__me,Root,k__me,",
         "p__you,k__me,rank_1,p__you,Root,k__me,p__you",
     ]

     with tempfile.NamedTemporaryFile(
             prefix='graftm_test_getaxnseq') as tmp_seq, \
             tempfile.NamedTemporaryFile(
                 prefix='graftm_test_getaxnseq') as tmp_tax:
         Getaxnseq().write_taxonomy_and_seqinfo_files(
             taxonomy, tmp_tax.name, tmp_seq.name)

         # The seqinfo file is compared as a sorted list of lines, so the
         # order in which the rows were written does not matter.
         expected_seqinfo = sorted(
             ','.join(row) + '\n' for row in seqinfo_rows)
         with open(tmp_seq.name) as seqinfo_file:
             observed_seqinfo = sorted(seqinfo_file.readlines())
             self.assertEqual(expected_seqinfo, observed_seqinfo)

         # The taxonomy file must match exactly, line order included.
         expected_taxonomy = ''.join(
             line + '\n' for line in taxonomy_lines)
         with open(tmp_tax.name) as taxonomy_file:
             self.assertEqual(expected_taxonomy, taxonomy_file.read())

Example #3

Show file

File: test_getaxnseq.py Project: jonathanylin/graftM

    def test_more_than_seven_levels(self):
        '''Check the files written for taxonomies deeper than seven levels.

        Three sequences are written: one with an eight-level taxonomy, one
        with a seven-level taxonomy and one with no taxonomy. The taxonomy
        file is expected to contain one rank_N column per level (rank_0 to
        rank_7) and each sequence is expected to be assigned its deepest
        level in the seqinfo file.
        '''
        # str.split() is used rather than string.split(), which only exists
        # on Python 2.
        taxonomy = {
            'seq1': 'k__me p__you c__came over for great spaghetti extra'.split(),
            'seq1.5': 'k__me p__you c__came over for great spaghetti'.split(),
            'seq2': [],
        }
        with tempfile.NamedTemporaryFile(prefix='graftm_test_getaxnseq') as tmp_seq:
            with tempfile.NamedTemporaryFile(prefix='graftm_test_getaxnseq') as tmp_tax:
                Getaxnseq().write_taxonomy_and_seqinfo_files(taxonomy,
                                                             tmp_tax.name,
                                                             tmp_seq.name)

                # Compare sorted lines so the assertion does not depend on
                # the order in which the sequences were written out.
                expected = sorted([','.join(p) + "\n" for p in [['seqname','tax_id'],
                    ['seq2','Root'],
                    ['seq1','extra'],
                    ['seq1.5','spaghetti']]])
                with open(tmp_seq.name) as f:
                    self.assertEqual(expected, sorted(f.readlines()))

                expected = '\n'.join(["tax_id,parent_id,rank,tax_name,root,rank_0,rank_1,rank_2,rank_3,rank_4,rank_5,rank_6,rank_7",
                                      "Root,Root,root,Root,Root,,,,,,,,",
                                      "k__me,Root,rank_0,k__me,Root,k__me,,,,,,,",
                                      "p__you,k__me,rank_1,p__you,Root,k__me,p__you,,,,,,",
                                      "c__came,p__you,rank_2,c__came,Root,k__me,p__you,c__came,,,,,",
                                      "over,c__came,rank_3,over,Root,k__me,p__you,c__came,over,,,,",
                                      "for,over,rank_4,for,Root,k__me,p__you,c__came,over,for,,,",
                                      "great,for,rank_5,great,Root,k__me,p__you,c__came,over,for,great,,",
                                      "spaghetti,great,rank_6,spaghetti,Root,k__me,p__you,c__came,over,for,great,spaghetti,",
                                      "extra,spaghetti,rank_7,extra,Root,k__me,p__you,c__came,over,for,great,spaghetti,extra"])+"\n"
                # Read through a context manager so the handle is closed
                # before the temporary file is removed.
                with open(tmp_tax.name) as f:
                    self.assertEqual(expected, f.read())

Example #4

Show file

 def test_more_than_seven_levels(self):
     '''Check the files written for taxonomies deeper than seven levels.

     Three sequences are written: one with an eight-level taxonomy, one
     with a seven-level taxonomy and one with no taxonomy. The expected
     taxonomy file has the seven named rank columns, kingdom to species,
     and the eight-level sequence is expected to be assigned its seventh
     level ('spaghetti') in the seqinfo file.
     '''
     # str.split() is used rather than string.split(), which only exists
     # on Python 2.
     taxonomy = {
         'seq1':
         'k__me p__you c__came over for great spaghetti extra'.split(),
         'seq1.5':
         'k__me p__you c__came over for great spaghetti'.split(),
         'seq2': [],
     }
     with tempfile.NamedTemporaryFile(
             prefix='graftm_test_getaxnseq') as tmp_seq:
         with tempfile.NamedTemporaryFile(
                 prefix='graftm_test_getaxnseq') as tmp_tax:
             Getaxnseq().write_taxonomy_and_seqinfo_files(
                 taxonomy, tmp_tax.name, tmp_seq.name)

             # Compare sorted lines so the assertion does not depend on
             # the order in which the sequences were written out.
             expected = sorted([
                 ','.join(p) + "\n"
                 for p in [['seqname', 'tax_id'], ['seq2', 'Root'],
                           ['seq1', 'spaghetti'], ['seq1.5', 'spaghetti']]
             ])
             with open(tmp_seq.name) as f:
                 self.assertEqual(expected, sorted(f.readlines()))

             expected = '\n'.join([
                 'tax_id,parent_id,rank,tax_name,root,kingdom,phylum,class,order,family,genus,species',
                 'Root,Root,root,Root,Root,,,,,,,',
                 'k__me,Root,kingdom,k__me,Root,k__me,,,,,,',
                 'p__you,k__me,phylum,p__you,Root,k__me,p__you,,,,,',
                 'c__came,p__you,class,c__came,Root,k__me,p__you,c__came,,,,',
                 'over,c__came,order,over,Root,k__me,p__you,c__came,over,,,',
                 'for,over,family,for,Root,k__me,p__you,c__came,over,for,,',
                 'great,for,genus,great,Root,k__me,p__you,c__came,over,for,great,',
                 'spaghetti,great,species,spaghetti,Root,k__me,p__you,c__came,over,for,great,spaghetti',
             ]) + "\n"
             # Read through a context manager so the handle is closed
             # before the temporary file is removed.
             with open(tmp_tax.name) as f:
                 self.assertEqual(expected, f.read())

Example #5

Show file

File: test_getaxnseq.py Project: jonathanylin/graftM

 def test_read_taxtastic_taxonomy_and_seqinfo(self):
     '''Parse in-memory taxtastic taxonomy and seqinfo CSVs and check the
     resulting hash of sequence name to taxonomy array.'''
     taxonomy_lines = [
         'tax_id,parent_id,rank,tax_name,root,kingdom,phylum,class,order,family,genus,species',
         'Root,Root,root,Root,Root,,,,,,,',
         'k__me,Root,kingdom,k__me,Root,k__me,,,,,,',
         'p__you,k__me,phylum,p__you,Root,k__me,p__you,,,,,',
     ]
     seqinfo_rows = [
         ['seqname', 'tax_id'],
         ['seq2', 'Root'],
         ['seq1', 'p__you'],
     ]
     # Every line, the last one included, is terminated by a newline.
     tax = StringIO(''.join(line + "\n" for line in taxonomy_lines))
     seq = StringIO(''.join(','.join(row) + "\n" for row in seqinfo_rows))

     expected = {'seq1': ['k__me', 'p__you'], 'seq2': []}
     observed = Getaxnseq().read_taxtastic_taxonomy_and_seqinfo(tax, seq)
     self.assertEqual(expected, observed)

Example #6

Show file

    def __init__(self, tree, taxonomy, seqinfo=None):
        '''
        Store the tree and read in the taxonomy that goes with it.

        Parameters
        ----------
        tree        : dendropy.Tree
            dendropy.Tree object
        taxonomy    : string
            Path to a file containing taxonomy information about the tree,
            either in Greengenes or taxtastic format (seqinfo file must also
            be provided if taxonomy is in taxtastic format).
        seqinfo     : string
            Path to a seqinfo file. This is a .csv file with the first column
            denoting the sequence name, and the second column, its most resolved
            taxonomic rank.

        Raises
        ------
        Exception
            If no seqinfo file is given and the taxonomy file cannot be read
            as a Greengenes formatted file.
        '''

        self.encountered_nodes = {}
        self.encountered_taxonomies = set()
        self.tree = tree

        # Read in taxonomy
        logging.info("Reading in taxonomy")
        if seqinfo:
            logging.info("Importing taxtastic taxonomy from files: %s and %s" %
                         (taxonomy, seqinfo))
            gtns = Getaxnseq()
            # Context managers make sure both files are closed again, even
            # if parsing raises.
            with open(taxonomy) as taxonomy_io:
                with open(seqinfo) as seqinfo_io:
                    self.taxonomy = gtns.read_taxtastic_taxonomy_and_seqinfo(
                        taxonomy_io, seqinfo_io)

        else:
            try:
                logging.info("Reading Greengenes style taxonomy")
                self.taxonomy = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            except MalformedGreenGenesTaxonomyException:
                # Adjacent string literals are used instead of backslash
                # continuations inside one literal, which would embed the
                # source indentation in the message.
                raise Exception("Failed to read taxonomy as a Greengenes "
                                "formatted file. Was a taxtastic style "
                                "taxonomy provided with no seqinfo file?")

Example #7

Show file

    def __init__(self, tree, taxonomy, seqinfo=None):
        '''
        Store the tree and read in the taxonomy that goes with it.

        Parameters
        ----------
        tree        : dendropy.Tree
            dendropy.Tree object
        taxonomy    : string
            Path to a file containing taxonomy information about the tree,
            either in Greengenes or taxtastic format (seqinfo file must also
            be provided if taxonomy is in taxtastic format).
        seqinfo     : string
            Path to a seqinfo file. This is a .csv file with the first column
            denoting the sequence name, and the second column, its most resolved
            taxonomic rank.

        Raises
        ------
        Exception
            If no seqinfo file is given and the taxonomy file cannot be read
            as a Greengenes formatted file.
        '''

        self.encountered_nodes = {}
        self.encountered_taxonomies = set()
        self.tree = tree

        # Read in taxonomy
        logging.info("Reading in taxonomy")
        if seqinfo:
            logging.info("Importing taxtastic taxonomy from files: %s and %s" % (taxonomy, seqinfo))
            gtns = Getaxnseq()
            # Open both files in one with-statement so that neither handle
            # outlives the parse, whether it succeeds or raises.
            with open(taxonomy) as taxonomy_fh, open(seqinfo) as seqinfo_fh:
                self.taxonomy = gtns.read_taxtastic_taxonomy_and_seqinfo(taxonomy_fh, seqinfo_fh)

        else:
            try:
                logging.info("Reading Greengenes style taxonomy")
                self.taxonomy = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            except MalformedGreenGenesTaxonomyException:
                # Implicit literal concatenation keeps the message free of
                # the indentation whitespace that a backslash continuation
                # inside a single literal would include.
                raise Exception("Failed to read taxonomy as a Greengenes "
                                "formatted file. Was a taxtastic style "
                                "taxonomy provided with no seqinfo file?")

Example #8

Show file

 def test_hello_world(self):
     '''Write taxonomy and seqinfo files for two sequences, one of which
     has no taxonomy at all, and check the contents of both files.'''
     with tempfile.NamedTemporaryFile(
             prefix='graftm_test_getaxnseq') as tmp_seq:
         with tempfile.NamedTemporaryFile(
                 prefix='graftm_test_getaxnseq') as tmp_tax:
             Getaxnseq().write_taxonomy_and_seqinfo_files(
                 {
                     'seq1': ['k__me', 'p__you'],
                     'seq2': []
                 }, tmp_tax.name, tmp_seq.name)

             # Compare sorted lines so the assertion does not depend on
             # the order in which the sequences were written out.
             expected = sorted([
                 ','.join(p) + "\n" for p in [['seqname', 'tax_id'],
                                              ['seq2', 'Root'],
                                              ['seq1', 'p__you']]
             ])
             with open(tmp_seq.name) as f:
                 self.assertEqual(expected, sorted(f.readlines()))

             expected = '\n'.join([
                 'tax_id,parent_id,rank,tax_name,root,kingdom,phylum,class,order,family,genus,species',
                 'Root,Root,root,Root,Root,,,,,,,',
                 'k__me,Root,kingdom,k__me,Root,k__me,,,,,,',
                 'p__you,k__me,phylum,p__you,Root,k__me,p__you,,,,,'
             ]) + "\n"
             # Read through a context manager so the handle is closed
             # before the temporary file is removed.
             with open(tmp_tax.name) as f:
                 self.assertEqual(expected, f.read())

Example #9

Show file

    def update(self, **kwargs):
        '''
        Update an existing GraftM package with new sequences and taxonomy. If no
        taxonomy is provided, attempt to decorate the new sequences with
        pre-existing taxonomy.

        Parameters
        ----------
        input_sequence_path: str
            Path to FASTA file containing sequences to add to the update GraftM
            package
        input_taxonomy_path: str
            Taxonomy corresponding to the sequences in input_sequence_path. If None,
            then attempt to assign taxonomy by decorating the tree made out of all
            sequences.
        input_graftm_package_path: str
            Path to the directory of the GraftM package that is to be updated
        output_graftm_package_path: str
            Path to the directory to which the new GraftM package will be
            written to
        threads: int
            Passed on to the sequence alignment step. Defaults to
            UpdateDefaultOptions.threads.

        Raises
        ------
        KeyError
            If input_sequence_path, input_graftm_package_path or
            output_graftm_package_path is not given.
        Exception
            If any keyword argument other than those listed above is given.
        InsufficientGraftMPackageVersion
            If the version of the input GraftM package is below 3.
        '''
        input_sequence_path = kwargs.pop('input_sequence_path')
        input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
        input_graftm_package_path = kwargs.pop('input_graftm_package_path')
        output_graftm_package_path = kwargs.pop('output_graftm_package_path')
        threads = kwargs.pop(
            'threads',
            UpdateDefaultOptions.threads)  #TODO: add to user options
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Reading previous GraftM package")
        old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
        min_input_version = 3
        if old_gpkg.version < min_input_version:
            raise InsufficientGraftMPackageVersion(
                "GraftM below version %s cannot be updated using the update function."
                % min_input_version +
                " Unaligned sequences are not included in these packages, therefore no new"
                " alignment/HMM/Tree can be created")

        new_gpkg = UpdatedGraftMPackage()
        new_gpkg.output = output_graftm_package_path
        # Every intermediate file below is named after the output path with
        # ".gpkg" stripped out.
        new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

        #######################################
        ### Collect all unaligned sequences ###
        logging.info("Concatenating unaligned sequence files")
        new_gpkg.unaligned_sequences = "%s_sequences.fa" % (
            new_gpkg.name
        )  #TODO: replace hard-coded paths like this with tempfiles
        self._concatenate_file(
            [old_gpkg.unaligned_sequence_database_path(), input_sequence_path],
            new_gpkg.unaligned_sequences)

        #########################################################
        ### Parse taxonomy info up front so errors come early ###
        if input_taxonomy_path:
            logging.info("Reading new taxonomy information")
            input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
            original_taxonomy_hash = old_gpkg.taxonomy_hash()
            # Entries from the new taxonomy overwrite those of the old one.
            total_taxonomy_hash = original_taxonomy_hash.copy()
            total_taxonomy_hash.update(input_taxonomy.taxonomy)
            # A key defined in both taxonomies is counted twice in the sum of
            # the two sizes but only once in the merged hash, so the
            # difference is the number of definitions in common.
            num_duplicate_taxonomies = len(input_taxonomy.taxonomy) + \
                                       len(original_taxonomy_hash) - \
                                       len(total_taxonomy_hash)
            logging.debug(
                "Found %i taxonomic definitions in common between the previous and updated taxonomies"
                % num_duplicate_taxonomies)
            if num_duplicate_taxonomies > 0:
                logging.warning(
                    "Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case."
                    % num_duplicate_taxonomies)

        ###############################
        ### Re-construct alignments ###
        logging.info("Multiple sequence aligning all sequences")
        new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
        self._align_sequences(new_gpkg.unaligned_sequences,
                              new_gpkg.aligned_sequences, threads)

        ########################
        ### Re-construct HMM ###
        logging.info("Creating HMM from alignment")
        new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
        new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
        self._get_hmm_from_alignment(new_gpkg.aligned_sequences, new_gpkg.hmm,
                                     new_gpkg.hmm_alignment)

        #########################
        ### Re-construct tree ###
        logging.info("Generating phylogenetic tree")
        new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
        new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
        new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(
            old_gpkg.alignment_hmm_path())
        new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
            self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                             new_gpkg.package_type, self.fasttree)

        ##############################################
        ### Re-root and decorate tree if necessary ###
        if input_taxonomy_path:
            # Taxonomy was supplied, so the freshly built tree is used as is.
            new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
            new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
        else:
            logging.info("Finding taxonomy for new sequences")
            rerooter = Rerooter()

            old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                                schema='newick')
            new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                                schema='newick')
            old_tree = rerooter.reroot(old_tree)
            new_tree = rerooter.reroot(new_tree)
            rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            td = TreeDecorator(rerooted_tree,
                               old_gpkg.taxtastic_taxonomy_path(),
                               old_gpkg.taxtastic_seqinfo_path())

            # The decorator writes its taxonomy to a temporary file, which is
            # read back before the file is removed.
            with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
                td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
                total_taxonomy_hash = GreenGenesTaxonomy.read_file(
                    taxonomy.name).taxonomy

            ################################
            ### Generating tree log file ###
            logging.info("Generating phylogenetic tree log file")
            new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
            self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                         new_gpkg.hmm_alignment,
                                         new_gpkg.gpkg_tree,
                                         new_gpkg.gpkg_tree_log,
                                         new_gpkg.package_type, self.fasttree)

        ################################
        ### Creating taxtastic files ###
        logging.info("Writing new taxonomy files")
        new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
        new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
        gtns = Getaxnseq()

        gtns.write_taxonomy_and_seqinfo_files(total_taxonomy_hash,
                                              new_gpkg.tt_taxonomy,
                                              new_gpkg.tt_seqinfo)

        ######################
        ### Compile refpkg ###
        logging.info("Compiling pplacer refpkg")
        new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
        # The return value is not needed: the refpkg path is already held in
        # new_gpkg.refpkg.
        self._taxit_create(new_gpkg.name, new_gpkg.hmm_alignment,
                           new_gpkg.gpkg_tree, new_gpkg.gpkg_tree_log,
                           new_gpkg.tt_taxonomy, new_gpkg.tt_seqinfo,
                           new_gpkg.refpkg, True)

        #####################################
        ### Re-construct diamond database ###
        logging.info("Recreating DIAMOND DB")
        new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
        self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

        ####################
        ### Compile gpkg ###
        logging.info("Compiling GraftM package")
        # From here on the name carries the ".gpkg" extension again.
        new_gpkg.name = "%s.gpkg" % new_gpkg.name
        GraftMPackageVersion3.compile(
            new_gpkg.name,
            new_gpkg.refpkg,
            new_gpkg.hmm,
            new_gpkg.diamond_database,
            self._define_range(new_gpkg.unaligned_sequences),
            new_gpkg.unaligned_sequences,
            search_hmm_files=old_gpkg.search_hmm_paths())

        ###################
        ### Test it out ###
        logging.info("Testing newly updated GraftM package works")
        self._test_package(new_gpkg.name)

        logging.info("Finished")

Example #10

Show file

File: create.py Project: geronimp/graftM

    def main(self, **kwargs):
        alignment = kwargs.pop('alignment',None)
        sequences = kwargs.pop('sequences',None)
        taxonomy = kwargs.pop('taxonomy',None)
        rerooted_tree = kwargs.pop('rerooted_tree',None)
        unrooted_tree = kwargs.pop('unrooted_tree',None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files',None)
        min_aligned_percent = kwargs.pop('min_aligned_percent',0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force',False)
        graftm_package = kwargs.pop('graftm_package',False)
        dereplication_level = kwargs.pop('dereplication_level',False)
        threads = kwargs.pop('threads',5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0]
                      if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []
        tempfiles_to_close = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" % output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception("Cowardly refusing to overwrite gpkg to already existing %s" % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info("Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info("Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info("Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception("Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception("Found duplicate sequence name '%s' in sequences input file" % dup)
        output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
        tempfiles_to_close.append(output_alignment_fh)
        output_alignment = output_alignment_fh.name
        if user_hmm:
            align_hmm = user_hmm
        else:
            align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_align.hmm')
            tempfiles_to_close.append(align_hmm_fh)
            align_hmm = align_hmm_fh.name

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception("Found duplicate sequence name '%s' in alignment input file" % dup)
            ptype = self._get_hmm_from_alignment(alignment,
                                                 align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(sequences, alignment, user_hmm,
                                               align_hmm, output_alignment, threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                 min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn("One or more alignments do not span > %.2f %% of HMM" % (min_aligned_percent*100))
            for s in insufficiently_aligned_sequences:
                logging.warn("Insufficient alignment of %s, not including this sequence" % s)

            sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
            tempfiles_to_close.append(sequences2_fh)
            sequences2 = sequences2_fh.name
            num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                  sequences,
                                                                  sequences2)
            sequences = sequences2

            if alignment:
                alignment2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
                tempfiles_to_close.append(alignment2_fh)
                alignment2 = alignment2_fh.name
                num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                      alignment,
                                                                      alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning('''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) ''' % name)
                    removed_sequence_names.append(name)


            logging.info("After removing %i insufficiently aligned sequences, left with %i sequences" % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception("Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i" % num_sequences)
            else:
                logging.info("Reconstructing the alignment and HMM from remaining sequences")
                output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
                tempfiles_to_close.append(output_alignment_fh)
                output_alignment = output_alignment_fh.name
                if not user_hmm:
                    align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.hmm')
                    tempfiles_to_close.append(align_hmm_fh)
                    align_hmm = align_hmm_fh.name
                ptype, output_alignment= self._align_and_create_hmm(sequences, alignment, user_hmm,
                                                   align_hmm, output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                         min_aligned_percent)
        if not search_hmm_files:
            search_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_search.hmm')
            tempfiles_to_close.append(search_hmm_fh)
            search_hmm = search_hmm_fh.name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm, dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error("Unable to find sequence '%s' in the taxonomy definition" % s)
            raise Exception("All sequences must be assigned a taxonomy, cannot continue")


        logging.debug("Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base+"_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))

        # Get corresponding unaligned sequences
        filtered_names=[]
        for list in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
            for seq in list:
                filtered_names.append(seq.name)
        sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
        tempfiles_to_close.append(sequences2_fh)
        sequences2 = sequences2_fh.name


        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype,
                                                  self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" % rerooted_tree)
                tre_file=rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" % rerooted_tree)
                tre_file=rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise


            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                [removed_sequence_names.append(s.name) for s in group[1:]]
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(\
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree_log', prefix='graftm')
                tempfiles_to_close.append(log_file_tempfile)
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm')
                tempfiles_to_close.append(tre_file_tempfile)
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name, deduplicated_alignment_file,
                                                 tre_file, log_file, ptype, self.fasttree)

        # Create tax and seqinfo .csv files
        taxonomy_to_keep=[
                          seq.name for seq in
                                [x for x in [x[0] for x in deduplicated_arrays]
                                 if x]
                          ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base+"_seqinfo.csv"
            tax = base+"_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info("Building seqinfo and taxonomy file from input annotated tree")
                taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info("Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            else:
                raise Exception("Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

            taxonomy_definition = {x:taxonomy_definition[x]
                                   for x in taxonomy_definition
                                   if x in taxonomy_to_keep}

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition,
                                                  tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else: raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm, diamondb,
                                      max_range, sequences, search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)
        for tf in tempfiles_to_close:
            tf.close()

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")

Example #11

Show file

    def main(self, **kwargs):
        alignment = kwargs.pop('alignment', None)
        sequences = kwargs.pop('sequences', None)
        taxonomy = kwargs.pop('taxonomy', None)
        rerooted_tree = kwargs.pop('rerooted_tree', None)
        unrooted_tree = kwargs.pop('unrooted_tree', None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files', None)
        min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force', False)
        graftm_package = kwargs.pop('graftm_package', False)
        dereplication_level = kwargs.pop('dereplication_level', False)
        threads = kwargs.pop('threads', 5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0] if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" %
                             output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception(
                    "Cowardly refusing to overwrite gpkg to already existing %s"
                    % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info(
                "Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info(
                "Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(
                taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info(
                "Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception(
                "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
            )

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception(
                "Found duplicate sequence name '%s' in sequences input file" %
                dup)
        output_alignment = tempfile.NamedTemporaryFile(prefix='graftm',
                                                       suffix='.aln.faa').name
        align_hmm = (user_hmm if user_hmm else tempfile.NamedTemporaryFile(
            prefix='graftm', suffix='_align.hmm').name)

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception(
                    "Found duplicate sequence name '%s' in alignment input file"
                    % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(
            open(output_alignment), min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn(
                "One or more alignments do not span > %.2f %% of HMM" %
                (min_aligned_percent * 100))
            for s in insufficiently_aligned_sequences:
                logging.warn(
                    "Insufficient alignment of %s, not including this sequence"
                    % s)

            _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                _, alignment2 = tempfile.mkstemp(prefix='graftm',
                                                 suffix='.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            '''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) '''
                            % name)
                    removed_sequence_names.append(name)

            logging.info(
                "After removing %i insufficiently aligned sequences, left with %i sequences"
                % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception(
                    "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                    % num_sequences)
            else:
                logging.info(
                    "Reconstructing the alignment and HMM from remaining sequences"
                )
                output_alignment = tempfile.NamedTemporaryFile(
                    prefix='graftm', suffix='.aln.faa').name
                if not user_hmm:
                    align_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                            suffix='.hmm').name
                ptype, output_alignment = self._align_and_create_hmm(
                    sequences, alignment, user_hmm, align_hmm,
                    output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(
                    open(output_alignment), min_aligned_percent)
        if not search_hmm_files:
            search_hmm = tempfile.NamedTemporaryFile(prefix='graftm',
                                                     suffix='_search.hmm').name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error(
                    "Unable to find sequence '%s' in the taxonomy definition" %
                    s)
            raise Exception(
                "All sequences must be assigned a taxonomy, cannot continue")

        logging.debug(
            "Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays,
                                                   taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))

        # Get corresponding unaligned sequences
        filtered_names = []
        for list in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
            for seq in list:
                filtered_names.append(seq.name)
        _, sequences2 = tempfile.mkstemp(prefix='graftm', suffix='.faa')

        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                [removed_sequence_names.append(s.name) for s in group[1:]]
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(\
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree_log', prefix='graftm')
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(
                    suffix='.tree', prefix='graftm')
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree',
                                                 prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name,
                                                 deduplicated_alignment_file,
                                                 tre_file, log_file, ptype,
                                                 self.fasttree)

        # Create tax and seqinfo .csv files
        taxonomy_to_keep = [
            seq.name
            for seq in [x for x in [x[0] for x in deduplicated_arrays] if x]
        ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info(
                    "Building seqinfo and taxonomy file from input annotated tree"
                )
                taxonomy_definition = TaxonomyExtractor(
                ).taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info(
                    "Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(
                    taxonomy).taxonomy
            else:
                raise Exception(
                    "Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
                )

            taxonomy_definition = {
                x: taxonomy_definition[x]
                for x in taxonomy_definition if x in taxonomy_to_keep
            }

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path,
                                      refpkg,
                                      align_hmm,
                                      diamondb,
                                      max_range,
                                      sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")

Example #12

Show file

    def update(self, **kwargs):
        '''
        Update an existing GraftM package with new sequences and taxonomy. If no
        taxonomy is provided, attempt to decorate the new sequences with
        pre-existing taxonomy.

        Parameters
        ----------
        input_sequence_path: str
            Path to FASTA file containing sequences to add to the update GraftM
            package
        input_taxonomy_path: str
            Taxonomy corresponding to the sequences in input_sequence_path. If None,
            then attempt to assign taxonomy by decorating the tree made out of all
            sequences.
        input_graftm_package_path: str
            Path to the directory of the GraftM package that is to be updated
        output_graftm_package_path: str
            Path to the directory to which the new GraftM package will be
            written to
        threads: int
            Passed to the sequence alignment step. Defaults to
            UpdateDefaultOptions.threads.

        Raises
        ------
        KeyError
            If one of the required keyword arguments is missing.
        Exception
            If unexpected keyword arguments are given.
        InsufficientGraftMPackageVersion
            If the input package version is below 3.
        '''
        input_sequence_path = kwargs.pop('input_sequence_path')
        input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
        input_graftm_package_path = kwargs.pop('input_graftm_package_path')
        output_graftm_package_path = kwargs.pop('output_graftm_package_path')
        threads = kwargs.pop('threads', UpdateDefaultOptions.threads) #TODO: add to user options
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Reading previous GraftM package")
        old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
        min_input_version = 3
        if old_gpkg.version < min_input_version:
            raise InsufficientGraftMPackageVersion(
                "GraftM below version %s cannot be updated using the update function." % min_input_version +
                " Unaligned sequences are not included in these packages, therefore no new"
                " alignment/HMM/Tree can be created")

        new_gpkg = UpdatedGraftMPackage()
        new_gpkg.output = output_graftm_package_path
        new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

        #######################################
        ### Collect all unaligned sequences ###
        logging.info("Concatenating unaligned sequence files")
        new_gpkg.unaligned_sequences = "%s_sequences.fa" % (new_gpkg.name) #TODO: replace hard-coded paths like this with tempfiles
        self._concatenate_file([old_gpkg.unaligned_sequence_database_path(),
                                input_sequence_path],
                               new_gpkg.unaligned_sequences)

        #########################################################
        ### Parse taxonomy info up front so errors come early ###
        if input_taxonomy_path:
            logging.info("Reading new taxonomy information")
            input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
            original_taxonomy_hash = old_gpkg.taxonomy_hash()
            total_taxonomy_hash = original_taxonomy_hash.copy()
            # Entries from the new taxonomy override the previous ones.
            total_taxonomy_hash.update(input_taxonomy.taxonomy)
            # |A n B| = |A| + |B| - |A u B|: the number of names defined in
            # both the previous and the new taxonomy.
            num_duplicate_taxonomies = len(input_taxonomy.taxonomy) + \
                                       len(original_taxonomy_hash) - \
                                       len(total_taxonomy_hash)
            logging.debug("Found %i taxonomic definitions in common between the previous and updated taxonomies" % num_duplicate_taxonomies)
            if num_duplicate_taxonomies > 0:
                logging.warning("Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case." % num_duplicate_taxonomies)

        ###############################
        ### Re-construct alignments ###
        logging.info("Multiple sequence aligning all sequences")
        new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
        self._align_sequences(new_gpkg.unaligned_sequences, new_gpkg.aligned_sequences, threads)

        ########################
        ### Re-construct HMM ###
        logging.info("Creating HMM from alignment")
        new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
        new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
        self._get_hmm_from_alignment(new_gpkg.aligned_sequences, new_gpkg.hmm, new_gpkg.hmm_alignment)

        #########################
        ### Re-construct tree ###
        logging.info("Generating phylogenetic tree")
        new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
        new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
        new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(old_gpkg.alignment_hmm_path())
        new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
            self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                             new_gpkg.package_type, self.fasttree)

        ##############################################
        ### Re-root and decorate tree if necessary ###
        if input_taxonomy_path:
            # Taxonomy was supplied, so the freshly built tree is used as is.
            new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
            new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
        else:
            logging.info("Finding taxonomy for new sequences")
            rerooter = Rerooter()

            old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                                schema='newick')
            new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                                schema='newick')
            old_tree = rerooter.reroot(old_tree)
            new_tree = rerooter.reroot(new_tree)
            # TODO: Shouldn't call an underscore method, eventually use
            # Rerooter instead.
            rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            td = TreeDecorator(
                rerooted_tree,
                old_gpkg.taxtastic_taxonomy_path(),
                old_gpkg.taxtastic_seqinfo_path())

            # The decorated taxonomy is written to a temporary file and read
            # straight back in as the complete taxonomy hash.
            with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
                td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
                total_taxonomy_hash = GreenGenesTaxonomy.read_file(taxonomy.name).taxonomy

            ################################
            ### Generating tree log file ###
            logging.info("Generating phylogenetic tree log file")
            new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
            self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                         new_gpkg.hmm_alignment,
                                         new_gpkg.gpkg_tree,
                                         new_gpkg.gpkg_tree_log,
                                         new_gpkg.package_type,
                                         self.fasttree)

        ################################
        ### Creating taxtastic files ###
        logging.info("Writing new taxonomy files")
        new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
        new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
        gtns = Getaxnseq()

        gtns.write_taxonomy_and_seqinfo_files(
            total_taxonomy_hash,
            new_gpkg.tt_taxonomy,
            new_gpkg.tt_seqinfo)

        ######################
        ### Compile refpkg ###
        logging.info("Compiling pplacer refpkg")
        new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
        refpkg = self._taxit_create(new_gpkg.name,
                                    new_gpkg.hmm_alignment,
                                    new_gpkg.gpkg_tree,
                                    new_gpkg.gpkg_tree_log,
                                    new_gpkg.tt_taxonomy,
                                    new_gpkg.tt_seqinfo,
                                    new_gpkg.refpkg,
                                    True)

        #####################################
        ### Re-construct diamond database ###
        logging.info("Recreating DIAMOND DB")
        new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
        self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

        ####################
        ### Compile gpkg ###
        logging.info("Compiling GraftM package")
        new_gpkg.name = "%s.gpkg" % new_gpkg.name
        GraftMPackageVersion3.compile(new_gpkg.name, new_gpkg.refpkg,
                                      new_gpkg.hmm, new_gpkg.diamond_database,
                                      self._define_range(new_gpkg.unaligned_sequences),
                                      new_gpkg.unaligned_sequences,
                                      search_hmm_files=old_gpkg.search_hmm_paths())

        ###################
        ### Test it out ###
        logging.info("Testing newly updated GraftM package works")
        self._test_package(new_gpkg.name)

        logging.info("Finished")

Example #13

Show file

File: arb_database_creator.py Project: Thexiyang/HandyScripts

            "cog_ids":cog_ids,
            "cog_classifications":cog_classifications,
            "tigrfam_ids":tigrfam_ids,
            "tigrfam_classifications":tigrfam_classifications,
            "source":"IMG",
            "swiss_prot": swiss_prot}
            arb_db[seq_name].update(tmp)
        arb_db[seq_id]["Classification"] = '_'.join(parsed_gff_file[seq_name].split())



# Load the Pfam annotation table for KO_ID from the pfam_annotations
# directory; the parser is defined elsewhere in this script.
parsed_gtdb_pfam_annotation = parse_pfam_annotation_hmmoutput(os.path.join(pfam_annotations, KO_ID+'_gtdb_pfam_annotation.domtblout.txt.gz'))
# Read the unaligned and aligned FASTA files into dicts via SeqIO.to_dict
# (presumably Biopython's SeqIO, keyed by record id - confirm against imports).
# NOTE(review): the handles passed to SeqIO.parse are never explicitly closed.
parsed_unaligned_sequences = SeqIO.to_dict(SeqIO.parse(open(unaligned_sequences), "fasta"))
parsed_aligned_sequences=SeqIO.to_dict(SeqIO.parse(open(aligned_sequences), "fasta"))

gtns = Getaxnseq()

parsed_taxonomy = {}
ids =[]
# Walk every (sequence id, taxonomy) pair read from the taxtastic taxonomy
# and seqinfo files.
# NOTE(review): dict.iteritems() exists only on Python 2; under Python 3 this
# would need .items().
# NOTE(review): the loop variable `id` shadows the builtin of the same name,
# and the two files opened for the reader are never explicitly closed.
for id, tax in gtns.\
            read_taxtastic_taxonomy_and_seqinfo(open(taxonomy_path),
                                                open(seqinfo_path)).iteritems():

    ids.append(id)
    # Ids containing '~' are split on it; the second field is taken as the
    # genome id (presumably "<gene>~<genome>" - confirm the naming scheme).
    if '~' in id:
        gene_name, genome_id = id, id.split('~')[1]
        
        # NOTE(review): membership is tested with genome_id, but entries are
        # stored under the full `id` key below, so this check only fires when
        # a genome id equals a previously stored full id - confirm intent.
        if genome_id in gene_id_to_tax:
            raise Exception("Genome ID encountered twice: %s" % genome_id)
        else:
            gene_id_to_tax[id] = genome_id

Example #14

Show file

File: run.py Project: eliasOnAWS/graftM

    def _assign_taxonomy_with_diamond(self, base_list, db_search_results,
                                      graftm_package, graftm_files,
                                      diamond_performance_parameters):
        '''Run diamond to assign taxonomy

        Parameters
        ----------
        base_list: list of str
            list of sequence block names
        db_search_results: list of DBSearchResult
            the result of running hmmsearches
        graftm_package: GraftMPackage object
            Diamond is run against this database
        graftm_files: GraftMFiles object
            Result files are written here
        diamond_performance_parameters : str
            extra args for DIAMOND

        Returns
        -------
        dict of base_list entry to dict of read names to taxonomies. Each
        taxonomy is a list of str starting with 'Root'; a read that was in
        the hit fasta but got no diamond hit is assigned ['Root'] only. A
        search result whose hit_fasta() is None maps to an empty dict.

        Raises
        ------
        Exception
            if diamond reports two different hits for the same query
            sequence.
        '''
        runner = Diamond(graftm_package.diamond_database_path(),
                         self.args.threads, self.args.evalue)
        # Open the taxonomy and seqinfo files in context managers so the
        # handles are closed as soon as they have been parsed, rather than
        # being left for the garbage collector.
        with open(graftm_package.taxtastic_taxonomy_path()) as tax:
            with open(graftm_package.taxtastic_seqinfo_path()) as seqinfo:
                taxonomy_definition = Getaxnseq().\
                    read_taxtastic_taxonomy_and_seqinfo(tax, seqinfo)
        results = {}

        # For each of the search results,
        for i, search_result in enumerate(db_search_results):
            # Stays empty when the search produced no hit fasta at all.
            sequence_id_to_taxonomy = {}
            if search_result.hit_fasta() is not None:
                sequence_id_to_hit = {}
                # Run diamond
                logging.debug("Running diamond on %s",
                              search_result.hit_fasta())
                diamond_result = runner.run(
                    search_result.hit_fasta(),
                    UnpackRawReads.PROTEIN_SEQUENCE_TYPE,
                    daa_file_basename=graftm_files.
                    diamond_assignment_output_basename(base_list[i]),
                    extra_args=diamond_performance_parameters)
                for res in diamond_result.each([
                        SequenceSearchResult.QUERY_ID_FIELD,
                        SequenceSearchResult.HIT_ID_FIELD
                ]):
                    query_id, hit_id = res[0], res[1]
                    if query_id in sequence_id_to_hit:
                        # do not accept duplicates: a repeated identical
                        # (query, hit) pair is tolerated, a conflicting one
                        # is an error
                        if sequence_id_to_hit[query_id] != hit_id:
                            raise Exception(
                                "Diamond unexpectedly gave two hits for a single query sequence for %s"
                                % query_id)
                    else:
                        sequence_id_to_hit[query_id] = hit_id

                # Extract taxonomy of the best hit, and add in the no hits
                for seqio in SequenceIO().read_fasta_file(
                        search_result.hit_fasta()):
                    name = seqio.name
                    if name in sequence_id_to_hit:
                        # Add Root; to be in line with pplacer assignment method
                        sequence_id_to_taxonomy[name] = [
                            'Root'
                        ] + taxonomy_definition[sequence_id_to_hit[name]]
                    else:
                        # picked up in the initial search (by hmmsearch, say), but diamond misses it
                        sequence_id_to_taxonomy[name] = ['Root']

            results[base_list[i]] = sequence_id_to_taxonomy
        return results