Exemple #1
0
    def test_hello_world(self):
        """Run ``update`` on the command line with one extra sequence and its
        taxonomy, then check the updated package gained exactly one entry.
        """
        # NOTE(review): both temporary files are opened in the default mode;
        # if the Tests.extra_mcra_* fixtures are str this write fails on
        # Python 3 - confirm against the fixture definitions.
        with tempdir.in_tempdir(), \
                tempfile.NamedTemporaryFile() as fasta, \
                tempfile.NamedTemporaryFile() as tax:
            fasta.write(Tests.extra_mcra_fasta)
            fasta.flush()
            tax.write(Tests.extra_mcra_taxonomy)
            tax.flush()

            original_gpkg = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
            updated_gpkg = 'updated.gpkg'
            extern.run(
                "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s" % (
                    path_to_script, original_gpkg, fasta.name, tax.name,
                    updated_gpkg))

            before = GraftMPackage.acquire(original_gpkg)
            after = GraftMPackage.acquire(updated_gpkg)
            taxonomy_before = before.taxonomy_hash()
            taxonomy_after = after.taxonomy_hash()

            # Exactly one taxonomy entry was added, with the supplied lineage.
            self.assertEqual(len(taxonomy_before) + 1, len(taxonomy_after))
            self.assertEqual(
                ['mcrA', 'Euryarchaeota_mcrA', 'Methanofastidiosa'],
                taxonomy_after['KYC55281.1'])
            # An entry that was already present is carried over unchanged.
            self.assertEqual(taxonomy_before['638165755'],
                             taxonomy_after['638165755'])

            # The unaligned sequence database grew by one sequence too.
            reader = SequenceIO()
            count_before = len(reader.read_fasta_file(
                before.unaligned_sequence_database_path()))
            count_after = len(reader.read_fasta_file(
                after.unaligned_sequence_database_path()))
            self.assertEqual(count_before + 1, count_after)
 def test_is_protein_package(self):
     """mcrA.gpkg must report itself as a protein package, 61_otus.gpkg must not.

     Each package is asked twice in a row and has to give the same answer
     both times.
     """
     expectations = (('mcrA.gpkg', True), ('61_otus.gpkg', False))
     for gpkg_name, expected in expectations:
         package = GraftMPackage.acquire(os.path.join(path_to_data, gpkg_name))
         for _ in range(2):
             self.assertEqual(expected, package.is_protein_package())
Exemple #3
0
 def test_autodecorate(self):
     """Update a package from sequences only (no taxonomy is passed to
     ``update``) and check the taxonomy recorded for the added sequence.
     """
     # NOTE(review): the temporary file is opened in the default mode; if
     # Tests.extra_mcra_fasta is a str this write fails on Python 3 - confirm.
     with tempdir.in_tempdir(), tempfile.NamedTemporaryFile() as fasta:
         fasta.write(Tests.extra_mcra_fasta)
         fasta.flush()

         original_gpkg = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
         updated_gpkg = 'updated.gpkg'
         Update(prerequisites).update(
             input_sequence_path=fasta.name,
             input_graftm_package_path=original_gpkg,
             output_graftm_package_path=updated_gpkg)

         before = GraftMPackage.acquire(original_gpkg)
         after = GraftMPackage.acquire(updated_gpkg)
         taxonomy_before = before.taxonomy_hash()
         taxonomy_after = after.taxonomy_hash()

         # hard-code 11 because of
         # https://github.com/geronimp/graftM/issues/204
         self.assertEqual(11, len(taxonomy_after))
         self.assertEqual(['mcrA', 'Euryarchaeota_mcrA', 'Methanomicrobia'],
                          taxonomy_after['KYC55281.1'])
         # An entry that was already present is carried over unchanged.
         self.assertEqual(taxonomy_before['638165755'],
                          taxonomy_after['638165755'])

         # The unaligned sequence database grew by exactly one sequence.
         reader = SequenceIO()
         count_before = len(reader.read_fasta_file(
             before.unaligned_sequence_database_path()))
         count_after = len(reader.read_fasta_file(
             after.unaligned_sequence_database_path()))
         self.assertEqual(count_before + 1, count_after)
Exemple #4
0
    def test_hello_world(self):
        """Drive the ``update`` subcommand through the shell, adding one
        sequence plus taxonomy, and compare the packages before and after.
        """
        # NOTE(review): the temporary files use the default mode; writing str
        # fixtures to them would fail on Python 3 - confirm the fixture types.
        with tempdir.in_tempdir(), \
                tempfile.NamedTemporaryFile() as fasta, \
                tempfile.NamedTemporaryFile() as tax:
            fasta.write(Tests.extra_mcra_fasta)
            fasta.flush()
            tax.write(Tests.extra_mcra_taxonomy)
            tax.flush()

            source_package_path = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
            result_package_path = 'updated.gpkg'
            command_arguments = (path_to_script, source_package_path,
                                 fasta.name, tax.name, result_package_path)
            extern.run(
                "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s"
                % command_arguments)

            source_package = GraftMPackage.acquire(source_package_path)
            result_package = GraftMPackage.acquire(result_package_path)
            old_taxonomy = source_package.taxonomy_hash()
            new_taxonomy = result_package.taxonomy_hash()

            # One new taxonomy entry, carrying the lineage that was supplied.
            self.assertEqual(len(old_taxonomy) + 1, len(new_taxonomy))
            self.assertEqual(
                ['mcrA', 'Euryarchaeota_mcrA', 'Methanofastidiosa'],
                new_taxonomy['KYC55281.1'])
            # Pre-existing entries must be untouched.
            self.assertEqual(old_taxonomy['638165755'],
                             new_taxonomy['638165755'])

            # One more sequence in the unaligned sequence database.
            fasta_reader = SequenceIO()
            old_sequences = fasta_reader.read_fasta_file(
                source_package.unaligned_sequence_database_path())
            new_sequences = fasta_reader.read_fasta_file(
                result_package.unaligned_sequence_database_path())
            self.assertEqual(len(old_sequences) + 1, len(new_sequences))
Exemple #5
0
 def test_autodecorate(self):
     """Call ``Update.update`` with sequences but without a taxonomy input,
     then check what taxonomy the new sequence ended up with.
     """
     # NOTE(review): the temporary file uses the default mode; writing a str
     # fixture to it would fail on Python 3 - confirm the fixture type.
     with tempdir.in_tempdir(), tempfile.NamedTemporaryFile() as fasta:
         fasta.write(Tests.extra_mcra_fasta)
         fasta.flush()

         source_package_path = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
         result_package_path = 'updated.gpkg'
         updater = Update(prerequisites)
         updater.update(input_sequence_path=fasta.name,
                        input_graftm_package_path=source_package_path,
                        output_graftm_package_path=result_package_path)

         source_package = GraftMPackage.acquire(source_package_path)
         result_package = GraftMPackage.acquire(result_package_path)
         old_taxonomy = source_package.taxonomy_hash()
         new_taxonomy = result_package.taxonomy_hash()

         # hard-code 11 because of
         # https://github.com/geronimp/graftM/issues/204
         self.assertEqual(11, len(new_taxonomy))
         self.assertEqual(['mcrA', 'Euryarchaeota_mcrA', 'Methanomicrobia'],
                          new_taxonomy['KYC55281.1'])
         # Pre-existing entries must be untouched.
         self.assertEqual(old_taxonomy['638165755'],
                          new_taxonomy['638165755'])

         # One more sequence in the unaligned sequence database.
         fasta_reader = SequenceIO()
         old_sequences = fasta_reader.read_fasta_file(
             source_package.unaligned_sequence_database_path())
         new_sequences = fasta_reader.read_fasta_file(
             result_package.unaligned_sequence_database_path())
         self.assertEqual(len(old_sequences) + 1, len(new_sequences))
 def test_acquire(self):
     """Acquire mcrA.gpkg and check the paths and cutoff flag it reports."""
     gpkg_dir = os.path.join(path_to_data, 'mcrA.gpkg')
     package = GraftMPackage.acquire(gpkg_dir)

     expected_hmm = os.path.join(gpkg_dir, 'mcrA.hmm')
     self.assertEqual(expected_hmm, package.alignment_hmm_path())

     self.assertEqual(False, package.use_hmm_trusted_cutoff())

     expected_tree = os.path.join(gpkg_dir, 'mcrA.refpkg', 'treeoN87dL.tre')
     self.assertEqual(expected_tree, package.reference_package_tree_path())
Exemple #7
0
 def test_input_unrooted_tree(self):
     """Create a package from an unrooted tree and check that the tree stored
     in the resulting package has 21 leaves.
     """
     otu61 = os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg')
     create_data = os.path.join(path_to_data, 'create')
     # The temporary file the original opened here was never used, so only
     # the output directory is created.
     with tempdir.TempDir() as tmp:
         Create(prerequisites).main(
             taxtastic_taxonomy=os.path.join(otu61, '61_otus_taxonomy.csv'),
             taxtastic_seqinfo=os.path.join(otu61, '61_otus_seqinfo.csv'),
             # created with newick_utils:
             # nw_prune test/data/61_otus.gpkg/61_otus.refpkg/61_otus.tre 4459468 >test/data/61_otus.without_4459468.tre
             unrooted_tree=os.path.join(create_data,
                                        '61_otus.without_4459468.tre'),
             sequences=os.path.join(create_data,
                                    '61_otus.without_4459468.fasta'),
             alignment=os.path.join(create_data,
                                    '61_otus.without_4459468.aln.fasta'),
             prefix=tmp,
             force=True)
         gpkg = GraftMPackage.acquire(tmp)
         # Close the tree file deterministically instead of leaving the
         # handle to the garbage collector.
         with open(gpkg.reference_package_tree_path()) as tree_file:
             newick = tree_file.readline()
         tree = Tree.get(schema='newick', data=newick)
         self.assertEqual(21, len(tree.leaf_nodes()))
Exemple #8
0
    def compile(output_package_path, graftm_package_path, singlem_position):
        '''Create a new SingleM package with the given inputs. Any files
        specified as parameters are copied into the final package so can
        be removed after calling this function.

        Parameters
        ----------
        output_package_path: str
            path to the package being created (must not exist)
        graftm_package_path: str
            path to graftm package internal to the singlem package
        singlem_position: int
            the position in the HMM where the SingleM window starts

        Returns
        -------
        Nothing

        Raises
        ------
        Exception
            if output_package_path already exists, if the GraftM package is
            not version 3, or if the name derived for the internal GraftM
            package equals the contents file name
        '''

        if os.path.exists(output_package_path):
            raise Exception(
                "Not writing new SingleM package to already existing file/directory with name %s"
                % output_package_path)
        os.mkdir(output_package_path)

        graftm_package = GraftMPackage.acquire(graftm_package_path)
        if graftm_package.version != 3:
            raise Exception(
                "SingleM packages can only be created from version 3 GraftM packages at this point."
            )
        # Use abspath before basename so that a trailing slash on the output
        # path does not produce an empty package name.
        graftm_package_basename = os.path.basename(
            os.path.abspath(output_package_path).replace('.spkg', '').replace(
                '.gpkg', ''))
        logging.info("Using GraftM package name %s" % graftm_package_basename)
        if graftm_package_basename == SingleMPackage._CONTENTS_FILE_NAME:
            raise Exception("Name of GraftM package cannot be %s" %
                            SingleMPackage._CONTENTS_FILE_NAME)
        shutil.copytree(
            graftm_package_path,
            os.path.join(output_package_path, graftm_package_basename))

        singlem_package = SingleMPackageVersion1()
        singlem_package._contents_hash = {
            SingleMPackage.VERSION_KEY: singlem_package.version,
            SingleMPackage.GRAFTM_PACKAGE_KEY: graftm_package_basename,
            SingleMPackage.SINGLEM_POSITION_KEY: singlem_position
        }
        singlem_package._base_directory = output_package_path

        # calculate the sha256 values
        singlem_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
            singlem_package.calculate_alignment_hmm_sha256()
        singlem_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
            singlem_package.calculate_singlem_package_sha256()

        # save contents file; the with-block guarantees it is flushed and
        # closed before this function returns
        contents_path = os.path.join(output_package_path,
                                     SingleMPackage._CONTENTS_FILE_NAME)
        with open(contents_path, 'w') as contents_file:
            json.dump(singlem_package._contents_hash, contents_file)
Exemple #9
0
 def test_hello_world_diamond(self):
     """Generate an expand-search database from contigs with the "diamond"
     method and check that the call returns True.
     """
     bootstrapper_dir = os.path.join(path_to_data, 'bootstrapper')
     package = GraftMPackage.acquire(
         os.path.join(bootstrapper_dir, "D1_gpkg_for_diamond.gpkg"))
     searcher = ExpandSearcher(
         search_hmm_files=[os.path.join(bootstrapper_dir, 'DNGNGWU00001.hmm')],
         evalue='1e-5',
         maximum_range=1000,
         threads=1,
         graftm_package=package)

     contigs = [os.path.join(bootstrapper_dir, 'diamond_bootstrap_contigs.fna')]
     with tempfile.NamedTemporaryFile() as database:
         built = searcher.generate_expand_search_database_from_contigs(
             contigs, database.name, "diamond")
         self.assertEqual(True, built)
Exemple #10
0
    def compile(output_package_path, graftm_package_path, singlem_position,
                window_size):
        '''Create a new SingleM package directory around a copy of a GraftM
        package and write its contents file.

        Parameters
        ----------
        output_package_path: str
            path to the package being created (must not exist)
        graftm_package_path: str
            path to the GraftM package copied into the new package
        singlem_position: int
            stored in the contents file under SINGLEM_POSITION_KEY
        window_size: int
            stored in the contents file under SINGLEM_WINDOW_SIZE_KEY; for
            protein packages it must be divisible by 3

        Returns
        -------
        Nothing
        '''
        if os.path.exists(output_package_path):
            raise Exception(
                "Not writing new SingleM package to already existing file/directory with name %s"
                % output_package_path)
        os.mkdir(output_package_path)

        if GraftMPackage.acquire(graftm_package_path).version != 3:
            raise Exception(
                "SingleM packages can only be created from version 3 GraftM packages at this point."
            )

        # Use abspath before basename so that trailing slashes are dealt with.
        stripped_path = os.path.abspath(output_package_path)
        for extension in ('.spkg', '.gpkg'):
            stripped_path = stripped_path.replace(extension, '')
        inner_name = os.path.basename(stripped_path)
        logging.info("Using GraftM package name %s" % inner_name)

        contents_file_name = SingleMPackage._CONTENTS_FILE_NAME
        if inner_name == contents_file_name:
            raise Exception("Name of GraftM package cannot be %s" %
                            contents_file_name)
        shutil.copytree(graftm_package_path,
                        os.path.join(output_package_path, inner_name))

        spkg = SingleMPackageVersion2()
        spkg._contents_hash = {
            SingleMPackage.VERSION_KEY: spkg.version,
            SingleMPackage.GRAFTM_PACKAGE_KEY: inner_name,
            SingleMPackage.SINGLEM_POSITION_KEY: singlem_position,
            SingleMPackage.SINGLEM_WINDOW_SIZE_KEY: window_size
        }
        spkg._base_directory = output_package_path

        if spkg.is_protein_package() and window_size % 3 != 0:
            raise Exception(
                "For protein packages, the window size must be specified in base pairs. However, the window_size specified is not divisible by 3."
            )

        # The two checksums are computed and stored one after the other, the
        # alignment HMM first.
        contents = spkg._contents_hash
        contents[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
            spkg.calculate_alignment_hmm_sha256()
        contents[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
            spkg.calculate_singlem_package_sha256()

        # Persist the contents file inside the new package directory.
        contents_path = os.path.join(output_package_path, contents_file_name)
        with open(contents_path, 'w') as contents_file:
            json.dump(spkg._contents_hash, contents_file)
Exemple #11
0
    def compile(output_package_path, graftm_package_path, singlem_position):
        '''Create a new SingleM package with the given inputs. Any files
        specified as parameters are copied into the final package so can
        be removed after calling this function.

        Parameters
        ----------
        output_package_path: str
            path to the package being created (must not exist)
        graftm_package_path: str
            path to graftm package internal to the singlem package
        singlem_position: int
            the position in the HMM where the SingleM window starts

        Returns
        -------
        Nothing

        Raises
        ------
        Exception
            if output_package_path already exists, if the GraftM package is
            not version 3, or if the name derived for the internal GraftM
            package equals the contents file name
        '''

        if os.path.exists(output_package_path):
            raise Exception("Not writing new SingleM package to already existing file/directory with name %s" % output_package_path)
        os.mkdir(output_package_path)

        graftm_package = GraftMPackage.acquire(graftm_package_path)
        if graftm_package.version != 3:
            raise Exception("SingleM packages can only be created from version 3 GraftM packages at this point.")
        # abspath drops any trailing slash, which would otherwise make
        # basename return an empty string.
        graftm_package_basename = os.path.basename(
            os.path.abspath(output_package_path).replace('.spkg','').replace('.gpkg',''))
        logging.info("Using GraftM package name %s" % graftm_package_basename)
        if graftm_package_basename == SingleMPackage._CONTENTS_FILE_NAME:
            raise Exception("Name of GraftM package cannot be %s" % SingleMPackage._CONTENTS_FILE_NAME)
        shutil.copytree(graftm_package_path, os.path.join(output_package_path, graftm_package_basename))

        singlem_package = SingleMPackageVersion1()
        singlem_package._contents_hash = {SingleMPackage.VERSION_KEY: singlem_package.version,
                                          SingleMPackage.GRAFTM_PACKAGE_KEY: graftm_package_basename,
                                          SingleMPackage.SINGLEM_POSITION_KEY: singlem_position
                                          }
        singlem_package._base_directory = output_package_path

        # calculate the sha256 values
        singlem_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
            singlem_package.calculate_alignment_hmm_sha256()
        singlem_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
            singlem_package.calculate_singlem_package_sha256()

        # save contents file, closing it explicitly so the JSON is fully
        # written to disk when this function returns
        with open(os.path.join(
                output_package_path, SingleMPackage._CONTENTS_FILE_NAME), 'w') as f:
            json.dump(singlem_package._contents_hash, f)
Exemple #12
0
 def test_input_unrooted_tree(self):
     """Build a package from an unrooted input tree and verify the tree in
     the created package has 21 leaf nodes.
     """
     otu61 = os.path.join(path_to_data, '61_otus.gpkg','61_otus.refpkg')
     # No temporary FASTA file is opened here: the one the original created
     # was never referenced.
     with tempdir.TempDir() as tmp:
         Create(prerequisites).main(
             taxtastic_taxonomy=os.path.join(otu61,'61_otus_taxonomy.csv'),
             taxtastic_seqinfo=os.path.join(otu61,'61_otus_seqinfo.csv'),
             # created with newick_utils:
             # nw_prune test/data/61_otus.gpkg/61_otus.refpkg/61_otus.tre 4459468 >test/data/61_otus.without_4459468.tre
             unrooted_tree=os.path.join(path_to_data,'create','61_otus.without_4459468.tre'),
             sequences=os.path.join(path_to_data,'create','61_otus.without_4459468.fasta'),
             alignment=os.path.join(path_to_data,'create','61_otus.without_4459468.aln.fasta'),
             prefix=tmp, force=True)
         gpkg = GraftMPackage.acquire(tmp)
         # Read the first line inside a with-block so the handle is closed.
         with open(gpkg.reference_package_tree_path()) as tree_handle:
             first_line = tree_handle.readline()
         tree = Tree.get(schema='newick', data=first_line)
         self.assertEqual(21, len(tree.leaf_nodes()))
Exemple #13
0
 def test_hello_world_diamond(self):
     """Check that building an expand-search database from contigs via the
     "diamond" method returns True.
     """
     data_dir = os.path.join(path_to_data, "bootstrapper")
     hmm_path = os.path.join(data_dir, 'DNGNGWU00001.hmm')
     contig_paths = [os.path.join(data_dir, 'diamond_bootstrap_contigs.fna')]

     graftm_package = GraftMPackage.acquire(
         os.path.join(data_dir, "D1_gpkg_for_diamond.gpkg"))
     expand_searcher = ExpandSearcher(search_hmm_files=[hmm_path],
                                      evalue='1e-5',
                                      maximum_range=1000,
                                      threads=1,
                                      graftm_package=graftm_package)

     with tempfile.NamedTemporaryFile() as output_database:
         outcome = expand_searcher.generate_expand_search_database_from_contigs(
             contig_paths, output_database.name, "diamond")
         self.assertEqual(True, outcome)
Exemple #14
0
    def test_remove_strange_characters_integration_test(self):
        """Create a package from sequences with some 'A's swapped for 'R's and
        check the HMM name, the absent diamond database, and that the first
        aligned sequence in the package contains no 'R'.
        """
        with tempdir.TempDir() as tmp:
            # NOTE(review): the package is written next to, not inside, the
            # temporary directory, so it is presumably not removed when the
            # TempDir context exits - confirm.
            gpkg = tmp + ".gpkg"
            refpkg_dir = os.path.join(path_to_data, '61_otus.gpkg',
                                      '61_otus.refpkg')
            with tempfile.NamedTemporaryFile(suffix='61_otus.Rs.fasta',
                                             mode='w') as f:
                # Each input file is opened in a with-block so the handle is
                # closed as soon as it has been read.
                with open(os.path.join(path_to_data, 'create',
                                       '61_otus.fasta')) as source:
                    for record in SeqIO.parse(source, 'fasta'):
                        # don't replace too many otherwise hmmbuild fails
                        record.seq = Seq(
                            str(record.seq).replace('A', 'R', 5))
                        SeqIO.write(record, f, 'fasta')
                f.flush()

                Create(prerequisites).main(
                    sequences=f.name,
                    taxtastic_taxonomy=os.path.join(refpkg_dir,
                                                    '61_otus_taxonomy.csv'),
                    taxtastic_seqinfo=os.path.join(refpkg_dir,
                                                   '61_otus_seqinfo.csv'),
                    alignment=os.path.join(refpkg_dir, '61_otus.aln.fa'),
                    prefix=gpkg,
                    threads=5)
                pkg = GraftMPackage.acquire(gpkg)
                with open(pkg.alignment_hmm_path()) as hmm_file:
                    hmm_lines = hmm_file.readlines()
                self.assertEqual('NAME  61_otus.aln\n', hmm_lines[1])
                self.assertEqual(pkg.diamond_database_path(), None)
                with open(pkg.alignment_fasta_path()) as aligned:
                    # Only the first record is inspected: replacing 'R' must
                    # be a no-op, i.e. no 'R' is left in the sequence.
                    for record in SeqIO.parse(aligned, 'fasta'):
                        self.assertEqual(
                            str(record.seq).replace('R', 'N'),
                            str(record.seq))
                        break
Exemple #15
0
    def _test_package(self, package_path):
        '''Give a GraftM package a spin, and see if it works in reality with default
        parameters (i.e. pplacer). If it does not work, then raise an error.

        Parameters
        ----------
        package_path: str
            path to graftm_package to be tested
        '''
        pkg = GraftMPackage.acquire(package_path)
        with tempdir.TempDir() as graftM_graft_test_dir_name:
            # Take a subset of sequences for testing. The temporary file is
            # opened in text mode so that FASTA text can be written to it on
            # Python 3 as well as Python 2.
            with tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as tf:
                seqio = SequenceIO()
                # Close the sequence database once the first 10 records have
                # been copied out.
                with open(pkg.unaligned_sequence_database_path()) as f:
                    seqio.write_fasta(
                        itertools.islice(seqio.each_sequence(f), 10), tf)
                tf.flush()
                cmd = "graftM graft --forward %s --graftm_package %s --output_directory %s --force" %(
                    tf.name, package_path, graftM_graft_test_dir_name)
                extern.run(cmd)
Exemple #16
0
    def _test_package(self, package_path):
        '''Smoke-test a GraftM package by running "graftM graft" on the first
        10 sequences of its own unaligned sequence database.

        Parameters
        ----------
        package_path: str
            path to graftm_package to be tested
        '''
        package = GraftMPackage.acquire(package_path)
        with tempdir.TempDir() as output_dir, \
                tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as subset:
            sequence_io = SequenceIO()
            # Copy a small subset of the package's sequences into the
            # temporary FASTA file.
            with open(package.unaligned_sequence_database_path()) as database:
                first_ten = itertools.islice(
                    sequence_io.each_sequence(database), 10)
                sequence_io.write_fasta(first_ten, subset)
            subset.flush()

            extern.run(
                "graftM graft --forward %s --graftm_package %s --output_directory %s --force" % (
                    subset.name, package_path, output_dir))
Exemple #17
0
    def compile(output_package_path, graftm_package_path, singlem_position, window_size):
        '''Build a SingleM package directory: copy the GraftM package into
        it, then write a contents file describing the package.

        Parameters
        ----------
        output_package_path: str
            directory to create; an exception is raised if it already exists
        graftm_package_path: str
            GraftM package to copy into the new directory
        singlem_position: int
            recorded in the contents file under SINGLEM_POSITION_KEY
        window_size: int
            recorded in the contents file under SINGLEM_WINDOW_SIZE_KEY; must
            be divisible by 3 when the package is a protein package
        '''
        if os.path.exists(output_package_path):
            raise Exception("Not writing new SingleM package to already existing file/directory with name %s" % output_package_path)
        os.mkdir(output_package_path)

        source_gpkg = GraftMPackage.acquire(graftm_package_path)
        if source_gpkg.version != 3:
            raise Exception("SingleM packages can only be created from version 3 GraftM packages at this point.")

        # Use abspath before basename so that trailing slashes are dealt with.
        absolute_output = os.path.abspath(output_package_path)
        without_extensions = absolute_output.replace('.spkg','').replace('.gpkg','')
        gpkg_name = os.path.basename(without_extensions)
        logging.info("Using GraftM package name %s" % gpkg_name)
        if gpkg_name == SingleMPackage._CONTENTS_FILE_NAME:
            raise Exception("Name of GraftM package cannot be %s" % SingleMPackage._CONTENTS_FILE_NAME)
        destination = os.path.join(output_package_path, gpkg_name)
        shutil.copytree(graftm_package_path, destination)

        new_package = SingleMPackageVersion2()
        new_package._contents_hash = {
            SingleMPackage.VERSION_KEY: new_package.version,
            SingleMPackage.GRAFTM_PACKAGE_KEY: gpkg_name,
            SingleMPackage.SINGLEM_POSITION_KEY: singlem_position,
            SingleMPackage.SINGLEM_WINDOW_SIZE_KEY: window_size,
        }
        new_package._base_directory = output_package_path

        if new_package.is_protein_package() and window_size % 3 != 0:
            raise Exception("For protein packages, the window size must be specified in base pairs. However, the window_size specified is not divisible by 3.")

        # Record the sha256 values, alignment HMM first and then the package.
        hmm_digest = new_package.calculate_alignment_hmm_sha256()
        new_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = hmm_digest
        package_digest = new_package.calculate_singlem_package_sha256()
        new_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = package_digest

        # Write the contents file into the new package directory.
        contents_path = os.path.join(
            output_package_path, SingleMPackage._CONTENTS_FILE_NAME)
        with open(contents_path, 'w') as out:
            json.dump(new_package._contents_hash, out)
Exemple #18
0
    def test_remove_strange_characters_integration_test(self):
        """Create a package from sequences in which some 'A's were replaced by
        'R's, then check the HMM name, that no diamond database is reported,
        and that the first aligned sequence in the package has no 'R' left.
        """
        with tempdir.TempDir() as tmp:
            gpkg = tmp+".gpkg"
            # Text mode is required because SeqIO.write emits str, which a
            # default (binary) temporary file rejects on Python 3.
            with tempfile.NamedTemporaryFile(suffix='61_otus.Rs.fasta', mode='w') as f:
                # Input files are opened in with-blocks so that their handles
                # are closed rather than leaked.
                with open(os.path.join(path_to_data,'create','61_otus.fasta')) as original:
                    for record in SeqIO.parse(original, 'fasta'):
                        record.seq = Seq(str(record.seq).replace('A','R',5)) #don't replace too many otherwise hmmbuild fails
                        SeqIO.write(record, f, 'fasta')
                f.flush()

                Create(prerequisites).main(sequences=f.name,
                              taxtastic_taxonomy=os.path.join(path_to_data,'61_otus.gpkg','61_otus.refpkg','61_otus_taxonomy.csv'),
                              taxtastic_seqinfo=os.path.join(path_to_data,'61_otus.gpkg','61_otus.refpkg','61_otus_seqinfo.csv'),
                              alignment=os.path.join(path_to_data,'61_otus.gpkg','61_otus.refpkg','61_otus.aln.fa'),
                              prefix=gpkg,
                              threads=5)
                pkg = GraftMPackage.acquire(gpkg)
                with open(pkg.alignment_hmm_path()) as hmm:
                    second_hmm_line = hmm.readlines()[1]
                self.assertEqual('NAME  61_otus.aln\n', second_hmm_line)
                self.assertEqual(pkg.diamond_database_path(), None)
                with open(pkg.alignment_fasta_path()) as aligned:
                    # Only the first record is checked.
                    for record in SeqIO.parse(aligned, 'fasta'):
                        self.assertEqual(str(record.seq).replace('R','N'), str(record.seq))
                        break
Exemple #19
0
    def set_attributes(self, args):
        '''Fill in derived attributes on the parsed arguments, in place.

        Sets args.merge_reads first, then works out the reference database
        from whichever of these is present, checked in this order: a GraftM
        package, args.search_diamond_files, args.search_hmm_files,
        args.search_hmm_list_file, or (search-only runs)
        args.search_diamond_file.

        Parameters
        ----------
        args: object
            argument namespace; attributes are read from it and new ones
            (merge_reads, search_hmm_files, aln_hmm_file, reference_package,
            search_method) are assigned onto it depending on the branch taken

        Returns
        -------
        Nothing

        Raises
        ------
        Exception
            if the GraftM package path is not a directory, if the search
            method does not match the kind of database given, if an
            alignment HMM is needed but cannot be determined, or if no
            reference database was specified at all
        '''
        # Merging happens only when it has not been disabled and args.reverse
        # is set.
        args.merge_reads = not args.no_merge_reads and bool(args.reverse)

        if args.graftm_package:
            # Read graftM package and assign HMM and refpkg file
            if not os.path.isdir(args.graftm_package):
                raise Exception("%s does not exist. Are you sure you provided the correct path?" % args.graftm_package)
            gpkg = GraftMPackage.acquire(args.graftm_package)
            # Search HMMs already on args are kept; otherwise the package's
            # own search HMMs are used. The alignment HMM and reference
            # package always come from the package.
            if not hasattr(args, 'search_hmm_files'):
                args.search_hmm_files = list(gpkg.search_hmm_paths())
            args.aln_hmm_file = gpkg.alignment_hmm_path()
            args.reference_package = gpkg.reference_package_path()

        elif hasattr(args, 'search_diamond_files'):
            if args.search_method != self.DIAMOND_SEARCH_METHOD:
                raise Exception("Specified DIAMOND databases when not using the diamond search pipeline. Using: %s" % (args.search_method))
            if not hasattr(args, 'aln_hmm_file'):
                raise Exception("aln_hmm_file not specified")

        elif hasattr(args, 'search_hmm_files'):
            if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
                raise Exception("Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s" % (args.search_method))
            if not hasattr(args, 'aln_hmm_file'):
                if len(args.search_hmm_files) != 1:
                    raise Exception("Multiple search HMMs specified, but aln_hmm_file not specified")
                # A single search HMM doubles as the alignment HMM, unless
                # this is a search-only run.
                if not args.search_only:
                    args.aln_hmm_file = args.search_hmm_files[0]

        elif hasattr(args, 'search_hmm_list_file'):
            if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
                raise Exception("Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s" % (args.search_method))
            # One HMM path per line; the with-block closes the list file.
            with open(args.search_hmm_list_file) as hmm_list:
                args.search_hmm_files = [line.rstrip() for line in hmm_list]
            if not hasattr(args, 'aln_hmm_file') and not args.search_only:
                raise Exception("Multiple search HMMs specified, but aln_hmm_file not specified")

        elif args.search_only:
            # NOTE(review): a search-only run without search_diamond_file
            # falls through here without any database being set - confirm
            # that this is intended.
            if args.search_diamond_file:
                args.search_method = self.DIAMOND_SEARCH_METHOD
                args.search_hmm_files = None

        else:
            raise Exception('No gpkg, HMM, or DIAMOND database was specified, so there is no reference database to search with.')
Exemple #20
0
    def create(self, **kwargs):
        """Create a SingleM package from an existing GraftM package.

        For protein packages the unaligned sequences are first restricted to
        those that are leaves of the reference tree, and a new DIAMOND database
        is built from that subset, so that DIAMOND hits can be mapped onto the
        tree. The (possibly filtered) inputs are then compiled into an
        intermediate GraftM package, which is compiled into the SingleM
        package.

        Keyword arguments (all required):
            input_graftm_package: path of the GraftM package to convert.
            output_singlem_package: path the SingleM package is written to.
            hmm_position: passed through to SingleMPackageVersion2.compile.
            window_size: passed through to SingleMPackageVersion2.compile.
            force: when true, an existing output path is removed first.

        Raises:
            KeyError: if a required keyword argument is missing.
            Exception: on unexpected keyword arguments, duplicate tree leaf
                names, duplicate sequence names, or tree leaves missing from
                the unaligned sequence database.
        """
        input_graftm_package_path = kwargs.pop('input_graftm_package')
        output_singlem_package_path = kwargs.pop('output_singlem_package')
        hmm_position = kwargs.pop('hmm_position')
        window_size = kwargs.pop('window_size')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        if force and os.path.exists(output_singlem_package_path):
            shutil.rmtree(output_singlem_package_path)

        # For protein packages, remove sequences from diamond database that are
        # not in the tree so that hits can be mapped onto the tree and used for
        # alpha and beta diversity metrics.
        gpkg = GraftMPackage.acquire(input_graftm_package_path)
        is_protein_package = SingleMPackageVersion2.graftm_package_is_protein(
            gpkg)
        logging.info("Detected package type as %s" %
                     ('protein' if is_protein_package else 'nucleotide'))

        # Both temporary files are only created for protein packages. They are
        # tracked here so the finally clause can delete them even when a later
        # step raises.
        filtered_aligned_tempfile = None
        dmnd_tf = None
        try:
            if is_protein_package:
                tree_leaves = set()
                for node in dendropy.Tree.get(
                        path=gpkg.reference_package_tree_path(),
                        schema='newick').leaf_node_iter():
                    # need to replace here because otherwise they don't line up
                    # with the diamond database IDs
                    node_name = node.taxon.label.replace(' ', '_')
                    if node_name in tree_leaves:
                        raise Exception(
                            "Found duplicate tree leaf name in graftm package "
                            "tree. Currently this case is not handled, sorry")
                    tree_leaves.add(node_name)
                # Any one leaf name serves as an example for the log message;
                # None is used when the tree has no leaves at all.
                eg_name = next(iter(tree_leaves), None)
                logging.info("Read in %i tree tip names e.g. %s" %
                             (len(tree_leaves), eg_name))

                # Make a new fasta file of all the sequences that are leaves.
                # The file must be opened in text mode because SeqIO.write
                # emits str, not bytes.
                found_sequence_names = set()
                num_seqs_unaligned = 0
                filtered_aligned_tempfile = tempfile.NamedTemporaryFile(
                    prefix='singlem_package_creator',
                    suffix='.fasta',
                    mode='w')
                for s in SeqIO.parse(gpkg.unaligned_sequence_database_path(),
                                     "fasta"):
                    num_seqs_unaligned += 1
                    if s.id in tree_leaves:
                        if s.id in found_sequence_names:
                            raise Exception(
                                "Found duplicate sequence names in graftm unaligned"
                                " sequence fasta file. Currently this case is not handled,"
                                " sorry")
                        SeqIO.write([s], filtered_aligned_tempfile, "fasta")
                        found_sequence_names.add(s.id)
                # Flush so that diamond sees the complete file on disk.
                filtered_aligned_tempfile.flush()

                if len(tree_leaves) != len(found_sequence_names):
                    for t in tree_leaves:
                        if t not in found_sequence_names:
                            raise Exception(
                                "Found some sequences that were in the tree but not the"
                                " unaligned sequences database e.g. %s. Something is"
                                " likely amiss with the input GraftM package" % t)
                    raise Exception("Programming error, shouldn't get here")
                logging.info(
                    "All %i sequences found in tree extracted successfully from unaligned"
                    " sequences fasta file, which originally had %i sequences" %
                    (len(found_sequence_names), num_seqs_unaligned))

                # Create a new diamond database
                dmnd_tf = tempfile.NamedTemporaryFile(
                    prefix='singlem_package_creator', suffix='.dmnd')
                cmd = "diamond makedb --in '%s' -d '%s'" % (
                    filtered_aligned_tempfile.name, dmnd_tf.name)
                logging.info("Creating DIAMOND database")
                extern.run(cmd)

            # Compile the final graftm/singlem package. When the only search
            # HMM is the alignment HMM itself, no separate search HMMs are
            # passed on.
            search_hmm_paths = gpkg.search_hmm_paths()
            if len(search_hmm_paths) == 1 and \
               search_hmm_paths[0] == gpkg.alignment_hmm_path():
                search_hmms = None
            else:
                search_hmms = search_hmm_paths

            with tempdir.TempDir() as tmpdir:
                gpkg_name = os.path.join(
                    tmpdir,
                    os.path.basename(
                        os.path.abspath(input_graftm_package_path)).replace(
                            '.gpkg', ''))
                GraftMPackageVersion3.compile(
                    gpkg_name,
                    gpkg.reference_package_path(),
                    gpkg.alignment_hmm_path(),
                    dmnd_tf.name if is_protein_package else None,
                    gpkg.maximum_range(),
                    filtered_aligned_tempfile.name if is_protein_package else
                    gpkg.unaligned_sequence_database_path(),
                    gpkg.use_hmm_trusted_cutoff(),
                    search_hmms)
                logging.debug(
                    "Finished creating GraftM package for conversion to SingleM package"
                )

                SingleMPackageVersion2.compile(output_singlem_package_path,
                                               gpkg_name, hmm_position,
                                               window_size)

                shutil.rmtree(gpkg_name)

                logging.info("SingleM-compatible package creation finished")
        finally:
            # Closing a NamedTemporaryFile also deletes it from disk.
            for temp_fp in (filtered_aligned_tempfile, dmnd_tf):
                if temp_fp is not None:
                    temp_fp.close()
Exemple #21
0
    def regenerate(self, **kwargs):
        """Regenerate a SingleM package, adding eukaryotic sequences to it.

        The steps, in order:
          1. Run ``graftM graft`` (search and align only) on the eukaryotic
             sequences using the GraftM package inside the input SingleM
             package, and collect the hit sequences.
          2. Concatenate those hits with the unaligned sequences of the
             archaeal and bacterial intermediate GraftM packages.
          3. Concatenate the eukaryotic and input taxonomy files.
          4. Run ``graftM create`` on the combined sequences and taxonomy.
          5. Restrict the new package's unaligned sequences to the leaves of
             its tree and regenerate its DIAMOND database.
          6. Compile the final SingleM package.

        Keyword arguments (all required):
            input_singlem_package: path of the SingleM package to start from.
            output_singlem_package: path the new SingleM package is written to.
            working_directory: directory that intermediate files are written to.
            euk_sequences: sequence file searched for eukaryotic hits.
            euk_taxonomy: taxonomy file for the eukaryotic sequences.
            intermediate_archaea_graftm_package: path of a GraftM package.
            intermediate_bacteria_graftm_package: path of a GraftM package.
            input_taxonomy: taxonomy file concatenated after euk_taxonomy.

        Raises:
            KeyError: if a required keyword argument is missing.
            Exception: on unexpected keyword arguments, when GraftM does not
                report exactly one hit file, or when not every tree leaf could
                be extracted from the unaligned sequences.
        """
        input_singlem_package = kwargs.pop('input_singlem_package')
        output_singlem_package = kwargs.pop('output_singlem_package')
        working_directory = kwargs.pop('working_directory')
        euk_sequences = kwargs.pop('euk_sequences')
        euk_taxonomy = kwargs.pop('euk_taxonomy')
        intermediate_archaea_graftm_package = kwargs.pop('intermediate_archaea_graftm_package')
        intermediate_bacteria_graftm_package = kwargs.pop('intermediate_bacteria_graftm_package')
        input_taxonomy = kwargs.pop('input_taxonomy')

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        original_pkg = SingleMPackage.acquire(input_singlem_package)
        original_hmm_path = original_pkg.hmm_path()
        basename = original_pkg.graftm_package_basename()

        # Run GraftM on the euk sequences with the bacterial set. Every path is
        # quoted so that directories containing spaces survive the shell.
        euk_graftm_output = os.path.join(working_directory,
                                         "%s-euk_graftm" % basename)
        cmd = "graftM graft --graftm_package '%s' --search_and_align_only --forward '%s' --output '%s' --force" % (
            original_pkg.graftm_package_path(),
            euk_sequences,
            euk_graftm_output)
        extern.run(cmd)

        # Extract hit sequences from that set
        euk_result = GraftMResult(euk_graftm_output, False)
        hit_paths = euk_result.unaligned_sequence_paths(require_hits=True)
        if len(hit_paths) != 1:
            raise Exception(
                "Unexpected number of hits against euk in graftm")
        euk_hits_path = next(iter(hit_paths.values()))  # i.e. first

        # Concatenate euk, archaea and bacterial sequences
        archaeal_intermediate_pkg = GraftMPackage.acquire(
            intermediate_archaea_graftm_package)
        bacterial_intermediate_pkg = GraftMPackage.acquire(
            intermediate_bacteria_graftm_package)
        num_euk_hits = 0
        final_sequences_path = os.path.join(working_directory,
                                            "%s_final_sequences.faa" % basename)

        with open(final_sequences_path, 'w') as final_seqs_fp:
            with open(euk_hits_path) as euk_seqs_fp:
                for name, seq, _ in SeqReader().readfq(euk_seqs_fp):
                    # Sequences whose name contains '_split_' are left out.
                    if name.find('_split_') == -1:
                        num_euk_hits += 1
                        final_seqs_fp.write(">%s\n%s\n" % (name, seq))
            logging.info("Found %i eukaryotic sequences to include in the package" %
                         num_euk_hits)

            for gpkg in [archaeal_intermediate_pkg, bacterial_intermediate_pkg]:
                num_total = 0
                num_written = 0
                with open(gpkg.unaligned_sequence_database_path()) as seqs:
                    for name, seq, _ in SeqReader().readfq(seqs):
                        num_total += 1
                        # NOTE: no dereplication filter is applied here, so
                        # every sequence is written and num_written always
                        # equals num_total.
                        final_seqs_fp.write(">%s\n%s\n" % (name, seq))
                        num_written += 1
                logging.info(
                    "Of %i sequences in gpkg %s, %i species-dereplicated were included in the final package." % (
                        num_total, gpkg, num_written))

        # Concatenate euk and input taxonomy. Done in Python rather than by
        # shelling out to `cat`, so no path ever has to pass through a shell;
        # binary mode copies the bytes unchanged.
        final_taxonomy_file = os.path.join(working_directory,
                                           "%s_final_taxonomy.csv" % basename)
        with open(final_taxonomy_file, 'wb') as taxonomy_out:
            for taxonomy_path in (euk_taxonomy, input_taxonomy):
                with open(taxonomy_path, 'rb') as taxonomy_in:
                    for line in taxonomy_in:
                        taxonomy_out.write(line)

        # Run graftm create to get the final package
        final_gpkg = os.path.join(working_directory,
                                  "%s_final.gpkg" % basename)
        search_hmm_paths = list(archaeal_intermediate_pkg.search_hmm_paths())
        search_hmm_paths.extend(bacterial_intermediate_pkg.search_hmm_paths())
        cmd = "graftM create --force --sequences '%s' --taxonomy '%s' --search_hmm_files %s --hmm '%s' --output '%s'" % (
            final_sequences_path,
            final_taxonomy_file,
            ' '.join("'%s'" % hmm_path for hmm_path in search_hmm_paths),
            original_hmm_path,
            final_gpkg)
        extern.run(cmd)

        ##############################################################################
        # Remove sequences from the diamond DB that are not in the tree i.e.
        # those that are exact duplicates, so that the diamond_example hits are
        # always in the tree.
        # Read the list of IDs in the tree with dendropy
        final_gpkg_object = GraftMPackage.acquire(final_gpkg)
        unaligned_seqs = final_gpkg_object.unaligned_sequence_database_path()
        tree = dendropy.Tree.get(path=final_gpkg_object.reference_package_tree_path(),
                                 schema='newick')
        # Spaces in leaf labels are replaced with underscores before the names
        # are used to look up sequences.
        leaf_names = [l.taxon.label.replace(' ', '_') for l in tree.leaf_node_iter()]
        logging.debug("Read in final tree with %i leaves" % len(leaf_names))

        # Extract out of the sequences file in the graftm package
        final_seqs = SequenceExtractor().extract_and_read(
            leaf_names, unaligned_seqs)
        if len(final_seqs) != len(leaf_names):
            raise Exception("Do not appear to have extracted the expected number of sequences from the unaligned fasta file")

        # Write the reads into sequences file in place
        with open(unaligned_seqs, 'w') as f:
            for s in final_seqs:
                f.write(">%s\n" % s.name)
                f.write(s.seq)
                f.write("\n")

        # Regenerate the diamond DB
        final_gpkg_object.create_diamond_db()

        ##############################################################################
        # Run singlem create to put the final package together
        SingleMPackageVersion2.compile(
            output_singlem_package,
            final_gpkg,
            original_pkg.singlem_position(),
            original_pkg.window_size())
        logging.info("SingleM package generated.")
Exemple #22
0
    def graft(self):
        """Run the graft pipeline over every entry of self.sequence_pair_list.

        Searches the input reads (hmmsearch and/or DIAMOND, depending on
        self.args.search_method), optionally filters decoy hits, aligns the
        hits, and assigns taxonomy either by placement in the reference tree or
        with DIAMOND, then writes the summary via self.summarise().

        All configuration is read from self.args. Several attributes of
        self.args are also set here ('type', 'evalue', 'search_method',
        'reverse').

        The process exits early (via exit()) when only the search, or only the
        search and alignment, were requested, when no hits were found in any
        input, or when no open reading frames could be found.

        Raises:
            Exception: when a DIAMOND-based search is requested but no DIAMOND
                database can be determined, or when the assignment method is
                not recognised.
        """
        # The Graft pipeline:
        # Searches for reads using hmmer, and places them in phylogenetic
        # trees to derive a community structure.
        if self.args.graftm_package:
            gpkg = GraftMPackage.acquire(self.args.graftm_package)
        else:
            gpkg = None

        REVERSE_PIPE = (True if self.args.reverse else False)
        INTERLEAVED = (True if self.args.interleaved else False)
        base_list = []
        seqs_list = []
        search_results = []
        hit_read_count_list = []
        db_search_results = []

        if gpkg:
            maximum_range = gpkg.maximum_range()

            if self.args.search_diamond_file:
                self.args.search_method = self.hk.DIAMOND_SEARCH_METHOD
                diamond_db = self.args.search_diamond_file[0]
            else:
                diamond_db = gpkg.diamond_database_path()
                if self.args.search_method == self.hk.DIAMOND_SEARCH_METHOD:
                    if not diamond_db:
                        logging.error(
                            "%s search method selected, but no diamond database specified. \
                        Please either provide a gpkg to the --graftm_package flag, or a diamond \
                        database to the --search_diamond_file flag." %
                            self.args.search_method)
                        raise Exception()
        else:
            # Get the maximum range, if none exists, make one from the HMM profile
            if self.args.maximum_range:
                maximum_range = self.args.maximum_range
            else:
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    if not self.args.search_only:
                        maximum_range = self.hk.get_maximum_range(
                            self.args.aln_hmm_file)
                    else:
                        logging.debug(
                            "Running search only pipeline. maximum_range not configured."
                        )
                        maximum_range = None
                else:
                    logging.warning(
                        'Cannot determine maximum range when using %s pipeline and with no GraftM package specified'
                        % self.args.search_method)
                    logging.warning(
                        'Setting maximum_range to None (linked hits will not be detected)'
                    )
                    maximum_range = None
            if self.args.search_diamond_file:
                # NOTE(review): the gpkg branch above takes element [0] of
                # search_diamond_file, while this branch passes the value
                # through unchanged - confirm which form Diamond expects.
                diamond_db = self.args.search_diamond_file
            else:
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    diamond_db = None
                else:
                    # Fail here with a clear message: otherwise diamond_db is
                    # never assigned and the first later use raises an
                    # UnboundLocalError instead.
                    message = (
                        "%s search method selected, but no gpkg or diamond database selected"
                        % self.args.search_method)
                    logging.error(message)
                    raise Exception(message)

        if self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
            if self.args.reverse:
                logging.warning(
                    "--reverse reads specified with --assignment_method diamond. Reverse reads will be ignored."
                )
                self.args.reverse = None

        # If merge reads is specified, check that there are reverse reads to merge with
        if self.args.merge_reads and not hasattr(self.args, 'reverse'):
            raise Exception("Programming error")

        # Set the output directory if not specified and create that directory
        logging.debug('Creating working directory: %s' %
                      self.args.output_directory)
        self.hk.make_working_directory(self.args.output_directory,
                                       self.args.force)

        # Set pipeline and evalue by checking HMM format
        if self.args.search_only:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                hmm_type, hmm_tc = self.hk.setpipe(
                    self.args.search_hmm_files[0])
                logging.debug("HMM type: %s Trusted Cutoff: %s" %
                              (hmm_type, hmm_tc))
        else:
            hmm_type, hmm_tc = self.hk.setpipe(self.args.aln_hmm_file)
            logging.debug("HMM type: %s Trusted Cutoff: %s" %
                          (hmm_type, hmm_tc))

        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
            setattr(self.args, 'type', hmm_type)
            if hmm_tc:
                setattr(self.args, 'evalue', '--cut_tc')
        else:
            # Any search method other than plain hmmsearch is run through the
            # amino acid pipeline.
            setattr(self.args, 'type', self.PIPELINE_AA)

        if self.args.filter_minimum is not None:
            filter_minimum = self.args.filter_minimum
        else:
            if self.args.type == self.PIPELINE_NT:
                filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_NUCLEOTIDE_PACKAGES
            else:
                filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_AMINO_ACID_PACKAGES

        # Generate expand_search database if required
        if self.args.expand_search_contigs:
            if self.args.graftm_package:
                pkg = GraftMPackage.acquire(self.args.graftm_package)
            else:
                pkg = None
            boots = ExpandSearcher(search_hmm_files=self.args.search_hmm_files,
                                   maximum_range=self.args.maximum_range,
                                   threads=self.args.threads,
                                   evalue=self.args.evalue,
                                   min_orf_length=self.args.min_orf_length,
                                   graftm_package=pkg)

            # this is a hack, it should really use GraftMFiles but that class isn't currently flexible enough
            new_database = (os.path.join(self.args.output_directory, "expand_search.hmm") \
                            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD \
                            else os.path.join(self.args.output_directory, "expand_search")
                            )

            if boots.generate_expand_search_database_from_contigs(
                    self.args.expand_search_contigs, new_database,
                    self.args.search_method):
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    self.ss.search_hmm.append(new_database)
                else:
                    diamond_db = new_database

        first_search_method = self.args.search_method
        if self.args.decoy_database:
            decoy_filter = DecoyFilter(
                Diamond(diamond_db, threads=self.args.threads),
                Diamond(self.args.decoy_database, threads=self.args.threads))
            doing_decoy_search = True
        elif self.args.search_method == self.hk.HMMSEARCH_AND_DIAMOND_SEARCH_METHOD:
            # The combined method searches with hmmsearch first and then uses
            # the DIAMOND database in the decoy filter step.
            decoy_filter = DecoyFilter(
                Diamond(diamond_db, threads=self.args.threads))
            doing_decoy_search = True
            first_search_method = self.hk.HMMSEARCH_SEARCH_METHOD
        else:
            doing_decoy_search = False

        # For each pair (or single file passed to GraftM)
        logging.debug('Working with %i file(s)' % len(self.sequence_pair_list))
        for pair in self.sequence_pair_list:
            # Guess the sequence file type, if not already specified to GraftM
            unpack = UnpackRawReads(pair[0], self.args.input_sequence_type,
                                    INTERLEAVED)

            # Set the basename, and make an entry to the summary table.
            base = unpack.basename()
            pair_direction = ['forward', 'reverse']
            logging.info("Working on %s" % base)

            # Make the working base subdirectory
            self.hk.make_working_directory(
                os.path.join(self.args.output_directory, base),
                self.args.force)

            # for each of the paired end read files
            for read_file in pair:
                unpack = UnpackRawReads(read_file,
                                        self.args.input_sequence_type,
                                        INTERLEAVED)
                if read_file is None:
                    # placeholder for interleaved (second file is None)
                    continue

                if not os.path.isfile(read_file):  # Check file exists
                    logging.info('%s does not exist! Skipping this file..' %
                                 read_file)
                    continue

                # Set the output file_name
                if len(pair) == 2:
                    direction = 'interleaved' if pair[1] is None \
                                              else pair_direction.pop(0)
                    logging.info("Working on %s reads" % direction)
                    self.gmf = GraftMFiles(base, self.args.output_directory,
                                           direction)
                    self.hk.make_working_directory(
                        os.path.join(self.args.output_directory, base,
                                     direction), self.args.force)
                else:
                    direction = False
                    self.gmf = GraftMFiles(base, self.args.output_directory,
                                           direction)

                if self.args.type == self.PIPELINE_AA:
                    logging.debug("Running protein pipeline")
                    try:
                        search_time, (
                            result,
                            complement_information) = self.ss.aa_db_search(
                                self.gmf,
                                base,
                                unpack,
                                first_search_method,
                                maximum_range,
                                self.args.threads,
                                self.args.evalue,
                                self.args.min_orf_length,
                                self.args.restrict_read_length,
                                diamond_db,
                                self.args.diamond_performance_parameters,
                            )
                    except NoInputSequencesException as e:
                        logging.error(
                            "No sufficiently long open reading frames were found, indicating"
                            " either the input sequences are too short or the min orf length"
                            " cutoff is too high. Cannot continue sorry. Alternatively, there"
                            " is something amiss with the installation of OrfM. The specific"
                            " command that failed was: %s" % e.command)
                        exit(Run.NO_ORFS_EXITSTATUS)

                # Or the DNA pipeline
                elif self.args.type == self.PIPELINE_NT:
                    logging.debug("Running nucleotide pipeline")
                    search_time, (
                        result, complement_information) = self.ss.nt_db_search(
                            self.gmf, base, unpack, self.args.euk_check,
                            self.args.search_method, maximum_range,
                            self.args.threads, self.args.evalue)

                reads_detected = True
                if not result.hit_fasta() or os.path.getsize(
                        result.hit_fasta()) == 0:
                    logging.info('No reads found in %s' % base)
                    reads_detected = False

                if self.args.search_only:
                    db_search_results.append(result)
                    base_list.append(base)
                    continue

                # Filter out decoys if specified
                if reads_detected and doing_decoy_search:
                    # The temporary file is only used to reserve a name: it is
                    # deleted when the with block exits and the decoy filter
                    # then writes to that path.
                    with tempfile.NamedTemporaryFile(prefix="graftm_decoy",
                                                     suffix='.fa') as f:
                        tmpname = f.name
                    any_remaining = decoy_filter.filter(
                        result.hit_fasta(), tmpname)
                    if any_remaining:
                        shutil.move(tmpname, result.hit_fasta())
                    else:
                        # No hits remain after decoy filtering.
                        os.remove(result.hit_fasta())
                        continue

                if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
                    logging.info(
                        'aligning reads to reference package database')
                    hit_aligned_reads = self.gmf.aligned_fasta_output_path(
                        base)

                    if reads_detected:
                        aln_time, aln_result = self.ss.align(
                            result.hit_fasta(), hit_aligned_reads,
                            complement_information, self.args.type,
                            filter_minimum)
                    else:
                        aln_time = 'n/a'
                    if not os.path.exists(
                            hit_aligned_reads
                    ):  # If all were filtered out, or there just was none..
                        with open(hit_aligned_reads, 'w') as f:
                            pass  # just touch the file, nothing else
                    seqs_list.append(hit_aligned_reads)

                db_search_results.append(result)
                base_list.append(base)
                search_results.append(result.search_result)
                hit_read_count_list.append(result.hit_count)

        # Write summary table
        srchtw = SearchTableWriter()
        srchtw.build_search_otu_table(
            [x.search_objects for x in db_search_results], base_list,
            self.gmf.search_otu_table())

        if self.args.search_only:
            logging.info(
                'Stopping before alignment and taxonomic assignment phase\n')
            exit(0)

        if self.args.merge_reads:  # not run when diamond is the assignment mode- enforced by argparse grokking
            logging.debug("Running merge reads output")
            if self.args.interleaved:
                fwd_seqs = seqs_list
                rev_seqs = []
            else:
                # Entries alternate forward, reverse, so every second entry
                # belongs to the same direction.
                base_list = base_list[0::2]
                fwd_seqs = seqs_list[0::2]
                rev_seqs = seqs_list[1::2]
            merged_output=[GraftMFiles(base, self.args.output_directory, False).aligned_fasta_output_path(base) \
                           for base in base_list]
            logging.debug("merged reads to %s", merged_output)
            self.ss.merge_forev_aln(fwd_seqs, rev_seqs, merged_output)
            seqs_list = merged_output
            REVERSE_PIPE = False

        elif REVERSE_PIPE:
            base_list = base_list[0::2]

        # Leave the pipeline if search only was specified
        if self.args.search_and_align_only:
            logging.info('Stopping before taxonomic assignment phase\n')
            exit(0)
        elif not any(base_list):
            logging.error(
                'No hits in any of the provided files. Cannot continue with no reads to assign taxonomy to.\n'
            )
            exit(0)
        self.gmf = GraftMFiles('', self.args.output_directory, False)

        if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
            clusterer = Clusterer()
            # Classification steps
            seqs_list = clusterer.cluster(seqs_list, REVERSE_PIPE)
            logging.info("Placing reads into phylogenetic tree")
            taxonomic_assignment_time, assignments = self.p.place(
                REVERSE_PIPE, seqs_list, self.args.resolve_placements,
                self.gmf, self.args, result.slash_endings,
                gpkg.taxtastic_taxonomy_path(), clusterer)
            assignments = clusterer.uncluster_annotations(
                assignments, REVERSE_PIPE)

        elif self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
            logging.info("Assigning taxonomy with diamond")
            taxonomic_assignment_time, assignments = self._assign_taxonomy_with_diamond(\
                        base_list,
                        db_search_results,
                        gpkg,
                        self.gmf,
                        self.args.diamond_performance_parameters)
            aln_time = 'n/a'
        else:
            # Report the attribute that was actually tested above.
            raise Exception("Unexpected assignment method encountered: %s" %
                            self.args.assignment_method)

        self.summarise(base_list, assignments, REVERSE_PIPE,
                       [search_time, aln_time, taxonomic_assignment_time],
                       hit_read_count_list, self.args.max_samples_for_krona)
 def test_version3_unaligned_fasta(self):
     """The mcrA test package is version 3 and keeps its unaligned FASTA inside the package directory."""
     gpkg_path = os.path.join(path_to_data, 'mcrA.gpkg')
     package = GraftMPackage.acquire(gpkg_path)
     self.assertEqual(3, package.version)
     # The unaligned sequences are expected directly under the package path.
     expected_fasta = os.path.join(gpkg_path, 'mcrA.faa')
     self.assertEqual(expected_fasta,
                      package.unaligned_sequence_database_path())
Exemple #24
0
    def main(self):

        if self.args.subparser_name == 'graft':
            if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
                print('''
                                GRAFT

                       Joel Boyd, Ben Woodcroft

                                                         __/__
                                                  ______|
          _- - _                         ________|      |_____/
           - -            -             |        |____/_
           - _     >>>>  -   >>>>   ____|
          - _-  -         -             |      ______
             - _                        |_____|
           -                                  |______
            ''')
            self.graft()

        elif self.args.subparser_name == 'create':
            if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
                print('''
                            CREATE

                   Joel Boyd, Ben Woodcroft

                                                    /
              >a                                   /
              -------------                       /
              >b                        |        |
              --------          >>>     |  GPKG  |
              >c                        |________|
              ----------
''')
            if self.args.dereplication_level < 0:
                logging.error(
                    "Invalid dereplication level selected! please enter a positive integer"
                )
                exit(1)

            else:
                if not self.args.sequences:
                    if not self.args.alignment and not self.args.rerooted_annotated_tree \
                                               and not self.args.rerooted_tree:
                        logging.error(
                            "Some sort of sequence data must be provided to run graftM create"
                        )
                        exit(1)
                if self.args.taxonomy:
                    if self.args.rerooted_annotated_tree:
                        logging.error(
                            "--taxonomy is incompatible with --rerooted_annotated_tree"
                        )
                        exit(1)
                    if self.args.taxtastic_taxonomy or self.args.taxtastic_seqinfo:
                        logging.error(
                            "--taxtastic_taxonomy and --taxtastic_seqinfo are incompatible with --taxonomy"
                        )
                        exit(1)
                elif self.args.rerooted_annotated_tree:
                    if self.args.taxtastic_taxonomy or self.args.taxtastic_seqinfo:
                        logging.error(
                            "--taxtastic_taxonomy and --taxtastic_seqinfo are incompatible with --rerooted_annotated_tree"
                        )
                        exit(1)
                else:
                    if not self.args.taxtastic_taxonomy or not self.args.taxtastic_seqinfo:
                        logging.error(
                            "--taxonomy, --rerooted_annotated_tree or --taxtastic_taxonomy/--taxtastic_seqinfo is required"
                        )
                        exit(1)
                if bool(self.args.taxtastic_taxonomy) ^ bool(
                        self.args.taxtastic_seqinfo):
                    logging.error(
                        "Both or neither of --taxtastic_taxonomy and --taxtastic_seqinfo must be defined"
                    )
                    exit(1)
                if self.args.alignment and self.args.hmm:
                    logging.warn(
                        "Using both --alignment and --hmm is rarely useful, but proceding on the assumption you understand."
                    )
                if len([
                        _f for _f in [
                            self.args.rerooted_tree,
                            self.args.rerooted_annotated_tree, self.args.tree
                        ] if _f
                ]) > 1:
                    logging.error("Only 1 input tree can be specified")
                    exit(1)

                self.create.main(
                    dereplication_level=self.args.dereplication_level,
                    sequences=self.args.sequences,
                    alignment=self.args.alignment,
                    taxonomy=self.args.taxonomy,
                    rerooted_tree=self.args.rerooted_tree,
                    unrooted_tree=self.args.tree,
                    tree_log=self.args.tree_log,
                    prefix=self.args.output,
                    rerooted_annotated_tree=self.args.rerooted_annotated_tree,
                    min_aligned_percent=float(self.args.min_aligned_percent) /
                    100,
                    taxtastic_taxonomy=self.args.taxtastic_taxonomy,
                    taxtastic_seqinfo=self.args.taxtastic_seqinfo,
                    hmm=self.args.hmm,
                    search_hmm_files=self.args.search_hmm_files,
                    force=self.args.force,
                    threads=self.args.threads)

        elif self.args.subparser_name == 'update':
            logging.info(
                "GraftM package %s specified to update with sequences in %s" %
                (self.args.graftm_package, self.args.sequences))
            if self.args.regenerate_diamond_db:
                gpkg = GraftMPackage.acquire(self.args.graftm_package)
                logging.info("Regenerating diamond DB..")
                gpkg.create_diamond_db()
                logging.info("Diamond database regenerated.")
                return
            elif not self.args.sequences:
                logging.error(
                    "--sequences is required unless regenerating the diamond DB"
                )
                exit(1)

            if not self.args.output:
                if self.args.graftm_package.endswith(".gpkg"):
                    self.args.output = self.args.graftm_package.replace(
                        ".gpkg", "-updated.gpkg")
                else:
                    self.args.output = self.args.graftm_package + '-update.gpkg'

            Update(
                ExternalProgramSuite([
                    'taxit', 'FastTreeMP', 'hmmalign', 'mafft'
                ])).update(input_sequence_path=self.args.sequences,
                           input_taxonomy_path=self.args.taxonomy,
                           input_graftm_package_path=self.args.graftm_package,
                           output_graftm_package_path=self.args.output)

        elif self.args.subparser_name == 'expand_search':
            args = self.args
            if not args.graftm_package and not args.search_hmm_files:
                logging.error(
                    "expand_search mode requires either --graftm_package or --search_hmm_files"
                )
                exit(1)

            if args.graftm_package:
                pkg = GraftMPackage.acquire(args.graftm_package)
            else:
                pkg = None

            expandsearcher = ExpandSearcher(
                search_hmm_files=args.search_hmm_files,
                maximum_range=args.maximum_range,
                threads=args.threads,
                evalue=args.evalue,
                min_orf_length=args.min_orf_length,
                graftm_package=pkg)
            expandsearcher.generate_expand_search_database_from_contigs(
                args.contigs,
                args.output_hmm,
                search_method=ExpandSearcher.HMM_SEARCH_METHOD)

        elif self.args.subparser_name == 'tree':
            if self.args.graftm_package:
                # shim in the paths from the graftm package, not overwriting
                # any of the provided paths.
                gpkg = GraftMPackage.acquire(self.args.graftm_package)
                if not self.args.rooted_tree:
                    self.args.rooted_tree = gpkg.reference_package_tree_path()
                if not self.args.input_greengenes_taxonomy:
                    if not self.args.input_taxtastic_seqinfo:
                        self.args.input_taxtastic_seqinfo = gpkg.taxtastic_seqinfo_path(
                        )
                    if not self.args.input_taxtastic_taxonomy:
                        self.args.input_taxtastic_taxonomy = gpkg.taxtastic_taxonomy_path(
                        )

            if self.args.rooted_tree:
                if self.args.unrooted_tree:
                    logging.error(
                        "Both a rooted tree and an un-rooted tree were provided, so it's unclear what you are asking GraftM to do. \
If you're unsure see graftM tree -h")
                    exit(1)
                elif self.args.reference_tree:
                    logging.error(
                        "Both a rooted tree and reference tree were provided, so it's unclear what you are asking GraftM to do. \
If you're unsure see graftM tree -h")
                    exit(1)

                if not self.args.decorate:
                    logging.error(
                        "It seems a rooted tree has been provided, but --decorate has not been specified so it is unclear what you are asking graftM to do."
                    )
                    exit(1)

                dec = Decorator(tree_path=self.args.rooted_tree)

            elif self.args.unrooted_tree and self.args.reference_tree:
                logging.debug(
                    "Using provided reference tree %s to reroot %s" %
                    (self.args.reference_tree, self.args.unrooted_tree))
                dec = Decorator(reference_tree_path=self.args.reference_tree,
                                tree_path=self.args.unrooted_tree)
            else:
                logging.error(
                    "Some tree(s) must be provided, either a rooted tree or both an unrooted tree and a reference tree"
                )
                exit(1)

            if self.args.output_taxonomy is None and self.args.output_tree is None:
                logging.error(
                    "Either an output tree or taxonomy must be provided")
                exit(1)
            if self.args.input_greengenes_taxonomy:
                if self.args.input_taxtastic_seqinfo or self.args.input_taxtastic_taxonomy:
                    logging.error(
                        "Both taxtastic and greengenes taxonomy were provided, so its unclear what taxonomy you want graftM to decorate with"
                    )
                    exit(1)
                logging.debug("Using input GreenGenes style taxonomy file")
                dec.main(self.args.input_greengenes_taxonomy,
                         self.args.output_tree, self.args.output_taxonomy,
                         self.args.no_unique_tax, self.args.decorate, None)
            elif self.args.input_taxtastic_seqinfo and self.args.input_taxtastic_taxonomy:
                logging.debug("Using input taxtastic style taxonomy/seqinfo")
                dec.main(self.args.input_taxtastic_taxonomy,
                         self.args.output_tree, self.args.output_taxonomy,
                         self.args.no_unique_tax, self.args.decorate,
                         self.args.input_taxtastic_seqinfo)
            else:
                logging.error(
                    "Either a taxtastic taxonomy or seqinfo file was provided. GraftM cannot continue without both."
                )
                exit(1)

        elif self.args.subparser_name == 'archive':
            # Back slashes in the ASCII art are escaped.
            if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
                print("""
                               ARCHIVE

                        Joel Boyd, Ben Woodcroft

                  ____.----.
        ____.----'          \\
        \\                    \\
         \\                    \\
          \\                    \\
           \\          ____.----'`--.__
            \\___.----'          |     `--.____
           /`-._                |       __.-' \\
          /     `-._            ___.---'       \\
         /          `-.____.---'                \\           +------+
        /            / | \\                       \\          |`.    |`.
       /            /  |  \\                   _.--'  <===>  |  `+--+---+
       `-.         /   |   \\            __.--'              |   |  |   |
          `-._    /    |    \\     __.--'     |              |   |  |   |
            | `-./     |     \\_.-'           |              +---+--+   |
            |          |                     |               `. |   `. |
            |          |                     |                 `+------+
            |          |                     |
            |          |                     |
            |          |                     |
            |          |                     |
            |          |                     |
            `-.        |                  _.-'
               `-.     |           __..--'
                  `-.  |      __.-'
                     `-|__.--'
            """)
            if self.args.create:
                if self.args.extract:
                    logging.error(
                        "Please specify whether to either create or export a GraftM package"
                    )
                    exit(1)
                if not self.args.graftm_package:
                    logging.error(
                        "Creating a GraftM package archive requires an package to be specified"
                    )
                    exit(1)
                if not self.args.archive:
                    logging.error(
                        "Creating a GraftM package archive requires an output archive path to be specified"
                    )
                    exit(1)

                archive = Archive()
                archive.create(self.args.graftm_package,
                               self.args.archive,
                               force=self.args.force)

            elif self.args.extract:
                archive = Archive()
                archive.extract(self.args.archive,
                                self.args.graftm_package,
                                force=self.args.force)
            else:
                logging.error(
                    "Please specify whether to either create or export a GraftM package"
                )
                exit(1)

        else:
            raise Exception("Unexpected subparser name %s" %
                            self.args.subparser_name)
Exemple #25
0
    def update(self, **kwargs):
        '''
        Update an existing GraftM package with new sequences and taxonomy. If no
        taxonomy is provided, attempt to decorate the new sequences with
        pre-existing taxonomy.

        Parameters
        ----------
        input_sequence_path: str
            Path to FASTA file containing sequences to add to the update GraftM
            package
        input_taxonomy_path: str
            Taxonomy corresponding to the sequences in input_sequence_path. If None,
            then attempt to assign taxonomy by decorating the tree made out of all
            sequences.
        input_graftm_package_path: str
            Path to the directory of the GraftM package that is to be updated
        output_graftm_package_path: str
            Path to the directory to which the new GraftM package will be
            written to
        threads: int
            Optional. Handed to the alignment step; defaults to
            UpdateDefaultOptions.threads.

        Raises
        ------
        Exception
            If any keyword argument other than those listed above is given.
        InsufficientGraftMPackageVersion
            If the input package has a version below 3.
        '''
        input_sequence_path = kwargs.pop('input_sequence_path')
        input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
        input_graftm_package_path = kwargs.pop('input_graftm_package_path')
        output_graftm_package_path = kwargs.pop('output_graftm_package_path')
        threads = kwargs.pop('threads', UpdateDefaultOptions.threads) #TODO: add to user options
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Reading previous GraftM package")
        old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
        min_input_version = 3
        if old_gpkg.version < min_input_version:
            raise InsufficientGraftMPackageVersion(
                "GraftM below version %s cannot be updated using the update function." % min_input_version +
                " Unaligned sequences are not included in these packages, therefore no new"
                " alignment/HMM/Tree can be created")

        new_gpkg = UpdatedGraftMPackage()
        new_gpkg.output = output_graftm_package_path
        # All intermediate file names below are derived from this prefix.
        new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

        #######################################
        ### Collect all unaligned sequences ###
        logging.info("Concatenating unaligned sequence files")
        new_gpkg.unaligned_sequences = "%s_sequences.fa" % (new_gpkg.name) #TODO: replace hard-coded paths like this with tempfiles
        self._concatenate_file([old_gpkg.unaligned_sequence_database_path(),
                                input_sequence_path],
                               new_gpkg.unaligned_sequences)

        #########################################################
        ### Parse taxonomy info up front so errors come early ###
        if input_taxonomy_path:
            logging.info("Reading new taxonomy information")
            input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
            original_taxonomy_hash = old_gpkg.taxonomy_hash()
            # Entries from the new taxonomy override those of the old one.
            total_taxonomy_hash = original_taxonomy_hash.copy()
            total_taxonomy_hash.update(input_taxonomy.taxonomy)
            # Keys present in both hashes collapse into a single entry when
            # merged, so the overlap is how far the merged size falls short
            # of the sum of the two input sizes.
            num_duplicate_taxonomies = len(input_taxonomy.taxonomy) + \
                                       len(original_taxonomy_hash) - \
                                       len(total_taxonomy_hash)
            logging.debug("Found %i taxonomic definitions in common between the previous and updated taxonomies" % num_duplicate_taxonomies)
            if num_duplicate_taxonomies > 0:
                logging.warning("Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case." % num_duplicate_taxonomies)

        ###############################
        ### Re-construct alignments ###
        logging.info("Multiple sequence aligning all sequences")
        new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
        self._align_sequences(new_gpkg.unaligned_sequences, new_gpkg.aligned_sequences, threads)

        ########################
        ### Re-construct HMM ###
        logging.info("Creating HMM from alignment")
        new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
        new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
        self._get_hmm_from_alignment(new_gpkg.aligned_sequences, new_gpkg.hmm, new_gpkg.hmm_alignment)

        #########################
        ### Re-construct tree ###
        logging.info("Generating phylogenetic tree")
        new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
        new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
        new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(old_gpkg.alignment_hmm_path())
        new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
            self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                             new_gpkg.package_type, self.fasttree)

        ##############################################
        ### Re-root and decorate tree if necessary ###
        if input_taxonomy_path:
            # Taxonomy was supplied, so the freshly built tree is used as is.
            new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
            new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
        else:
            logging.info("Finding taxonomy for new sequences")
            rerooter = Rerooter()

            old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                                schema='newick')
            new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                                schema='newick')
            old_tree = rerooter.reroot(old_tree)
            new_tree = rerooter.reroot(new_tree)
            # TODO: Shouldn't call an underscore method, eventually use
            # Rerooter instead.
            rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            td = TreeDecorator(
                rerooted_tree,
                old_gpkg.taxtastic_taxonomy_path(),
                old_gpkg.taxtastic_seqinfo_path())

            # The decorated taxonomy is only needed long enough to be read
            # back into a hash, hence the temporary file.
            with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
                td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
                total_taxonomy_hash = GreenGenesTaxonomy.read_file(taxonomy.name).taxonomy

            ################################
            ### Generating tree log file ###
            logging.info("Generating phylogenetic tree log file")
            new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
            self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                         new_gpkg.hmm_alignment,
                                         new_gpkg.gpkg_tree,
                                         new_gpkg.gpkg_tree_log,
                                         new_gpkg.package_type,
                                         self.fasttree)

        ################################
        ### Creating taxtastic files ###
        logging.info("Writing new taxonomy files")
        new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
        new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
        gtns = Getaxnseq()

        gtns.write_taxonomy_and_seqinfo_files(
            total_taxonomy_hash,
            new_gpkg.tt_taxonomy,
            new_gpkg.tt_seqinfo)

        ######################
        ### Compile refpkg ###
        logging.info("Compiling pplacer refpkg")
        new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
        # The return value is not needed; the refpkg is referred to by path.
        self._taxit_create(new_gpkg.name,
                           new_gpkg.hmm_alignment,
                           new_gpkg.gpkg_tree,
                           new_gpkg.gpkg_tree_log,
                           new_gpkg.tt_taxonomy,
                           new_gpkg.tt_seqinfo,
                           new_gpkg.refpkg,
                           True)

        #####################################
        ### Re-construct diamond database ###
        logging.info("Recreating DIAMOND DB")
        new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
        self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

        ####################
        ### Compile gpkg ###
        logging.info("Compiling GraftM package")
        # From here on 'name' is the final package path, not the file prefix.
        new_gpkg.name = "%s.gpkg" % new_gpkg.name
        GraftMPackageVersion3.compile(new_gpkg.name, new_gpkg.refpkg,
                                      new_gpkg.hmm, new_gpkg.diamond_database,
                                      self._define_range(new_gpkg.unaligned_sequences),
                                      new_gpkg.unaligned_sequences,
                                      search_hmm_files=old_gpkg.search_hmm_paths())

        ###################
        ### Test it out ###
        logging.info("Testing newly updated GraftM package works")
        self._test_package(new_gpkg.name)

        logging.info("Finished")
Exemple #26
0
 def graftm_package(self):
     """Return the GraftM package, acquiring and caching it on first use."""
     if self.graftm_package_cache is not None:
         return self.graftm_package_cache
     package_path = self.graftm_package_path()
     self.graftm_package_cache = GraftMPackage.acquire(package_path)
     return self.graftm_package_cache
Exemple #27
0
 def graftm_package(self):
     """Lazily load the GraftM package and keep it for later calls."""
     already_loaded = self.graftm_package_cache is not None
     if not already_loaded:
         location = self.graftm_package_path()
         self.graftm_package_cache = GraftMPackage.acquire(location)
     return self.graftm_package_cache
Exemple #28
0
    def create(self, **kwargs):
        '''
        Create a SingleM package from an existing GraftM package.

        For protein packages the unaligned sequences are first filtered down
        to those that are leaves of the reference tree, and a new DIAMOND
        database is built from them.

        Parameters
        ----------
        input_graftm_package: str
            Path to the GraftM package to convert.
        output_singlem_package: str
            Path the SingleM package is written to.
        hmm_position:
            Passed through to SingleMPackageVersion2.compile.
        window_size:
            Passed through to SingleMPackageVersion2.compile.
        force: bool
            If true and the output path exists, it is removed first.

        Raises
        ------
        KeyError
            If one of the keyword arguments above is missing.
        Exception
            On unexpected keyword arguments, duplicate tree leaf or sequence
            names, or tree leaves absent from the unaligned sequences.
        '''
        input_graftm_package_path = kwargs.pop('input_graftm_package')
        output_singlem_package_path = kwargs.pop('output_singlem_package')
        hmm_position = kwargs.pop('hmm_position')
        window_size = kwargs.pop('window_size')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        if force and os.path.exists(output_singlem_package_path):
            shutil.rmtree(output_singlem_package_path)

        # For protein packages, remove sequences from diamond database that are
        # not in the tree so that hits can be mapped onto the tree and used for
        # alpha and beta diversity metrics.
        gpkg = GraftMPackage.acquire(input_graftm_package_path)
        is_protein_package = SingleMPackageVersion2.graftm_package_is_protein(gpkg)
        logging.info("Detected package type as %s" %
                     ('protein' if is_protein_package else 'nucleotide'))

        # Only created for protein packages; tracked here so that they are
        # closed (and so deleted) even when a later step raises.
        filtered_aligned_tempfile = None
        dmnd_tf = None
        try:
            if is_protein_package:
                tree_leaves = set()
                for node in dendropy.Tree.get(
                        path=gpkg.reference_package_tree_path(),
                        schema='newick').leaf_node_iter():
                    # need to replace here because otherwise they don't line up with the
                    # diamond database IDs
                    node_name = node.taxon.label.replace(' ','_')
                    if node_name in tree_leaves:
                        raise Exception("Found duplicate tree leaf name in graftm package "
                                        "tree. Currently this case is not handled, sorry")
                    tree_leaves.add(node_name)
                # Any one leaf serves as the example; None if the tree is empty.
                eg_name = next(iter(tree_leaves), None)
                logging.info("Read in %i tree tip names e.g. %s" % (
                    len(tree_leaves), eg_name))

                # Make a new fasta file of all the sequences that are leaves
                found_sequence_names = set()
                num_seqs_unaligned = 0
                # Text mode: FASTA records are written as str, whereas the
                # NamedTemporaryFile default is binary.
                filtered_aligned_tempfile = tempfile.NamedTemporaryFile(
                    prefix='singlem_package_creator',
                    suffix='.fasta',
                    mode='w')
                for s in SeqIO.parse(gpkg.unaligned_sequence_database_path(), "fasta"):
                    num_seqs_unaligned += 1
                    if s.id in tree_leaves:
                        if s.id in found_sequence_names:
                            raise Exception("Found duplicate sequence names in graftm unaligned"
                                            " sequence fasta file. Currently this case is not handled,"
                                            " sorry")
                        SeqIO.write([s], filtered_aligned_tempfile, "fasta")
                        found_sequence_names.add(s.id)
                # Make the records visible to diamond, which reads by file name.
                filtered_aligned_tempfile.flush()

                if len(tree_leaves) != len(found_sequence_names):
                    for t in tree_leaves:
                        if t not in found_sequence_names:
                            raise Exception("Found some sequences that were in the tree but not the"
                                            " unaligned sequences database e.g. %s. Something is"
                                            " likely amiss with the input GraftM package" % t)
                    raise Exception("Programming error, shouldn't get here")
                logging.info("All %i sequences found in tree extracted successfully from unaligned"
                             " sequences fasta file, which originally had %i sequences" % (
                                 len(found_sequence_names), num_seqs_unaligned))

                # Create a new diamond database
                dmnd_tf = tempfile.NamedTemporaryFile(prefix='singlem_package_creator',suffix='.dmnd')
                cmd = "diamond makedb --in '%s' -d '%s'" % (filtered_aligned_tempfile.name, dmnd_tf.name)
                logging.info("Creating DIAMOND database")
                extern.run(cmd)

            # Compile the final graftm/singlem package
            search_hmm_paths = gpkg.search_hmm_paths()
            if len(search_hmm_paths) == 1 and \
               search_hmm_paths[0] == gpkg.alignment_hmm_path():
                # The only search HMM is the alignment HMM itself.
                search_hmms = None
            else:
                search_hmms = search_hmm_paths

            with tempdir.TempDir() as tmpdir:
                gpkg_name = os.path.join(
                    tmpdir,
                    os.path.basename(
                        os.path.abspath(input_graftm_package_path)).replace('.gpkg',''))
                GraftMPackageVersion3.compile(gpkg_name,
                                              gpkg.reference_package_path(),
                                              gpkg.alignment_hmm_path(),
                                              dmnd_tf.name if is_protein_package else None,
                                              gpkg.maximum_range(),
                                              filtered_aligned_tempfile.name if is_protein_package else \
                                                  gpkg.unaligned_sequence_database_path(),
                                              gpkg.use_hmm_trusted_cutoff(),
                                              search_hmms)
                logging.debug("Finished creating GraftM package for conversion to SingleM package")

                SingleMPackageVersion2.compile(output_singlem_package_path,
                                               gpkg_name, hmm_position, window_size)

                shutil.rmtree(gpkg_name)

                logging.info("SingleM-compatible package creation finished")
        finally:
            for temporary in (filtered_aligned_tempfile, dmnd_tf):
                if temporary is not None:
                    temporary.close()
Exemple #29
0
    def update(self, **kwargs):
        '''
        Update an existing GraftM package with new sequences and taxonomy. If no
        taxonomy is provided, attempt to decorate the new sequences with
        pre-existing taxonomy.

        Parameters
        ----------
        input_sequence_path: str
            Path to FASTA file containing sequences to add to the update GraftM
            package
        input_taxonomy_path: str
            Taxonomy corresponding to the sequences in input_sequence_path. If None,
            then attempt to assign taxonomy by decorating the tree made out of all
            sequences.
        input_graftm_package_path: str
            Path to the directory of the GraftM package that is to be updated
        output_graftm_package_path: str
            Path to the directory to which the new GraftM package will be
            written to
        threads: int
            Optional. Handed to the alignment step; defaults to
            UpdateDefaultOptions.threads.

        Raises
        ------
        Exception
            If any keyword argument other than those listed above is given.
        InsufficientGraftMPackageVersion
            If the input package has a version below 3.
        '''
        input_sequence_path = kwargs.pop('input_sequence_path')
        input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
        input_graftm_package_path = kwargs.pop('input_graftm_package_path')
        output_graftm_package_path = kwargs.pop('output_graftm_package_path')
        threads = kwargs.pop(
            'threads',
            UpdateDefaultOptions.threads)  #TODO: add to user options
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Reading previous GraftM package")
        old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
        min_input_version = 3
        if old_gpkg.version < min_input_version:
            raise InsufficientGraftMPackageVersion(
                "GraftM below version %s cannot be updated using the update function."
                % min_input_version +
                " Unaligned sequences are not included in these packages, therefore no new"
                " alignment/HMM/Tree can be created")

        new_gpkg = UpdatedGraftMPackage()
        new_gpkg.output = output_graftm_package_path
        # All intermediate file names below are derived from this prefix.
        new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

        #######################################
        ### Collect all unaligned sequences ###
        logging.info("Concatenating unaligned sequence files")
        new_gpkg.unaligned_sequences = "%s_sequences.fa" % (
            new_gpkg.name
        )  #TODO: replace hard-coded paths like this with tempfiles
        self._concatenate_file(
            [old_gpkg.unaligned_sequence_database_path(), input_sequence_path],
            new_gpkg.unaligned_sequences)

        #########################################################
        ### Parse taxonomy info up front so errors come early ###
        if input_taxonomy_path:
            logging.info("Reading new taxonomy information")
            input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
            original_taxonomy_hash = old_gpkg.taxonomy_hash()
            # Entries from the new taxonomy override those of the old one.
            total_taxonomy_hash = original_taxonomy_hash.copy()
            total_taxonomy_hash.update(input_taxonomy.taxonomy)
            # Keys present in both hashes collapse into a single entry when
            # merged, so the overlap is how far the merged size falls short
            # of the sum of the two input sizes.
            num_duplicate_taxonomies = len(input_taxonomy.taxonomy) + \
                                       len(original_taxonomy_hash) - \
                                       len(total_taxonomy_hash)
            logging.debug(
                "Found %i taxonomic definitions in common between the previous and updated taxonomies"
                % num_duplicate_taxonomies)
            if num_duplicate_taxonomies > 0:
                logging.warning(
                    "Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case."
                    % num_duplicate_taxonomies)

        ###############################
        ### Re-construct alignments ###
        logging.info("Multiple sequence aligning all sequences")
        new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
        self._align_sequences(new_gpkg.unaligned_sequences,
                              new_gpkg.aligned_sequences, threads)

        ########################
        ### Re-construct HMM ###
        logging.info("Creating HMM from alignment")
        new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
        new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
        self._get_hmm_from_alignment(new_gpkg.aligned_sequences, new_gpkg.hmm,
                                     new_gpkg.hmm_alignment)

        #########################
        ### Re-construct tree ###
        logging.info("Generating phylogenetic tree")
        new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
        new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
        new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(
            old_gpkg.alignment_hmm_path())
        new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
            self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                             new_gpkg.package_type, self.fasttree)

        ##############################################
        ### Re-root and decorate tree if necessary ###
        if input_taxonomy_path:
            # Taxonomy was supplied, so the freshly built tree is used as is.
            new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
            new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
        else:
            logging.info("Finding taxonomy for new sequences")
            rerooter = Rerooter()

            old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                                schema='newick')
            new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                                schema='newick')
            old_tree = rerooter.reroot(old_tree)
            new_tree = rerooter.reroot(new_tree)
            # TODO: Shouldn't call an underscore method, eventually use
            # Rerooter instead.
            rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            td = TreeDecorator(rerooted_tree,
                               old_gpkg.taxtastic_taxonomy_path(),
                               old_gpkg.taxtastic_seqinfo_path())

            # The decorated taxonomy is only needed long enough to be read
            # back into a hash, hence the temporary file.
            with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
                td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
                total_taxonomy_hash = GreenGenesTaxonomy.read_file(
                    taxonomy.name).taxonomy

            ################################
            ### Generating tree log file ###
            logging.info("Generating phylogenetic tree log file")
            new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
            self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                         new_gpkg.hmm_alignment,
                                         new_gpkg.gpkg_tree,
                                         new_gpkg.gpkg_tree_log,
                                         new_gpkg.package_type, self.fasttree)

        ################################
        ### Creating taxtastic files ###
        logging.info("Writing new taxonomy files")
        new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
        new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
        gtns = Getaxnseq()

        gtns.write_taxonomy_and_seqinfo_files(total_taxonomy_hash,
                                              new_gpkg.tt_taxonomy,
                                              new_gpkg.tt_seqinfo)

        ######################
        ### Compile refpkg ###
        logging.info("Compiling pplacer refpkg")
        new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
        # The return value is not needed; the refpkg is referred to by path.
        self._taxit_create(new_gpkg.name, new_gpkg.hmm_alignment,
                           new_gpkg.gpkg_tree, new_gpkg.gpkg_tree_log,
                           new_gpkg.tt_taxonomy, new_gpkg.tt_seqinfo,
                           new_gpkg.refpkg, True)

        #####################################
        ### Re-construct diamond database ###
        logging.info("Recreating DIAMOND DB")
        new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
        self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

        ####################
        ### Compile gpkg ###
        logging.info("Compiling GraftM package")
        # From here on 'name' is the final package path, not the file prefix.
        new_gpkg.name = "%s.gpkg" % new_gpkg.name
        GraftMPackageVersion3.compile(
            new_gpkg.name,
            new_gpkg.refpkg,
            new_gpkg.hmm,
            new_gpkg.diamond_database,
            self._define_range(new_gpkg.unaligned_sequences),
            new_gpkg.unaligned_sequences,
            search_hmm_files=old_gpkg.search_hmm_paths())

        ###################
        ### Test it out ###
        logging.info("Testing newly updated GraftM package works")
        self._test_package(new_gpkg.name)

        logging.info("Finished")
Exemple #30
0
#!/usr/bin/env python2.7
import argparse
import os
import json
import itertools

import dendropy
from graftm.graftm_package import GraftMPackage

# Command line: the only input is the path of the GraftM package to inspect.
parser = argparse.ArgumentParser()
parser.add_argument('--graftm_package', help='package to look at', required=True)
args = parser.parse_args()

gpkg = GraftMPackage.acquire(args.graftm_package)

taxonomy_hash = gpkg.taxonomy_hash()

# Map every taxonomy prefix ('a', then 'a; b', then 'a; b; c', ...) to the
# names of all entries whose taxonomy starts with that prefix.
taxonomy_to_leaves = {}
for name, taxonomy in taxonomy_hash.items():
    for i in range(len(taxonomy)):
        tax = '; '.join(taxonomy[:(i+1)])
        taxonomy_to_leaves.setdefault(tax, []).append(name)

# The reference package's CONTENTS.json names the tree file under
# ['files']['tree']. Read it inside a context manager so the handle is closed
# straight away instead of being left to the garbage collector.
refpkg_contents = os.path.join(gpkg.reference_package_path(),'CONTENTS.json')
with open(refpkg_contents) as contents_fh:
    refpkg = json.load(contents_fh)
tree_file = os.path.join(gpkg.reference_package_path(),refpkg['files']['tree'])
tree = dendropy.Tree.get(path=tree_file, schema='newick')

print "\t".join([
Exemple #31
0
#!/usr/bin/env python2.7
import argparse
import os
import json
import itertools

import dendropy
from graftm.graftm_package import GraftMPackage

# Command line: the only input is the path of the GraftM package to inspect.
parser = argparse.ArgumentParser()
parser.add_argument('--graftm_package',
                    help='package to look at',
                    required=True)
args = parser.parse_args()

gpkg = GraftMPackage.acquire(args.graftm_package)

taxonomy_hash = gpkg.taxonomy_hash()

# Map every taxonomy prefix ('a', then 'a; b', then 'a; b; c', ...) to the
# names of all entries whose taxonomy starts with that prefix.
taxonomy_to_leaves = {}
for name, taxonomy in taxonomy_hash.items():
    for i in range(len(taxonomy)):
        tax = '; '.join(taxonomy[:(i + 1)])
        taxonomy_to_leaves.setdefault(tax, []).append(name)

# The reference package's CONTENTS.json names the tree file under
# ['files']['tree']. Read it inside a context manager so the handle is closed
# straight away instead of being left to the garbage collector.
refpkg_contents = os.path.join(gpkg.reference_package_path(), 'CONTENTS.json')
with open(refpkg_contents) as contents_fh:
    refpkg = json.load(contents_fh)
tree_file = os.path.join(gpkg.reference_package_path(),
                         refpkg['files']['tree'])
Exemple #32
0
    def set_attributes(self, args):
        """Derive the remaining search/alignment settings on ``args``, in place.

        ``merge_reads`` is always set. The reference database is then taken
        from the first of these that applies:

        1. ``args.graftm_package`` is truthy: the alignment HMM and reference
           package come from the GraftM package; its search HMMs are used
           unless ``args.search_hmm_files`` already exists.
        2. ``args`` has ``search_diamond_files``: requires the DIAMOND search
           method and an ``aln_hmm_file`` attribute.
        3. ``args`` has ``search_hmm_files``: requires the hmmsearch method; a
           single search HMM doubles as the alignment HMM unless
           ``args.search_only`` is set.
        4. ``args`` has ``search_hmm_list_file``: requires the hmmsearch
           method; the search HMMs are read from that file, one per line.
        5. Otherwise ``args.search_only`` must be set.

        Parameters
        ----------
        args : object
            Namespace-like object of parsed options. It is modified in place.

        Raises
        ------
        Exception
            If the GraftM package path is not a directory, if the search
            method does not match the kind of database supplied, if an
            alignment HMM is required but missing, or if no reference
            database was specified at all.
        """
        # Reads are merged only when a reverse read set was supplied and
        # merging has not been explicitly switched off.
        args.merge_reads = not args.no_merge_reads and bool(args.reverse)

        if args.graftm_package:
            if not os.path.isdir(args.graftm_package):
                raise Exception(
                    "%s does not exist. Are you sure you provided the correct path?"
                    % args.graftm_package)
            gpkg = GraftMPackage.acquire(args.graftm_package)
            # Search HMMs already present on args take precedence over the
            # ones shipped in the GraftM package.
            if not hasattr(args, 'search_hmm_files'):
                args.search_hmm_files = list(gpkg.search_hmm_paths())
            args.aln_hmm_file = gpkg.alignment_hmm_path()
            args.reference_package = gpkg.reference_package_path()

        # NOTE(review): this tests 'search_diamond_files' (plural) while the
        # final branch reads 'search_diamond_file' (singular) -- confirm
        # against the argument parser which spelling is intended.
        elif hasattr(args, 'search_diamond_files'):
            if args.search_method != self.DIAMOND_SEARCH_METHOD:
                raise Exception(
                    "Specified DIAMOND databases when not using the diamond search pipeline. Using: %s"
                    % (args.search_method))
            if not hasattr(args, 'aln_hmm_file'):
                raise Exception("aln_hmm_file not specified")

        elif hasattr(args, 'search_hmm_files'):
            if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
                raise Exception(
                    "Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s"
                    % (args.search_method))
            if not hasattr(args, 'aln_hmm_file'):
                if len(args.search_hmm_files) != 1:
                    raise Exception(
                        "Multiple search HMMs specified, but aln_hmm_file not specified"
                    )
                # With exactly one search HMM it is also used for alignment,
                # unless only the search step is being run.
                if not args.search_only:
                    args.aln_hmm_file = args.search_hmm_files[0]

        elif hasattr(args, 'search_hmm_list_file'):
            if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
                raise Exception(
                    "Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s"
                    % (args.search_method))
            # Context manager so the list file is closed once it is read.
            with open(args.search_hmm_list_file) as hmm_list:
                args.search_hmm_files = [line.rstrip() for line in hmm_list]
            if not hasattr(args, 'aln_hmm_file') and not args.search_only:
                raise Exception(
                    "Multiple search HMMs specified, but aln_hmm_file not specified"
                )

        else:
            if not args.search_only:
                raise Exception(
                    'No gpkg, HMM, or DIAMOND database was specified, so there is no reference database to search with.'
                )
            if args.search_diamond_file:
                args.search_method = self.DIAMOND_SEARCH_METHOD
                args.search_hmm_files = None
Exemple #33
0
    def regenerate(self, **kwargs):
        """Build a new SingleM package from an existing one plus new sequences.

        Steps, in order:

        1. Run ``graftM graft`` (search and align only) with the input
           package's GraftM package on ``euk_sequences`` and collect the hits.
        2. Write those hits, plus the species-dereplicated sequences of the
           two intermediate GraftM packages, into one FASTA file.
        3. Concatenate ``euk_taxonomy`` and ``input_taxonomy``.
        4. Run ``graftM create`` on the combined sequences and taxonomy.
        5. Restrict the new package's unaligned sequences to the leaves of its
           tree and regenerate its DIAMOND database.
        6. Compile the output SingleM package, reusing the input package's
           position and window size.

        Keyword arguments (all required; nothing else is accepted):
            input_singlem_package: path of the SingleM package to start from.
            output_singlem_package: where the new SingleM package is compiled.
            working_directory: directory that receives intermediate files.
            euk_sequences: sequence file passed to ``graftM graft --forward``.
            euk_taxonomy: taxonomy file concatenated before input_taxonomy.
            intermediate_archaea_graftm_package: GraftM package path.
            intermediate_bacteria_graftm_package: GraftM package path.
            input_taxonomy: taxonomy file concatenated after euk_taxonomy.
            type_strains_list_file: text file with one identifier per line.

        Raises:
            KeyError: if a required keyword argument is missing.
            Exception: on unexpected keyword arguments, if the eukaryotic
                search does not yield exactly one hit file, or if the number
                of sequences extracted differs from the number of tree leaves.
        """
        input_singlem_package = kwargs.pop('input_singlem_package')
        output_singlem_package = kwargs.pop('output_singlem_package')
        working_directory = kwargs.pop('working_directory')
        euk_sequences = kwargs.pop('euk_sequences')
        euk_taxonomy = kwargs.pop('euk_taxonomy')
        intermediate_archaea_graftm_package = kwargs.pop('intermediate_archaea_graftm_package')
        intermediate_bacteria_graftm_package = kwargs.pop('intermediate_bacteria_graftm_package')
        input_taxonomy = kwargs.pop('input_taxonomy')
        type_strains_list_file = kwargs.pop('type_strains_list_file')

        if kwargs:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        original_pkg = SingleMPackage.acquire(input_singlem_package)
        original_hmm_path = original_pkg.hmm_path()
        basename = original_pkg.graftm_package_basename()

        # Run GraftM on the euk sequences with the bacterial set
        euk_graftm_output = os.path.join(working_directory,
                                         "%s-euk_graftm" % basename)
        cmd = "graftM graft --graftm_package '%s' --search_and_align_only --forward '%s' --output %s --force" % (
            original_pkg.graftm_package_path(),
            euk_sequences,
            euk_graftm_output)
        extern.run(cmd)

        # Extract hit sequences from that set
        euk_result = GraftMResult(euk_graftm_output)
        hit_paths = euk_result.unaligned_sequence_paths(require_hits=True)
        if len(hit_paths) != 1:
            raise Exception(
                "Unexpected number of hits against euk in graftm")
        # values() is not indexable on Python 3 dict views; next(iter(...))
        # fetches the single entry on both Python 2 and 3.
        euk_hits_path = next(iter(hit_paths.values()))

        # Concatenate euk, archaea and bacterial sequences
        archaeal_intermediate_pkg = GraftMPackage.acquire(
            intermediate_archaea_graftm_package)
        bacterial_intermediate_pkg = GraftMPackage.acquire(
            intermediate_bacteria_graftm_package)
        num_euk_hits = 0
        final_sequences_path = os.path.join(working_directory,
                                            "%s_final_sequences.faa" % basename)
        with open(type_strains_list_file) as f:
            type_strain_identifiers = [s.strip() for s in f]
        # Guard the example ID so an empty list file does not crash the run
        # merely while formatting a log message.
        logging.info("Read in %i type strain IDs e.g. %s" % (
            len(type_strain_identifiers),
            type_strain_identifiers[0] if type_strain_identifiers else None))

        with open(final_sequences_path, 'w') as final_seqs_fp:
            with open(euk_hits_path) as euk_seqs_fp:
                for name, seq, _ in SeqReader().readfq(euk_seqs_fp):
                    # Sequences whose name contains '_split_' are left out.
                    if '_split_' not in name:
                        num_euk_hits += 1
                        #TODO: Dereplicate at some level
                        final_seqs_fp.write(">%s\n%s\n" % (name, seq))
            logging.info("Found %i eukaryotic sequences to include in the package" % \
                         num_euk_hits)

            # Dereplicate hit sequences on the species level, choosing type strains
            # where applicable.
            dereplicator = Dereplicator()
            for gpkg in [archaeal_intermediate_pkg, bacterial_intermediate_pkg]:
                tax = gpkg.taxonomy_hash()
                species_dereplicated_ids = dereplicator.dereplicate(
                    list(tax.keys()),
                    8, # root, kingdom, phylum, c o f g s
                    tax,
                    type_strain_identifiers)
                logging.debug("Dereplicator returned %i entries" % len(species_dereplicated_ids))
                # A set gives constant-time membership tests in the
                # per-sequence loop below.
                ids_to_keep = set(species_dereplicated_ids)
                num_total = 0
                num_written = 0
                with open(gpkg.unaligned_sequence_database_path()) as seqs:
                    for name, seq, _ in SeqReader().readfq(seqs):
                        num_total += 1
                        if name in ids_to_keep:
                            final_seqs_fp.write(">%s\n%s\n" % (name, seq))
                            num_written += 1
                logging.info(
                    "Of %i sequences in gpkg %s, %i species-dereplicated were included in the final package." %(
                        num_total, gpkg, num_written))

        # Concatenate euk and input taxonomy
        # NOTE(review): the paths are interpolated into shell commands here
        # and below without quoting, so paths containing spaces or shell
        # metacharacters will break -- confirm callers never pass such paths.
        final_taxonomy_file = os.path.join(working_directory,
                                            "%s_final_taxonomy.csv" % basename)
        extern.run("cat %s %s > %s" % (
            euk_taxonomy, input_taxonomy, final_taxonomy_file))

        # Run graftm create to get the final package
        final_gpkg = os.path.join(working_directory,
                                  "%s_final.gpkg" % basename)
        cmd = "graftM create --force --sequences %s --taxonomy %s --search_hmm_files %s %s --hmm %s --output %s" % (
            final_sequences_path,
            final_taxonomy_file,
            ' '.join(archaeal_intermediate_pkg.search_hmm_paths()),
            ' '.join(bacterial_intermediate_pkg.search_hmm_paths()),
            original_hmm_path,
            final_gpkg)
        extern.run(cmd)

        ##############################################################################
        # Remove sequences from the diamond DB that are not in the tree i.e.
        # those that are exact duplicates, so that the diamond_example hits are
        # always in the tree.
        # Read the list of IDs in the tree with dendropy
        final_gpkg_object = GraftMPackage.acquire(final_gpkg)
        unaligned_seqs = final_gpkg_object.unaligned_sequence_database_path()
        tree = dendropy.Tree.get(path=final_gpkg_object.reference_package_tree_path(),
                                 schema='newick')
        # Spaces in the tree's taxon labels are turned back into underscores
        # before the labels are used as sequence names.
        leaf_names = [l.taxon.label.replace(' ','_') for l in tree.leaf_node_iter()]
        logging.debug("Read in final tree with %i leaves" % len(leaf_names))

        # Extract out of the sequences file in the graftm package
        final_seqs = SequenceExtractor().extract_and_read(
            leaf_names, unaligned_seqs)
        if len(final_seqs) != len(leaf_names):
            raise Exception("Do not appear to have extracted the expected number of sequences from the unaligned fastat file")

        # Write the reads into sequences file in place
        with open(unaligned_seqs, 'w') as f:
            for s in final_seqs:
                f.write(">%s\n" % s.name)
                f.write(s.seq)
                f.write("\n")

        # Regenerate the diamond DB
        final_gpkg_object.create_diamond_db()

        ##############################################################################
        # Run singlem create to put the final package together
        SingleMPackageVersion2.compile(
            output_singlem_package,
            final_gpkg,
            original_pkg.singlem_position(),
            original_pkg.window_size())
        logging.info("SingleM package generated.")