def test_hello_world(self):
    """Run `graftM update` via the CLI on a 10-sequence mcrA package plus one
    extra sequence, and check the updated package gained exactly one taxon and
    one unaligned sequence."""
    with tempdir.in_tempdir():
        with tempfile.NamedTemporaryFile() as fasta, \
                tempfile.NamedTemporaryFile() as tax:
            fasta.write(Tests.extra_mcra_fasta)
            fasta.flush()
            tax.write(Tests.extra_mcra_taxonomy)
            tax.flush()

            prev_path = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
            cmd1 = "%s update --graftm_package %s --sequences %s --taxonomy %s --output %s" % (
                path_to_script, prev_path, fasta.name, tax.name, 'updated.gpkg')
            extern.run(cmd1)

            prev = GraftMPackage.acquire(prev_path)
            up = GraftMPackage.acquire('updated.gpkg')
            prevhash = prev.taxonomy_hash()
            taxhash = up.taxonomy_hash()
            # Exactly one new taxonomy entry, with the supplied lineage.
            self.assertEqual(len(prevhash) + 1, len(taxhash))
            self.assertEqual(['mcrA', 'Euryarchaeota_mcrA', 'Methanofastidiosa'],
                             taxhash['KYC55281.1'])
            # Pre-existing entries are untouched.
            self.assertEqual(prevhash['638165755'], taxhash['638165755'])
            seqio = SequenceIO()
            self.assertEqual(
                len(seqio.read_fasta_file(prev.unaligned_sequence_database_path())) + 1,
                len(seqio.read_fasta_file(up.unaligned_sequence_database_path())))
def test_is_protein_package(self):
    """is_protein_package() answers correctly for protein and nucleotide packages.

    Each package is queried twice; the repeated call checks the answer is
    stable on re-query (e.g. if the result is cached internally).
    """
    protein_pkg = GraftMPackage.acquire(os.path.join(path_to_data, 'mcrA.gpkg'))
    for _ in range(2):
        self.assertEqual(True, protein_pkg.is_protein_package())

    nucleotide_pkg = GraftMPackage.acquire(os.path.join(path_to_data, '61_otus.gpkg'))
    for _ in range(2):
        self.assertEqual(False, nucleotide_pkg.is_protein_package())
def test_autodecorate(self):
    """Update a package with a new sequence but no taxonomy, and check the new
    sequence's taxonomy is auto-decorated from its placement in the tree."""
    with tempdir.in_tempdir():
        with tempfile.NamedTemporaryFile() as fasta:
            fasta.write(Tests.extra_mcra_fasta)
            fasta.flush()

            prev_path = os.path.join(path_to_data, 'mcrA.10seqs.gpkg')
            update = Update(prerequisites)
            update.update(
                input_sequence_path=fasta.name,
                input_graftm_package_path=prev_path,
                output_graftm_package_path='updated.gpkg')

            prev = GraftMPackage.acquire(prev_path)
            up = GraftMPackage.acquire('updated.gpkg')
            prevhash = prev.taxonomy_hash()
            taxhash = up.taxonomy_hash()
            # hard-code 11 because of
            # https://github.com/geronimp/graftM/issues/204
            self.assertEqual(11, len(taxhash))
            self.assertEqual(['mcrA', 'Euryarchaeota_mcrA', 'Methanomicrobia'],
                             taxhash['KYC55281.1'])
            self.assertEqual(prevhash['638165755'], taxhash['638165755'])
            seqio = SequenceIO()
            self.assertEqual(
                len(seqio.read_fasta_file(prev.unaligned_sequence_database_path())) + 1,
                len(seqio.read_fasta_file(up.unaligned_sequence_database_path())))
def test_acquire(self):
    """GraftMPackage.acquire() exposes the expected paths and flags for the
    test mcrA package."""
    pkg = GraftMPackage.acquire(os.path.join(path_to_data, 'mcrA.gpkg'))
    expected_hmm = os.path.join(path_to_data, 'mcrA.gpkg', 'mcrA.hmm')
    self.assertEqual(expected_hmm, pkg.alignment_hmm_path())
    self.assertEqual(False, pkg.use_hmm_trusted_cutoff())
    expected_tree = os.path.join(
        path_to_data, 'mcrA.gpkg', 'mcrA.refpkg', 'treeoN87dL.tre')
    self.assertEqual(expected_tree, pkg.reference_package_tree_path())
def test_input_unrooted_tree(self):
    """Create() accepts a user-supplied unrooted tree and the resulting
    package's tree has the expected number of leaves."""
    otu61 = os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg')
    with tempfile.NamedTemporaryFile(suffix='.fa') as bad_alignment:
        with tempdir.TempDir() as tmp:
            Create(prerequisites).main(
                taxtastic_taxonomy=os.path.join(otu61, '61_otus_taxonomy.csv'),
                taxtastic_seqinfo=os.path.join(otu61, '61_otus_seqinfo.csv'),
                # created with newick_utils:
                # nw_prune test/data/61_otus.gpkg/61_otus.refpkg/61_otus.tre 4459468 >test/data/61_otus.without_4459468.tre
                unrooted_tree=os.path.join(
                    path_to_data, 'create', '61_otus.without_4459468.tre'),
                sequences=os.path.join(
                    path_to_data, 'create', '61_otus.without_4459468.fasta'),
                alignment=os.path.join(
                    path_to_data, 'create', '61_otus.without_4459468.aln.fasta'),
                prefix=tmp,
                force=True)
            gpkg = GraftMPackage.acquire(tmp)
            tree = Tree.get(
                schema='newick',
                data=open(gpkg.reference_package_tree_path()).readline())
            self.assertEqual(21, len(tree.leaf_nodes()))
def compile(output_package_path, graftm_package_path, singlem_position):
    '''Create a new SingleM package with the given inputs. Any files specified
    as parameters are copied into the final package so can be removed after
    calling this function.

    Parameters
    ----------
    output_package_path: str
        path to the package being created (must not exist)
    graftm_package_path: str
        path to graftm package internal to the singlem package
    singlem_position: int
        the position in the HMM where the SingleM window starts

    Returns
    -------
    Nothing
    '''
    if os.path.exists(output_package_path):
        raise Exception(
            "Not writing new SingleM package to already existing file/directory with name %s"
            % output_package_path)
    os.mkdir(output_package_path)

    graftm_package = GraftMPackage.acquire(graftm_package_path)
    if graftm_package.version != 3:
        raise Exception(
            "SingleM packages can only be created from version 3 GraftM packages at this point."
        )

    # Use abspath before basename so that trailing slashes do not yield an
    # empty basename (which would make copytree target the package root).
    graftm_package_basename = os.path.basename(
        os.path.abspath(output_package_path).replace('.spkg', '').replace(
            '.gpkg', ''))
    logging.info("Using GraftM package name %s" % graftm_package_basename)
    if graftm_package_basename == SingleMPackage._CONTENTS_FILE_NAME:
        raise Exception("Name of GraftM package cannot be %s" %
                        SingleMPackage._CONTENTS_FILE_NAME)
    shutil.copytree(
        graftm_package_path,
        os.path.join(output_package_path, graftm_package_basename))

    singlem_package = SingleMPackageVersion1()
    singlem_package._contents_hash = {
        SingleMPackage.VERSION_KEY: singlem_package.version,
        SingleMPackage.GRAFTM_PACKAGE_KEY: graftm_package_basename,
        SingleMPackage.SINGLEM_POSITION_KEY: singlem_position
    }
    singlem_package._base_directory = output_package_path

    # calculate the sha256 values
    singlem_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
        singlem_package.calculate_alignment_hmm_sha256()
    singlem_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
        singlem_package.calculate_singlem_package_sha256()

    # save contents file; use a context manager so the handle is flushed and
    # closed deterministically (the previous bare open() was never closed)
    with open(
            os.path.join(output_package_path,
                         SingleMPackage._CONTENTS_FILE_NAME), 'w') as f:
        json.dump(singlem_package._contents_hash, f)
def test_hello_world_diamond(self):
    """An expand-search database can be generated from contigs using the
    diamond backend."""
    gpkg = os.path.join(path_to_data, "bootstrapper", "D1_gpkg_for_diamond.gpkg")
    hmm = os.path.join(path_to_data, 'bootstrapper', 'DNGNGWU00001.hmm')
    expandsearcher = ExpandSearcher(
        search_hmm_files=[hmm],
        evalue='1e-5',
        maximum_range=1000,
        threads=1,
        graftm_package=GraftMPackage.acquire(gpkg))
    contigs = os.path.join(
        path_to_data, 'bootstrapper', 'diamond_bootstrap_contigs.fna')
    with tempfile.NamedTemporaryFile() as tf:
        self.assertEqual(
            True,
            expandsearcher.generate_expand_search_database_from_contigs(
                [contigs], tf.name, "diamond"))
def compile(output_package_path, graftm_package_path, singlem_position,
            window_size):
    """Create a new version-2 SingleM package from a version 3 GraftM package.

    The GraftM package is copied into the new package directory, and a
    contents JSON (including sha256 checksums) is written alongside it.

    Parameters
    ----------
    output_package_path: str
        path to the package being created (must not exist)
    graftm_package_path: str
        path to graftm package internal to the singlem package
    singlem_position: int
        the position in the HMM where the SingleM window starts
    window_size: int
        window length in base pairs (must be divisible by 3 for protein
        packages)
    """
    if os.path.exists(output_package_path):
        raise Exception(
            "Not writing new SingleM package to already existing file/directory with name %s"
            % output_package_path)
    os.mkdir(output_package_path)

    graftm_package = GraftMPackage.acquire(graftm_package_path)
    if graftm_package.version != 3:
        raise Exception(
            "SingleM packages can only be created from version 3 GraftM packages at this point."
        )

    # Use abspath before basename so that trailing slashes are dealt with.
    stripped = os.path.abspath(output_package_path).replace('.spkg', '')
    graftm_package_basename = os.path.basename(stripped.replace('.gpkg', ''))
    logging.info("Using GraftM package name %s" % graftm_package_basename)
    if graftm_package_basename == SingleMPackage._CONTENTS_FILE_NAME:
        raise Exception("Name of GraftM package cannot be %s" %
                        SingleMPackage._CONTENTS_FILE_NAME)
    shutil.copytree(
        graftm_package_path,
        os.path.join(output_package_path, graftm_package_basename))

    singlem_package = SingleMPackageVersion2()
    singlem_package._contents_hash = {
        SingleMPackage.VERSION_KEY: singlem_package.version,
        SingleMPackage.GRAFTM_PACKAGE_KEY: graftm_package_basename,
        SingleMPackage.SINGLEM_POSITION_KEY: singlem_position,
        SingleMPackage.SINGLEM_WINDOW_SIZE_KEY: window_size
    }
    singlem_package._base_directory = output_package_path

    if singlem_package.is_protein_package() and window_size % 3 != 0:
        raise Exception(
            "For protein packages, the window size must be specified in base pairs. However, the window_size specified is not divisible by 3."
        )

    # calculate the sha256 values
    singlem_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
        singlem_package.calculate_alignment_hmm_sha256()
    singlem_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
        singlem_package.calculate_singlem_package_sha256()

    # save contents file
    with open(
            os.path.join(output_package_path,
                         SingleMPackage._CONTENTS_FILE_NAME), 'w') as f:
        json.dump(singlem_package._contents_hash, f)
def compile(output_package_path, graftm_package_path, singlem_position):
    '''Create a new SingleM package with the given inputs. Any files specified
    as parameters are copied into the final package so can be removed after
    calling this function.

    Parameters
    ----------
    output_package_path: str
        path to the package being created (must not exist)
    graftm_package_path: str
        path to graftm package internal to the singlem package
    singlem_position: int
        the position in the HMM where the SingleM window starts

    Returns
    -------
    Nothing
    '''
    if os.path.exists(output_package_path):
        raise Exception(
            "Not writing new SingleM package to already existing file/directory with name %s"
            % output_package_path)
    os.mkdir(output_package_path)

    graftm_package = GraftMPackage.acquire(graftm_package_path)
    if graftm_package.version != 3:
        raise Exception(
            "SingleM packages can only be created from version 3 GraftM packages at this point.")

    # Use abspath before basename so that a trailing slash does not produce an
    # empty basename (which would make copytree target the package root).
    graftm_package_basename = os.path.basename(
        os.path.abspath(output_package_path).replace('.spkg', '').replace('.gpkg', ''))
    logging.info("Using GraftM package name %s" % graftm_package_basename)
    if graftm_package_basename == SingleMPackage._CONTENTS_FILE_NAME:
        raise Exception("Name of GraftM package cannot be %s" %
                        SingleMPackage._CONTENTS_FILE_NAME)
    shutil.copytree(graftm_package_path,
                    os.path.join(output_package_path, graftm_package_basename))

    singlem_package = SingleMPackageVersion1()
    singlem_package._contents_hash = {
        SingleMPackage.VERSION_KEY: singlem_package.version,
        SingleMPackage.GRAFTM_PACKAGE_KEY: graftm_package_basename,
        SingleMPackage.SINGLEM_POSITION_KEY: singlem_position
    }
    singlem_package._base_directory = output_package_path

    # calculate the sha256 values
    singlem_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
        singlem_package.calculate_alignment_hmm_sha256()
    singlem_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
        singlem_package.calculate_singlem_package_sha256()

    # save contents file; context manager flushes and closes the handle
    # (previously a bare open() was passed to json.dump and never closed)
    with open(os.path.join(output_package_path,
                           SingleMPackage._CONTENTS_FILE_NAME), 'w') as f:
        json.dump(singlem_package._contents_hash, f)
def test_input_unrooted_tree(self):
    """Package creation from a pre-pruned unrooted tree yields a 21-leaf tree."""
    otu61 = os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg')
    create_dir = os.path.join(path_to_data, 'create')
    with tempfile.NamedTemporaryFile(suffix='.fa') as bad_alignment:
        with tempdir.TempDir() as tmp:
            Create(prerequisites).main(
                taxtastic_taxonomy=os.path.join(otu61, '61_otus_taxonomy.csv'),
                taxtastic_seqinfo=os.path.join(otu61, '61_otus_seqinfo.csv'),
                # created with newick_utils:
                # nw_prune test/data/61_otus.gpkg/61_otus.refpkg/61_otus.tre 4459468 >test/data/61_otus.without_4459468.tre
                unrooted_tree=os.path.join(create_dir, '61_otus.without_4459468.tre'),
                sequences=os.path.join(create_dir, '61_otus.without_4459468.fasta'),
                alignment=os.path.join(create_dir, '61_otus.without_4459468.aln.fasta'),
                prefix=tmp,
                force=True)
            gpkg = GraftMPackage.acquire(tmp)
            first_line = open(gpkg.reference_package_tree_path()).readline()
            tree = Tree.get(schema='newick', data=first_line)
            self.assertEqual(21, len(tree.leaf_nodes()))
def test_hello_world_diamond(self):
    """generate_expand_search_database_from_contigs() succeeds with the
    diamond backend on the bootstrap test data."""
    bootstrap_dir = os.path.join(path_to_data, "bootstrapper")
    gpkg = os.path.join(bootstrap_dir, "D1_gpkg_for_diamond.gpkg")
    expandsearcher = ExpandSearcher(
        search_hmm_files=[os.path.join(bootstrap_dir, 'DNGNGWU00001.hmm')],
        evalue='1e-5',
        maximum_range=1000,
        threads=1,
        graftm_package=GraftMPackage.acquire(gpkg))
    with tempfile.NamedTemporaryFile() as tf:
        result = expandsearcher.generate_expand_search_database_from_contigs(
            [os.path.join(bootstrap_dir, 'diamond_bootstrap_contigs.fna')],
            tf.name,
            "diamond")
        self.assertEqual(True, result)
def test_remove_strange_characters_integration_test(self):
    """Create() on sequences containing non-ACGT characters (R) produces a
    package whose alignment contains no R characters."""
    with tempdir.TempDir() as tmp:
        gpkg = tmp + ".gpkg"
        first_seq = None
        refpkg = os.path.join(path_to_data, '61_otus.gpkg', '61_otus.refpkg')
        with tempfile.NamedTemporaryFile(suffix='61_otus.Rs.fasta', mode='w') as f:
            source_fasta = os.path.join(path_to_data, 'create', '61_otus.fasta')
            for record in SeqIO.parse(open(source_fasta), 'fasta'):
                if not first_seq:
                    first_seq = str(record.seq)
                # don't replace too many otherwise hmmbuild fails
                record.seq = Seq(str(record.seq).replace('A', 'R', 5))
                SeqIO.write(record, f, 'fasta')
            f.flush()
            Create(prerequisites).main(
                sequences=f.name,
                taxtastic_taxonomy=os.path.join(refpkg, '61_otus_taxonomy.csv'),
                taxtastic_seqinfo=os.path.join(refpkg, '61_otus_seqinfo.csv'),
                alignment=os.path.join(refpkg, '61_otus.aln.fa'),
                prefix=gpkg,
                threads=5)
            pkg = GraftMPackage.acquire(gpkg)
            self.assertEqual('NAME 61_otus.aln\n',
                             open(pkg.alignment_hmm_path()).readlines()[1])
            self.assertEqual(pkg.diamond_database_path(), None)
            for record in SeqIO.parse(open(pkg.alignment_fasta_path()), 'fasta'):
                # Any R should have been converted, so replacing R with N is a
                # no-op on the stored alignment.
                self.assertEqual(str(record.seq).replace('R', 'N'),
                                 str(record.seq))
                break
def _test_package(self, package_path):
    '''Give a GraftM package a spin, and see if it works in reality with
    default parameters (i.e. pplacer). If it does not work, then raise an
    error.

    Parameters
    ----------
    package_path: str
        path to graftm_package to be tested
    '''
    pkg = GraftMPackage.acquire(package_path)
    with tempdir.TempDir() as graftM_graft_test_dir_name:
        # Take a subset of sequences for testing.
        # mode='w' because write_fasta emits text, and the handle on the
        # unaligned sequence database is context-managed so it is closed
        # (previously a bare open() leaked the file handle).
        with tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as tf:
            seqio = SequenceIO()
            with open(pkg.unaligned_sequence_database_path()) as db:
                seqio.write_fasta(
                    itertools.islice(seqio.each_sequence(db), 10), tf)
            tf.flush()
            cmd = "graftM graft --forward %s --graftm_package %s --output_directory %s --force" % (
                tf.name, package_path, graftM_graft_test_dir_name)
            extern.run(cmd)
def _test_package(self, package_path):
    '''Give a GraftM package a spin, and see if it works in reality with
    default parameters (i.e. pplacer). If it does not work, then raise an
    error.

    Parameters
    ----------
    package_path: str
        path to graftm_package to be tested
    '''
    pkg = GraftMPackage.acquire(package_path)
    with tempdir.TempDir() as graftM_graft_test_dir_name:
        # Take a subset of sequences for testing
        with tempfile.NamedTemporaryFile(suffix=".fa", mode='w') as tf:
            seqio = SequenceIO()
            with open(pkg.unaligned_sequence_database_path()) as f:
                first_ten = itertools.islice(seqio.each_sequence(f), 10)
                seqio.write_fasta(first_ten, tf)
            tf.flush()
            cmd = "graftM graft --forward %s --graftm_package %s --output_directory %s --force" % (
                tf.name, package_path, graftM_graft_test_dir_name)
            extern.run(cmd)
def compile(output_package_path, graftm_package_path, singlem_position,
            window_size):
    """Create a new version-2 SingleM package wrapping a version 3 GraftM
    package, writing a contents JSON with sha256 checksums.

    Parameters
    ----------
    output_package_path: str
        path to the package being created (must not exist)
    graftm_package_path: str
        path to graftm package internal to the singlem package
    singlem_position: int
        position in the HMM where the SingleM window starts
    window_size: int
        window length in base pairs; must be divisible by 3 for protein
        packages
    """
    if os.path.exists(output_package_path):
        raise Exception(
            "Not writing new SingleM package to already existing file/directory with name %s"
            % output_package_path)
    os.mkdir(output_package_path)

    graftm_package = GraftMPackage.acquire(graftm_package_path)
    if graftm_package.version != 3:
        raise Exception(
            "SingleM packages can only be created from version 3 GraftM packages at this point.")

    # Use abspath before basename so that trailing slashes are dealt with.
    absolute = os.path.abspath(output_package_path)
    graftm_package_basename = os.path.basename(
        absolute.replace('.spkg', '').replace('.gpkg', ''))
    logging.info("Using GraftM package name %s" % graftm_package_basename)
    if graftm_package_basename == SingleMPackage._CONTENTS_FILE_NAME:
        raise Exception("Name of GraftM package cannot be %s" %
                        SingleMPackage._CONTENTS_FILE_NAME)
    shutil.copytree(graftm_package_path,
                    os.path.join(output_package_path, graftm_package_basename))

    singlem_package = SingleMPackageVersion2()
    singlem_package._contents_hash = {
        SingleMPackage.VERSION_KEY: singlem_package.version,
        SingleMPackage.GRAFTM_PACKAGE_KEY: graftm_package_basename,
        SingleMPackage.SINGLEM_POSITION_KEY: singlem_position,
        SingleMPackage.SINGLEM_WINDOW_SIZE_KEY: window_size
    }
    singlem_package._base_directory = output_package_path

    if singlem_package.is_protein_package() and window_size % 3 != 0:
        raise Exception(
            "For protein packages, the window size must be specified in base pairs. However, the window_size specified is not divisible by 3.")

    # calculate the sha256 values
    singlem_package._contents_hash[SingleMPackage.ALIGNMENT_HMM_SHA256_KEY] = \
        singlem_package.calculate_alignment_hmm_sha256()
    singlem_package._contents_hash[SingleMPackage.SINGLEM_PACKAGE_SHA256_KEY] = \
        singlem_package.calculate_singlem_package_sha256()

    # save contents file
    with open(os.path.join(output_package_path,
                           SingleMPackage._CONTENTS_FILE_NAME), 'w') as f:
        json.dump(singlem_package._contents_hash, f)
def test_remove_strange_characters_integration_test(self):
    """Create() on sequences containing non-ACGT characters (R) produces a
    package whose stored alignment contains no R characters."""
    with tempdir.TempDir() as tmp:
        gpkg = tmp + ".gpkg"
        first_seq = None
        # mode='w' is required: SeqIO.write() emits str, and the default
        # NamedTemporaryFile mode is binary ('w+b'), which would raise
        # TypeError on Python 3.
        with tempfile.NamedTemporaryFile(suffix='61_otus.Rs.fasta', mode='w') as f:
            for record in SeqIO.parse(
                    open(os.path.join(path_to_data, 'create', '61_otus.fasta')),
                    'fasta'):
                if not first_seq:
                    first_seq = str(record.seq)
                # don't replace too many otherwise hmmbuild fails
                record.seq = Seq(str(record.seq).replace('A', 'R', 5))
                SeqIO.write(record, f, 'fasta')
            f.flush()
            Create(prerequisites).main(
                sequences=f.name,
                taxtastic_taxonomy=os.path.join(
                    path_to_data, '61_otus.gpkg', '61_otus.refpkg',
                    '61_otus_taxonomy.csv'),
                taxtastic_seqinfo=os.path.join(
                    path_to_data, '61_otus.gpkg', '61_otus.refpkg',
                    '61_otus_seqinfo.csv'),
                alignment=os.path.join(
                    path_to_data, '61_otus.gpkg', '61_otus.refpkg',
                    '61_otus.aln.fa'),
                prefix=gpkg,
                threads=5)
            pkg = GraftMPackage.acquire(gpkg)
            self.assertEqual('NAME 61_otus.aln\n',
                             open(pkg.alignment_hmm_path()).readlines()[1])
            self.assertEqual(pkg.diamond_database_path(), None)
            for record in SeqIO.parse(open(pkg.alignment_fasta_path()), 'fasta'):
                self.assertEqual(str(record.seq).replace('R', 'N'),
                                 str(record.seq))
                break
def set_attributes(self, args):
    """Derive and attach search/alignment attributes onto the parsed args.

    Resolves merge_reads, and then — depending on whether a GraftM package,
    DIAMOND database(s), HMM file(s), or an HMM list file was provided —
    populates search_hmm_files, aln_hmm_file and reference_package, raising
    when the combination is inconsistent with the chosen search method.
    """
    # Reads are merged only when reverse reads exist and merging was not
    # explicitly disabled.
    if args.no_merge_reads:
        args.merge_reads = False
    else:
        args.merge_reads = True if args.reverse else False

    if args.graftm_package:
        # Read graftM package and assign HMM and refpkg file
        if not os.path.isdir(args.graftm_package):
            raise Exception(
                "%s does not exist. Are you sure you provided the correct path?"
                % args.graftm_package)
        gpkg = GraftMPackage.acquire(args.graftm_package)
        if not hasattr(args, 'search_hmm_files'):
            # No hmm specified: take the search HMMs from the package.
            args.search_hmm_files = []
            for hmm in gpkg.search_hmm_paths():
                args.search_hmm_files.append(hmm)
        # If a hmm is specified, overwrite the one graftM package
        args.aln_hmm_file = gpkg.alignment_hmm_path()
        args.reference_package = gpkg.reference_package_path()

    elif hasattr(args, 'search_diamond_files'):
        if args.search_method == self.DIAMOND_SEARCH_METHOD:
            if not hasattr(args, 'aln_hmm_file'):
                raise Exception("aln_hmm_file not specified")
        else:
            raise Exception(
                "Specified DIAMOND databases when not using the diamond search pipeline. Using: %s"
                % (args.search_method))

    elif hasattr(args, 'search_hmm_files'):
        if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
            raise Exception(
                "Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s"
                % (args.search_method))
        if not hasattr(args, 'aln_hmm_file'):
            if len(args.search_hmm_files) == 1:
                if not args.search_only:
                    args.aln_hmm_file = args.search_hmm_files[0]
            else:
                raise Exception(
                    "Multiple search HMMs specified, but aln_hmm_file not specified")

    elif hasattr(args, 'search_hmm_list_file'):
        if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
            raise Exception(
                "Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s"
                % (args.search_method))
        args.search_hmm_files = [
            x.rstrip() for x in open(args.search_hmm_list_file).readlines()]
        if not hasattr(args, 'aln_hmm_file'):
            if not args.search_only:
                raise Exception(
                    "Multiple search HMMs specified, but aln_hmm_file not specified")

    else:
        if args.search_only:
            if args.search_diamond_file:
                args.search_method = self.DIAMOND_SEARCH_METHOD
                args.search_hmm_files = None
            else:
                raise Exception(
                    'No gpkg, HMM, or DIAMOND database was specified, so there is no reference database to search with.')
def create(self, **kwargs):
    """Build a SingleM-compatible package from a GraftM package.

    For protein packages, sequences absent from the reference tree are
    filtered out of the DIAMOND database so every DIAMOND hit can be mapped
    onto the tree (needed for alpha/beta diversity metrics). The filtered
    GraftM package is then recompiled and wrapped into a version-2 SingleM
    package.
    """
    input_graftm_package_path = kwargs.pop('input_graftm_package')
    output_singlem_package_path = kwargs.pop('output_singlem_package')
    hmm_position = kwargs.pop('hmm_position')
    window_size = kwargs.pop('window_size')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    if force and os.path.exists(output_singlem_package_path):
        shutil.rmtree(output_singlem_package_path)

    # For protein packages, remove sequences from diamond database that are
    # not in the tree so that hits can be mapped onto the tree and used for
    # alpha and beta diversity metrics.
    gpkg = GraftMPackage.acquire(input_graftm_package_path)
    is_protein_package = SingleMPackageVersion2.graftm_package_is_protein(gpkg)
    logging.info("Detected package type as %s" %
                 ('protein' if is_protein_package else 'nucleotide'))

    if is_protein_package:
        tree_leaves = set()
        tree = dendropy.Tree.get(
            path=gpkg.reference_package_tree_path(), schema='newick')
        for node in tree.leaf_node_iter():
            # need to replace here because otherwise they don't line up with the
            # diamond database IDs
            node_name = node.taxon.label.replace(' ', '_')
            if node_name in tree_leaves:
                raise Exception(
                    "Found duplicate tree leaf name in graftm package "
                    "tree. Currently this case is not handled, sorry")
            tree_leaves.add(node_name)
        for name in tree_leaves:  # I don't think there is a 'peek' ?
            eg_name = name
            break
        logging.info("Read in %i tree tip names e.g. %s" %
                     (len(tree_leaves), eg_name))

        # Make a new fasta file of all the sequences that are leaves
        found_sequence_names = set()
        num_seqs_unaligned = 0
        filtered_aligned_tempfile = tempfile.NamedTemporaryFile(
            prefix='singlem_package_creator', suffix='.fasta')
        for s in SeqIO.parse(gpkg.unaligned_sequence_database_path(), "fasta"):
            num_seqs_unaligned += 1
            if s.id in tree_leaves:
                if s.id in found_sequence_names:
                    raise Exception(
                        "Found duplicate sequence names in graftm unaligned"
                        " sequence fasta file. Currently this case is not handled,"
                        " sorry")
                SeqIO.write([s], filtered_aligned_tempfile, "fasta")
                found_sequence_names.add(s.id)
        filtered_aligned_tempfile.flush()

        if len(tree_leaves) != len(found_sequence_names):
            for t in tree_leaves:
                if t not in found_sequence_names:
                    raise Exception(
                        "Found some sequences that were in the tree but not the"
                        " unaligned sequences database e.g. %s. Something is"
                        " likely amiss with the input GraftM package" % t)
            raise Exception("Programming error, shouldn't get here")
        logging.info(
            "All %i sequences found in tree extracted successfully from unaligned"
            " sequences fasta file, which originally had %i sequences" %
            (len(found_sequence_names), num_seqs_unaligned))

        # Create a new diamond database
        dmnd_tf = tempfile.NamedTemporaryFile(
            prefix='singlem_package_creator', suffix='.dmnd')
        cmd = "diamond makedb --in '%s' -d '%s'" % (
            filtered_aligned_tempfile.name, dmnd_tf.name)
        logging.info("Creating DIAMOND database")
        extern.run(cmd)

    # Compile the final graftm/singlem package. When the only search HMM is
    # the alignment HMM itself, no separate search HMMs are passed.
    all_search_hmms = gpkg.search_hmm_paths()
    if len(all_search_hmms) == 1 and \
            all_search_hmms[0] == gpkg.alignment_hmm_path():
        search_hmms = None
    else:
        search_hmms = all_search_hmms

    with tempdir.TempDir() as tmpdir:
        gpkg_name = os.path.join(
            tmpdir,
            os.path.basename(
                os.path.abspath(input_graftm_package_path)).replace('.gpkg', ''))
        GraftMPackageVersion3.compile(
            gpkg_name,
            gpkg.reference_package_path(),
            gpkg.alignment_hmm_path(),
            dmnd_tf.name if is_protein_package else None,
            gpkg.maximum_range(),
            filtered_aligned_tempfile.name if is_protein_package
            else gpkg.unaligned_sequence_database_path(),
            gpkg.use_hmm_trusted_cutoff(),
            search_hmms)
        logging.debug(
            "Finished creating GraftM package for conversion to SingleM package")
        SingleMPackageVersion2.compile(output_singlem_package_path, gpkg_name,
                                       hmm_position, window_size)
        shutil.rmtree(gpkg_name)

    if is_protein_package:
        filtered_aligned_tempfile.close()
        dmnd_tf.close()
    logging.info("SingleM-compatible package creation finished")
def regenerate(self, **kwargs):
    """Rebuild a SingleM package by merging eukaryotic hits with intermediate
    archaeal and bacterial GraftM packages, recreating the GraftM package and
    its DIAMOND database, and compiling a fresh version-2 SingleM package.
    """
    input_singlem_package = kwargs.pop('input_singlem_package')
    output_singlem_package = kwargs.pop('output_singlem_package')
    working_directory = kwargs.pop('working_directory')
    euk_sequences = kwargs.pop('euk_sequences')
    euk_taxonomy = kwargs.pop('euk_taxonomy')
    intermediate_archaea_graftm_package = kwargs.pop(
        'intermediate_archaea_graftm_package')
    intermediate_bacteria_graftm_package = kwargs.pop(
        'intermediate_bacteria_graftm_package')
    input_taxonomy = kwargs.pop('input_taxonomy')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    original_pkg = SingleMPackage.acquire(input_singlem_package)
    original_hmm_path = original_pkg.hmm_path()
    basename = original_pkg.graftm_package_basename()

    # Run GraftM on the euk sequences with the bacterial set
    euk_graftm_output = os.path.join(working_directory,
                                     "%s-euk_graftm" % basename)
    cmd = "graftM graft --graftm_package '%s' --search_and_align_only --forward '%s' --output %s --force" % (
        original_pkg.graftm_package_path(), euk_sequences, euk_graftm_output)
    extern.run(cmd)

    # Extract hit sequences from that set
    euk_result = GraftMResult(euk_graftm_output, False)
    hit_paths = euk_result.unaligned_sequence_paths(require_hits=True)
    if len(hit_paths) != 1:
        raise Exception("Unexpected number of hits against euk in graftm")
    euk_hits_path = next(iter(hit_paths.values()))  # i.e. first

    # Concatenate euk, archaea and bacterial sequences
    archaeal_intermediate_pkg = GraftMPackage.acquire(
        intermediate_archaea_graftm_package)
    bacterial_intermediate_pkg = GraftMPackage.acquire(
        intermediate_bacteria_graftm_package)
    num_euk_hits = 0
    final_sequences_path = os.path.join(working_directory,
                                        "%s_final_sequences.faa" % basename)
    with open(final_sequences_path, 'w') as final_seqs_fp:
        with open(euk_hits_path) as euk_seqs_fp:
            for name, seq, _ in SeqReader().readfq(euk_seqs_fp):
                # Skip split reads; count and keep the rest.
                if name.find('_split_') == -1:
                    num_euk_hits += 1
                    final_seqs_fp.write(">%s\n%s\n" % (name, seq))
        logging.info(
            "Found %i eukaryotic sequences to include in the package" %
            num_euk_hits)
        for gpkg in [archaeal_intermediate_pkg, bacterial_intermediate_pkg]:
            num_total = 0
            num_written = 0
            with open(gpkg.unaligned_sequence_database_path()) as seqs:
                for name, seq, _ in SeqReader().readfq(seqs):
                    num_total += 1
                    # if name in species_dereplicated_ids:
                    final_seqs_fp.write(">%s\n%s\n" % (name, seq))
                    num_written += 1
            logging.info(
                "Of %i sequences in gpkg %s, %i species-dereplicated were included in the final package." % (
                    num_total, gpkg, num_written))

    # Concatenate euk and input taxonomy
    final_taxonomy_file = os.path.join(working_directory,
                                       "%s_final_taxonomy.csv" % basename)
    extern.run("cat %s %s > %s" % (euk_taxonomy, input_taxonomy,
                                   final_taxonomy_file))

    # Run graftm create to get the final package
    final_gpkg = os.path.join(working_directory, "%s_final.gpkg" % basename)
    cmd = "graftM create --force --sequences %s --taxonomy %s --search_hmm_files %s %s --hmm %s --output %s" % (
        final_sequences_path, final_taxonomy_file,
        ' '.join(archaeal_intermediate_pkg.search_hmm_paths()),
        ' '.join(bacterial_intermediate_pkg.search_hmm_paths()),
        original_hmm_path, final_gpkg)
    extern.run(cmd)

    ##########################################################################
    # Remove sequences from the diamond DB that are not in the tree i.e.
    # those that are exact duplicates, so that the diamond_example hits are
    # always in the tree.

    # Read the list of IDs in the tree with dendropy
    final_gpkg_object = GraftMPackage.acquire(final_gpkg)
    unaligned_seqs = final_gpkg_object.unaligned_sequence_database_path()
    tree = dendropy.Tree.get(
        path=final_gpkg_object.reference_package_tree_path(), schema='newick')
    leaf_names = [l.taxon.label.replace(' ', '_')
                  for l in tree.leaf_node_iter()]
    logging.debug("Read in final tree with %i leaves" % len(leaf_names))

    # Extract out of the sequences file in the graftm package
    final_seqs = SequenceExtractor().extract_and_read(leaf_names,
                                                      unaligned_seqs)
    if len(final_seqs) != len(leaf_names):
        raise Exception(
            "Do not appear to have extracted the expected number of sequences from the unaligned fastat file")

    # Write the reads into sequences file in place
    with open(unaligned_seqs, 'w') as f:
        for s in final_seqs:
            f.write(">%s\n" % s.name)
            f.write(s.seq)
            f.write("\n")

    # Regenerate the diamond DB
    final_gpkg_object.create_diamond_db()
    ##########################################################################

    # Run singlem create to put the final package together
    SingleMPackageVersion2.compile(output_singlem_package, final_gpkg,
                                   original_pkg.singlem_position(),
                                   original_pkg.window_size())
    logging.info("SingleM package generated.")
def graft(self):
    """Run the Graft pipeline.

    Searches input reads against the package/HMM (hmmsearch and/or DIAMOND),
    aligns the hits, then either places them into the package's reference
    tree with pplacer or assigns taxonomy directly with DIAMOND, finishing
    with self.summarise(). All configuration is read from self.args; output
    is written under self.args.output_directory.
    """
    # The Graft pipeline:
    # Searches for reads using hmmer, and places them in phylogenetic
    # trees to derive a community structure.
    if self.args.graftm_package:
        gpkg = GraftMPackage.acquire(self.args.graftm_package)
    else:
        gpkg = None
    REVERSE_PIPE = (True if self.args.reverse else False)
    INTERLEAVED = (True if self.args.interleaved else False)
    # Accumulators, one entry per processed read file.
    base_list = []
    seqs_list = []
    search_results = []
    hit_read_count_list = []
    db_search_results = []

    # Work out the DIAMOND database and the maximum_range (used for linking
    # paired hits), preferring values recorded in the GraftM package.
    if gpkg:
        maximum_range = gpkg.maximum_range()
        if self.args.search_diamond_file:
            self.args.search_method = self.hk.DIAMOND_SEARCH_METHOD
            diamond_db = self.args.search_diamond_file[0]
        else:
            diamond_db = gpkg.diamond_database_path()
            if self.args.search_method == self.hk.DIAMOND_SEARCH_METHOD:
                if not diamond_db:
                    logging.error(
                        "%s search method selected, but no diamond database specified. \
Please either provide a gpkg to the --graftm_package flag, or a diamond \
database to the --search_diamond_file flag." % self.args.search_method)
                    raise Exception()
    else:
        # Get the maximum range, if none exists, make one from the HMM profile
        if self.args.maximum_range:
            maximum_range = self.args.maximum_range
        else:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                if not self.args.search_only:
                    maximum_range = self.hk.get_maximum_range(
                        self.args.aln_hmm_file)
                else:
                    logging.debug(
                        "Running search only pipeline. maximum_range not configured."
                    )
                    maximum_range = None
            else:
                logging.warning(
                    'Cannot determine maximum range when using %s pipeline and with no GraftM package specified'
                    % self.args.search_method)
                logging.warning(
                    'Setting maximum_range to None (linked hits will not be detected)'
                )
                maximum_range = None
        if self.args.search_diamond_file:
            diamond_db = self.args.search_diamond_file
        else:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                diamond_db = None
            else:
                logging.error(
                    "%s search method selected, but no gpkg or diamond database selected"
                    % self.args.search_method)

    # Reverse reads cannot be used with diamond taxonomy assignment.
    if self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
        if self.args.reverse:
            logging.warn(
                "--reverse reads specified with --assignment_method diamond. Reverse reads will be ignored."
            )
            self.args.reverse = None

    # If merge reads is specified, check that there are reverse reads to merge with
    if self.args.merge_reads and not hasattr(self.args, 'reverse'):
        raise Exception("Programming error")

    # Set the output directory if not specified and create that directory
    logging.debug('Creating working directory: %s' %
                  self.args.output_directory)
    self.hk.make_working_directory(self.args.output_directory,
                                   self.args.force)

    # Set pipeline and evalue by checking HMM format
    if self.args.search_only:
        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
            hmm_type, hmm_tc = self.hk.setpipe(
                self.args.search_hmm_files[0])
            logging.debug("HMM type: %s Trusted Cutoff: %s" %
                          (hmm_type, hmm_tc))
    else:
        hmm_type, hmm_tc = self.hk.setpipe(self.args.aln_hmm_file)
        logging.debug("HMM type: %s Trusted Cutoff: %s" %
                      (hmm_type, hmm_tc))
    if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
        setattr(self.args, 'type', hmm_type)
        if hmm_tc:
            # HMM carries a trusted cutoff; use it in place of an e-value.
            setattr(self.args, 'evalue', '--cut_tc')
    else:
        setattr(self.args, 'type', self.PIPELINE_AA)
    if self.args.filter_minimum is not None:
        filter_minimum = self.args.filter_minimum
    else:
        if self.args.type == self.PIPELINE_NT:
            filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_NUCLEOTIDE_PACKAGES
        else:
            filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_AMINO_ACID_PACKAGES

    # Generate expand_search database if required
    if self.args.expand_search_contigs:
        if self.args.graftm_package:
            pkg = GraftMPackage.acquire(self.args.graftm_package)
        else:
            pkg = None
        boots = ExpandSearcher(search_hmm_files=self.args.search_hmm_files,
                               maximum_range=self.args.maximum_range,
                               threads=self.args.threads,
                               evalue=self.args.evalue,
                               min_orf_length=self.args.min_orf_length,
                               graftm_package=pkg)
        # this is a hack, it should really use GraftMFiles but that class isn't currently flexible enough
        new_database = (os.path.join(self.args.output_directory, "expand_search.hmm") \
                        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD \
                        else os.path.join(self.args.output_directory, "expand_search"))
        if boots.generate_expand_search_database_from_contigs(
                self.args.expand_search_contigs, new_database,
                self.args.search_method):
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                self.ss.search_hmm.append(new_database)
            else:
                diamond_db = new_database

    # Decoy filtering: either an explicit decoy database was given, or the
    # combined hmmsearch+diamond method is used (hmmsearch first, DIAMOND
    # only to filter the hits).
    first_search_method = self.args.search_method
    if self.args.decoy_database:
        decoy_filter = DecoyFilter(
            Diamond(diamond_db, threads=self.args.threads),
            Diamond(self.args.decoy_database, threads=self.args.threads))
        doing_decoy_search = True
    elif self.args.search_method == self.hk.HMMSEARCH_AND_DIAMOND_SEARCH_METHOD:
        decoy_filter = DecoyFilter(
            Diamond(diamond_db, threads=self.args.threads))
        doing_decoy_search = True
        first_search_method = self.hk.HMMSEARCH_SEARCH_METHOD
    else:
        doing_decoy_search = False

    # For each pair (or single file passed to GraftM)
    logging.debug('Working with %i file(s)' % len(self.sequence_pair_list))
    for pair in self.sequence_pair_list:
        # Guess the sequence file type, if not already specified to GraftM
        unpack = UnpackRawReads(pair[0], self.args.input_sequence_type,
                                INTERLEAVED)
        # Set the basename, and make an entry to the summary table.
        base = unpack.basename()
        pair_direction = ['forward', 'reverse']
        logging.info("Working on %s" % base)

        # Make the working base subdirectory
        self.hk.make_working_directory(
            os.path.join(self.args.output_directory, base),
            self.args.force)

        # for each of the paired end read files
        for read_file in pair:
            unpack = UnpackRawReads(read_file,
                                    self.args.input_sequence_type,
                                    INTERLEAVED)
            if read_file is None:
                # placeholder for interleaved (second file is None)
                continue
            if not os.path.isfile(read_file):  # Check file exists
                logging.info('%s does not exist! Skipping this file..' %
                             read_file)
                continue
            # Set the output file_name
            if len(pair) == 2:
                direction = 'interleaved' if pair[1] is None \
                                          else pair_direction.pop(0)
                logging.info("Working on %s reads" % direction)
                self.gmf = GraftMFiles(base, self.args.output_directory,
                                       direction)
                self.hk.make_working_directory(
                    os.path.join(self.args.output_directory, base,
                                 direction), self.args.force)
            else:
                direction = False
                self.gmf = GraftMFiles(base, self.args.output_directory,
                                       direction)

            if self.args.type == self.PIPELINE_AA:
                logging.debug("Running protein pipeline")
                try:
                    search_time, (
                        result,
                        complement_information) = self.ss.aa_db_search(
                            self.gmf,
                            base,
                            unpack,
                            first_search_method,
                            maximum_range,
                            self.args.threads,
                            self.args.evalue,
                            self.args.min_orf_length,
                            self.args.restrict_read_length,
                            diamond_db,
                            self.args.diamond_performance_parameters,
                        )
                except NoInputSequencesException as e:
                    logging.error(
                        "No sufficiently long open reading frames were found, indicating"
                        " either the input sequences are too short or the min orf length"
                        " cutoff is too high. Cannot continue sorry. Alternatively, there"
                        " is something amiss with the installation of OrfM. The specific"
                        " command that failed was: %s" % e.command)
                    exit(Run.NO_ORFS_EXITSTATUS)
            # Or the DNA pipeline
            elif self.args.type == self.PIPELINE_NT:
                logging.debug("Running nucleotide pipeline")
                search_time, (
                    result,
                    complement_information) = self.ss.nt_db_search(
                        self.gmf, base, unpack, self.args.euk_check,
                        self.args.search_method, maximum_range,
                        self.args.threads, self.args.evalue)

            reads_detected = True
            if not result.hit_fasta() or os.path.getsize(
                    result.hit_fasta()) == 0:
                logging.info('No reads found in %s' % base)
                reads_detected = False

            if self.args.search_only:
                db_search_results.append(result)
                base_list.append(base)
                continue

            # Filter out decoys if specified
            if reads_detected and doing_decoy_search:
                # NamedTemporaryFile is used only to generate a unique
                # path; the decoy filter then writes surviving hits there.
                with tempfile.NamedTemporaryFile(prefix="graftm_decoy",
                                                 suffix='.fa') as f:
                    tmpname = f.name
                any_remaining = decoy_filter.filter(
                    result.hit_fasta(), tmpname)
                if any_remaining:
                    shutil.move(tmpname, result.hit_fasta())
                else:
                    # No hits remain after decoy filtering.
                    os.remove(result.hit_fasta())
                    continue

            if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
                logging.info(
                    'aligning reads to reference package database')
                hit_aligned_reads = self.gmf.aligned_fasta_output_path(
                    base)
                if reads_detected:
                    aln_time, aln_result = self.ss.align(
                        result.hit_fasta(), hit_aligned_reads,
                        complement_information, self.args.type,
                        filter_minimum)
                else:
                    aln_time = 'n/a'
                if not os.path.exists(hit_aligned_reads):
                    # If all were filtered out, or there just was none..
                    with open(hit_aligned_reads, 'w') as f:
                        pass  # just touch the file, nothing else
                seqs_list.append(hit_aligned_reads)

            db_search_results.append(result)
            base_list.append(base)
            search_results.append(result.search_result)
            hit_read_count_list.append(result.hit_count)

    # Write summary table
    srchtw = SearchTableWriter()
    srchtw.build_search_otu_table(
        [x.search_objects for x in db_search_results], base_list,
        self.gmf.search_otu_table())

    if self.args.search_only:
        logging.info(
            'Stopping before alignment and taxonomic assignment phase\n')
        exit(0)

    if self.args.merge_reads:  # not run when diamond is the assignment mode- enforced by argparse grokking
        logging.debug("Running merge reads output")
        if self.args.interleaved:
            fwd_seqs = seqs_list
            rev_seqs = []
        else:
            # Forward/reverse entries alternate in the lists; split them.
            base_list = base_list[0::2]
            fwd_seqs = seqs_list[0::2]
            rev_seqs = seqs_list[1::2]
        merged_output=[GraftMFiles(base, self.args.output_directory, False).aligned_fasta_output_path(base) \
                       for base in base_list]
        logging.debug("merged reads to %s", merged_output)
        self.ss.merge_forev_aln(fwd_seqs, rev_seqs, merged_output)
        seqs_list = merged_output
        REVERSE_PIPE = False
    elif REVERSE_PIPE:
        base_list = base_list[0::2]

    # Leave the pipeline if search only was specified
    if self.args.search_and_align_only:
        logging.info('Stopping before taxonomic assignment phase\n')
        exit(0)
    elif not any(base_list):
        logging.error(
            'No hits in any of the provided files. Cannot continue with no reads to assign taxonomy to.\n'
        )
        exit(0)
    self.gmf = GraftMFiles('', self.args.output_directory, False)

    if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
        clusterer = Clusterer()
        # Classification steps
        seqs_list = clusterer.cluster(seqs_list, REVERSE_PIPE)
        logging.info("Placing reads into phylogenetic tree")
        taxonomic_assignment_time, assignments = self.p.place(
            REVERSE_PIPE, seqs_list, self.args.resolve_placements,
            self.gmf, self.args, result.slash_endings,
            gpkg.taxtastic_taxonomy_path(), clusterer)
        assignments = clusterer.uncluster_annotations(
            assignments, REVERSE_PIPE)
    elif self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
        logging.info("Assigning taxonomy with diamond")
        taxonomic_assignment_time, assignments = self._assign_taxonomy_with_diamond(\
            base_list, db_search_results, gpkg, self.gmf,
            self.args.diamond_performance_parameters)
        aln_time = 'n/a'
    else:
        raise Exception("Unexpected assignment method encountered: %s" %
                        self.args.placement_method)
    self.summarise(base_list, assignments, REVERSE_PIPE,
                   [search_time, aln_time, taxonomic_assignment_time],
                   hit_read_count_list,
                   self.args.max_samples_for_krona)
def test_version3_unaligned_fasta(self):
    """A version-3 gpkg reports its version and its unaligned FASTA path."""
    gpkg_dir = os.path.join(path_to_data, 'mcrA.gpkg')
    pkg = GraftMPackage.acquire(gpkg_dir)
    self.assertEqual(3, pkg.version)
    expected_fasta = os.path.join(gpkg_dir, 'mcrA.faa')
    self.assertEqual(expected_fasta, pkg.unaligned_sequence_database_path())
def main(self):
    """Dispatch to the GraftM subcommand named by self.args.subparser_name.

    Performs per-subcommand argument validation (exiting with status 1 on
    invalid combinations), optionally prints an ASCII-art banner when
    verbosity permits, and delegates to the relevant worker object.
    Raises Exception for an unrecognised subcommand name.
    """
    if self.args.subparser_name == 'graft':
        if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
            print('''
                                GRAFT

                       Joel Boyd, Ben Woodcroft

                                                         __/__
                                                  ______|
          _- - _                         ________|      |_____/
           - -         - >>>>   -  >>>>    ____|
          - _-    -         -           |      ______
             - _                        |_____|
                                               |______
''')
        self.graft()
    elif self.args.subparser_name == 'create':
        if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
            print('''
                                CREATE

                       Joel Boyd, Ben Woodcroft

                                                    /
              >a                                   /
             -------------                        /
             >b                  |        |      >>>
             --------            |  GPKG  |      >c
                                 |________|      ----------
''')
        # Validate the many mutually-exclusive taxonomy/tree inputs before
        # handing over to self.create.main().
        if self.args.dereplication_level < 0:
            logging.error(
                "Invalid dereplication level selected! please enter a positive integer"
            )
            exit(1)
        else:
            if not self.args.sequences:
                if not self.args.alignment and not self.args.rerooted_annotated_tree \
                   and not self.args.rerooted_tree:
                    logging.error(
                        "Some sort of sequence data must be provided to run graftM create"
                    )
                    exit(1)
            if self.args.taxonomy:
                if self.args.rerooted_annotated_tree:
                    logging.error(
                        "--taxonomy is incompatible with --rerooted_annotated_tree"
                    )
                    exit(1)
                if self.args.taxtastic_taxonomy or self.args.taxtastic_seqinfo:
                    logging.error(
                        "--taxtastic_taxonomy and --taxtastic_seqinfo are incompatible with --taxonomy"
                    )
                    exit(1)
            elif self.args.rerooted_annotated_tree:
                if self.args.taxtastic_taxonomy or self.args.taxtastic_seqinfo:
                    logging.error(
                        "--taxtastic_taxonomy and --taxtastic_seqinfo are incompatible with --rerooted_annotated_tree"
                    )
                    exit(1)
            else:
                if not self.args.taxtastic_taxonomy or not self.args.taxtastic_seqinfo:
                    logging.error(
                        "--taxonomy, --rerooted_annotated_tree or --taxtastic_taxonomy/--taxtastic_seqinfo is required"
                    )
                    exit(1)
                if bool(self.args.taxtastic_taxonomy) ^ bool(
                        self.args.taxtastic_seqinfo):
                    logging.error(
                        "Both or neither of --taxtastic_taxonomy and --taxtastic_seqinfo must be defined"
                    )
                    exit(1)
            if self.args.alignment and self.args.hmm:
                logging.warn(
                    "Using both --alignment and --hmm is rarely useful, but proceding on the assumption you understand."
                )
            if len([
                    _f for _f in [
                        self.args.rerooted_tree,
                        self.args.rerooted_annotated_tree, self.args.tree
                    ] if _f
            ]) > 1:
                logging.error("Only 1 input tree can be specified")
                exit(1)
            self.create.main(
                dereplication_level=self.args.dereplication_level,
                sequences=self.args.sequences,
                alignment=self.args.alignment,
                taxonomy=self.args.taxonomy,
                rerooted_tree=self.args.rerooted_tree,
                unrooted_tree=self.args.tree,
                tree_log=self.args.tree_log,
                prefix=self.args.output,
                rerooted_annotated_tree=self.args.rerooted_annotated_tree,
                min_aligned_percent=float(self.args.min_aligned_percent) /
                100,
                taxtastic_taxonomy=self.args.taxtastic_taxonomy,
                taxtastic_seqinfo=self.args.taxtastic_seqinfo,
                hmm=self.args.hmm,
                search_hmm_files=self.args.search_hmm_files,
                force=self.args.force,
                threads=self.args.threads)
    elif self.args.subparser_name == 'update':
        logging.info(
            "GraftM package %s specified to update with sequences in %s" %
            (self.args.graftm_package, self.args.sequences))
        if self.args.regenerate_diamond_db:
            # Regeneration only: rebuild the DIAMOND DB in place and stop.
            gpkg = GraftMPackage.acquire(self.args.graftm_package)
            logging.info("Regenerating diamond DB..")
            gpkg.create_diamond_db()
            logging.info("Diamond database regenerated.")
            return
        elif not self.args.sequences:
            logging.error(
                "--sequences is required unless regenerating the diamond DB"
            )
            exit(1)
        if not self.args.output:
            # Derive a default output path from the input package name.
            if self.args.graftm_package.endswith(".gpkg"):
                self.args.output = self.args.graftm_package.replace(
                    ".gpkg", "-updated.gpkg")
            else:
                self.args.output = self.args.graftm_package + '-update.gpkg'
        Update(
            ExternalProgramSuite([
                'taxit', 'FastTreeMP', 'hmmalign', 'mafft'
            ])).update(input_sequence_path=self.args.sequences,
                       input_taxonomy_path=self.args.taxonomy,
                       input_graftm_package_path=self.args.graftm_package,
                       output_graftm_package_path=self.args.output)
    elif self.args.subparser_name == 'expand_search':
        args = self.args
        if not args.graftm_package and not args.search_hmm_files:
            logging.error(
                "expand_search mode requires either --graftm_package or --search_hmm_files"
            )
            exit(1)
        if args.graftm_package:
            pkg = GraftMPackage.acquire(args.graftm_package)
        else:
            pkg = None
        expandsearcher = ExpandSearcher(
            search_hmm_files=args.search_hmm_files,
            maximum_range=args.maximum_range,
            threads=args.threads,
            evalue=args.evalue,
            min_orf_length=args.min_orf_length,
            graftm_package=pkg)
        expandsearcher.generate_expand_search_database_from_contigs(
            args.contigs,
            args.output_hmm,
            search_method=ExpandSearcher.HMM_SEARCH_METHOD)
    elif self.args.subparser_name == 'tree':
        if self.args.graftm_package:
            # shim in the paths from the graftm package, not overwriting
            # any of the provided paths.
            gpkg = GraftMPackage.acquire(self.args.graftm_package)
            if not self.args.rooted_tree:
                self.args.rooted_tree = gpkg.reference_package_tree_path()
            if not self.args.input_greengenes_taxonomy:
                if not self.args.input_taxtastic_seqinfo:
                    self.args.input_taxtastic_seqinfo = gpkg.taxtastic_seqinfo_path(
                    )
                if not self.args.input_taxtastic_taxonomy:
                    self.args.input_taxtastic_taxonomy = gpkg.taxtastic_taxonomy_path(
                    )
        if self.args.rooted_tree:
            if self.args.unrooted_tree:
                logging.error(
                    "Both a rooted tree and an un-rooted tree were provided, so it's unclear what you are asking GraftM to do. \
If you're unsure see graftM tree -h")
                exit(1)
            elif self.args.reference_tree:
                logging.error(
                    "Both a rooted tree and reference tree were provided, so it's unclear what you are asking GraftM to do. \
If you're unsure see graftM tree -h")
                exit(1)
            if not self.args.decorate:
                logging.error(
                    "It seems a rooted tree has been provided, but --decorate has not been specified so it is unclear what you are asking graftM to do."
                )
                exit(1)
            dec = Decorator(tree_path=self.args.rooted_tree)
        elif self.args.unrooted_tree and self.args.reference_tree:
            logging.debug(
                "Using provided reference tree %s to reroot %s" %
                (self.args.reference_tree, self.args.unrooted_tree))
            dec = Decorator(reference_tree_path=self.args.reference_tree,
                            tree_path=self.args.unrooted_tree)
        else:
            logging.error(
                "Some tree(s) must be provided, either a rooted tree or both an unrooted tree and a reference tree"
            )
            exit(1)
        if self.args.output_taxonomy is None and self.args.output_tree is None:
            logging.error(
                "Either an output tree or taxonomy must be provided")
            exit(1)
        if self.args.input_greengenes_taxonomy:
            if self.args.input_taxtastic_seqinfo or self.args.input_taxtastic_taxonomy:
                logging.error(
                    "Both taxtastic and greengenes taxonomy were provided, so its unclear what taxonomy you want graftM to decorate with"
                )
                exit(1)
            logging.debug("Using input GreenGenes style taxonomy file")
            dec.main(self.args.input_greengenes_taxonomy,
                     self.args.output_tree, self.args.output_taxonomy,
                     self.args.no_unique_tax, self.args.decorate, None)
        elif self.args.input_taxtastic_seqinfo and self.args.input_taxtastic_taxonomy:
            logging.debug("Using input taxtastic style taxonomy/seqinfo")
            dec.main(self.args.input_taxtastic_taxonomy,
                     self.args.output_tree, self.args.output_taxonomy,
                     self.args.no_unique_tax, self.args.decorate,
                     self.args.input_taxtastic_seqinfo)
        else:
            logging.error(
                "Either a taxtastic taxonomy or seqinfo file was provided. GraftM cannot continue without both."
            )
            exit(1)
    elif self.args.subparser_name == 'archive':
        # Back slashes in the ASCII art are escaped.
        if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
            print("""
                                ARCHIVE

                       Joel Boyd, Ben Woodcroft

                           ____.----.
                 ____.----'          \\
                 \\                    \\
                  \\                    \\
                   \\                    \\
                    \\                    \\
                     \\                    \\
                      \\                    \\
                       \\        ____.----'`--.__
                        \\___.----'          |  `--.____
                       /`-._                |  __.-'   \\
                      /     `-._         ___.---'       \\
                     /          `-.____.---'             \\        +------+
                    /            / | \\                    \\       |`.    |`.
                   /            /  |  \\               _.--' <===> |  `+--+---+
                   `-.         /   |   \\        __.--'      |     |   |  |   |
                      `-._    /    |    \\ __.--'            |     |   |  |   |
                          `-./     |     \\_.-'              |     +---+--+   |
                                   |                        |     |   |  |   |
                                   |                        `.    |   +---+--+
                                   |                          `-. |   |  |
                                   |                             `-.  |  |
                                   |                                `-|__|--'
                                   |                        __..--'
                                   |                  __.-'
                                   |__.--'
""")
        if self.args.create:
            if self.args.extract:
                logging.error(
                    "Please specify whether to either create or export a GraftM package"
                )
                exit(1)
            if not self.args.graftm_package:
                logging.error(
                    "Creating a GraftM package archive requires an package to be specified"
                )
                exit(1)
            if not self.args.archive:
                logging.error(
                    "Creating a GraftM package archive requires an output archive path to be specified"
                )
                exit(1)
            archive = Archive()
            archive.create(self.args.graftm_package,
                           self.args.archive,
                           force=self.args.force)
        elif self.args.extract:
            archive = Archive()
            archive.extract(self.args.archive,
                            self.args.graftm_package,
                            force=self.args.force)
        else:
            logging.error(
                "Please specify whether to either create or export a GraftM package"
            )
            exit(1)
    else:
        raise Exception("Unexpected subparser name %s" %
                        self.args.subparser_name)
def update(self, **kwargs):
    '''
    Update an existing GraftM package with new sequences and taxonomy. If no
    taxonomy is provided, attempt to decorate the new sequences with
    pre-existing taxonomy.

    Parameters
    ----------
    input_sequence_path: str
        Path to FASTA file containing sequences to add to the update GraftM
        package
    input_taxonomy_path: str
        Taxonomy corresponding to the sequences in input_sequence_path. If
        None, then attempt to assign taxonomy by decorating the tree made
        out of all sequences.
    input_graftm_package_path: str
        Path to the directory of the GraftM package that is to be updated
    output_graftm_package_path: str
        Path to the directory to which the new GraftM package will be
        written to

    Raises
    ------
    InsufficientGraftMPackageVersion
        If the input package is older than version 3 (no unaligned
        sequences recorded, so nothing can be rebuilt).
    Exception
        If unexpected keyword arguments are supplied.
    '''
    input_sequence_path = kwargs.pop('input_sequence_path')
    input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
    input_graftm_package_path = kwargs.pop('input_graftm_package_path')
    output_graftm_package_path = kwargs.pop('output_graftm_package_path')
    threads = kwargs.pop('threads',
                         UpdateDefaultOptions.threads)  #TODO: add to user options
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    logging.info("Reading previous GraftM package")
    old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
    min_input_version = 3
    if old_gpkg.version < min_input_version:
        raise InsufficientGraftMPackageVersion(
            "GraftM below version %s cannot be updated using the update function."
            % min_input_version +
            " Unaligned sequences are not included in these packages, therefore no new"
            " alignment/HMM/Tree can be created")

    new_gpkg = UpdatedGraftMPackage()
    new_gpkg.output = output_graftm_package_path
    # All intermediate file paths below are derived from this basename.
    new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

    #######################################
    ### Collect all unaligned sequences ###
    logging.info("Concatenating unaligned sequence files")
    new_gpkg.unaligned_sequences = "%s_sequences.fa" % (
        new_gpkg.name)  #TODO: replace hard-coded paths like this with tempfiles
    self._concatenate_file(
        [old_gpkg.unaligned_sequence_database_path(), input_sequence_path],
        new_gpkg.unaligned_sequences)

    #########################################################
    ### Parse taxonomy info up front so errors come early ###
    if input_taxonomy_path:
        logging.info("Reading new taxonomy information")
        input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
        original_taxonomy_hash = old_gpkg.taxonomy_hash()
        total_taxonomy_hash = original_taxonomy_hash.copy()
        # dict.update means the NEW taxonomy wins for any shared IDs.
        total_taxonomy_hash.update(input_taxonomy.taxonomy)
        # NOTE(review): total = original + input - duplicates, so this
        # expression is <= 0 and the warning below can seemingly never
        # fire; presumably the operands were meant the other way around
        # (len(original) + len(input) - len(total)) — confirm upstream.
        num_duplicate_taxonomies = len(total_taxonomy_hash) - \
                                   len(input_taxonomy.taxonomy) - \
                                   len(original_taxonomy_hash)
        logging.debug(
            "Found %i taxonomic definitions in common between the previous and updated taxonomies"
            % num_duplicate_taxonomies)
        if num_duplicate_taxonomies > 0:
            logging.warn(
                "Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case."
                % num_duplicate_taxonomies)

    ###############################
    ### Re-construct alignments ###
    logging.info("Multiple sequence aligning all sequences")
    new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
    self._align_sequences(new_gpkg.unaligned_sequences,
                          new_gpkg.aligned_sequences, threads)

    ########################
    ### Re-construct HMM ###
    logging.info("Creating HMM from alignment")
    new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
    new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
    self._get_hmm_from_alignment(new_gpkg.aligned_sequences,
                                 new_gpkg.hmm,
                                 new_gpkg.hmm_alignment)

    #########################
    ### Re-construct tree ###
    logging.info("Generating phylogenetic tree")
    new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
    new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
    # Package type (protein/nucleotide) is inferred from the OLD package's
    # alignment HMM.
    new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(
        old_gpkg.alignment_hmm_path())
    new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
        self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                         new_gpkg.package_type, self.fasttree)

    ##############################################
    ### Re-root and decorate tree if necessary ###
    if input_taxonomy_path:
        # Taxonomy supplied explicitly; no decoration needed.
        new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
        new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
    else:
        # No taxonomy given: transfer taxonomy from the old tree by
        # rerooting both trees consistently and decorating the new one.
        logging.info("Finding taxonomy for new sequences")
        rerooter = Rerooter()
        old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                            schema='newick')
        new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                            schema='newick')
        old_tree = rerooter.reroot(old_tree)
        new_tree = rerooter.reroot(new_tree)
        # TODO: Shouldn't call an underscore method, eventually use
        # Rerooter instead.
        rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
        new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
        td = TreeDecorator(rerooted_tree,
                           old_gpkg.taxtastic_taxonomy_path(),
                           old_gpkg.taxtastic_seqinfo_path())
        # NOTE(review): suffix='tsv' has no leading dot — presumably
        # '.tsv' was intended; harmless since the file is temporary.
        with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
            td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
            total_taxonomy_hash = GreenGenesTaxonomy.read_file(
                taxonomy.name).taxonomy

    ################################
    ### Generating tree log file ###
    logging.info("Generating phylogenetic tree log file")
    new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
    new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
    self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                 new_gpkg.hmm_alignment,
                                 new_gpkg.gpkg_tree,
                                 new_gpkg.gpkg_tree_log,
                                 new_gpkg.package_type,
                                 self.fasttree)

    ################################
    ### Creating taxtastic files ###
    logging.info("Writing new taxonomy files")
    new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
    new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
    gtns = Getaxnseq()
    gtns.write_taxonomy_and_seqinfo_files(
        total_taxonomy_hash,
        new_gpkg.tt_taxonomy,
        new_gpkg.tt_seqinfo)

    ######################
    ### Compile refpkg ###
    logging.info("Compiling pplacer refpkg")
    new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
    refpkg = self._taxit_create(new_gpkg.name,
                                new_gpkg.hmm_alignment,
                                new_gpkg.gpkg_tree,
                                new_gpkg.gpkg_tree_log,
                                new_gpkg.tt_taxonomy,
                                new_gpkg.tt_seqinfo,
                                new_gpkg.refpkg,
                                True)

    #####################################
    ### Re-construct diamond database ###
    logging.info("Recreating DIAMOND DB")
    new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
    self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

    ####################
    ### Compile gpkg ###
    logging.info("Compiling GraftM package")
    new_gpkg.name = "%s.gpkg" % new_gpkg.name
    GraftMPackageVersion3.compile(
        new_gpkg.name,
        new_gpkg.refpkg,
        new_gpkg.hmm,
        new_gpkg.diamond_database,
        self._define_range(new_gpkg.unaligned_sequences),
        new_gpkg.unaligned_sequences,
        search_hmm_files=old_gpkg.search_hmm_paths())

    ###################
    ### Test it out ###
    logging.info("Testing newly updated GraftM package works")
    self._test_package(new_gpkg.name)
    logging.info("Finished")
def graftm_package(self):
    """Return this object's GraftMPackage, acquiring and caching it on first use."""
    cached = self.graftm_package_cache
    if cached is None:
        cached = GraftMPackage.acquire(self.graftm_package_path())
        self.graftm_package_cache = cached
    return cached
def graftm_package(self):
    """Lazily load the backing GraftM package, memoising it for later calls."""
    if self.graftm_package_cache is None:
        package_path = self.graftm_package_path()
        self.graftm_package_cache = GraftMPackage.acquire(package_path)
    return self.graftm_package_cache
def create(self, **kwargs):
    """Convert a GraftM package into a SingleM (version 2) package.

    Parameters (all keyword-only via **kwargs)
    ------------------------------------------
    input_graftm_package: str
        Path to the source GraftM package.
    output_singlem_package: str
        Path where the SingleM package directory will be written.
    hmm_position: int
        Window start position within the HMM (passed through to
        SingleMPackageVersion2.compile).
    window_size: int
        Window length (passed through to SingleMPackageVersion2.compile).
    force: bool
        If True, remove any existing output package first.

    Raises Exception on unexpected kwargs, duplicate tree leaf names, or
    tree/sequence-database mismatches in the input package.
    """
    input_graftm_package_path = kwargs.pop('input_graftm_package')
    output_singlem_package_path = kwargs.pop('output_singlem_package')
    hmm_position = kwargs.pop('hmm_position')
    window_size = kwargs.pop('window_size')
    force = kwargs.pop('force')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    if force and os.path.exists(output_singlem_package_path):
        shutil.rmtree(output_singlem_package_path)

    # For protein packages, remove sequences from diamond database that are
    # not in the tree so that hits can be mapped onto the tree and used for
    # alpha and beta diversity metrics.
    gpkg = GraftMPackage.acquire(input_graftm_package_path)
    is_protein_package = SingleMPackageVersion2.graftm_package_is_protein(
        gpkg)
    logging.info("Detected package type as %s" %
                 ('protein' if is_protein_package else 'nucleotide'))
    if is_protein_package:
        # Collect every leaf name from the reference tree, rejecting
        # duplicates (they would make sequence<->leaf mapping ambiguous).
        tree_leaves = set()
        for node in dendropy.Tree.get(
                path=gpkg.reference_package_tree_path(),
                schema='newick').leaf_node_iter():
            # need to replace here because otherwise they don't line up with the
            # diamond database IDs
            node_name = node.taxon.label.replace(' ', '_')
            if node_name in tree_leaves:
                raise Exception(
                    "Found duplicate tree leaf name in graftm package "
                    "tree. Currently this case is not handled, sorry")
            tree_leaves.add(node_name)
        for name in tree_leaves:  #I don't think there is a 'peek' ?
            eg_name = name
            break
        logging.info("Read in %i tree tip names e.g. %s" %
                     (len(tree_leaves), eg_name))

        # Make a new fasta file of all the sequences that are leaves
        found_sequence_names = set()
        num_seqs_unaligned = 0
        filtered_aligned_tempfile = tempfile.NamedTemporaryFile(
            prefix='singlem_package_creator', suffix='.fasta')
        for s in SeqIO.parse(gpkg.unaligned_sequence_database_path(),
                             "fasta"):
            num_seqs_unaligned += 1
            if s.id in tree_leaves:
                if s.id in found_sequence_names:
                    raise Exception(
                        "Found duplicate sequence names in graftm unaligned"
                        " sequence fasta file. Currently this case is not handled,"
                        " sorry")
                SeqIO.write([s], filtered_aligned_tempfile, "fasta")
                found_sequence_names.add(s.id)
        filtered_aligned_tempfile.flush()

        if len(tree_leaves) != len(found_sequence_names):
            # Identify a concrete missing name for the error message.
            for t in tree_leaves:
                if t not in found_sequence_names:
                    raise Exception(
                        "Found some sequences that were in the tree but not the"
                        " unaligned sequences database e.g. %s. Something is"
                        " likely amiss with the input GraftM package" % t)
            raise Exception("Programming error, shouldn't get here")
        logging.info(
            "All %i sequences found in tree extracted successfully from unaligned"
            " sequences fasta file, which originally had %i sequences" %
            (len(found_sequence_names), num_seqs_unaligned))

        # Create a new diamond database
        dmnd_tf = tempfile.NamedTemporaryFile(
            prefix='singlem_package_creator', suffix='.dmnd')
        cmd = "diamond makedb --in '%s' -d '%s'" % (
            filtered_aligned_tempfile.name, dmnd_tf.name)
        logging.info("Creating DIAMOND database")
        extern.run(cmd)

    # Compile the final graftm/singlem package
    # If the only search HMM is the alignment HMM itself there is no
    # separate search HMM to carry over.
    if len(gpkg.search_hmm_paths()) == 1 and \
       gpkg.search_hmm_paths()[0] == gpkg.alignment_hmm_path():
        search_hmms = None
    else:
        search_hmms = gpkg.search_hmm_paths()

    with tempdir.TempDir() as tmpdir:
        gpkg_name = os.path.join(
            tmpdir,
            os.path.basename(
                os.path.abspath(input_graftm_package_path)).replace(
                    '.gpkg', ''))
        GraftMPackageVersion3.compile(
            gpkg_name,
            gpkg.reference_package_path(),
            gpkg.alignment_hmm_path(),
            dmnd_tf.name if is_protein_package else None,
            gpkg.maximum_range(),
            filtered_aligned_tempfile.name if is_protein_package else \
            gpkg.unaligned_sequence_database_path(),
            gpkg.use_hmm_trusted_cutoff(),
            search_hmms)
        logging.debug(
            "Finished creating GraftM package for conversion to SingleM package"
        )
        SingleMPackageVersion2.compile(output_singlem_package_path,
                                       gpkg_name, hmm_position, window_size)
        shutil.rmtree(gpkg_name)
        if is_protein_package:
            # Closing the NamedTemporaryFiles deletes them.
            filtered_aligned_tempfile.close()
            dmnd_tf.close()

    logging.info("SingleM-compatible package creation finished")
def update(self, **kwargs):
    '''
    Update an existing GraftM package with new sequences and taxonomy. If
    no taxonomy is provided, attempt to decorate the new sequences with
    pre-existing taxonomy.

    Parameters
    ----------
    input_sequence_path: str
        Path to FASTA file containing sequences to add to the update
        GraftM package
    input_taxonomy_path: str
        Taxonomy corresponding to the sequences in input_sequence_path. If
        None, then attempt to assign taxonomy by decorating the tree made
        out of all sequences.
    input_graftm_package_path: str
        Path to the directory of the GraftM package that is to be updated
    output_graftm_package_path: str
        Path to the directory to which the new GraftM package will be
        written to

    Raises
    ------
    Exception
        on unexpected keyword arguments
    InsufficientGraftMPackageVersion
        when the input package is older than version 3 (those packages do
        not bundle unaligned sequences, so nothing can be rebuilt)
    '''
    input_sequence_path = kwargs.pop('input_sequence_path')
    input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
    input_graftm_package_path = kwargs.pop('input_graftm_package_path')
    output_graftm_package_path = kwargs.pop('output_graftm_package_path')
    threads = kwargs.pop(
        'threads',
        UpdateDefaultOptions.threads) #TODO: add to user options
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    logging.info("Reading previous GraftM package")
    old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
    min_input_version = 3
    if old_gpkg.version < min_input_version:
        raise InsufficientGraftMPackageVersion(
            "GraftM below version %s cannot be updated using the update function." % min_input_version +
            " Unaligned sequences are not included in these packages, therefore no new"
            " alignment/HMM/Tree can be created")

    new_gpkg = UpdatedGraftMPackage()
    new_gpkg.output = output_graftm_package_path
    new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

    #######################################
    ### Collect all unaligned sequences ###
    logging.info("Concatenating unaligned sequence files")
    new_gpkg.unaligned_sequences = "%s_sequences.fa" % (new_gpkg.name) #TODO: replace hard-coded paths like this with tempfiles
    self._concatenate_file(
        [old_gpkg.unaligned_sequence_database_path(), input_sequence_path],
        new_gpkg.unaligned_sequences)

    #########################################################
    ### Parse taxonomy info up front so errors come early ###
    if input_taxonomy_path:
        logging.info("Reading new taxonomy information")
        input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
        original_taxonomy_hash = old_gpkg.taxonomy_hash()
        total_taxonomy_hash = original_taxonomy_hash.copy()
        total_taxonomy_hash.update(input_taxonomy.taxonomy)
        # The number of IDs shared between the old and new taxonomies:
        # the merged dict is smaller than the sum of its parts by exactly
        # the number of overlapping keys. (Previously this was computed
        # with the operands reversed, yielding a negative count so the
        # warning below could never trigger.)
        num_duplicate_taxonomies = len(original_taxonomy_hash) + \
                                   len(input_taxonomy.taxonomy) - \
                                   len(total_taxonomy_hash)
        logging.debug(
            "Found %i taxonomic definitions in common between the previous and updated taxonomies" % num_duplicate_taxonomies)
        if num_duplicate_taxonomies > 0:
            # logging.warn is a deprecated alias; use warning().
            logging.warning(
                "Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case." % num_duplicate_taxonomies)

    ###############################
    ### Re-construct alignments ###
    logging.info("Multiple sequence aligning all sequences")
    new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
    self._align_sequences(new_gpkg.unaligned_sequences,
                          new_gpkg.aligned_sequences,
                          threads)

    ########################
    ### Re-construct HMM ###
    logging.info("Creating HMM from alignment")
    new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
    new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
    self._get_hmm_from_alignment(new_gpkg.aligned_sequences,
                                 new_gpkg.hmm,
                                 new_gpkg.hmm_alignment)

    #########################
    ### Re-construct tree ###
    logging.info("Generating phylogenetic tree")
    new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
    new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
    new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(
        old_gpkg.alignment_hmm_path())
    new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
        self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                         new_gpkg.package_type, self.fasttree)

    ##############################################
    ### Re-root and decorate tree if necessary ###
    if input_taxonomy_path:
        # Taxonomy was supplied explicitly, so no decoration is needed.
        new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
        new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
    else:
        # Derive taxonomy for the new sequences by decorating the new tree
        # with the old package's taxonomy.
        logging.info("Finding taxonomy for new sequences")
        rerooter = Rerooter()
        old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                            schema='newick')
        new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                            schema='newick')
        old_tree = rerooter.reroot(old_tree)
        new_tree = rerooter.reroot(new_tree)
        # TODO: Shouldn't call an underscore method, eventually use
        # Rerooter instead.
        rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
        new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
        td = TreeDecorator(rerooted_tree,
                           old_gpkg.taxtastic_taxonomy_path(),
                           old_gpkg.taxtastic_seqinfo_path())
        with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
            td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
            total_taxonomy_hash = GreenGenesTaxonomy.read_file(
                taxonomy.name).taxonomy

    ################################
    ### Generating tree log file ###
    logging.info("Generating phylogenetic tree log file")
    new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
    new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
    self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                 new_gpkg.hmm_alignment,
                                 new_gpkg.gpkg_tree,
                                 new_gpkg.gpkg_tree_log,
                                 new_gpkg.package_type,
                                 self.fasttree)

    ################################
    ### Creating taxtastic files ###
    logging.info("Writing new taxonomy files")
    new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
    new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
    gtns = Getaxnseq()
    gtns.write_taxonomy_and_seqinfo_files(total_taxonomy_hash,
                                          new_gpkg.tt_taxonomy,
                                          new_gpkg.tt_seqinfo)

    ######################
    ### Compile refpkg ###
    logging.info("Compiling pplacer refpkg")
    new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
    refpkg = self._taxit_create(new_gpkg.name,
                                new_gpkg.hmm_alignment,
                                new_gpkg.gpkg_tree,
                                new_gpkg.gpkg_tree_log,
                                new_gpkg.tt_taxonomy,
                                new_gpkg.tt_seqinfo,
                                new_gpkg.refpkg,
                                True)

    #####################################
    ### Re-construct diamond database ###
    logging.info("Recreating DIAMOND DB")
    new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
    self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

    ####################
    ### Compile gpkg ###
    logging.info("Compiling GraftM package")
    new_gpkg.name = "%s.gpkg" % new_gpkg.name
    GraftMPackageVersion3.compile(
        new_gpkg.name,
        new_gpkg.refpkg,
        new_gpkg.hmm,
        new_gpkg.diamond_database,
        self._define_range(new_gpkg.unaligned_sequences),
        new_gpkg.unaligned_sequences,
        search_hmm_files=old_gpkg.search_hmm_paths())

    ###################
    ### Test it out ###
    logging.info("Testing newly updated GraftM package works")
    self._test_package(new_gpkg.name)
    logging.info("Finished")
#!/usr/bin/env python2.7 import argparse import os import json import itertools import dendropy from graftm.graftm_package import GraftMPackage parser = argparse.ArgumentParser() parser.add_argument('--graftm_package', help='package to look at', required=True) args = parser.parse_args() gpkg = GraftMPackage.acquire(args.graftm_package) taxonomy_hash = gpkg.taxonomy_hash() taxonomy_to_leaves = {} for name, taxonomy in taxonomy_hash.items(): for i in range(len(taxonomy)): tax = '; '.join(taxonomy[:(i+1)]) if tax not in taxonomy_to_leaves: taxonomy_to_leaves[tax] = [] taxonomy_to_leaves[tax].append(name) refpkg_contents = os.path.join(gpkg.reference_package_path(),'CONTENTS.json') refpkg = json.loads(open(refpkg_contents).read()) tree_file = os.path.join(gpkg.reference_package_path(),refpkg['files']['tree']) tree = dendropy.Tree.get(path=tree_file, schema='newick') print "\t".join([
#!/usr/bin/env python2.7 import argparse import os import json import itertools import dendropy from graftm.graftm_package import GraftMPackage parser = argparse.ArgumentParser() parser.add_argument('--graftm_package', help='package to look at', required=True) args = parser.parse_args() gpkg = GraftMPackage.acquire(args.graftm_package) taxonomy_hash = gpkg.taxonomy_hash() taxonomy_to_leaves = {} for name, taxonomy in taxonomy_hash.items(): for i in range(len(taxonomy)): tax = '; '.join(taxonomy[:(i + 1)]) if tax not in taxonomy_to_leaves: taxonomy_to_leaves[tax] = [] taxonomy_to_leaves[tax].append(name) refpkg_contents = os.path.join(gpkg.reference_package_path(), 'CONTENTS.json') refpkg = json.loads(open(refpkg_contents).read()) tree_file = os.path.join(gpkg.reference_package_path(), refpkg['files']['tree'])
def set_attributes(self, args):
    """Resolve search and alignment inputs on ``args`` in place.

    Decides whether paired reads should be merged, then fills in
    ``aln_hmm_file``, ``search_hmm_files`` and ``reference_package`` from
    whichever source was supplied: a GraftM package, explicit DIAMOND
    databases, explicit search HMMs, or a file listing search HMMs.

    Raises an Exception when the supplied inputs are inconsistent with the
    chosen search method, or when no reference database was given at all.
    """
    # Merge reads only when a reverse read file was supplied and merging
    # was not explicitly disabled. Note args.reverse is never touched when
    # merging is disabled, so it need not exist in that case.
    if args.no_merge_reads:
        args.merge_reads = False
    elif args.reverse:
        args.merge_reads = True
    else:
        args.merge_reads = False

    if args.graftm_package:
        if not os.path.isdir(args.graftm_package):
            raise Exception(
                "%s does not exist. Are you sure you provided the correct path?"
                % args.graftm_package)
        gpkg = GraftMPackage.acquire(args.graftm_package)
        if not hasattr(args, 'search_hmm_files'):
            # No search HMMs given explicitly: take them from the package.
            # (When they are given, they override the package's own.)
            args.search_hmm_files = list(gpkg.search_hmm_paths())
        args.aln_hmm_file = gpkg.alignment_hmm_path()
        args.reference_package = gpkg.reference_package_path()
    elif hasattr(args, 'search_diamond_files'):
        if args.search_method != self.DIAMOND_SEARCH_METHOD:
            raise Exception(
                "Specified DIAMOND databases when not using the diamond search pipeline. Using: %s"
                % (args.search_method))
        if not hasattr(args, 'aln_hmm_file'):
            raise Exception("aln_hmm_file not specified")
    elif hasattr(args, 'search_hmm_files'):
        if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
            raise Exception(
                "Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s"
                % (args.search_method))
        if not hasattr(args, 'aln_hmm_file'):
            if len(args.search_hmm_files) == 1:
                # A single search HMM can double as the alignment HMM,
                # unless only searching (no alignment needed then).
                if not args.search_only:
                    args.aln_hmm_file = args.search_hmm_files[0]
            else:
                raise Exception(
                    "Multiple search HMMs specified, but aln_hmm_file not specified"
                )
    elif hasattr(args, 'search_hmm_list_file'):
        if args.search_method != self.HMMSEARCH_SEARCH_METHOD:
            raise Exception(
                "Specified HMM search_hmm_files when not using the hmmsearch pipeline. Using: %s"
                % (args.search_method))
        with open(args.search_hmm_list_file) as hmm_list:
            args.search_hmm_files = [line.rstrip() for line in hmm_list]
        if not hasattr(args, 'aln_hmm_file'):
            if not args.search_only:
                raise Exception(
                    "Multiple search HMMs specified, but aln_hmm_file not specified"
                )
    elif args.search_only:
        # NOTE(review): this reads args.search_diamond_file (singular),
        # unlike the search_diamond_files attribute tested above — confirm
        # both attributes exist on the parsed args.
        if args.search_diamond_file:
            args.search_method = self.DIAMOND_SEARCH_METHOD
            args.search_hmm_files = None
        else:
            raise Exception(
                'No gpkg, HMM, or DIAMOND database was specified, so there is no reference database to search with.'
            )
def regenerate(self, **kwargs):
    """Rebuild a SingleM package by combining eukaryotic hits with
    species-dereplicated archaeal and bacterial sequences, re-running
    graftM create, pruning the DIAMOND DB to tree leaves, and compiling
    the final SingleM package.

    Keyword Parameters
    ------------------
    input_singlem_package: str
        existing SingleM package to regenerate
    output_singlem_package: str
        path for the regenerated package
    working_directory: str
        directory for intermediate files
    euk_sequences: str
        FASTA of eukaryotic sequences to screen against the package
    euk_taxonomy: str
        taxonomy file for the eukaryotic sequences
    intermediate_archaea_graftm_package: str
    intermediate_bacteria_graftm_package: str
        GraftM packages supplying archaeal/bacterial sequences and taxonomy
    input_taxonomy: str
        taxonomy for the archaeal/bacterial sequences
    type_strains_list_file: str
        file of sequence IDs (one per line) to prefer during dereplication
    """
    input_singlem_package = kwargs.pop('input_singlem_package')
    output_singlem_package = kwargs.pop('output_singlem_package')
    working_directory = kwargs.pop('working_directory')
    euk_sequences = kwargs.pop('euk_sequences')
    euk_taxonomy = kwargs.pop('euk_taxonomy')
    intermediate_archaea_graftm_package = kwargs.pop('intermediate_archaea_graftm_package')
    intermediate_bacteria_graftm_package = kwargs.pop('intermediate_bacteria_graftm_package')
    input_taxonomy = kwargs.pop('input_taxonomy')
    type_strains_list_file = kwargs.pop('type_strains_list_file')
    if len(kwargs) > 0:
        raise Exception("Unexpected arguments detected: %s" % kwargs)

    original_pkg = SingleMPackage.acquire(input_singlem_package)
    original_hmm_path = original_pkg.hmm_path()
    basename = original_pkg.graftm_package_basename()

    # Run GraftM on the euk sequences with the bacterial set
    euk_graftm_output = os.path.join(working_directory,
                                     "%s-euk_graftm" % basename)
    cmd = "graftM graft --graftm_package '%s' --search_and_align_only --forward '%s' --output %s --force" % (
        original_pkg.graftm_package_path(), euk_sequences, euk_graftm_output)
    extern.run(cmd)

    # Extract hit sequences from that set
    euk_result = GraftMResult(euk_graftm_output)
    hit_paths = euk_result.unaligned_sequence_paths(require_hits=True)
    if len(hit_paths) != 1:
        raise Exception(
            "Unexpected number of hits against euk in graftm")
    # NOTE(review): dict.values()[0] is Python-2-only; under Python 3 this
    # raises TypeError (dict_values is not subscriptable) — confirm which
    # interpreter this module targets.
    euk_hits_path = hit_paths.values()[0]

    # Concatenate euk, archaea and bacterial sequences
    archaeal_intermediate_pkg = GraftMPackage.acquire(
        intermediate_archaea_graftm_package)
    bacterial_intermediate_pkg = GraftMPackage.acquire(
        intermediate_bacteria_graftm_package)
    num_euk_hits = 0
    final_sequences_path = os.path.join(working_directory,
                                        "%s_final_sequences.faa" % basename)
    # NOTE(review): archeal_seqs / bacterial_seqs appear unused below
    # (the dereplication loop re-derives the paths itself).
    archeal_seqs = archaeal_intermediate_pkg.unaligned_sequence_database_path()
    bacterial_seqs = bacterial_intermediate_pkg.unaligned_sequence_database_path()
    with open(type_strains_list_file) as f:
        type_strain_identifiers = [s.strip() for s in f.readlines()]
    logging.info("Read in %i type strain IDs e.g. %s" % (
        len(type_strain_identifiers), type_strain_identifiers[0]))
    with open(final_sequences_path, 'w') as final_seqs_fp:
        with open(euk_hits_path) as euk_seqs_fp:
            for name, seq, _ in SeqReader().readfq(euk_seqs_fp):
                # '_split_' entries are excluded from the final package.
                if name.find('_split_') == -1:
                    num_euk_hits += 1
                    #TODO: Dereplicate at some level
                    final_seqs_fp.write(">%s\n%s\n" % (name, seq))
        logging.info("Found %i eukaryotic sequences to include in the package" % \
                     num_euk_hits)

        # Dereplicate hit sequences on the species level, choosing type strains
        # where applicable. (This loop writes to final_seqs_fp, so it must
        # remain inside the enclosing 'with' block.)
        dereplicator = Dereplicator()
        for gpkg in [archaeal_intermediate_pkg, bacterial_intermediate_pkg]:
            tax = gpkg.taxonomy_hash()
            species_dereplicated_ids = dereplicator.dereplicate(
                list(tax.keys()),
                8, # root, kingdom, phylum, c o f g s
                tax,
                type_strain_identifiers)
            logging.debug("Dereplicator returned %i entries" %
                          len(species_dereplicated_ids))
            num_total = 0
            num_written = 0
            with open(gpkg.unaligned_sequence_database_path()) as seqs:
                for name, seq, _ in SeqReader().readfq(seqs):
                    num_total += 1
                    if name in species_dereplicated_ids:
                        final_seqs_fp.write(">%s\n%s\n" % (name, seq))
                        num_written += 1
            logging.info(
                "Of %i sequences in gpkg %s, %i species-dereplicated were included in the final package." % (
                    num_total, gpkg, num_written))

    # Concatenate euk and input taxonomy
    final_taxonomy_file = os.path.join(working_directory,
                                       "%s_final_taxonomy.csv" % basename)
    extern.run("cat %s %s > %s" % (
        euk_taxonomy, input_taxonomy, final_taxonomy_file))

    # Run graftm create to get the final package
    final_gpkg = os.path.join(working_directory,
                              "%s_final.gpkg" % basename)
    cmd = "graftM create --force --sequences %s --taxonomy %s --search_hmm_files %s %s --hmm %s --output %s" % (
        final_sequences_path, final_taxonomy_file,
        ' '.join(archaeal_intermediate_pkg.search_hmm_paths()),
        ' '.join(bacterial_intermediate_pkg.search_hmm_paths()),
        original_hmm_path, final_gpkg)
    extern.run(cmd)

    ##############################################################################
    # Remove sequences from the diamond DB that are not in the tree i.e.
    # those that are exact duplicates, so that the diamond_example hits are
    # always in the tree.

    # Read the list of IDs in the tree with dendropy
    final_gpkg_object = GraftMPackage.acquire(final_gpkg)
    unaligned_seqs = final_gpkg_object.unaligned_sequence_database_path()
    tree = dendropy.Tree.get(path=final_gpkg_object.reference_package_tree_path(),
                             schema='newick')
    # Normalise spaces to underscores so leaf names match sequence IDs.
    leaf_names = [l.taxon.label.replace(' ','_') for l in tree.leaf_node_iter()]
    logging.debug("Read in final tree with %i leaves" % len(leaf_names))

    # Extract out of the sequences file in the graftm package
    final_seqs = SequenceExtractor().extract_and_read(
        leaf_names, unaligned_seqs)
    if len(final_seqs) != len(leaf_names):
        # NOTE(review): "fastat" is a typo for "fasta" in this message.
        raise Exception("Do not appear to have extracted the expected number of sequences from the unaligned fastat file")

    # Write the reads into sequences file in place
    with open(unaligned_seqs, 'w') as f:
        for s in final_seqs:
            f.write(">%s\n" % s.name)
            f.write(s.seq)
            f.write("\n")

    # Regenerate the diamond DB
    final_gpkg_object.create_diamond_db()

    ##############################################################################
    # Run singlem create to put the final package together
    SingleMPackageVersion2.compile(
        output_singlem_package, final_gpkg,
        original_pkg.singlem_position(),
        original_pkg.window_size())
    logging.info("SingleM package generated.")