def test_build_blast_db_from_seqs(self):
    """build_blast_db_from_seqs convenience function works as expected
    """
    blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1,
                                                  output_dir='/tmp')
    self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
    self.assertTrue(blast_db.endswith('.fasta'))
    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_generate_new_otus_stats(self):
    """Test generating new OTU stats on valid input data."""
    exp = [('New.CleanUp.ReferenceOTU972',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC',
            'foo;bar;baz', 2, 60.0, 4.349999999999994,
            {'Env2': 25.0, 'Env1': 35.0}),
           ('New.CleanUp.ReferenceOTU969',
            'ATACGTAGGTCCCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATGGCAAG'
            'TCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGC',
            'foo;bar;baz', 2, 14.0, 12.5,
            {'Env2': 8.0, 'Env1': 6.0}),
           ('New.CleanUp.ReferenceOTU964',
            'ATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGACGGCGAAGCAAG'
            'TCTGAAGTGAAAGCCCGGGGCTCAACCGCGGGACTGC',
            'foo;bar;baz', 2, 5.0, 14.769999999999996,
            {'Env2': 2.0, 'Env1': 3.0}),
           ('New.CleanUp.ReferenceOTU999',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC',
            'foo;bar;bazz', 1, 99.0, 4.349999999999994,
            {'Env2': 0.0, 'Env1': 99.0})]

    ref_seqs_db, ref_seqs_db_files_to_remove = \
        build_blast_db_from_fasta_path(self.ref_seqs_f.name)
    obs = _generate_new_otus_stats(self.otu_table_f, self.rep_set_f,
                                   self.ref_seqs_f, ref_seqs_db,
                                   self.mapping_f, self.grouping_category,
                                   self.top_n)
    remove_files(ref_seqs_db_files_to_remove)
    self.assertFloatEqual(obs, exp)
def tearDown(self):
    remove_files(set(self.files_to_remove))
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def tearDown(self):
    if self._files_to_remove:
        remove_files(self._files_to_remove)
    if exists(self.output_dir):
        rmtree(self.output_dir)
    if exists(self.input_dir):
        rmtree(self.input_dir)
def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta funcs always catches all seqs
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # test seqs_per_file from 1 to 1000
    for i in range(1, 1000):
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        # remove the files now, so if the test fails they still get
        # cleaned up
        remove_files(actual)

        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
def test_plot_heatmap(self):
    plot_heatmap(self.otu_table, self.otu_table.ObservationIds,
                 self.otu_table.SampleIds, filename=self.tmp_heatmap_fpath)
    self.assertEqual(exists(self.tmp_heatmap_fpath), True)
    remove_files(set([self.tmp_heatmap_fpath]))
def test_mothur_supported_version(self):
    """mothur is in path and version is supported """
    acceptable_version = (1, 25, 0)
    self.assertTrue(
        which('mothur'),
        "mothur not found. This may or may not be a problem depending on " +
        "which components of QIIME you plan to use.")
    # mothur creates a log file in cwd by default, so point it at a file in
    # the qiime temp dir instead
    log_file = join(get_qiime_temp_dir(), 'mothur.log')
    command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
    stdout, stderr, exit_status = qiime_system_call(command)
    # remove log file
    remove_files([log_file], error_on_missing=False)
    version_string = stdout.strip().split(' ')[1].strip('v.')
    try:
        version = tuple(map(int, version_string.split('.')))
        pass_test = version == acceptable_version
    except ValueError:
        pass_test = False
        version_string = stdout
    self.assertTrue(
        pass_test,
        "Unsupported mothur version. %s is required, but running %s." %
        ('.'.join(map(str, acceptable_version)), version_string))
def test_ParsInsert_supported_version(self):
    """ParsInsert is in path and version is supported """
    acceptable_version = ["1.04"]
    self.assertTrue(
        which('ParsInsert'),
        "ParsInsert not found. This may or may not be a problem depending on " +
        "which components of QIIME you plan to use.")

    command = "ParsInsert -v | grep App | awk '{print $3}'"
    proc = Popen(command, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)
    stdout = proc.stdout.read()

    # remove log file generated
    remove_files(['ParsInsert.log'], error_on_missing=False)

    version_string = stdout.strip()
    try:
        pass_test = version_string in acceptable_version
    except ValueError:
        pass_test = False
        version_string = stdout
    self.assertTrue(
        pass_test,
        "Unsupported ParsInsert version. %s is required, but running %s." %
        ('.'.join(map(str, acceptable_version)), version_string))
def check_options(parser, options):
    """Check to ensure required options have been supplied"""
    if options.percent_aligned > 1.0:
        parser.error(
            "Please check -p option: should be between 0.0(0%) and 1.0(100%)")

    if options.querydb is None:
        parser.error(
            "Please check -i option: must specify path to a FASTA file")
    try:
        f = open(options.querydb, 'r')
        f.close()
    except IOError:
        parser.error(
            "Please check -i option: cannot read from query FASTA filepath")

    if options.subjectdb is None:
        parser.error(
            "Please check -d option: must specify path to a FASTA file")
    try:
        f = open(options.subjectdb, 'r')
        f.close()
    except IOError:
        parser.error(
            "Please check -d option: cannot read from subject FASTA filepath")

    if options.outputfilename is None:
        parser.error(
            "Please check -o option: must specify base output path")
    try:
        f = open(options.outputfilename, 'w')
        f.close()
        remove_files([FilePath(options.outputfilename)])
    except IOError:
        parser.error(
            "Please check -o option: cannot write to output file")
def clean_up_raw_data_files(raw_data_files, raw_data_dirs):
    for raw_data_fp_glob in raw_data_files:
        remove_files(glob(raw_data_fp_glob))

    for raw_data_dir_glob in raw_data_dirs:
        for dir_to_remove in glob(raw_data_dir_glob):
            rmtree(dir_to_remove)
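# A minimal usage sketch for the helper above; the glob patterns and paths are
# hypothetical. Each file pattern is expanded and the matches removed, then
# each matching directory is removed recursively.
clean_up_raw_data_files(
    raw_data_files=['/tmp/my_run/*.sff', '/tmp/my_run/*.qual'],
    raw_data_dirs=['/tmp/my_run/sff_extract_*'])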
def qiime_blast_seqs(seqs,
                     blast_constructor=Blastall,
                     blast_program='blastn',
                     blast_db=None,
                     refseqs=None,
                     refseqs_fp=None,
                     blast_mat_root=None,
                     params={},
                     WorkingDir=None,
                     seqs_per_blast_run=1000,
                     HALT_EXEC=False):
    """Blast list of sequences.

    seqs: a list (or object with list-like interface) of (seq_id, seq)
     tuples (e.g., the output of MinimalFastaParser)

    """
    assert blast_db or refseqs_fp or refseqs, \
        'Must provide either a blast_db or a fasta ' +\
        'filepath containing sequences to build one.'

    if refseqs_fp:
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp, output_dir=WorkingDir)
    elif refseqs:
        blast_db, db_files_to_remove = \
            build_blast_db_from_fasta_file(refseqs, output_dir=WorkingDir)
    else:
        db_files_to_remove = []

    params["-d"] = blast_db
    params["-p"] = blast_program

    blast_app = blast_constructor(params=params,
                                  blast_mat_root=blast_mat_root,
                                  InputHandler='_input_as_seq_id_seq_pairs',
                                  WorkingDir=WorkingDir,
                                  SuppressStderr=True,
                                  HALT_EXEC=HALT_EXEC)

    current_seqs = []
    blast_results = BlastResult([])
    for seq in seqs:
        current_seqs.append(seq)
        if len(current_seqs) % seqs_per_blast_run == 0:
            if blast_results:
                blast_results.update(
                    BlastResult(blast_app(current_seqs)['StdOut']))
            else:
                blast_results = BlastResult(blast_app(current_seqs)['StdOut'])
            current_seqs = []

    # clean-up run: blast the remaining sequences
    blast_results.update(
        BlastResult(blast_app(current_seqs)['StdOut']))

    remove_files(db_files_to_remove)

    return blast_results
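# A minimal usage sketch for qiime_blast_seqs, assuming blastall/formatdb are
# installed. The query tuples and 'refseqs.fasta' path are hypothetical; params
# passes extra blastall options through to the app controller.
query_seqs = [('query1', 'ACCGGTACCTTGGAACTGAGTCACGGTACGT'),
              ('query2', 'TTGGCCAACGTAGGCATTACGGATCCATTGG')]
blast_results = qiime_blast_seqs(query_seqs,
                                 refseqs_fp='refseqs.fasta',
                                 params={'-e': '1e-10'},
                                 seqs_per_blast_run=500)
# blast_results is a BlastResult keyed by query id; the temporary blast
# database built from refseqs.fasta is removed before returning.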
def check_write_load(self, orig_data):
    orig = RegionCollection(**orig_data)
    orig.writeToFile('test_data')
    recovered = RegionCollection(filename='test_data')
    for key in orig_data:
        self.assertEqual(getattr(recovered, key), orig_data[key])
    remove_files(['test_data'], error_on_missing=False)
def tearDown(self):
    """Clean up tmp files."""
    remove_files(self.files_to_remove, False)
    if self.tmpdir:
        rmtree(self.tmpdir)

    # clean up the file from init_flowgram_file
    if (hasattr(self, "tmp_filename") and exists(self.tmp_filename)):
        remove(self.tmp_filename)
def tearDown(self):
    """ """
    disable_timeout()
    remove_files(self.files_to_remove, error_on_missing=False)

    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def tearDown(self):
    """Removes temporary directories and files."""
    remove_files(self.files_to_remove)

    # Remove directories last, so we don't get errors trying to remove
    # files which may be in the directories.
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def tearDown(self):
    """ """
    # turn off the alarm
    signal.alarm(0)
    remove_files(self.files_to_remove)
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def tearDown(self):
    """Remove temporary files/dirs created by tests."""
    # Change back to the start dir - some workflows change directory.
    chdir(self.start_dir)

    remove_files(self.files_to_remove)
    # Remove directories last, so we don't get errors trying to remove
    # files which may be in the directories.
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
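# A minimal sketch of the setUp bookkeeping these tearDown methods assume:
# every temporary file or directory a test creates is registered in
# files_to_remove / dirs_to_remove so tearDown can delete files first and
# directories last. Class and path names here are illustrative only, not taken
# from the tests above.
from os import getcwd
from os.path import join
from tempfile import mkdtemp
from unittest import TestCase


class ExampleWorkflowTests(TestCase):

    def setUp(self):
        self.start_dir = getcwd()
        self.files_to_remove = []
        self.dirs_to_remove = []

        # register the output dir so tearDown removes it (and its contents)
        self.output_dir = mkdtemp(prefix='example_workflow_')
        self.dirs_to_remove.append(self.output_dir)

        # register every temp file written during the test
        self.input_fp = join(self.output_dir, 'seqs.fna')
        with open(self.input_fp, 'w') as f:
            f.write('>seq1\nACCGT\n')
        self.files_to_remove.append(self.input_fp)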
def __call__(self, seq_path, result_path=None, log_path=None,
             blast_db=None, refseqs_fp=None):

    self.log_lines = []

    if not blast_db:
        self.blast_db, self.db_files_to_remove = \
            build_blast_db_from_fasta_path(refseqs_fp)
        self.log_lines.append('Reference seqs fp (to build blast db): %s' %
                              refseqs_fp)
    else:
        self.blast_db = blast_db
        self.db_files_to_remove = []

    self.log_lines.append('Blast database: %s' % self.blast_db)

    clusters, failures = self._cluster_seqs(
        MinimalFastaParser(open(seq_path)))
    self.log_lines.append('Num OTUs: %d' % len(clusters))

    if result_path:
        # if the user provided a result_path, write the
        # results to file with one tab-separated line per
        # cluster
        of = open(result_path, 'w')
        for cluster_id, cluster in clusters.items():
            of.write('%s\t%s\n' % (cluster_id, '\t'.join(cluster)))
        of.close()
        result = None
        self.log_lines.append('Result path: %s\n' % result_path)
    else:
        # if the user did not provide a result_path, store
        # the clusters in a dict of {otu_id:[seq_ids]}, where
        # otu_id is arbitrary
        result = clusters
        self.log_lines.append('Result path: None, returned as dict.')

    if log_path:
        # if the user provided a log file path, log the run
        log_file = open(log_path, 'w')
        self.log_lines = [str(self)] + self.log_lines
        log_file.write('\n'.join(self.log_lines))
        failures.sort()
        log_file.write('Num failures: %d\n' % len(failures))
        log_file.write('Failures: %s\n' % '\t'.join(failures))

    remove_files(self.db_files_to_remove, error_on_missing=False)

    # return the result (note this is None if the data was
    # written to file)
    return result
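# Hedged usage sketch: in QIIME 1.x this __call__ belongs to the BLAST-based
# OTU picker (BlastOtuPicker in qiime.pick_otus). The constructor parameters
# and file paths below are illustrative and should be checked against the
# installed version.
otu_picker = BlastOtuPicker({'max_e_value': 1e-10, 'Similarity': 0.97})
otu_map = otu_picker('seqs.fasta',                # query sequences to cluster
                     result_path=None,            # None -> return {otu_id: [seq_ids]}
                     log_path='blast_otus.log',
                     refseqs_fp='refseqs.fasta')  # blast db is built, then removed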
def test_export_table(self):
    """correctly generates table file"""
    orig_data = dict(counts=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                     ranks=[0, 1, 2, 3, 4],
                     labels=['a', 'b', 'c', 'd', 'e'])

    coll = RegionCollection(**orig_data)
    expect = coll.toTable().getRawData()
    coll.writeToFile('testdata', as_table=True)

    got = LoadTable('testdata', sep='\t')
    self.assertEqual(got.getRawData(), expect)
    remove_files(['testdata'], error_on_missing=False)
def tearDown(self):
    disable_timeout()

    # reset sys.stderr
    sys.stderr = self.saved_stderr

    remove_files(self.files_to_remove)
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def test_tree_collection_read_write_file(self):
    """should correctly read / write a collection from a file"""
    def eval_klass(coll):
        coll.writeToFile('sample.trees')
        read = LoadTrees('sample.trees')
        self.assertTrue(type(read) == type(coll))

    eval_klass(LogLikelihoodScoredTreeCollection(self.scored_trees))

    # convert lnL into p
    eval_klass(WeightedTreeCollection([(exp(s), t)
                                       for s, t in self.scored_trees]))
    remove_files(['sample.trees'], error_on_missing=False)
def tearDown(self):
    """ """
    disable_timeout()

    # change back to the start dir - some workflows change directory
    chdir(self.start_dir)

    remove_files(self.files_to_remove)
    # remove directories last, so we don't get errors
    # trying to remove files which may be in the directories
    for d in self.dirs_to_remove:
        if exists(d):
            rmtree(d)
def remove_pyronoise_intermediates(basename):
    """Removes all intermediate pyronoise files.

    basename: the prefix for all files.
    """
    remove_files(map(lambda a: basename + a,
                     [".otu", ".tau", "_cd.fa", ".fdist", ".pout", ".tree",
                      "_cd2.fa", ".cen", ".list", ".qual", ".z", "_cf.fa"]),
                 True)
    for filename in listdir(basename):
        remove(basename + "/" + filename)
    rmdir(basename)
def remove_intermediate_files(self):
    """Remove all intermediate files."""

    # tmp files are written in the current dir,
    # app controller always jumps into dir specified via exec_dir
    # Note: blast intermediates are not removed
    exec_dir = str(self.Parameters['--exec_dir'].Value)
    inp_file_name = str(self.Parameters['--query_NAST'].Value)

    exec_dir = exec_dir.rstrip('"')
    exec_dir = exec_dir.lstrip('"')
    inp_file_name = inp_file_name.rstrip('"')
    inp_file_name = inp_file_name.lstrip('"')

    tmp_suffixes = [".CPS", ".CPS.CPC", ".CPS_RENAST", ".CPS_RENAST.cidx",
                    ".CPS.CPC.wTaxons", ".cidx"]
    cs_tmp_files = [exec_dir + '/' + inp_file_name + x for x in tmp_suffixes]
    remove_files(cs_tmp_files, error_on_missing=False)

    db_param = self.Parameters['--db_NAST']
    if db_param.isOn():
        nast_db_name = str(db_param.Value)
        nast_db_name = nast_db_name.rstrip('"')
        nast_db_name = nast_db_name.lstrip('"')

        # Better do not remove this file since other ChimeraSlayer
        # instances running on the same ref set might use this file.
        # Should rather be deleted in the calling function.
        # remove_files([nast_db_name + ".cidx"],
        #              error_on_missing=False)

    fasta_param = self.Parameters['--db_FASTA']
    if fasta_param.isOn():
        fasta_name = str(fasta_param.Value)
        fasta_name = fasta_name.rstrip('"')
        fasta_name = fasta_name.lstrip('"')

        blast_db_files = [fasta_name + x
                          for x in [".nsq", ".nin", ".nhr", ".cidx"]]
        remove_files(blast_db_files, error_on_missing=False)
def tearDown(self):
    """Clean up tmp files."""

    # turn off the alarm
    signal.alarm(0)

    remove_files(self.files_to_remove, False)
    if self.server_socket:
        self.server_socket.close()
    # give clients time to clean up
    sleep(1)
    if exists(self.tmp_dir):
        try:
            rmdir(self.tmp_dir)
        except OSError:
            # give clients some more time, fail if still error
            sleep(5)
            rmdir(self.tmp_dir)
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file
    """
    filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 1, filename_prefix)
    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

    self.assertEqual(actual, expected)
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def test_build_blast_db_from_fasta_path_aln(self):
    """build_blast_db_from_fasta_path works with alignment as input
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
    self.assertEqual(blast_db, self.in_aln1_fp)

    expected_db_files = set([blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)
    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_compute_seqs_per_file(self):
    """compute_seqs_per_file functions as expected
    """
    temp_fasta_fp = get_tmp_filename(prefix='QiimeScriptUtilTests',
                                     suffix='.fasta')
    temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
    open(temp_fasta_fp, 'w').write('\n'.join(temp_fasta))

    actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
    actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
    actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
    actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
    actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)

    remove_files([temp_fasta_fp])

    self.assertEqual(actual_25, 1)
    self.assertEqual(actual_2, 13)
    self.assertEqual(actual_10, 3)
    self.assertEqual(actual_5, 5)
    self.assertEqual(actual_40, 1)
def test_build_blast_db_from_fasta_path(self):
    """build_blast_db_from_fasta_path convenience function works as expected
    """
    blast_db, db_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)
    self.assertEqual(blast_db, self.in_seqs1_fp)

    expected_db_files = set([self.in_seqs1_fp + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file
    """
    filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 2, filename_prefix)
    actual_seqs = []
    for fp in actual:
        actual_seqs += list(open(fp))
    remove_files(actual)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def test_build_blast_db_from_fasta_file(self):
    """build_blast_db_from_fasta_file works with open files as input
    """
    blast_db, db_files = \
        build_blast_db_from_fasta_file(open(self.in_aln1_fp),
                                       output_dir='/tmp/')
    self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
    self.assertTrue(blast_db.endswith('.fasta'))
    expected_db_files = set([blast_db] +
                            [blast_db + ext
                             for ext in ['.nhr', '.nin', '.nsq',
                                         '.nsd', '.nsi', '.log']])
    self.assertEqual(set(db_files), expected_db_files)

    # result returned when blasting against new db
    self.assertEqual(
        len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

    # Make sure all db_files exist
    for fp in db_files:
        self.assertTrue(exists(fp))

    # Remove all db_files
    remove_files(db_files)

    # Make sure nothing weird happened in the remove
    for fp in db_files:
        self.assertFalse(exists(fp))
def tearDown(self):
    for dir in self.dirs_to_remove:
        if exists(dir):
            rmdir(dir)
    remove_files(self.files_to_remove)
def tearDown(self):
    remove_files(self.files_to_remove)
def get_clusters_from_fasta_filepath(fasta_filepath,
                                     original_fasta_path,
                                     percent_ID=0.97,
                                     max_accepts=1,
                                     max_rejects=8,
                                     stepwords=8,
                                     word_length=8,
                                     optimal=False,
                                     exact=False,
                                     suppress_sort=False,
                                     output_dir=None,
                                     enable_rev_strand_matching=False,
                                     subject_fasta_filepath=None,
                                     suppress_new_clusters=False,
                                     return_cluster_maps=False,
                                     stable_sort=False,
                                     save_uc_files=True,
                                     HALT_EXEC=False):
    """ Main convenience wrapper for using uclust to generate cluster files

    A source fasta file is required for the fasta_filepath. This will be
    sorted to be in order of longest to shortest length sequences. Following
    this, the sorted fasta file is used to generate a cluster file in the
    uclust (.uc) format. Next the .uc file is converted to cd-hit format
    (.clstr). Finally this file is parsed and returned as a list of lists,
    where each sublist is a cluster of sequences. If an output_dir is
    specified, the intermediate files will be preserved, otherwise all
    files created are temporary and will be deleted at the end of this
    function.

    The percent_ID parameter specifies the percent identity for a cluster,
    i.e., if 99% were the parameter, all sequences that were 99% identical
    would be grouped as a cluster.
    """

    # Create readable intermediate filenames if they are to be kept
    fasta_output_filepath = None
    uc_output_filepath = None
    cd_hit_filepath = None

    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    if save_uc_files:
        uc_save_filepath = get_output_filepaths(output_dir,
                                                original_fasta_path)
    else:
        uc_save_filepath = None

    sorted_fasta_filepath = ""
    uc_filepath = ""
    clstr_filepath = ""

    # Error check in case any app controller fails
    files_to_remove = []
    try:
        if not suppress_sort:
            # Sort fasta input file from largest to smallest sequence
            sort_fasta = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=fasta_output_filepath)

            # Get sorted fasta name from application wrapper
            sorted_fasta_filepath = sort_fasta['Output'].name
            files_to_remove.append(sorted_fasta_filepath)
        else:
            sort_fasta = None
            sorted_fasta_filepath = fasta_filepath

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)
        # Get cluster file name from application wrapper
        remove_files(files_to_remove)
    except ApplicationError:
        remove_files(files_to_remove)
        raise ApplicationError('Error running uclust. Possible causes are '
                               'unsupported version (current supported '
                               'version is v1.2.22) is installed or '
                               'improperly formatted input file was provided')
    except ApplicationNotFoundError:
        remove_files(files_to_remove)
        raise ApplicationNotFoundError('uclust not found, is it properly '
                                       'installed?')

    # Get list of lists for each cluster
    clusters, failures, seeds = \
        clusters_from_uc_file(uclust_cluster['ClusterFile'])

    # Remove temp files unless user specifies output filepath
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if return_cluster_maps:
        return clusters, failures, seeds
    else:
        return clusters.values(), failures, seeds
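# A minimal usage sketch (assumes uclust is on the PATH; file paths are
# hypothetical). With return_cluster_maps=False (the default) the first return
# value is a list of lists of sequence identifiers, one sublist per cluster.
clusters, failures, seeds = get_clusters_from_fasta_filepath(
    'seqs.fasta',                    # fasta to be sorted and clustered
    'seqs.fasta',                    # original fasta path, used to name saved .uc files
    percent_ID=0.97,                 # sequences >= 97% identical share a cluster
    enable_rev_strand_matching=True,
    save_uc_files=False)             # discard the intermediate .uc output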
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

    The steps performed by this function are:
      - Pick reference OTUs against refseqs_fp
      - Subsample the failures to n sequences.
      - Pick OTUs de novo on the n failures.
      - Pick representative sequences for the resulting OTUs.
      - Pick reference OTUs on all failures using the representative set
        from step 4 as the reference set.
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in \
        allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        # Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)
    commands.append([('Pick reference OTUs using de novo rep set',
                      step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp, step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir

    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')

    # iterate over all representative sequences from step2 and step4 and
    # write those corresponding to non-singleton otus to the final
    # representative set file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(
                open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % \
            (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s '\
                '--observation-metadata-fp %s -o %s '\
                '--sc-separated taxonomy --observation-header OTUID,taxonomy'\
                % (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
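# A hedged sketch of how this workflow function might be driven. The helper
# names and import paths follow QIIME 1.x conventions (qiime.util,
# qiime.workflow.util) but may differ between releases; the input and
# reference paths are hypothetical.
from qiime.util import load_qiime_config, parse_qiime_parameters
from qiime.workflow.util import call_commands_serially, no_status_updates

qiime_config = load_qiime_config()
params = parse_qiime_parameters([])    # empty parameters file -> all defaults

pick_subsampled_open_reference_otus(
    input_fp='seqs.fna',               # demultiplexed input sequences
    refseqs_fp='ref_97_otus.fasta',    # reference collection
    output_dir='ucrss_otus/',
    percent_subsample=0.001,           # subsample 0.1% of the step 1 failures
    new_ref_set_id='New',
    command_handler=call_commands_serially,
    params=params,
    qiime_config=qiime_config,
    status_update_callback=no_status_updates)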