def setUp(self):
    """Create the temporary RTAX input files shared by the tests.

    Five temp paths are reserved (taxonomy map, representative-set
    sequences, reference sequences, read 1 and read 2), each one is
    filled from the matching module-level fixture string, and all five
    are recorded in ``self._paths_to_clean_up``.
    """
    self.maxDiff = None

    self.id_to_taxonomy_fp = get_tmp_filename(
        prefix='RtaxTaxonAssignerTests_', suffix='.txt')
    self.input_seqs_fp = get_tmp_filename(
        prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
    self.reference_seqs_fp = get_tmp_filename(
        prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
    self.read_1_seqs_fp = get_tmp_filename(
        prefix='RtaxTaxonAssignerTests_', suffix='.fasta')
    self.read_2_seqs_fp = get_tmp_filename(
        prefix='RtaxTaxonAssignerTests_', suffix='.fasta')

    self._paths_to_clean_up = [
        self.id_to_taxonomy_fp,
        self.input_seqs_fp,
        self.reference_seqs_fp,
        self.read_1_seqs_fp,
        self.read_2_seqs_fp,
    ]

    # (destination path, fixture text) pairs, in the order they are written
    fixtures = (
        (self.id_to_taxonomy_fp, rtax_reference_taxonomy),
        (self.reference_seqs_fp, rtax_reference_fasta),
        (self.input_seqs_fp, rtax_test_repset_fasta),
        (self.read_1_seqs_fp, rtax_test_read1_fasta),
        (self.read_2_seqs_fp, rtax_test_read2_fasta),
    )
    for destination, text in fixtures:
        handle = open(destination, 'w')
        handle.write(text)
        handle.close()
def setUp(self):
    """Write the protein db, DNA db and query fixtures to temp files.

    The fixtures are the module-level ``test_db_prot``, ``test_db_dna``
    and ``test_query`` line lists, each joined with newlines.  Real files
    are used rather than file-like objects because the external
    application needs actual files.  A fourth path, ``self.testout``, is
    reserved for output.  All four paths are added to
    ``self.files_to_remove``, which must already exist when this runs.
    """
    def fresh_path():
        # temp path with any double-quote characters stripped out
        return get_tmp_filename().replace('"', '')

    self.test_db_prot_filename = fresh_path()
    self.test_db_prot = open(self.test_db_prot_filename, 'w')
    self.test_db_dna_filename = fresh_path()
    self.test_db_dna = open(self.test_db_dna_filename, 'w')
    self.test_query_filename = fresh_path()
    self.test_query = open(self.test_query_filename, 'w')

    targets = (
        (self.test_db_prot, test_db_prot),
        (self.test_db_dna, test_db_dna),
        (self.test_query, test_query),
    )
    # fill every file first ...
    for handle, fixture_lines in targets:
        handle.write('\n'.join(fixture_lines))
    # ... then close them all
    for handle, _ in targets:
        handle.close()

    # prepare output file path
    self.testout = fresh_path()

    self.files_to_remove += [
        self.test_db_prot_filename,
        self.test_db_dna_filename,
        self.test_query_filename,
        self.testout,
    ]
def test_call_log_file(self):
    """GenericRepSetPicker.__call__ writes log when expected
    """
    tmp_log_filepath = get_tmp_filename(
        prefix='GenericRepSetPickerTest.test_call_output_to_file_l_',
        suffix='.txt')
    tmp_result_filepath = get_tmp_filename(
        prefix='GenericRepSetPickerTest.test_call_output_to_file_r_',
        suffix='.txt')

    picker = GenericRepSetPicker(params=self.params)
    picker(self.tmp_seq_filepath,
           self.tmp_otu_filepath,
           result_path=tmp_result_filepath,
           log_path=tmp_log_filepath)

    log_handle = open(tmp_log_filepath)
    observed_lines = log_handle.read().splitlines()
    log_handle.close()

    # Delete both temp files before any assertion runs, so a failing
    # comparison cannot leave them behind.
    remove(tmp_log_filepath)
    remove(tmp_result_filepath)

    expected_lines = [
        "GenericRepSetPicker parameters:",
        'Algorithm:first',
        "Application:None",
        'ChoiceF:first',
        'ChoiceFRequiresSeqs:False',
        "Result path: %s" % tmp_result_filepath,
    ]

    # Pairwise comparison; zip() stops at the shorter of the two lists.
    for observed, expected in zip(observed_lines, expected_lines):
        if observed.startswith('ChoiceF:'):
            # can't test, different each time
            continue
        self.assertEqual(observed, expected)
def setUp(self):
    """Create the BLAST taxon-assigner input files and expected results.

    Three temp files are written: the id-to-taxonomy map, the query
    sequences (``test_seq_coll`` as fasta) and the reference sequences
    (``test_refseq_coll`` as fasta).  Their paths are recorded in
    ``self._paths_to_clean_up``.  ``self.test_seqs`` holds the items of
    the query collection and ``self.expected1`` maps each query id to the
    expected ``(taxonomy, value, reference id)`` tuple.
    """
    self.id_to_taxonomy_fp = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.txt')
    self.input_seqs_fp = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')
    self.reference_seqs_fp = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')

    self._paths_to_clean_up = [
        self.id_to_taxonomy_fp,
        self.input_seqs_fp,
        self.reference_seqs_fp,
    ]

    self.test_seqs = test_seq_coll.items()

    # Write through context managers so every handle is flushed and
    # closed before the tests read the files back.
    fixtures = (
        (self.id_to_taxonomy_fp, id_to_taxonomy_string),
        (self.input_seqs_fp, test_seq_coll.toFasta()),
        (self.reference_seqs_fp, test_refseq_coll.toFasta()),
    )
    for path, content in fixtures:
        with open(path, 'w') as out_f:
            out_f.write(content)

    self.expected1 = {
        's1': ('Archaea;Euryarchaeota;Halobacteriales;uncultured',
               0.0, "AY800210"),
        's2': ('Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.',
               0.0, "EU883771"),
        's3': ('Archaea;Crenarchaeota;uncultured;uncultured',
               0.0, "EF503699"),
        's4': ('Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium',
               0.0, "DQ260310"),
        's5': ('Archaea;Crenarchaeota;uncultured;uncultured',
               0.0, "EF503697"),
        's6': ('No blast hit', None, None),
    }
def setUp(self):
    """Stage the RTAX fixture files on disk before each test.

    Reserves five temp paths (taxonomy map, representative set,
    reference set, read 1, read 2), lists them in
    ``self._paths_to_clean_up`` and writes the corresponding
    module-level fixture string into each.
    """
    def dump(path, text):
        # write one fixture string to its temp path
        out = open(path, 'w')
        out.write(text)
        out.close()

    self.maxDiff = None

    tmp_prefix = 'RtaxTaxonAssignerTests_'
    self.id_to_taxonomy_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.txt')
    self.input_seqs_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.fasta')
    self.reference_seqs_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.fasta')
    self.read_1_seqs_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.fasta')
    self.read_2_seqs_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.fasta')

    self._paths_to_clean_up = [
        self.id_to_taxonomy_fp,
        self.input_seqs_fp,
        self.reference_seqs_fp,
        self.read_1_seqs_fp,
        self.read_2_seqs_fp,
    ]

    dump(self.id_to_taxonomy_fp, rtax_reference_taxonomy)
    dump(self.reference_seqs_fp, rtax_reference_fasta)
    dump(self.input_seqs_fp, rtax_test_repset_fasta)
    dump(self.read_1_seqs_fp, rtax_test_read1_fasta)
    dump(self.read_2_seqs_fp, rtax_test_read2_fasta)
def setUp(self):
    """Write the Ace test trees and trait tables to temp files.

    Two tree files (``.nwk``) and three trait files (``.tsv``) are
    created from the module-level fixtures.  For each one both the path
    (``*_fp``) and the now-closed file object (``*_file``) are kept on
    the instance; all five paths are listed in ``self.files_to_remove``.
    """
    def stage(suffix, content):
        # reserve an 'AceTests' temp path, write content, return (path, handle)
        path = get_tmp_filename(prefix='AceTests', suffix=suffix)
        handle = open(path, 'w')
        handle.write(content)
        handle.close()
        return path, handle

    # tree file
    self.in_tree1_fp, self.in_tree1_file = stage('.nwk', in_tree1)
    # tree file with underscores in tip names
    self.in_tree2_fp, self.in_tree2_file = stage('.nwk', in_tree2)
    # trait file
    self.in_trait1_fp, self.in_trait1_file = stage('.tsv', in_trait1)
    # second trait file (a single-column table needs testing separately)
    self.in_trait2_fp, self.in_trait2_file = stage('.tsv', in_trait2)
    # trait file with underscores in tip names
    self.in_trait3_fp, self.in_trait3_file = stage('.tsv', in_trait3)

    self.files_to_remove = [
        self.in_tree1_fp,
        self.in_trait1_fp,
        self.in_trait2_fp,
        self.in_trait3_fp,
        self.in_tree2_fp,
    ]
def setUp(self):
    """Write the uclust fixtures to temp files.

    The unsorted fasta, sorted fasta and ``.uc`` cluster fixtures are
    written as newline-joined lines.  A fourth temp path with a
    ``.clstr`` suffix is reserved but nothing is written to it here.
    All four paths are listed in ``self.files_to_remove``.
    """
    def spill(suffix, lines):
        # reserve a "uclust_test" temp path and fill it with the joined lines
        path = get_tmp_filename(prefix="uclust_test", suffix=suffix)
        out = open(path, "w")
        out.write('\n'.join(lines))
        out.close()
        return path

    self.tmp_unsorted_fasta_filepath = spill(".fasta", raw_dna_seqs)
    self.tmp_sorted_fasta_filepath = spill(".fasta", sorted_dna_seqs)
    self.tmp_uc_filepath = spill(".uc", uc_dna_clusters)
    self.tmp_clstr_filepath = get_tmp_filename(prefix="uclust_test",
                                               suffix=".clstr")

    self.WorkingDir = '/tmp/uclust_test'
    self.tmpdir = '/tmp/'

    self.files_to_remove = [
        self.tmp_unsorted_fasta_filepath,
        self.tmp_sorted_fasta_filepath,
        self.tmp_uc_filepath,
        self.tmp_clstr_filepath,
    ]
def setUp(self):
    """Create the Infernal aligner inputs, output placeholders and aligner.

    Writes the test-1 input fasta and Stockholm template to temp files,
    creates empty result and log files, records all four paths in
    ``self._paths_to_clean_up``, builds an ``InfernalAligner`` configured
    with the template path and loads the expected alignment.
    """
    self.infernal_test1_input_fp = get_tmp_filename(
        prefix='InfernalAlignerTests_', suffix='.fasta')
    # context managers guarantee the fixtures are flushed and closed
    # before the aligner reads them
    with open(self.infernal_test1_input_fp, 'w') as input_f:
        input_f.write(infernal_test1_input_fasta)

    self.infernal_test1_template_fp = get_tmp_filename(
        prefix='InfernalAlignerTests_', suffix='template.sto')
    with open(self.infernal_test1_template_fp, 'w') as template_f:
        template_f.write(infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = get_tmp_filename(
        prefix='InfernalAlignerTests_', suffix='.fasta')
    open(self.result_fp, 'w').close()
    self.log_fp = get_tmp_filename(
        prefix='InfernalAlignerTests_', suffix='.log')
    open(self.log_fp, 'w').close()

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    self.infernal_test1_aligner = InfernalAligner({
        'template_filepath': self.infernal_test1_template_fp,
    })
    self.infernal_test1_expected_aln = LoadSeqs(
        data=infernal_test1_expected_alignment,
        aligned=Alignment,
        moltype=DNA)
def setUp(self):
    """Stage Infernal aligner fixtures and build the aligner under test.

    The input fasta and Stockholm template are written to temp files,
    empty result/log files are touched, and all four paths are recorded
    in ``self._paths_to_clean_up``.  An ``InfernalAligner`` is created
    with the template path, and the expected alignment is loaded into
    ``self.infernal_test1_expected_aln``.
    """
    tmp_prefix = 'InfernalAlignerTests_'

    self.infernal_test1_input_fp = get_tmp_filename(
        prefix=tmp_prefix, suffix='.fasta')
    self.infernal_test1_template_fp = get_tmp_filename(
        prefix=tmp_prefix, suffix='template.sto')

    # Each fixture is written inside a ``with`` block so its handle is
    # closed (and the data flushed) deterministically.
    fixtures = (
        (self.infernal_test1_input_fp, infernal_test1_input_fasta),
        (self.infernal_test1_template_fp, infernal_test1_template_stockholm),
    )
    for path, content in fixtures:
        with open(path, 'w') as out_f:
            out_f.write(content)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.fasta')
    open(self.result_fp, 'w').close()
    self.log_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.log')
    open(self.log_fp, 'w').close()

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    self.infernal_test1_aligner = InfernalAligner({
        'template_filepath': self.infernal_test1_template_fp,
    })
    self.infernal_test1_expected_aln = LoadSeqs(
        data=infernal_test1_expected_alignment,
        aligned=Alignment,
        moltype=DNA)
def ace_for_picrust(tree_path, trait_table_path, method='pic', HALT_EXEC=False):
    '''Run the Ace application controller and load its two output tables.

    tree_path: path to the tree file; wrapped in double quotes before
        being handed to Ace
    trait_table_path: path to the trait table; quoted the same way
    method: passed to Ace as the third argument (default: 'pic')
    HALT_EXEC: forwarded to the Ace constructor

    Returns (asr_table, asr_prob_table), both loaded with LoadTable from
    tab-separated temporary output files that have a header row.

    Raises RuntimeError, carrying the application's StdErr, when the
    count output file cannot be read.

    The two temporary output files are removed on success and on
    failure, provided they exist.
    '''
    import os

    # initialize Ace app controller
    ace = Ace(HALT_EXEC=HALT_EXEC)

    tmp_output_count_path = get_tmp_filename()
    tmp_output_prob_path = get_tmp_filename()

    # quote file names
    tree_path = '"{0}"'.format(tree_path)
    trait_table_path = '"{0}"'.format(trait_table_path)

    as_string = " ".join([tree_path, trait_table_path, method,
                          tmp_output_count_path, tmp_output_prob_path])

    try:
        # Run ace here
        result = ace(data=as_string)

        # Load the output into Table objects
        try:
            asr_table = LoadTable(filename=tmp_output_count_path,
                                  header=True, sep='\t')
        except IOError:
            raise RuntimeError(
                "R reported an error on stderr:"
                " %s" % "\n".join(result["StdErr"].readlines()))

        asr_prob_table = LoadTable(filename=tmp_output_prob_path,
                                   header=True, sep='\t')
    finally:
        # Remove tmp files on every exit path. A failed run may not have
        # produced them, so only delete what is actually there.
        for tmp_path in (tmp_output_count_path, tmp_output_prob_path):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return asr_table, asr_prob_table
def setUp(self):
    """Write the Count test trees and trait tables to temp files.

    Two tree files (``.nwk``) and two trait files (``.tsv``) are created
    from the module-level fixtures.  For each, the path (``*_fp``) and
    the closed file object (``*_file``) are stored on the instance.
    Every file created here is listed in ``self.files_to_remove`` so
    that none of them is left behind.
    """
    def stage(suffix, content):
        # reserve a 'CountTests' temp path, write content, return (path, handle)
        path = get_tmp_filename(prefix='CountTests', suffix=suffix)
        handle = open(path, 'w')
        try:
            handle.write(content)
        finally:
            handle.close()
        return path, handle

    # create a tmp tree file
    self.in_tree1_fp, self.in_tree1_file = stage('.nwk', in_tree1)
    # create a tmp tree file (with quoted tip names)
    self.in_tree2_fp, self.in_tree2_file = stage('.nwk', in_tree2)
    # create a tmp trait file
    self.in_trait1_fp, self.in_trait1_file = stage('.tsv', in_trait1)
    # create a tmp trait file (with quoted tip names)
    self.in_trait3_fp, self.in_trait3_file = stage('.tsv', in_trait3)

    # All four files are registered; previously the second tree file and
    # the quoted-name trait file were created but never listed.
    self.files_to_remove = [
        self.in_tree1_fp,
        self.in_trait1_fp,
        self.in_tree2_fp,
        self.in_trait3_fp,
    ]
def setUp(self):
    """Stage the uclust test inputs on disk.

    Reserves four "uclust_test" temp paths (two ``.fasta``, one ``.uc``,
    one ``.clstr``).  The first three are filled with the newline-joined
    fixture lines; the ``.clstr`` path is only reserved.  All four are
    listed in ``self.files_to_remove``.
    """
    tmp_prefix = "uclust_test"

    self.tmp_unsorted_fasta_filepath = get_tmp_filename(
        prefix=tmp_prefix, suffix=".fasta")
    unsorted_out = open(self.tmp_unsorted_fasta_filepath, "w")
    unsorted_out.write('\n'.join(raw_dna_seqs))
    unsorted_out.close()

    self.tmp_sorted_fasta_filepath = get_tmp_filename(
        prefix=tmp_prefix, suffix=".fasta")
    sorted_out = open(self.tmp_sorted_fasta_filepath, "w")
    sorted_out.write('\n'.join(sorted_dna_seqs))
    sorted_out.close()

    self.tmp_uc_filepath = get_tmp_filename(
        prefix=tmp_prefix, suffix=".uc")
    uc_out = open(self.tmp_uc_filepath, "w")
    uc_out.write('\n'.join(uc_dna_clusters))
    uc_out.close()

    self.tmp_clstr_filepath = get_tmp_filename(
        prefix=tmp_prefix, suffix=".clstr")

    self.WorkingDir = '/tmp/uclust_test'
    self.tmpdir = '/tmp/'

    self.files_to_remove = [
        self.tmp_unsorted_fasta_filepath,
        self.tmp_sorted_fasta_filepath,
        self.tmp_uc_filepath,
        self.tmp_clstr_filepath,
    ]
def uclust_search_and_align_from_fasta_filepath(
        query_fasta_filepath,
        subject_fasta_filepath,
        percent_ID=0.75,
        enable_rev_strand_matching=True,
        max_accepts=8,
        max_rejects=32,
        HALT_EXEC=False):
    """ query seqs against subject fasta using uclust,

       return global pw alignment of best match

    This is a generator: it yields each item produced by
    process_uclust_pw_alignment_results for the uclust run.

    query_fasta_filepath: passed to uclust as --input
    subject_fasta_filepath: passed to uclust as --lib
    percent_ID: passed as --id
    enable_rev_strand_matching: when true, --rev is switched on
    max_accepts, max_rejects: passed as --maxaccepts / --maxrejects
    HALT_EXEC: forwarded to the Uclust application controller

    The temp files generated by the run are cleaned up when the
    generator finishes, is closed early, or terminates with an error.
    """
    # Explanation of parameter settings
    #  id - min percent id to count a match
    #  maxaccepts = 8 , searches for best match rather than first match
    #                   (0 => infinite accepts, or good matches before
    #                    quitting search)
    #  maxrejects = 32,
    #  libonly = True , does not add sequences to the library if they don't
    #                   match something there already. this effectively makes
    #                   uclust a search tool rather than a clustering tool
    params = {'--id': percent_ID,
              '--maxaccepts': max_accepts,
              '--maxrejects': max_rejects,
              '--libonly': True,
              '--lib': subject_fasta_filepath}

    if enable_rev_strand_matching:
        params['--rev'] = True

    # instantiate the application controller
    app = Uclust(params, HALT_EXEC=HALT_EXEC)

    # apply uclust
    alignment_filepath = get_tmp_filename(prefix='uclust_alignments',
                                          suffix='.fasta')
    uc_filepath = get_tmp_filename(prefix='uclust_results', suffix='.uc')
    input_data = {'--input': query_fasta_filepath,
                  '--fastapairs': alignment_filepath,
                  '--uc': uc_filepath}
    app_result = app(input_data)

    # try/finally covers normal exhaustion, early close() (GeneratorExit
    # raised at the yield) and errors raised by the parser or thrown in
    # by the consumer, so the temp files are removed in every case.
    try:
        # yield the pairwise alignments
        for result in process_uclust_pw_alignment_results(
                app_result['PairwiseAlignments'],
                app_result['ClusterFile']):
            yield result
    finally:
        # clean up the temp files that were generated
        app_result.cleanUp()
def cdhit_clusters_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT clusters given seqs

    seqs        : dict like collection of sequences
    moltype     : cogent.core.moltype object
    params      : cd-hit parameters (the caller's dict is not modified)

    Returns a list of clusters, each a list of the original sequence ids.

    Raises ValueError if moltype is not PROTEIN, RNA or DNA.

    NOTE: This method will call CD_HIT if moltype is PROTEIN,
        CD_HIT_EST if moltype is RNA/DNA, and raise if any other
        moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)
    # Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seqs.getIntMap()
    # Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)

    # Work on a private copy so the default '-o' added below does not
    # leak into the dict the caller passed in.
    params = {} if params is None else dict(params)
    # make sure the output argument is set
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit based on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA or moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError("Moltype must be either PROTEIN, RNA, or DNA")

    # grab result
    res = app(int_map.toFasta())
    clusters = parse_cdhit_clstr_file(res['CLSTR'].readlines())

    # translate the abbreviated ids in each cluster back to the full ids
    remapped_clusters = [[int_keys[i] for i in c] for c in clusters]

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return remapped_clusters
def setUp(self):
    """Build the test trees, trait dicts and temp trait files.

    Trees are parsed from Newick strings with DndParser; trait dicts map
    node names to two-element value lists.  Three trait tables are
    written to temp ``.tsv`` files, keeping both the path (``*_fp``) and
    the closed file object (``*_file``); the three paths are listed in
    ``self.files_to_remove``.
    """
    def stage_traits(content):
        # write one trait table to a fresh temp .tsv; return (path, handle)
        path = get_tmp_filename(prefix='Predict_Traits_Tests', suffix='.tsv')
        handle = open(path, 'w')
        handle.write(content)
        handle.close()
        return path, handle

    self.SimpleTree = DndParser(
        "((A:0.02,B:0.01)E:0.05,(C:0.01,D:0.01)F:0.05)root;")
    self.SimplePolytomyTree = DndParser(
        "((A:0.02,B:0.01,B_prime:0.03)E:0.05,(C:0.01,D:0.01)F:0.05)root;")
    self.SimpleTreeTraits = {
        "A": [1.0, 1.0],
        "E": [1.0, 1.0],
        "F": [0.0, 1.0],
        "D": [0.0, 0.0],
    }

    self.PartialReconstructionTree = DndParser(
        "((((B:0.01,C:0.01)I3:0.01,A:0.01)I2:0.01,D:0.01)I1:0.01)root;")
    self.CloseToI3Tree = DndParser(
        "((((B:0.01,C:0.95)I3:0.01,A:0.01)I2:0.95,D:0.05)I1:0.95)root;")
    self.CloseToI1Tree = DndParser(
        "((((B:0.95,C:0.95)I3:0.95,A:0.01)I2:0.02,D:0.05)I1:0.05)root;")
    self.BetweenI3AndI1Tree = DndParser(
        "((((B:0.01,C:0.1)I3:0.02,A:0.01)I2:0.02,D:0.05)I1:0.02)root;")

    self.PartialReconstructionTraits = {
        "B": [1.0, 1.0],
        "C": [1.0, 1.0],
        "I3": [1.0, 1.0],
        "I1": [0.0, 1.0],
        "D": [0.0, 1.0],
    }
    self.GeneCountTraits = {
        "B": [1.0, 1.0],
        "C": [1.0, 2.0],
        "I3": [1.0, 1.0],
        "I1": [0.0, 3.0],
        "D": [0.0, 5.0],
    }

    # trait file
    self.in_trait1_fp, self.in_trait1_file = stage_traits(in_trait1)
    # another trait file (with columns in different order)
    self.in_trait2_fp, self.in_trait2_file = stage_traits(in_trait2)
    # trait file with an incorrect trait name
    self.in_bad_trait_fp, self.in_bad_trait_file = stage_traits(in_bad_trait)

    self.files_to_remove = [
        self.in_trait1_fp,
        self.in_trait2_fp,
        self.in_bad_trait_fp,
    ]
def setUp(self):
    """Write the FormatDb test sequences and alignment to temp files.

    Each fixture is written to a temp ``.fasta`` file and then loaded
    back with LoadSeqs (the sequences with ``aligned=False``).  Paths,
    closed file objects and loaded collections are kept on the instance;
    both paths are listed in ``self.files_to_remove``.
    """
    def stage_fasta(content):
        # write content to a fresh "FormatDbTests" temp fasta; return (path, handle)
        path = get_tmp_filename(prefix="FormatDbTests", suffix=".fasta")
        handle = open(path, "w")
        handle.write(content)
        handle.close()
        return path, handle

    self.in_seqs1_fp, self.in_seqs1_file = stage_fasta(in_seqs1)
    self.in_seqs1 = LoadSeqs(self.in_seqs1_fp, aligned=False)

    self.test_seq = test_seq

    self.in_aln1_fp, self.in_aln1_file = stage_fasta(in_aln1)
    self.in_aln1 = LoadSeqs(self.in_aln1_fp)

    self.files_to_remove = [self.in_seqs1_fp, self.in_aln1_fp]
def setUp(self):
    """Prepare config, an sff copy, a mapping file and the per-test alarm.

    Loads the QIIME config, copies the support sff file into a fresh
    temp directory, reserves a workflow output path, writes the fasting
    mapping file, parses the workflow parameters and arms a SIGALRM
    timeout.  Everything to delete afterwards is collected in
    ``self.dirs_to_remove`` and ``self.files_to_remove``.
    """
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []
    self.files_to_remove = []

    # this is specific to the web-apps only
    here = abspath(dirname(__file__))
    sff_original_fp = os.path.join(here, 'support_files',
                                   'Fasting_subset.sff')

    # copy sff file to working directory
    self.sff_dir = tempfile.mkdtemp()
    self.dirs_to_remove.append(self.sff_dir)
    self.sff_fp = os.path.join(self.sff_dir, 'Fasting_subset.sff')
    copy(sff_original_fp, self.sff_fp)
    self.files_to_remove.append(self.sff_fp)

    tmp_dir = self.qiime_config['temp_dir']
    if not tmp_dir:
        tmp_dir = '/tmp/'
    if not exists(tmp_dir):
        makedirs(tmp_dir)
        # if test creates the temp dir, also remove it
        self.dirs_to_remove.append(tmp_dir)

    self.wf_out = get_tmp_filename(tmp_dir=tmp_dir,
                                   prefix='qiime_wf_out',
                                   suffix='',
                                   result_constructor=str)
    self.dirs_to_remove.append(self.wf_out)

    self.fasting_mapping_fp = get_tmp_filename(tmp_dir=tmp_dir,
                                               prefix='qiime_wf_mapping',
                                               suffix='.txt')
    mapping_out = open(self.fasting_mapping_fp, 'w')
    mapping_out.write(fasting_map)
    mapping_out.close()
    self.files_to_remove.append(self.fasting_mapping_fp)

    working_dir = self.qiime_config['working_dir']
    if not working_dir:
        working_dir = './'
    jobs_dir = join(working_dir, 'jobs')
    if not exists(jobs_dir):
        # only clean up the jobs dir if it doesn't already exist
        self.dirs_to_remove.append(jobs_dir)

    self.params = parse_qiime_parameters(qiime_parameters_f.split('\n'))

    # set the 'alarm' to go off in allowed_seconds seconds
    signal.signal(signal.SIGALRM, timeout)
    signal.alarm(allowed_seconds_per_test)
def check_chimera(refseqs, target_id, target_seq):
    """Check if target is a chimera

    refseqs    : something like a dict {id:seq}; must not already
                 contain target_id
    target_id  : target sequence id, string
    target_seq : the actual target sequence

    Expects the refseqs and target seq to both be aligned against the
    same reference.  Returns whatever parse_bel3_result produces from
    the Bel3 'B3out' output.
    """
    assert target_id not in refseqs

    # leave the caller's collection untouched; add the target to a copy
    all_seqs = refseqs.copy()
    all_seqs[target_id] = target_seq

    bel3_params = {}
    bel3_params['-o'] = get_tmp_filename()
    bel3_params['-w'] = 400
    bel3_params['-t'] = target_id
    bel3_params['-f'] = 'full'
    bel3_params['-c'] = 'Huber-Hugenholtz'

    bel3 = Bel3(InputHandler='_input_as_seqs',
                params=bel3_params,
                HALT_EXEC=False)
    bel3_result = bel3(all_seqs)

    how_chimeric = parse_bel3_result(bel3_result['B3out'])
    bel3_result.cleanUp()
    return how_chimeric
def test_create_bwa_index_from_fasta_file(self):
    """Test create_bwa_index_from_fasta_file

    Makes sure that the file paths are as expected.
    """
    # get a new temp file for the input fasta
    fasta_in = get_tmp_filename(suffix=".fna")

    # register the fasta for removal upon tearDown before writing it, so
    # a failed write cannot leave it behind
    self.files_to_remove.append(fasta_in)

    # write the test fasta (see end of this file) to the temp file
    with open(fasta_in, 'w') as fasta:
        fasta.write(test_fasta)

    # run the function
    results = create_bwa_index_from_fasta_file(fasta_in, {})

    # for each of the 5 output files (not counting stdout, stderr, and
    # the exitStatus), make sure the file paths are as expected.
    for filetype, result in results.items():
        # Plain inequality: the old ``not in ('ExitStatus')`` compared
        # against a string, i.e. performed a substring test.
        if filetype != 'ExitStatus':
            # be sure to remove these files
            self.files_to_remove.append(result.name)

        if filetype not in ('StdOut', 'ExitStatus', 'StdErr'):
            self.assertEqual(fasta_in + filetype, result.name)
def setUp(self):
    """Write the muscle test sequences to a temp fasta file.

    The path is stored in ``self.input_fp`` and registered in
    ``self._paths_to_clean_up`` before the file is written.
    """
    self.input_fp = get_tmp_filename(
        prefix='CogentAlignerTests_', suffix='.fasta')
    self._paths_to_clean_up = [self.input_fp]

    # the context manager closes (and flushes) the handle deterministically
    with open(self.input_fp, 'w') as input_f:
        input_f.write(seqs_for_muscle)
def setUp(self):
    """Stage the aligner input: ``seqs_for_muscle`` in a temp fasta file.

    ``self.input_fp`` holds the path and ``self._paths_to_clean_up``
    lists it; the path is registered before anything is written.
    """
    self.input_fp = get_tmp_filename(
        prefix='CogentAlignerTests_', suffix='.fasta')
    self._paths_to_clean_up = [self.input_fp]

    # Explicitly managed handle: the data is on disk and the descriptor
    # released as soon as the block exits.
    with open(self.input_fp, 'w') as fasta_out:
        fasta_out.write(seqs_for_muscle)
def build_blast_db_from_fasta_file(fasta_file, is_protein=False,
                                   output_dir=None, HALT_EXEC=False):
    """Build blast db from fasta lines; return db name and list of files created

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        fasta_file: iterable of fasta lines (e.g. an open file); each line
         is stripped and copied to a temporary fasta file
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where the temporary fasta copy is written
         (default: '.'); build_blast_db_from_fasta_path is then called
         with output_dir=None
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        The temporary fasta path is appended to the returned file list.
    """
    if not output_dir:
        output_dir = '.'

    fasta_path = get_tmp_filename(tmp_dir=output_dir,
                                  prefix="BLAST_temp_db_",
                                  suffix=".fasta")

    # copy the input to disk, one stripped record per line
    tmp_fasta = open(fasta_path, 'w')
    tmp_fasta.writelines('%s\n' % record.strip() for record in fasta_file)
    tmp_fasta.close()

    built = build_blast_db_from_fasta_path(fasta_path,
                                           is_protein=is_protein,
                                           output_dir=None,
                                           HALT_EXEC=HALT_EXEC)
    blast_db, db_filepaths = built

    db_filepaths.append(fasta_path)
    return blast_db, db_filepaths
def build_blast_db_from_seqs(seqs, is_protein=False,
                             output_dir='./', HALT_EXEC=False):
    """Build blast db from seqs; return db name and list of files created

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        seqs: sequence collection or alignment object
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where output should be written
         (default: current directory)
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        The sequences are written to a temporary fasta file, which is
        removed after the build.  It is also removed when the build
        fails, unless HALT_EXEC is set, in which case it is kept so the
        printed command can still be run by hand.
    """
    # Serialise before touching the filesystem so a failing toFasta()
    # cannot leave an empty temp file behind.
    fasta_text = seqs.toFasta()

    # Build a temp filepath and write the sequence collection to it
    tmp_fasta_filepath = get_tmp_filename(prefix='Blast_tmp_db',
                                          suffix='.fasta')
    with open(tmp_fasta_filepath, 'w') as tmp_fasta_file:
        tmp_fasta_file.write(fasta_text)

    # build the blast database
    try:
        db_name, db_filepaths = build_blast_db_from_fasta_path(
            tmp_fasta_filepath, is_protein=is_protein,
            output_dir=output_dir, HALT_EXEC=HALT_EXEC)
    except Exception:
        if not HALT_EXEC:
            remove(tmp_fasta_filepath)
        raise

    # clean-up the temporary file
    remove(tmp_fasta_filepath)

    # return the results
    return db_name, db_filepaths
def build_blast_db_from_seqs(seqs, is_protein=False,
                             output_dir='./', HALT_EXEC=False):
    """Build blast db from seqs; return db name and list of files created

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        seqs: sequence collection or alignment object
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory where output should be written
         (default: current directory)
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging

        A temporary fasta file holds the sequences during the build.  It
        is deleted afterwards, and also when the build raises, except
        under HALT_EXEC, where it is left in place for debugging.
    """
    # Render the fasta text first: if this fails no temp file exists yet.
    fasta_text = seqs.toFasta()

    tmp_fasta_filepath = get_tmp_filename(prefix='Blast_tmp_db',
                                          suffix='.fasta')
    with open(tmp_fasta_filepath, 'w') as tmp_fasta_file:
        tmp_fasta_file.write(fasta_text)

    build_succeeded = False
    try:
        # build the blast database
        db_name, db_filepaths = build_blast_db_from_fasta_path(
            tmp_fasta_filepath, is_protein=is_protein,
            output_dir=output_dir, HALT_EXEC=HALT_EXEC)
        build_succeeded = True
    finally:
        # Always delete after a successful build; after a failure delete
        # unless the caller asked to halt and inspect the command.
        if build_succeeded or not HALT_EXEC:
            remove(tmp_fasta_filepath)

    return db_name, db_filepaths
def make_torque_jobs(commands, job_prefix, queue, jobs_dir="jobs/",
                     walltime="72:00:00", ncpus=1, nodes=1, keep_output="oe"):
    """prepare qsub text files.

    commands: list of commands; one job file is written per command

    job_prefix: a short, descriptive name for the job.

    queue: name of the queue to submit to

    jobs_dir: path to directory where job submission scripts are written

    walltime: the maximal walltime

    ncpus: number of cpus

    nodes: number of nodes

    keep_output: keep standard error, standard out, both, or neither
                 o=std out, e=std err, oe=both, n=neither

    Returns the list of job file paths, in command order.
    """
    def write_job_file(command):
        # fill the QSUB_TEXT template for one command and save it
        job_fp = get_tmp_filename(tmp_dir=jobs_dir,
                                  prefix=job_prefix + "_",
                                  suffix=".txt")
        job_text = QSUB_TEXT % (walltime, ncpus, nodes, queue,
                                job_prefix, keep_output, command)
        job_f = open(job_fp, "w")
        job_f.write(job_text)
        job_f.close()
        return job_fp

    create_dir(jobs_dir)
    return [write_job_file(command) for command in commands]
def test_call_output_to_file(self):
    """BlastTaxonAssigner.__call__ functions w output to file
    """
    result_path = get_tmp_filename(prefix='BlastTaxonAssignerTests_',
                                   suffix='.fasta')
    self._paths_to_clean_up.append(result_path)

    assigner = BlastTaxonAssigner({
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    })
    actual = assigner(self.input_seqs_fp, result_path=result_path)

    # one tab-separated, newline-terminated line per query sequence
    expected_fields = [
        ('s1', 'Archaea;Euryarchaeota;Halobacteriales;uncultured',
         '0.0', 'AY800210'),
        ('s2', 'Archaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.',
         '0.0', 'EU883771'),
        ('s3', 'Archaea;Crenarchaeota;uncultured;uncultured',
         '0.0', 'EF503699'),
        ('s4', 'Archaea;Euryarchaeota;Methanobacteriales;Methanobacterium',
         '0.0', 'DQ260310'),
        ('s5', 'Archaea;Crenarchaeota;uncultured;uncultured',
         '0.0', 'EF503697'),
        ('s6', 'No blast hit', 'None', 'None'),
    ]
    expected_lines = set(['\t'.join(fields) + '\n'
                          for fields in expected_fields])

    result_f = open(result_path)
    observed_lines = set(result_f.readlines())
    result_f.close()

    # line order is not significant, hence the set comparison
    self.assertEqual(observed_lines, expected_lines)

    # Return value is None when result_path is provided (Not sure
    # if this is what we want yet, or if we would want both so
    # results could be logged to file...)
    self.assertEqual(actual, None)
def build_blast_db_from_fasta_file(fasta_file, is_protein=False,
                                   output_dir=None, HALT_EXEC=False):
    """Build blast db from an iterable of fasta lines

        Returns the db name and the list of files created; the temporary
        fasta copy made here is included in that list.

        **If using to create temporary blast databases, you can call
        cogent.util.misc.remove_files(db_filepaths) to clean up all the
        files created by formatdb when you're done with the database.

        fasta_file: iterable yielding fasta lines, such as an open file;
         lines are stripped and rewritten to a temporary fasta file
        is_protein: True if working on protein seqs (default: False)
        output_dir: directory that receives the temporary fasta file
         (default: '.'); the db builder itself is invoked with
         output_dir=None
        HALT_EXEC: halt just before running the formatdb command and
         print the command -- useful for debugging
    """
    target_dir = output_dir or '.'
    fasta_path = get_tmp_filename(tmp_dir=target_dir,
                                  prefix="BLAST_temp_db_",
                                  suffix=".fasta")

    normalized = ['%s\n' % line.strip() for line in fasta_file]
    fasta_out = open(fasta_path, 'w')
    for record in normalized:
        fasta_out.write(record)
    fasta_out.close()

    blast_db, db_filepaths = build_blast_db_from_fasta_path(
        fasta_path,
        is_protein=is_protein,
        output_dir=None,
        HALT_EXEC=HALT_EXEC)

    # report the temp fasta too, so callers can remove it with the db files
    db_filepaths.append(fasta_path)
    return blast_db, db_filepaths
def test_assign_taxonomy_file_output(self):
    """ assign_taxonomy wrapper writes correct file output when requested

        This function tests for successful completion of assign_taxonomy
         when writing to file, that the lines in the file roughly look
         correct by verifying how many are written, and that each line
         starts with the correct seq id. Actual testing of taxonomy data
         is performed elsewhere.

    """
    output_fp = get_tmp_filename(
        prefix='RDPAssignTaxonomyTests', suffix='.txt')

    # convert the expected dict to a sorted list of seq ids to match
    # the sorted file output
    expected_file_headers = sorted(self.expected_assignments1.keys())

    actual_return_value = assign_taxonomy(
        self.test_input1, min_confidence=0.95, output_fp=output_fp)

    with open(output_fp) as output_f:
        actual_file_output = sorted(output_f)

    # remove the output_fp before running the tests, so if they
    # fail the output file is still cleaned-up
    remove(output_fp)

    # None return value on write to file
    self.assertEqual(actual_return_value, None)

    # zip() below silently stops at the shorter sequence, so check the
    # number of lines explicitly; otherwise empty output would pass.
    self.assertEqual(len(actual_file_output), len(expected_file_headers))

    # check that each line starts with the correct seq_id -- not
    # checking the taxonomies or confidences here as these are variable and
    # tested elsewhere
    for a, e in zip(actual_file_output, expected_file_headers):
        self.assertTrue(a.startswith(e))
def dotur_from_file(distance_matrix_file_path, params=None):
    """Returns dotur results given a distance matrix file.

        - distance_matrix_file_path: Path to distance matrix file.  This file
             must be a PHYLIP formatted square distance matrix.  This format
             is available in cogent.format.table.
             - IMPORTANT NOTE:  This distance matrix format allows only 10
                characters for the row labels in the distance matrix.  Also,
                the IDs must be unique and ungapped to be useful when using
                dotur.
        - params: forwarded to the Dotur application controller
        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.

        The temporary working directory is removed afterwards, including
        when the run or the parsing fails.
    """
    # Read out the data from the distance_matrix_file_path.
    # This is important so we can run dotur in a temp directory and avoid
    # having to handle all 23 output files.
    with open(distance_matrix_file_path, 'U') as matrix_f:
        d_matrix_string = matrix_f.read()

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',
                WorkingDir=working_dir, params=params)

    try:
        res = app(d_matrix_string)
        otu_list = OtuListParser(res['List'].readlines())
    except Exception:
        # Best-effort removal on failure: the directory may be missing
        # or partial, and the original error must not be masked.
        shutil.rmtree(app.WorkingDir, ignore_errors=True)
        raise

    shutil.rmtree(app.WorkingDir)
    return otu_list
def setUp(self):
    """Load the query alignment and write reference fasta/tree files.

    A temp name under ``/tmp`` is generated; its extension is swapped
    for ``.fasta`` and ``.tree`` to name the reference sequence and
    starting tree files, which are written from REF_SEQS and REF_TREE
    and recorded in ``self._paths_to_clean_up``.
    """
    def write_text(path, text):
        out = open(path, 'w')
        out.write(text)
        out.close()

    # things the tests want removed afterwards
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # load query seqs
    self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

    # generate temp filename
    tmp_dir = '/tmp'
    self.outfile = get_tmp_filename(tmp_dir)
    stem = splitext(self.outfile)[0]

    # reference sequence file
    self.outfasta = stem + '.fasta'
    write_text(self.outfasta, REF_SEQS)
    self._paths_to_clean_up.append(self.outfasta)

    # starting tree file
    self.outtree = stem + '.tree'
    write_text(self.outtree, REF_TREE)
    self._paths_to_clean_up.append(self.outtree)
def R_format_otu_table(otu_filepath, output_dir=None, write_to_tmp_file=True):
    """Formats OTU table for R (remove comments & column 1 header)

    otu_filepath: path of the OTU table to parse
    output_dir: when given, the formatted table is written to
        'otus_R_format.txt' inside it; otherwise a temp file is used
    write_to_tmp_file: if True, write the formatted table and return its
        path; otherwise return the formatted lines

    Returns the output file path, or the list of lines.
    """
    with open(otu_filepath, 'U') as otu_f:
        sample_ids, otu_ids, otu_matrix, lineages = \
            parse_otu_table(otu_f.readlines())

    # first line is sample ids, no header for first column (how R likes it)
    lines = ['\t'.join(sample_ids)]
    for i, otu_id in enumerate(otu_ids):
        # note: casting array as a string and calling "split" is much faster
        # than mapping "str" onto the array
        # NOTE(review): assumes otu_matrix is a numpy array; numpy shortens
        # the str() of long arrays with '...', so very wide tables may need
        # a different conversion -- confirm.
        array_as_strings = str(otu_matrix[i, :])[1:-1].split()
        lines.append(otu_id + '\t' + '\t'.join(array_as_strings))

    if not write_to_tmp_file:
        return lines

    if output_dir is None:
        tmp_fp = get_tmp_filename(prefix='otus_R_format', suffix='.txt')
    else:
        tmp_fp = join(output_dir, 'otus_R_format.txt')

    with open(tmp_fp, 'w') as fout:
        fout.write('\n'.join(lines))
    return tmp_fp
def test_single_file_upgma(self):
    """ single_file_upgma should throw no errors"""
    titles = ['hi', 'ho']
    distdata = numpy.array([[0, .5], [.5, 0.]])

    # register the input path before writing so it is cleaned up even
    # if the write fails
    fname = get_tmp_filename(prefix='upgma_', suffix='.txt')
    self._paths_to_clean_up.append(fname)
    with open(fname, 'w') as f:
        f.write(format_distance_matrix(titles, distdata))

    fname2 = get_tmp_filename(prefix='upgma_', suffix='.txt',
                              result_constructor=str)
    self._paths_to_clean_up.append(fname2)

    single_file_upgma(fname, fname2)

    # assertTrue rather than a bare assert: bare asserts are stripped
    # when Python runs with -O
    self.assertTrue(os.path.exists(fname2))
def setUp(self):
    """Prepare config, scratch directories, parameters and the test timeout."""
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []
    self.files_to_remove = []

    # this is specific to the web-apps only
    here = abspath(dirname(__file__))
    self.fna_original_fp = os.path.join(here, 'support_files', 'test.fna')

    # scratch directory: fall back to /tmp/ when the config has none
    scratch_dir = self.qiime_config['temp_dir'] or '/tmp/'
    if not exists(scratch_dir):
        makedirs(scratch_dir)
        # the test created it, so the test is responsible for removing it
        self.dirs_to_remove.append(scratch_dir)

    # workflow output directory inside the scratch directory
    self.wf_out = get_tmp_filename(tmp_dir=scratch_dir,
                                   prefix='qiime_wf_out',
                                   suffix='',
                                   result_constructor=str)
    if not exists(self.wf_out):
        makedirs(self.wf_out)
        self.dirs_to_remove.append(self.wf_out)

    # the jobs directory is only scheduled for removal if it is not
    # already present before the test runs
    jobs_root = self.qiime_config['working_dir'] or './'
    jobs_dir = join(jobs_root, 'jobs')
    if not exists(jobs_dir):
        self.dirs_to_remove.append(jobs_dir)

    self.params = parse_qiime_parameters(qiime_parameters_f.split('\n'))

    # arm the alarm so the timeout handler fires after
    # allowed_seconds_per_test seconds
    signal.signal(signal.SIGALRM, timeout)
    signal.alarm(allowed_seconds_per_test)
def uclust_cluster_from_sorted_fasta_filepath(
        fasta_filepath,
        uc_save_filepath=None,
        percent_ID=0.97,
        max_accepts=1,
        max_rejects=8,
        optimal=False,
        exact=False,
        suppress_sort=False,
        enable_rev_strand_matching=False,
        subject_fasta_filepath=None,
        suppress_new_clusters=False,
        stable_sort=False,
        HALT_EXEC=False):
    """Run uclust on a sorted fasta file and return the app result.

    fasta_filepath: passed to uclust as --input.
    uc_save_filepath: passed as --uc; a temp filename is generated when
     this is not supplied.
    percent_ID, max_accepts, max_rejects: values for --id, --maxaccepts
     and --maxrejects.
    The remaining arguments each switch on one uclust parameter when truthy
    (see the table in the body); subject_fasta_filepath is also the value
    given to --lib.
    HALT_EXEC: forwarded to the Uclust app controller.
    """
    if uc_save_filepath:
        output_filepath = uc_save_filepath
    else:
        output_filepath = get_tmp_filename(prefix='uclust_clusters',
                                           suffix='.uc')

    app = Uclust({'--id': percent_ID,
                  '--maxaccepts': max_accepts,
                  '--maxrejects': max_rejects},
                 HALT_EXEC=HALT_EXEC)

    # (requested?, uclust parameter, arguments for .on()) in activation order
    optional_switches = (
        (enable_rev_strand_matching, '--rev', ()),
        (optimal, '--optimal', ()),
        (exact, '--exact', ()),
        (suppress_sort, '--usersort', ()),
        (subject_fasta_filepath, '--lib', (subject_fasta_filepath,)),
        (suppress_new_clusters, '--libonly', ()),
        (stable_sort, '--stable_sort', ()),
    )
    for requested, flag, on_args in optional_switches:
        if requested:
            app.Parameters[flag].on(*on_args)

    return app({'--input': fasta_filepath, '--uc': output_filepath})
def dotur_from_file(distance_matrix_file_path, params=None):
    """Returns dotur results given a distance matrix file.

        - distance_matrix_file_path:  Path to distance matrix file.  This file
             must be a PHYLIP formatted square distance matrix.  This format
             is available in cogent.format.table.
             - IMPORTANT NOTE:  This distance matrix format allows only 10
                characters for the row labels in the distance matrix.  Also,
                the IDs must be unique and ungapped to be useful when using
                dotur.
        - params: forwarded unchanged to the Dotur app controller.
        - NOTE:  This function will only return the parsed *.list file, as
            it contains the OTU identities.
            Dotur generates 23 output files, so if this is not the one you
            are looking for, check out the documentation and add the others
            to the result path.
    """
    # Read out the data from the distance_matrix_file_path.
    # This is important so we can run dotur in a temp directory and avoid
    # having to handle all 23 output files.
    # The context manager closes the handle as soon as the data is read.
    with open(distance_matrix_file_path, 'U') as distance_matrix_file:
        d_matrix_string = distance_matrix_file.read()

    working_dir = get_tmp_filename(suffix='')
    app = Dotur(InputHandler='_input_as_multiline_string',
                WorkingDir=working_dir, params=params)

    res = app(d_matrix_string)

    otu_list = OtuListParser(res['List'].readlines())

    # discard the working directory together with the remaining output files
    shutil.rmtree(app.WorkingDir)

    return otu_list
def setUp(self):
    """Build the query alignment and write the reference FASTA and tree files.

    The two files are named after a generated temp filename (same stem,
    '.fasta' and '.tree' extensions) and are added to
    ``self._paths_to_clean_up``.
    """
    # create a list of files to cleanup
    self._paths_to_clean_up = []
    self._dirs_to_clean_up = []

    # load query seqs
    self.seqs = Alignment(MinimalFastaParser(QUERY_SEQS.split()))

    # generate temp filename
    tmp_dir = '/tmp'
    self.outfile = get_tmp_filename(tmp_dir)

    # create and write out reference sequence file
    # (with-block guarantees the handle is closed if the write raises)
    self.outfasta = splitext(self.outfile)[0] + '.fasta'
    with open(self.outfasta, 'w') as fastaout:
        fastaout.write(REF_SEQS)
    self._paths_to_clean_up.append(self.outfasta)

    # create and write out starting tree file
    self.outtree = splitext(self.outfile)[0] + '.tree'
    with open(self.outtree, 'w') as treeout:
        treeout.write(REF_TREE)
    self._paths_to_clean_up.append(self.outtree)
def test_assign_taxonomy_file_output(self):
    """ assign_taxonomy wrapper writes correct file output when requested

        This function tests for successful completion of assign_taxonomy
         when writing to file, that the lines in the file roughly look
         correct by verifying how many are written, and that each line
         starts with the correct seq id. Actual testing of taxonomy data
         is performed elsewhere.

    """
    output_fp = get_tmp_filename(
        prefix='RDPAssignTaxonomyTests', suffix='.txt')
    # convert the expected dict to a sorted list of seq ids to match
    # file output
    expected_file_headers = sorted(self.expected_assignments1.keys())

    actual_return_value = assign_taxonomy(
        self.test_input1, min_confidence=0.95, output_fp=output_fp)

    with open(output_fp) as output_f:
        actual_file_output = sorted(output_f)

    # remove the output_fp before running the tests, so if they
    # fail the output file is still cleaned-up
    remove(output_fp)

    # None return value on write to file
    self.assertEqual(actual_return_value, None)

    # zip() stops at the shorter sequence, so the number of lines has to
    # be compared explicitly for a missing or extra line to be noticed
    self.assertEqual(len(actual_file_output), len(expected_file_headers))

    # check that each line starts with the correct seq_id -- not
    #  checking the taxonomies or confidences here as these are variable and
    #  tested elsewhere
    for a, e in zip(actual_file_output, expected_file_headers):
        self.assertTrue(a.startswith(e))
def test_call_output_to_file(self):
    """BlastTaxonAssigner.__call__ writes assignments to result_path"""
    result_path = get_tmp_filename(
        prefix='BlastTaxonAssignerTests_', suffix='.fasta')
    self._paths_to_clean_up.append(result_path)

    assigner_params = {
        'reference_seqs_filepath': self.reference_seqs_fp,
        'id_to_taxonomy_filepath': self.id_to_taxonomy_fp,
    }
    assigner = BlastTaxonAssigner(assigner_params)
    return_value = assigner(self.input_seqs_fp, result_path=result_path)

    # line order in the file is not checked, hence the set comparison
    result_file = open(result_path)
    written = set(result_file.readlines())
    result_file.close()

    wanted = set([
        's1\tArchaea;Euryarchaeota;Halobacteriales;uncultured\t0.0\tAY800210\n',
        's2\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel.\t0.0\tEU883771\n',
        's3\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503699\n',
        's4\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium\t0.0\tDQ260310\n',
        's5\tArchaea;Crenarchaeota;uncultured;uncultured\t0.0\tEF503697\n',
        's6\tNo blast hit\tNone\tNone\n',
    ])
    self.assertEqual(written, wanted)

    # Nothing is returned when a result_path is given (the original author
    # was unsure whether results should be returned as well in that case)
    self.assertEqual(return_value, None)
def setUp(self):
    """Write the PyNAST input/template fixtures and build the test aligner.

    Creates the input fasta, four template variants (original, '-' replaced
    by '.', 'T' replaced by 'U', lower-cased), and empty result/failure/log
    files; all eight paths are stored in ``self._paths_to_clean_up``.
    """
    def write_tmp_file(suffix, contents=''):
        # Create a temp file holding `contents` and return its path. The
        # with-block closes (and therefore flushes) the handle right away
        # instead of leaving that to garbage collection.
        fp = get_tmp_filename(prefix='PyNastAlignerTests_', suffix=suffix)
        with open(fp, 'w') as f:
            f.write(contents)
        return fp

    self.pynast_test1_input_fp = write_tmp_file(
        '.fasta', pynast_test1_input_fasta)

    self.pynast_test1_template_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta)

    self.pynast_test_template_w_dots_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))

    self.pynast_test_template_w_u_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))

    self.pynast_test_template_w_lower_fp = write_tmp_file(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = write_tmp_file('.fasta')
    self.failure_fp = write_tmp_file('.fasta')
    self.log_fp = write_tmp_file('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp
    ]

    self.pynast_test1_aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    })

    self.pynast_test1_expected_aln = \
        LoadSeqs(data=pynast_test1_expected_alignment, aligned=DenseAlignment)
    self.pynast_test1_expected_fail = \
        LoadSeqs(data=pynast_test1_expected_failure, aligned=False)
def setUp(self):
    """Write the sequence and alignment fixtures to temp files and load them."""
    # unaligned input sequences
    seqs_fp = get_tmp_filename(prefix='FormatDbTests', suffix='.fasta')
    seqs_handle = open(seqs_fp, 'w')
    seqs_handle.write(in_seqs1)
    seqs_handle.close()
    self.in_seqs1_fp = seqs_fp
    self.in_seqs1_file = seqs_handle  # kept on the instance, already closed
    self.in_seqs1 = LoadSeqs(seqs_fp, aligned=False)

    self.test_seq = test_seq

    # input alignment
    aln_fp = get_tmp_filename(prefix='FormatDbTests', suffix='.fasta')
    aln_handle = open(aln_fp, 'w')
    aln_handle.write(in_aln1)
    aln_handle.close()
    self.in_aln1_fp = aln_fp
    self.in_aln1_file = aln_handle  # kept on the instance, already closed
    self.in_aln1 = LoadSeqs(aln_fp)

    self.files_to_remove = [seqs_fp, aln_fp]
def setUp(self):
    """Create the temp fasta and OTU input files and the picker parameters."""
    tmp_prefix = 'GenericRepSetPickerTest_'

    # sequences
    seq_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.fasta')
    self.tmp_seq_filepath = seq_fp
    seq_handle = open(seq_fp, 'w')
    seq_handle.write(dna_seqs)
    seq_handle.close()

    # OTU map
    otu_fp = get_tmp_filename(prefix=tmp_prefix, suffix='.otu')
    self.tmp_otu_filepath = otu_fp
    otu_handle = open(otu_fp, 'w')
    otu_handle.write(otus)
    otu_handle.close()

    self.params = {'Algorithm': 'first', 'ChoiceF': first_id}
def setUp(self):
    """Stage the SFF/Illumina/FASTA fixtures and the workflow directories.

    Working files and directories are placed under the user's HOME and are
    recorded in ``self.files_to_remove`` / ``self.dirs_to_remove``.
    """
    self.qiime_config = load_qiime_config()
    self.dirs_to_remove = []
    self.files_to_remove = []

    home = environ['HOME']

    # this is specific to the web-apps only
    here = abspath(dirname(__file__))

    # the SFF fixture is copied into HOME and removed afterwards
    sff_source_fp = os.path.join(here, 'support_files', 'Fasting_subset.sff')
    self.sff_fp = os.path.join('/%s/' % home, 'Fasting_subset.sff')
    self.files_to_remove.append(self.sff_fp)
    copy(sff_source_fp, self.sff_fp)

    # fixtures that are used in place
    self.illumina_fps = [
        os.path.join(here, 'support_files', 's_8_1_sequence_100_records.txt'),
        os.path.join(here, 'support_files', 's_8_2_sequence_100_records.txt'),
    ]
    self.illumina_map_fp = os.path.join(here, 'support_files',
                                        's8_map_incomplete.txt')
    self.fasta_fps = [os.path.join(here, 'support_files',
                                   'test_split_lib_seqs.fasta')]
    self.fasta_map_fp = os.path.join(here, 'support_files',
                                     'fasta_mapping_file.txt')

    # scratch directory; always scheduled for removal
    scratch_dir = "/%s/test_wf" % home
    self.dirs_to_remove.append(scratch_dir)
    if not exists(scratch_dir):
        makedirs(scratch_dir)

    # workflow output directory and its gg_97_otus subdirectory
    self.wf_out = "/%s/test_processed_data" % home
    self.dirs_to_remove.append(self.wf_out)
    self.gg_out = os.path.join(self.wf_out, 'gg_97_otus')
    if not exists(self.gg_out):
        makedirs(self.gg_out)

    # mapping file written into the scratch directory
    mapping_fp = get_tmp_filename(tmp_dir=scratch_dir,
                                  prefix='qiime_wf_mapping',
                                  suffix='.txt')
    self.fasting_mapping_fp = mapping_fp
    mapping_handle = open(mapping_fp, 'w')
    mapping_handle.write(fasting_map)
    mapping_handle.close()
    self.files_to_remove.append(mapping_fp)

    self.params = parse_qiime_parameters(qiime_parameters_f)

    # arm the alarm so the timeout handler fires after
    # allowed_seconds_per_test seconds
    signal.signal(signal.SIGALRM, timeout)
    signal.alarm(allowed_seconds_per_test)
def cluster_seqs(seqs,
                 neighbor_join=False,
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    neighbor_join: currently unused (cluster-type selection is disabled).
    params: dict of additional muscle parameters. It is copied, so neither
        the caller's dict nor a shared default is modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: forwarded to
        muscle_seqs.
    max_chars: when the summed length of the items in seqs exceeds this,
        the faster settings (-maxiters 2, -diags1, -sv) are used.
    max_hours: value for -maxhours.
    constructor: node class handed to DndParser.
    clean_up: when True, muscle's result files are cleaned up.

    Returns the tree parsed from muscle's Tree1Out.
    Raises ValueError if fewer than 2 sequences are given.
    """
    # Work on a private copy: the flags set below must not leak into later
    # calls (mutable default) or into the dict owned by the caller.
    params = dict(params) if params else {}

    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError("Muscle requres 2 or more sequences to cluster.")

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        print("lots of chars, using fast align", num_chars)

    params["-maxhours"] = max_hours

    params["-cluster"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)

    tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)

    if clean_up:
        muscle_res.cleanUp()
    return tree
def cluster_seqs(seqs,
                 neighbor_join=False,
                 params=None,
                 add_seq_names=True,
                 WorkingDir=None,
                 SuppressStderr=None,
                 SuppressStdout=None,
                 max_chars=1000000,
                 max_hours=1.0,
                 constructor=PhyloNode,
                 clean_up=True
                 ):
    """Muscle cluster list of sequences.

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    neighbor_join: currently unused (cluster-type selection is disabled).
    params: dict of additional muscle parameters. It is copied, so neither
        the caller's dict nor a shared default is modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: forwarded to
        muscle_seqs.
    max_chars: when the summed length of the items in seqs exceeds this,
        the faster settings (-maxiters 2, -diags1, -sv) are used.
    max_hours: value for -maxhours.
    constructor: node class handed to DndParser.
    clean_up: when True, muscle's result files are cleaned up.

    Returns the tree parsed from muscle's Tree1Out.
    Raises ValueError if fewer than 2 sequences are given.
    """
    # Work on a private copy: the flags set below must not leak into later
    # calls (mutable default) or into the dict owned by the caller.
    params = dict(params) if params else {}

    num_seqs = len(seqs)
    if num_seqs < 2:
        raise ValueError("Muscle requres 2 or more sequences to cluster.")

    num_chars = sum(map(len, seqs))
    if num_chars > max_chars:
        params["-maxiters"] = 2
        params["-diags1"] = True
        params["-sv"] = True
        # single pre-formatted argument: prints the same text whether print
        # is a statement or a function
        print("lots of chars, using fast align %s" % num_chars)

    params["-maxhours"] = max_hours

    params["-clusteronly"] = True
    params["-tree1"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)

    tree = DndParser(muscle_res["Tree1Out"], constructor=constructor)

    if clean_up:
        muscle_res.cleanUp()
    return tree
def test_format_blast_db_string_file(self):
    """format_blast_db_string returns the path itself for a fasta file"""
    fasta_fp = get_tmp_filename(tmp_dir=self.tmp_dir)
    copyfile(self.refseqs_fp, fasta_fp)
    # NOTE(review): this rebinds the cleanup list instead of appending to it,
    # so any paths registered earlier would be dropped -- confirm intended.
    self._paths_to_clean_up = [fasta_fp]

    self.assertEqual(format_blast_db_string(fasta_fp), fasta_fp)
def test_insert_sequences_into_tree(self):
    """Inserts sequences into Tree using params - test handles tree-insertion"""
    # generate temp filename for output
    outfname = splitext(get_tmp_filename('/tmp/'))[0]

    # create starting tree
    outtreefname = outfname + '.tre'
    with open(outtreefname, 'w') as outtree:
        outtree.write(REF_TREE)

    # the starting tree is removed even when an assertion below fails
    try:
        # set params for tree-insertion
        params = {
            "-w": "/tmp/",
            "-n": get_tmp_filename().split("/")[-1],
            "-f": 'v',
            "-t": outtreefname,
            "-m": 'GTRGAMMA',
        }

        aln_ref_query = get_align_for_phylip(
            StringIO(PHYLIP_FILE_DNA_REF_QUERY))
        aln = Alignment(aln_ref_query)
        seqs, align_map = aln.toPhylip()

        tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                          write_log=False)

        # strip the 'QUERY___' prefix and '___<digits>' suffix from tip
        # names, then map the phylip names back through align_map
        for node in tree.tips():
            removed_query_str = re.sub('QUERY___', '', str(node.Name))
            new_node_name = re.sub(r'___\d+', '', removed_query_str)
            if new_node_name in align_map:
                node.Name = align_map[new_node_name]

        self.assertTrue(isinstance(tree, PhyloNode))
        self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
        self.assertEqual(len(tree.tips()), 7)
        self.assertRaises(NotImplementedError, build_tree_from_alignment,
                          self.align1, RNA, True)
    finally:
        remove(outtreefname)
def setUp(self):
    """Create the PyNAST fixture files and the aligner under test.

    Writes the input fasta and four template variants (original, '-'
    replaced by '.', 'T' replaced by 'U', lower-cased), touches the
    result/failure/log files, and stores all eight paths in
    ``self._paths_to_clean_up``.
    """
    def make_tmp_file(suffix, contents=''):
        # Write `contents` to a new temp file and return its path. Using a
        # with-block closes and flushes the handle immediately rather than
        # whenever the file object happens to be garbage collected.
        fp = get_tmp_filename(prefix='PyNastAlignerTests_', suffix=suffix)
        with open(fp, 'w') as f:
            f.write(contents)
        return fp

    template = pynast_test1_template_fasta

    self.pynast_test1_input_fp = make_tmp_file(
        '.fasta', pynast_test1_input_fasta)
    self.pynast_test1_template_fp = make_tmp_file(
        'template.fasta', template)
    self.pynast_test_template_w_dots_fp = make_tmp_file(
        'template.fasta', template.replace('-', '.'))
    self.pynast_test_template_w_u_fp = make_tmp_file(
        'template.fasta', template.replace('T', 'U'))
    self.pynast_test_template_w_lower_fp = make_tmp_file(
        'template.fasta', template.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = make_tmp_file('.fasta')
    self.failure_fp = make_tmp_file('.fasta')
    self.log_fp = make_tmp_file('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp
    ]

    self.pynast_test1_aligner = PyNastAligner({
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    })

    self.pynast_test1_expected_aln = \
        LoadSeqs(data=pynast_test1_expected_alignment, aligned=DenseAlignment)
    self.pynast_test1_expected_fail = \
        LoadSeqs(data=pynast_test1_expected_failure, aligned=False)
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: best_tree support is currently not implemented

    params: dict of parameters to pass in to the RAxML app controller.
        It is copied, so neither the caller's dict nor a shared default
        is modified.

    The result is the tree parsed from RAxML's bootstrap output, with tip
    names mapped back to the names used in aln.

    Raises NotImplementedError if best_tree is set, and ValueError if no
    '-m' model is given and moltype is not DNA, RNA or PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Work on a private copy. With a shared default dict the '-m' chosen for
    # the first call would be reused for every later call, whatever its
    # moltype.
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # random integers (1..100000) for RAxML's -p and -x options
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

    # translate the phylip-safe names back to the original sequence names
    for node in tree.tips():
        node.Name = align_map[node.Name]

    raxml_result.cleanUp()

    return tree
def build_tree_from_alignment(aln, moltype, best_tree=False, params=None):
    """Returns a tree from Alignment object aln.

    aln: an xxx.Alignment object, or data that can be used to build one.

    moltype: cogent.core.moltype.MolType object

    best_tree: best_tree support is currently not implemented

    params: dict of parameters to pass in to the RAxML app controller.
        The dict is copied before use, so the caller's dict (and the
        default) is never modified.

    The result is the tree parsed from RAxML's bootstrap output, with tip
    names mapped back to the names used in aln.

    Raises NotImplementedError if best_tree is set, and ValueError if no
    '-m' model is given and moltype is not DNA, RNA or PROTEIN.
    """
    if best_tree:
        raise NotImplementedError

    # Never mutate the argument: a shared default dict would remember the
    # '-m' model of the first call and apply it to every later moltype.
    params = dict(params) if params else {}

    if '-m' not in params:
        if moltype == DNA or moltype == RNA:
            # in version 7.2.3, GTRMIX is no longer supported but says GTRCAT
            # behaves like GTRMIX (http://www.phylo.org/tools/raxmlhpc2.html)
            params["-m"] = 'GTRGAMMA'
        elif moltype == PROTEIN:
            params["-m"] = 'PROTGAMMAmatrixName'
        else:
            raise ValueError("Moltype must be either DNA, RNA, or PROTEIN")

    if not hasattr(aln, 'toPhylip'):
        aln = Alignment(aln)
    seqs, align_map = aln.toPhylip()

    # generate temp filename for output
    params["-w"] = "/tmp/"
    params["-n"] = get_tmp_filename().split("/")[-1]
    params["-k"] = True
    # random integers (1..100000) for RAxML's -p and -x options
    params["-p"] = randint(1, 100000)
    params["-x"] = randint(1, 100000)

    ih = '_input_as_multiline_string'

    raxml_app = Raxml(params=params,
                      InputHandler=ih,
                      WorkingDir=None,
                      SuppressStderr=True,
                      SuppressStdout=True)

    raxml_result = raxml_app(seqs)

    tree = DndParser(raxml_result['Bootstrap'], constructor=PhyloNode)

    # translate the phylip-safe names back to the original sequence names
    for node in tree.tips():
        node.Name = align_map[node.Name]

    raxml_result.cleanUp()

    return tree
def aln_tree_seqs(seqs,
                  input_handler=None,
                  tree_type='neighborjoining',
                  params=None,
                  add_seq_names=True,
                  WorkingDir=None,
                  SuppressStderr=None,
                  SuppressStdout=None,
                  max_hours=5.0,
                  constructor=PhyloNode,
                  clean_up=True
                  ):
    """Muscle align sequences and report tree from iteration2.

    Unlike cluster_seqs, returns tree2 which is the tree made during the
    second muscle iteration (it should be more accurate than the cluster from
    the first iteration which is made fast based on k-mer words)

    seqs: either file name or list of sequence objects or list of strings or
        single multiline string containing sequences.
    input_handler: forwarded to muscle_seqs.
    tree_type: can be either neighborjoining (default) or upgmb for UPGMA;
        a falsy value leaves -cluster2 unset.
    params: dict of additional muscle parameters. It is copied, so neither
        the caller's dict nor a shared default is modified.
    add_seq_names, WorkingDir, SuppressStderr, SuppressStdout: forwarded to
        muscle_seqs.
    max_hours: value for -maxhours.
    constructor: node class handed to DndParser.
    clean_up: When true, will clean up output files

    Returns (tree, aln): the tree parsed from Tree2Out and the list of
    lines read from MuscleOut.
    """
    # Work on a private copy so that options set here (e.g. -cluster2)
    # cannot persist into later calls or leak into the caller's dict.
    params = dict(params) if params else {}

    params["-maxhours"] = max_hours
    if tree_type:
        params["-cluster2"] = tree_type
    params["-tree2"] = get_tmp_filename(WorkingDir)
    params["-out"] = get_tmp_filename(WorkingDir)

    muscle_res = muscle_seqs(seqs,
                             input_handler=input_handler,
                             params=params,
                             add_seq_names=add_seq_names,
                             WorkingDir=WorkingDir,
                             SuppressStderr=SuppressStderr,
                             SuppressStdout=SuppressStdout)

    tree = DndParser(muscle_res["Tree2Out"], constructor=constructor)
    # read the alignment before the result files are cleaned up
    aln = list(muscle_res["MuscleOut"])

    if clean_up:
        muscle_res.cleanUp()
    return tree, aln