def test_blast_record_set(self):
    """Exercise blasting.blast_record_set with several query records.

    Steps:
    1. write self.db_records to self.db_file and check that reloading the
       file gives back the same ids and sequences;
    2. build a nucleotide BLAST database from that file and check the
       report returned by make_blastDB;
    3. run the batch BLAST for self.multi_records and check that the first
       match listed for each query carries the query's own id with a
       'match_p100' value of 100.
    """
    # prepare database
    seqfile_ops.write_fasta(self.db_file, self.db_records)
    db_records_list = seqfile_ops.load_multifasta(self.db_file)
    # compare lengths first: a pairwise loop alone would pass silently if
    # the reloaded list came back short or empty
    self.assertEqual(len(db_records_list), len(self.db_records))
    for loaded, expected in zip(db_records_list, self.db_records):
        self.assertEqual(loaded.id, expected.id)
        self.assertEqual(str(loaded.seq), str(expected.seq))
    # make database
    self.dbfile_path, db_report = blasting.make_blastDB(
        self.temp_dir, self.db_name, self.db_file, 'nucl')
    # value equality, not identity: assertIs on ints only works by accident
    # of CPython's small-integer caching
    self.assertEqual(db_report['status'], 0)
    self.assertEqual(db_report['message'], 'database exists')
    # run local blast batch (with multiple queries)
    matches_multi = blasting.blast_record_set(
        self.dbfile_path, self.multi_records, self.prefs)
    self.assertEqual(len(matches_multi), 3)
    for record in self.multi_records:
        first_match = matches_multi[record.id][0]
        self.assertEqual(first_match['contig_id'], record.id)
        self.assertEqual(first_match['details']['match_p100'], 100)
def test_local_blastn(self):
    """Exercise blasting.local_blastn and parse_blast_out6 with one query.

    Steps:
    1. write self.single_record to self.single_q_file and check that the
       reloaded record matches self.record_1;
    2. write self.db_records to self.db_file and check the round trip;
    3. build a nucleotide BLAST database and check the report returned by
       make_blastDB;
    4. run the local BLAST and check the returned status dict;
    5. parse the output file and check that exactly one match is reported,
       carrying the query's id with a 'match_p100' value of 100.
    """
    # prepare query
    seqfile_ops.write_fasta(self.single_q_file, self.single_record)
    query_record = seqfile_ops.load_fasta(self.single_q_file)
    self.assertEqual(query_record.id, self.record_1.id)
    self.assertEqual(str(query_record.seq), str(self.record_1.seq))
    # prepare database
    seqfile_ops.write_fasta(self.db_file, self.db_records)
    records_list = seqfile_ops.load_multifasta(self.db_file)
    # compare lengths first: a pairwise loop alone would pass silently if
    # the reloaded list came back short or empty
    self.assertEqual(len(records_list), len(self.db_records))
    for loaded, expected in zip(records_list, self.db_records):
        self.assertEqual(loaded.id, expected.id)
        self.assertEqual(str(loaded.seq), str(expected.seq))
    # make database
    self.dbfile_path, db_report = blasting.make_blastDB(
        self.temp_dir, self.db_name, self.db_file, 'nucl')
    # value equality, not identity: assertIs on ints only works by accident
    # of CPython's small-integer caching
    self.assertEqual(db_report['status'], 0)
    self.assertEqual(db_report['message'], 'database exists')
    # run local blast with single query
    self.status = blasting.local_blastn(
        self.single_q_file, self.single_out_file,
        self.dbfile_path, self.prefs)
    self.assertEqual(self.status['output'], '')
    self.assertIsNone(self.status['error'])
    # parse blast output
    matches_single = blasting.parse_blast_out6(self.single_out_file,
                                               self.prefs)
    self.assertEqual(len(matches_single), 1)
    self.assertEqual(matches_single[0]['contig_id'], self.single_record.id)
    self.assertEqual(matches_single[0]['details']['match_p100'], 100)
def test_write_and_load_multifasta(self):
    """Check that a multifasta write/load round trip keeps every record id.

    Writes self.three_records to self.fas_filename, checks the count
    returned by write_fasta, reloads the file and compares the id of each
    loaded record with the one that was written.
    """
    count = seqfile_ops.write_fasta(self.fas_filename, self.three_records)
    # value equality, not identity, for the integer count
    self.assertEqual(count, 3)
    fas_records = seqfile_ops.load_multifasta(self.fas_filename)
    # check the length and then every pair; the earlier range(0, 2) loop
    # never looked at the third record
    self.assertEqual(len(fas_records), len(self.three_records))
    for loaded, expected in zip(fas_records, self.three_records):
        self.assertEqual(loaded.id, expected.id)
def seq_subset_load(infile, subset_mode, subset_args):
    """Load a subset of sequence segments from a sequence file.

    Arguments:
    infile -- path of the sequence file to read
    subset_mode -- 'flatfile', 'coordinates', 'features' or 'size'
    subset_args -- mode-specific settings:
        'flatfile'    : not used
        'coordinates' : mapping with keys 'file', 'header' and 'columns',
                        handed to adaptive_list_load
        'features'    : handed unchanged to feat_collect as its second
                        argument
        'size'        : mapping with keys 'size' and 'chop_mode', handed to
                        coord_chop together with the sequence length

    Returns a (subset, subset_file) tuple. In 'flatfile' mode, subset is
    the result of load_multifasta(infile) and subset_file is infile itself.
    In the other modes, subset is the result of get_seq_subset_by_coords
    and is also written with write_fasta to a file named
    seq_record.id + '_subset.fas'.

    Exceptions raised by the helper functions propagate unchanged.
    """
    from analysis.sequence_ops import feat_collect, feature_coords, \
        coord_chop, get_seq_subset_by_coords
    from analysis.seqfile_ops import load_multifasta, surefmt_load, \
        write_fasta
    from analysis.text_manipulation import adaptive_list_load

    # Progress messages use single-argument print(...) with %-formatting so
    # the text is the same under the Python 2 print statement and the
    # Python 3 print function.
    # Modes are compared with '==': 'is' tests object identity and only
    # matched when the caller's string happened to be interned.
    if subset_mode == 'flatfile':
        # in this case the sequence file MUST be multifasta
        subset = load_multifasta(infile)
        print("set of %s sequence segments" % (len(subset),))
        subset_file = infile
    else:
        # load the query single sequence file (convert format if necessary)
        seq_record = surefmt_load(infile, 'fasta', 'generic_dna')
        print("query sequence loaded from %s" % (infile,))
        # load or generate coordinate pairs for target segments
        if subset_mode == 'coordinates':
            coords_list = adaptive_list_load(subset_args['file'],
                                             subset_args['header'],
                                             subset_args['columns'])
            # NOTE(review): the message names infile although the
            # coordinates were read from subset_args['file'] -- confirm
            # which path is meant to be reported
            print("%s segments loaded from %s" % (len(coords_list), infile))
        elif subset_mode == 'features':
            features = feat_collect(infile, subset_args)
            coords_list = feature_coords(features)
            print(coords_list)
            print("%s features loaded from %s" % (len(coords_list), infile))
        elif subset_mode == 'size':
            size = subset_args['size']
            coords_list = coord_chop(len(seq_record.seq), size,
                                     subset_args['chop_mode'])
            print("%s segments generated to fit %s"
                  % (len(coords_list), size))
        else:
            print("ERROR: A mode MUST be specified.")
            # NOTE(review): execution continues with no coordinates, so
            # get_seq_subset_by_coords receives None; kept as-is so callers
            # see the same outcome as before -- consider raising ValueError
            coords_list = None
        # collect subset of sequence segments using resulting coords_list
        subset = get_seq_subset_by_coords(seq_record, coords_list)
        print("subset of %s sequence segments" % (len(subset),))
        # save subset to multifasta file for later use or reference
        subset_file = seq_record.id + '_subset.fas'
        write_fasta(subset_file, subset)
        print("subset written to fasta file %s" % (subset_file,))
    return subset, subset_file