def test_chop_maxsize_divisor(self):
    """Check coord_chop in "maxsize_divisor" mode.

    The number of returned pairs must be at least length / size, and the
    span (second element minus first element) of both the first and the
    last pair must not exceed the requested size.
    """
    def span(pair):
        # distance covered by a single coordinate pair
        return pair[1] - pair[0]

    segments = sequence_ops.coord_chop(self.length, self.size,
                                       "maxsize_divisor")
    # measure both ends before asserting anything
    head_span = span(segments[0])
    tail_span = span(segments[-1])

    self.assertGreaterEqual(len(segments), self.length / self.size)
    for observed in (head_span, tail_span):
        self.assertLessEqual(observed, self.size)
def test_chop_exactsize(self):
    """Check coord_chop in "exact_size" mode.

    The number of returned pairs must be at least length / size. The
    first pair must span exactly the requested size, and the last pair
    must span no more than the requested size (and therefore no more
    than the first pair).
    """
    pair_list = sequence_ops.coord_chop(self.length, self.size,
                                        "exact_size")
    first_pair_len = pair_list[0][1] - pair_list[0][0]
    last_pair_len = pair_list[-1][1] - pair_list[-1][0]
    self.assertGreaterEqual(len(pair_list), self.length / self.size)
    # assertEqual, not the deprecated assertEquals alias, which newer
    # unittest releases no longer provide
    self.assertEqual(first_pair_len, self.size)
    self.assertLessEqual(last_pair_len, self.size)
    self.assertGreaterEqual(first_pair_len, last_pair_len)
def seq_subset_load(infile, subset_mode, subset_args):
    """Load a subset of sequence segments from a sequence file.

    Arguments:
    infile -- path of the sequence file to read.
    subset_mode -- how the segments are selected, one of:
        'flatfile'    -- infile is read with load_multifasta and the
                         result is returned as it is;
        'coordinates' -- coordinate pairs are read with
                         adaptive_list_load;
        'features'    -- coordinate pairs come from feat_collect and
                         feature_coords applied to infile;
        'size'        -- coordinate pairs are generated by coord_chop
                         over the length of the loaded sequence.
    subset_args -- mode-specific settings: a mapping with the keys
        'file', 'header' and 'columns' for 'coordinates'; a mapping with
        the keys 'size' and 'chop_mode' for 'size'; the value passed to
        feat_collect as its second argument for 'features'. Not read in
        'flatfile' mode.

    Returns a (subset, subset_file) tuple. In 'flatfile' mode
    subset_file is infile itself; in every other mode the subset is
    written with write_fasta to '<record id>_subset.fas' and that name
    is returned.

    Exceptions raised by the helper functions propagate unchanged.
    """
    from analysis.sequence_ops import (feat_collect, feature_coords,
                                       coord_chop, get_seq_subset_by_coords)
    from analysis.seqfile_ops import (load_multifasta, surefmt_load,
                                      write_fasta)
    from analysis.text_manipulation import adaptive_list_load

    # Mode names are compared by value (==). Identity comparison (is)
    # only matches when both strings happen to be the same interned
    # object, which is not guaranteed for values read from a file or the
    # command line.
    #
    # Progress messages use single-argument print(...) so that they emit
    # the same text under Python 2 and Python 3.
    if subset_mode == 'flatfile':
        # in this case the sequence file MUST be multifasta
        subset = load_multifasta(infile)
        print("set of %s sequence segments" % (len(subset),))
        return subset, infile

    # load the query single sequence file (convert format if necessary)
    seq_record = surefmt_load(infile, 'fasta', 'generic_dna')
    print("query sequence loaded from %s" % (infile,))

    # load or generate coordinate pairs for target segments
    if subset_mode == 'coordinates':
        coords_file = subset_args['file']
        header = subset_args['header']
        columns = subset_args['columns']
        coords_list = adaptive_list_load(coords_file, header, columns)
        # NOTE(review): the message names infile although the pairs were
        # read from coords_file -- kept as is, confirm which is intended
        print("%s segments loaded from %s" % (len(coords_list), infile))
    elif subset_mode == 'features':
        feat_mode = subset_args
        features = feat_collect(infile, feat_mode)
        coords_list = feature_coords(features)
        print(coords_list)
        print("%s features loaded from %s" % (len(coords_list), infile))
    elif subset_mode == 'size':
        size = subset_args['size']
        chop_mode = subset_args['chop_mode']
        coords_list = coord_chop(len(seq_record.seq), size, chop_mode)
        print("%s segments generated to fit %s" % (len(coords_list), size))
    else:
        print("ERROR: A mode MUST be specified.")
        # NOTE(review): execution continues with no coordinates; how
        # get_seq_subset_by_coords reacts to None is not visible here
        coords_list = None

    # collect subset of sequence segments using resulting coords_list
    subset = get_seq_subset_by_coords(seq_record, coords_list)
    print("subset of %s sequence segments" % (len(subset),))

    # save subset to multifasta file for later use or reference
    subset_file = seq_record.id + '_subset.fas'
    write_fasta(subset_file, subset)
    print("subset written to fasta file %s" % (subset_file,))

    return subset, subset_file
def test_chop_none(self):
    """Check coord_chop when both size and mode are None.

    Exactly one pair must be returned, and its span (second element
    minus first element) must equal the full length.
    """
    pair_list = sequence_ops.coord_chop(self.length, None, None)
    first_pair_len = pair_list[0][1] - pair_list[0][0]
    # compare the count by value; an identity check on an int only
    # passes because of interpreter-specific small-integer caching
    self.assertEqual(len(pair_list), 1)
    self.assertEqual(first_pair_len, self.length)
def test_chop_count_divisor(self):
    """Check coord_chop in "count_divisor" mode.

    The number of returned pairs may differ from the requested count by
    at most one, and the first pair must span at least as much as the
    last pair (span = second element minus first element).
    """
    chunks = sequence_ops.coord_chop(self.length, self.count,
                                     "count_divisor")

    head = chunks[0]
    head_span = head[1] - head[0]
    tail = chunks[-1]
    tail_span = tail[1] - tail[0]

    count_gap = abs(len(chunks) - self.count)
    self.assertLessEqual(count_gap, 1)
    self.assertGreaterEqual(head_span, tail_span)