def test_get_orf_base(self):
    # get_orfs called with only min_orf_length=5 on test_data/test_getorfs.fa
    # must produce exactly the one-row table assembled below.
    expected_columns = [
        ('id', 'Single_FA'),
        ('aa_sequence', 'MIMIKL*P'),
        ('frame', 1),
        ('strand', '+'),
        ('seq_length', 8),
        ('seq_length_nt', 26),
        ('orf_sequence', 'MIMIKL'),
        ('start_site', 1),
        ('stop_site', 7),
        ('orf_length', 6),
        ('start_site_nt', 1),
        ('stop_site_nt', 21),
        ('utr3_length', 5),
        ('first_MET', 'M'),
        ('final_stop', 'STOP'),
        ('isoform_number', 1),
        ('orf_class', 'complete'),
        ('fasta_id', '>Single_FA.orf1 complete:1-21 strand:+'),
    ]

    # Start from an empty one-row frame and add the columns one at a time, in
    # the order listed, so each scalar is broadcast onto the single row.
    expected = pd.DataFrame(index=range(1))
    for column, value in expected_columns:
        expected[column] = value

    fasta_records = read_fasta('test_data/test_getorfs.fa')
    observed = get_orfs(fasta_records, min_orf_length=5)

    # DataFrame.equals compares values, dtypes and labels in one go.
    self.assertTrue(observed.equals(expected))
def test_get_orf_all(self):
    # get_orfs with both_strands=True and all_orfs=True on
    # test_data/test_getorfs.fa must produce exactly the two-row table
    # assembled below.
    expected_columns = [
        ('id', ['Single_FA', 'Single_FA']),
        ('aa_sequence', ['MIMIKL*P', 'GLQLNHDH']),
        ('frame', [1, 3]),
        ('strand', ['+', '-']),
        ('seq_length', [8, 8]),
        ('seq_length_nt', [26, 26]),
        ('orf_sequence', ['MIMIKL', 'GLQLNHDH']),
        ('start_site', [1, 1]),
        ('stop_site', [7, 8]),
        ('orf_length', [6, 8]),
        ('start_site_nt', [1, 3]),
        ('stop_site_nt', [21, 26]),
        ('utr3_length', [5, 0]),
        ('first_MET', ['M', 'ALT']),
        ('final_stop', ['STOP', 'ALT']),
        ('isoform_number', [1, 2]),
        ('orf_class', ['complete', 'incomplete']),
        ('fasta_id', [
            '>Single_FA.orf1 complete:1-21 strand:+',
            '>Single_FA.orf2 incomplete:3-26 strand:-',
        ]),
    ]

    # Start from an empty two-row frame and add the columns one at a time, in
    # the order listed.
    expected = pd.DataFrame(index=range(2))
    for column, values in expected_columns:
        expected[column] = values

    fasta_records = read_fasta('test_data/test_getorfs.fa')
    observed = get_orfs(
        fasta_records,
        min_orf_length=5,
        both_strands=True,
        all_orfs=True,
    )

    # DataFrame.equals compares values, dtypes and labels in one go.
    self.assertTrue(observed.equals(expected))
def test_convert_nt_output_format(self):
    # Checks that convert_start_stop_to_nt returns a length-3 output and that
    # every element reports an int64 dtype.
    sequences = read_fasta('test_data/test_frames.fa')
    (ids, aa_frames, frame, strand, seq_length_nt,
     seq_length) = translate_all_frames(sequences, both_strands=False)
    (orf_sequence, start_sites, stop_sites, orf_length,
     last_aa_is_stop) = find_longest_orfs(aa_frames)

    # filter data by minimum orf length
    keep = orf_length >= 6
    (aa_frames, frame, strand, seq_length_nt, ids, seq_length, start_sites,
     stop_sites, orf_sequence, last_aa_is_stop, orf_length) = filter_objects(
         keep, aa_frames, frame, strand, seq_length_nt, ids, seq_length,
         start_sites, stop_sites, orf_sequence, last_aa_is_stop, orf_length)

    output = convert_start_stop_to_nt(start_sites, stop_sites, seq_length_nt,
                                      orf_length, frame, last_aa_is_stop)

    # Assert the length first, then each dtype on its own, so a failure
    # names the exact property that is wrong instead of a bare
    # "False is not true".
    self.assertEqual(len(output), 3)
    for position, item in enumerate(output):
        self.assertEqual(
            item.dtype, 'int64',
            'output[{}] has an unexpected dtype'.format(position))
def test_read_fasta(self):
    # Check that read_fasta yields records whose .seq, converted to str,
    # matches the expected first sequence of the fixture file.
    read_sequence = read_fasta('test_data/test_mutliple_frame_orfs.fa')
    seq_array = [str(x.seq) for x in read_sequence]
    first_seq = seq_array[0]

    # Only the first/last few nts and the total length are checked. Each
    # property gets its own assertEqual so a failure shows the offending
    # value rather than a bare "False is not true".
    self.assertEqual(first_seq[0:20], 'GCTTCGGGTTGGTGTCATGG')

    # The [-1:-20:-1] slice walks backwards from the final nt, so the
    # expected string is the last 19 nts in reverse order.
    self.assertEqual(first_seq[-1:-20:-1], 'AGTTGTGTTACCGGGACGG')

    self.assertEqual(len(first_seq), 2757)
def test_translate_bothstrands(self):
    # With both_strands=True, translate_all_frames on the fixture must give
    # the frame, strand and translation arrays listed below, element for
    # element.
    sequences = read_fasta('test_data/test_trans_all_frames.fa')
    (ids, aa_frames, frame, strand, seq_length_nt,
     seq_length) = translate_all_frames(sequences, both_strands=True)

    expected_frame = np.array([1, 1, 2, 2, 3, 3])
    expected_strand = np.array(['+', '-', '+', '-', '+', '-'])
    expected_translation = np.array([
        'MANATEE*', 'LFFGRVRH', 'WRTRPKN', 'YSSVAFA', 'GERDRRI', 'ILRSRSP'
    ])

    # One assertion per array so a failure says which output is wrong and
    # shows the value that was actually returned.
    self.assertTrue(
        np.all(frame == expected_frame),
        'unexpected frame: {}'.format(frame))
    self.assertTrue(
        np.all(strand == expected_strand),
        'unexpected strand: {}'.format(strand))
    self.assertTrue(
        np.all(aa_frames == expected_translation),
        'unexpected aa_frames: {}'.format(aa_frames))
def test_convert_utr_nt(self):
    # The third value returned by convert_start_stop_to_nt (unpacked here as
    # utr3_length) must equal [5, 4, 3] for the test_frames.fa fixture after
    # keeping only ORFs with orf_length >= 6.
    records = read_fasta('test_data/test_frames.fa')

    translated = translate_all_frames(records, both_strands=False)
    ids, aa_frames, frame, strand, seq_length_nt, seq_length = translated

    longest = find_longest_orfs(aa_frames)
    (orf_sequence, start_sites, stop_sites, orf_length,
     last_aa_is_stop) = longest

    # filter data by minimum orf length
    long_enough = orf_length >= 6
    filtered = filter_objects(
        long_enough, aa_frames, frame, strand, seq_length_nt, ids,
        seq_length, start_sites, stop_sites, orf_sequence, last_aa_is_stop,
        orf_length)
    (aa_frames, frame, strand, seq_length_nt, ids, seq_length, start_sites,
     stop_sites, orf_sequence, last_aa_is_stop, orf_length) = filtered

    nt_coordinates = convert_start_stop_to_nt(
        start_sites, stop_sites, seq_length_nt, orf_length, frame,
        last_aa_is_stop)
    start_site_nt, stop_site_nt, utr3_length = nt_coordinates

    expected_utr3 = np.array([5, 4, 3])
    utr3_matches = np.all(utr3_length == expected_utr3)
    self.assertTrue(utr3_matches)
def test_translate_output_format(self):
    # Checks that translate_all_frames returns a length-6 output and that
    # each element has the expected numpy dtype: positions 0, 1 and 3 are
    # numpy string arrays, positions 2, 4 and 5 are int64.
    sequences = read_fasta('test_data/test_trans_all_frames.fa')
    output = translate_all_frames(sequences, both_strands=False)

    # Assert the length first, then each dtype on its own, so a failure
    # names the exact position that is wrong instead of a bare
    # "False is not true".
    self.assertEqual(len(output), 6)

    for position in (0, 1, 3):
        self.assertEqual(
            output[position].dtype.type, np.str_,
            'output[{}] is not a numpy string array'.format(position))

    for position in (2, 4, 5):
        self.assertEqual(
            output[position].dtype, 'int64',
            'output[{}] is not an int64 array'.format(position))
def test_translate_seq_length(self):
    # The sixth value returned by translate_all_frames (seq_length) must be
    # [8, 7, 7] for the fixture when both_strands=False.
    fasta_records = read_fasta('test_data/test_trans_all_frames.fa')
    translated = translate_all_frames(fasta_records, both_strands=False)

    # Unpack all six values so an unexpected tuple size still errors out;
    # only the last one is inspected here.
    _ids, _aa_frames, _frame, _strand, _seq_length_nt, seq_length = translated

    expected_lengths = np.array([8, 7, 7])
    lengths_match = np.all(seq_length == expected_lengths)
    self.assertTrue(lengths_match)
def test_translate_alltransframes(self):
    # The second value returned by translate_all_frames (aa_frames) must hold
    # the three translations listed below for the fixture when
    # both_strands=False.
    fasta_records = read_fasta('test_data/test_trans_all_frames.fa')
    translated = translate_all_frames(fasta_records, both_strands=False)

    # Unpack all six values so an unexpected tuple size still errors out;
    # only the translations are inspected here.
    _ids, aa_frames, _frame, _strand, _seq_length_nt, _seq_length = translated

    expected_translations = np.array(['MANATEE*', 'WRTRPKN', 'GERDRRI'])
    translations_match = np.all(aa_frames == expected_translations)
    self.assertTrue(translations_match)