Esempio n. 1
0
    def test_get_orf_base(self):
        # Runs get_orfs on test_data/test_getorfs.fa with min_orf_length=5 and
        # no other options, then compares the result with a hand-built
        # single-row DataFrame.

        # Expected value of every column, listed in the column order of the
        # expected frame.
        column_values = {
            'id': 'Single_FA',
            'aa_sequence': 'MIMIKL*P',
            'frame': 1,
            'strand': '+',
            'seq_length': 8,
            'seq_length_nt': 26,
            'orf_sequence': 'MIMIKL',
            'start_site': 1,
            'stop_site': 7,
            'orf_length': 6,
            'start_site_nt': 1,
            'stop_site_nt': 21,
            'utr3_length': 5,
            'first_MET': 'M',
            'final_stop': 'STOP',
            'isoform_number': 1,
            'orf_class': 'complete',
            'fasta_id': '>Single_FA.orf1 complete:1-21 strand:+',
        }

        # Start from an empty one-row frame and assign each column in turn,
        # so every scalar is broadcast onto the single row.
        expected = pd.DataFrame(index=range(1))
        for column, value in column_values.items():
            expected[column] = value

        fasta_records = read_fasta('test_data/test_getorfs.fa')
        observed = get_orfs(fasta_records, min_orf_length=5)

        frames_match = observed.equals(expected)
        self.assertTrue(frames_match)
Esempio n. 2
0
    def test_get_orf_all(self):
        # Runs get_orfs on test_data/test_getorfs.fa with min_orf_length=5,
        # both_strands=True and all_orfs=True, then compares the result with
        # a hand-built two-row DataFrame.

        # Expected values per column (one entry per row), listed in the
        # column order of the expected frame.
        column_values = {
            'id': ['Single_FA', 'Single_FA'],
            'aa_sequence': ['MIMIKL*P', 'GLQLNHDH'],
            'frame': [1, 3],
            'strand': ['+', '-'],
            'seq_length': [8, 8],
            'seq_length_nt': [26, 26],
            'orf_sequence': ['MIMIKL', 'GLQLNHDH'],
            'start_site': [1, 1],
            'stop_site': [7, 8],
            'orf_length': [6, 8],
            'start_site_nt': [1, 3],
            'stop_site_nt': [21, 26],
            'utr3_length': [5, 0],
            'first_MET': ['M', 'ALT'],
            'final_stop': ['STOP', 'ALT'],
            'isoform_number': [1, 2],
            'orf_class': ['complete', 'incomplete'],
            'fasta_id': [
                '>Single_FA.orf1 complete:1-21 strand:+',
                '>Single_FA.orf2 incomplete:3-26 strand:-',
            ],
        }

        # Start from an empty two-row frame and assign each column in turn.
        expected = pd.DataFrame(index=range(2))
        for column, values in column_values.items():
            expected[column] = values

        fasta_records = read_fasta('test_data/test_getorfs.fa')
        observed = get_orfs(
            fasta_records,
            min_orf_length=5,
            both_strands=True,
            all_orfs=True,
        )

        frames_match = observed.equals(expected)
        self.assertTrue(frames_match)
Esempio n. 3
0
    def test_convert_nt_output_format(self):
        # Checks that convert_start_stop_to_nt returns a length-3 tuple and
        # that each element reports an int64 dtype.
        sequences = read_fasta('test_data/test_frames.fa')
        (ids, aa_frames, frame, strand, seq_length_nt,
         seq_length) = translate_all_frames(sequences, both_strands=False)
        (orf_sequence, start_sites, stop_sites, orf_length,
         last_aa_is_stop) = find_longest_orfs(aa_frames)

        # filter data by minimum orf length
        keep = orf_length >= 6
        (aa_frames, frame, strand, seq_length_nt, ids, seq_length,
         start_sites, stop_sites, orf_sequence, last_aa_is_stop,
         orf_length) = filter_objects(
            keep, aa_frames, frame, strand, seq_length_nt, ids, seq_length,
            start_sites, stop_sites, orf_sequence, last_aa_is_stop,
            orf_length)

        output = convert_start_stop_to_nt(start_sites, stop_sites,
                                          seq_length_nt, orf_length, frame,
                                          last_aa_is_stop)

        # Check the length first: a wrong-sized result then fails with a
        # clear assertion instead of an IndexError while probing elements.
        self.assertEqual(len(output), 3)

        # One assertion per element so a failure names the offending
        # position rather than reporting a bare "False is not true".
        for position, values in enumerate(output):
            self.assertEqual(
                values.dtype, 'int64',
                msg='output[{}] has an unexpected dtype'.format(position))
Esempio n. 4
0
    def test_read_fasta(self):
        # Checks that read_fasta yields records whose .seq, converted to str,
        # matches the known content of the first record in the fixture file.
        read_sequence = read_fasta('test_data/test_mutliple_frame_orfs.fa')
        seq_array = [str(x.seq) for x in read_sequence]
        first_sequence = seq_array[0]

        # Only the first/last few nts and the total length are compared.
        # Each property gets its own assertEqual so a failure shows which
        # one differed and what the actual value was.
        self.assertEqual(first_sequence[0:20], 'GCTTCGGGTTGGTGTCATGG')
        # The slice walks backwards from the final base, so the expected
        # string is the last 19 nts in reverse order.
        self.assertEqual(first_sequence[-1:-20:-1], 'AGTTGTGTTACCGGGACGG')
        self.assertEqual(len(first_sequence), 2757)
Esempio n. 5
0
    def test_translate_bothstrands(self):
        # Checks the frame, strand and translation arrays returned by
        # translate_all_frames when both_strands=True.
        sequences = read_fasta('test_data/test_trans_all_frames.fa')
        (ids, aa_frames, frame, strand, seq_length_nt,
         seq_length) = translate_all_frames(sequences, both_strands=True)

        expected_frame = np.array([1, 1, 2, 2, 3, 3])
        expected_strand = np.array(['+', '-', '+', '-', '+', '-'])
        expected_translation = np.array([
            'MANATEE*', 'LFFGRVRH', 'WRTRPKN', 'YSSVAFA', 'GERDRRI', 'ILRSRSP'
        ])

        # Each array is asserted on its own (same element-wise comparison as
        # a combined check) so a failure identifies which output is wrong.
        self.assertTrue(
            np.all(frame == expected_frame),
            msg='unexpected frame values: {}'.format(frame))
        self.assertTrue(
            np.all(strand == expected_strand),
            msg='unexpected strand values: {}'.format(strand))
        self.assertTrue(
            np.all(aa_frames == expected_translation),
            msg='unexpected translations: {}'.format(aa_frames))
Esempio n. 6
0
    def test_convert_utr_nt(self):
        # Runs translate -> longest ORF -> length filter on
        # test_data/test_frames.fa and checks the third value returned by
        # convert_start_stop_to_nt (utr3_length) against [5, 4, 3].
        fasta_records = read_fasta('test_data/test_frames.fa')

        translated = translate_all_frames(fasta_records, both_strands=False)
        ids, aa_frames, frame, strand, seq_length_nt, seq_length = translated

        longest = find_longest_orfs(aa_frames)
        (orf_sequence, start_sites, stop_sites, orf_length,
         last_aa_is_stop) = longest

        # Mask of entries whose ORF length is at least 6; filter_objects is
        # handed the mask followed by every array to be filtered.
        long_enough = orf_length >= 6
        unfiltered = (aa_frames, frame, strand, seq_length_nt, ids,
                      seq_length, start_sites, stop_sites, orf_sequence,
                      last_aa_is_stop, orf_length)
        (aa_frames, frame, strand, seq_length_nt, ids, seq_length,
         start_sites, stop_sites, orf_sequence, last_aa_is_stop,
         orf_length) = filter_objects(long_enough, *unfiltered)

        converted = convert_start_stop_to_nt(
            start_sites, stop_sites, seq_length_nt, orf_length, frame,
            last_aa_is_stop)
        start_site_nt, stop_site_nt, utr3_length = converted

        expected_utr3 = np.array([5, 4, 3])
        self.assertTrue(np.all(utr3_length == expected_utr3))
Esempio n. 7
0
    def test_translate_output_format(self):
        # Checks that translate_all_frames returns a length-6 tuple and that
        # each element has the expected numpy dtype: str for positions
        # 0, 1 and 3, int64 for positions 2, 4 and 5.
        sequences = read_fasta('test_data/test_trans_all_frames.fa')
        output = translate_all_frames(sequences, both_strands=False)

        # Check the length first: a wrong-sized result then fails with a
        # clear assertion instead of an IndexError while probing elements.
        self.assertEqual(len(output), 6)

        # One assertion per element so a failure names the offending
        # position rather than reporting a bare "False is not true".
        for position in (0, 1, 3):
            self.assertEqual(
                output[position].dtype.type, np.str_,
                msg='output[{}] is not a str array'.format(position))
        for position in (2, 4, 5):
            self.assertEqual(
                output[position].dtype, 'int64',
                msg='output[{}] is not an int64 array'.format(position))
Esempio n. 8
0
 def test_translate_seq_length(self):
     # With both_strands=False, the sixth value returned by
     # translate_all_frames (seq_length) is expected to be [8, 7, 7] for
     # test_data/test_trans_all_frames.fa.
     fasta_records = read_fasta('test_data/test_trans_all_frames.fa')
     translated = translate_all_frames(fasta_records, both_strands=False)
     ids, aa_frames, frame, strand, seq_length_nt, seq_length = translated

     expected_lengths = np.array([8, 7, 7])
     lengths_match = np.all(seq_length == expected_lengths)
     self.assertTrue(lengths_match)
Esempio n. 9
0
 def test_translate_alltransframes(self):
     # With both_strands=False, the second value returned by
     # translate_all_frames (aa_frames) is expected to hold the three
     # translations listed below for test_data/test_trans_all_frames.fa.
     fasta_records = read_fasta('test_data/test_trans_all_frames.fa')
     translated = translate_all_frames(fasta_records, both_strands=False)
     ids, aa_frames, frame, strand, seq_length_nt, seq_length = translated

     expected_translations = np.array(['MANATEE*', 'WRTRPKN', 'GERDRRI'])
     translations_match = np.all(aa_frames == expected_translations)
     self.assertTrue(translations_match)