Esempio n. 1
0
    def print_stats(self):
        """Report the collected statistics through ``self.run.info``.

        A fixed set of summary rows is printed first, followed by one
        "Failed at ..." row for every element returned by
        ``filters.IsTRNA("").getFilters()``.  A statistic that is missing
        from ``self.stats`` is reported as 0, without any colour.
        """
        # Each row is (key in self.stats, label to print, optional colour
        # that is forwarded to run.info as ``mc``).
        rows = [
            ('total_seqs', 'Total num seqs', None),
            ('total_passed', 'Total passed as tRNA seq', 'green'),
            ('total_full_length', 'Total full length tRNA seqs', 'green'),
            ('total_rejected', 'Total rejected', 'red'),
            ('anticodon_unknown', 'Total with an unknown anticodon', None),
        ]
        # NOTE: rows for the per-reason rejection counters (short/long
        # length, acceptor seq, t-loop seq, both) and for the per-position
        # t-loop/acceptor divergence counters used to be listed here; they
        # are currently disabled.

        # One extra row per filter reported by IsTRNA.
        rows.extend(
            (str(flt), "Failed at " + str(flt), None)
            for flt in filters.IsTRNA("").getFilters())

        for stat_key, text, colour in rows:
            # Only pass ``mc`` when a colour was actually requested.
            extra = {'mc': colour} if colour else {}
            try:
                self.run.info(text, self.stats[stat_key], **extra)
            except KeyError:
                # Statistic was never recorded: show it as zero.
                self.run.info(text, 0)
Esempio n. 2
0
    def __init__(self):
        """Set up the initial state of the extractor.

        Fetches the anticodon guidelines from ``filters.IsTRNA`` and
        creates an ``ExtractorStats`` from the guideline entries at
        index 2 and 3.
        """
        self.extractor_stats_file = ""

        guidelines = filters.IsTRNA("").getAnticodonGuidelines()
        self.loop_guidelines = guidelines

        # Only entries 2 and 3 of the guidelines are handed to the stats
        # object.
        stats_guidelines = [guidelines[2], guidelines[3]]
        self.extractor_stats = ExtractorStats(stats_guidelines)

        # For each base, the bases it is allowed to pair with.
        # NOTE(review): G<->T presumably stands for the G-U wobble pair
        # written in the DNA alphabet -- confirm.
        self.allowed_pairings = dict(
            G=["C", "T"],
            T=["A", "G"],
            C=["G"],
            A=["T"],
        )
Esempio n. 3
0
    def process(self):
        """Run the sorter.

        Steps, in order:

        1. ``self.sanity_check()`` is called and an empty tRNA database is
           created at ``self.output_db_path`` (``sample_name`` is stored as
           a meta value).
        2. A ``filteredSequences/`` directory is created next to the output
           database and one empty file per filter name (first two filter
           groups of ``filters.IsTRNA``) is created or truncated in it.
        3. Every sequence of ``self.input_fasta_path`` is upper-cased and
           screened with ``is_trna.istRNA``.  A non-empty return value is
           used as the rejection reason and counted in ``self.stats_dict``.
           For accepted sequences a 24-character window is slid from the
           end of the sequence towards its start; the first window whose
           number of guideline mismatches is within the allowed limit is
           handed to ``self.handle_pass_seq`` and the result is buffered.
        4. The buffer is written to the database whenever it grows past a
           size threshold and once more after the last sequence; finally
           all counters in ``self.stats_dict`` are stored in the database
           and a summary is printed.

        The method ends by calling ``self.run.quit()``.
        """

        self.sanity_check()

        # creating an empty profile database
        profile_db = dbops.tRNADatabase(self.output_db_path)
        profile_db.create(meta_values={'sample_name': self.sample_name})

        # a list buffer to keep results
        results_buffer = []

        # an arbitrary max size to store and reset the buffer.  NOTE:
        # sys.getsizeof() is shallow, so this limits the size of the list
        # object itself, not the memory held by the buffered items.
        memory_max = 2000000

        # the filteredSequences directory: cut the output path at its last
        # "/" or, when there is none, at its last ".".
        # NOTE(review): a path containing neither character leaves
        # slash_index at -1, which drops the path's last character --
        # kept as is to avoid changing where existing runs write output.
        slash_index = self.output_db_path.rfind("/")
        if slash_index == -1:
            slash_index = self.output_db_path.rfind(".")
        folder_output_path = self.output_db_path[:slash_index] + "/filteredSequences/"
        if not os.path.exists(folder_output_path):
            os.makedirs(folder_output_path)

        is_trna = filters.IsTRNA(folder_output_path)
        t_loop_guidelines = is_trna.get_t_loop_and_acceptor_guidelines()
        run_filters = is_trna.getFilters()
        # Create (or truncate) one file per filter name in the first two
        # filter groups.  The context manager closes each handle right
        # away instead of leaking it.
        for group_index in range(2):
            for filter_name in run_filters[group_index]:
                with open(folder_output_path + filter_name, "w"):
                    pass

        # length of the window that is compared against the guidelines
        sub_size = 24

        input_fasta = u.SequenceSource(self.input_fasta_path)

        self.run.info('Hi', terminal.get_date(), mc='green')
        self.run.info('Sample name', self.sample_name)
        self.run.info('Input FASTA', self.input_fasta_path)

        table_for_tRNA_seqs = dbops.TableFortRNASequences(self.output_db_path)

        self.progress.new('Profiling tRNAs')
        self.progress.update('...')
        while next(input_fasta):

            self.stats_dict['total_seqs'] += 1
            seq = input_fasta.seq.upper()
            length = len(seq)
            cur_seq_specs = SeqSpecs()
            cur_seq_specs.length = length

            # istRNA returns "" on success, otherwise the name of the
            # statistic to increment for the rejection.
            problem = is_trna.istRNA(seq, input_fasta.id)
            if problem != "":
                self.stats_dict[problem] += 1
                self.stats_dict['total_rejected'] += 1
            else:
                # trying to determine length of trailer: `i` is the number
                # of characters skipped at the end of the sequence before
                # the window starts.
                for i in range(length - sub_size + 1):
                    sub_str = seq[-(i + sub_size):(length - i)]
                    # t_loop_guidelines[0] holds (position, expected char)
                    # pairs; collect the ones this window does not satisfy.
                    missed = []
                    for position_tuple in t_loop_guidelines[0]:
                        if sub_str[position_tuple[0]] != position_tuple[1]:
                            missed.append(position_tuple)
                    # t_loop_guidelines[1] is the number of mismatches
                    # that is still tolerated.
                    if len(missed) < t_loop_guidelines[1] + 1:
                        for elem in missed:
                            self.stats_dict[str(elem)] += 1
                        if not missed:
                            # The counter was only read, never incremented.
                            self.stats_dict['no_divergence'] += 1
                        cur_seq_specs.seq = seq
                        cur_seq_specs.seq_sub = sub_str
                        cur_seq_specs.t_loop_seq = sub_str[0:9]
                        cur_seq_specs.acceptor_seq = sub_str[-3:]
                        cur_seq_specs = self.handle_pass_seq(cur_seq_specs, i)
                        results_buffer.append(
                            ('%s_%d' % (self.sample_name, input_fasta.pos),
                             cur_seq_specs))
                        # first acceptable window wins
                        break

            if sys.getsizeof(results_buffer) > memory_max:
                self.progress.update(
                    'Writing %d items in the buffer to the DB ...' %
                    len(results_buffer))
                table_for_tRNA_seqs.append_sequences(results_buffer)
                results_buffer = []

            if self.stats_dict['total_seqs'] % 1000 == 0:
                t = self.stats_dict['total_seqs']
                p = self.stats_dict['total_passed']
                # 100.0 keeps the percentage a float even where "/" is
                # integer division.
                self.progress.update(
                    '%s :: %s (num tRNAs :: num raw reads so far): %.2f%% ...' %
                    (pp(p), pp(t), p * 100.0 / t))

        # flush whatever is left in the buffer
        self.progress.update('Writing %d items in the buffer to the DB ...' %
                             len(results_buffer))
        table_for_tRNA_seqs.append_sequences(results_buffer)
        results_buffer = []

        # essentially we are done here. let's populate the stats table:
        profile_db = dbops.tRNADatabase(self.output_db_path)
        self.progress.update('Writing stats ...')
        for key in self.stats_dict:
            profile_db.db.set_stat_value(key, self.stats_dict[key])
        profile_db.disconnect()

        self.progress.end()

        self.run.info('Total raw seqs processed',
                      self.stats_dict['total_seqs'])
        self.run.info('Total tRNA seqs recovered',
                      self.stats_dict['total_passed'])
        self.run.info('Total full length tRNA seqs',
                      self.stats_dict['total_full_length'])
        self.run.info('Output DB path', self.output_db_path)
        self.run.info('Bye', terminal.get_date(), mc='green')

        self.run.quit()