Esempio n. 1
0
 def initialize_pool_sequence_mappings(self, mapq_cutoff=30):
     """Populate ``self.pool_sequence_mappings`` by counting reads or from cache.

     When the 'force_recount' property is set, or no sequence-counts file
     exists yet, one ``pool_sequence_mapping`` is created per entry of the
     trimmed pool FASTA, every read fetched from the mapped-reads file is
     handed to a ``read_assigner``, library fractions are computed, each
     mapping is flagged with ``is_only_tl``, and the result is pickled.
     Otherwise the previously pickled mappings are loaded.

     Args:
         mapq_cutoff: value forwarded to ``read_assigner`` (default 30).
     """
     must_recount = (self.get_property('force_recount')
                     or not self.lib_settings.sequence_counts_exist())
     if not must_recount:
         self.pool_sequence_mappings = bzUtils.unPickle(
             self.lib_settings.get_sequence_counts())
         return

     trimmed_sequences = bzUtils.convertFastaToDict(
         self.experiment_settings.get_trimmed_pool_fasta())
     gene_names = []
     for sequence_name in trimmed_sequences:
         # TL names are assumed to be of type: YLR350W_-68_651_116
         gene_names.append(sequence_name.split('_')[0])
         self.pool_sequence_mappings[sequence_name] = pool_sequence_mapping(
             sequence_name, trimmed_sequences[sequence_name])

     samfile = pysam.Samfile(self.lib_settings.get_mapped_reads(), "rb")
     try:
         ra = read_assigner(self.pool_sequence_mappings, samfile,
                            mapq_cutoff)
         for aligned_read in samfile.fetch():
             ra.assign_read(aligned_read)
     finally:
         # Release the alignment file even if read assignment fails.
         samfile.close()

     self.compute_lib_fractions()

     # A mapping is the "only TL" when its gene prefix occurs exactly once
     # among the trimmed pool sequence names.
     gene_counts = Counter(gene_names)
     for mapping_name in self.pool_sequence_mappings:
         occurrences = gene_counts[mapping_name.split('_')[0]]
         if occurrences != 1:
             # Every mapping should stem from a name seen in the pool FASTA.
             assert occurrences != 0
         self.pool_sequence_mappings[mapping_name].is_only_tl = (
             occurrences == 1)

     bzUtils.makePickle(self.pool_sequence_mappings,
                        self.lib_settings.get_sequence_counts())
Esempio n. 2
0
    def get_collapsed_read_fractions(self, lib_settings):
        """Return the read fraction of every collapsed sequence in a library.

        Header lines of the gzipped collapsed-reads file start with '>' and
        carry the read count as the second '-'-separated field. The counts
        are normalised by their sum and cached as a pickle under
        ``<rdir>/QC/collapsed_fracs``. The cache is reused only when it
        exists and the 'force_recollapse' property is not set.

        Args:
            lib_settings: library settings object providing ``sample_name``
                and ``get_collapsed_reads()``.

        Returns:
            Tuple ``(sample_name, read_fractions)`` where ``read_fractions``
            is the numpy array of per-sequence fractions (or whatever the
            cache pickle holds).
        """
        out_name = os.path.join(
            self.experiment_settings.get_rdir(), 'QC', 'collapsed_fracs',
            '%(sample_name)s.collapsed_read_fractions.pkl' %
            {'sample_name': lib_settings.sample_name})
        # Recompute when the cache is missing OR a recollapse is forced; the
        # previous "and" ignored the force flag and tried to unpickle a
        # missing file when the flag was set.
        if tps_utils.file_exists(
                out_name) and not self.experiment_settings.get_property(
                    'force_recollapse'):
            read_fractions = bzUtils.unPickle(out_name)
        else:
            read_counts = []
            # Text mode so the str comparisons below also work on Python 3,
            # where gzip.open defaults to bytes.
            with gzip.open(lib_settings.get_collapsed_reads(), 'rt') as f:
                for line in f:
                    # Only '>' header lines matter; blank lines, '#' comments
                    # and sequence lines are skipped.
                    if line.startswith('>'):
                        read_counts.append(
                            int(line[1:].strip().split('-')[1]))
            read_fractions = np.array(read_counts) / float(sum(read_counts))
            bzUtils.makePickle(read_fractions, out_name)

        return (lib_settings.sample_name, read_fractions)
Esempio n. 3
0
    def get_collapsed_read_fractions(self, lib_settings):
        """Compute (or load from cache) per-sequence fractions of collapsed reads.

        The gzipped collapsed-reads file is scanned for header lines that
        begin with ">"; the integer in the second "-"-separated field of each
        header is taken as that sequence's read count. Counts are divided by
        their total and pickled under ``<rdir>/QC/collapsed_fracs``. The
        pickle is reused only if it exists and "force_recollapse" is not set.

        Args:
            lib_settings: library settings object providing ``sample_name``
                and ``get_collapsed_reads()``.

        Returns:
            Tuple ``(sample_name, read_fractions)``.
        """
        out_name = os.path.join(
            self.experiment_settings.get_rdir(),
            "QC",
            "collapsed_fracs",
            "%(sample_name)s.collapsed_read_fractions.pkl" % {"sample_name": lib_settings.sample_name},
        )
        # The cache is valid only when present and not overridden. The former
        # "not exists and not force" test skipped recomputation whenever the
        # force flag was set, and unpickled a missing file if it was absent.
        cache_usable = tps_utils.file_exists(out_name) and not self.experiment_settings.get_property(
            "force_recollapse"
        )
        if not cache_usable:
            read_counts = []
            # "rt" yields str lines on Python 3 (the default mode yields bytes,
            # which breaks the str prefix checks); "with" closes on error too.
            with gzip.open(lib_settings.get_collapsed_reads(), "rt") as f:
                for line in f:
                    if not line.startswith(">"):
                        continue  # blank, "#"-commented, or sequence line
                    num_reads = int(line[1:].strip().split("-")[1])
                    read_counts.append(num_reads)
            read_fractions = np.array(read_counts) / float(sum(read_counts))
            bzUtils.makePickle(read_fractions, out_name)
        else:
            read_fractions = bzUtils.unPickle(out_name)

        return (lib_settings.sample_name, read_fractions)
Esempio n. 4
0
 def initialize_pool_sequence_mappings(self, mapq_cutoff=30):
     """Fill ``self.pool_sequence_mappings`` from mapped reads or from the cache.

     If the 'force_recount' property is set or the sequence-counts file is
     absent, a ``pool_sequence_mapping`` is built for each trimmed pool
     sequence, all reads fetched from the mapped-reads file are passed to a
     ``read_assigner``, library fractions are computed, ``is_only_tl`` is
     set on each mapping, and the mappings are pickled. Otherwise the
     pickled mappings are loaded instead.

     Args:
         mapq_cutoff: value forwarded to ``read_assigner`` (default 30).
     """
     if self.get_property('force_recount') or not self.lib_settings.sequence_counts_exist():
         trimmed_sequences = bzUtils.convertFastaToDict(self.experiment_settings.get_trimmed_pool_fasta())
         gene_names = []
         for sequence_name in trimmed_sequences:
             # TL names are assumed to be of type: YLR350W_-68_651_116
             gene_names.append(sequence_name.split('_')[0])
             self.pool_sequence_mappings[sequence_name] = pool_sequence_mapping(sequence_name, trimmed_sequences[sequence_name])

         samfile = pysam.Samfile(self.lib_settings.get_mapped_reads(), "rb")
         try:
             ra = read_assigner(self.pool_sequence_mappings, samfile, mapq_cutoff)
             for aligned_read in samfile.fetch():
                 ra.assign_read(aligned_read)
         finally:
             # Always close the alignment file, including when assignment raises.
             samfile.close()

         self.compute_lib_fractions()

         # is_only_tl is True when the gene prefix of a mapping's name occurs
         # exactly once among the trimmed pool sequence names.
         gene_counts = Counter(gene_names)
         for mapping_name in self.pool_sequence_mappings:
             n_for_gene = gene_counts[mapping_name.split('_')[0]]
             if n_for_gene == 1:
                 self.pool_sequence_mappings[mapping_name].is_only_tl = True
             else:
                 # A count of zero would mean the mapping has no pool sequence.
                 assert n_for_gene != 0
                 self.pool_sequence_mappings[mapping_name].is_only_tl = False
         bzUtils.makePickle(self.pool_sequence_mappings, self.lib_settings.get_sequence_counts())
     else:
         self.pool_sequence_mappings = bzUtils.unPickle(self.lib_settings.get_sequence_counts())