Esempio n. 1
0
def produce_transcript_base_compositions():
    ''' For every CDS transcript, count the As in each sliding window of 5, 10
    and 20 bases and write the per-gene, per-window results to composition_fn.

    The count for a window is stored at the position of the window's left
    edge, relative to the 'start' landmark. All input and output paths are
    hard-coded below.
    '''
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'

    window_sizes = [5, 10, 20]
    left_buffer = right_buffer = 500

    CDSs = gff.get_CDSs(gff_fn, genome_dir)
    genes = {}

    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        # Maps window size -> PositionCounts of A counts for this transcript.
        counts_by_window = genes[transcript.name] = {}

        transcript.build_coordinate_maps()
        landmarks = dict(
            start=0,
            start_codon=transcript.transcript_start_codon,
            stop_codon=transcript.transcript_stop_codon,
            end=transcript.transcript_length,
        )
        buffered_sequence = transcript.get_transcript_sequence(left_buffer,
                                                               right_buffer)

        # Boolean mask over the buffered transcript: True wherever the base
        # is an A.
        is_A = positions.PositionCounts(
            landmarks,
            left_buffer,
            right_buffer,
            data=(buffered_sequence.data == 'A'),
        )

        for window in window_sizes:
            window_counts = positions.PositionCounts(
                landmarks,
                left_buffer,
                right_buffer,
            )
            # NOTE(review): the upper bound uses CDS_length although positions
            # here are relative to 'start' and the 'end' landmark is
            # transcript_length - confirm this is the intended extent.
            past_last_edge = transcript.CDS_length + right_buffer - window
            for left_edge in range(-left_buffer, past_last_edge):
                in_window = slice(left_edge, left_edge + window)
                window_counts['start', left_edge] = sum(is_A['start',
                                                             in_window])

            counts_by_window[window] = window_counts

        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
Esempio n. 2
0
    def distribute_analytically(self):
        ''' Build simulated codon counts for this worker's piece of the gene
        list without simulating individual messages.

        For each gene, every codon position in the CDS slice gets a binomial
        draw whose number of trials is the gene's total real 'relaxed' count
        over that slice and whose success probability is that codon's rate
        divided by the sum of the rates over the slice. Results are written
        as 'simulated_codon_counts'.
        '''
        buffered_codon_counts = self.template_experiment.read_file('buffered_codon_counts')

        # Restrict to the genes assigned to this piece of the parallel job.
        piece_gene_names = Sequencing.Parallel.piece_of_list(
            sorted(buffered_codon_counts),
            self.num_pieces,
            self.which_piece,
        )

        cds_slice = slice('start_codon', ('stop_codon', 1))
        simulated_codon_counts = {}

        for gene_name in piece_gene_names:
            gene_counts = buffered_codon_counts[gene_name]
            identities = gene_counts['identities']
            codon_sequence = identities[cds_slice]

            total_real_counts = sum(gene_counts['relaxed'][cds_slice])

            # NOTE(review): codon_rates is not bound anywhere in this method;
            # it must be supplied by the enclosing module - confirm.
            rates_array = np.array([codon_rates[codon_id] for codon_id in codon_sequence])
            fractions_array = rates_array / sum(rates_array)

            simulated_counts = positions.PositionCounts(
                identities.landmarks,
                identities.left_buffer,
                identities.right_buffer,
            )

            # Each position is drawn independently, so the simulated total is
            # not forced to equal total_real_counts.
            for position, fraction in enumerate(fractions_array):
                simulated_counts['start_codon', position] = np.random.binomial(total_real_counts, fraction)

            simulated_codon_counts[gene_name] = {
                'identities': identities,
                'relaxed': simulated_counts,
            }

        self.write_file('simulated_codon_counts', simulated_codon_counts)
Esempio n. 3
0
    def simulate(self):
        ''' Simulate codon-level measurements for this worker's piece of the
        gene list and write them as 'simulated_codon_counts'.

        For each gene, Message objects are created and evolved one at a time,
        and their measurements are accumulated until the number collected
        reaches the total of the gene's real 'relaxed' counts over the CDS
        slice, rounded up. If self.perturbation_model is None, CHX is
        introduced directly; otherwise the message is evolved under the named
        perturbation model. Only the 'change_all' model loads a second set of
        codon means, from self.new_rates_experiment.
        '''
        buffered_codon_counts = self.template_experiment.read_file('buffered_codon_counts')

        codon_means = self.load_codon_means(self.template_experiment)
        if self.perturbation_model == 'change_all':
            perturbed_codon_means = self.load_codon_means(self.new_rates_experiment)
        else:
            perturbed_codon_means = None

        TEs = self.load_TEs()
        initiation_means = {gene_name: self.initiation_mean_numerator / TEs[gene_name] for gene_name in buffered_codon_counts}

        # Restrict to the genes assigned to this piece of the parallel job.
        all_gene_names = sorted(buffered_codon_counts)
        piece_gene_names = Sequencing.Parallel.piece_of_list(all_gene_names,
                                                             self.num_pieces,
                                                             self.which_piece,
                                                            )

        simulated_codon_counts = {}
        cds_slice = slice('start_codon', ('stop_codon', 1))
        for i, gene_name in enumerate(piece_gene_names):
            logging.info('Starting {0} ({1:,} / {2:,})'.format(gene_name, i, len(piece_gene_names) - 1))
            identities = buffered_codon_counts[gene_name]['identities']
            codon_sequence = identities[cds_slice]

            real_counts = buffered_codon_counts[gene_name]['relaxed'][cds_slice]
            total_real_counts = sum(real_counts)
            target = int(np.ceil(total_real_counts))

            all_measurements = Counter()
            num_messages = 0
            # Running total of collected measurements, refreshed once after
            # each update instead of being recomputed for every check and
            # log line.
            num_counts = 0
            while num_counts < target:
                message = Message(codon_sequence, initiation_means[gene_name], codon_means, self.CHX_mean, perturbed_codon_means=perturbed_codon_means)
                message.evolve_to_steady_state()

                if self.perturbation_model is None:
                    message.introduce_CHX()
                else:
                    message.evolve_perturbed_CHX_model(self.perturbation_model)

                all_measurements.update(message.collect_measurements())
                num_counts = sum(all_measurements.values())
                num_messages += 1

                if num_messages % 10000 == 0:
                    logging.info('{0:,} counts generated for {1} from {2:,} messages (target = {3})'.format(num_counts, gene_name, num_messages, target))

            simulated_counts = positions.PositionCounts(identities.landmarks,
                                                        identities.left_buffer,
                                                        identities.right_buffer,
                                                       )

            # Measurement keys are used as positions relative to the start
            # codon.
            for key, value in all_measurements.items():
                simulated_counts['start_codon', key] = value

            simulated_codon_counts[gene_name] = {'identities': identities,
                                                 'relaxed': simulated_counts,
                                                }
            logging.info('{0:,} counts generated for {1} from {2:,} messages'.format(num_counts, gene_name, num_messages))

        self.write_file('simulated_codon_counts', simulated_codon_counts)
Esempio n. 4
0
    def get_transcript_sequence(self, left_buffer=0, right_buffer=0):
        ''' Return the bases of the mature transcript, plus left_buffer
        positions before it and right_buffer positions after it, as a
        PositionCounts holding a character array.
        '''
        # Rebuild the coordinate maps so that they cover both buffers.
        self.build_coordinate_maps(left_buffer, right_buffer)

        first = -left_buffer
        past_last = self.transcript_length + right_buffer
        genomic_positions = [
            self.transcript_to_genomic[offset]
            for offset in range(first, past_last)
        ]

        # Bases are fetched one at a time, in transcript order.
        fetched = ''.join([
            self.region_fetcher(self.seqname, g, g + 1)
            for g in genomic_positions
        ])
        bases = fetched.upper()
        if self.strand == '-':
            # Only complemented, not reversed: the bases were already fetched
            # in transcript order.
            bases = utilities.complement(bases)
        base_array = np.asarray(bases, dtype='c')

        return positions.PositionCounts(
            {
                'start': 0,
                'start_codon': self.transcript_start_codon,
                'stop_codon': self.transcript_stop_codon,
                'end': self.transcript_length,
            },
            left_buffer,
            right_buffer,
            data=base_array,
        )
Esempio n. 5
0
    def get_extent_sequence(self, left_buffer=0, right_buffer=0):
        ''' Return the sequence of the whole extent as a PositionCounts
        holding a character array. Useful for looking at genes with annotated
        frameshifts.
        '''
        genomic_coordinates = self.genomic_to_extent
        first = min(genomic_coordinates)
        past_last = max(genomic_coordinates) + 1

        bases = self.region_fetcher(self.seqname, first, past_last)
        if self.strand == '-':
            bases = utilities.reverse_complement(bases)
        base_array = np.asarray(bases, dtype='c')

        # NOTE(review): the fetched region covers only the extent itself, yet
        # the buffers are passed through to PositionCounts - confirm that
        # nonzero buffers are handled as intended.
        return positions.PositionCounts(
            {'start': 0, 'end': self.extent_length},
            left_buffer,
            right_buffer,
            data=base_array,
        )
Esempio n. 6
0
    def record_uniqueness(self):
        ''' Record, per transcript position, whether a fragment of length
        self.fragment_length was mapped uniquely, and write the result as
        'uniqueness'.

        Reads come from the 'accepted_hits' BAM file. Each read's name is
        parsed to get the transcript and position it is annotated as coming
        from. Values stored relative to 'start_codon' are:
            0 - no read recorded for this position,
            1 - mapped with MAPQ >= 50 and never flagged as nonunique,
            2 - nonunique (some read with MAPQ < 50 came from or landed here).

        Raises ValueError if a read with MAPQ exactly 50 maps somewhere other
        than its annotated position.
        '''
        CDSs, _ = self.get_CDSs()
        uniqueness = {}
        transcripts = {}

        # For any genomic position that participates in a transcript, this will
        # contain a mapping to a set of all transcripts it participates in.
        genomic_to_all_transcripts = defaultdict(set)

        for transcript in CDSs:
            landmarks = {
                'start': 0,
                'start_codon': transcript.transcript_start_codon,
                'stop_codon': transcript.transcript_stop_codon,
                'end': transcript.transcript_length,
            }
            uniqueness[transcript.name] = {
                self.fragment_length:
                positions.PositionCounts(landmarks, self.common_buffer,
                                         self.common_buffer)
            }
            transcript.build_coordinate_maps(left_buffer=self.common_buffer,
                                             right_buffer=self.common_buffer)
            transcripts[transcript.name] = transcript

            for genomic_position, transcript_position in transcript.genomic_to_transcript.iteritems():
                full_position = (transcript.seqname, transcript.strand,
                                 genomic_position)
                genomic_to_all_transcripts[full_position].add(
                    (transcript.name, transcript_position))

        bam_file = pysam.Samfile(self.file_names['accepted_hits'])
        # Make sure the BAM handle is released even if a read fails the
        # consistency check below.
        try:
            for read in bam_file:
                # If this read was incorrectly trimmed, don't record it.
                if read.qlen != self.fragment_length:
                    continue

                annotation = artifical_annotation.from_prefix_identifier(
                    read.qname)
                true_transcript = transcripts[annotation['transcript_name']]
                true_position = annotation['position']
                strand = '-' if read.is_reverse else '+'
                if strand == '+':
                    five_prime = read.pos
                else:
                    five_prime = read.aend - 1
                full_mapped_position = (bam_file.getrname(read.tid), strand,
                                        five_prime)

                if read.mapq < 50:
                    # Flag the true source of the read as nonunique.
                    uniqueness[true_transcript.name][self.fragment_length][
                        'start_codon', true_position] = 2

                    # Hopefully redundantly, flag the position actually mapped
                    # to as nonunique.
                    for transcript_name, transcript_position in genomic_to_all_transcripts[
                            full_mapped_position]:
                        uniqueness[transcript_name][self.fragment_length][
                            'start_codon', transcript_position] = 2
                else:
                    # Check that any read with a MAPQ of 50 is to the expected
                    # position.
                    full_true_position = (
                        true_transcript.seqname,
                        true_transcript.strand,
                        true_transcript.transcript_to_genomic[true_position],
                    )

                    if read.mapq == 50 and (full_mapped_position !=
                                            full_true_position):
                        raise ValueError(full_mapped_position,
                                         full_true_position)

                    # As long as this hasn't been mapped to by some other
                    # fragment, mark it as unique.
                    if uniqueness[true_transcript.name][self.fragment_length][
                            'start_codon', true_position] == 0:
                        uniqueness[true_transcript.name][self.fragment_length][
                            'start_codon', true_position] = 1
        finally:
            bam_file.close()

        self.write_file('uniqueness', uniqueness)
Esempio n. 7
0
def plot_mRNA_metagene_unaveraged(from_end, min_length, max_length):
    bmap = brewer2mpl.get_map('Set1', 'qualitative', 9)
    colors = cycle(bmap.mpl_colors[:5] + bmap.mpl_colors[6:])

    experiments = select_work.build_all_experiments(verbose=False)
    mRNA_experiments = [  #('WT_mRNA_1', 'polyA', 0, experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'polyA', 'nonzero', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'stop_codon', 0, experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'stop_codon', 'nonzero', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'start', 'all', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_mRNA_1', 'start_codon', 'all', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        #('WT_cDNA_mRNA', 'cap', 'all', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
        #('WT_cDNA_mRNA', 'start_codon', 'all', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
        #('R98S_1_mRNA', 'cap', 'all', experiments['belgium_2014_12_10']['R98S_1_mRNA']),
        #('R98S_1_mRNA', 'start_codon', 'all', experiments['belgium_2014_12_10']['R98S_1_mRNA']),
        ##('WT_mRNA_1 3\'', experiments['belgium_2014_12_10']['WT_1_mRNA']),
        ##('WT_mRNA_2 3\'', experiments['belgium_2014_12_10']['WT_2_mRNA']),
        ##('WT_cDNA_mRNA 3\'', experiments['belgium_2013_08_06']['WT_cDNA_mRNA']),
        #('RiboZero', 'polyA', 0, experiments['weinberg']['RiboZero']),
        #('RiboZero', 'polyA', 'nonzero', experiments['weinberg']['RiboZero']),
        ('RiboZero', 'start', 'all', experiments['weinberg']['RiboZero']),
        ('RiboZero', 'start_codon', 'all',
         experiments['weinberg']['RiboZero']),
        ##('RiboZero', 'stop_codon', 0, experiments['weinberg']['RiboZero']),
        ##('RiboZero', 'stop_codon', 'nonzero', experiments['weinberg']['RiboZero']),
        #('Dynabeads', 'polyA', 0, experiments['weinberg']['Dynabeads']),
        #('Dynabeads', 'polyA', 'nonzero', experiments['weinberg']['Dynabeads']),
        #('Dynabeads', 'cap', 'all', experiments['weinberg']['Dynabeads']),
        #('Dynabeads', 'start_codon', 'all', experiments['weinberg']['Dynabeads']),
        ##('Dynabeads', 'stop_codon', 0, experiments['weinberg']['Dynabeads']),
        ##('Dynabeads', 'stop_codon', 'nonzero', experiments['weinberg']['Dynabeads']),
    ]

    plot_to = 500
    fig_cumulative, ax_cumulative = plt.subplots()

    edge_buffer = 200
    if from_end:
        xs = np.arange(-plot_to, edge_buffer)
    else:
        xs = np.arange(-edge_buffer, plot_to)

    unexpected_counts = {}

    for (name, landmark, key,
         experiment), color in zip(mRNA_experiments, colors):
        print name, landmark, key

        if from_end:
            counts_generator = counts_from_read_positions_fn(
                experiment.file_names['three_prime_read_positions'], key=key)
        else:
            counts_generator = counts_from_read_positions_fn(
                experiment.file_names['read_positions'], key='all')

        landmarks = {
            'start': 0,
            'start_codon': 0,
            'stop_codon': 90000,
            'end': 90000
        }
        expected_counts = positions.PositionCounts(landmarks,
                                                   400,
                                                   400,
                                                   dtype=float)
        actual_counts = positions.PositionCounts(landmarks,
                                                 400,
                                                 400,
                                                 dtype=float)

        for gene_name, counts in counts_generator:
            if not min_length <= counts.CDS_length <= max_length:
                continue

            num_positions = counts.CDS_length + edge_buffer

            if from_end:
                edge_slice = (landmark, slice(-counts.CDS_length, edge_buffer))
            else:
                edge_slice = (landmark, slice(-edge_buffer, counts.CDS_length))
                unexpected_slice = (landmark, slice(-edge_buffer, 0))

            r_g = counts[edge_slice].sum()
            uniform_counts = np.ones(num_positions) * r_g / num_positions

            actual_counts[edge_slice] += counts[edge_slice]
            expected_counts[edge_slice] += uniform_counts

            unexpected_counts[gene_name] = counts[unexpected_slice].sum()

        print actual_counts.sum()
        print expected_counts.sum()

        most_unexpected = sorted(unexpected_counts,
                                 key=unexpected_counts.get,
                                 reverse=True)

        for n in most_unexpected[:10]:
            print n, unexpected_counts[n]

        if from_end:
            plot_slice = (landmark, slice(-plot_to, edge_buffer))
        else:
            plot_slice = ('start_codon', slice(-edge_buffer, plot_to))

        ax_cumulative.plot(xs, expected_counts[plot_slice], '--', color=color)
        ax_cumulative.plot(xs,
                           actual_counts[plot_slice],
                           'o-',
                           color=color,
                           markersize=2,
                           markeredgewidth=0,
                           label='{0}, {1}, {2}, actual'.format(
                               name, landmark, key))
        #ax_cumulative.plot(xs, smoothed(actual_counts[-49:plot_to], 15) / expected_counts[0], '-', label='{0}'.format(name), color=color)
        #ax_cumulative.set_ylim(0.8, 1.5)

    #ax_cumulative.plot(xs, np.zeros(plot_to), 'k--')
    ax_cumulative.legend(loc='upper left', framealpha=0.5)
    if from_end:
        xlabel = 'Position relative to {0}'.format(landmark)
    else:
        xlabel = 'Position relative to start of CDS'
    ax_cumulative.set_xlabel(xlabel)
    ax_cumulative.set_xlim(min(xs), max(xs))
    ax_cumulative.set_ylabel('Mapped read counts, normalized across data sets')
    #ax_cumulative.set_title('Read counts in the final {0} bases of CDSs at least {0} long'.format(min_length))

    fig_cumulative.set_size_inches(18, 12)
Esempio n. 8
0
if __name__ == '__main__':
    gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    import select_work
    exps = select_work.build_all_experiments(verbose=False)

    reads_fn = exps['belgium_2014_12_10']['WT_1_mRNA'].file_names[
        'three_prime_read_positions']
    reads_fh = h5py.File(reads_fn, 'r')

    meta_counts = positions.PositionCounts({'A': 0},
                                           left_buffer=100000,
                                           right_buffer=100000)

    f = h5py.File(composition_fn, 'r')
    for t in utilities.progress_bar(len(CDSs), CDSs):
        if t.name not in reads_fh:
            continue
        gene = Serialize.read_positions.build_gene(f[t.name])
        t.build_coordinate_maps()

        if t.transcript_length < 301:
            continue
        end = t.transcript_length - 200
        sl = ('start', np.arange(100, end))
        A_rich_position = gene[10].argmax_over_slice(*sl)
        if gene[10]['start', A_rich_position] > 9: