Esempio n. 1
0
    def get_CDSs(self, force_all=False):
        """Load the selected CDSs and the longest transcript length among them.

        All CDSs are read with ``gff.get_CDSs`` from ``self.file_names['genes']``
        and ``self.file_names['genome']``. If ``self.transcripts_file_name`` is
        not None, only CDSs whose ``name`` appears (one per line, surrounding
        whitespace stripped) in that file are kept.

        Args:
            force_all: if true, return every selected CDS; otherwise return
                ``piece_of_list(CDSs, self.num_pieces, self.which_piece)``.

        Returns:
            A ``(piece_CDSs, max_gene_length)`` tuple. ``max_gene_length`` is
            the largest ``transcript_length`` over *all* selected CDSs (not
            only the returned piece), or 0 when none are selected.
        """
        all_CDSs = gff.get_CDSs(
            self.file_names['genes'],
            self.file_names['genome'],
        )

        # Identity comparison is the correct test for the None singleton.
        if self.transcripts_file_name is None:
            CDSs = all_CDSs
        else:
            # Read the list of wanted names inside a context manager so the
            # file handle is closed as soon as the set has been built.
            with open(self.transcripts_file_name) as transcripts_fh:
                transcripts = {line.strip() for line in transcripts_fh}
            CDSs = [t for t in all_CDSs if t.name in transcripts]

        max_gene_length = 0
        for CDS in CDSs:
            # The coordinate maps are built only for the duration of the
            # length lookup and dropped again right afterwards (presumably
            # transcript_length depends on them -- confirm in the gff module).
            CDS.build_coordinate_maps()
            max_gene_length = max(max_gene_length, CDS.transcript_length)
            CDS.delete_coordinate_maps()

        if force_all:
            piece_CDSs = CDSs
        else:
            piece_CDSs = piece_of_list(CDSs, self.num_pieces, self.which_piece)

        return piece_CDSs, max_gene_length
Esempio n. 2
0
def produce_transcript_base_compositions(
        gff_fn='/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff',
        genome_dir='/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/',
        composition_fn='/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5',
        windows=(5, 10, 20),
        left_buffer=500,
        right_buffer=500):
    """Count the 'A' bases in sliding windows along every transcript.

    For each CDS returned by ``gff.get_CDSs(gff_fn, genome_dir)`` and each
    window size, a ``positions.PositionCounts`` is filled so that the entry
    at ``('start', left_edge)`` holds the number of positions in
    ``[left_edge, left_edge + window)`` whose base equals ``'A'``. The
    resulting ``{transcript name: {window: PositionCounts}}`` mapping is
    written with ``Serialize.read_positions.write_file``.

    Called with no arguments, this behaves exactly like the previous
    hard-coded version; every parameter defaults to the former constant.

    Args:
        gff_fn: path of the gene annotation file passed to ``gff.get_CDSs``.
        genome_dir: genome location passed to ``gff.get_CDSs``.
        composition_fn: output path handed to ``write_file``.
        windows: iterable of window widths to tabulate.
        left_buffer: number of positions before the 'start' landmark to
            include in the sequence and in every ``PositionCounts``.
        right_buffer: number of positions of right-hand buffer to include.
    """
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    genes = {}

    for transcript in utilities.progress_bar(len(CDSs), CDSs):
        genes[transcript.name] = {}
        transcript.build_coordinate_maps()
        landmarks = {
            'start': 0,
            'start_codon': transcript.transcript_start_codon,
            'stop_codon': transcript.transcript_stop_codon,
            'end': transcript.transcript_length,
        }
        sequence = transcript.get_transcript_sequence(left_buffer,
                                                      right_buffer)

        # Boolean mask over the buffered sequence: True where the base is 'A'.
        A_locations = positions.PositionCounts(
            landmarks,
            left_buffer,
            right_buffer,
            data=(sequence.data == 'A'),
        )
        for window in windows:
            recent_As = positions.PositionCounts(
                landmarks,
                left_buffer,
                right_buffer,
            )
            # NOTE(review): the upper bound uses CDS_length although the
            # 'end' landmark is transcript_length -- confirm this is intended.
            for left_edge in range(
                    -left_buffer,
                    transcript.CDS_length + right_buffer - window):
                num_As = sum(A_locations['start',
                                         left_edge:left_edge + window])
                recent_As['start', left_edge] = num_As

            genes[transcript.name][window] = recent_As

        # Drop the per-transcript coordinate maps once this transcript is done.
        transcript.delete_coordinate_maps()

    Serialize.read_positions.write_file(genes, composition_fn)
Esempio n. 3
0
    def get_CDSs(self, force_all=False):
        """Return this instance's CDSs together with the longest transcript length.

        CDSs are loaded via ``gff.get_CDSs`` using the ``"genes"`` and
        ``"genome"`` entries of ``self.file_names``. When
        ``self.transcripts_file_name`` is set, the list is restricted to CDSs
        whose ``name`` matches a (whitespace-stripped) line of that file.

        Args:
            force_all: if true, skip the ``piece_of_list`` split and return
                every selected CDS.

        Returns:
            ``(piece_CDSs, max_gene_length)`` where ``max_gene_length`` is the
            maximum ``transcript_length`` across all selected CDSs, computed
            before any splitting (0 if nothing was selected).
        """
        all_CDSs = gff.get_CDSs(self.file_names["genes"], self.file_names["genome"])

        # `is None` rather than `== None`: None is a singleton.
        if self.transcripts_file_name is None:
            CDSs = all_CDSs
        else:
            # Use a context manager so the names file is closed promptly
            # instead of being left to the garbage collector.
            with open(self.transcripts_file_name) as transcripts_fh:
                transcripts = {line.strip() for line in transcripts_fh}
            CDSs = [t for t in all_CDSs if t.name in transcripts]

        max_gene_length = 0
        for CDS in CDSs:
            # Maps are built just long enough to read transcript_length, then
            # deleted again before moving on to the next CDS.
            CDS.build_coordinate_maps()
            max_gene_length = max(max_gene_length, CDS.transcript_length)
            CDS.delete_coordinate_maps()

        if force_all:
            piece_CDSs = CDSs
        else:
            piece_CDSs = piece_of_list(CDSs, self.num_pieces, self.which_piece)

        return piece_CDSs, max_gene_length
def produce_transcript_base_compositions():
    """Tabulate windowed 'A' counts for every transcript and write them to disk.

    For each CDS and each window width in [5, 10, 20], the value stored at
    ``("start", left_edge)`` is the number of 'A' bases found in the
    half-open interval ``[left_edge, left_edge + window)``. The nested
    ``{name: {window: PositionCounts}}`` mapping is written with
    ``Serialize.read_positions.write_file``.
    """
    gff_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    composition_fn = (
        "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
    )
    all_CDSs = gff.get_CDSs(gff_fn, genome_dir)

    upstream = 500
    downstream = 500
    compositions = {}

    window_sizes = [5, 10, 20]

    for cds in utilities.progress_bar(len(all_CDSs), all_CDSs):
        per_window = {}
        compositions[cds.name] = per_window
        cds.build_coordinate_maps()
        landmarks = {
            "start": 0,
            "start_codon": cds.transcript_start_codon,
            "stop_codon": cds.transcript_stop_codon,
            "end": cds.transcript_length,
        }
        buffered_sequence = cds.get_transcript_sequence(upstream, downstream)

        # Mask over the buffered sequence: True wherever the base is "A".
        is_A = buffered_sequence.data == "A"
        A_locations = positions.PositionCounts(landmarks, upstream, downstream, data=is_A)

        for width in window_sizes:
            window_counts = positions.PositionCounts(landmarks, upstream, downstream)
            stop = cds.CDS_length + downstream - width
            for left_edge in range(-upstream, stop):
                right_edge = left_edge + width
                window_counts["start", left_edge] = sum(A_locations["start", left_edge:right_edge])

            per_window[width] = window_counts

        # Release the coordinate maps before moving to the next transcript.
        cds.delete_coordinate_maps()

    Serialize.read_positions.write_file(compositions, composition_fn)
            total = gene["all"]["start_codon", xs].sum()
            if total == 0:
                print transcript
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))


# Script entry point: loads the CDS annotations, opens the read-position and
# base-composition HDF5 files, and starts iterating over the transcripts.
# NOTE(review): the loop body below appears to be cut off in this view; the
# rest of the per-transcript processing is not visible here.
if __name__ == "__main__":
    gff_fn = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff"
    genome_dir = "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/"
    composition_fn = (
        "/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5"
    )
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    # Imported here rather than at module top, so it is only loaded when the
    # file is run as a script.
    import select_work

    exps = select_work.build_all_experiments(verbose=False)

    # Read positions come from one specific experiment/sample pair.
    reads_fn = exps["belgium_2014_12_10"]["WT_1_mRNA"].file_names["three_prime_read_positions"]
    # Opened read-only. NOTE(review): no matching close() is visible in this
    # chunk -- confirm the handle is released further down.
    reads_fh = h5py.File(reads_fn, "r")

    # Accumulator with a single landmark "A" at 0 and 100000 positions of
    # buffer on each side (presumably for aggregating counts around each A --
    # confirm against the code that fills it).
    meta_counts = positions.PositionCounts({"A": 0}, left_buffer=100000, right_buffer=100000)

    # Composition file at the same path that produce_transcript_base_compositions
    # writes to; also opened read-only with no close() visible here.
    f = h5py.File(composition_fn, "r")
    for t in utilities.progress_bar(len(CDSs), CDSs):
        # Skip transcripts that have no entry in the reads file.
        if t.name not in reads_fh:
            continue
        # Rebuild this transcript's stored composition data from its HDF5 entry.
        gene = Serialize.read_positions.build_gene(f[t.name])
Esempio n. 6
0
            argmaxes[name][argmax] += 1
            most = gene['all']['start_codon', argmax]
            total = gene['all']['start_codon', xs].sum()
            if total == 0:
                print transcript
            if total > 9:
                fraction = np.true_divide(most, total)
                fractions[name].append(fraction)
                joints[name].append((argmax, fraction))


if __name__ == '__main__':
    gff_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gff'
    genome_dir = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/genome/'
    composition_fn = '/home/jah/projects/ribosomes/data/organisms/saccharomyces_cerevisiae/EF4/transcript_recent_As.hdf5'
    CDSs = gff.get_CDSs(gff_fn, genome_dir)

    import select_work
    exps = select_work.build_all_experiments(verbose=False)

    reads_fn = exps['belgium_2014_12_10']['WT_1_mRNA'].file_names[
        'three_prime_read_positions']
    reads_fh = h5py.File(reads_fn, 'r')

    meta_counts = positions.PositionCounts({'A': 0},
                                           left_buffer=100000,
                                           right_buffer=100000)

    f = h5py.File(composition_fn, 'r')
    for t in utilities.progress_bar(len(CDSs), CDSs):
        if t.name not in reads_fh: