Ejemplo n.º 1
0
def select_transcripts(options):
    """Select the most highly translated, mutually non-overlapping transcripts.

    Transcripts are ranked by footprint density (total ribosome footprint
    count divided by ``transcript.mask.sum()``) and visited from highest to
    lowest.  A transcript is kept when every one of its exons has at least
    5 footprints and its ``[start, stop]`` interval does not intersect the
    interval of any previously kept transcript on the same chromosome
    (strand is not considered).  Selection stops once ``options.batch``
    transcripts have been kept.

    Args:
        options: object providing ``gtf_file``, ``riboseq_file`` and
            ``batch`` attributes.

    Returns:
        list of selected transcript objects, in decreasing order of
        footprint density.
    """
    # load all transcripts; materialise the values as a list because they
    # are indexed by position below (a dict view cannot be indexed)
    transcript_models = list(load_data.load_gtf(options.gtf_file).values())

    ribo_track = load_data.RiboSeq(options.riboseq_file)
    try:
        # get translation level in all transcripts
        total_counts = ribo_track.get_total_counts(transcript_models)
        transcript_translation_rate = [
            count / float(transcript.mask.sum())
            for count, transcript in zip(total_counts, transcript_models)
        ]

        # select top transcripts
        transcripts = []
        transcript_bounds = dict()  # chromosome -> list of [start, stop]
        for index in np.argsort(transcript_translation_rate)[::-1]:
            transcript = transcript_models[index]

            # check if all exons have at least 5 footprints
            exon_counts = ribo_track.get_exon_total_counts([transcript])[0]
            if np.any(exon_counts < 5):
                continue

            # check if transcript overlaps any previously selected
            # transcript on the same chromosome; filter out strict overlaps
            bounds = transcript_bounds.setdefault(transcript.chromosome, [])
            if any(not (transcript.stop < start or transcript.start > stop)
                   for start, stop in bounds):
                continue

            transcripts.append(transcript)
            bounds.append([transcript.start, transcript.stop])

            # select fixed number of transcripts for learning
            if len(transcripts) >= options.batch:
                break
    finally:
        # release the footprint track even if selection fails part-way
        ribo_track.close()

    return transcripts
Ejemplo n.º 2
0
def select_transcripts(options):
    """Pick the top translated transcripts that do not overlap each other.

    Every transcript in ``options.gtf_file`` is scored by its total
    footprint count divided by ``transcript.mask.sum()``.  Transcripts are
    then examined from the highest score down; one is accepted only if all
    of its exons carry at least 5 footprints and its ``[start, stop]``
    range does not intersect an already accepted transcript on the same
    chromosome (strand is ignored).  The scan ends as soon as
    ``options.batch`` transcripts have been accepted.

    Args:
        options: object providing ``gtf_file``, ``riboseq_file`` and
            ``batch`` attributes.

    Returns:
        list of accepted transcript objects, highest score first.
    """
    # load all transcripts; a list is required because transcripts are
    # looked up by integer position further down
    transcript_models = list(load_data.load_gtf(options.gtf_file).values())

    ribo_track = load_data.RiboSeq(options.riboseq_file)
    try:
        # get translation level in all transcripts
        total_counts = ribo_track.get_total_counts(transcript_models)
        transcript_translation_rate = [
            c / float(t.mask.sum())
            for c, t in zip(total_counts, transcript_models)
        ]

        # select top transcripts
        transcripts = []
        transcript_bounds = dict()  # chromosome -> accepted [start, stop] pairs
        order = np.argsort(transcript_translation_rate)[::-1]
        for index in order:
            transcript = transcript_models[index]

            # check if all exons have at least 5 footprints
            exon_counts = ribo_track.get_exon_total_counts([transcript])[0]
            if np.any(exon_counts < 5):
                continue

            # check if transcript overlaps any previous transcript
            # filter out strict overlaps
            chrom_bounds = transcript_bounds.setdefault(transcript.chromosome, [])
            overlap = any(
                not (transcript.stop < bound[0] or transcript.start > bound[1])
                for bound in chrom_bounds
            )
            if overlap:
                continue

            transcripts.append(transcript)
            chrom_bounds.append([transcript.start, transcript.stop])

            # select fixed number of transcripts for learning
            if len(transcripts) >= options.batch:
                break
    finally:
        # always release the footprint track, including on errors
        ribo_track.close()

    return transcripts
Ejemplo n.º 3
0
def _infer_strand(handle, alltranscripts, options, genome_track, ribo_track,
                  rnaseq_track, transition, emission):
    """Run inference on transcripts in their current orientation.

    Transcripts whose exons all have at least 5 footprints are passed to
    ``ribohmm.infer_coding_sequence`` and one result per transcript is
    written to ``handle`` via ``write_inferred_cds``.  Nothing is written
    when no transcript passes the footprint filter.
    """
    # check if all exons have at least 5 footprints
    exon_counts = ribo_track.get_exon_total_counts(alltranscripts)
    transcripts = [
        t for t, e in zip(alltranscripts, exon_counts) if np.all(e >= 5)
    ]
    T = len(transcripts)
    if T == 0:
        return

    # load sequence of transcripts and transform sequence data
    rna_sequences = genome_track.get_sequence(transcripts)
    codon_flags = [
        seq.RnaSequence(rna_sequence).mark_codons()
        for rna_sequence in rna_sequences
    ]

    # load footprint count data in transcripts
    footprint_counts = ribo_track.get_counts(transcripts)

    # load transcript-level rnaseq RPKM; fall back to a constant of 1 per
    # transcript when no RNA-seq data was supplied
    if rnaseq_track is None:
        rna_counts = np.ones((T, ), dtype='float')
    else:
        rna_counts = rnaseq_track.get_total_counts(transcripts)

    # load mappability of transcripts; transform mappability to missingness
    if options.mappability_file is not None:
        rna_mappability = genome_track.get_mappability(transcripts)
    else:
        rna_mappability = [
            np.ones(c.shape, dtype='bool') for c in footprint_counts
        ]

    # run the inference algorithm
    states, frames = ribohmm.infer_coding_sequence(
        footprint_counts, codon_flags, rna_counts, rna_mappability,
        transition, emission)

    # write results
    for transcript, state, frame, rna_sequence in zip(
            transcripts, states, frames, rna_sequences):
        write_inferred_cds(handle, transcript, state, frame, rna_sequence)


def infer(options):
    """Infer coding sequences for all transcripts and write them to a file.

    The transition and emission models are unpickled from
    ``options.model_file``; transcripts are read from ``options.gtf_file``
    and processed in batches of 1000.  Each batch is analysed twice: once
    with every transcript oriented on the '+' strand and once with every
    transcript flipped to the '-' strand.  Results are written to
    ``options.output_file`` after a space-separated header line.

    Args:
        options: object providing ``model_file``, ``gtf_file``,
            ``fasta_file``, ``mappability_file``, ``riboseq_file``,
            ``rnaseq_file`` and ``output_file`` attributes.
            ``mappability_file`` and ``rnaseq_file`` may be None.
    """
    batch_size = 1000

    # load the model
    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    with open(options.model_file, 'r') as model_handle:
        transition = cPickle.load(model_handle)
        emission = cPickle.load(model_handle)

    # load transcripts; keys are materialised as a list so they can be sliced
    transcript_models = load_data.load_gtf(options.gtf_file)
    transcript_names = list(transcript_models.keys())
    N = len(transcript_names)

    # load data tracks
    genome_track = load_data.Genome(options.fasta_file,
                                    options.mappability_file)
    ribo_track = load_data.RiboSeq(options.riboseq_file)
    rnaseq_track = None
    if options.rnaseq_file is not None:
        rnaseq_track = load_data.RnaSeq(options.rnaseq_file)

    # open output file handle
    # file in bed12 format
    handle = open(options.output_file, 'w')
    try:
        towrite = [
            "chromosome", "start", "stop", "transcript_id", "posterior",
            "strand", "cdstart", "cdstop", "protein_seq", "num_exons",
            "exon_sizes", "exon_starts"
        ]
        handle.write(" ".join(towrite) + '\n')

        # step through the names one batch at a time; iterating once per
        # transcript (as before) only produced empty trailing batches
        for offset in range(0, N, batch_size):

            tnames = transcript_names[offset:offset + batch_size]
            alltranscripts = [transcript_models[name] for name in tnames]

            # run inference on both strands independently

            # focus on positive strand
            for t in alltranscripts:
                if t.strand == '-':
                    t.mask = t.mask[::-1]
                    t.strand = '+'
            _infer_strand(handle, alltranscripts, options, genome_track,
                          ribo_track, rnaseq_track, transition, emission)

            # focus on negative strand; every transcript is '+' at this
            # point, so all of them are flipped
            for t in alltranscripts:
                t.mask = t.mask[::-1]
                t.strand = '-'
            _infer_strand(handle, alltranscripts, options, genome_track,
                          ribo_track, rnaseq_track, transition, emission)
    finally:
        # release every resource even when inference fails part-way
        handle.close()
        ribo_track.close()
        if rnaseq_track is not None:
            rnaseq_track.close()
        genome_track.close()
        options.output_fastq_prefix = options.gtf_file + '_%d.fq.gz' % options.footprint_length
    else:
        options.output_fastq_prefix = options.output_fastq_prefix + '_%d.fq.gz' % options.footprint_length

    return options


# Script entry point: walks every transcript in the GTF file while holding an
# open gzip output handle.  NOTE(review): this snippet is cut off inside the
# loop; the code that actually emits records is not visible here.
if __name__ == "__main__":

    options = parse_args()

    # string of '~' characters, one per footprint position
    # (presumably the FASTQ quality line for each emitted read -- TODO confirm)
    qual = ''.join(['~' for r in xrange(options.footprint_length)])
    seq_handle = pysam.FastaFile(options.fasta_file)

    # load transcripts (mapping keyed by transcript name)
    transcripts = load_data.load_gtf(options.gtf_file)
    tnames = transcripts.keys()

    # gzip-compressed output opened for binary writing
    fastq_handle = gzip.open(options.output_fastq_prefix, 'wb')
    for num, tname in enumerate(tnames):

        transcript = transcripts[tname]

        # get transcript DNA sequence: the span from transcript.start to
        # transcript.stop on its chromosome, upper-cased
        sequence = seq_handle.fetch(transcript.chromosome, transcript.start,
                                    transcript.stop).upper()

        # get forward strand reads: a '-' strand transcript has its mask
        # reversed and is relabelled as '+'
        if transcript.strand == "-":
            transcript.mask = transcript.mask[::-1]
            transcript.strand = "+"
    if options.output_fastq_prefix is None:
        options.output_fastq_prefix = options.gtf_file+'_%d.fq.gz'%options.footprint_length
    else:
        options.output_fastq_prefix = options.output_fastq_prefix+'_%d.fq.gz'%options.footprint_length

    return options

# Script entry point: iterates over all transcripts from the GTF file with a
# gzip output handle open.  NOTE(review): the snippet ends inside the loop;
# the statements that write to the output are not part of the visible code.
if __name__=="__main__":

    options = parse_args()

    # '~' repeated once per footprint position
    # (presumably used as a constant FASTQ quality string -- TODO confirm)
    qual = ''.join(['~' for r in xrange(options.footprint_length)])
    seq_handle = pysam.FastaFile(options.fasta_file)

    # load transcripts (mapping keyed by transcript name)
    transcripts = load_data.load_gtf(options.gtf_file)
    tnames = transcripts.keys()

    # gzip-compressed output opened for binary writing
    fastq_handle = gzip.open(options.output_fastq_prefix, 'wb')
    for num,tname in enumerate(tnames):

        transcript = transcripts[tname]

        # get transcript DNA sequence: fetch the start..stop span on the
        # transcript's chromosome and upper-case it
        sequence = seq_handle.fetch(transcript.chromosome, transcript.start, transcript.stop).upper()

        # get forward strand reads: reverse the mask of '-' strand
        # transcripts and relabel them as '+'
        if transcript.strand=="-":
            transcript.mask = transcript.mask[::-1]
            transcript.strand = "+"
Ejemplo n.º 6
0
def _infer_strand(handle, alltranscripts, options, genome_track, ribo_track,
                  rnaseq_track, transition, emission):
    """Infer and write coding sequences for one orientation of a batch.

    Only transcripts whose exons all have at least 5 footprints are
    analysed.  For those, sequence, footprint, RNA-seq and mappability
    inputs are gathered, ``ribohmm.infer_coding_sequence`` is called, and
    each result is written to ``handle`` with ``write_inferred_cds``.
    Returns without writing when no transcript passes the filter.
    """
    # check if all exons have at least 5 footprints
    exon_counts = ribo_track.get_exon_total_counts(alltranscripts)
    transcripts = [t for t, e in zip(alltranscripts, exon_counts)
                   if np.all(e >= 5)]
    T = len(transcripts)
    if T == 0:
        return

    # load sequence of transcripts and transform sequence data
    rna_sequences = genome_track.get_sequence(transcripts)
    codon_flags = []
    for rna_sequence in rna_sequences:
        sequence = seq.RnaSequence(rna_sequence)
        codon_flags.append(sequence.mark_codons())

    # load footprint count data in transcripts
    footprint_counts = ribo_track.get_counts(transcripts)

    # load transcript-level rnaseq RPKM; without RNA-seq data every
    # transcript gets a constant value of 1
    if rnaseq_track is None:
        rna_counts = np.ones((T,), dtype='float')
    else:
        rna_counts = rnaseq_track.get_total_counts(transcripts)

    # load mappability of transcripts; transform mappability to missingness
    if options.mappability_file is not None:
        rna_mappability = genome_track.get_mappability(transcripts)
    else:
        rna_mappability = [np.ones(c.shape, dtype='bool')
                           for c in footprint_counts]

    # run the inference algorithm
    states, frames = ribohmm.infer_coding_sequence(
        footprint_counts, codon_flags, rna_counts, rna_mappability,
        transition, emission)

    # write results
    for transcript, state, frame, rna_sequence in zip(
            transcripts, states, frames, rna_sequences):
        write_inferred_cds(handle, transcript, state, frame, rna_sequence)


def infer(options):
    """Run coding-sequence inference over every transcript in a GTF file.

    Loads the pickled transition and emission models from
    ``options.model_file``, then walks the transcripts of
    ``options.gtf_file`` in batches of 1000.  Each batch is processed once
    with all transcripts oriented as '+' and once with all of them flipped
    to '-'.  Output goes to ``options.output_file``, preceded by a
    space-separated header line.

    Args:
        options: object providing ``model_file``, ``gtf_file``,
            ``fasta_file``, ``mappability_file``, ``riboseq_file``,
            ``rnaseq_file`` and ``output_file`` attributes.
            ``mappability_file`` and ``rnaseq_file`` may be None.
    """
    batch_size = 1000

    # load the model
    # NOTE: unpickling can execute arbitrary code; only load trusted models.
    with open(options.model_file, 'r') as model_handle:
        transition = cPickle.load(model_handle)
        emission = cPickle.load(model_handle)

    # load transcripts; a list of names is needed for slicing into batches
    transcript_models = load_data.load_gtf(options.gtf_file)
    transcript_names = list(transcript_models.keys())
    N = len(transcript_names)

    # load data tracks
    genome_track = load_data.Genome(options.fasta_file, options.mappability_file)
    ribo_track = load_data.RiboSeq(options.riboseq_file)
    rnaseq_track = None
    if options.rnaseq_file is not None:
        rnaseq_track = load_data.RnaSeq(options.rnaseq_file)

    # open output file handle
    # file in bed12 format
    handle = open(options.output_file, 'w')
    try:
        towrite = ["chromosome", "start", "stop", "transcript_id",
                   "posterior", "strand", "cdstart", "cdstop",
                   "protein_seq", "num_exons", "exon_sizes", "exon_starts"]
        handle.write(" ".join(towrite) + '\n')

        # one iteration per batch of names; the previous per-transcript
        # iteration count only added empty batches after the real ones
        for offset in range(0, N, batch_size):

            tnames = transcript_names[offset:offset + batch_size]
            alltranscripts = [transcript_models[name] for name in tnames]

            # run inference on both strands independently

            # focus on positive strand
            for t in alltranscripts:
                if t.strand == '-':
                    t.mask = t.mask[::-1]
                    t.strand = '+'
            _infer_strand(handle, alltranscripts, options, genome_track,
                          ribo_track, rnaseq_track, transition, emission)

            # focus on negative strand; all transcripts are '+' here, so
            # each one is flipped
            for t in alltranscripts:
                t.mask = t.mask[::-1]
                t.strand = '-'
            _infer_strand(handle, alltranscripts, options, genome_track,
                          ribo_track, rnaseq_track, transition, emission)
    finally:
        # close everything, including when an error interrupts inference
        handle.close()
        ribo_track.close()
        if rnaseq_track is not None:
            rnaseq_track.close()
        genome_track.close()