def select_transcripts(options):
    """Select highly translated, mutually non-overlapping transcripts.

    Transcript models are loaded from ``options.gtf_file`` and ranked by
    their total footprint count (read from ``options.riboseq_file``)
    divided by ``mask.sum()`` of the transcript. The ranking is walked from
    the highest rate downwards and a transcript is kept when

    * every one of its exons has at least 5 footprints, and
    * its ``[start, stop]`` span does not overlap the span of any
      transcript already kept on the same chromosome.

    Selection stops as soon as ``options.batch`` transcripts were kept.

    Args:
        options: object exposing ``gtf_file``, ``riboseq_file`` and
            ``batch`` attributes.

    Returns:
        List of the selected transcript models, in decreasing order of
        translation rate. It may hold fewer than ``options.batch`` entries
        when not enough transcripts pass the filters.
    """
    # Materialize the values so they can be indexed by position below; a
    # dictionary view (Python 3) does not support indexing.
    transcript_models = list(load_data.load_gtf(options.gtf_file).values())

    ribo_track = load_data.RiboSeq(options.riboseq_file)
    try:
        # translation level of every transcript
        total_counts = ribo_track.get_total_counts(transcript_models)
        transcript_translation_rate = [
            count / float(model.mask.sum())
            for count, model in zip(total_counts, transcript_models)
        ]

        transcripts = []
        # chromosome -> list of [start, stop] spans already selected
        transcript_bounds = dict()

        # visit transcripts from the highest translation rate to the lowest
        for index in np.argsort(transcript_translation_rate)[::-1]:
            transcript = transcript_models[index]

            # require at least 5 footprints in every exon
            exon_counts = ribo_track.get_exon_total_counts([transcript])[0]
            if np.any(exon_counts < 5):
                continue

            # skip transcripts overlapping a previously selected one
            bounds = transcript_bounds.setdefault(transcript.chromosome, [])
            overlap = any(
                not (transcript.stop < bound[0] or transcript.start > bound[1])
                for bound in bounds
            )
            if overlap:
                continue

            transcripts.append(transcript)
            bounds.append([transcript.start, transcript.stop])

            # select a fixed number of transcripts for learning
            if len(transcripts) >= options.batch:
                break
    finally:
        # release the footprint track even if a lookup above fails
        ribo_track.close()

    return transcripts
def select_transcripts(options):
    """Select highly translated, mutually non-overlapping transcripts.

    Transcript models are loaded from ``options.gtf_file`` and ranked by
    their total footprint count (read from ``options.riboseq_file``)
    divided by ``mask.sum()`` of the transcript. The ranking is walked from
    the highest rate downwards and a transcript is kept when

    * every one of its exons has at least 5 footprints, and
    * its ``[start, stop]`` span does not overlap the span of any
      transcript already kept on the same chromosome.

    Selection stops as soon as ``options.batch`` transcripts were kept.

    Args:
        options: object exposing ``gtf_file``, ``riboseq_file`` and
            ``batch`` attributes.

    Returns:
        List of the selected transcript models, in decreasing order of
        translation rate. It may hold fewer than ``options.batch`` entries
        when not enough transcripts pass the filters.
    """
    # Materialize the values so they can be indexed by position below; a
    # dictionary view (Python 3) does not support indexing.
    transcript_models = list(load_data.load_gtf(options.gtf_file).values())

    ribo_track = load_data.RiboSeq(options.riboseq_file)
    try:
        # translation level of every transcript
        total_counts = ribo_track.get_total_counts(transcript_models)
        transcript_translation_rate = [
            count / float(model.mask.sum())
            for count, model in zip(total_counts, transcript_models)
        ]

        transcripts = []
        # chromosome -> list of [start, stop] spans already selected
        transcript_bounds = dict()

        # visit transcripts from the highest translation rate to the lowest
        for index in np.argsort(transcript_translation_rate)[::-1]:
            transcript = transcript_models[index]

            # require at least 5 footprints in every exon
            exon_counts = ribo_track.get_exon_total_counts([transcript])[0]
            if np.any(exon_counts < 5):
                continue

            # skip transcripts overlapping a previously selected one
            bounds = transcript_bounds.setdefault(transcript.chromosome, [])
            overlap = any(
                not (transcript.stop < bound[0] or transcript.start > bound[1])
                for bound in bounds
            )
            if overlap:
                continue

            transcripts.append(transcript)
            bounds.append([transcript.start, transcript.stop])

            # select a fixed number of transcripts for learning
            if len(transcripts) >= options.batch:
                break
    finally:
        # release the footprint track even if a lookup above fails
        ribo_track.close()

    return transcripts
def infer(options):
    """Infer coding sequences for all transcripts and write them to a file.

    The transition and emission parameters are unpickled from
    ``options.model_file``. Transcripts from ``options.gtf_file`` are
    processed in batches of 1000; each batch is run once treated as
    positive-strand and once treated as negative-strand. Results are
    written to ``options.output_file`` (bed12-style columns, preceded by a
    header line).

    Side effect: the ``mask`` and ``strand`` attributes of the loaded
    transcript objects are modified in place while switching strands.

    Args:
        options: object exposing ``model_file``, ``gtf_file``,
            ``fasta_file``, ``mappability_file``, ``riboseq_file``,
            ``rnaseq_file`` and ``output_file`` attributes.
            ``mappability_file`` and ``rnaseq_file`` may be ``None``.
    """
    # NOTE: unpickling can execute arbitrary code; only load model files
    # from a trusted source. Pickles are binary data, hence mode 'rb'.
    with open(options.model_file, 'rb') as model_handle:
        transition = cPickle.load(model_handle)
        emission = cPickle.load(model_handle)

    # load transcripts
    transcript_models = load_data.load_gtf(options.gtf_file)
    transcript_names = list(transcript_models.keys())
    N = len(transcript_names)
    batch_size = 1000
    # ceiling division in integer arithmetic (no empty trailing batches)
    num_batches = (N + batch_size - 1) // batch_size

    # load data tracks
    genome_track = load_data.Genome(options.fasta_file,
                                    options.mappability_file)
    ribo_track = load_data.RiboSeq(options.riboseq_file)
    rnaseq_track = None
    if options.rnaseq_file is not None:
        rnaseq_track = load_data.RnaSeq(options.rnaseq_file)

    try:
        # output file in bed12 format
        with open(options.output_file, 'w') as handle:
            towrite = [
                "chromosome", "start", "stop", "transcript_id", "posterior",
                "strand", "cdstart", "cdstop", "protein_seq", "num_exons",
                "exon_sizes", "exon_starts"
            ]
            handle.write(" ".join(map(str, towrite)) + '\n')

            for batch in range(num_batches):
                tnames = transcript_names[batch * batch_size:
                                          (batch + 1) * batch_size]
                alltranscripts = [transcript_models[name] for name in tnames]

                # run inference on both strands independently
                # focus on positive strand
                for t in alltranscripts:
                    if t.strand == '-':
                        t.mask = t.mask[::-1]
                        t.strand = '+'
                _infer_strand(handle, alltranscripts, genome_track,
                              ribo_track, rnaseq_track,
                              options.mappability_file is not None,
                              transition, emission)

                # focus on negative strand; every transcript is on '+'
                # at this point, so all masks are reversed
                for t in alltranscripts:
                    t.mask = t.mask[::-1]
                    t.strand = '-'
                _infer_strand(handle, alltranscripts, genome_track,
                              ribo_track, rnaseq_track,
                              options.mappability_file is not None,
                              transition, emission)
    finally:
        # release the data tracks even when inference fails part-way
        ribo_track.close()
        if rnaseq_track is not None:
            rnaseq_track.close()
        genome_track.close()


def _infer_strand(handle, alltranscripts, genome_track, ribo_track,
                  rnaseq_track, use_mappability, transition, emission):
    """Run inference on one batch in its current strand orientation.

    Only transcripts whose exons all have at least 5 footprints are
    analysed; one result record per analysed transcript is written to
    ``handle`` through ``write_inferred_cds``.

    Args:
        handle: open, writable output file.
        alltranscripts: transcript models of the batch.
        genome_track: track providing sequences (and mappability).
        ribo_track: track providing footprint counts.
        rnaseq_track: track providing RNA-seq totals, or ``None`` to use
            a constant value of 1 for every transcript.
        use_mappability: whether mappability is read from
            ``genome_track``; otherwise every position is flagged True.
        transition: transition parameters loaded from the model file.
        emission: emission parameters loaded from the model file.
    """
    # keep transcripts where all exons have at least 5 footprints
    exon_counts = ribo_track.get_exon_total_counts(alltranscripts)
    transcripts = [
        t for t, e in zip(alltranscripts, exon_counts) if np.all(e >= 5)
    ]
    T = len(transcripts)
    if T == 0:
        return

    # load sequence of transcripts and transform sequence data
    rna_sequences = genome_track.get_sequence(transcripts)
    codon_flags = [
        seq.RnaSequence(rna_sequence).mark_codons()
        for rna_sequence in rna_sequences
    ]

    # load footprint count data in transcripts
    footprint_counts = ribo_track.get_counts(transcripts)

    # load transcript-level rnaseq RPKM
    if rnaseq_track is None:
        rna_counts = np.ones((T, ), dtype='float')
    else:
        rna_counts = rnaseq_track.get_total_counts(transcripts)

    # load mappability of transcripts; transform mappability to missingness
    if use_mappability:
        rna_mappability = genome_track.get_mappability(transcripts)
    else:
        rna_mappability = [
            np.ones(c.shape, dtype='bool') for c in footprint_counts
        ]

    # run the inference algorithm
    states, frames = ribohmm.infer_coding_sequence(
        footprint_counts, codon_flags, rna_counts, rna_mappability,
        transition, emission)

    # write results
    for transcript, state, frame, rna_sequence in zip(
            transcripts, states, frames, rna_sequences):
        write_inferred_cds(handle, transcript, state, frame, rna_sequence)
options.output_fastq_prefix = options.gtf_file + '_%d.fq.gz' % options.footprint_length else: options.output_fastq_prefix = options.output_fastq_prefix + '_%d.fq.gz' % options.footprint_length return options if __name__ == "__main__": options = parse_args() qual = ''.join(['~' for r in xrange(options.footprint_length)]) seq_handle = pysam.FastaFile(options.fasta_file) # load transcripts transcripts = load_data.load_gtf(options.gtf_file) tnames = transcripts.keys() fastq_handle = gzip.open(options.output_fastq_prefix, 'wb') for num, tname in enumerate(tnames): transcript = transcripts[tname] # get transcript DNA sequence sequence = seq_handle.fetch(transcript.chromosome, transcript.start, transcript.stop).upper() # get forward strand reads if transcript.strand == "-": transcript.mask = transcript.mask[::-1] transcript.strand = "+"
if options.output_fastq_prefix is None: options.output_fastq_prefix = options.gtf_file+'_%d.fq.gz'%options.footprint_length else: options.output_fastq_prefix = options.output_fastq_prefix+'_%d.fq.gz'%options.footprint_length return options if __name__=="__main__": options = parse_args() qual = ''.join(['~' for r in xrange(options.footprint_length)]) seq_handle = pysam.FastaFile(options.fasta_file) # load transcripts transcripts = load_data.load_gtf(options.gtf_file) tnames = transcripts.keys() fastq_handle = gzip.open(options.output_fastq_prefix, 'wb') for num,tname in enumerate(tnames): transcript = transcripts[tname] # get transcript DNA sequence sequence = seq_handle.fetch(transcript.chromosome, transcript.start, transcript.stop).upper() # get forward strand reads if transcript.strand=="-": transcript.mask = transcript.mask[::-1] transcript.strand = "+"
def infer(options):
    """Infer coding sequences for all transcripts and write them to a file.

    The transition and emission parameters are unpickled from
    ``options.model_file``. Transcripts from ``options.gtf_file`` are
    processed in batches of 1000; each batch is run once treated as
    positive-strand and once treated as negative-strand. Results are
    written to ``options.output_file`` (bed12-style columns, preceded by a
    header line).

    Side effect: the ``mask`` and ``strand`` attributes of the loaded
    transcript objects are modified in place while switching strands.

    Args:
        options: object exposing ``model_file``, ``gtf_file``,
            ``fasta_file``, ``mappability_file``, ``riboseq_file``,
            ``rnaseq_file`` and ``output_file`` attributes.
            ``mappability_file`` and ``rnaseq_file`` may be ``None``.
    """
    # NOTE: unpickling can execute arbitrary code; only load model files
    # from a trusted source. Pickles are binary data, hence mode 'rb'.
    with open(options.model_file, 'rb') as model_handle:
        transition = cPickle.load(model_handle)
        emission = cPickle.load(model_handle)

    # load transcripts
    transcript_models = load_data.load_gtf(options.gtf_file)
    transcript_names = list(transcript_models.keys())
    N = len(transcript_names)
    batch_size = 1000
    # ceiling division in integer arithmetic (no empty trailing batches)
    num_batches = (N + batch_size - 1) // batch_size

    # load data tracks
    genome_track = load_data.Genome(options.fasta_file,
                                    options.mappability_file)
    ribo_track = load_data.RiboSeq(options.riboseq_file)
    rnaseq_track = None
    if options.rnaseq_file is not None:
        rnaseq_track = load_data.RnaSeq(options.rnaseq_file)

    try:
        # output file in bed12 format
        with open(options.output_file, 'w') as handle:
            towrite = [
                "chromosome", "start", "stop", "transcript_id", "posterior",
                "strand", "cdstart", "cdstop", "protein_seq", "num_exons",
                "exon_sizes", "exon_starts"
            ]
            handle.write(" ".join(map(str, towrite)) + '\n')

            for batch in range(num_batches):
                tnames = transcript_names[batch * batch_size:
                                          (batch + 1) * batch_size]
                alltranscripts = [transcript_models[name] for name in tnames]

                # run inference on both strands independently
                # focus on positive strand
                for t in alltranscripts:
                    if t.strand == '-':
                        t.mask = t.mask[::-1]
                        t.strand = '+'
                _infer_strand(handle, alltranscripts, genome_track,
                              ribo_track, rnaseq_track,
                              options.mappability_file is not None,
                              transition, emission)

                # focus on negative strand; every transcript is on '+'
                # at this point, so all masks are reversed
                for t in alltranscripts:
                    t.mask = t.mask[::-1]
                    t.strand = '-'
                _infer_strand(handle, alltranscripts, genome_track,
                              ribo_track, rnaseq_track,
                              options.mappability_file is not None,
                              transition, emission)
    finally:
        # release the data tracks even when inference fails part-way
        ribo_track.close()
        if rnaseq_track is not None:
            rnaseq_track.close()
        genome_track.close()


def _infer_strand(handle, alltranscripts, genome_track, ribo_track,
                  rnaseq_track, use_mappability, transition, emission):
    """Run inference on one batch in its current strand orientation.

    Only transcripts whose exons all have at least 5 footprints are
    analysed; one result record per analysed transcript is written to
    ``handle`` through ``write_inferred_cds``.

    Args:
        handle: open, writable output file.
        alltranscripts: transcript models of the batch.
        genome_track: track providing sequences (and mappability).
        ribo_track: track providing footprint counts.
        rnaseq_track: track providing RNA-seq totals, or ``None`` to use
            a constant value of 1 for every transcript.
        use_mappability: whether mappability is read from
            ``genome_track``; otherwise every position is flagged True.
        transition: transition parameters loaded from the model file.
        emission: emission parameters loaded from the model file.
    """
    # keep transcripts where all exons have at least 5 footprints
    exon_counts = ribo_track.get_exon_total_counts(alltranscripts)
    transcripts = [
        t for t, e in zip(alltranscripts, exon_counts) if np.all(e >= 5)
    ]
    T = len(transcripts)
    if T == 0:
        return

    # load sequence of transcripts and transform sequence data
    rna_sequences = genome_track.get_sequence(transcripts)
    codon_flags = [
        seq.RnaSequence(rna_sequence).mark_codons()
        for rna_sequence in rna_sequences
    ]

    # load footprint count data in transcripts
    footprint_counts = ribo_track.get_counts(transcripts)

    # load transcript-level rnaseq RPKM
    if rnaseq_track is None:
        rna_counts = np.ones((T, ), dtype='float')
    else:
        rna_counts = rnaseq_track.get_total_counts(transcripts)

    # load mappability of transcripts; transform mappability to missingness
    if use_mappability:
        rna_mappability = genome_track.get_mappability(transcripts)
    else:
        rna_mappability = [
            np.ones(c.shape, dtype='bool') for c in footprint_counts
        ]

    # run the inference algorithm
    states, frames = ribohmm.infer_coding_sequence(
        footprint_counts, codon_flags, rna_counts, rna_mappability,
        transition, emission)

    # write results
    for transcript, state, frame, rna_sequence in zip(
            transcripts, states, frames, rna_sequences):
        write_inferred_cds(handle, transcript, state, frame, rna_sequence)