import sys, os, re, multiprocessing
from shutil import copyfile, rmtree
# Project-local helper modules used below (assumed importable from this codebase)
import FileBasics
import GenePredBasics
import genepred_basics
import aligner_basics

def parse_gpdfile(tdir,gpdfile,smoothing_factor):
  # Go through the long reads and make a genepred
  if gpdfile != '-':
    fr = FileBasics.GenericFileReader(gpdfile)
  else:
    fr = sys.stdin
  seennames = {}
  longreadnumber = 0
  of_gpd = open(tdir+'/longreads.gpd','w')
  while True:
    line = fr.readline()
    if not line: break
    if re.match('^#',line): # skip comments
      continue
    longreadnumber += 1
    entry = GenePredBasics.smooth_gaps( \
              GenePredBasics.line_to_entry(line.rstrip()) \
              ,smoothing_factor)
    readname = entry['name']
    if readname in seennames:
      sys.stderr.write("Warning: repeat name '"+readname+"'\n")
    seennames[readname] = True  # remember the name so repeated reads trigger the warning
    # set our first name to our bin
    entry['name'] = str(longreadnumber)
    gline = GenePredBasics.entry_to_line(entry)
    of_gpd.write(gline+"\n")
  fr.close()
  of_gpd.close()
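# A minimal usage sketch for parse_gpdfile.  The input file name and the
# smoothing factor below are illustrative, not taken from the original source.
def _example_parse_gpdfile_usage():
  tdir = FileBasics.make_tempdir2('weirathe','example')  # hypothetical temp dir
  parse_gpdfile(tdir, 'long_read_alignments.gpd', 68)    # hypothetical inputs
  # tdir+'/longreads.gpd' now holds the entries renamed to sequential read
  # numbers, with alignment gaps smaller than the smoothing factor closed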
def parse_refgpd(tdir,geneprednames,simplenames):
  # get the reference genepreds ready for use
  column_number = 0
  entry_number = 0
  of_entries = open(tdir+"/entries.txt",'w')
  for filename in geneprednames:
    column_number += 1
    of_ref = open(tdir+"/reference."+str(column_number)+".bed",'w')
    gfr = FileBasics.GenericFileReader(filename)
    while True:
      line = gfr.readline()
      if not line: break
      if re.match('^#',line): continue
      entry_number += 1
      line = line.rstrip("\n")
      entry = GenePredBasics.line_to_entry(line)
      # total exonic length of this entry
      entry_length = 0
      for i in range(0,len(entry['exonStarts'])):
        entry_length += entry['exonEnds'][i]-entry['exonStarts'][i]
      of_entries.write(str(column_number) + "\t" + simplenames[column_number-1] + "\t" \
                       + str(entry_number) + "\t" + entry['gene_name'] + "\t" \
                       + entry['name'] + "\t" + str(entry_length) + "\n")
      # one bed line per exon
      exon_number = 0
      for i in range(0,len(entry['exonStarts'])):
        exon_number += 1
        of_ref.write(entry['chrom'] + "\t" + str(entry['exonStarts'][i]) + "\t" \
                     + str(entry['exonEnds'][i]) + "\t" + str(entry_number) + "\t" \
                     + entry['gene_name'] + "\t" \
                     + entry['name'] + "\t" + str(len(entry['exonStarts'])) + "\t" \
                     + entry['strand'] + "\t" + str(exon_number) \
                     + "\n")
    gfr.close()
    of_ref.close()
  of_entries.close()
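# A minimal usage sketch for parse_refgpd (file names and labels are hypothetical).
# For reference, the tab-delimited columns written above, as inferred from the code:
#   entries.txt:       column_number, simple_name, entry_number, gene_name,
#                      transcript_name, total_exonic_length
#   reference.<N>.bed: chrom, exon_start, exon_end, entry_number, gene_name,
#                      transcript_name, exon_count, strand, exon_number
def _example_parse_refgpd_usage():
  tdir = FileBasics.make_tempdir2('weirathe','example')          # hypothetical temp dir
  parse_refgpd(tdir, ['refseq.gpd','gencode.gpd'], ['RefSeq','GENCODE'])  # hypothetical inputs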
def gmap_all(reads_filename, index_base_name, outpsl_filename):
  tdir = FileBasics.make_tempdir2('weirathe', 'gmap')
  readformattag = ''  # not currently used
  corecount = str(multiprocessing.cpu_count())
  m = re.match('^(.*)\/([^\/]+)$', index_base_name)
  if not m:
    sys.stderr.write("error: the index base name must include both a directory and a basename; " \
                     + "use ./mybase if the index is in the current directory\n")
    sys.exit(1)
  # Align all reads with gmap, writing PSL output (-f 1) into the temp directory
  cmd = 'gmap -D ' + m.group(1) + ' -f 1 -d ' + m.group(2) + ' -t ' + corecount \
      + ' ' + reads_filename + ' 1> ' + tdir + '/all.psl 2>/dev/null'
  sys.stderr.write(cmd + "\n")
  os.system(cmd)
  copyfile(tdir + '/all.psl', outpsl_filename)
  rmtree(tdir)
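# A minimal usage sketch for gmap_all; the paths are illustrative.  The index
# base name must contain a directory component, e.g. './gmap_txn' for an index
# named 'gmap_txn' in the current directory.
def _example_gmap_all_usage():
  gmap_all('long_reads.fa', './gmap_txn', 'long_reads.psl')  # hypothetical paths
  # long_reads.psl then holds the PSL-format alignments requested with -f 1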
def read_fastq_file(self,filename):
  gfr = FileBasics.GenericFileReader(filename)
  linecount = 0
  while linecount < self.max_read_count:
    # each FASTQ record spans four lines: header, sequence, '+', quality
    line1 = gfr.readline().rstrip()
    if not line1: break
    line2 = gfr.readline().rstrip()
    if not line2: break
    line3 = gfr.readline().rstrip()
    if not line3: break
    line4 = gfr.readline().rstrip()
    if not line4: break
    self.record_observation(line4)
    linecount += 1
  gfr.close()
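# Note: read_fastq_file is a method of a reader class defined elsewhere in this
# codebase (it relies on self.max_read_count and self.record_observation).  Only
# the fourth line of each record, the quality string, is recorded, e.g. for an
# illustrative record:
#   @read_1
#   ACGTACGT
#   +
#   IIIIHHGG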
def main():
  if len(sys.argv) != 5:
    print sys.argv[0]+' <long reads fasta> <reference genome> <transcriptome genepred> <output base>'
    return
  longreadfname = sys.argv[1]
  genomefname = sys.argv[2]
  usergenepredfname = sys.argv[3]
  outbase = sys.argv[4]

  # get the long read count
  readcount = 0
  with open(longreadfname) as f:
    for line in f:
      if re.match('^>',line): readcount += 1

  tdir = FileBasics.make_tempdir2('weirathe','annlong')

  # 1. Make sure the transcriptome has uniquely named, uniquely mapped entries
  genepredfname = tdir+'/txn.gpd'
  make_unique_genepred(usergenepredfname,genepredfname)
  print 'made unique genepred file'

  # 2. Make a transcriptome to align to
  transcriptomefasta = tdir+'/txn.fa'
  genepred_basics.write_genepred_to_fasta_directionless(genepredfname,genomefname,transcriptomefasta)
  print 'made transcriptome fasta'

  # 3. Make a bed file of junction locations in that transcriptome
  junctionbedfname = tdir+'/junction.bed'
  junction_counts = make_junction_bed_file(genepredfname,junctionbedfname)
  print 'made junction bed file'

  # 4. Build a gmap index of the transcriptome
  transcriptomeindex = tdir+'/gmap_txn'
  aligner_basics.build_gmap_index(transcriptomefasta,transcriptomeindex)
  print 'made gmap index of transcriptome'

  # 5. Align the long reads to the transcriptome with gmap
  alignmentfname = tdir+'/reads.psl'
  aligner_basics.gmap_all(longreadfname,transcriptomeindex,alignmentfname)
  print 'made gmap alignment of reads to transcriptome'

  # 6. Get the genepred of the long reads in transcriptome coordinates,
  #    smooth it by a smoothing factor, and make a bed file of the best
  #    alignment (see the function for specifications)
  bestalignmentbedfname = tdir+'/reads.bed'
  make_best_continuous_alignment_bed(alignmentfname,bestalignmentbedfname)
  print 'made best continuous alignment bed file'

  # 7. Get transcript-to-gene name conversions for the per-gene count reports
  genenames = genepred_basics.get_transcript_to_gene_name_dictionary(genepredfname)
  print 'got gene name conversions'

  # 8. Make a report of all prefilter alignments
  bestprefilter = tdir+'/prefilter.txt'
  prefilter_alignments = make_best_alignment_summary(bestalignmentbedfname,junctionbedfname, \
                                                     junction_counts,bestprefilter)
  print 'made best alignment prefilter summary'

  report_file = tdir+'/report.txt'
  orep = open(report_file,'w')
  orep.write('Basename:'+"\t"+outbase+"\n")
  orep.write('Temp directory:'+"\t"+tdir+"\n")
  orep.write('Long Read Count:'+"\t"+str(readcount)+"\n")

  # 9. Filter the full length alignments
  full_length_alignments = filter_alignments(prefilter_alignments,'full')
  full_length_alignment_file = tdir+'/full_length_alignment.txt'
  [full_length_read_count, full_length_transcript_count] = \
      write_alignments(full_length_alignments,full_length_alignment_file,genenames)
  orep.write('Read count - full length reads mapped:'+"\t" \
             +str(len(full_length_alignments))+"\n")
  orep.write('Transcript count - full length reads mapped:'+"\t" \
             +str(full_length_transcript_count)+"\n")
  unambiguous_full_length_alignment_file = tdir+'/unambiguous_full_length_alignment.txt'
  unambiguous_full_length_alignments = filter_unambiguous_alignments(full_length_alignments)
  [unambiguous_full_length_read_count, unambiguous_full_length_transcript_count] = \
      write_alignments(unambiguous_full_length_alignments,unambiguous_full_length_alignment_file,genenames)
  orep.write('Read count - full length reads mapped with unambiguous matches:'+"\t" \
             +str(len(unambiguous_full_length_alignments))+"\n")
  orep.write('Transcript count - full length reads mapped with unambiguous matches:'+"\t" \
             +str(unambiguous_full_length_transcript_count)+"\n")

  # 10. Filter the partial alignments
  prepartial_alignments = filter_alignments(prefilter_alignments,'partial')
  prepartial_alignment_file = tdir+'/prepartial_alignment.txt'
  write_alignments(prepartial_alignments,prepartial_alignment_file,genenames)
  partial_alignments = filter_by_priority_alignments(prepartial_alignments)
  partial_alignment_file = tdir+'/partial_alignment.txt'
  [partial_read_count, partial_transcript_count] = \
      write_alignments(partial_alignments,partial_alignment_file,genenames)
  orep.write('Read count - reads mapped with partial hits best junction and length matches:'+"\t" \
             +str(len(partial_alignments))+"\n")
  orep.write('Transcript count - reads mapped with partial hits best junction and length matches:'+"\t" \
             +str(partial_transcript_count)+"\n")
  unambiguous_partial_alignments = filter_unambiguous_alignments(partial_alignments)
  unambiguous_partial_alignment_file = tdir+'/unambiguous_partial_alignments.txt'
  [unambiguous_partial_read_count, unambiguous_partial_transcript_count] = \
      write_alignments(unambiguous_partial_alignments,unambiguous_partial_alignment_file,genenames)
  orep.write('Read count - reads mapped with partial hits unambiguous matches:'+"\t" \
             +str(len(unambiguous_partial_alignments))+"\n")
  orep.write('Transcript count - reads mapped with partial hits unambiguous matches:'+"\t" \
             +str(unambiguous_partial_transcript_count)+"\n")

  # 11. Write per-gene count info for uniquely mappable reads
  partial_gene_counts = get_uniquely_mappable_gene_counts(partial_alignments,genenames)
  partial_gene_counts_file = tdir+'/partial_match_uniquely_mappable_gene_counts.txt'
  write_gene_counts(partial_gene_counts,partial_gene_counts_file)
  full_gene_counts = get_uniquely_mappable_gene_counts(full_length_alignments,genenames)
  full_gene_counts_file = tdir+'/full_length_match_uniquely_mappable_gene_counts.txt'
  write_gene_counts(full_gene_counts,full_gene_counts_file)
  orep.write('Gene count - full length matches uniquely mapped:'+"\t"+str(len(full_gene_counts))+"\n")
  orep.write('Gene count - partial matches uniquely mapped:'+"\t"+str(len(partial_gene_counts))+"\n")
  orep.close()

  # 12. Copy the results out of the temporary directory and clean up
  copyfile(report_file,outbase+'.Report.txt')
  copyfile(full_gene_counts_file,outbase+'.FullGeneCounts.txt')
  copyfile(partial_gene_counts_file,outbase+'.PartialGeneCounts.txt')
  copyfile(full_length_alignment_file,outbase+'.FullAlignment.txt')
  copyfile(unambiguous_full_length_alignment_file,outbase+'.UnambiguousFullAlignment.txt')
  copyfile(partial_alignment_file,outbase+'.PartialAlignment.txt')
  copyfile(unambiguous_partial_alignment_file,outbase+'.UnambiguousPartialAlignment.txt')
  rmtree(tdir)
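# Standard entry-point guard (an assumption: the original script presumably
# calls main() when run directly rather than imported).
if __name__ == "__main__":
  main()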