def gmap_all(reads_filename, index_base_name, outpsl_filename):
    """Run gmap on a reads file and copy its standard output to a file.

    gmap is invoked with ``-f 1`` and one thread per CPU reported by
    ``multiprocessing.cpu_count()``.  Its standard output is collected in a
    scratch directory and then copied to *outpsl_filename*; its standard
    error is discarded.  The equivalent shell command is logged to stderr.

    NOTE(review): this file defines ``gmap_all`` a second time further
    down; that later definition shadows this one at import time.

    Args:
        reads_filename: path of the reads file handed to gmap.
        index_base_name: ``<directory>/<name>`` of the gmap index.  It must
            contain a slash; use ``./name`` for an index in the current
            directory.
        outpsl_filename: destination path for gmap's output.

    Raises:
        SystemExit: if *index_base_name* has no directory/basename split.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    # Validate before creating the scratch directory so that a bad path
    # does not leave an orphaned temp dir behind.
    m = re.match(r'^(.*)/([^/]+)$', index_base_name)
    if not m:
        sys.stderr.write(
            "error: path should include both a directory and basename "
            "you should be able to use ./mybase if its in the same "
            "directory you are currently in\n")
        sys.exit(1)
    # An index directly under the filesystem root ("/name") yields an empty
    # directory group; the directory is then "/".
    index_dir = m.group(1) or '/'
    index_name = m.group(2)
    corecount = str(multiprocessing.cpu_count())

    tdir = FileBasics.make_tempdir2('weirathe', 'gmap')
    try:
        raw_psl = tdir + '/all.psl'
        # Argument list (no shell) so paths containing spaces or shell
        # metacharacters reach gmap unchanged.
        args = ['gmap', '-D', index_dir, '-f', '1', '-d', index_name,
                '-t', corecount, reads_filename]
        sys.stderr.write(' '.join(args) + ' 1> ' + raw_psl
                         + ' 2>/dev/null' + "\n")
        with open(raw_psl, 'w') as psl_out, open(os.devnull, 'w') as devnull:
            try:
                status = subprocess.call(args, stdout=psl_out, stderr=devnull)
            except OSError as err:
                # gmap could not be started.  Stay best-effort (the output
                # file is still produced, empty) but report the problem.
                sys.stderr.write("warning: could not run gmap: "
                                 + str(err) + "\n")
                status = 0
        if status != 0:
            sys.stderr.write("warning: gmap exited with status "
                             + str(status) + "\n")
        copyfile(raw_psl, outpsl_filename)
    finally:
        # Always remove the scratch directory, even if the copy failed.
        rmtree(tdir)
def gmap_all(reads_filename, index_base_name, outpsl_filename):
    """Run gmap on a reads file and copy its standard output to a file.

    gmap is invoked with ``-f 1`` and one thread per CPU reported by
    ``multiprocessing.cpu_count()``.  Its standard output is collected in a
    scratch directory and then copied to *outpsl_filename*; its standard
    error is discarded.  The equivalent shell command is logged to stderr.

    NOTE(review): an earlier definition of ``gmap_all`` exists in this
    file; this later definition is the one that takes effect.

    Args:
        reads_filename: path of the reads file handed to gmap.
        index_base_name: ``<directory>/<name>`` of the gmap index.  It must
            contain a slash; use ``./name`` for an index in the current
            directory.
        outpsl_filename: destination path for gmap's output.

    Raises:
        SystemExit: if *index_base_name* has no directory/basename split.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    # Validate before creating the scratch directory so that a bad path
    # does not leave an orphaned temp dir behind.
    m = re.match(r'^(.*)/([^/]+)$', index_base_name)
    if not m:
        sys.stderr.write(
            "error: path should include both a directory and basename "
            "you should be able to use ./mybase if its in the same "
            "directory you are currently in\n")
        sys.exit(1)
    # An index directly under the filesystem root ("/name") yields an empty
    # directory group; the directory is then "/".
    index_dir = m.group(1) or '/'
    index_name = m.group(2)
    corecount = str(multiprocessing.cpu_count())

    tdir = FileBasics.make_tempdir2('weirathe', 'gmap')
    try:
        raw_psl = tdir + '/all.psl'
        # Argument list (no shell) so paths containing spaces or shell
        # metacharacters reach gmap unchanged.
        args = ['gmap', '-D', index_dir, '-f', '1', '-d', index_name,
                '-t', corecount, reads_filename]
        sys.stderr.write(' '.join(args) + ' 1> ' + raw_psl
                         + ' 2>/dev/null' + "\n")
        with open(raw_psl, 'w') as psl_out, open(os.devnull, 'w') as devnull:
            try:
                status = subprocess.call(args, stdout=psl_out, stderr=devnull)
            except OSError as err:
                # gmap could not be started.  Stay best-effort (the output
                # file is still produced, empty) but report the problem.
                sys.stderr.write("warning: could not run gmap: "
                                 + str(err) + "\n")
                status = 0
        if status != 0:
            sys.stderr.write("warning: gmap exited with status "
                             + str(status) + "\n")
        copyfile(raw_psl, outpsl_filename)
    finally:
        # Always remove the scratch directory, even if the copy failed.
        rmtree(tdir)
def main():
    """Align long reads to a transcriptome and write alignment/count reports.

    Command line (four positional arguments)::

        <long reads fasta> <reference genome> <transcriptome genepred> <output base>

    All intermediate files are built in a scratch directory that is removed
    on exit.  The following files are copied out, each prefixed with the
    output base:

        .Report.txt, .FullGeneCounts.txt, .PartialGeneCounts.txt,
        .FullAlignment.txt, .UnambiguousFullAlignment.txt,
        .PartialAlignment.txt, .UnambiguousPartialAlignment.txt

    Prints a usage line and returns without doing anything when the number
    of arguments is wrong.
    """
    if len(sys.argv) != 5:
        print(sys.argv[0] + ' <long reads fasta> <reference genome>'
              ' <transcriptome genepred> <output base>')
        return
    longreadfname = sys.argv[1]
    genomefname = sys.argv[2]
    usergenepredfname = sys.argv[3]
    outbase = sys.argv[4]

    # Count the reads: one FASTA header line ('>') per read.
    with open(longreadfname) as f:
        readcount = sum(1 for line in f if line.startswith('>'))

    tdir = FileBasics.make_tempdir2('weirathe', 'annlong')
    try:
        # 1. Make sure the transcriptome has uniquely named, uniquely
        #    mapped entries.
        genepredfname = tdir + '/txn.gpd'
        make_unique_genepred(usergenepredfname, genepredfname)
        print('made unique genepred file')

        # 2. Make a transcriptome to align to.
        transcriptomefasta = tdir + '/txn.fa'
        genepred_basics.write_genepred_to_fasta_directionless(
            genepredfname, genomefname, transcriptomefasta)
        print('made transcriptome fasta')

        # 3. Make a bed file of junction locations in that transcriptome.
        junctionbedfname = tdir + '/junction.bed'
        junction_counts = make_junction_bed_file(genepredfname,
                                                 junctionbedfname)
        print('made junction bed file')

        # 4. Build a gmap index of the transcriptome.
        transcriptomeindex = tdir + '/gmap_txn'
        aligner_basics.build_gmap_index(transcriptomefasta,
                                        transcriptomeindex)
        print('made gmap index of transcriptome')

        # 5. Align the long reads to the transcriptome with gmap.
        alignmentfname = tdir + '/reads.psl'
        aligner_basics.gmap_all(longreadfname, transcriptomeindex,
                                alignmentfname)
        print('made gmap alignment of reads to transcriptome')

        # 6. Make a bed file of the best continuous alignment per read
        #    (see make_best_continuous_alignment_bed for specifics).
        bestalignmentbedfname = tdir + '/reads.bed'
        make_best_continuous_alignment_bed(alignmentfname,
                                           bestalignmentbedfname)
        print('made best continuous alignment bed file')

        # 7. Transcript -> gene name lookup, used for the per-gene counts.
        genenames = genepred_basics.get_transcript_to_gene_name_dictionary(
            genepredfname)
        print('got gene name conversions')

        # 8. Make a report of all prefilter alignments.
        bestprefilter = tdir + '/prefilter.txt'
        prefilter_alignments = make_best_alignment_summary(
            bestalignmentbedfname, junctionbedfname, junction_counts,
            bestprefilter)
        print('made best alignment prefilter summary')

        report_file = tdir + '/report.txt'
        full_length_alignment_file = tdir + '/full_length_alignment.txt'
        unambiguous_full_length_alignment_file = (
            tdir + '/unambiguous_full_length_alignment.txt')
        prepartial_alignment_file = tdir + '/prepartial_alignment.txt'
        partial_alignment_file = tdir + '/partial_alignment.txt'
        unambiguous_partial_alignment_file = (
            tdir + '/unambiguous_partial_alignments.txt')
        partial_gene_counts_file = (
            tdir + '/partial_match_uniquely_mappable_gene_counts.txt')
        full_gene_counts_file = (
            tdir + '/full_length_match_uniquely_mappable_gene_counts.txt')

        with open(report_file, 'w') as orep:

            def report(label, value):
                # One "label<TAB>value" line per statistic.
                orep.write(label + "\t" + str(value) + "\n")

            report('Basename:', outbase)
            report('Temp directory:', tdir)
            report('Long Read Count:', readcount)

            # 9. Filter the full length alignments.
            full_length_alignments = filter_alignments(prefilter_alignments,
                                                       'full')
            _, full_length_transcript_count = write_alignments(
                full_length_alignments, full_length_alignment_file,
                genenames)
            report('Read count - full length reads mapped:',
                   len(full_length_alignments))
            report('Transcript count - full length reads mapped:',
                   full_length_transcript_count)

            unambiguous_full_length_alignments = (
                filter_unambiguous_alignments(full_length_alignments))
            _, unambiguous_full_length_transcript_count = write_alignments(
                unambiguous_full_length_alignments,
                unambiguous_full_length_alignment_file, genenames)
            report('Read count - full length reads mapped with unambiguous'
                   ' matches:', len(unambiguous_full_length_alignments))
            report('Transcript count - full length reads mapped with'
                   ' unambiguous matches:',
                   unambiguous_full_length_transcript_count)

            # 10. Filter the partial alignments.
            prepartial_alignments = filter_alignments(prefilter_alignments,
                                                      'partial')
            write_alignments(prepartial_alignments,
                             prepartial_alignment_file, genenames)
            partial_alignments = filter_by_priority_alignments(
                prepartial_alignments)
            _, partial_transcript_count = write_alignments(
                partial_alignments, partial_alignment_file, genenames)
            report('Read count - reads mapped with partial hits best'
                   ' junction and length matches:', len(partial_alignments))
            report('Transcript count - reads mapped with partial hits best'
                   ' junction and length matches:', partial_transcript_count)

            unambiguous_partial_alignments = filter_unambiguous_alignments(
                partial_alignments)
            _, unambiguous_partial_transcript_count = write_alignments(
                unambiguous_partial_alignments,
                unambiguous_partial_alignment_file, genenames)
            report('Read count - reads mapped with partial hits unambiguous'
                   ' matches:', len(unambiguous_partial_alignments))
            report('Transcript count - reads mapped with partial hits'
                   ' unambiguous matches:',
                   unambiguous_partial_transcript_count)

            # 11. Per-gene counts for uniquely mappable reads.
            partial_gene_counts = get_uniquely_mappable_gene_counts(
                partial_alignments, genenames)
            write_gene_counts(partial_gene_counts, partial_gene_counts_file)
            full_gene_counts = get_uniquely_mappable_gene_counts(
                full_length_alignments, genenames)
            write_gene_counts(full_gene_counts, full_gene_counts_file)
            report('Gene count - full length matches uniquely mapped:',
                   len(full_gene_counts))
            report('Gene count - partial matches uniquely mapped:',
                   len(partial_gene_counts))

        # Publish the results next to the requested output base.
        copyfile(report_file, outbase + '.Report.txt')
        copyfile(full_gene_counts_file, outbase + '.FullGeneCounts.txt')
        copyfile(partial_gene_counts_file,
                 outbase + '.PartialGeneCounts.txt')
        copyfile(full_length_alignment_file, outbase + '.FullAlignment.txt')
        copyfile(unambiguous_full_length_alignment_file,
                 outbase + '.UnambiguousFullAlignment.txt')
        copyfile(partial_alignment_file, outbase + '.PartialAlignment.txt')
        # The partial results get their own file; they were previously
        # copied over '.UnambiguousFullAlignment.txt', clobbering it.
        copyfile(unambiguous_partial_alignment_file,
                 outbase + '.UnambiguousPartialAlignment.txt')
    finally:
        # Remove the scratch directory even when a step fails.
        rmtree(tdir)