def main():
    """Write the sequences described by a genepred file to a fasta file.

    Command line arguments (read from ``sys.argv``):
        1. genepred file
        2. genome fasta file
        3. output fasta file
        4. optional flag; the usage text calls it 'directionless' (no RC).

    When exactly one extra (fourth) argument is supplied, the directionless
    writer from ``genepred_basics`` is used; otherwise the regular writer is
    used.  Only the presence of the argument is checked, not its value.

    Exits with status 1 after printing the usage text when fewer than three
    arguments are supplied.
    """
    if len(sys.argv) < 4:
        print(sys.argv[0] + " <genepred> <genome fasta> <output fasta> <(optional) 'directionless' for no RC>")
        # A usage error must not be reported to the calling shell as success.
        sys.exit(1)
    genepred_filename = sys.argv[1]
    genome_filename = sys.argv[2]
    output_filename = sys.argv[3]

    # Any fifth element of argv switches on the directionless writer.
    dodirectionless = len(sys.argv) == 5
    if dodirectionless:
        genepred_basics.write_genepred_to_fasta_directionless(
            genepred_filename, genome_filename, output_filename)
    else:
        genepred_basics.write_genepred_to_fasta(
            genepred_filename, genome_filename, output_filename)
def main():
    """Convert a genepred file into a fasta file of its sequences.

    Arguments are taken from ``sys.argv``: ``<genepred> <genome fasta>
    <output fasta>`` followed by one optional argument (described in the
    usage text as 'directionless' for no RC).  If exactly one optional
    argument is present, ``write_genepred_to_fasta_directionless`` is called;
    otherwise ``write_genepred_to_fasta`` is called.  The value of the
    optional argument itself is not inspected.

    Prints the usage text and exits with status 1 when required arguments
    are missing.
    """
    if len(sys.argv) < 4:
        print(sys.argv[0] + " <genepred> <genome fasta> <output fasta> <(optional) 'directionless' for no RC>")
        # Non-zero status so wrapping scripts can detect the usage error.
        sys.exit(1)

    genepred_filename = sys.argv[1]
    genome_filename = sys.argv[2]
    output_filename = sys.argv[3]

    # Pick the writer once, then call it with the shared argument list.
    if len(sys.argv) == 5:
        write_fasta = genepred_basics.write_genepred_to_fasta_directionless
    else:
        write_fasta = genepred_basics.write_genepred_to_fasta
    write_fasta(genepred_filename, genome_filename, output_filename)
def _write_unique_name_coordinate_pairs(map_filename, uniq_map_filename):
    """Write the sorted, de-duplicated columns 1 and 3 of a tab-delimited file.

    Pure-Python replacement for ``cat FILE | cut -f 1,3 | sort | uniq > OUT``:
    a line without a tab is kept whole, and a line with fewer than three
    fields contributes only its first field.
    """
    unique_entries = set()
    with open(map_filename) as infile:
        for line in infile:
            fields = line.rstrip("\n").split("\t")
            unique_entries.add("\t".join(fields[0:1] + fields[2:3]))
    with open(uniq_map_filename, 'w') as outfile:
        for entry in sorted(unique_entries):
            outfile.write(entry + "\n")


def main():
    """Count how many times each short read maps and write the counts.

    Command line arguments (read from ``sys.argv``):
        1. genome fasta
        2. uniquely named short reads file
        3. transcriptome (genepred) file
        4. output file
        5. temporary directory, which must already exist
        6. optional genome bowtie2 index
        7. optional transcriptome bowtie2 index (only read when exactly
           seven arguments are given)

    Reads are mapped to the genome first; reads reported as unmapped are then
    mapped to a transcriptome fasta built from the genepred file, and those
    counts are added to the genome counts.  The output file gets one
    ``name<TAB>count`` line per read.

    Exits with status 1 on a usage error or when the temporary directory
    does not exist.
    """
    if len(sys.argv) < 6:
        print(sys.argv[0] + ' <genome> <uniquely named short reads file> <transcriptome file> <output file> <temp directory>')
        sys.exit(1)
    genome_filename = sys.argv[1]
    sruniq_filename = sys.argv[2]
    transcriptome_filename = sys.argv[3]
    output_file = sys.argv[4]
    temp_foldername = sys.argv[5]
    genome_bowtie2_index = ''
    if len(sys.argv) >= 7:
        genome_bowtie2_index = sys.argv[6]
    transcriptome_bowtie2_index = ''
    if len(sys.argv) == 8:
        transcriptome_bowtie2_index = sys.argv[7]

    if not os.path.isdir(temp_foldername):
        print("Error: Expecting a temporary folder that already exists.")
        print(temp_foldername + " does not exist.")
        sys.exit(1)

    # 1. Make a sub-directory to do our work in
    local_temp_foldername = temp_foldername.rstrip('/') + '/uniqueness'
    if not os.path.isdir(local_temp_foldername):
        print("Creating subdirectory " + local_temp_foldername)
        # os.mkdir instead of a shell "mkdir": safe for paths containing
        # spaces or shell metacharacters, and failures are not silent.
        os.mkdir(local_temp_foldername)
    work_prefix = local_temp_foldername.rstrip('/')

    # 2. Map reads to the genome fasta
    genome_base_name = work_prefix + '/genome'
    # presumably map_reads_to_fasta writes <base name>.sam -- confirm
    sam_filename = work_prefix + '/genome.sam'
    map_reads_to_fasta(genome_filename, sruniq_filename, genome_base_name,
                       genome_bowtie2_index)

    # 3. Count number of times we observe reads
    read_counts = read_map_count(sruniq_filename, sam_filename)

    # 4. Get unmapped reads into a fasta
    unmapped_read_names = get_unmapped_read_names(read_counts)
    unmapped_sruniq_filename = make_unmapped_short_read_file(
        sruniq_filename, unmapped_read_names, local_temp_foldername)

    # 5. Make a fasta based on a transcriptome genepred file.
    #    First ensure the assumption that the genepred file contains only
    #    uniquely named transcripts.
    transcriptome_uniquename_filename = work_prefix + '/txn_uniq.gpd'
    genepred_basics.write_uniquely_named_genepred(
        transcriptome_filename, transcriptome_uniquename_filename)
    transcriptome_fa = work_prefix + '/txn.fa'
    genepred_basics.write_genepred_to_fasta_directionless(
        transcriptome_uniquename_filename, genome_filename, transcriptome_fa)

    # 6. Map previously unmapped reads to the transcriptome
    txn_base_name = work_prefix + '/txn'
    txn_sam_filename = work_prefix + '/txn.sam'
    map_reads_to_fasta(transcriptome_fa, unmapped_sruniq_filename,
                       txn_base_name, transcriptome_bowtie2_index)

    # 7. Convert coordinates of the mapped reads back to reference.
    #    Note these coordinates are zero indexed for both start and end
    #    coordinates.
    txn_map_filename = work_prefix + '/txn.map'
    sam_basics.convert_directionless_gpd_alignment_to_reference(
        txn_sam_filename, transcriptome_uniquename_filename, txn_map_filename)

    # 8. Consolidate repetitive read mappings caused by junctions repeated
    #    among isoforms; we are only interested in the unique coordinate
    #    sets for each entry.
    txn_uniq_map_filename = work_prefix + '/txn_uniq.map'
    _write_unique_name_coordinate_pairs(txn_map_filename,
                                        txn_uniq_map_filename)

    # 9. Add the transcriptome mapping counts to our previous read counts
    transcriptome_read_counts = get_transcriptome_read_counts(
        txn_uniq_map_filename)
    for name in transcriptome_read_counts:
        read_counts[name] += transcriptome_read_counts[name]

    # 10. Finished! Now we can print the reads and their counts
    with open(output_file, 'w') as ofile:
        for name in read_counts:
            ofile.write(name + "\t" + str(read_counts[name]) + "\n")
def main():
    """Annotate long reads against a transcriptome and write summary files.

    Command line arguments (read from ``sys.argv``):
        1. long reads fasta
        2. reference genome
        3. transcriptome genepred
        4. output base name; every result file is written as
           ``<output base>.<Suffix>.txt``

    All intermediate files are created inside a temporary directory obtained
    from ``FileBasics.make_tempdir2``; the results are copied next to the
    output base name and the temporary directory is removed at the end.

    Prints the usage text and returns without doing anything when the number
    of arguments is wrong.
    """
    if len(sys.argv) != 5:
        # Four arguments are required; the usage text must list all of them.
        print(sys.argv[0]+' <long reads fasta> <reference genome> <transcriptome genepred> <output base>')
        return
    longreadfname = sys.argv[1]
    genomefname = sys.argv[2]
    usergenepredfname = sys.argv[3]
    outbase = sys.argv[4]

    # Get read count: one fasta header line ('>') per read.
    readcount = 0
    with open(longreadfname) as f:
        for line in f:
            if line.startswith('>'):
                readcount += 1

    tdir = FileBasics.make_tempdir2('weirathe', 'annlong')

    # 1. Make sure the transcriptome is uniquely named uniquely mapped entries
    genepredfname = tdir+'/txn.gpd'
    make_unique_genepred(usergenepredfname, genepredfname)
    print('made unique genepred file')

    # 2. Make a transcriptome to align to.
    transcriptomefasta = tdir+'/txn.fa'
    genepred_basics.write_genepred_to_fasta_directionless(genepredfname, genomefname, transcriptomefasta)
    print('made transcriptome fasta')

    # 3. Make a bed file of junction locations in that transcriptome.
    junctionbedfname = tdir+'/junction.bed'
    junction_counts = make_junction_bed_file(genepredfname, junctionbedfname)
    print('made junction bed file')

    # 4. Build a gmap index of the transcriptome
    transcriptomeindex = tdir+'/gmap_txn'
    aligner_basics.build_gmap_index(transcriptomefasta, transcriptomeindex)
    print('made gmap index of transcriptome')

    # 5. Align the long reads to the transcriptome with gmap
    alignmentfname = tdir+'/reads.psl'
    aligner_basics.gmap_all(longreadfname, transcriptomeindex, alignmentfname)
    print('made gmap alignment of reads to transcriptome')

    # 6. Get the genepred of the long reads on the transcriptome coordinates.
    #    Smooth that genepred by a smoothing factor and make a bed file of
    #    the best alignment. See function for specifications.
    bestalignmentbedfname = tdir+'/reads.bed'
    make_best_continuous_alignment_bed(alignmentfname, bestalignmentbedfname)
    print('made best continuous alignment bed file')

    # Transcript-to-gene name lookup, needed by every write_alignments call
    # and by the per-gene counts below.
    genenames = genepred_basics.get_transcript_to_gene_name_dictionary(genepredfname)
    print('got gene name conversions')

    # 7. Make a report of all prefilter alignments
    bestprefilter = tdir+'/prefilter.txt'
    prefilter_alignments = make_best_alignment_summary(bestalignmentbedfname, junctionbedfname, junction_counts, bestprefilter)
    print('made best alignment prefilter summary')

    report_file = tdir+'/report.txt'
    # The context manager closes the report even if a later step raises.
    with open(report_file, 'w') as orep:
        orep.write('Basename:'+"\t"+outbase+"\n")
        orep.write('Temp directory:'+"\t"+tdir+"\n")
        orep.write('Long Read Count:'+"\t"+str(readcount)+"\n")

        # 8. Filter the full length alignments
        full_length_alignments = filter_alignments(prefilter_alignments, 'full')
        full_length_alignment_file = tdir+'/full_length_alignment.txt'
        [_, full_length_transcript_count] = write_alignments(full_length_alignments, full_length_alignment_file, genenames)
        orep.write('Read count - full length reads mapped:'+"\t"+str(len(full_length_alignments))+"\n")
        orep.write('Transcript count - full length reads mapped:'+"\t"+str(full_length_transcript_count)+"\n")

        unambiguous_full_length_alignment_file = tdir+'/unambiguous_full_length_alignment.txt'
        unambiguous_full_length_alignments = filter_unambiguous_alignments(full_length_alignments)
        [_, unambiguous_full_length_transcript_count] = write_alignments(unambiguous_full_length_alignments, unambiguous_full_length_alignment_file, genenames)
        orep.write('Read count - full length reads mapped with unambiguous matches:'+"\t"+str(len(unambiguous_full_length_alignments))+"\n")
        orep.write('Transcript count - full length reads mapped with unambiguous matches:'+"\t"+str(unambiguous_full_length_transcript_count)+"\n")

        # 9. Filter the partial alignments
        prepartial_alignments = filter_alignments(prefilter_alignments, 'partial')
        prepartial_alignment_file = tdir+'/prepartial_alignment.txt'
        write_alignments(prepartial_alignments, prepartial_alignment_file, genenames)
        partial_alignments = filter_by_priority_alignments(prepartial_alignments)
        partial_alignment_file = tdir+'/partial_alignment.txt'
        [_, partial_transcript_count] = write_alignments(partial_alignments, partial_alignment_file, genenames)
        orep.write('Read count - reads mapped with partial hits best junction and length matches:'+"\t"+str(len(partial_alignments))+"\n")
        orep.write('Transcript count - reads mapped with partial hits best junction and length matches:'+"\t"+str(partial_transcript_count)+"\n")

        unambiguous_partial_alignments = filter_unambiguous_alignments(partial_alignments)
        unambiguous_partial_alignment_file = tdir+'/unambiguous_partial_alignments.txt'
        [_, unambiguous_partial_transcript_count] = write_alignments(unambiguous_partial_alignments, unambiguous_partial_alignment_file, genenames)
        orep.write('Read count - reads mapped with partial hits unambiguous matches:'+"\t"+str(len(unambiguous_partial_alignments))+"\n")
        orep.write('Transcript count - reads mapped with partial hits unambiguous matches:'+"\t"+str(unambiguous_partial_transcript_count)+"\n")

        # 10. Per-gene count info
        partial_gene_counts = get_uniquely_mappable_gene_counts(partial_alignments, genenames)
        partial_gene_counts_file = tdir+'/partial_match_uniquely_mappable_gene_counts.txt'
        write_gene_counts(partial_gene_counts, partial_gene_counts_file)
        full_gene_counts = get_uniquely_mappable_gene_counts(full_length_alignments, genenames)
        full_gene_counts_file = tdir+'/full_length_match_uniquely_mappable_gene_counts.txt'
        write_gene_counts(full_gene_counts, full_gene_counts_file)
        orep.write('Gene count - full length matches uniquely mapped:'+"\t"+str(len(full_gene_counts))+"\n")
        orep.write('Gene count - partial matches uniquely mapped:'+"\t"+str(len(partial_gene_counts))+"\n")

    # Publish the results next to the requested output base name.
    copyfile(report_file, outbase+'.Report.txt')
    copyfile(full_gene_counts_file, outbase+'.FullGeneCounts.txt')
    copyfile(partial_gene_counts_file, outbase+'.PartialGeneCounts.txt')
    copyfile(full_length_alignment_file, outbase+'.FullAlignment.txt')
    copyfile(unambiguous_full_length_alignment_file, outbase+'.UnambiguousFullAlignment.txt')
    copyfile(partial_alignment_file, outbase+'.PartialAlignment.txt')
    # The partial result gets its own file; it previously overwrote
    # <outbase>.UnambiguousFullAlignment.txt.
    copyfile(unambiguous_partial_alignment_file, outbase+'.UnambiguousPartialAlignment.txt')
    rmtree(tdir)
def _write_unique_name_coordinate_pairs(map_filename, uniq_map_filename):
    """Write the sorted, de-duplicated columns 1 and 3 of a tab-delimited file.

    Pure-Python replacement for ``cat FILE | cut -f 1,3 | sort | uniq > OUT``:
    a line without a tab is kept whole, and a line with fewer than three
    fields contributes only its first field.
    """
    unique_entries = set()
    with open(map_filename) as infile:
        for line in infile:
            fields = line.rstrip("\n").split("\t")
            unique_entries.add("\t".join(fields[0:1] + fields[2:3]))
    with open(uniq_map_filename, 'w') as outfile:
        for entry in sorted(unique_entries):
            outfile.write(entry + "\n")


def main():
    """Count how many times each short read maps and write the counts.

    Command line arguments (read from ``sys.argv``):
        1. genome fasta
        2. uniquely named short reads file
        3. transcriptome (genepred) file
        4. output file
        5. temporary directory, which must already exist
        6. optional genome bowtie2 index
        7. optional transcriptome bowtie2 index (only read when exactly
           seven arguments are given)

    Reads are mapped to the genome first; reads reported as unmapped are then
    mapped to a transcriptome fasta built from the genepred file, and those
    counts are added to the genome counts.  The output file gets one
    ``name<TAB>count`` line per read.

    Exits with status 1 on a usage error or when the temporary directory
    does not exist.
    """
    if len(sys.argv) < 6:
        print(sys.argv[0] + ' <genome> <uniquely named short reads file> <transcriptome file> <output file> <temp directory>')
        sys.exit(1)
    genome_filename = sys.argv[1]
    sruniq_filename = sys.argv[2]
    transcriptome_filename = sys.argv[3]
    output_file = sys.argv[4]
    temp_foldername = sys.argv[5]
    genome_bowtie2_index = ''
    if len(sys.argv) >= 7:
        genome_bowtie2_index = sys.argv[6]
    transcriptome_bowtie2_index = ''
    if len(sys.argv) == 8:
        transcriptome_bowtie2_index = sys.argv[7]

    if not os.path.isdir(temp_foldername):
        print("Error: Expecting a temporary folder that already exists.")
        print(temp_foldername + " does not exist.")
        sys.exit(1)

    # 1. Make a sub-directory to do our work in
    local_temp_foldername = temp_foldername.rstrip('/')+'/uniqueness'
    if not os.path.isdir(local_temp_foldername):
        print("Creating subdirectory "+local_temp_foldername)
        # os.mkdir instead of a shell "mkdir": safe for paths containing
        # spaces or shell metacharacters, and failures are not silent.
        os.mkdir(local_temp_foldername)
    workdir = local_temp_foldername.rstrip('/')

    # 2. Map reads to the genome fasta
    genome_base_name = workdir+'/genome'
    # presumably map_reads_to_fasta writes <base name>.sam -- confirm
    sam_filename = workdir+'/genome.sam'
    map_reads_to_fasta(genome_filename, sruniq_filename, genome_base_name, genome_bowtie2_index)

    # 3. Count number of times we observe reads
    read_counts = read_map_count(sruniq_filename, sam_filename)

    # 4. Get unmapped reads into a fasta
    unmapped_read_names = get_unmapped_read_names(read_counts)
    unmapped_sruniq_filename = make_unmapped_short_read_file(sruniq_filename, unmapped_read_names, local_temp_foldername)

    # 5. Make a fasta based on a transcriptome genepred file.
    #    First ensure the assumption that the genepred file contains only
    #    uniquely named transcripts.
    transcriptome_uniquename_filename = workdir+'/txn_uniq.gpd'
    genepred_basics.write_uniquely_named_genepred(transcriptome_filename, transcriptome_uniquename_filename)
    transcriptome_fa = workdir+'/txn.fa'
    genepred_basics.write_genepred_to_fasta_directionless(transcriptome_uniquename_filename, genome_filename, transcriptome_fa)

    # 6. Map previously unmapped reads to the transcriptome
    txn_base_name = workdir+'/txn'
    txn_sam_filename = workdir+'/txn.sam'
    map_reads_to_fasta(transcriptome_fa, unmapped_sruniq_filename, txn_base_name, transcriptome_bowtie2_index)

    # 7. Convert coordinates of the mapped reads back to reference.
    #    Note these coordinates are zero indexed for both start and end
    #    coordinates.
    txn_map_filename = workdir+'/txn.map'
    sam_basics.convert_directionless_gpd_alignment_to_reference(txn_sam_filename, transcriptome_uniquename_filename, txn_map_filename)

    # 8. Consolidate repetitive read mappings caused by junctions repeated
    #    among isoforms; we are only interested in the unique coordinate
    #    sets for each entry.
    txn_uniq_map_filename = workdir+'/txn_uniq.map'
    _write_unique_name_coordinate_pairs(txn_map_filename, txn_uniq_map_filename)

    # 9. Add the transcriptome mapping counts to our previous read counts
    transcriptome_read_counts = get_transcriptome_read_counts(txn_uniq_map_filename)
    for name in transcriptome_read_counts:
        read_counts[name] += transcriptome_read_counts[name]

    # 10. Finished! Now we can print the reads and their counts
    with open(output_file, 'w') as ofile:
        for name in read_counts:
            ofile.write(name + "\t" + str(read_counts[name]) + "\n")