def gmap_all(reads_filename,index_base_name,outpsl_filename):
  """Run gmap on a reads file against an existing index and save the output.

  gmap is invoked with ``-f 1`` and one thread per CPU reported by
  ``multiprocessing.cpu_count()``. Its standard output is captured in a
  scratch directory and then copied to ``outpsl_filename``; its standard
  error is discarded.

  Args:
    reads_filename: path of the reads file handed to gmap.
    index_base_name: index location written as ``<directory>/<basename>``;
      use ``./mybase`` for an index in the current directory.
    outpsl_filename: destination path for the gmap output.

  Raises:
    SystemExit: (status 1) if ``index_base_name`` has no directory part
      or no basename.
    OSError: if the ``gmap`` executable cannot be started.
  """
  # Imported locally so this function does not depend on a module-level
  # import that may not exist in the rest of the file.
  import subprocess

  # Validate before creating the scratch directory so nothing is left
  # behind when the argument is rejected.
  m = re.match(r'^(.*)/([^/]+)$',index_base_name)
  if not m:
    sys.stderr.write("error: path should include both a directory and basename you should be able to use ./mybase if its in the same directory you are currently in"+"\n")
    sys.exit(1)
  # An index directly under the filesystem root ('/name') matches with an
  # empty directory group; that directory is '/'.
  index_dir = m.group(1) or '/'
  index_name = m.group(2)
  corecount = str(multiprocessing.cpu_count())
  # Argument list instead of a shell string: file names containing spaces
  # or shell metacharacters are passed through to gmap untouched.
  args = ['gmap','-D',index_dir,'-f','1','-d',index_name,'-t',corecount,reads_filename]

  tdir = FileBasics.make_tempdir2('weirathe','gmap')
  try:
    tmp_psl = tdir+'/all.psl'
    # Log the equivalent shell command for the user.
    sys.stderr.write(' '.join(args)+' 1> '+tmp_psl+' 2>/dev/null'+"\n")
    with open(tmp_psl,'w') as out, open(os.devnull,'w') as devnull:
      status = subprocess.call(args,stdout=out,stderr=devnull)
    if status != 0:
      # Output is still copied, as before, but the failure is now visible.
      sys.stderr.write("warning: gmap exited with status "+str(status)+"\n")
    copyfile(tmp_psl,outpsl_filename)
  finally:
    # Always remove the scratch directory, even if gmap or the copy fails.
    rmtree(tdir)
# Example #2
0
def gmap_all(reads_filename, index_base_name, outpsl_filename):
    """Run gmap on a reads file against an existing index and save the output.

    gmap is invoked with ``-f 1`` and one thread per CPU reported by
    ``multiprocessing.cpu_count()``. Its standard output is captured in a
    scratch directory and then copied to ``outpsl_filename``; its standard
    error is discarded.

    Args:
        reads_filename: path of the reads file handed to gmap.
        index_base_name: index location written as
            ``<directory>/<basename>``; use ``./mybase`` for an index in
            the current directory.
        outpsl_filename: destination path for the gmap output.

    Raises:
        SystemExit: (status 1) if ``index_base_name`` has no directory
            part or no basename.
        OSError: if the ``gmap`` executable cannot be started.
    """
    # Imported locally so this function does not depend on a module-level
    # import that may not exist in the rest of the file.
    import subprocess

    # Validate before creating the scratch directory so nothing is left
    # behind when the argument is rejected.
    m = re.match(r'^(.*)/([^/]+)$', index_base_name)
    if not m:
        sys.stderr.write(
            "error: path should include both a directory and basename you should be able to use ./mybase if its in the same directory you are currently in"
            + "\n")
        sys.exit(1)
    # An index directly under the filesystem root ('/name') matches with
    # an empty directory group; that directory is '/'.
    index_dir = m.group(1) or '/'
    index_name = m.group(2)
    corecount = str(multiprocessing.cpu_count())
    # Argument list instead of a shell string: file names containing
    # spaces or shell metacharacters are passed through to gmap untouched.
    args = [
        'gmap', '-D', index_dir, '-f', '1', '-d', index_name,
        '-t', corecount, reads_filename,
    ]

    tdir = FileBasics.make_tempdir2('weirathe', 'gmap')
    try:
        tmp_psl = tdir + '/all.psl'
        # Log the equivalent shell command for the user.
        sys.stderr.write(
            ' '.join(args) + ' 1> ' + tmp_psl + ' 2>/dev/null' + "\n")
        with open(tmp_psl, 'w') as out, open(os.devnull, 'w') as devnull:
            status = subprocess.call(args, stdout=out, stderr=devnull)
        if status != 0:
            # Output is still copied, as before, but the failure is now
            # visible.
            sys.stderr.write(
                "warning: gmap exited with status " + str(status) + "\n")
        copyfile(tmp_psl, outpsl_filename)
    finally:
        # Always remove the scratch directory, even if gmap or the copy
        # fails.
        rmtree(tdir)
def main():
  """Annotate long reads against a transcriptome and write summary files.

  Command line (four arguments after the program name):
    <long reads fasta> <reference genome> <transcriptome genepred> <output basename>

  All intermediate files live in a scratch directory that is removed when
  the function finishes, whether or not it succeeds. The final results are
  copied next to ``<output basename>`` with these suffixes:
    .Report.txt, .FullGeneCounts.txt, .PartialGeneCounts.txt,
    .FullAlignment.txt, .UnambiguousFullAlignment.txt,
    .PartialAlignment.txt, .UnambiguousPartialAlignment.txt

  Prints a usage line and returns without doing anything when the argument
  count is wrong.
  """
  if len(sys.argv) != 5:
    # Four arguments are required; the usage line must list all of them.
    print(sys.argv[0]+' <long reads fasta> <reference genome> <transcriptome genepred> <output basename>')
    return
  longreadfname, genomefname, usergenepredfname, outbase = sys.argv[1:5]

  # Count reads: one FASTA header line ('>') per read.
  with open(longreadfname) as f:
    readcount = sum(1 for line in f if line.startswith('>'))

  tdir = FileBasics.make_tempdir2('weirathe','annlong')
  try:
    # 1.  Make sure the transcriptome has uniquely named entries.
    genepredfname = tdir+'/txn.gpd'
    make_unique_genepred(usergenepredfname,genepredfname)
    print('made unique genepred file')

    # 2.  Make a transcriptome to align to.
    transcriptomefasta = tdir+'/txn.fa'
    genepred_basics.write_genepred_to_fasta_directionless(genepredfname,genomefname,transcriptomefasta)
    print('made transcriptome fasta')

    # 3.  Make a bed file of junction locations in that transcriptome.
    junctionbedfname = tdir+'/junction.bed'
    junction_counts = make_junction_bed_file(genepredfname,junctionbedfname)
    print('made junction bed file')

    # 4.  Build a gmap index of the transcriptome.
    transcriptomeindex = tdir+'/gmap_txn'
    aligner_basics.build_gmap_index(transcriptomefasta,transcriptomeindex)
    print('made gmap index of transcriptome')

    # 5.  Align the long reads to the transcriptome with gmap.
    alignmentfname = tdir+'/reads.psl'
    aligner_basics.gmap_all(longreadfname,transcriptomeindex,alignmentfname)
    print('made gmap alignment of reads to transcriptome')

    # 6.  Make a bed file of the best continuous alignment per read
    #     (see make_best_continuous_alignment_bed for the specifics).
    bestalignmentbedfname = tdir+'/reads.bed'
    make_best_continuous_alignment_bed(alignmentfname,bestalignmentbedfname)
    print('made best continuous alignment bed file')

    # 7.  Get the transcript-name to gene-name lookup used by the reports.
    genenames = genepred_basics.get_transcript_to_gene_name_dictionary(genepredfname)
    print('got gene name conversions')

    # 8.  Make a report of all prefilter alignments.
    bestprefilter = tdir+'/prefilter.txt'
    prefilter_alignments = make_best_alignment_summary(bestalignmentbedfname,junctionbedfname,junction_counts,bestprefilter)
    print('made best alignment prefilter summary')

    report_file = tdir+'/report.txt'
    # The with-block guarantees the report is flushed and closed before it
    # is copied out, and is not left open if a later step raises.
    with open(report_file,'w') as orep:
      def report(label,value):
        # One tab-separated "label<TAB>value" line per statistic.
        orep.write(label+"\t"+str(value)+"\n")

      report('Basename:',outbase)
      report('Temp directory:',tdir)
      report('Long Read Count:',readcount)

      # 9.  Filter the full length alignments.
      full_length_alignments = filter_alignments(prefilter_alignments,'full')
      full_length_alignment_file = tdir+'/full_length_alignment.txt'
      _, full_length_transcript_count = write_alignments(full_length_alignments,full_length_alignment_file,genenames)
      report('Read count - full length reads mapped:',len(full_length_alignments))
      report('Transcript count - full length reads mapped:',full_length_transcript_count)

      unambiguous_full_length_alignment_file = tdir+'/unambiguous_full_length_alignment.txt'
      unambiguous_full_length_alignments = filter_unambiguous_alignments(full_length_alignments)
      _, unambiguous_full_length_transcript_count = write_alignments(unambiguous_full_length_alignments,unambiguous_full_length_alignment_file,genenames)
      report('Read count - full length reads mapped with unambiguous matches:',len(unambiguous_full_length_alignments))
      report('Transcript count - full length reads mapped with unambiguous matches:',unambiguous_full_length_transcript_count)

      # 10. Filter the partial alignments.
      prepartial_alignments = filter_alignments(prefilter_alignments,'partial')
      prepartial_alignment_file = tdir+'/prepartial_alignment.txt'
      write_alignments(prepartial_alignments,prepartial_alignment_file,genenames)
      partial_alignments = filter_by_priority_alignments(prepartial_alignments)
      partial_alignment_file = tdir+'/partial_alignment.txt'
      _, partial_transcript_count = write_alignments(partial_alignments,partial_alignment_file,genenames)
      report('Read count - reads mapped with partial hits best junction and length matches:',len(partial_alignments))
      report('Transcript count - reads mapped with partial hits best junction and length matches:',partial_transcript_count)

      unambiguous_partial_alignments = filter_unambiguous_alignments(partial_alignments)
      unambiguous_partial_alignment_file = tdir+'/unambiguous_partial_alignments.txt'
      _, unambiguous_partial_transcript_count = write_alignments(unambiguous_partial_alignments,unambiguous_partial_alignment_file,genenames)
      report('Read count - reads mapped with partial hits unambiguous matches:',len(unambiguous_partial_alignments))
      report('Transcript count - reads mapped with partial hits unambiguous matches:',unambiguous_partial_transcript_count)

      # 11. Write per-gene count info for partial and full length matches.
      partial_gene_counts = get_uniquely_mappable_gene_counts(partial_alignments,genenames)
      partial_gene_counts_file = tdir+'/partial_match_uniquely_mappable_gene_counts.txt'
      write_gene_counts(partial_gene_counts,partial_gene_counts_file)

      full_gene_counts = get_uniquely_mappable_gene_counts(full_length_alignments,genenames)
      full_gene_counts_file = tdir+'/full_length_match_uniquely_mappable_gene_counts.txt'
      write_gene_counts(full_gene_counts,full_gene_counts_file)

      report('Gene count - full length matches uniquely mapped:',len(full_gene_counts))
      report('Gene count - partial matches uniquely mapped:',len(partial_gene_counts))

    # 12. Copy the results out of the scratch directory.
    copyfile(report_file,outbase+'.Report.txt')
    copyfile(full_gene_counts_file,outbase+'.FullGeneCounts.txt')
    copyfile(partial_gene_counts_file,outbase+'.PartialGeneCounts.txt')
    copyfile(full_length_alignment_file,outbase+'.FullAlignment.txt')
    copyfile(unambiguous_full_length_alignment_file,outbase+'.UnambiguousFullAlignment.txt')
    copyfile(partial_alignment_file,outbase+'.PartialAlignment.txt')
    # Partial results get their own file; writing them to
    # '.UnambiguousFullAlignment.txt' would overwrite the full length ones.
    copyfile(unambiguous_partial_alignment_file,outbase+'.UnambiguousPartialAlignment.txt')
  finally:
    # Remove the scratch directory even when a pipeline step fails.
    rmtree(tdir)