Ejemplo n.º 1
0
def parse_gpdfile(tdir,gpdfile,smoothing_factor):
  # Go through the long reads and make a genepred
  if gpdfile != '-':
    fr = FileBasics.GenericFileReader(gpdfile)
  else:
    fr = sys.stdin
  seennames = {}
  longreadnumber = 0
  of_gpd = open(tdir+'/longreads.gpd','w')
  while True:
    line = fr.readline()
    if not line: break
    if re.match('^#',line): #skip comments
      continue
    longreadnumber += 1
    entry = GenePredBasics.smooth_gaps( \
              GenePredBasics.line_to_entry(line.rstrip()) \
              ,smoothing_factor)
    readname = entry['name']
    if readname in seennames:
      sys.stderr.write("Warning: repeat name '"+readname+"'\n")
    #set our first name to our bin
    entry['name'] = str(longreadnumber)
    gline = GenePredBasics.entry_to_line(entry)
    of_gpd.write(gline+"\n")
  fr.close()
  of_gpd.close()
Ejemplo n.º 2
0
def parse_refgpd(tdir,geneprednames,simplenames):
  # get the reference genepreds ready to use in work
  column_number = 0
  entry_number = 0
  of_entries = open(tdir+"/entries.txt",'w')
  for file in geneprednames:
    column_number += 1
    of_ref = open(tdir+"/reference."+str(column_number)+".bed",'w')
    gfr = FileBasics.GenericFileReader(file)
    while True:
      line = gfr.readline()
      if not line: break
      if re.match('^#',line): continue
      entry_number += 1
      line = line.rstrip("\n")
      entry = GenePredBasics.line_to_entry(line)
      entry_length = 0
      for i in range(0,len(entry['exonStarts'])): entry_length += entry['exonEnds'][i]-entry['exonStarts'][i]
      of_entries.write(str(column_number)+ "\t" + simplenames[column_number-1] + "\t" + str(entry_number) + "\t" + entry['gene_name'] + "\t" + entry['name']+"\t"+str(entry_length)+"\n")
      exon_number = 0
      for i in range(0,len(entry['exonStarts'])):
        exon_number += 1
        of_ref.write(entry['chrom'] + "\t" + str(entry['exonStarts'][i]) + "\t" \
                   + str(entry['exonEnds'][i]) + "\t" + str(entry_number) + "\t" \
                   + entry['gene_name'] + "\t" \
                   + entry['name'] + "\t" + str(len(entry['exonStarts'])) + "\t" \
                   + entry['strand'] + "\t" + str(exon_number) \
                   + "\n")
    gfr.close()
    of_ref.close()
  of_entries.close()
Ejemplo n.º 3
0
def gmap_all(reads_filename,index_base_name,outpsl_filename):
  tdir = FileBasics.make_tempdir2('weirathe','gmap')
  readformattag = ''
  corecount = str(multiprocessing.cpu_count())
  m = re.match('^(.*)\/([^\/]+)$',index_base_name)
  if not m:
    print "error: path should include both a directory and basename you should be able to use ./mybase if its in the same directory you are currently in"
    sys.exit()
  cmd = 'gmap -D '+m.group(1)+' -f 1 -d '+m.group(2)+' -t '+corecount+' '+reads_filename+' 1> '+tdir+'/all.psl 2>/dev/null'
  sys.stderr.write(cmd+"\n")
  os.system(cmd)  
  copyfile(tdir+'/all.psl',outpsl_filename)
  rmtree(tdir)
Ejemplo n.º 4
0
def gmap_all(reads_filename, index_base_name, outpsl_filename):
    tdir = FileBasics.make_tempdir2('weirathe', 'gmap')
    readformattag = ''
    corecount = str(multiprocessing.cpu_count())
    m = re.match('^(.*)\/([^\/]+)$', index_base_name)
    if not m:
        print "error: path should include both a directory and basename you should be able to use ./mybase if its in the same directory you are currently in"
        sys.exit()
    cmd = 'gmap -D ' + m.group(1) + ' -f 1 -d ' + m.group(
        2
    ) + ' -t ' + corecount + ' ' + reads_filename + ' 1> ' + tdir + '/all.psl 2>/dev/null'
    sys.stderr.write(cmd + "\n")
    os.system(cmd)
    copyfile(tdir + '/all.psl', outpsl_filename)
    rmtree(tdir)
 def read_fastq_file(self,filename):
   gfr = FileBasics.GenericFileReader(filename)
   linecount = 0
   while True and linecount < self.max_read_count:
     line1 = gfr.readline().rstrip()
     if not line1: break
     line2 = gfr.readline().rstrip()
     if not line2: break
     line3 = gfr.readline().rstrip()
     if not line3: break
     line4 = gfr.readline().rstrip()
     if not line4: break
     self.record_observation(line4)
     linecount += 1
   gfr.close()
def main():
  if len(sys.argv) != 5:
    print sys.argv[0]+' <long reads fasta> <reference genome> <transcriptome genepred>'
    return
  longreadfname = sys.argv[1]
  genomefname = sys.argv[2]
  usergenepredfname = sys.argv[3]
  outbase = sys.argv[4]

  #get read count
  readcount = 0
  with open(longreadfname) as f:
    for line in f:
      if re.match('^>',line): readcount+=1

  tdir = FileBasics.make_tempdir2('weirathe','annlong')

  # 1. Make sure the transcriptome is uniquely named uniquely mapped entries
  genepredfname = tdir+'/txn.gpd'
  make_unique_genepred(usergenepredfname,genepredfname)
  print 'made unique genepred file'

  # 2.  Make a transcriptome to align to.
  transcriptomefasta = tdir+'/txn.fa'
  genepred_basics.write_genepred_to_fasta_directionless(genepredfname,genomefname,transcriptomefasta)
  print 'made transcriptome fasta'

  # 3.  Make a bed file of junction locations in that transcriptome.
  junctionbedfname = tdir+'/junction.bed'
  junction_counts = make_junction_bed_file(genepredfname,junctionbedfname)
  print 'made junction bed file'

  # 4.  Build a gmap index of the transcriptome
  transcriptomeindex = tdir+'/gmap_txn'
  aligner_basics.build_gmap_index(transcriptomefasta,transcriptomeindex)
  print 'made gmap index of transcriptome'

  # 5.  Align the long reads to the transcriptome with gmap
  alignmentfname = tdir+'/reads.psl'
  aligner_basics.gmap_all(longreadfname,transcriptomeindex,alignmentfname)
  print 'made gmap alignment of reads to transcriptome'

  # 6.  Generate get the genepred of the long reads on the transcriptome coordinates.
  #     Smooth that genepred by a smoothing factor
  #     And make a bed file of the best alignment.  see function for specifications
  bestalignmentbedfname = tdir+'/reads.bed'
  make_best_continuous_alignment_bed(alignmentfname,bestalignmentbedfname)
  print 'made best continuous alignment bed file'

  # 10.  Print per-gene count info
  genenames = genepred_basics.get_transcript_to_gene_name_dictionary(genepredfname)
  print 'got gene name conversions'  

  # 7.  Make a report of all prefilter alignments  
  bestprefilter = tdir+'/prefilter.txt'
  prefilter_alignments = make_best_alignment_summary(bestalignmentbedfname,junctionbedfname,junction_counts,bestprefilter)
  print 'made best alignment prefilter summary'

  report_file = tdir +'/report.txt'
  orep = open(report_file,'w')
  orep.write('Basename:'+"\t"+outbase+"\n")
  orep.write('Temp directory:'+"\t"+tdir+"\n")
  orep.write('Long Read Count:'+"\t"+str(readcount)+"\n")
  # 8.  Filter the full length alignments 
  full_length_alignments = filter_alignments(prefilter_alignments,'full')
  full_length_alignment_file = tdir+'/full_length_alignment.txt'
  [full_length_read_count, full_length_transcript_count] = write_alignments(full_length_alignments,full_length_alignment_file,genenames)
  orep.write('Read count - full length reads mapped:'+"\t" +str(len(full_length_alignments))+"\n")
  orep.write('Transcript count - full length reads mapped:'+"\t" +str(full_length_transcript_count)+"\n")
  unambiguous_full_length_alignment_file = tdir+'/unambiguous_full_length_alignment.txt'
  unambiguous_full_length_alignments = filter_unambiguous_alignments(full_length_alignments)
  [unambiguous_full_length_read_count, unambiguous_full_length_transcript_count] = write_alignments(unambiguous_full_length_alignments,unambiguous_full_length_alignment_file,genenames)
  orep.write('Read count - full length reads mapped with unambiguous matches:'+"\t"+str(len(unambiguous_full_length_alignments))+"\n")
  orep.write('Transcript count - full length reads mapped with unambiguous matches:'+"\t"+str(unambiguous_full_length_transcript_count)+"\n")

  # 9.  Filter the full length alignments 
  prepartial_alignments = filter_alignments(prefilter_alignments,'partial')
  prepartial_alignment_file = tdir+'/prepartial_alignment.txt'
  write_alignments(prepartial_alignments,prepartial_alignment_file,genenames)
  partial_alignments = filter_by_priority_alignments(prepartial_alignments)
  partial_alignment_file = tdir+'/partial_alignment.txt'
  [partial_read_count, partial_transcript_count] = write_alignments(partial_alignments,partial_alignment_file,genenames)
  orep.write('Read count - reads mapped with partial hits best junction and length matches:'+"\t" + str(len(partial_alignments))+"\n")
  orep.write('Transcript count - reads mapped with partial hits best junction and length matches:'+"\t" + str(partial_transcript_count)+"\n")
  unambiguous_partial_alignments = filter_unambiguous_alignments(partial_alignments)
  unambiguous_partial_alignment_file = tdir + '/unambiguous_partial_alignments.txt'
  [unambiguous_partial_read_count, unambiguous_partial_transcript_count] = write_alignments(unambiguous_partial_alignments,unambiguous_partial_alignment_file,genenames)
  orep.write('Read count - reads mapped with partial hits unambiguous matches:'+"\t"+str(len(unambiguous_partial_alignments))+"\n")
  orep.write('Transcript count - reads mapped with partial hits unambiguous matches:'+"\t"+str(unambiguous_partial_transcript_count)+"\n")

  partial_gene_counts = get_uniquely_mappable_gene_counts(partial_alignments,genenames)
  partial_gene_counts_file = tdir+'/partial_match_uniquely_mappable_gene_counts.txt'
  write_gene_counts(partial_gene_counts,partial_gene_counts_file)

  full_gene_counts = get_uniquely_mappable_gene_counts(full_length_alignments,genenames)
  full_gene_counts_file = tdir+'/full_length_match_uniquely_mappable_gene_counts.txt'
  write_gene_counts(full_gene_counts,full_gene_counts_file)

  orep.write('Gene count - full length matches uniquely mapped:'+"\t"+str(len(full_gene_counts))+"\n")
  orep.write('Gene count - partial matches uniquely mapped:'+"\t"+str(len(partial_gene_counts))+"\n")
  orep.close()
  copyfile(report_file,outbase+'.Report.txt')
  copyfile(full_gene_counts_file,outbase+'.FullGeneCounts.txt')
  copyfile(partial_gene_counts_file,outbase+'.PartialGeneCounts.txt')
  copyfile(full_length_alignment_file,outbase+'.FullAlignment.txt')
  copyfile(unambiguous_full_length_alignment_file,outbase+'.UnambiguousFullAlignment.txt')
  copyfile(partial_alignment_file,outbase+'.PartialAlignment.txt')
  copyfile(unambiguous_partial_alignment_file,outbase+'.UnambiguousFullAlignment.txt')
  rmtree(tdir)