Esempio n. 1
0
def parse_refgpd(tdir,geneprednames,simplenames):
  """Convert reference genePred files into per-exon BED files and an index.

  For the Nth file in geneprednames (counting from 1) this writes
  tdir/reference.N.bed with one tab-separated line per exon:
    chrom, exon start, exon end, entry number, gene name, entry name,
    exon count, strand, exon number (1-based within the entry)

  It also writes tdir/entries.txt with one tab-separated line per entry:
    file number, simple name, entry number, gene name, entry name,
    summed exon length

  Entry numbers start at 1 and keep increasing across all input files, so
  they are unique over the whole run.  Lines beginning with '#' are skipped.

  Args:
    tdir: Directory that receives the output files (must already exist).
    geneprednames: Paths of the reference genePred files, in column order.
    simplenames: Labels for the inputs; simplenames[i] is written for
      entries coming from geneprednames[i].
  """
  entry_number = 0  # running id shared by every input file
  with open(tdir+"/entries.txt",'w') as of_entries:
    for column_number, gpd_path in enumerate(geneprednames, 1):
      with open(tdir+"/reference."+str(column_number)+".bed",'w') as of_ref:
        gfr = FileBasics.GenericFileReader(gpd_path)
        # try/finally rather than `with`: nothing visible here guarantees the
        # reader implements the context-manager protocol.
        try:
          while True:
            line = gfr.readline()
            if not line: break
            if line.startswith('#'): continue
            entry_number += 1
            entry = GenePredBasics.line_to_entry(line.rstrip("\n"))
            exons = list(zip(entry['exonStarts'], entry['exonEnds']))
            entry_length = sum(end - start for start, end in exons)
            of_entries.write("\t".join([
              str(column_number),
              simplenames[column_number-1],
              str(entry_number),
              entry['gene_name'],
              entry['name'],
              str(entry_length),
            ]) + "\n")
            exon_count = str(len(exons))
            for exon_number, (start, end) in enumerate(exons, 1):
              of_ref.write("\t".join([
                entry['chrom'],
                str(start),
                str(end),
                str(entry_number),
                entry['gene_name'],
                entry['name'],
                exon_count,
                entry['strand'],
                str(exon_number),
              ]) + "\n")
        finally:
          gfr.close()
Esempio n. 2
0
def parse_gpdfile(tdir,gpdfile,smoothing_factor):
  """Copy long-read genePred entries to tdir/longreads.gpd, renaming each.

  Every non-comment input line is parsed with GenePredBasics.line_to_entry,
  passed through GenePredBasics.smooth_gaps, renamed to its 1-based ordinal
  among the entries read, and written back out as a genePred line.

  A warning is written to stderr whenever a read name occurs more than once
  in the input.

  Args:
    tdir: Directory that receives longreads.gpd (must already exist).
    gpdfile: Path of the long-read genePred file, or '-' to read sys.stdin.
    smoothing_factor: Forwarded unchanged to GenePredBasics.smooth_gaps.
  """
  if gpdfile != '-':
    fr = FileBasics.GenericFileReader(gpdfile)
  else:
    fr = sys.stdin
  seennames = set()
  longreadnumber = 0
  try:
    with open(tdir+'/longreads.gpd','w') as of_gpd:
      while True:
        line = fr.readline()
        if not line: break
        if line.startswith('#'): continue  # skip comments
        longreadnumber += 1
        entry = GenePredBasics.smooth_gaps(
          GenePredBasics.line_to_entry(line.rstrip()),
          smoothing_factor)
        readname = entry['name']
        if readname in seennames:
          sys.stderr.write("Warning: repeat name '"+readname+"'\n")
        else:
          # Remember the name so a later duplicate can actually be detected.
          seennames.add(readname)
        # Replace the original read name with the read's ordinal number.
        entry['name'] = str(longreadnumber)
        of_gpd.write(GenePredBasics.entry_to_line(entry)+"\n")
  finally:
    # As before, the input is closed even when it is sys.stdin.
    fr.close()
 def read_fastq_file(self,filename):
   """Pass the fourth line of each four-line record to record_observation.

   Records are read from filename until self.max_read_count records have
   been processed, the input ends, or a line that is empty after stripping
   trailing whitespace is met.  In the standard FASTQ layout the fourth
   line of a record is the quality string.

   Args:
     filename: Path handed to FileBasics.GenericFileReader.
   """
   gfr = FileBasics.GenericFileReader(filename)
   linecount = 0  # number of complete records handled so far
   try:
     while linecount < self.max_read_count:
       record = []
       for _ in range(4):
         line = gfr.readline().rstrip()
         # NOTE(review): a blank line ends reading just like end-of-file, so
         # a record with an empty sequence/quality line stops the scan early.
         if not line: break
         record.append(line)
       if len(record) < 4: break
       self.record_observation(record[3])
       linecount += 1
   finally:
     # Release the reader even if record_observation raises.
     gfr.close()