def get_nls(bounds,fname,i):
  """Locate every newline in the decompressed data of one BGZF block.

  bounds -- indexable pair; bounds[0] and bounds[1] are the file offsets
            between which the compressed block is read from fname
  fname  -- path of the BGZF file
  i      -- not used by this function; kept so existing callers still work

  Returns a list with one [bounds[0], bounds[1], offset] entry per newline,
  where offset is the newline's position in the decompressed data.
  """
  with open(fname,'rb') as handle:
    handle.seek(bounds[0])
    compressed = handle.read(bounds[1]-bounds[0])
  # Decompress from an in-memory copy so the file handle can be released first.
  reader = BGZF_reader(StringIO.StringIO(compressed))
  text = reader.read(70000) # always less than 65K by definition
  lo, hi = bounds[0], bounds[1]
  return [[lo, hi, hit.start()] for hit in re.finditer('\n', text)]
Exemple #2
0
def get_nls(bounds, fname, i):
    """Report the newline positions inside a single BGZF block.

    Args:
        bounds: indexable pair; the compressed block is read from ``fname``
            between file offsets ``bounds[0]`` and ``bounds[1]``.
        fname: path of the BGZF file.
        i: ignored here; retained for call compatibility.

    Returns:
        A list of ``[bounds[0], bounds[1], offset]`` lists, one for each
        newline found, with ``offset`` measured in the decompressed data.
    """
    with open(fname, 'rb') as source:
        source.seek(bounds[0])
        raw_block = source.read(bounds[1] - bounds[0])
    # The block is decompressed from memory, after the file is closed.
    decoder = BGZF_reader(StringIO.StringIO(raw_block))
    payload = decoder.read(70000)  # always less than 65K by definition
    first, last = bounds[0], bounds[1]
    found = []
    for match in re.finditer('\n', payload):
        found.append([first, last, match.start()])
    return found
def main():
  """Build a gzip-compressed index (<input_file>.bgi) for a BGZF FASTQ file.

  Command-line entry point.  Steps:
    1. verify the input with is_bgzf and list block bounds with
       get_block_bounds;
    2. call get_nls on every block and hand the result to do_nls_output
       (defined elsewhere; presumably it fills the module-level `blocks`
       mapping of block start -> [[block end, newline offset], ...]);
    3. for every block, write one tab-separated line per FASTQ record:
         name (header without '@'), block start, offset inside the block,
         record length, sequence length.

  Any validation or parse failure writes a message to stderr and exits with
  status 1.  The index file is always closed, so gzip can finish the stream
  even on an error exit.  Ends with sys.exit() (status 0) on success.
  """
  global blocks
  parser = argparse.ArgumentParser(description="Take a bgzf compressed fastq file and make an index",formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input_file',help="BGZF compressed fastq file")
  # NOTE: --threads is accepted but nothing below uses it; indexing is serial.
  parser.add_argument('--threads',type=int,default=1,help="number of threads")
  args = parser.parse_args()
  if not is_bgzf(args.input_file):
    sys.stderr.write("ERROR: not a proper BGZF compressed file\n")
    sys.exit(1)

  sys.stderr.write("scanning block starts\n")
  bs = get_block_bounds(args.input_file)
  # Seed the first block with a -1 offset so a record start at offset 0
  # is produced for it (starts are computed as offset + 1 below).
  blocks[bs[0][0]] = [[bs[0][1],-1]]

  sys.stderr.write("scanning for new lines\n")
  total = str(len(bs))
  for z, bounds in enumerate(bs, 1):
    do_nls_output(get_nls(bounds,args.input_file,z))
    sys.stderr.write(str(z)+'/'+total+"\r")
  sys.stderr.write("\n")

  sys.stderr.write("Traverse blocks and writing index\n")
  # Compiled once; matching with a start position avoids copying the rest of
  # the block for every record.
  record_re = re.compile('([^\n]+)\n([^\n]+)(\n[^\n]+\n[^\n]+)')
  header_re = re.compile('([^\n]+)\n([^\n]+)')
  of = gzip.open(args.input_file+'.bgi','w')
  try:
    total = str(len(blocks))
    for z, block in enumerate(sorted(blocks), 1):
      sys.stderr.write(str(z)+'/'+total+"\r")
      entries = blocks[block]
      if len(entries) == 0: continue
      bend = entries[0][0]
      # A candidate record starts right after each recorded newline.
      starts = [x[1]+1 for x in entries]
      with open(args.input_file,'rb') as inf:
        inf.seek(block)
        raw = inf.read(bend-block)
      ubytes = BGZF_reader(StringIO.StringIO(raw)).read(70000)

      # Every start except the last is parsed from this block's data.
      for start in starts[:-1]:
        if start >= len(ubytes): #problem
          sys.stderr.write("Problem start\n")
          sys.exit(1)
        m = record_re.match(ubytes, start)
        if not m:
          sys.stderr.write("Problem overlap\n")
          sys.exit(1)
        if m.group(1)[0] != '@':
          sys.stderr.write("failed to parse last\n")
          sys.exit(1)
        # +2: the newline after the header line and the one ending line four.
        record_len = len(m.group(1))+len(m.group(2))+len(m.group(3))+2
        of.write("\t".join([m.group(1)[1:],str(block),str(starts[i_fix(start)]) if False else str(start),str(record_len),str(len(m.group(2)))])+"\n")

      # The last start is read through a reader on the real file, one byte at
      # a time, beginning on the newline that precedes it.
      with open(args.input_file,'rb') as inf:
        v2 = BGZF_reader(inf,blockStart=block,innerStart=starts[-1]-1)
        spc = v2.read(1)
        if spc != "\n":
          sys.stderr.write("expected newline\n")
          sys.exit(1)
        cur = v2.get_block_start()
        inn = v2.get_inner_start()
        pieces = []
        for _ in range(4):
          while True:
            c = v2.read(1)
            if len(c) == 0: break
            pieces.append(c)
            if c == "\n": break
        record = ''.join(pieces)
        if record == "":
          # Nothing follows the final newline: end of data, stop indexing.
          break
        m = header_re.match(record)
        if not m or m.group(1)[0] != '@':
          sys.stderr.write("failed to parse last\n"+record+"\n")
          sys.exit(1)
        of.write("\t".join([m.group(1)[1:],str(cur),str(inn),str(len(record)),str(len(m.group(2)))])+"\n")
    sys.stderr.write("\n")
  finally:
    # Runs on normal completion and on sys.exit(1) alike.
    of.close()
  sys.exit()
Exemple #4
0
def main():
    """Build a gzip-compressed index (``<input_file>.bgi``) for a BGZF FASTQ.

    Command-line entry point.  Steps:

    1. Verify the input with ``is_bgzf`` and list block bounds with
       ``get_block_bounds``.
    2. Call ``get_nls`` on every block and pass the result to
       ``do_nls_output`` (defined elsewhere; presumably it fills the
       module-level ``blocks`` mapping of
       block start -> [[block end, newline offset], ...]).
    3. For every block, write one tab-separated line per FASTQ record:
       name (header without ``@``), block start, offset inside the block,
       record length, sequence length.

    Any validation or parse failure writes a message to stderr and exits
    with status 1.  The index file is always closed, so gzip can finish the
    stream even on an error exit.  Ends with ``sys.exit()`` (status 0) on
    success.
    """
    global blocks
    parser = argparse.ArgumentParser(
        description="Take a bgzf compressed fastq file and make an index",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_file', help="BGZF compressed fastq file")
    # NOTE: --threads is accepted but nothing below uses it; indexing is
    # serial.
    parser.add_argument('--threads',
                        type=int,
                        default=1,
                        help="number of threads")
    args = parser.parse_args()
    if not is_bgzf(args.input_file):
        sys.stderr.write("ERROR: not a proper BGZF compressed file\n")
        sys.exit(1)

    sys.stderr.write("scanning block starts\n")
    bs = get_block_bounds(args.input_file)
    # Seed the first block with a -1 offset so a record start at offset 0
    # is produced for it (starts are computed as offset + 1 below).
    blocks[bs[0][0]] = [[bs[0][1], -1]]

    sys.stderr.write("scanning for new lines\n")
    total = str(len(bs))
    for z, bounds in enumerate(bs, 1):
        do_nls_output(get_nls(bounds, args.input_file, z))
        sys.stderr.write(str(z) + '/' + total + "\r")
    sys.stderr.write("\n")

    sys.stderr.write("Traverse blocks and writing index\n")
    # Compiled once; matching with a start position avoids copying the rest
    # of the block for every record.
    record_re = re.compile('([^\n]+)\n([^\n]+)(\n[^\n]+\n[^\n]+)')
    header_re = re.compile('([^\n]+)\n([^\n]+)')
    of = gzip.open(args.input_file + '.bgi', 'w')
    try:
        total = str(len(blocks))
        for z, block in enumerate(sorted(blocks), 1):
            sys.stderr.write(str(z) + '/' + total + "\r")
            entries = blocks[block]
            if len(entries) == 0:
                continue
            bend = entries[0][0]
            # A candidate record starts right after each recorded newline.
            starts = [x[1] + 1 for x in entries]
            with open(args.input_file, 'rb') as inf:
                inf.seek(block)
                raw = inf.read(bend - block)
            ubytes = BGZF_reader(StringIO.StringIO(raw)).read(70000)

            # Every start except the last is parsed from this block's data.
            for start in starts[:-1]:
                if start >= len(ubytes):  #problem
                    sys.stderr.write("Problem start\n")
                    sys.exit(1)
                m = record_re.match(ubytes, start)
                if not m:
                    sys.stderr.write("Problem overlap\n")
                    sys.exit(1)
                if m.group(1)[0] != '@':
                    sys.stderr.write("failed to parse last\n")
                    sys.exit(1)
                # +2: the newline after the header line and the one ending
                # line four.
                record_len = (len(m.group(1)) + len(m.group(2)) +
                              len(m.group(3)) + 2)
                of.write("\t".join([
                    m.group(1)[1:],
                    str(block),
                    str(start),
                    str(record_len),
                    str(len(m.group(2))),
                ]) + "\n")

            # The last start is read through a reader on the real file, one
            # byte at a time, beginning on the newline that precedes it.
            with open(args.input_file, 'rb') as inf:
                v2 = BGZF_reader(inf,
                                 blockStart=block,
                                 innerStart=starts[-1] - 1)
                spc = v2.read(1)
                if spc != "\n":
                    sys.stderr.write("expected newline\n")
                    sys.exit(1)
                cur = v2.get_block_start()
                inn = v2.get_inner_start()
                pieces = []
                for _ in range(4):
                    while True:
                        c = v2.read(1)
                        if len(c) == 0:
                            break
                        pieces.append(c)
                        if c == "\n":
                            break
                record = ''.join(pieces)
                if record == "":
                    # Nothing follows the final newline: end of data, stop
                    # indexing.
                    break
                m = header_re.match(record)
                if not m or m.group(1)[0] != '@':
                    sys.stderr.write("failed to parse last\n" + record + "\n")
                    sys.exit(1)
                of.write("\t".join([
                    m.group(1)[1:],
                    str(cur),
                    str(inn),
                    str(len(record)),
                    str(len(m.group(2))),
                ]) + "\n")
        sys.stderr.write("\n")
    finally:
        # Runs on normal completion and on sys.exit(1) alike.
        of.close()
    sys.exit()