def main():
    """Write a genepred file with unique transcript names, plus a key file.

    Reads ``gpd_infile`` and groups its lines by the second tab-separated
    column, which is used as the transcript name.  Every line is written to
    ``gpd_outfile``; a name that occurs N > 1 times is rewritten as
    ``name[i/N]`` (i = 1..N), while names that occur once are kept as-is.
    ``<gpd_outfile>.key_file`` receives, for each output entry, the written
    name, a tab, and the original right-stripped input line.

    Lines starting with ``#`` and blank lines are skipped.  A non-blank line
    without a second column still raises IndexError.
    """
    parser = argparse.ArgumentParser(
        description=
        "Make a gpd with unique transcript names and a key to their original gpd entry\n"
    )
    parser.add_argument("gpd_infile", help="FILENAME genepred file")
    parser.add_argument("gpd_outfile", help="FILENAME genepred file")
    args = parser.parse_args()

    # transcript name -> list of original (right-stripped) lines
    seen = {}
    gfr = GenericFileReader(args.gpd_infile)
    try:
        while True:
            line = gfr.readline()
            if not line: break
            if line.startswith('#'): continue
            line = line.rstrip()
            # A blank line has no name column; indexing f[1] would crash.
            if not line: continue
            f = line.split("\t")
            seen.setdefault(f[1], []).append(line)
    finally:
        # Release the reader even if a malformed line aborts the scan.
        gfr.close()

    with open(args.gpd_outfile, 'w') as of_gpd, \
            open(args.gpd_outfile + ".key_file", 'w') as of_key:
        for tx in seen:
            originals = seen[tx]
            total = len(originals)
            for i, original in enumerate(originals):
                name = tx
                if total > 1:
                    # Disambiguate repeated names as name[i/N].
                    name = tx + '[' + str(i + 1) + '/' + str(total) + ']'
                f = original.split("\t")
                f[1] = name
                of_key.write(name + "\t" + original + "\n")
                of_gpd.write("\t".join(f) + "\n")
def main():
  """Write a genepred file with unique transcript names, plus a key file.

  The second tab-separated column of each input line is used as the
  transcript name.  Names that occur N > 1 times are rewritten as
  ``name[i/N]`` (i = 1..N); names that occur once are left unchanged.
  Output goes to ``gpd_outfile``, and ``<gpd_outfile>.key_file`` maps each
  written name to its original right-stripped input line (tab separated).

  Lines starting with ``#`` and blank lines are skipped.  A non-blank line
  without a second column still raises IndexError.
  """
  parser = argparse.ArgumentParser(description="Make a gpd with unique transcript names and a key to their original gpd entry\n")
  parser.add_argument("gpd_infile",help="FILENAME genepred file")
  parser.add_argument("gpd_outfile",help="FILENAME genepred file")
  args = parser.parse_args()

  # transcript name -> list of original (right-stripped) lines
  seen = {}
  gfr = GenericFileReader(args.gpd_infile)
  try:
    while True:
      line = gfr.readline()
      if not line: break
      if line.startswith('#'): continue
      line = line.rstrip()
      # A blank line has no name column; indexing f[1] would crash.
      if not line: continue
      f = line.split("\t")
      seen.setdefault(f[1],[]).append(line)
  finally:
    # Release the reader even if a malformed line aborts the scan.
    gfr.close()

  with open(args.gpd_outfile,'w') as of_gpd, \
       open(args.gpd_outfile+".key_file",'w') as of_key:
    for tx in seen:
      originals = seen[tx]
      total = len(originals)
      for i, original in enumerate(originals):
        name = tx
        if total > 1:
          # Disambiguate repeated names as name[i/N].
          name = tx + '['+str(i+1)+'/'+str(total)+']'
        f = original.split("\t")
        f[1] = name
        of_key.write(name + "\t" + original + "\n")
        of_gpd.write("\t".join(f) + "\n")
def main():
  """Nudge PSL alignment junctions toward nearby reference junctions.

  Loads the reference genepred, indexes the junctions of every multi-exon
  entry per chromosome, then processes the PSL input line by line: each
  entry is converted to a genepred entry, gaps are smoothed using
  ``--min_intron_size``, and the result is passed to ``nudge`` together
  with the reference junctions of the target chromosome.  The adjusted
  entry is printed to stdout as a genepred line, or as a fake PSL line
  when ``--output_fake_psl`` names a reference genome FASTA.

  Lines starting with ``#`` are skipped.  PSL entries whose block arrays
  have inconsistent lengths are reported on stderr and skipped.
  """
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="FASTAFILENAME for reference genepred")
  args = parser.parse_args()

  # The genome is only needed to synthesize fake PSL output.
  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  #read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  # ref[chrom][end of previous exon][start of next exon + 1] = strand
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue  # single exon: no junctions
    chrom_juns = ref.setdefault(e['chrom'],{})
    for i in range(1,len(e['exonStarts'])):
      # setdefault keeps the strand of the first entry seen for a junction.
      chrom_juns.setdefault(e['exonEnds'][i-1],{}).setdefault(e['exonStarts'][i]+1,e['strand'])
  #Stored all junctions as 1-base

  pf = GenericFileReader(args.psl)
  try:
    while True:
      line = pf.readline()
      if not line: break
      if line.startswith('#'): continue
      line = line.rstrip()
      pe = PSLBasics.line_to_entry(line)
      if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
        sys.stderr.write("WARNING invalid psl\n")
        continue
      genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
      ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
      refjuns = ref.get(pe['tName'],{})
      new_ge = nudge(pe,ge,refjuns,args)
      # Single-argument print(...) behaves the same on Python 2 and 3.
      if args.output_fake_psl:
        print(GenePredBasics.entry_to_fake_psl_line(new_ge,genome))
      else:
        print(GenePredBasics.entry_to_line(new_ge))
  finally:
    # Release the PSL reader even when an entry fails to parse.
    pf.close()
예제 #4
0
 def __init__(self, filename):
     """Load every entry of ``filename`` into ``self.entries``.

     The file is opened with GenericFileReader (kept on ``self.gfr``) and
     read to the end.  Lines starting with '#' are skipped; each remaining
     line is parsed into a new GenePredEntry via ``line_to_entry``.
     """
     self.filename = filename
     self.gfr = GenericFileReader(filename)
     self.entries = []
     raw = self.gfr.readline()
     while raw:
         if not re.match('^#', raw):
             entry = GenePredEntry()
             entry.line_to_entry(raw)
             self.entries.append(entry)
         raw = self.gfr.readline()
예제 #5
0
 def set_mapping_counts(self,psl_filename):
   """Count how many PSL lines each query name has in ``psl_filename``.

   Stores a dict mapping qName to its line count on ``self.mapping_counts``
   and then sets ``self.mapping_counts_set`` to True.  Lines that
   ``PSLBasics.line_to_entry`` cannot parse are reported on stderr and
   skipped.
   """
   qcnts = {}
   gfr0 = GenericFileReader(psl_filename)
   try:
     while True:
       line = gfr0.readline()
       if not line: break
       try:
         psle = PSLBasics.line_to_entry(line.rstrip())
       except Exception:
         # Exception, not a bare except, so Ctrl-C and SystemExit still propagate.
         sys.stderr.write("Problem parsing line:\n"+line.rstrip()+"\n")
         continue
       qcnts[psle['qName']] = qcnts.get(psle['qName'],0) + 1
   finally:
     # Release the reader even if counting fails part-way through.
     gfr0.close()
   self.mapping_counts = qcnts
   # Flag the counts as available only once they really are.
   self.mapping_counts_set = True
예제 #6
0
 def set_mapping_counts(self, psl_filename):
     """Count how many PSL lines each query name has in ``psl_filename``.

     Stores a dict mapping qName to its line count on
     ``self.mapping_counts`` and then sets ``self.mapping_counts_set`` to
     True.  Lines that ``PSLBasics.line_to_entry`` cannot parse are
     reported on stderr and skipped.
     """
     qcnts = {}
     gfr0 = GenericFileReader(psl_filename)
     try:
         while True:
             line = gfr0.readline()
             if not line: break
             try:
                 psle = PSLBasics.line_to_entry(line.rstrip())
             except Exception:
                 # Exception, not a bare except, so Ctrl-C and SystemExit
                 # still propagate.
                 sys.stderr.write("Problem parsing line:\n" + line.rstrip() +
                                  "\n")
                 continue
             qcnts[psle['qName']] = qcnts.get(psle['qName'], 0) + 1
     finally:
         # Release the reader even if counting fails part-way through.
         gfr0.close()
     self.mapping_counts = qcnts
     # Flag the counts as available only once they really are.
     self.mapping_counts_set = True
class GenericFastaFileReader:
    """Sequential FASTA reader that returns one record per ``read_entry`` call.

    Finishing a record means reading the header of the next one, so that
    header's name is kept in ``previous_name`` between calls.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        # Name from a header line already consumed, or None if there is none.
        self.previous_name = None

    def close(self):
        """Close the underlying file reader."""
        self.gfr.close()

    def read_entry(self):
        """Return the next FASTA record, or None when no record is left.

        A record is a dict with keys:
          'name'     -- header text after '>' (right-stripped)
          'seq'      -- sequence lines, right-stripped and concatenated
          'original' -- '>' + name + newline, followed by the raw sequence
                        lines exactly as read
        """
        seq_parts = []
        raw_parts = []
        while True:
            newline = self.gfr.readline()
            if not newline:
                # End of input: emit the pending record if a header was seen.
                if self.previous_name is None:
                    return None
                return self._finish_entry(seq_parts, raw_parts, None)
            m = re.match('^>(.*)$', newline.rstrip())
            if m is None:
                seq_parts.append(newline.rstrip())
                raw_parts.append(newline)
                continue
            # Compare against None, not truthiness, so that a header with an
            # empty name (">") still starts a record of its own instead of
            # being merged into the following one.
            if self.previous_name is None:
                self.previous_name = m.group(1)
                continue
            return self._finish_entry(seq_parts, raw_parts, m.group(1))

    def _finish_entry(self, seq_parts, raw_parts, next_name):
        """Build the pending record and remember the next record's name."""
        name = self.previous_name
        self.previous_name = next_name
        return {
            'name': name,
            'seq': ''.join(seq_parts),
            'original': '>' + name + "\n" + ''.join(raw_parts),
        }
예제 #8
0
class GenericFastaFileReader:
  """Sequential FASTA reader that returns one record per ``read_entry`` call.

  Finishing a record means reading the header of the next one, so that
  header's name is kept in ``previous_name`` between calls.
  """
  def __init__(self,filename):
    self.filename = filename
    self.gfr = GenericFileReader(self.filename)
    # Name from a header line already consumed, or None if there is none.
    self.previous_name = None
  def close(self):
    """Close the underlying file reader."""
    self.gfr.close()
  def read_entry(self):
    """Return the next FASTA record, or None when no record is left.

    A record is a dict with keys:
      'name'     -- header text after '>' (right-stripped)
      'seq'      -- sequence lines, right-stripped and concatenated
      'original' -- '>' + name + newline, followed by the raw sequence
                    lines exactly as read
    """
    seq_parts = []
    raw_parts = []
    while True:
      newline = self.gfr.readline()
      if not newline:
        # End of input: emit the pending record if a header was seen.
        if self.previous_name is None:
          return None
        return self._finish_entry(seq_parts,raw_parts,None)
      m = re.match('^>(.*)$',newline.rstrip())
      if m is None:
        seq_parts.append(newline.rstrip())
        raw_parts.append(newline)
        continue
      # Compare against None, not truthiness, so that a header with an
      # empty name (">") still starts a record of its own instead of
      # being merged into the following one.
      if self.previous_name is None:
        self.previous_name = m.group(1)
        continue
      return self._finish_entry(seq_parts,raw_parts,m.group(1))
  def _finish_entry(self,seq_parts,raw_parts,next_name):
    """Build the pending record and remember the next record's name."""
    name = self.previous_name
    self.previous_name = next_name
    t = {}
    t['name'] = name
    t['seq'] = ''.join(seq_parts)
    t['original'] = '>'+name + "\n" + ''.join(raw_parts)
    return t
예제 #9
0
 def __init__(self,filename):
   """Load every entry of ``filename`` into ``self.entries``.

   The file is opened with GenericFileReader (kept on ``self.gfr``) and
   read to the end.  Lines starting with '#' are skipped; each remaining
   line is parsed into a new GenePredEntry via ``line_to_entry``.
   """
   self.filename = filename
   self.gfr = GenericFileReader(filename)
   self.entries = []
   raw = self.gfr.readline()
   while raw:
     if not re.match('^#',raw):
       entry = GenePredEntry()
       entry.line_to_entry(raw)
       self.entries.append(entry)
     raw = self.gfr.readline()
예제 #10
0
class GenericFastqFileReader:
  """Sequential FASTQ reader that returns one 4-line record per call."""
  def __init__(self,filename):
    self.filename = filename
    self.gfr = GenericFileReader(self.filename)
    self.previous_name = None

  def close(self):
    """Close the underlying file reader."""
    self.gfr.close()

  def read_entry(self):
    """Return the next record as a dict, or False at end of input.

    The dict has keys 'name' (header text after '@' up to the first tab),
    'seq' (second line) and 'quality' (fourth line), all right-stripped.

    Malformed input is handled best-effort with a warning on stderr: a
    record cut short by end of file yields empty strings for the missing
    lines, and a header that does not match '@name' yields an empty name.
    """
    line1 = self.gfr.readline()
    if not line1:
      return False
    line2 = self.gfr.readline()
    line3 = self.gfr.readline()
    line4 = self.gfr.readline()
    # Once one line is missing the following ones are too, so warn once.
    if not (line2 and line3 and line4):
      sys.stderr.write("Warning: Improperly terminated fastq file line count not a multiple of 4\n")
    m = re.match('^@([^\t]+)',line1.rstrip())
    if m:
      name = m.group(1)
    else:
      # Calling m.group(1) on a failed match would raise AttributeError
      # right after this warning; fall back to an empty name instead.
      sys.stderr.write("Warning: Could not read name\n")
      name = ''
    t = {}
    t['name'] = name
    t['seq'] = line2.rstrip()
    t['quality'] = line4.rstrip()
    return t
예제 #11
0
class GenericFastqFileReader:
  """Sequential FASTQ reader that returns one 4-line record per call."""
  def __init__(self,filename):
    self.filename = filename
    self.gfr = GenericFileReader(self.filename)
    self.previous_name = None

  def close(self):
    """Close the underlying file reader."""
    self.gfr.close()

  def read_entry(self):
    """Return the next record as a dict, or False at end of input.

    The dict has keys 'name' (header text after '@' up to the first tab),
    'seq' (second line) and 'quality' (fourth line), all right-stripped.

    Malformed input is handled best-effort with a warning on stderr: a
    record cut short by end of file yields empty strings for the missing
    lines, and a header that does not match '@name' yields an empty name.
    """
    line1 = self.gfr.readline()
    if not line1:
      return False
    # Read the remaining three lines of the record in one go.
    line2, line3, line4 = [self.gfr.readline() for _ in range(3)]
    # Once one line is missing the following ones are too, so warn once.
    if not (line2 and line3 and line4):
      sys.stderr.write("Warning: Improperly terminated fastq file line count not a multiple of 4\n")
    m = re.match('^@([^\t]+)',line1.rstrip())
    if not m:
      # Calling m.group(1) on a failed match would raise AttributeError
      # right after this warning; fall back to an empty name instead.
      sys.stderr.write("Warning: Could not read name\n")
    t = {}
    t['name'] = m.group(1) if m else ''
    t['seq'] = line2.rstrip()
    t['quality'] = line4.rstrip()
    return t
예제 #12
0
 def __init__(self, filename):
   """Open ``filename`` with GenericFileReader; no name is pending yet."""
   self.filename = filename
   reader = GenericFileReader(filename)
   self.gfr = reader
   self.previous_name = None
예제 #13
0
 def __init__(self, filename):
   """Remember ``filename`` and open it through GenericFileReader.

   ``previous_name`` starts out as None.
   """
   self.filename = filename
   opened = GenericFileReader(filename)
   self.gfr = opened
   self.previous_name = None
def main():
    """Split one FASTQ file, or a pair of them, into fixed-size chunks.

    Command line: ``size output_directory fastq_file [fastq_file]``.
    Records (four lines each) are copied into numbered files inside a newly
    created ``output_directory``: ``1.fq``, ``2.fq``, ... for a single input,
    or ``1_1.fq``/``1_2.fq``, ``2_1.fq``/``2_2.fq``, ... for paired inputs,
    with at most ``size`` records per file.  A single input given as ``-``
    is read from stdin.

    Prints an error and returns without writing anything when more than two
    inputs are given or the output directory already exists.  With paired
    inputs, a warning is printed if one file ends before the other.

    Chunk files are opened only when there is a record to write, so no
    empty trailing file is left behind, and every handle is closed on exit.
    """
    parser = argparse.ArgumentParser(
        description=
        'Split FASTQ file(s) into smaller ones with as many entries as you specify'
    )
    parser.add_argument('size',
                        type=int,
                        help='Number of sequences to put into each file')
    parser.add_argument('output_directory',
                        help='Name of the directory to put sequences')
    parser.add_argument('fastq_files',
                        nargs='+',
                        help='FILENAME(S) for fastq files')
    args = parser.parse_args()
    if len(args.fastq_files) > 2:
        sys.stderr.write("ERROR only two fastq files at most are supported\n")
        return
    if os.path.exists(args.output_directory):
        sys.stderr.write("ERROR output directory exists already\n")
        return
    os.makedirs(args.output_directory)

    # One reader and one filename suffix per input; the copy loop below is
    # shared by the single-file and paired cases.
    if len(args.fastq_files) == 1:
        suffixes = ['.fq']
        if args.fastq_files[0] == '-':
            readers = [sys.stdin]
        else:
            readers = [GenericFileReader(args.fastq_files[0])]
    else:  # we have two fastq files
        suffixes = ['_1.fq', '_2.fq']
        readers = [GenericFileReader(name) for name in args.fastq_files]

    prefix = args.output_directory.rstrip('/') + '/'
    out_iter = 1
    fcount = 0
    outputs = None  # open chunk files, or None when the next chunk is unopened
    try:
        while True:
            heads = [reader.readline() for reader in readers]
            if not all(heads):
                if any(heads):
                    sys.stderr.write(
                        "WARNING paired file lengths appear different\n")
                break
            if outputs is None:
                outputs = [
                    open(prefix + str(out_iter) + suffix, 'w')
                    for suffix in suffixes
                ]
            for head, reader, out in zip(heads, readers, outputs):
                out.write(head)
                # The remaining three lines of the record.
                for _ in range(3):
                    out.write(reader.readline())
            fcount += 1
            if args.size <= fcount:
                # Chunk is full: close it and start the next one lazily.
                fcount = 0
                out_iter += 1
                for out in outputs:
                    out.close()
                outputs = None
    finally:
        if outputs is not None:
            for out in outputs:
                out.close()
        for reader in readers:
            reader.close()
예제 #15
0
def main():
  """Split one FASTQ file, or a pair of them, into fixed-size chunks.

  Command line: ``size output_directory fastq_file [fastq_file]``.
  Records (four lines each) are copied into numbered files inside a newly
  created ``output_directory``: ``1.fq``, ``2.fq``, ... for a single input,
  or ``1_1.fq``/``1_2.fq``, ``2_1.fq``/``2_2.fq``, ... for paired inputs,
  with at most ``size`` records per file.  A single input given as ``-``
  is read from stdin.

  Prints an error and returns without writing anything when more than two
  inputs are given or the output directory already exists.  With paired
  inputs, a warning is printed if one file ends before the other.

  Chunk files are opened only when there is a record to write, so no empty
  trailing file is left behind, and every handle is closed on exit.
  """
  parser = argparse.ArgumentParser(description='Split FASTQ file(s) into smaller ones with as many entries as you specify')
  parser.add_argument('size',type=int,help='Number of sequences to put into each file')
  parser.add_argument('output_directory',help='Name of the directory to put sequences')
  parser.add_argument('fastq_files',nargs='+',help='FILENAME(S) for fastq files')
  args = parser.parse_args()
  if len(args.fastq_files) > 2:
    sys.stderr.write("ERROR only two fastq files at most are supported\n")
    return
  if os.path.exists(args.output_directory):
    sys.stderr.write("ERROR output directory exists already\n")
    return
  os.makedirs(args.output_directory)

  # One reader and one filename suffix per input; the copy loop below is
  # shared by the single-file and paired cases.
  if len(args.fastq_files) == 1:
    suffixes = ['.fq']
    if args.fastq_files[0] == '-':
      readers = [sys.stdin]
    else:
      readers = [GenericFileReader(args.fastq_files[0])]
  else: # we have two fastq files
    suffixes = ['_1.fq','_2.fq']
    readers = [GenericFileReader(name) for name in args.fastq_files]

  prefix = args.output_directory.rstrip('/')+'/'
  out_iter = 1
  fcount = 0
  outputs = None # open chunk files, or None when the next chunk is unopened
  try:
    while True:
      heads = [reader.readline() for reader in readers]
      if not all(heads):
        if any(heads):
          sys.stderr.write("WARNING paired file lengths appear different\n")
        break
      if outputs is None:
        outputs = [open(prefix+str(out_iter)+suffix,'w') for suffix in suffixes]
      for head, reader, out in zip(heads,readers,outputs):
        out.write(head)
        # The remaining three lines of the record.
        for _ in range(3):
          out.write(reader.readline())
      fcount += 1
      if args.size <= fcount:
        # Chunk is full: close it and start the next one lazily.
        fcount = 0
        out_iter += 1
        for out in outputs:
          out.close()
        outputs = None
  finally:
    if outputs is not None:
      for out in outputs:
        out.close()
    for reader in readers:
      reader.close()
def main():
    """Nudge PSL alignment junctions toward nearby reference junctions.

    Loads the reference genepred, indexes the junctions of every multi-exon
    entry per chromosome, then processes the PSL input line by line: each
    entry is converted to a genepred entry, gaps are smoothed using
    ``--min_intron_size``, and the result is passed to ``nudge`` together
    with the reference junctions of the target chromosome.  The adjusted
    entry is printed to stdout as a genepred line, or as a fake PSL line
    when ``--output_fake_psl`` names a reference genome FASTA.

    Lines starting with ``#`` are skipped.  PSL entries whose block arrays
    have inconsistent lengths are reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    # The genome is only needed to synthesize fake PSL output.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    #read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    # ref[chrom][end of previous exon][start of next exon + 1] = strand
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1: continue  # single exon: no junctions
        chrom_juns = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            left = e['exonEnds'][i - 1]
            right = e['exonStarts'][i] + 1
            # setdefault keeps the strand of the first entry seen for a
            # junction.
            chrom_juns.setdefault(left, {}).setdefault(right, e['strand'])
    #Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line: break
            if line.startswith('#'): continue
            line = line.rstrip()
            pe = PSLBasics.line_to_entry(line)
            if len(pe['tStarts']) != len(pe['blockSizes']) or len(
                    pe['qStarts']) != len(pe['blockSizes']):
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # Release the PSL reader even when an entry fails to parse.
        pf.close()