Ejemplo n.º 1
0
def parse_pslfile(tdir,pslfile,smoothing_factor):
  """Convert long-read PSL alignments into a gap-smoothed genePred file.

  Every non-comment line read from ``pslfile`` is converted to a genePred
  entry, passed through ``GenePredBasics.smooth_gaps`` with
  ``smoothing_factor``, renamed to its running record number and written
  to ``<tdir>/longreads.gpd``.

  Lines starting with '#' are skipped and do not consume a record number.
  Lines that fail conversion do consume a record number; they are reported
  on stderr and skipped.  A read name seen more than once triggers a
  warning on stderr, but the entry is still written.

  Args:
    tdir: directory in which ``longreads.gpd`` is created.
    pslfile: path given to ``FileBasics.GenericFileReader``, or '-' to
      read from standard input.
    smoothing_factor: forwarded unchanged to ``GenePredBasics.smooth_gaps``.
  """
  # Go through the long reads and make a genepred
  if pslfile != '-':
    fr = FileBasics.GenericFileReader(pslfile)
  else:
    fr = sys.stdin
  seennames = set()
  longreadnumber = 0
  try:
    with open(tdir+'/longreads.gpd','w') as of_gpd:
      while True:
        line = fr.readline()
        if not line: break
        if line.startswith('#'): #skip comments
          continue
        longreadnumber += 1
        line = line.rstrip()
        gpd_line = PSLBasics.convert_entry_to_genepred_line(PSLBasics.line_to_entry(line))
        if not gpd_line:
          # No genePred entry exists yet, so the read has to be identified
          # from the raw line: column 10 of a PSL record is the query name.
          # Fall back to the record number when the line is too short.
          fields = line.split("\t")
          if len(fields) > 9:
            badname = fields[9]
          else:
            badname = 'record '+str(longreadnumber)
          sys.stderr.write("Warning: malformed psl for "+badname+"\n")
          continue
        entry = GenePredBasics.smooth_gaps( \
                  GenePredBasics.line_to_entry(gpd_line),smoothing_factor)
        readname = entry['name']
        if readname in seennames:
          sys.stderr.write("Warning: repeat name '"+readname+"'\n")
        # remember the name so that a later duplicate can be detected
        seennames.add(readname)
        #replace the read name with its record number
        entry['name'] = str(longreadnumber)
        of_gpd.write(GenePredBasics.entry_to_line(entry)+"\n")
  finally:
    # the reader is closed even if conversion or writing fails
    fr.close()
def get_exons_from_seqs(seqs, d, spcf):
    """Describe the exons of each alignment in ``seqs`` as tab-separated text.

    For every alignment a copy of the SAM-style entry ``d`` is filled in
    with that alignment's reference name, flag, position and CIGAR, given a
    placeholder sequence of 'N's, converted to a line with
    ``SamBasics.entry_to_line``, turned into PSL by ``spcf.convert_line``,
    then into a genePred entry whose gaps are smoothed with a threshold
    of 68.

    Args:
        seqs: sequence of alignments; items 1..4 of each are read as
            reference name, strand ('+' or anything else), position and
            CIGAR string.  Item 0 is not used here.
        d: template entry (copied, never modified).
        spcf: object providing ``convert_line(sam_line)``.

    Returns:
        One newline-terminated line per exon with the columns: chrom,
        exon start, exon end, strand, name, summed 'M' lengths, indel
        bases, 'P' (first alignment) or 'S' (any later one), and the
        length of a leading soft/hard clip.  Empty string when ``seqs``
        is empty.
    """
    rows = []
    for rank, seq in enumerate(seqs, 1):
        # primary for the first alignment, secondary for the rest
        psec = 'P' if rank == 1 else 'S'

        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar_ops = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar_ops

        # a leading soft or hard clip gives the offset reported as qstart
        lead = cigar_ops[0]
        qstart = lead['val'] if lead['op'] in ('S', 'H') else 0

        # H, D and N operations do not contribute to the placeholder sequence
        total_length = sum(
            ce['val'] for ce in cigar_ops if ce['op'] not in ('H', 'D', 'N'))
        possible_matches = sum(
            ce['val'] for ce in cigar_ops if ce['op'] == 'M')
        # deletions of 68 or more are not counted as indels (68 is also the
        # gap-smoothing threshold used below)
        indels = sum(
            ce['val'] for ce in cigar_ops
            if ce['op'] == 'I' or (ce['op'] == 'D' and ce['val'] < 68))

        d1['seq'] = 'N' * total_length
        sam_line = SamBasics.entry_to_line(d1)
        psl_entry = PSLBasics.line_to_entry(spcf.convert_line(sam_line))
        # NOTE: a per-base mismatch count against a reference genome was
        # sketched here in the past but is not computed.
        gpd_line = PSLBasics.convert_entry_to_genepred_line(psl_entry)
        gsmooth = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(gpd_line), 68)

        for i, exon_start in enumerate(gsmooth['exonStarts']):
            columns = [
                gsmooth['chrom'],
                str(exon_start),
                str(gsmooth['exonEnds'][i]),
                gsmooth['strand'],
                gsmooth['name'],
                str(possible_matches),
                str(indels),
                psec,
                str(qstart),
            ]
            rows.append("\t".join(columns) + "\n")
    return ''.join(rows)
def main():
  """Nudge junctions of PSL alignments using a reference genePred.

  Steps, in order:
    1. Parse the command line.
    2. If --output_fake_psl is given, load that FASTA with
       read_fasta_into_hash.
    3. Index the junctions of every multi-exon reference entry as
       ref[chrom][exonEnds[i-1]][exonStarts[i]+1] = strand.
    4. For every non-comment PSL line: skip it with a warning when the
       tStarts/qStarts/blockSizes lengths disagree, otherwise convert it
       to a genePred entry, smooth gaps with --min_intron_size, pass it to
       nudge() together with the junctions of its target sequence, and
       print the result as a fake PSL line (when --output_fake_psl is set)
       or as a genePred line.
  """
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  # this positional is a genePred file, not a FASTA file
  parser.add_argument('reference_genepred',help="GENEPREDFILENAME for reference genepred")
  args = parser.parse_args()

  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  #read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  #group the reference junctions by chromosome
  ref = {}
  for gp in gpf.entries:
    e = gp.entry
    starts = e['exonStarts']
    if len(starts) <= 1: continue # single-exon entries have no junctions
    chrom_juns = ref.setdefault(e['chrom'],{})
    for i in range(1,len(starts)):
      # setdefault keeps the strand of the first entry seen for a junction
      chrom_juns.setdefault(e['exonEnds'][i-1],{}).setdefault(starts[i]+1,e['strand'])
  #Stored all junctions as 1-base (per the original author's note)

  # NOTE(review): the help text promises '-' for STDIN; whether
  # GenericFileReader itself understands '-' cannot be seen here -- confirm.
  pf = GenericFileReader(args.psl)
  try:
    while True:
      line = pf.readline()
      if not line: break
      if line.startswith('#'): continue
      pe = PSLBasics.line_to_entry(line.rstrip())
      nblocks = len(pe['blockSizes'])
      if len(pe['tStarts']) != nblocks or len(pe['qStarts']) != nblocks:
        sys.stderr.write("WARNING invalid psl\n")
        continue
      genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
      ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
      # targets without reference junctions get an empty lookup
      refjuns = ref.get(pe['tName'],{})
      new_ge = nudge(pe,ge,refjuns,args)
      if args.output_fake_psl:
        print(GenePredBasics.entry_to_fake_psl_line(new_ge,genome))
      else:
        print(GenePredBasics.entry_to_line(new_ge))
  finally:
    # release the PSL reader even when a record makes processing fail
    pf.close()
Ejemplo n.º 4
0
def get_exons_from_seqs(seqs,d,spcf):
  """Build tab-separated exon lines for each alignment in ``seqs``.

  Each alignment is written into a copy of the template entry ``d``
  (reference name, flag, position, CIGAR, and an all-'N' sequence), sent
  through ``SamBasics.entry_to_line``, ``spcf.convert_line`` and the
  PSL -> genePred conversion, and gap-smoothed with a threshold of 68.

  Args:
    seqs: sequence of alignments; indexes 1-4 of each hold reference name,
      strand ('+' means flag 0, anything else flag 16), position and CIGAR.
    d: template entry; it is copied and left unmodified.
    spcf: converter exposing ``convert_line(sam_line)``.

  Returns:
    A string with one line per exon: chrom, exon start, exon end, strand,
    name, total 'M' length, indel bases, 'P'/'S' for first/later
    alignment, and the size of a leading S or H clip.  '' for empty input.
  """
  non_query_ops = ('H','D','N') # operations excluded from the fake sequence length
  pieces = []
  first_alignment = True
  for seq in seqs:
    # only the first alignment is labelled primary
    if first_alignment:
      psec = 'P'
      first_alignment = False
    else:
      psec = 'S'

    d1 = d.copy()
    d1['rname'] = seq[1]
    if seq[2] != '+':
      d1['flag'] = 16
    else:
      d1['flag'] = 0
    d1['pos'] = seq[3]
    d1['cigar'] = seq[4]
    d1['cigar_array'] = SamBasics.parse_cigar(seq[4])
    cigar = d1['cigar_array']

    # size of a leading soft/hard clip, 0 when the CIGAR starts otherwise
    qstart = 0
    if cigar[0]['op'] in ('S','H'):
      qstart = cigar[0]['val']

    total_length = possible_matches = indels = 0
    for ce in cigar:
      op = ce['op']
      if op not in non_query_ops:
        total_length += ce['val']
      if op == 'M':
        possible_matches += ce['val']
      elif op == 'I' or (op == 'D' and ce['val'] < 68):
        # deletions of 68 or longer are left out of the indel count
        indels += ce['val']

    d1['seq'] = 'N'*total_length
    pentry = PSLBasics.line_to_entry(spcf.convert_line(SamBasics.entry_to_line(d1)))
    # NOTE: mismatch counting against a reference genome was once drafted
    # at this point but is not performed.
    gentry = GenePredBasics.line_to_entry(PSLBasics.convert_entry_to_genepred_line(pentry))
    gsmooth = GenePredBasics.smooth_gaps(gentry,68)

    # these columns are identical for every exon of this alignment
    tail = "\t"+str(possible_matches)+"\t"+str(indels)+"\t"+psec+"\t"+str(qstart)+"\n"
    for i in range(len(gsmooth['exonStarts'])):
      head = gsmooth['chrom']+"\t"+str(gsmooth['exonStarts'][i])+"\t"+str(gsmooth['exonEnds'][i])
      pieces.append(head+"\t"+gsmooth['strand']+"\t"+gsmooth['name']+tail)
  return ''.join(pieces)
def main():
  """Convert PSL records to genePred lines and print them to stdout.

  Reads the file named by the ``input_name`` argument (standard input when
  it is '-').  Each line is parsed with ``PSLBasics.line_to_entry`` and
  converted with ``PSLBasics.convert_entry_to_genepred_line``.  When
  ``--fill_gaps`` is greater than zero the genePred line is additionally
  parsed, passed through ``GenePredBasics.smooth_gaps`` with that value and
  serialised again before printing.
  """
  parser = argparse.ArgumentParser(description="Convert a psl file into a target formated genepred file.")
  parser.add_argument('--fill_gaps',type=int,default=0,help="Close gaps this size or smaller.")
  parser.add_argument('input_name',help="Input PSL file, use - to indicate STDIN.")
  args = parser.parse_args()

  # '-' selects standard input, anything else is opened as a path
  if args.input_name == '-':
    source = sys.stdin
  else:
    source = open(args.input_name)

  smooth = args.fill_gaps > 0
  with source as infile:
    for line in infile:
      gpd = PSLBasics.convert_entry_to_genepred_line(PSLBasics.line_to_entry(line))
      if smooth:
        # round-trip through an entry so the gaps can be closed
        closed = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(gpd),args.fill_gaps)
        gpd = GenePredBasics.entry_to_line(closed)
      print(gpd)
def main():
    """Nudge junctions of PSL alignments using a reference genePred.

    The reference genePred is read first and the junctions of every
    multi-exon entry are indexed as
    ``ref[chrom][exonEnds[i-1]][exonStarts[i]+1] = strand``.  Each
    non-comment PSL line is then parsed; lines whose tStarts/qStarts lengths
    disagree with blockSizes are reported on stderr and skipped.  Valid
    records are converted to genePred, gap-smoothed with
    ``--min_intron_size``, handed to ``nudge()`` with the junctions of their
    target sequence, and printed as a fake PSL line when
    ``--output_fake_psl`` is given (the FASTA is loaded with
    ``read_fasta_into_hash``), otherwise as a genePred line.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # this positional is a genePred file, not a FASTA file
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    #read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    #group the reference junctions by chromosome
    ref = {}
    for gp in gpf.entries:
        e = gp.entry
        exon_starts = e['exonStarts']
        if len(exon_starts) <= 1:
            continue  # single-exon entries contribute no junctions
        by_left = ref.setdefault(e['chrom'], {})
        for i in range(1, len(exon_starts)):
            left = e['exonEnds'][i - 1]
            right = exon_starts[i] + 1
            # setdefault keeps the strand of the first entry seen
            by_left.setdefault(left, {}).setdefault(right, e['strand'])
    #Stored all junctions as 1-base (per the original author's note)

    # NOTE(review): the help text promises '-' for STDIN; whether
    # GenericFileReader itself understands '-' cannot be seen here -- confirm.
    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            block_count = len(pe['blockSizes'])
            if (len(pe['tStarts']) != block_count
                    or len(pe['qStarts']) != block_count):
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # targets without reference junctions get an empty lookup
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # release the PSL reader even when a record makes processing fail
        pf.close()