def parse_pslfile(tdir, pslfile, smoothing_factor):
    """Convert long-read PSL alignments into a gap-smoothed genepred file.

    Reads PSL lines from ``pslfile`` (or from standard input when ``pslfile``
    is ``'-'``), converts each one to a genepred entry, smooths its gaps with
    ``smoothing_factor`` and writes the result to ``tdir + '/longreads.gpd'``.

    Every written entry is renamed to the 1-based ordinal of its input record
    (comment lines starting with ``#`` are skipped and not counted).

    Warnings, written to stderr:
      * a record that cannot be converted to a genepred line is reported by
        its ordinal and skipped (it still consumes an ordinal);
      * a read name that was already seen is reported, but the entry is
        still written.

    Args:
        tdir: directory in which ``longreads.gpd`` is created.
        pslfile: path of the PSL input, or ``'-'`` for standard input.
        smoothing_factor: gap size handed to ``GenePredBasics.smooth_gaps``.
    """
    # Go through the long reads and make a genepred
    if pslfile != '-':
        fr = FileBasics.GenericFileReader(pslfile)
    else:
        fr = sys.stdin
    seennames = set()
    longreadnumber = 0
    try:
        with open(tdir + '/longreads.gpd', 'w') as of_gpd:
            while True:
                line = fr.readline()
                if not line:
                    break
                if line.startswith('#'):  # skip comments
                    continue
                longreadnumber += 1
                gpd_line = PSLBasics.convert_entry_to_genepred_line(
                    PSLBasics.line_to_entry(line.rstrip()))
                if not gpd_line:
                    # The read name is only known after a successful
                    # conversion, so identify the bad record by its ordinal.
                    sys.stderr.write(
                        "Warning: malformed psl for long read number "
                        + str(longreadnumber) + "\n")
                    continue
                entry = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(gpd_line), smoothing_factor)
                readname = entry['name']
                if readname in seennames:
                    sys.stderr.write(
                        "Warning: repeat name '" + readname + "'\n")
                # Remember the name so later duplicates are actually detected.
                seennames.add(readname)
                # Replace the read name with its ordinal in the input.
                entry['name'] = str(longreadnumber)
                of_gpd.write(GenePredBasics.entry_to_line(entry) + "\n")
    finally:
        fr.close()
def get_exons_from_seqs(seqs, d, spcf, min_intron_size=68):
    """Build tab-delimited exon rows for the alignments listed in ``seqs``.

    For every element of ``seqs`` a shallow copy of the SAM entry ``d`` is
    re-targeted to that alignment, given a placeholder sequence of ``N``
    characters, and converted SAM line -> PSL -> genepred. The genepred entry
    is gap-smoothed and one output line is produced per exon with the columns

        chrom, exonStart, exonEnd, strand, name,
        possible_matches, indels, P|S, qstart

    where ``possible_matches`` is the summed length of ``M`` operations,
    ``indels`` is the summed length of ``I`` operations plus ``D`` operations
    shorter than ``min_intron_size``, and ``qstart`` is the length of a
    leading ``S`` or ``H`` clip (0 when there is none).

    Args:
        seqs: sequence of indexable records; index 1 is the reference name,
            2 the strand (``'+'`` gives flag 0, anything else flag 16),
            3 the position and 4 the CIGAR string. The first record is
            labelled ``P`` (primary), all later ones ``S`` (secondary).
        d: template SAM entry dict; it is copied, never modified.
        spcf: object whose ``convert_line(sam_line)`` returns a PSL line.
        min_intron_size: deletion-length threshold for counting a ``D`` as an
            indel, also passed to ``GenePredBasics.smooth_gaps`` as the gap
            size. Defaults to 68, the value that used to be hard-coded.

    Returns:
        All exon lines concatenated into one string, each terminated by a
        newline; the empty string when ``seqs`` is empty.
    """
    # CIGAR operations that do not add to the placeholder sequence length.
    skips = frozenset(('H', 'D', 'N'))
    rows = []
    for sind, seq in enumerate(seqs, 1):
        psec = 'P' if sind == 1 else 'S'  # primary or secondary
        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar_array = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar_array

        # A leading soft or hard clip gives the query start offset.
        qstart = 0
        if cigar_array[0]['op'] in ('S', 'H'):
            qstart = cigar_array[0]['val']

        total_length = 0
        possible_matches = 0
        indels = 0
        for ce in cigar_array:
            op = ce['op']
            val = ce['val']
            if op not in skips:
                total_length += val
            if op == 'M':
                possible_matches += val
            elif op == 'I':
                indels += val
            elif op == 'D' and val < min_intron_size:
                # Short deletions are indels; longer ones are left uncounted.
                indels += val

        # The real bases are not needed here, only a sequence of the right
        # length for the SAM -> PSL conversion.
        d1['seq'] = 'N' * total_length
        nline = SamBasics.entry_to_line(d1)
        pline = spcf.convert_line(nline)
        pentry = PSLBasics.line_to_entry(pline)
        gline = PSLBasics.convert_entry_to_genepred_line(pentry)
        gentry = GenePredBasics.line_to_entry(gline)
        gsmooth = GenePredBasics.smooth_gaps(gentry, min_intron_size)

        # Columns after the exon coordinates are identical for every exon.
        tail = [gsmooth['strand'], gsmooth['name'], str(possible_matches),
                str(indels), psec, str(qstart)]
        for start, end in zip(gsmooth['exonStarts'], gsmooth['exonEnds']):
            fields = [gsmooth['chrom'], str(start), str(end)] + tail
            rows.append("\t".join(fields) + "\n")
    return ''.join(rows)
def main():
    """Nudge PSL alignment junctions onto reference junctions and print them.

    Command-line driver. It loads the reference genepred, indexes every
    reference junction by chromosome, then reads the PSL input record by
    record. Each record is converted to a genepred entry, smoothed with
    ``--min_intron_size``, passed to ``nudge`` together with the reference
    junctions of its target chromosome, and printed to stdout. Output is a
    genepred line, or a fake PSL line when ``--output_fake_psl`` supplies a
    reference genome FASTA.

    Lines starting with ``#`` are skipped. Records whose ``tStarts`` or
    ``qStarts`` count differs from the ``blockSizes`` count are reported on
    stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # This positional is loaded with GenePredFile, so it is a genepred file,
    # not a FASTA file as the help text used to say.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)

    # Index junctions as ref[chrom][left exon end][right exon start + 1] = strand.
    # The first transcript that defines a junction decides its strand.
    ref = {}
    for gp in gpf.entries:
        e = gp.entry
        if len(e['exonStarts']) <= 1:
            continue  # single-exon entries have no junctions
        chrom_junctions = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            left = e['exonEnds'][i - 1]
            right = e['exonStarts'][i] + 1
            chrom_junctions.setdefault(left, {}).setdefault(right, e['strand'])
    # Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            line = line.rstrip()
            pe = PSLBasics.line_to_entry(line)
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count or len(pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # Targets absent from the reference get no junctions to nudge to.
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge, genome)
                print(new_psl_line)
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        pf.close()
def get_exons_from_seqs(seqs, d, spcf, min_intron_size=68):
    """Build tab-delimited exon rows for the alignments listed in ``seqs``.

    For every element of ``seqs`` a shallow copy of the SAM entry ``d`` is
    re-targeted to that alignment, given a placeholder sequence of ``N``
    characters, and converted SAM line -> PSL -> genepred. The genepred entry
    is gap-smoothed and one output line is produced per exon with the columns

        chrom, exonStart, exonEnd, strand, name,
        possible_matches, indels, P|S, qstart

    where ``possible_matches`` is the summed length of ``M`` operations,
    ``indels`` is the summed length of ``I`` operations plus ``D`` operations
    shorter than ``min_intron_size``, and ``qstart`` is the length of a
    leading ``S`` or ``H`` clip (0 when there is none).

    Args:
        seqs: sequence of indexable records; index 1 is the reference name,
            2 the strand (``'+'`` gives flag 0, anything else flag 16),
            3 the position and 4 the CIGAR string. The first record is
            labelled ``P`` (primary), all later ones ``S`` (secondary).
        d: template SAM entry dict; it is copied, never modified.
        spcf: object whose ``convert_line(sam_line)`` returns a PSL line.
        min_intron_size: deletion-length threshold for counting a ``D`` as an
            indel, also passed to ``GenePredBasics.smooth_gaps`` as the gap
            size. Defaults to 68, the value that used to be hard-coded.

    Returns:
        All exon lines concatenated into one string, each terminated by a
        newline; the empty string when ``seqs`` is empty.
    """
    # CIGAR operations that do not add to the placeholder sequence length.
    skips = frozenset(('H', 'D', 'N'))
    rows = []
    for sind, seq in enumerate(seqs, 1):
        psec = 'P' if sind == 1 else 'S'  # primary or secondary
        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar_array = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar_array

        # A leading soft or hard clip gives the query start offset.
        qstart = 0
        if cigar_array[0]['op'] in ('S', 'H'):
            qstart = cigar_array[0]['val']

        total_length = 0
        possible_matches = 0
        indels = 0
        for ce in cigar_array:
            op = ce['op']
            val = ce['val']
            if op not in skips:
                total_length += val
            if op == 'M':
                possible_matches += val
            elif op == 'I':
                indels += val
            elif op == 'D' and val < min_intron_size:
                # Short deletions are indels; longer ones are left uncounted.
                indels += val

        # The real bases are not needed here, only a sequence of the right
        # length for the SAM -> PSL conversion.
        d1['seq'] = 'N' * total_length
        nline = SamBasics.entry_to_line(d1)
        pline = spcf.convert_line(nline)
        pentry = PSLBasics.line_to_entry(pline)
        gline = PSLBasics.convert_entry_to_genepred_line(pentry)
        gentry = GenePredBasics.line_to_entry(gline)
        gsmooth = GenePredBasics.smooth_gaps(gentry, min_intron_size)

        # Columns after the exon coordinates are identical for every exon.
        tail = [gsmooth['strand'], gsmooth['name'], str(possible_matches),
                str(indels), psec, str(qstart)]
        for start, end in zip(gsmooth['exonStarts'], gsmooth['exonEnds']):
            fields = [gsmooth['chrom'], str(start), str(end)] + tail
            rows.append("\t".join(fields) + "\n")
    return ''.join(rows)
def main():
    """Command-line entry point: print one genepred line per PSL input line.

    The PSL input is read from the file named by the positional argument, or
    from standard input when that argument is ``-``. Every input line is
    parsed into a PSL entry and converted to a genepred line. When
    ``--fill_gaps`` is greater than zero the genepred entry is additionally
    passed through ``GenePredBasics.smooth_gaps`` with that value before
    being printed.
    """
    arg_parser = argparse.ArgumentParser(
        description="Convert a psl file into a target formated genepred file.")
    arg_parser.add_argument(
        '--fill_gaps', type=int, default=0,
        help="Close gaps this size or smaller.")
    arg_parser.add_argument(
        'input_name',
        help="Input PSL file, use - to indicate STDIN.")
    args = arg_parser.parse_args()

    if args.input_name == '-':
        source = sys.stdin
    else:
        source = open(args.input_name)

    with source as psl_lines:
        for raw_line in psl_lines:
            output_line = PSLBasics.convert_entry_to_genepred_line(
                PSLBasics.line_to_entry(raw_line))
            if args.fill_gaps > 0:
                # Round-trip through a genepred entry so gaps can be smoothed.
                smoothed = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(output_line),
                    args.fill_gaps)
                output_line = GenePredBasics.entry_to_line(smoothed)
            print(output_line)
def main():
    """Nudge PSL alignment junctions onto reference junctions and print them.

    Command-line driver. It loads the reference genepred, indexes every
    reference junction by chromosome, then reads the PSL input record by
    record. Each record is converted to a genepred entry, smoothed with
    ``--min_intron_size``, passed to ``nudge`` together with the reference
    junctions of its target chromosome, and printed to stdout. Output is a
    genepred line, or a fake PSL line when ``--output_fake_psl`` supplies a
    reference genome FASTA.

    Lines starting with ``#`` are skipped. Records whose ``tStarts`` or
    ``qStarts`` count differs from the ``blockSizes`` count are reported on
    stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # This positional is loaded with GenePredFile, so it is a genepred file,
    # not a FASTA file as the help text used to say.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)

    # Index junctions as ref[chrom][left exon end][right exon start + 1] = strand.
    # The first transcript that defines a junction decides its strand.
    ref = {}
    for gp in gpf.entries:
        e = gp.entry
        if len(e['exonStarts']) <= 1:
            continue  # single-exon entries have no junctions
        chrom_junctions = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            left = e['exonEnds'][i - 1]
            right = e['exonStarts'][i] + 1
            chrom_junctions.setdefault(left, {}).setdefault(right, e['strand'])
    # Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            line = line.rstrip()
            pe = PSLBasics.line_to_entry(line)
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count or len(pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # Targets absent from the reference get no junctions to nudge to.
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge, genome)
                print(new_psl_line)
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        pf.close()