def parse_refgpd(tdir, geneprednames, simplenames):
    """Flatten the reference genePred files into per-exon BED files.

    For the Nth file in ``geneprednames`` (1-based) this writes
    ``<tdir>/reference.N.bed`` with one row per exon, and adds one row per
    transcript to the shared index ``<tdir>/entries.txt``.  Entry numbers run
    continuously across all files; lines starting with ``#`` are skipped.

    Args:
        tdir: directory that receives the output files.
        geneprednames: paths of the reference genePred files.
        simplenames: labels, parallel to ``geneprednames``, written to the
            second column of ``entries.txt``.
    """
    entry_number = 0
    # ``with`` / ``finally`` guarantee every handle is released even when a
    # malformed line makes the parser raise half way through a file.
    with open(tdir + "/entries.txt", 'w') as of_entries:
        for column_number, gpd_name in enumerate(geneprednames, 1):
            bed_path = tdir + "/reference." + str(column_number) + ".bed"
            gfr = FileBasics.GenericFileReader(gpd_name)
            try:
                with open(bed_path, 'w') as of_ref:
                    while True:
                        line = gfr.readline()
                        if not line:
                            break
                        if line.startswith('#'):
                            continue
                        entry_number += 1
                        entry = GenePredBasics.line_to_entry(line.rstrip("\n"))
                        starts = entry['exonStarts']
                        ends = entry['exonEnds']
                        exon_count = str(len(starts))

                        # Build the exon rows and the summed exon length in
                        # a single pass over the exons.
                        entry_length = 0
                        exon_rows = []
                        for i, start in enumerate(starts):
                            end = ends[i]
                            entry_length += end - start
                            exon_rows.append("\t".join([
                                entry['chrom'],
                                str(start),
                                str(end),
                                str(entry_number),
                                entry['gene_name'],
                                entry['name'],
                                exon_count,
                                entry['strand'],
                                str(i + 1),  # 1-based exon number
                            ]) + "\n")

                        of_entries.write("\t".join([
                            str(column_number),
                            simplenames[column_number - 1],
                            str(entry_number),
                            entry['gene_name'],
                            entry['name'],
                            str(entry_length),
                        ]) + "\n")
                        of_ref.writelines(exon_rows)
            finally:
                gfr.close()
def parse_gpdfile(tdir, gpdfile, smoothing_factor):
    """Copy the long-read genePred into ``<tdir>/longreads.gpd``.

    Each non-comment line is parsed, passed through
    ``GenePredBasics.smooth_gaps`` with ``smoothing_factor``, and written
    back with its ``name`` replaced by the read's 1-based ordinal.  A
    warning goes to stderr whenever an original read name repeats.

    Args:
        tdir: directory that receives ``longreads.gpd``.
        gpdfile: path of the input genePred, or ``'-'`` to read stdin.
        smoothing_factor: forwarded unchanged to ``smooth_gaps``.
    """
    if gpdfile != '-':
        fr = FileBasics.GenericFileReader(gpdfile)
    else:
        fr = sys.stdin
    seennames = set()
    longreadnumber = 0
    try:
        with open(tdir + '/longreads.gpd', 'w') as of_gpd:
            while True:
                line = fr.readline()
                if not line:
                    break
                if line.startswith('#'):  # skip comments
                    continue
                longreadnumber += 1
                entry = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(line.rstrip()),
                    smoothing_factor)
                readname = entry['name']
                if readname in seennames:
                    sys.stderr.write(
                        "Warning: repeat name '" + readname + "'\n")
                # Record the name; without this the check above could never
                # fire because the collection stayed empty.
                seennames.add(readname)
                # Downstream steps identify reads by their ordinal.
                entry['name'] = str(longreadnumber)
                of_gpd.write(GenePredBasics.entry_to_line(entry) + "\n")
    finally:
        # Same close-on-exit behaviour as before (stdin included), but now
        # also reached when parsing raises.
        fr.close()
def get_exons_from_seqs(seqs, d, spcf):
    """Render every alignment in ``seqs`` as tab-separated exon rows.

    Each element of ``seqs`` is indexed as ``[1]`` reference name, ``[2]``
    strand, ``[3]`` position and ``[4]`` CIGAR string.  A copy of the SAM
    template ``d`` is filled from those values with a placeholder all-``N``
    sequence, then converted SAM -> PSL (``spcf.convert_line``) -> genePred
    and smoothed with a gap size of 68.

    One row is produced per resulting exon:
    chrom, exon start, exon end, strand, name, M bases, indel bases,
    ``P``/``S`` (first alignment is ``P``, all later ones ``S``), and the
    size of a leading soft/hard clip (0 when there is none).

    Returns:
        All rows concatenated into one string; ``''`` when ``seqs`` is empty.
    """
    not_in_query = set(['H', 'D', 'N'])
    rows = []
    for rank, aln in enumerate(seqs, 1):
        label = 'P' if rank == 1 else 'S'  # primary or secondary

        sam = d.copy()
        sam['rname'] = aln[1]
        sam['flag'] = 0 if aln[2] == '+' else 16
        sam['pos'] = aln[3]
        sam['cigar'] = aln[4]
        cigar = SamBasics.parse_cigar(aln[4])
        sam['cigar_array'] = cigar

        # A leading clip gives the offset of the alignment in the query.
        lead = cigar[0]
        clip = lead['val'] if lead['op'] in ('S', 'H') else 0

        query_len = 0
        match_bases = 0
        indel_bases = 0
        for step in cigar:
            op = step['op']
            if op not in not_in_query:
                query_len += step['val']
            if op == 'M':
                match_bases += step['val']
            elif op == 'I':
                indel_bases += step['val']
            elif op == 'D' and step['val'] < 68:
                # Only deletions shorter than 68 are counted as indels.
                indel_bases += step['val']

        # Only the length of the sequence is supplied, as a run of Ns.
        sam['seq'] = 'N' * query_len

        psl_line = spcf.convert_line(SamBasics.entry_to_line(sam))
        gpd_line = PSLBasics.convert_entry_to_genepred_line(
            PSLBasics.line_to_entry(psl_line))
        smoothed = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(gpd_line), 68)

        for idx in range(len(smoothed['exonStarts'])):
            rows.append(
                smoothed['chrom'] + "\t"
                + str(smoothed['exonStarts'][idx]) + "\t"
                + str(smoothed['exonEnds'][idx]) + "\t"
                + smoothed['strand'] + "\t"
                + smoothed['name'] + "\t"
                + str(match_bases) + "\t"
                + str(indel_bases) + "\t"
                + label + "\t"
                + str(clip) + "\n")
    return ''.join(rows)
def main():
    """Nudge PSL alignment junctions onto nearby reference junctions.

    Reads the reference genePred, indexes every intron as
    ``ref[chrom][donor][acceptor] = strand`` (1-based coordinates; the first
    strand seen for a junction is kept), then converts each PSL line to a
    smoothed genePred entry, passes it through ``nudge`` and prints the
    result -- as a fake PSL line when ``--output_fake_psl`` is given,
    otherwise as a genePred line.  PSL lines whose block arrays disagree in
    length are reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    # Kept so existing command lines still parse; nothing below reads it.
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # This positional is a genePred file, not a FASTA file.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Index the reference junctions by chromosome, all stored 1-based.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for record in gpf.entries:
        e = record.entry
        if len(e['exonStarts']) <= 1:
            continue
        chrom_juns = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            donor = e['exonEnds'][i - 1]
            acceptor = e['exonStarts'][i] + 1
            chrom_juns.setdefault(donor, {}).setdefault(acceptor, e['strand'])

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count or len(pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # The reader was previously never closed.
        pf.close()
def main():
    """Smooth small gaps in a genePred file and print the result.

    Every input line is parsed with ``GenePredBasics.line_to_entry``, passed
    through ``GenePredBasics.smooth_gaps`` with ``--smoothing_size`` and
    printed to stdout as a genePred line.  Input comes from the named file,
    or from stdin when the argument is ``-``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size', type=int, default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()

    from_stdin = args.genepred == '-'
    inf = sys.stdin if from_stdin else open(args.genepred)
    try:
        for line in inf:
            e = GenePredBasics.line_to_entry(line)
            e2 = GenePredBasics.smooth_gaps(e, args.smoothing_size)
            print(GenePredBasics.entry_to_line(e2))
    finally:
        # Release the file we opened; stdin is left alone, as before.
        if not from_stdin:
            inf.close()
def get_exons_from_seqs(seqs, d, spcf):
    """Build the exon table text for a read's alignments.

    ``d`` is a SAM-entry template; for each item of ``seqs`` a copy is
    filled from ``item[1]`` (reference name), ``item[2]`` (strand, mapped to
    flag 0 for ``'+'`` and 16 otherwise), ``item[3]`` (position) and
    ``item[4]`` (CIGAR).  The entry is serialised, converted to PSL by
    ``spcf.convert_line``, turned into a genePred entry and smoothed with a
    gap size of 68.

    Returns:
        One ``\\n``-terminated, tab-separated row per exon of every
        alignment -- chrom, start, end, strand, name, summed ``M`` lengths,
        summed indel lengths, ``P`` for the first alignment or ``S`` for
        the rest, and the leading clip length -- joined into one string.
        The result is ``''`` when ``seqs`` is empty.
    """
    excluded_ops = set(['H', 'D', 'N'])
    pieces = []
    position = 0
    for item in seqs:
        position += 1
        if position > 1:
            kind = 'S'  # secondary
        else:
            kind = 'P'  # primary

        record = d.copy()
        record['rname'] = item[1]
        if item[2] == '+':
            record['flag'] = 0
        else:
            record['flag'] = 16
        record['pos'] = item[3]
        record['cigar'] = item[4]
        record['cigar_array'] = SamBasics.parse_cigar(item[4])
        ops = record['cigar_array']

        # Length of a leading soft or hard clip, if any.
        offset = 0
        if ops[0]['op'] == 'S' or ops[0]['op'] == 'H':
            offset = ops[0]['val']

        seq_len = 0
        n_match = 0
        n_indel = 0
        for element in ops:
            code = element['op']
            if code not in excluded_ops:
                seq_len += element['val']
            if code == 'M':
                n_match += element['val']
            elif code == 'I' or (code == 'D' and element['val'] < 68):
                n_indel += element['val']

        record['seq'] = 'N' * seq_len  # placeholder of the right length

        sam_text = SamBasics.entry_to_line(record)
        psl_entry = PSLBasics.line_to_entry(spcf.convert_line(sam_text))
        gpd_text = PSLBasics.convert_entry_to_genepred_line(psl_entry)
        gpd = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(gpd_text), 68)

        exon_total = len(gpd['exonStarts'])
        exon_index = 0
        while exon_index < exon_total:
            columns = [
                gpd['chrom'],
                str(gpd['exonStarts'][exon_index]),
                str(gpd['exonEnds'][exon_index]),
                gpd['strand'],
                gpd['name'],
                str(n_match),
                str(n_indel),
                kind,
                str(offset),
            ]
            pieces.append("\t".join(columns) + "\n")
            exon_index += 1
    return ''.join(pieces)
def main():
    """Command-line entry point: smooth gaps in genePred entries.

    Reads genePred lines from the file named on the command line (or stdin
    for ``-``), applies ``GenePredBasics.smooth_gaps`` with
    ``--smoothing_size`` to each parsed entry and prints the resulting
    genePred line to stdout.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size',
                        type=int,
                        default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()

    if args.genepred != '-':
        inf = open(args.genepred)
        opened_here = True
    else:
        inf = sys.stdin
        opened_here = False
    try:
        for line in inf:
            smoothed = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(line), args.smoothing_size)
            print(GenePredBasics.entry_to_line(smoothed))
    finally:
        # The opened file used to be left open; stdin is still not closed.
        if opened_here:
            inf.close()
def read_from_fasta_and_genepred(self, genomefastafile, genepredfile):
    """Load transcript sequences from a genome FASTA and a genePred file.

    The genePred is read first.  Lines starting with ``#`` are skipped.  An
    entry whose name AND coordinates (chrom, strand, exon starts, exon ends)
    were both seen on the same earlier entry is a perfect duplicate and is
    dropped.  An entry that only reuses a name is kept under
    ``<name>.<k>``, where ``k`` is one more than the number of names already
    issued for it.  Warnings for these cases, and for coordinates shared
    with any earlier entry, are written to stderr.

    For each kept entry whose chromosome is present in the FASTA, the exon
    slices are concatenated (reverse-complemented via ``rc`` on the ``-``
    strand) and stored in ``self.transcripts[key]``, and the entry's
    original name is stored in ``self.transcript_names[key]``.  Entries on
    chromosomes missing from the FASTA are silently skipped.

    Args:
        genomefastafile: path passed to ``read_fasta_into_hash``.
        genepredfile: path of the genePred annotation.
    """
    names_issued = {}   # original name -> list of keys handed out for it
    coords_seen = set()
    pairs_seen = set()  # (name, coordinates) of every kept entry
    genepred = {}
    with open(genepredfile) as inf:
        for line in inf:
            if line.startswith('#'):
                continue
            e = GenePredBasics.line_to_entry(line)
            name = e['name']
            coord_key = (e['chrom'], e['strand'],
                         str(e['exonStarts']), str(e['exonEnds']))
            if coord_key in coords_seen:
                sys.stderr.write("Warning " + name + " " + e['gene_name'] + " exists at identical coordinates as another entry\n")
            coords_seen.add(coord_key)

            # Only the same name at the same coordinates is a perfect
            # duplicate.  Testing "name seen" and "coordinates seen"
            # independently also dropped same-named transcripts that merely
            # share coordinates with a differently named entry.
            if (name, coord_key) in pairs_seen:
                sys.stderr.write("skipping perfect duplicate of " + name + "\n")
                continue
            pairs_seen.add((name, coord_key))

            currname = name
            if name in names_issued:
                currname = name + "." + str(len(names_issued[name]) + 1)
                names_issued[name].append(currname)
                sys.stderr.write("Warning " + name + " " + e['gene_name'] + " is a duplicate name.. renaming to " + currname + "\n")
            else:
                names_issued[name] = [name]
            genepred[currname] = e

    ref = read_fasta_into_hash(genomefastafile)
    for transcript in genepred:
        e = genepred[transcript]
        if e['chrom'] not in ref:
            continue
        chrom_seq = ref[e['chrom']]
        self.transcript_names[transcript] = e['name']
        seq = ''.join([chrom_seq[e['exonStarts'][i]:e['exonEnds'][i]]
                       for i in range(0, e['exonCount'])])
        if e['strand'] == '-':
            seq = rc(seq)
        self.transcripts[transcript] = seq
def main():
    """Print a genePred line for every line of a PSL file.

    The PSL comes from the file named by ``input_name``, or from stdin when
    that argument is ``-``.  When ``--fill_gaps`` is greater than zero, each
    converted entry is additionally passed through
    ``GenePredBasics.smooth_gaps`` with that value before printing.
    """
    parser = argparse.ArgumentParser(
        description="Convert a psl file into a target formated genepred file.")
    parser.add_argument('--fill_gaps', type=int, default=0,
                        help="Close gaps this size or smaller.")
    parser.add_argument('input_name',
                        help="Input PSL file, use - to indicate STDIN.")
    args = parser.parse_args()

    if args.input_name == '-':
        source = sys.stdin
    else:
        source = open(args.input_name)

    with source as infile:
        for raw in infile:
            gpd_line = PSLBasics.convert_entry_to_genepred_line(
                PSLBasics.line_to_entry(raw))
            if args.fill_gaps > 0:
                # Round-trip through an entry so the gaps can be closed.
                smoothed = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(gpd_line), args.fill_gaps)
                gpd_line = GenePredBasics.entry_to_line(smoothed)
            print(gpd_line)
def main():
    """Convert genePred lines into minimal 21-column PSL-style records.

    Input comes from ``input_file`` or from stdin when it is ``-``.  For
    each entry the exon lengths are summed; that total fills columns 1, 11
    and 13, columns 2-8 and 12 are zero, and the block lists are derived
    from the exon coordinates.  Column 15 is filled with the last exon end
    (the same value as column 17).  One record is printed per input line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', help="use - for STDIN")
    args = parser.parse_args()

    if args.input_file == '-':
        inf = sys.stdin
    else:
        inf = open(args.input_file)

    for raw in inf:
        gpd = GenePredBasics.line_to_entry(raw.rstrip())
        starts = gpd['exonStarts']
        ends = gpd['exonEnds']

        # Block sizes, plus each block's offset within the joined exons.
        sizes = [ends[k] - starts[k] for k in range(len(starts))]
        offsets = []
        covered = 0
        for size in sizes:
            offsets.append(covered)
            covered += size

        columns = [
            str(covered),                               # 1
            "0", "0", "0", "0", "0", "0", "0",          # 2-8
            gpd['strand'],                              # 9
            gpd['name'],                                # 10
            str(covered),                               # 11
            "0",                                        # 12
            str(covered),                               # 13
            str(gpd['chrom']),                          # 14
            str(ends[-1]),                              # 15
            str(starts[0]),                             # 16
            str(ends[-1]),                              # 17
            str(len(starts)),                           # 18
            ','.join([str(s) for s in sizes]) + ',',    # 19
            ','.join([str(o) for o in offsets]) + ',',  # 20
            ','.join([str(s) for s in starts]) + ',',   # 21
        ]
        print("\t".join(columns))
    inf.close()
def main():
    """Emit one 21-column PSL-style line per genePred entry.

    Reads ``input_file`` (stdin for ``-``).  The summed exon length is used
    for columns 1, 11 and 13; columns 2-8 and 12 are ``0``; column 15
    repeats the last exon end; columns 19-21 are comma-terminated lists of
    exon sizes, cumulative offsets and exon starts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', help="use - for STDIN")
    args = parser.parse_args()
    inf = open(args.input_file) if args.input_file != '-' else sys.stdin

    def comma_list(values):
        # Every list column is written with a trailing comma.
        return ','.join([str(v) for v in values]) + ','

    for text in inf:
        e = GenePredBasics.line_to_entry(text.rstrip())
        exon_total = len(e['exonStarts'])
        lengths = []
        running = []
        total = 0
        for idx in range(0, exon_total):
            span = e['exonEnds'][idx] - e['exonStarts'][idx]
            running.append(total)  # offset of this block before adding it
            total += span
            lengths.append(span)

        record = [str(total)]                 # 1
        record.extend(["0"] * 7)              # 2-8
        record.append(e['strand'])            # 9
        record.append(e['name'])              # 10
        record.append(str(total))             # 11
        record.append("0")                    # 12
        record.append(str(total))             # 13
        record.append(str(e['chrom']))        # 14
        record.append(str(e['exonEnds'][-1]))     # 15
        record.append(str(e['exonStarts'][0]))    # 16
        record.append(str(e['exonEnds'][-1]))     # 17
        record.append(str(exon_total))        # 18
        record.append(comma_list(lengths))    # 19
        record.append(comma_list(running))    # 20
        record.append(comma_list(e['exonStarts']))  # 21
        print("\t".join(record))
    inf.close()
def break_gpdfile(tdir, job_size):
    """Split ``<tdir>/longreads.gpd`` into per-job exon BED files.

    The genePred is read in chunks of ``job_size`` bytes through
    ``BigFileBasics.BigFileChunkReader``.  Chunk ``k`` (0-based) becomes
    ``<tdir>/partreads.<k+1>.bed`` with one row per exon: chrom, exon start,
    exon end, name, gene name, exon count, strand and 1-based exon number.

    Args:
        tdir: working directory holding ``longreads.gpd``.
        job_size: chunk size in bytes, passed to ``set_chunk_size_bytes``.

    Returns:
        The number of chunks, which is also the number of BED files written.
    """
    bfcr = BigFileBasics.BigFileChunkReader(tdir + '/longreads.gpd')
    bfcr.set_chunk_size_bytes(job_size)
    num_jobs = bfcr.chunk_count
    for chunk_index in range(0, num_jobs):
        oc = bfcr.open_chunk(chunk_index)
        bed_path = tdir + '/partreads.' + str(chunk_index + 1) + '.bed'
        # ``with`` / ``finally`` release both handles even if a line fails
        # to parse part way through a chunk.
        try:
            with open(bed_path, 'w') as of_bed:
                while True:
                    line = oc.read_line()
                    if not line:
                        break
                    entry = GenePredBasics.line_to_entry(line.rstrip("\n"))
                    starts = entry['exonStarts']
                    ends = entry['exonEnds']
                    exon_count = str(len(starts))
                    # The exon loop has its own index; it used to reuse the
                    # chunk loop's variable name.
                    for exon_index, start in enumerate(starts):
                        of_bed.write("\t".join([
                            entry['chrom'],
                            str(start),
                            str(ends[exon_index]),
                            entry['name'],
                            entry['gene_name'],
                            exon_count,
                            entry['strand'],
                            str(exon_index + 1),
                        ]) + "\n")
        finally:
            oc.close()
    return num_jobs
def nudge(psl_entry, gpd_entry, refjun, args):
    """Snap the junctions of ``gpd_entry`` onto nearby reference junctions.

    Each intron of the entry is expressed as a 1-based ``[donor, acceptor]``
    pair (previous exon end, next exon start + 1).  For each one, the
    reference junction whose two ends both lie within ``args.search_size``
    and whose summed end distance is smallest is selected; ties keep the
    first hit found scanning upward.  The matched junctions vote for a
    strand, weighted by ``1 / (distance + 1)``, and only matches on the
    winning strand replace the original junctions.

    Args:
        psl_entry: unused; kept so existing callers keep working.
        gpd_entry: genePred entry dict; left unmodified.
        refjun: ``{donor: {acceptor: strand}}`` for the entry's chromosome.
        args: object providing the integer attribute ``search_size``.

    Returns:
        ``gpd_entry`` itself when it has no introns or nothing matched;
        otherwise a new entry parsed from a rebuilt genePred line.  The new
        entry keeps the original strand, txStart/txEnd and cdsStart/cdsEnd.
    """
    exon_starts = gpd_entry['exonStarts']
    exon_ends = gpd_entry['exonEnds']
    bounds = [[exon_ends[i - 1], exon_starts[i] + 1]
              for i in range(1, len(exon_starts))]
    if not bounds:
        return gpd_entry

    window = args.search_size
    # Each hit: [ref_donor, ref_acceptor, strand, intron_index, d1, d2]
    bestbounds = []
    for intron_index, (donor, acceptor) in enumerate(bounds):
        best = None
        best_total = None
        for z1 in range(donor - window, donor + window + 1):
            if z1 not in refjun:
                continue
            acceptors = refjun[z1]
            d1 = abs(z1 - donor)
            # Symmetric window, matching the donor side.  The upper bound
            # previously added search_size twice, accepting acceptors up to
            # 2 * search_size downstream.
            for z2 in range(acceptor - window, acceptor + window + 1):
                if z2 not in acceptors:
                    continue
                d2 = abs(z2 - acceptor)
                if best is None or d1 + d2 < best_total:
                    best_total = d1 + d2
                    best = [z1, z2, acceptors[z2], intron_index, d1, d2]
        if best:
            bestbounds.append(best)
    if not bestbounds:
        return gpd_entry  # nothing fixable

    # Pick a strand; closer matches carry more weight.
    plus_score = 0.0
    minus_score = 0.0
    for hit in bestbounds:
        weight = 1.0 / (hit[4] + hit[5] + 1)
        if hit[2] == '+':
            plus_score += weight
        else:
            minus_score += weight
    use_strand = '-' if plus_score < minus_score else '+'

    replacements = {}
    for hit in bestbounds:
        if hit[2] == use_strand:
            replacements[hit[3]] = [hit[0], hit[1]]
    if not replacements:
        # Reachable only when a reference strand is neither '+' nor '-'.
        # Report on stderr with a failing status so stdout stays clean.
        sys.stderr.write("ERROR should have choices\n")
        sys.exit(1)

    junctions = [replacements.get(i, bounds[i]) for i in range(len(bounds))]

    new_starts = [gpd_entry['txStart']] + [junc[1] - 1 for junc in junctions]
    new_ends = [junc[0] for junc in junctions] + [gpd_entry['txEnd']]
    fields = [
        gpd_entry['gene_name'],
        gpd_entry['name'],
        gpd_entry['chrom'],
        gpd_entry['strand'],
        str(gpd_entry['txStart']),
        str(gpd_entry['txEnd']),
        str(gpd_entry['cdsStart']),
        str(gpd_entry['cdsEnd']),
        str(len(junctions) + 1),
        ','.join([str(x) for x in new_starts]) + ',',
        ','.join([str(x) for x in new_ends]) + ',',
    ]
    # The rebuilt line keeps its trailing tab, as before.
    return GenePredBasics.line_to_entry("\t".join(fields) + "\t")
def main():
    """Command-line entry point: pull alignment junctions onto a reference.

    Steps:
      1. Optionally load the genome named by ``--output_fake_psl``.
      2. Read the reference genePred and record every intron as
         ``ref[chrom][donor][acceptor] = strand`` (1-based; the first
         strand recorded for a junction wins).
      3. For each non-comment PSL line, convert it to a genePred entry,
         smooth gaps with ``--min_intron_size``, run ``nudge`` and print the
         entry as a fake PSL line (with ``--output_fake_psl``) or as a
         genePred line.

    PSL lines whose ``tStarts``/``qStarts``/``blockSizes`` lengths disagree
    are reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    # Parsed but not consulted here; kept for command-line compatibility.
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # The help text used to call this genePred argument a FASTA file.
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first, grouping junctions by chromosome.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        exon_total = len(e['exonStarts'])
        if exon_total <= 1:
            continue
        by_donor = ref.setdefault(e['chrom'], {})
        for i in range(1, exon_total):
            by_acceptor = by_donor.setdefault(e['exonEnds'][i - 1], {})
            # Stored all junctions as 1-base; keep the first strand seen.
            by_acceptor.setdefault(e['exonStarts'][i] + 1, e['strand'])

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            if (len(pe['tStarts']) != len(pe['blockSizes'])
                    or len(pe['qStarts']) != len(pe['blockSizes'])):
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            refjuns = ref[pe['tName']] if pe['tName'] in ref else {}
            new_ge = nudge(pe, ge, refjuns, args)
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # Close the PSL reader on every exit path; it used to be left open.
        pf.close()
def nudge(psl_entry, gpd_entry, refjun, args):
    """Replace junctions of a genePred entry with close reference junctions.

    Introns are taken as 1-based ``(previous exon end, next exon start + 1)``
    pairs.  For every intron the reference junction minimising the summed
    distance of both ends is searched within ``+/- args.search_size`` of
    each end; on ties the first candidate in ascending scan order is kept.
    Matches then vote for a strand with weight ``1 / (d1 + d2 + 1)``, and
    only matches on the winning strand are applied.

    Args:
        psl_entry: not used by this function; retained for callers.
        gpd_entry: genePred entry dict; not modified.
        refjun: mapping ``donor -> {acceptor: strand}`` for one chromosome.
        args: namespace exposing ``search_size`` (int).

    Returns:
        The same ``gpd_entry`` object when there are no introns or no
        reference junction is close enough; otherwise a fresh entry from
        ``GenePredBasics.line_to_entry`` with the chosen junctions
        substituted.  Strand, tx and cds fields come from the input entry.
    """
    starts = gpd_entry['exonStarts']
    ends = gpd_entry['exonEnds']
    if len(starts) <= 1:
        return gpd_entry  # no introns

    reach = args.search_size
    bounds = []
    for i in range(1, len(starts)):
        bounds.append([ends[i - 1], starts[i] + 1, i - 1])

    bestbounds = []
    for junc_start, junc_finish, intron in bounds:
        best_sum = None
        best_result = None
        for z1 in range(junc_start - reach, junc_start + reach + 1):
            if z1 not in refjun:
                continue
            d1 = abs(z1 - junc_start)
            # The acceptor window is symmetric like the donor window; its
            # upper limit used to include search_size twice.
            for z2 in range(junc_finish - reach, junc_finish + reach + 1):
                if z2 not in refjun[z1]:
                    continue
                d2 = abs(z2 - junc_finish)
                if best_sum is None or d1 + d2 < best_sum:
                    best_sum = d1 + d2
                    best_result = [z1, z2, refjun[z1][z2], intron, d1, d2]
        if best_result:
            bestbounds.append(best_result)
    if len(bestbounds) < 1:
        return gpd_entry  # nothing fixable

    # Now we have a list of nudgable bounds; pick a strand.
    scores = {'+': 0.0, '-': 0.0}
    for bound in bestbounds:
        # Any strand other than '+' is counted on the minus side.
        side = '+' if bound[2] == '+' else '-'
        scores[side] += 1.0 / (bound[4] + bound[5] + 1)
    use_strand = '+'
    if scores['+'] < scores['-']:
        use_strand = '-'

    choice_bounds = [b for b in bestbounds if b[2] == use_strand]
    if len(choice_bounds) < 1:
        # Only possible when a reference strand is neither '+' nor '-'.
        # Errors belong on stderr, with a non-zero exit status.
        sys.stderr.write("ERROR should have choices\n")
        sys.exit(1)

    replacements = dict((b[3], [b[0], b[1]]) for b in choice_bounds)
    junctions = []
    for i, original in enumerate(bounds):
        chosen = replacements[i] if i in replacements else original
        junctions.append([chosen[0], chosen[1]])

    exon_starts = [gpd_entry['txStart']]
    exon_ends = []
    for junc in junctions:
        exon_starts.append(junc[1] - 1)  # back to 0-based exon start
        exon_ends.append(junc[0])
    exon_ends.append(gpd_entry['txEnd'])

    columns = [
        gpd_entry['gene_name'],
        gpd_entry['name'],
        gpd_entry['chrom'],
        gpd_entry['strand'],
        str(gpd_entry['txStart']),
        str(gpd_entry['txEnd']),
        str(gpd_entry['cdsStart']),
        str(gpd_entry['cdsEnd']),
        str(len(junctions) + 1),
        ','.join([str(x) for x in exon_starts]) + ',',
        ','.join([str(x) for x in exon_ends]) + ',',
    ]
    # Every column, including the last, is followed by a tab as before.
    new_gpd_line = ''.join([column + "\t" for column in columns])
    return GenePredBasics.line_to_entry(new_gpd_line)