def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    """Write the old genes that were not hit, then the new genes, to out_file.

    gene_list    -- mapping; every key it yields is looked up as
                    ``gene_list[key]`` to get a feature with a "locs" entry.
    old_bed      -- iterable of rows with an "accn" entry.
    missed_genes -- iterable of ``(hit, qaccn)`` pairs; rows of old_bed whose
                    "accn" equals any ``hit`` are left out of the output.
    out_file     -- path of the file to (over)write.

    Features with more than one loc are passed through ``merge_feats``
    before being written. Each row is formatted with ``Bed.row_string``.
    """
    # The with-block closes (and flushes) the handle even if a row fails.
    with open(out_file, "wb") as merge_fh:
        # A set makes the per-gene membership test O(1) instead of O(n).
        hit_accns = set(hit for hit, qaccn in missed_genes)
        for gene in old_bed:
            if gene["accn"] in hit_accns:
                continue
            merge_fh.write("{0}\n".format(Bed.row_string(gene)))
        for new_gene in gene_list:
            # merge overlapping here
            updated_feat = gene_list[new_gene]
            if len(updated_feat["locs"]) > 1:
                updated_feat = merge_feats(updated_feat)
            merge_fh.write("{0}\n".format(Bed.row_string(updated_feat)))
def write_new_bed(gene_list, old_bed, missed_genes, out_file):
    """Write the old genes that were not hit, then the new genes, to out_file.

    gene_list    -- mapping; every key it yields is looked up as
                    ``gene_list[key]`` to get a feature with a "locs" entry.
    old_bed      -- iterable of rows with an "accn" entry.
    missed_genes -- iterable of ``(hit, qaccn)`` pairs; rows of old_bed whose
                    "accn" equals any ``hit`` are left out of the output.
    out_file     -- path of the file to (over)write.

    Features with more than one loc are passed through ``merge_feats``
    before being written. Each row is formatted with ``Bed.row_string``.
    """
    # The with-block closes (and flushes) the handle even if a row fails.
    with open(out_file, "wb") as merge_fh:
        # A set makes the per-gene membership test O(1) instead of O(n).
        hit_accns = set(hit for hit, qaccn in missed_genes)
        for gene in old_bed:
            if gene["accn"] in hit_accns:
                continue
            merge_fh.write("{0}\n".format(Bed.row_string(gene)))
        for new_gene in gene_list:
            # merge overlapping here
            updated_feat = gene_list[new_gene]
            if len(updated_feat["locs"]) > 1:
                updated_feat = merge_feats(updated_feat)
            merge_fh.write("{0}\n".format(Bed.row_string(updated_feat)))
def print_bed(flist, old_path):
    """Write the distinct records of flist next to old_path and load the result.

    The output path is old_path with ``.with_new`` inserted before its
    extension; the path is also announced on stderr.

    Each record is indexed positionally: [0] seqid, [1] start, [2] end,
    [3] accn, [5] strand, [6] locs. Exact duplicate records are written
    only once. Returns ``Bed(path)`` for the file just written.
    """
    stem, suffix = op.splitext(old_path)
    path = "%s.with_new%s" % (stem, suffix)
    print >> sys.stderr, "writing to: %s.with_new%s" % (stem, suffix)
    out = open(path, 'wb')
    written = set()
    for record in flist:
        # locs is turned into a tuple so the whole record can be hashed
        # for the duplicate check.
        fields = list(record)
        fields[6] = tuple(fields[6])
        record = tuple(fields)
        if record not in written:
            written.add(record)
            row = dict(accn=record[3], start=record[1], end=record[2],
                       seqid=record[0], locs=record[6], score='.',
                       strand=record[5], rgb='.', thickstart='.',
                       thickend=".")
            print >> out, Bed.row_string(row)
    out.close()
    return Bed(path)
def _fold_missed_locs(org_locs, missed_locs):
    """Return a new sorted loc list with every missed loc folded into org_locs.

    A missed loc that overlaps one or more current locs (as reported by
    ``Intersecter.find``) replaces them with a single loc spanning all of
    them; a missed loc that overlaps nothing is added unchanged.
    Neither input list is modified.
    """
    def build_tree(current):
        tree = Intersecter()
        for start, stop in current:
            tree.add_interval(Feature(start, stop))
        return tree

    locs = list(org_locs)
    unmatched = []
    tree = build_tree(locs)
    # The missed locs are only read here, never removed mid-iteration:
    # deleting from a list while looping over it skips the next element.
    for missed_loc in missed_locs:
        missed_start, missed_end = missed_loc
        overlaps = set((hit.start, hit.stop)
                       for hit in tree.find(missed_start, missed_end))
        if not overlaps:
            unmatched.append(missed_loc)
            continue
        locs = [loc for loc in locs if loc not in overlaps]
        overlaps.add((missed_start, missed_end))
        locs.append((min(start for start, end in overlaps),
                     max(end for start, end in overlaps)))
        # Rebuild so later missed locs are compared against the merged
        # loc rather than the locs it just replaced.
        tree = build_tree(locs)
    locs.extend(unmatched)
    locs.sort()
    return locs


def _merged_rows(org_bed, missed):
    """Return the rows of org_bed and missed combined, sorted by (seqid, start)."""
    new_rows = []
    seen_accns = set()
    for row_missed in missed:
        if row_missed['accn'] in seen_accns:
            continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
        except KeyError:
            # accn is unknown to org_bed: it is a new gene, keep it as-is.
            new_rows.append(row_missed)
            seen_accns.add(row_missed['accn'])
            continue
        # accn already exists in org_bed: add the missed locs to that gene.
        locs = _fold_missed_locs(org_bed_row['locs'], row_missed['locs'])
        org_bed_row['locs'] = locs
        # start/end may only grow outwards, never shrink.
        org_bed_row['start'] = min(min(start for start, end in locs),
                                   org_bed_row['start'])
        org_bed_row['end'] = max(max(end for start, end in locs),
                                 org_bed_row['end'])
        new_rows.append(org_bed_row)
        seen_accns.add(org_bed_row['accn'])
    # Every org_bed row that was not touched above is carried over unchanged.
    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns.add(org_bed_rw['accn'])
    new_rows.sort(key=lambda row: (row['seqid'], row['start']))
    return new_rows


def merge(org_bed, missed, merge_file):
    """Merge the rows of missed into org_bed and write the result to merge_file.

    org_bed    -- iterable of rows that also supports ``org_bed.accn(name)``,
                  raising KeyError for an unknown accn.
    missed     -- iterable of rows with 'accn' and 'locs' entries.
    merge_file -- path of the file to (over)write.

    A missed row whose accn exists in org_bed has its locs merged into that
    gene (overlapping locs are fused, 'start'/'end' widened as needed); any
    other missed row is added as a new gene. Only the first missed row per
    accn is used. Rows are written sorted by seqid, then start, one
    ``Bed.row_string`` per line.
    """
    # Opened first, as before, and closed by the with-block even on error.
    with open(merge_file, "w") as merge_fh:
        for row in _merged_rows(org_bed, missed):
            merge_fh.write("{0}\n".format(Bed.row_string(row)))
def _group_same_hits(missed, matches, org_bed):
    """Return a dict of missed rows grouped by (seqid, subject accn).

    matches is the text of the match file: one ``qaccn<TAB>saccn`` pair per
    line. Blank lines are ignored, and a query accn that ``missed.accn``
    does not know (KeyError) is skipped.
    """
    grouped = {}
    for match in matches.split('\n'):
        if not match:
            continue
        qaccn, saccn = match.split('\t')
        try:
            hit = missed.accn(qaccn)
            seqid = hit['seqid']
        except KeyError:
            continue
        key = (seqid, saccn)
        if key not in grouped:
            # First hit for this subject: keep the whole row.
            grouped[key] = hit
            continue
        # Otherwise try to add this hit's locs to the existing gene.
        gene = grouped[key]
        gene_start = min(gene['locs'])[0]
        gene_end = max(gene['locs'])[1]
        missed_start = hit['locs'][0][0]
        missed_end = hit['locs'][0][1]
        if missed_end < gene_start:
            # Hit lies before the gene: merge only when
            # get_intervening_genes reports False for the gap.
            intervening_genes = get_intervening_genes(
                missed_end, gene_start, seqid, org_bed, gene['accn'])
            if intervening_genes is False:
                gene['locs'] = gene['locs'] + hit['locs']
                gene['start'] = missed_start
                if 'Os' in qaccn:
                    gene['accn'] = qaccn
            else:
                # Kept as a separate entry under its own query accn.
                grouped[(seqid, qaccn)] = hit
        elif gene_end < missed_start:
            # Hit lies after the gene: same rule, mirrored.
            intervening_genes = get_intervening_genes(
                gene_end, missed_start, seqid, org_bed, gene['accn'])
            if intervening_genes is False:
                gene['locs'] = gene['locs'] + hit['locs']
                gene['end'] = missed_end
                if 'Os' in qaccn:
                    gene['accn'] = qaccn
            else:
                grouped[(seqid, qaccn)] = hit
        else:
            # Hit overlaps the gene: just add its locs.
            gene['locs'] = gene['locs'] + hit['locs']
    return grouped


def merge_same_hits(missed, fh_match, org_bed):
    """Group genes that hit more than once and write them out.

    missed   -- object whose ``accn(name)`` returns a row with 'seqid',
                'locs', 'start', 'end' and 'accn' entries (KeyError if
                the name is unknown).
    fh_match -- path of a file with one ``qaccn<TAB>saccn`` pair per line.
    org_bed  -- object with a ``path`` attribute; it is also handed to
                ``get_intervening_genes``.

    The grouped rows are written, one ``Bed.row_string`` per line with
    sorted locs, to ``missed_from_<name>`` in the same directory as
    ``org_bed.path``.
    """
    with open(fh_match) as handle:
        matches = handle.read()
    # rpartition keeps a bare file name relative to the working directory;
    # joining an empty directory with '/' would point at the root instead.
    dirc, sep, org = org_bed.path.rpartition('/')
    out_path = '{0}{1}missed_from_{2}'.format(dirc, sep, org)
    with open(out_path, "wb") as fh:
        grouped = _group_same_hits(missed, matches, org_bed)
        for row in grouped.values():
            row['locs'].sort()
            fh.write("{0}\n".format(Bed.row_string(row)))
def merge_flat(new_name, aflat, bflat):
    """Write the union of two flat files to new_name and return it as a Bed.

    Rows are taken from aflat first, then bflat; a row whose
    (seqid, accn) pair was already taken is dropped. The surviving rows
    are sorted by (seqid, start) and written one ``Bed.row_string`` per
    line.
    """
    def position(row):
        return (row['seqid'], row['start'])

    taken = set()
    union = []
    for source in (aflat, bflat):
        for row in source:
            ident = (row['seqid'], row['accn'])
            if ident not in taken:
                taken.add(ident)
                union.append(row)
    union = sorted(union, key=position)
    out = open(new_name, "w")
    for row in union:
        print >> out, Bed.row_string(row)
    out.close()
    return Bed(out.name)
def merge_flat(new_name, aflat, bflat):
    """Write the union of two flat files to new_name and return it as a Bed.

    Rows are taken from aflat first, then bflat; a row whose
    (seqid, accn) pair was already taken is dropped. The surviving rows
    are sorted by (seqid, start) and written one ``Bed.row_string`` per
    line.
    """
    def position(row):
        return (row['seqid'], row['start'])

    taken = set()
    union = []
    for source in (aflat, bflat):
        for row in source:
            ident = (row['seqid'], row['accn'])
            if ident not in taken:
                taken.add(ident)
                union.append(row)
    union = sorted(union, key=position)
    out = open(new_name, "w")
    for row in union:
        print >> out, Bed.row_string(row)
    out.close()
    return Bed(out.name)
def print_bed(flist, old_path):
    """Write the distinct records of flist next to old_path and load the result.

    The output path is old_path with ``.with_new`` inserted before its
    extension; the path is also announced on stderr.

    Each record is indexed positionally: [0] seqid, [1] start, [2] end,
    [3] accn, [5] strand, [6] locs. Exact duplicate records are written
    only once. Returns ``Bed(path)`` for the file just written.
    """
    stem, suffix = op.splitext(old_path)
    path = "%s.with_new%s" % (stem, suffix)
    print >>sys.stderr, "writing to: %s.with_new%s" % (stem, suffix)
    out = open(path, 'wb')
    written = set()
    for record in flist:
        # locs is turned into a tuple so the whole record can be hashed
        # for the duplicate check.
        fields = list(record)
        fields[6] = tuple(fields[6])
        record = tuple(fields)
        if record not in written:
            written.add(record)
            row = dict(accn=record[3], start=record[1], end=record[2],
                       seqid=record[0], locs=record[6], score='.',
                       strand=record[5], rgb='.', thickstart='.',
                       thickend=".")
            print >>out, Bed.row_string(row)
    out.close()
    return Bed(path)
def _fold_missed_locs(org_locs, missed_locs):
    """Return a new sorted loc list with every missed loc folded into org_locs.

    A missed loc that overlaps one or more current locs (as reported by
    ``Intersecter.find``) replaces them with a single loc spanning all of
    them; a missed loc that overlaps nothing is added unchanged.
    Neither input list is modified.
    """
    def build_tree(current):
        tree = Intersecter()
        for start, stop in current:
            tree.add_interval(Feature(start, stop))
        return tree

    locs = list(org_locs)
    unmatched = []
    tree = build_tree(locs)
    # The missed locs are only read here, never removed mid-iteration:
    # deleting from a list while looping over it skips the next element.
    for missed_loc in missed_locs:
        missed_start, missed_end = missed_loc
        overlaps = set((hit.start, hit.stop)
                       for hit in tree.find(missed_start, missed_end))
        if not overlaps:
            unmatched.append(missed_loc)
            continue
        locs = [loc for loc in locs if loc not in overlaps]
        overlaps.add((missed_start, missed_end))
        locs.append((min(start for start, end in overlaps),
                     max(end for start, end in overlaps)))
        # Rebuild so later missed locs are compared against the merged
        # loc rather than the locs it just replaced.
        tree = build_tree(locs)
    locs.extend(unmatched)
    locs.sort()
    return locs


def _merged_rows(org_bed, missed):
    """Return the rows of org_bed and missed combined, sorted by (seqid, start)."""
    new_rows = []
    seen_accns = set()
    for row_missed in missed:
        if row_missed['accn'] in seen_accns:
            continue
        try:
            org_bed_row = org_bed.accn(row_missed['accn'])
        except KeyError:
            # accn is unknown to org_bed: it is a new gene, keep it as-is.
            new_rows.append(row_missed)
            seen_accns.add(row_missed['accn'])
            continue
        # accn already exists in org_bed: add the missed locs to that gene.
        locs = _fold_missed_locs(org_bed_row['locs'], row_missed['locs'])
        org_bed_row['locs'] = locs
        # start/end may only grow outwards, never shrink.
        org_bed_row['start'] = min(min(start for start, end in locs),
                                   org_bed_row['start'])
        org_bed_row['end'] = max(max(end for start, end in locs),
                                 org_bed_row['end'])
        new_rows.append(org_bed_row)
        seen_accns.add(org_bed_row['accn'])
    # Every org_bed row that was not touched above is carried over unchanged.
    for org_bed_rw in org_bed:
        if org_bed_rw['accn'] not in seen_accns:
            new_rows.append(org_bed_rw)
            seen_accns.add(org_bed_rw['accn'])
    new_rows.sort(key=lambda row: (row['seqid'], row['start']))
    return new_rows


def merge(org_bed, missed, merge_file):
    """Merge the rows of missed into org_bed and write the result to merge_file.

    org_bed    -- iterable of rows that also supports ``org_bed.accn(name)``,
                  raising KeyError for an unknown accn.
    missed     -- iterable of rows with 'accn' and 'locs' entries.
    merge_file -- path of the file to (over)write.

    A missed row whose accn exists in org_bed has its locs merged into that
    gene (overlapping locs are fused, 'start'/'end' widened as needed); any
    other missed row is added as a new gene. Only the first missed row per
    accn is used. Rows are written sorted by seqid, then start, one
    ``Bed.row_string`` per line.
    """
    # Opened first, as before, and closed by the with-block even on error.
    with open(merge_file, "w") as merge_fh:
        for row in _merged_rows(org_bed, missed):
            merge_fh.write("{0}\n".format(Bed.row_string(row)))
def _group_same_hits(missed, matches, org_bed):
    """Return a dict of missed rows grouped by (seqid, subject accn).

    matches is the text of the match file: one ``qaccn<TAB>saccn`` pair per
    line. Blank lines are ignored, and a query accn that ``missed.accn``
    does not know (KeyError) is skipped.
    """
    grouped = {}
    for match in matches.split('\n'):
        if not match:
            continue
        qaccn, saccn = match.split('\t')
        try:
            hit = missed.accn(qaccn)
            seqid = hit['seqid']
        except KeyError:
            continue
        key = (seqid, saccn)
        if key not in grouped:
            # First hit for this subject: keep the whole row.
            grouped[key] = hit
            continue
        # Otherwise try to add this hit's locs to the existing gene.
        gene = grouped[key]
        gene_start = min(gene['locs'])[0]
        gene_end = max(gene['locs'])[1]
        missed_start = hit['locs'][0][0]
        missed_end = hit['locs'][0][1]
        if missed_end < gene_start:
            # Hit lies before the gene: merge only when
            # get_intervening_genes reports False for the gap.
            intervening_genes = get_intervening_genes(
                missed_end, gene_start, seqid, org_bed, gene['accn'])
            if intervening_genes is False:
                gene['locs'] = gene['locs'] + hit['locs']
                gene['start'] = missed_start
                if 'Os' in qaccn:
                    gene['accn'] = qaccn
            else:
                # Kept as a separate entry under its own query accn.
                grouped[(seqid, qaccn)] = hit
        elif gene_end < missed_start:
            # Hit lies after the gene: same rule, mirrored.
            intervening_genes = get_intervening_genes(
                gene_end, missed_start, seqid, org_bed, gene['accn'])
            if intervening_genes is False:
                gene['locs'] = gene['locs'] + hit['locs']
                gene['end'] = missed_end
                if 'Os' in qaccn:
                    gene['accn'] = qaccn
            else:
                grouped[(seqid, qaccn)] = hit
        else:
            # Hit overlaps the gene: just add its locs.
            gene['locs'] = gene['locs'] + hit['locs']
    return grouped


def merge_same_hits(missed, fh_match, org_bed):
    """Group genes that hit more than once and write them out.

    missed   -- object whose ``accn(name)`` returns a row with 'seqid',
                'locs', 'start', 'end' and 'accn' entries (KeyError if
                the name is unknown).
    fh_match -- path of a file with one ``qaccn<TAB>saccn`` pair per line.
    org_bed  -- object with a ``path`` attribute; it is also handed to
                ``get_intervening_genes``.

    The grouped rows are written, one ``Bed.row_string`` per line with
    sorted locs, to ``missed_from_<name>`` in the same directory as
    ``org_bed.path``.
    """
    with open(fh_match) as handle:
        matches = handle.read()
    # rpartition keeps a bare file name relative to the working directory;
    # joining an empty directory with '/' would point at the root instead.
    dirc, sep, org = org_bed.path.rpartition('/')
    out_path = '{0}{1}missed_from_{2}'.format(dirc, sep, org)
    with open(out_path, "wb") as fh:
        grouped = _group_same_hits(missed, matches, org_bed)
        for row in grouped.values():
            row['locs'].sort()
            fh.write("{0}\n".format(Bed.row_string(row)))
def write_bed(gene, merge_fh):
    """Write gene to merge_fh as one newline-terminated ``Bed.row_string`` row."""
    merge_fh.write("{0}\n".format(Bed.row_string(gene)))
def write_bed(gene, merge_fh):
    """Write gene to merge_fh as one newline-terminated ``Bed.row_string`` row."""
    merge_fh.write("{0}\n".format(Bed.row_string(gene)))