def kmer_homology(self, k=10, span=100):
    """Number of shared k-mers within "span" distance on either side of vertex positions.

    Args:
        k: length of the k-mers that are compared.
        span: number of positions taken on each side of each vertex position.

    Returns:
        The count of distinct k-mers that occur in both windows.
    """
    def window_kmers(vertex, strand):
        # The window is clamped to [1, chromosome length].
        window = hg.interval(
            vertex.chrom,
            max(1, vertex.pos - span),
            min(vertex.pos + span, hg.chrLen[hg.chrNum(vertex.chrom)]),
            strand)
        seq = ''.join([base.capitalize() for base in window.sequence()])
        # Slice with k (the previous code always sliced 10 characters, so any
        # non-default k produced a mix of wrong-length substrings).
        return set(seq[i:i + k] for i in range(len(seq) - k + 1))

    # The second window is requested on the strand opposite to v2's strand.
    kmers1 = window_kmers(self.v1, self.v1.strand)
    kmers2 = window_kmers(self.v2, -1 * self.v2.strand)
    return len(kmers1.intersection(kmers2))
def read_cycle_file(cycle_file):
    """Parse a tab-separated cycles file.

    Lines whose first field is ``Interval`` or ``Segment`` are turned into
    ``hg19.interval`` objects keyed by their second field; the chromosome
    (third field) gets a ``chr`` prefix when it does not already contain
    ``chr``.  Lines whose first field contains ``Cycle`` are split on ``;``
    into cycle name, copy count and the comma-separated segment list.

    Args:
        cycle_file: path of the file to read.

    Returns:
        A tuple ``(segment_map, interval_map, cycle_map)`` where ``cycle_map``
        maps the cycle name to ``{'cycle': [...], 'copy_count': float}``.
    """
    def make_interval(res):
        chrom = res[2] if 'chr' in res[2] else "chr%s" % res[2]
        return hg19.interval(chrom, int(res[3]), int(res[4]), info={'line': res})

    segment_map = {}
    interval_map = {}
    cycle_map = {}
    # The context manager closes the file even when a line fails to parse.
    with open(cycle_file) as infile:
        for line in infile:
            res = line.strip().split('\t')
            if res[0] == 'Interval':
                interval_map[res[1]] = make_interval(res)
            elif res[0] == 'Segment':
                segment_map[res[1]] = make_interval(res)
            elif 'Cycle' in res[0]:
                fields = res[0].split(';')
                cycle_name = fields[0].split('=')[-1]
                copy_count = float(fields[1].split('=')[-1])
                cycle = fields[2].split('=')[-1].split(',')
                # Sometimes a source/sink cycle will not start/end with 0, but
                # have the 0 within the cycle.  Rotate the result such that 0s
                # always start/end the cycle in those cases.  A cycle without a
                # '0+' entry is left as written (it used to raise IndexError).
                if not cycle[0].startswith('0') and '0+' in cycle:
                    idx = cycle.index('0+')
                    cycle = cycle[idx:] + cycle[:idx]
                cycle_map[cycle_name] = {'cycle': cycle, 'copy_count': copy_count}
    return (segment_map, interval_map, cycle_map)
def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, file_content=None):
    """Build the object from a cycles file, from one walk, or from explicit lists.

    Exactly one of three modes is used:

    * ``file`` or ``file_content`` given: parse whitespace-separated records.
      ``Segment`` and ``Interval`` records carry id, chromosome, start and end
      in fields 1-4; a record whose first field contains ``Cycle=`` is split
      on ``;`` into cycle id, copy number and a comma-separated list of
      segment ids suffixed with ``+`` or another character (treated as -1).
    * ``cycle_list`` is None: ``segment_list`` is taken as a single walk.  Its
      distinct (chrom, start, end) triples become the sorted segment list,
      numbered from ``'1'``, and the walk becomes cycle ``'1'`` with copy
      number 1.
    * otherwise: ``segment_list`` and ``cycle_list`` are stored as given and
      indexed by ``info[0]`` and by the first tuple element respectively.

    Args:
        segment_list: interval list of segments (modes 2 and 3).
        cycle_list: iterable of ``(id, copy_number, [(segment_id, strand)])``.
        ilist: optional interval list used as-is in mode 3.
        file: path of a cycles file.
        file_content: text of a cycles file; when non-empty it takes
            precedence over ``file``.
    """
    if file is not None or file_content is not None:
        self.segment_list = hg.interval_list([])
        self.segment_dict = {}
        self.cycle_dict = {}
        self.ilist = hg.interval_list([])
        if file_content:
            text = file_content
        elif file is not None:
            # Close the handle deterministically; no str.decode() round trip,
            # which does not exist on Python 3 text strings.
            with open(file) as handle:
                text = handle.read()
        else:
            # Empty file_content and no file: nothing to parse.
            text = ''
        for raw in text.split('\n'):
            l = raw.strip().split()
            if not l:
                continue
            if 'Segment' == l[0]:
                s = hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]])
                self.segment_dict[l[1]] = s
                self.segment_list.append(s)
            elif 'Cycle=' in l[0]:
                ls = l[0].split(';')
                ci = ls[0].split('=')[1]
                cn = float(ls[1].split('=')[1])
                # The last character of each entry is the orientation.
                cl = [(s[:-1], 1) if s[-1] == '+' else (s[:-1], -1)
                      for s in ls[2].split('=')[1].split(',')]
                self.cycle_dict[ci] = (ci, cn, cl)
            elif 'Interval' == l[0]:
                self.ilist.append(hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]]))
    elif cycle_list is None:
        unique_coords = {(s.chrom, s.start, s.end) for s in segment_list}
        segment_set = hg.interval_list([hg.interval(c[0], c[1], c[2]) for c in unique_coords])
        segment_set.sort()
        self.segment_list = segment_set
        self.segment_dict = {}
        seg_id = {}
        # Segment ids are the 1-based positions in the sorted list.
        for idx, seg in enumerate(segment_set):
            self.segment_dict[str(idx + 1)] = seg
            seg_id[(seg.chrom, seg.start, seg.end)] = str(idx + 1)
        cl = [(seg_id[(s.chrom, s.start, s.end)], s.strand) for s in segment_list]
        for seg in self.segment_list:
            seg.info = [seg_id[(seg.chrom, seg.start, seg.end)]]
        self.cycle_dict = {'1': ('1', 1, cl)}
        self.ilist = hg.interval_list([c[0] for c in segment_set.merge_clusters(extend=1)])
        for ii, interval in enumerate(self.ilist):
            interval.info = [str(ii)]
    else:
        self.segment_list = segment_list
        self.segment_dict = {s.info[0]: s for s in segment_list}
        self.cycle_dict = {c[0]: c for c in cycle_list}
        if ilist is not None:
            self.ilist = ilist
        else:
            self.ilist = hg.interval_list([c[0] for c in segment_list.merge_clusters(extend=1)])
            for ii, interval in enumerate(self.ilist):
                interval.info = [str(ii)]
def load_bed(bed_file, value = None, log = False, sep='\t'):
    """Load a bed-like file into a sorted ``hg19.interval_list``.

    Args:
        bed_file: path of the file; columns are chromosome, start, end and,
            when ``value`` is None, a numeric fourth column.
        value: when not None, stored as ``info['value']`` of every interval
            instead of the fourth column.
        log: when true (and ``value`` is None) the fourth column is stored as
            ``10 ** float(column)``.
        sep: column separator.

    Returns:
        The sorted interval list.
    """
    bed_data = hg19.interval_list()
    # The context manager closes the file even when a row fails to parse.
    with open(bed_file) as infile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            res = line.split(sep)
            if value is None:
                row_value = 10**float(res[3]) if log else float(res[3])
            else:
                row_value = value
            bed_data.append(hg19.interval(res[0], int(res[1]), int(res[2]), info={'value': row_value}))
    bed_data.sort()
    return bed_data
def pivot(self, c1, si1, si2):
    """Create a new cycle from cycle ``c1`` with the stretch between two overlapping segments reversed.

    The segments at positions ``si1`` and ``si2`` of the cycle must overlap
    and have opposite orientation.  They are replaced by two segments spanning
    (seg1.start, seg2.end) and (seg2.start, seg1.end); existing segments with
    those coordinates are reused, otherwise new ones are registered.  The
    entries strictly between the two positions are reversed and flipped.  The
    new cycle receives the copy number of ``c1`` and ``c1`` is set to 0.0.

    Raises:
        Exception: when the segments do not overlap or share an orientation.
    """
    source_cycle = self.cycle_dict[c1]
    path = source_cycle[2]
    first_entry = path[si1]
    second_entry = path[si2]

    # check if segments overlap
    if not self.segment_dict[first_entry[0]].intersects(self.segment_dict[second_entry[0]]):
        raise Exception("Segments do not overlap")
    # check if segments have opposite orientation
    if first_entry[1] == second_entry[1]:
        raise Exception("Segments should be in opposite orientation")

    seg1 = self.segment_dict[first_entry[0]]
    seg2 = self.segment_dict[second_entry[0]]
    coords_a = (seg1.chrom, seg1.start, seg2.end)
    coords_b = (seg1.chrom, seg2.start, seg1.end)

    # Reuse already-registered segments with the wanted coordinates.
    found_a = False
    found_b = False
    for known in self.segment_list:
        known_coords = (known.chrom, known.start, known.end)
        if known_coords == coords_a:
            found_a = True
            ns1 = known.info[0]
            overlap1 = (ns1, first_entry[1])
        if known_coords == coords_b:
            found_b = True
            ns2 = known.info[0]
            overlap2 = (ns2, second_entry[1])

    if not found_a:
        ns1 = self.next_seg_id()
        overlap1 = (ns1, first_entry[1])
        self.segment_dict[ns1] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns1])
        self.segment_list.append(self.segment_dict[ns1])
    if not found_b:
        ns2 = self.next_seg_id()
        overlap2 = (ns2, second_entry[1])
        self.segment_dict[ns2] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns2])
        self.segment_list.append(self.segment_dict[ns2])

    if first_entry[1] == -1:
        # Swap the two replacement segments and flip both orientations.
        overlap1, overlap2, ns1, ns2 = (
            (overlap2[0], -1 * overlap2[1]),
            (overlap1[0], -1 * overlap1[1]),
            ns2,
            ns1,
        )

    head = path[:si1]
    reversed_middle = [(entry[0], -1 * entry[1]) for entry in path[si1 + 1:si2][::-1]]
    tail = path[si2 + 1:]
    mcycle = head + [overlap1] + reversed_middle + [overlap2] + tail

    mcycle_id = self.next_cycle_id()
    self.cycle_dict[mcycle_id] = (mcycle_id, source_cycle[1], mcycle)
    self.cycle_dict[c1] = (c1, 0.0, path)
    return
def parse_bed(segment_file):
    """Read a tab-separated bed file into an ``hg19.interval_list``.

    Args:
        segment_file: path of the file; the first three columns are
            chromosome, start and end.

    Returns:
        The interval list, in file order (it is not sorted here).
    """
    amplicons = hg19.interval_list()
    # The file used to be left open; the context manager closes it.
    with open(segment_file, 'r') as infile:
        for line in infile:
            res = line.strip().split('\t')
            if res == ['']:
                # Blank line: nothing to parse.
                continue
            amplicons.append(hg19.interval(res[0], int(res[1]), int(res[2])))
    return amplicons
def parse_cycles_file(cycles_file, addchr = False):
    """Read segments and source-free cycles from a cycles file.

    ``Segment`` lines (whitespace separated: tag, id, chromosome, start, end)
    become ``hg19.interval`` objects keyed by id.  Lines containing ``Cycle=``
    are split on ``;`` into ``key=value`` pairs; the ``Segments`` value is a
    comma-separated list of ids with a one-character strand suffix.  Any cycle
    that contains segment ``0`` is dropped.

    Args:
        cycles_file: path of the file to read.
        addchr: when true, prefix every segment chromosome with ``chr``.

    Returns:
        ``(cycles, segSeqD)``: cycle name -> list of ``(segment_id, strand)``
        and segment id -> interval.
    """
    cycles = {}
    segSeqD = {}
    with open(cycles_file) as infile:
        for line in infile:
            if line.startswith("Segment"):
                columns = line.rstrip().split()
                lower = int(columns[3])
                upper = int(columns[4])
                chrom = "chr%s" % columns[2] if addchr else columns[2]
                seg_name = columns[1]
                segSeqD[seg_name] = hg19.interval(chrom, lower, upper, info={'name': seg_name})
            elif "Cycle=" in line:
                record = {}
                for item in line.rstrip().rsplit(";"):
                    pieces = item.rsplit("=")
                    record[pieces[0]] = pieces[1]
                #TODO: Need to rotate the segs in case there's a 0 within the path, or else path is incorrect
                walk = []
                touches_source = False
                for token in record["Segments"].rsplit(","):
                    seg_name = token[:-1]
                    if seg_name == "0":
                        touches_source = True
                    else:
                        walk.append((seg_name, token[-1]))
                if not touches_source:
                    cycles[record["Cycle"]] = walk
    return cycles,segSeqD
def build_segments(self, bed_data = None):
    """Scatter-plot bed values at regular positions along every cycle segment.

    For each entry of the module-level ``cycle`` the bed intervals that hit
    the segment are sampled every ``self.point_spacing`` positions.  Each
    sample is clamped to [``self.ymin``, ``self.ymax``], scaled (log10 when
    ``self.is_log``) into [``self.track_rmin``, ``self.track_rmax``] and
    converted with ``pol2cart``.  Hits whose ``info`` has a ``'fill'`` key are
    drawn with the star marker, all others with hollow circles.

    Reads the module-level names ``total_length_with_spacing``, ``global_rot``,
    ``start_points``, ``lens``, ``segSeqD``, ``cycle`` and ``ax``.

    Args:
        bed_data: interval list to plot; defaults to ``self.bed_data``.
    """
    if bed_data is None:
        bed_data = self.bed_data

    points_x, points_y, colors = [], [], []
    fpoints_x, fpoints_y, fcolors = [], [], []

    previous_end = total_length_with_spacing*(global_rot/360.0)
    for ind, sp in enumerate(start_points):
        start_point = int(previous_end - sp)
        # Neither angle is read below; both are still evaluated as before.
        start_angle = start_point/total_length_with_spacing*360
        end_angle = (start_point - lens[ind])/total_length_with_spacing*360

        #segseqD referenced as global variable here because I'm lazy
        segment = segSeqD[cycle[ind][0]]
        strand = cycle[ind][1]
        hits = [hit[0] for hit in bed_data.intersection([segment])]
        if self.color_bed is not None:
            color_subhits = hg19.interval_list([hit[0] for hit in self.color_bed.intersection([segment])])

        for h in hits:
            for pos in xrange(h.start, h.end, self.point_spacing):
                # Only positions that fall inside the segment are drawn.
                if pos > segment.end or pos < segment.start:
                    continue

                # A colour-bed hit wins; otherwise the hit's own colour, then
                # the track default.
                from_color_bed = False
                if self.color_bed is not None:
                    probe = hg19.interval(h.chrom, pos, pos)
                    color_hits = color_subhits.intersection([probe],self.point_spacing)
                    if len(color_hits) != 0:
                        color = color_hits[0][0].info['color']
                        from_color_bed = True
                if not from_color_bed:
                    color = self.color if 'color' not in h.info else h.info['color']

                if strand == "+":
                    normStart = start_point - max(0,pos-segment.start)
                else:
                    normStart = start_point - max(0,segment.end - pos)

                # Clamp the value into [ymin, ymax].
                hvalue = h.info['value']
                if not hvalue > self.ymin:
                    hvalue = self.ymin
                if not hvalue < self.ymax:
                    hvalue = self.ymax

                y_scale_value = (1.*hvalue-self.ymin)/(self.ymax-self.ymin)
                if self.is_log:
                    y_scale_value = (math.log10(hvalue)-math.log10(self.ymin))/(math.log10(self.ymax)-math.log10(self.ymin))
                r_scale_value = y_scale_value*(self.track_rmax-self.track_rmin)+self.track_rmin
                x_s,y_s = pol2cart(r_scale_value,normStart/total_length_with_spacing*2*np.pi)

                if 'fill' in h.info:
                    fpoints_x.append(x_s)
                    fpoints_y.append(y_s)
                    fcolors.append(color)
                else:
                    points_x.append(x_s)
                    points_y.append(y_s)
                    colors.append(color)

    ax.scatter(points_x,points_y,marker='o',s=1,linewidths=0.01,facecolors='none',color=colors)
    ax.scatter(fpoints_x,fpoints_y,marker='*',s=1,linewidths=0.01,color=fcolors)
def kmer_homology(self, k=10, span=100):
    """Count the distinct k-mers shared by the windows around the two vertices.

    A window of ``span`` positions on each side of the vertex position is
    fetched for ``v1`` (on its own strand) and for ``v2`` (on the opposite of
    its strand); both windows are clamped to [1, chromosome length].

    Args:
        k: length of the k-mers that are compared.
        span: number of positions taken on each side of each vertex position.

    Returns:
        The number of distinct k-mers that occur in both windows.
    """
    def window_kmers(vertex, strand):
        window = hg.interval(
            vertex.chrom,
            max(1, vertex.pos - span),
            min(vertex.pos + span, hg.chrLen[hg.chrNum(vertex.chrom)]),
            strand)
        seq = ''.join([base.capitalize() for base in window.sequence()])
        # Slice with k: the slices used to be hard-coded to 10 characters,
        # which gave wrong-length substrings for any non-default k.
        return set(seq[i:i + k] for i in range(len(seq) - k + 1))

    kmers1 = window_kmers(self.v1, self.v1.strand)
    kmers2 = window_kmers(self.v2, -1 * self.v2.strand)
    return len(kmers1.intersection(kmers2))
def build_genebed_from_fpkm(fpkm):
    """Turn a gene-id -> value mapping into a sorted interval list.

    Gene ids that are missing from the module-level ``ensembl_grc37_map`` are
    ignored.  Every other gene contributes one interval with the gene's
    coordinates and ``info = {'value': <value>, 'name': gene.info['Name']}``.

    Args:
        fpkm: mapping of gene id to value.

    Returns:
        The sorted ``hg19.interval_list``.
    """
    fpkm_bed = hg19.interval_list()
    for gene_id, level in fpkm.items():
        if gene_id in ensembl_grc37_map:
            gene = ensembl_grc37_map[gene_id]
            annotation = {'value': level, 'name': gene.info['Name']}
            fpkm_bed.append(hg19.interval(gene.chrom, gene.start, gene.end, info=annotation))
    fpkm_bed.sort()
    return fpkm_bed
def load_ensembl_grc37(gtf_file='/pedigree2/projects/namphuon/data/references/hg19/annotations/Homo_sapiens.GRCh37.64.gtf'):
    """Load a GTF annotation and collapse its records into one interval per gene.

    Every tab-separated record becomes an interval on ``chr<column 0>`` from
    column 3 to column 4; its last column is parsed into a dict of
    ``key value`` attribute pairs.  Records are grouped by ``gene_id`` and
    each group is collapsed to the span from its smallest start to its
    largest end.

    Args:
        gtf_file: path of the GTF file; defaults to the path that was
            previously hard-coded.

    Returns:
        ``(ensembl_grc37, ensembl_grc37_map)``: the sorted list of gene
        intervals and a dict from gene id to its gene interval.  Each gene
        interval carries ``info = {'intervals': [...], 'GeneID': id,
        'Name': name}``, where name is the first ``gene_name`` found in the
        group, or the gene id when there is none.

    Raises:
        KeyError: when a record has no ``gene_id`` attribute.
    """
    ensembl_grc37_map = {}
    with open(gtf_file, 'r') as infile:
        for line in infile:
            # Header comments and blank lines carry no record.
            if not line.strip() or line.startswith('#'):
                continue
            res = line.split('\t')
            info = dict([r.strip().replace('"', '').split(' ')
                         for r in res[-1].strip().split('; ')
                         if len(r.split(' ')) == 2])
            record = hg19.interval("chr%s" % res[0], int(res[3]), int(res[4]), info={'data': info})
            ensembl_grc37_map.setdefault(info['gene_id'], []).append(record)

    ensembl_grc37 = hg19.interval_list()
    for gene_id, members in ensembl_grc37_map.items():
        start = min([m.start for m in members])
        end = max([m.end for m in members])
        names = [m.info['data']['gene_name'] for m in members if 'gene_name' in m.info['data']]
        name = names[0] if len(names) >= 1 else gene_id
        # The chromosome used to come from a comprehension variable that leaked
        # into the function scope (Python 2 only); that was the last record of
        # the group, which is read explicitly here.
        chrom = members[-1].chrom
        ensembl_grc37.append(hg19.interval("%s" % chrom, start, end,
                                           info={'intervals': members, 'GeneID': gene_id, 'Name': name}))
    ensembl_grc37.sort()
    # Re-point the map from the per-gene record lists to the gene intervals.
    for gene in ensembl_grc37:
        ensembl_grc37_map[gene.info['GeneID']] = gene
    return (ensembl_grc37, ensembl_grc37_map)
def sequence(self, flank_size=-1):
    """Return the sequence of this edge, optionally with flanking sequence.

    For a ``'sequence'`` edge the result is the interval from ``v1.pos`` to
    ``v2.pos`` on ``v1.chrom``; a positive ``flank_size`` adds that many
    positions before ``v1`` and after ``v2``.

    For every other edge type the core is ``self.hom_seq`` (or 20 ``N``
    characters when ``self.hom`` is None).  ``flank_size == -1`` is replaced
    by 1000; a positive flank is then fetched on each side of the core,
    offset by ``self.hom`` when that is positive, with side and strand chosen
    from the edge type and the strands of ``v1`` and ``v2``.

    Args:
        flank_size: number of flanking positions; -1 selects the default
            (1000 for non-sequence edges, no flank for sequence edges).

    Returns:
        The assembled sequence.
    """
    if self.edge_type == 'sequence':
        core = hg.interval(self.v1.chrom, self.v1.pos, self.v2.pos).sequence()
        if flank_size > 0:
            lead = hg.interval(self.v1.chrom, self.v1.pos - flank_size + 1, self.v1.pos).sequence()
            trail = hg.interval(self.v2.chrom, self.v2.pos, self.v2.pos + flank_size - 1).sequence()
            core = lead + core + trail
        return core

    core = 'N' * 20 if self.hom is None else self.hom_seq
    if flank_size == -1:
        flank_size = 1000
    if not flank_size > 0:
        return core

    hom = self.hom if (self.hom is not None and self.hom > 0) else 0

    def before(vertex, **options):
        # flank_size positions ending hom positions before the vertex
        return hg.interval(vertex.chrom, vertex.pos - hom - flank_size + 1,
                           vertex.pos - hom, **options).sequence()

    def after(vertex, **options):
        # flank_size positions starting hom positions after the vertex
        return hg.interval(vertex.chrom, vertex.pos + hom,
                           vertex.pos + hom + flank_size - 1, **options).sequence()

    if self.edge_type == 'source':
        # Only one side is flanked for a source edge.
        if self.v2.strand == -1:
            right_seq = after(self.v2)
            left_seq = ''
        else:
            left_seq = before(self.v2)
            right_seq = ''
    elif self.v1.strand == 1:
        left_seq = before(self.v1)
        if self.v2.strand == -1:
            right_seq = after(self.v2)
        else:
            right_seq = before(self.v2, strand=-1)
    else:
        right_seq = after(self.v1)
        if self.v2.strand == -1:
            left_seq = after(self.v2, strand=-1)
        else:
            left_seq = before(self.v2)
    return left_seq + core + right_seq
else: samp_name = args.sname.rsplit("/")[-1] fname = samp_name bed_feat_dict = {} if args.bed_files: for i,j in zip(args.bed_files,args.feature_labels): print j,i #feature name -> chromosome -> ordered list of positions bed_list = parse_bed_file(i) bed_feat_dict[j] = feat_bed_to_lookup(bed_list) outer_bar = max(bed_track_height*(len(bed_feat_dict)+2),10) bed_data = hg19.interval_list([hg19.interval('chr8', 127638302, 127938302, info={'value':int(random.random()*100)}), hg19.interval('chr8', 128716346,128746346, info={'value':int(random.random()*100)})]) bed_data.sort() args.prefix_name = '/pedigree2/projects/namphuon/programs/CycleViz/COLO320DM' args.cycles_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/onco_amplicon1_cycles.txt' args.fpkm_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/colo320dm.fpkm.csv' args.wgs_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/colo320dm.wgs.1000.pileup.log.bed' cycles_numbers = ['6', '9', '10', '12', '13', '14', '15', '16','19'] args.atac_peak_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/ATAC-seq/SRC1655_summits_250ext_q1e6_nochrM_merged.bed' args.atac_file = '/pedigree2/projects/namphuon/results/paul_gbm39/ATAC/COLO320DM.atac.1000.pileup.log.bed' args.prefix_name = '/pedigree2/projects/namphuon/programs/CycleViz/PC3' args.cycles_file = '/nucleus/pedigree/projects/extrachromosome/data/turner2017/reconstruction/run14/FF-77_amplicon4_cycles.txt' args.fpkm_file = '/pedigree2/projects/namphuon/results/paul_gbm39/rnaseq/PC3.fpkm.csv' args.wgs_file = '/pedigree2/projects/namphuon/results/paul_gbm39/PC3/PC3.wgs.1000.pileup.log.bed'
a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr) ])) or a.size() - sum([ a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr) ]) > 2000000): if (len(hg.interval_list([a]).intersection(cr))) == 0: uc_list.append(a) else: cra = hg.interval_list([a]).intersection(cr) cpos = a.start for crai in cra: if cpos < crai[1].start - 1000000: uc_list.append( hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info)) cpos = crai[1].end + 1000000 if a.end > cpos: uc_list.append(hg.interval(a.chrom, cpos, a.end, info=a.info)) uc_list = hg.interval_list([ a for a in uc_list if float(a.info[-1]) * a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5 ]) uc_merge = uc_list.merge_clusters(extend=300000) with open(outname, "w") as outfile: for a in uc_merge: if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN: outfile.write('\t'.join([
from collections import defaultdict
import pysam
import hg19util as hg

# Script: walk every alignment of a merged BAM and derive reference and query
# coordinates for it.
f = pysam.AlignmentFile("/pedigree2/projects/namphuon/data/SCC090/pacbio/merged.bam")
segs = defaultdict(lambda: [], {})
readlen = {}
refi = hg.interval_list([hg.interval(i) for i in f.references])
segi = 1
qi = 0  # next index to hand out to a query name not seen before
qindex = {}  # query name -> index
qlist = []  # query names in order of first appearance
for l in f.fetch():
    # The reference name is split as "<name>:<start>-<end>"; the alignment
    # coordinates are shifted by <start>.
    # NOTE(review): presumably the BAM was aligned against extracted regions - confirm.
    ref = l.reference_name.split(':')[0]
    ref_start = int(l.reference_name.split(':')[1].split('-')[0]) + l.reference_start
    ref_end = int(l.reference_name.split(':')[1].split('-')[0]) + l.reference_end
    qstart = l.query_alignment_start
    qend = l.query_alignment_end
    if l.query_name not in qindex:
        qindex[l.query_name] = qi
        qlist.append(l.query_name)
        qi += 1
    if l.is_reverse:
        # For reverse alignments the query coordinates are mirrored against
        # the inferred query length.
        qstart = l.infer_query_length() - l.query_alignment_end
        qend = l.infer_query_length() - l.query_alignment_start
for a in bamFile: vlist[(a.qname, a.is_read1)].append(a) if a.tid == -1 or bamFile.getrname(a.tid) not in hg19refs: if a.tid == -1: continue vreads[bamFile.getrname(a.tid)].add((a.qname, a.is_read1)) continue if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid ) and clean_genomic_cluster(clist): clusterList.append(clist) if caln is None or a.pos > caln.pos + 300 or caln.tid != a.tid: clist = [] caln = a # if hg.interval(a, bamfile=bamFile).num_unmasked() >= 35: if hg.interval(a, bamfile=bamFile).rep_content() <= 3 and a.mapq >= 10: clist.append(a) if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid) and clean_genomic_cluster(clist): clusterList.append(clist) # clusterList.sort(key=cmp_to_key(lambda x, y: hg.interval(bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length()) > hg.interval(bamFile.getrname(y[0].tid), y[0].pos, y[-1].pos + y[-1].infer_query_length()))) clusterList.sort( key=lambda x: hg.interval(bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length()), reverse=True) vsuper = { v: set([ v2 for v2 in vreads if v2 != v and len(vreads[v]) < len(vreads[v2]) and vreads[v].issubset(vreads[v2])
a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr) ])) or a.size() - sum([ a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr) ]) > 2000000): if (len(hg.interval_list([a]).intersection(cr))) == 0: uc_list.append(a) else: cra = hg.interval_list([a]).intersection(cr) cpos = a.start for crai in cra: if cpos < crai[1].start - 1000000: uc_list.append( hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info)) cpos = crai[1].end + 1000000 if a.end > cpos: uc_list.append( hg.interval(a.chrom, cpos, a.end, info=a.info)) uc_list = hg.interval_list([ a for a in uc_list if float(a.info[1]) * a.segdup_uniqueness() > 5.0 and a.rep_content() < 2.5 ]) uc_merge = uc_list.merge_clusters(extend=300000) all_uc = hg.interval_list([ a[0] for a in uc_merge if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN ])
for a in bamFile: vlist[(a.qname, a.is_read1)].append(a) if a.tid == -1 or bamFile.getrname(a.tid) not in hg19refs: if a.tid == -1: continue vreads[bamFile.getrname(a.tid)].add((a.qname, a.is_read1)) continue if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid ) and clean_genomic_cluster(clist): clusterList.append(clist) if caln is None or a.pos > caln.pos + 300 or caln.tid != a.tid: clist = [] caln = a # if hg.interval(a, bamfile=bamFile).num_unmasked() >= 35: if hg.interval(a, bamfile=bamFile).rep_content() <= 3 and a.mapq >= 10: clist.append(a) if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid) and clean_genomic_cluster(clist): clusterList.append(clist) clusterList.sort(lambda x, y: hg.interval( bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length( )) > hg.interval(bamFile.getrname(y[0].tid), y[0].pos, y[-1].pos + y[-1]. infer_query_length())) vsuper = { v: Set([ v2 for v2 in vreads if v2 != v and len(vreads[v]) < len(vreads[v2]) and vreads[v].issubset(vreads[v2]) ])
# Command line: a single --cycles FILE option (nargs=1, so the parsed value is
# a one-element list).
parser = argparse.\
    ArgumentParser(description="Cycles File")
parser.add_argument('--cycles', dest='cycles_file',
                    help="File listing cycles in amplicon", metavar='FILE',
                    action='store', type=str, nargs=1)
args = parser.parse_args()
cycles_file = args.cycles_file[0]
# Whitespace-split every non-blank line of the cycles file.
ll = [l.strip().split() for l in open(cycles_file) if len(l.strip()) > 0]
# 'Segment' records: integer id in field 1, chromosome, start, end in fields 2-4.
segments = hg.interval_list([
    hg.interval(l[2], int(l[3]), int(l[4]), info=[int(l[1])]) for l in ll
    if l[0] == 'Segment'
])
# Tag every segment by chromosome-name prefix: 'chr...' is labelled 'Human',
# anything else 'Viral'.
for s in segments:
    if s.chrom[:3] == 'chr':
        s.info.append('Human')
    else:
        s.info.append('Viral')
segments.sort()
segment_id_dict = {s.info[0]: s for s in segments}
cycles = []
# 'Cycle=' records are ';'-separated key=value pairs.
for c in [l[0].split(';') for l in ll if 'Cycle=' in l[0]]:
    c_dict = {cc.split('=')[0]: cc.split('=')[1] for cc in c}
    new_dict = {}
    new_dict['Cycle'] = int(c_dict['Cycle'])
def merge(self, c1, c2, si1, si2):
    """Create a new cycle that splices cycle ``c2`` into cycle ``c1`` at two overlapping segments.

    The segment at ``si1`` of ``c1`` and the segment at ``si2`` of ``c2`` must
    overlap.  They are replaced by two segments spanning (seg1.start,
    seg2.end) and (seg2.start, seg1.end), reused from ``self.segment_list``
    when present and registered otherwise.  The rest of ``c2`` is inserted
    between them, reversed and flipped when the two orientations differ.  The
    merged cycle receives the smaller of the two copy numbers, which is
    subtracted from both inputs.

    If ``c2`` contains the source vertex ``'0'`` the roles of the two cycles
    are exchanged first.

    Raises:
        Exception: when both cycles contain the source vertex, when the
            source segment would be used, when the segments do not overlap,
            or when either copy number is 0.
    """
    cycle1 = self.cycle_dict[c1]
    cycle2 = self.cycle_dict[c2]

    # check if atmost 1 cycle has source vertex
    cycle2_has_source = '0' in [entry[0] for entry in cycle2[2]]
    if '0' in [entry[0] for entry in cycle1[2]] and cycle2_has_source:
        raise Exception("Cannot merge 2 cycles with source vertices")
    # if cycle2 has source vertex, exchange c1,c2
    if cycle2_has_source:
        c1, c2, si1, si2, cycle1, cycle2 = c2, c1, si2, si1, cycle2, cycle1
        # NOTE(review): this guard is read as nested under the exchange; when
        # cycle1 holds the source without an exchange no such check runs - confirm.
        if si1 == 0 or si1 == len(cycle1[2]) - 1:
            raise Exception("Cannot use source segment for merging")

    seg1 = self.segment_dict[cycle1[2][si1][0]]
    seg2 = self.segment_dict[cycle2[2][si2][0]]
    # check if segments overlap
    if not seg1.intersects(seg2):
        raise Exception("Segments do not overlap" + str(self.segment_dict[cycle1[2][si1][0]]) + " " +
                        str(self.segment_dict[cycle2[2][si2][0]]))

    # cnlist: (merged cn, cycle1cn, cycle2cn)
    if cycle1[1] == 0 or cycle2[1] == 0:
        raise Exception("Cycle copy numbers should be > 0 to merge")
    if cycle1[1] > cycle2[1]:
        cnlist = (cycle2[1], cycle1[1] - cycle2[1], 0.0)
    else:
        cnlist = (cycle1[1], 0.0, cycle2[1] - cycle1[1])

    strand1 = cycle1[2][si1][1]
    outer = (seg1.chrom, seg1.start, seg2.end)
    inner = (seg1.chrom, seg2.start, seg1.end)
    # Which coordinates play the first/second replacement depends on the
    # orientation of the cycle1 segment.
    if strand1 == 1:
        first_lookup, second_lookup = outer, inner
    elif strand1 == -1:
        first_lookup, second_lookup = inner, outer
    else:
        first_lookup, second_lookup = None, None

    seg1_found = False
    seg2_found = False
    for known in self.segment_list:
        known_coords = (known.chrom, known.start, known.end)
        if known_coords == first_lookup:
            seg1_found = True
            ns1 = known.info[0]
            overlap1 = (ns1, strand1)
        if known_coords == second_lookup:
            seg2_found = True
            ns2 = known.info[0]
            overlap2 = (ns2, strand1)

    if not seg1_found:
        ns1 = self.next_seg_id()
        overlap1 = (ns1, strand1)
        new_coords = outer if strand1 == 1 else inner
        self.segment_dict[ns1] = hg.interval(new_coords[0], new_coords[1], new_coords[2], info=[ns1])
        self.segment_list.append(self.segment_dict[ns1])
    if not seg2_found:
        ns2 = self.next_seg_id()
        overlap2 = (ns2, strand1)
        new_coords = inner if strand1 == 1 else outer
        self.segment_dict[ns2] = hg.interval(new_coords[0], new_coords[1], new_coords[2], info=[ns2])
        self.segment_list.append(self.segment_dict[ns2])

    # NOTE(review): the orientation is compared against 1 and -1 above, for
    # which this condition is false - the exchange looks unreachable; confirm.
    if not strand1:
        overlap1, overlap2, ns1, ns2 = overlap2, overlap1, ns2, ns1

    if strand1 == cycle2[2][si2][1]:
        # Same orientation: continue through cycle2 after its overlap segment.
        cycle2_span = cycle2[2][si2 + 1:] + cycle2[2][:si2]
    else:
        # Opposite orientation: walk cycle2 backwards with flipped strands.
        backwards = cycle2[2][:si2][::-1] + cycle2[2][si2 + 1:][::-1]
        cycle2_span = [(entry[0], -1 * entry[1]) for entry in backwards]

    mcycle = cycle1[2][:si1] + [overlap1] + cycle2_span + [overlap2] + cycle1[2][si1 + 1:]
    mcycle_id = self.next_cycle_id()
    self.cycle_dict[mcycle_id] = (mcycle_id, cnlist[0], mcycle)
    self.cycle_dict[c1] = (c1, cnlist[1], cycle1[2])
    self.cycle_dict[c2] = (c2, cnlist[2], cycle2[2])
    return
if a.tid == -1: continue vreads[bamFile.getrname(a.tid)].add((a.qname, a.is_read1)) continue if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid) and clean_genomic_cluster(clist): clusterList.append(clist) if caln is None or a.pos > caln.pos + 300 or caln.tid != a.tid: clist = [] caln = a # if hg.interval(a, bamfile=bamFile).num_unmasked() >= 35: # if hg.interval(a, bamfile=bamFile).rep_content() <= 3 and a.mapq >= 10: clist.append(a) if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid) and clean_genomic_cluster(clist): clusterList.append(clist) clusterList.sort(lambda x, y: hg.interval(bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length()) > hg.interval(bamFile.getrname(y[0].tid), y[0].pos, y[-1].pos + y[-1].infer_query_length())) vsuper = {v: Set([v2 for v2 in vreads if v2 != v and len(vreads[v]) < len(vreads[v2]) and vreads[v].issubset(vreads[v2])]) for v in vreads} vequaldict = {} vequal = [] for v in vreads: inserted = False for vset in vequal: if vreads[v] == vreads[vset[0]]: vset[1].add vequaldict[v] = vset inserted = True break if not inserted: vset = (v, Set([v])) vequal.append(vset)
segments = [] # segments=hg.interval_list(rdAlts.replace('.bed', '_segments.bed'), 'bed') # bandsfile="karyotype.HK359.EGFR.txt" # segments = [(l[2], hg.interval(l[1], int(l[4]), int(l[5])).intersection(i), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr'] if hg.interval(l[1], int(l[4]), int(l[5])).intersects(i)] # segments = [('', hg.interval(l[1], int(l[4]), int(l[5])), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr']] if args.extendmode == 'VIRAL': logging.info("#TIME " + '%.3f\t' % (clock() - TSTART) + "Finding integration sites: " + str(rdList[0])) de = bamFileb2b.interval_discordant_edges(rdList) old_stdout = sys.stdout sys.stdout = mystdout = StringIO() amplist = bamFileb2b.interval_hops(rdList, explore=False) alist = hg.interval_list( [hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos) for e in de] + [hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos) for e in de] + rdList) alist.sort() rdList = hg.interval_list([ i[0] for i in alist.merge_clusters(extend=5000000) if len( hg.interval_list([i[0]]).intersection(amplist) + hg.interval_list([i[0]]).intersection(rdList)) > 0 ]) rdList = hg.interval_list([ hg.interval(i.chrom, max(0, i.start - 10000), min(i.end + 10000, hg.chrLen[hg.chrNum(i.chrom)])) for i in rdList ]) iout = open(outName + '.integration_search.out', 'w') iout.write(mystdout.getvalue())
def draw_episome(self, input_files, output_file=None, auto_scale=0):
    """Lay out and draw the cycles of one or more decompositions.

    A first pass over ``input_files`` measures how much vertical room each
    decomposition needs; a second pass converts segment coordinates and draws
    the cycles of each file below the previous one.

    Args:
        input_files: sequence of pairs; element 0 is used as the label and
            element 1 is passed to ``self.readDataFile``.
        output_file: not read in the code visible here.
        auto_scale: divides the spacing between decompositions
            (``90 / (1 + auto_scale)``) and is forwarded to
            ``self.drawSections``.
    """
    cycles_section_top = 30
    cycles_section_size = 0
    space_between_decompositions = 90 / (1 + auto_scale)
    bottoms = []
    # Pass 1: accumulate the height of every decomposition.
    for i in range(len(input_files)):
        if i != 0:
            cycles_section_size += space_between_decompositions
        input_content = input_files[i][1]
        intervals, segments, seg_name_to_index_map, cycles, directions, chr_offs, copy_counts, cycles_names = self.readDataFile(
            input_content)
        # 10 units per cycle element, plus 10 more for every cycle whose last
        # element is not 0.
        number_of_element = sum(len(x) for x in cycles)
        cycles_section_size += number_of_element * 10
        for cycle in cycles:
            if cycle[-1] != 0:
                cycles_section_size += 10
        bottoms.append(cycles_section_top + cycles_section_size)
    # Each decomposition starts one spacing below the bottom of the previous one.
    tops = [cycles_section_top] + [
        bottom + space_between_decompositions for bottom in bottoms[:-1]
    ]
    cycles_section_bottom = cycles_section_top + cycles_section_size
    # print ('cycle section_top:', cycles_section_top)
    # print ('cycle section_bottom:', cycles_section_bottom)
    # Pass 2: draw every decomposition.
    for i in range(len(input_files)):
        input_content = input_files[i][1]
        self.file_names.append(
            Text('%s: %s' % (str(i + 1), input_files[i][0]), 0.5, tops[i] - 27))
        # print ('bottom:', bottoms[i])
        # print ('top:', tops[i])
        intervals, segments, seg_name_to_index_map, cycles, directions, chr_offs, copy_counts, cycles_names = self.readDataFile(
            input_content)
        if i == 0:
            # Only the first file's cycle names and segment indices are recorded.
            self.reconstructed_cycles = [cname for cname in cycles_names]
            self.reconstructed_segments = [
                segment_count for segment_count in range(len(segments))
            ]
        self.compute_chr_offsets(chr_offs)
        if len(intervals) == 0:
            # No intervals in the file: derive them from the segments.
            intervals = self.compute_intervals(segments)
        ilist = hg.interval_list([
            hg.interval(chr_name, start_point, end_point)
            for chr_name, start_point, end_point in intervals
        ])
        maxIntvl = self.findMaxIntervals(segments)
        sortedL = {}
        compact = {}
        span = {}
        # Per key of maxIntvl: collect segment end points, merge them and
        # compact the result.
        for ch in maxIntvl.keys():
            sortedL[ch] = self.makeListOfSegmentEndPoints(segments, ch)
            sortedCopy = list(sortedL[ch])
            mergeL = self.mergeIntervals(sortedCopy)
            compact[ch], span[ch] = self.compactIntervals(
                mergeL, maxIntvl[ch])
        newsegs = self.convertSegmentCoordinates(segments, span, compact,
                                                 maxIntvl, ilist)
        if i == 0:
            # The sections are drawn once, spanning all decompositions.
            self.drawSections(intervals, ilist, cycles_section_top,
                              cycles_section_bottom, auto_scale)
        # self.drawAxesLabels(sortedL, compact, maxIntvl, span, chr_offs, bottoms[i])
        # self.drawAxes(span, compact, maxIntvl, chr_offs, bottoms[i])
        # self.drawAxesDottedLines(sortedL, compact, maxIntvl, span, chr_offs, tops[i], bottoms[i])
        self.drawCycles(newsegs, seg_name_to_index_map, cycles, directions,
                        tops[i], chr_offs, i, copy_counts, cycles_names)