def kmer_homology(self, k=10, span=100):
     """Number of shared k-mers within "span" distance on either side of vertex positions"""
     seq1 = ''.join([a.capitalize() for a in hg.interval(self.v1.chrom, max(1,self.v1.pos - span), min(self.v1.pos + span, hg.chrLen[hg.chrNum(self.v1.chrom)]), self.v1.strand).sequence()])
     seq2 = ''.join([a.capitalize() for a in hg.interval(self.v2.chrom, max(1,self.v2.pos - span), min(self.v2.pos + span, hg.chrLen[hg.chrNum(self.v2.chrom)]), -1 * self.v2.strand).sequence()])
     kset1 = Set([seq1[i:i + k] for i in range(len(seq1) - k + 1)])
     kset2 = Set([seq2[i:i + k] for i in range(len(seq2) - k + 1)])
     return len(kset1.intersection(kset2))
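
# A minimal, dependency-free sketch of the same shared k-mer count on two plain
# strings (hypothetical sequences; hg.interval and the vertex objects above come
# from hg19util / AmpliconArchitect and are not redefined here):
def shared_kmers(seq1, seq2, k=10):
    """Count distinct k-mers present in both sequences."""
    kset1 = set(seq1[i:i + k] for i in range(len(seq1) - k + 1))
    kset2 = set(seq2[i:i + k] for i in range(len(seq2) - k + 1))
    return len(kset1 & kset2)

# Example: two 15-mers sharing their first 12 bp share 12 - 10 + 1 = 3 ten-mers.
# shared_kmers("ACGTACGTACGTAAA", "ACGTACGTACGTTTT", k=10) -> 3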
Example 2
def read_cycle_file(cycle_file):
    input = open(cycle_file)
    segment_map = {}
    interval_map = {}
    cycle_map = {}
    for line in input:
        res = line.strip().split('\t')
        if res[0] == 'Interval':
            interval_map[res[1]] = hg19.interval(
                res[2] if res[2].find('chr') != -1 else "chr%s" % res[2],
                int(res[3]),
                int(res[4]),
                info={'line': res})
        if res[0] == 'Segment':
            segment_map[res[1]] = hg19.interval(
                res[2] if res[2].find('chr') != -1 else "chr%s" % res[2],
                int(res[3]),
                int(res[4]),
                info={'line': res})
        if res[0].find('Cycle') != -1:
            segments = res[0].split(';')
            cycle_name = segments[0].split('=')[-1]
            copy_count = float(segments[1].split('=')[-1])
            cycle = segments[2].split('=')[-1].split(',')
            #Sometimes a source/sink cycle will not start/end with 0, but have the 0 within the cycle
            #Rotate the result such that 0s always start/end the cycle in those cases
            if cycle[0][0] != '0' and len([c for c in cycle if c[0] == '0']) > 0:
                idx = cycle.index([c for c in cycle if c[0] == '0'][0])
                cycle = cycle[idx:] + cycle[:idx]
            cycle_map[cycle_name] = {'cycle': cycle, 'copy_count': copy_count}
    input.close()
    return (segment_map, interval_map, cycle_map)
Example 3
    def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, file_content=None):
        if file is not None or file_content is not None:
            self.segment_list = hg.interval_list([])
            self.segment_dict = {}
            self.cycle_dict = {}
            self.ilist = hg.interval_list([])

            if file_content:
                lines = file_content.split('\n')
            else:
                lines = str(open(file).read().decode()).split('\n')
            ll = [l.strip().split() for l in lines if len(l.strip()) > 0]
            for l in ll:
                if 'Segment' == l[0]:
                    s = hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]])
                    self.segment_dict[l[1]] = s
                    self.segment_list.append(s)
                elif 'Cycle=' in l[0]:
                    ls = l[0].split(';')
                    ci = ls[0].split('=')[1]
                    cn = float(ls[1].split('=')[1])
                    cl = []
                    for s in ls[2].split('=')[1].split(','):
                        if s[-1] == '+':
                            cl.append((s[:-1], 1))
                        else:
                            cl.append((s[:-1], -1))
                    self.cycle_dict[ci] = (ci, cn, cl)
                elif 'Interval' == l[0]:
                    self.ilist.append(hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]]))
        elif cycle_list is None:
            segment_set = hg.interval_list([hg.interval(ss[0], ss[1], ss[2]) for ss in {(s.chrom, s.start, s.end) for s in segment_list}])
            segment_set.sort()
            self.segment_list = segment_set
            self.segment_dict = {}
            seg_id = {}
            cl = []
            for s in enumerate(segment_set):
                self.segment_dict[str(s[0] + 1)] = s[1]
                seg_id[(s[1].chrom, s[1].start, s[1].end)] = str(s[0] + 1)
            for s in segment_list:
                cl.append((seg_id[(s.chrom, s.start, s.end)], s.strand))
            for ii in range(len(self.segment_list)):
                s = self.segment_list[ii]
                s.info = [seg_id[(s.chrom, s.start, s.end)]]
            self.cycle_dict = {'1':('1', 1, cl)}
            self.ilist = hg.interval_list([s[0] for s in segment_set.merge_clusters(extend=1)])
            for ii in range(len(self.ilist)):
                self.ilist[ii].info = [str(ii)]
        else:
            self.segment_list = segment_list
            self.segment_dict = {s.info[0]: s for s in segment_list}
            self.cycle_dict = {c[0]:c for c in cycle_list}
            if ilist is not None:
                self.ilist = ilist
            else:
                self.ilist = hg.interval_list([s[0] for s in segment_list.merge_clusters(extend=1)])
                for ii in range(len(self.ilist)):
                    self.ilist[ii].info = [str(ii)]
Example 4
 def load_bed(bed_file, value = None, log = False, sep='\t'):
   bed_data = hg19.interval_list()
   for line in open(bed_file):
     res = line.split(sep)
     if value is None:
       bed_data.append(hg19.interval(res[0], int(res[1]), int(res[2]), info={'value':float(res[3]) if not log else 10**float(res[3])}))
     else:
       bed_data.append(hg19.interval(res[0], int(res[1]), int(res[2]), info={'value':value}))
   bed_data.sort()
   return bed_data
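
# The value handling above, restated as a tiny pure function: when no constant
# `value` is supplied, the 4th BED column is used, optionally converted back
# from log10 (a sketch for illustration, not part of the original module).
def bed_value(fields, value=None, log=False):
    if value is not None:
        return value
    return 10 ** float(fields[3]) if log else float(fields[3])

# bed_value(['chr8', '100', '200', '2.0'], log=True) -> 100.0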
Example 5
 def pivot(self, c1, si1, si2):
     cycle1 = self.cycle_dict[c1]
     # check if segments overlap
     if not self.segment_dict[cycle1[2][si1][0]].intersects(
             self.segment_dict[cycle1[2][si2][0]]):
         raise Exception("Segments do not overlap")
     # check if segments have opposite orientation
     if cycle1[2][si1][1] == cycle1[2][si2][1]:
         raise Exception("Segments should be in opposite orientation")
     seg1 = self.segment_dict[cycle1[2][si1][0]]
     seg2 = self.segment_dict[cycle1[2][si2][0]]
     seg1_found = False
     seg2_found = False
     for i in self.segment_list:
         if (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end):
             seg1_found = True
             ns1 = i.info[0]
             overlap1 = (ns1, cycle1[2][si1][1])
         if (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end):
             seg2_found = True
             ns2 = i.info[0]
             overlap2 = (ns2, cycle1[2][si2][1])
     if not seg1_found:
         ns1 = self.next_seg_id()
         overlap1 = (ns1, cycle1[2][si1][1])
         self.segment_dict[ns1] = hg.interval(seg1.chrom,
                                              seg1.start,
                                              seg2.end,
                                              info=[ns1])
         self.segment_list.append(self.segment_dict[ns1])
     if not seg2_found:
         ns2 = self.next_seg_id()
         overlap2 = (ns2, cycle1[2][si2][1])
         self.segment_dict[ns2] = hg.interval(seg1.chrom,
                                              seg2.start,
                                              seg1.end,
                                              info=[ns2])
         self.segment_list.append(self.segment_dict[ns2])
     cycle1_init = cycle1[2][:si1]
     if cycle1[2][si1][1] == -1:
         (overlap1, overlap2, ns1, ns2) = ((overlap2[0], -1 * overlap2[1]),
                                           (overlap1[0],
                                            -1 * overlap1[1]), ns2, ns1)
     cycle1_span = [(s[0], -1 * s[1]) for s in cycle1[2][si1 + 1:si2][::-1]]
     cycle1_final = cycle1[2][si2 + 1:]
      mcycle = cycle1_init + [overlap1] + cycle1_span + [overlap2] + cycle1_final
     mcycle_id = self.next_cycle_id()
     self.cycle_dict[mcycle_id] = (mcycle_id, cycle1[1], mcycle)
     self.cycle_dict[c1] = (c1, 0.0, cycle1[2])
     return
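
# The core list surgery in pivot(): everything strictly between the two
# overlapping entries is walked backwards with its orientation flipped, then
# stitched in between the two overlap segments. A toy sketch on plain
# (segment_id, orientation) tuples, independent of the classes above:
def pivot_path(path, si1, si2, overlap1, overlap2):
    reversed_span = [(seg, -orient) for seg, orient in path[si1 + 1:si2][::-1]]
    return path[:si1] + [overlap1] + reversed_span + [overlap2] + path[si2 + 1:]

# pivot_path([('1', 1), ('2', 1), ('3', 1), ('4', -1)], 1, 3, ('5', 1), ('6', -1))
# -> [('1', 1), ('5', 1), ('3', -1), ('6', -1)]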
Example 6
def parse_bed(segment_file):
    input = open(segment_file, 'r')
    amplicons = hg19.interval_list()
    for line in input:
        res = line.strip().split('\t')
        amplicons.append(hg19.interval(res[0], int(res[1]), int(res[2])))
    return amplicons
Example 7
def parse_cycles_file(cycles_file, addchr = False):
  cycles = {}
  segSeqD = {}
  with open(cycles_file) as infile:
    for line in infile:
      if line.startswith("Segment"):
        fields = line.rstrip().split()
        lowerBound = int(fields[3])
        upperBound = int(fields[4])
        chrom = fields[2]
        if addchr:
          chrom = "chr%s" % chrom
        segNum = fields[1]
        segSeqD[segNum] = hg19.interval(chrom,lowerBound,upperBound, info={'name':segNum})
      elif "Cycle=" in line:
        curr_cycle = []
        fields = line.rstrip().rsplit(";")
        lineD = {x.rsplit("=")[0]:x.rsplit("=")[1] for x in fields}
        segs = lineD["Segments"].rsplit(",")
        #TODO: Need to rotate the segs in case there's a 0 within the path, or else path is incorrect
        skip = False
        for i in segs:
          seg = i[:-1]
          if seg != "0":
            strand = i[-1]
            curr_cycle.append((seg,strand))
          else:
            skip = True
        if not skip:
          cycles[lineD["Cycle"]] = curr_cycle
  return cycles,segSeqD
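
# A sketch of the rotation the TODO above refers to: if the source segment '0'
# sits in the middle of the Segments list, rotate the list so the path starts
# (and therefore ends) at the source, as read_cycle_file does further up. Toy
# helper operating on the raw tokens; not part of the original module.
def rotate_to_source(segs):
    """Rotate ['3+', '0+', '5-'] -> ['0+', '5-', '3+']; no-op if no '0' entry."""
    zeros = [i for i, s in enumerate(segs) if s[:-1] == '0']
    if not zeros or zeros[0] == 0:
        return segs
    return segs[zeros[0]:] + segs[:zeros[0]]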
Example 8
 def build_segments(self, bed_data = None):
   if bed_data is None:
     bed_data = self.bed_data
   points_x = []
   points_y = []
   colors = []
   
   fpoints_x = []
   fpoints_y = []
   fcolors = []
   
   previous_end = total_length_with_spacing*(global_rot/360.0)    
   for ind,sp in enumerate(start_points):
     start_point = int(previous_end - sp)
     start_angle = start_point/total_length_with_spacing*360
     end_angle = (start_point - lens[ind])/total_length_with_spacing*360
     
      #segSeqD and the layout globals (cycle, start_points, lens, total_length_with_spacing, global_rot, ax) are referenced as module-level variables here
     segment = segSeqD[cycle[ind][0]] 
     strand = cycle[ind][1]
     hits = [h[0] for h in bed_data.intersection([segment])]
     if self.color_bed is not None:
       color_subhits = hg19.interval_list([h[0] for h in self.color_bed.intersection([segment])])      
     for h in hits:
       for pos in xrange(h.start, h.end, self.point_spacing):
         if pos > segment.end or pos < segment.start:
           continue
         if self.color_bed is not None:
           temp = hg19.interval(h.chrom, pos, pos)
           color_hits = color_subhits.intersection([temp],self.point_spacing)
           if len(color_hits) != 0:
             color = color_hits[0][0].info['color']
           else:
             color = self.color if 'color' not in h.info else h.info['color']
         else:
           color = self.color if 'color' not in h.info else h.info['color']
         if strand == "+":
           normStart = start_point - max(0,pos-segment.start)
           normEnd = start_point - min(segment.end-segment.start,pos-segment.start)
         else:
           normEnd = start_point - min(segment.end-segment.start,segment.end-pos)
           normStart = start_point - max(0,segment.end - pos)
         hvalue = h.info['value'] if h.info['value'] > self.ymin else self.ymin
         hvalue = hvalue if hvalue < self.ymax else self.ymax
         y_scale_value = (1.*hvalue-self.ymin)/(self.ymax-self.ymin)          
         if self.is_log:
           y_scale_value = (math.log10(hvalue)-math.log10(self.ymin))/(math.log10(self.ymax)-math.log10(self.ymin))
         r_scale_value = y_scale_value*(self.track_rmax-self.track_rmin)+self.track_rmin                    
         x_s,y_s = pol2cart(r_scale_value,normStart/total_length_with_spacing*2*np.pi)
         if 'fill' in h.info:
           foo = fpoints_x.append(x_s)
           foo = fpoints_y.append(y_s)
           fcolors.append(color)          
         else:
           foo = points_x.append(x_s)
           foo = points_y.append(y_s)
           colors.append(color)
   foo = ax.scatter(points_x,points_y,marker='o',s=1,linewidths=0.01,facecolors='none',color=colors)  
   foo = ax.scatter(fpoints_x,fpoints_y,marker='*',s=1,linewidths=0.01,color=fcolors)  
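
# pol2cart() and the layout globals above are assumed to be defined elsewhere in
# CycleViz. A minimal sketch of the two mappings the loop performs: polar
# coordinates to Cartesian, and a track value clamped and scaled onto a radius.
import numpy as np

def pol2cart(rho, phi):
    """Polar (radius, angle in radians) to Cartesian (x, y)."""
    return rho * np.cos(phi), rho * np.sin(phi)

def value_to_radius(value, ymin, ymax, rmin, rmax):
    """Clamp value to [ymin, ymax] and map it linearly onto [rmin, rmax]."""
    value = min(max(value, ymin), ymax)
    frac = (value - ymin) / float(ymax - ymin)
    return frac * (rmax - rmin) + rmin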
Example 9
 def kmer_homology(self, k=10, span=100):
     seq1 = ''.join([
         a.capitalize() for a in hg.interval(
             self.v1.chrom, max(1, self.v1.pos - span),
             min(self.v1.pos + span, hg.chrLen[hg.chrNum(self.v1.chrom)]),
             self.v1.strand).sequence()
     ])
     seq2 = ''.join([
         a.capitalize() for a in hg.interval(
             self.v2.chrom, max(1, self.v2.pos - span),
             min(self.v2.pos +
                 span, hg.chrLen[hg.chrNum(self.v2.chrom)]), -1 *
             self.v2.strand).sequence()
     ])
      kset1 = Set([seq1[i:i + k] for i in range(len(seq1) - k + 1)])
      kset2 = Set([seq2[i:i + k] for i in range(len(seq2) - k + 1)])
     return len(kset1.intersection(kset2))
Example 10
def build_genebed_from_fpkm(fpkm):
  fpkm_bed = hg19.interval_list()
  for (g,f) in fpkm.items():
    if g not in ensembl_grc37_map:
      continue
    gene = ensembl_grc37_map[g]
    fpkm_bed.append(hg19.interval(gene.chrom, gene.start, gene.end, info={'value':f, 'name':gene.info['Name']}))
  fpkm_bed.sort()
  return fpkm_bed
Example 11
def load_ensembl_grc37():
  input = open('/pedigree2/projects/namphuon/data/references/hg19/annotations/Homo_sapiens.GRCh37.64.gtf' ,'r')
  ensemble_data = hg19.interval_list()
  ensembl_grc37_map = {}
  for line in input:
    res = line.split('\t')
    info = dict([r.strip().replace('"','').split(' ') for r in res[-1].strip().split('; ') if len(r.split(' ')) == 2])
    temp = hg19.interval("chr%s" % res[0],int(res[3]),int(res[4]),info={'data':info})
    ensemble_data.append(temp)
    foo = ensembl_grc37_map.setdefault(info['gene_id'],[]).append(temp)
  ensembl_grc37 = hg19.interval_list()    
  for g in ensembl_grc37_map:
    start = min([e.start for e in ensembl_grc37_map[g]])
    end = max([e.end for e in ensembl_grc37_map[g]])
    name = [e.info['data']['gene_name'] for e in ensembl_grc37_map[g] if 'gene_name' in e.info['data']]
    name = name[0] if len(name) >= 1 else g          
    ensembl_grc37.append(hg19.interval("%s" % e.chrom, start, end, info={'intervals':ensembl_grc37_map[g],'GeneID':g,'Name':name}))
  input.close()
  ensembl_grc37.sort()
  for e in ensembl_grc37:
    ensembl_grc37_map[e.info['GeneID']] = e
  return (ensembl_grc37, ensembl_grc37_map)
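
# The dense dict() one-liner above parses the 9th GTF column ('gene_id "X";
# gene_name "Y"; ...') into a key/value map. An equivalent, more explicit
# helper (a sketch; attribute entries that are not simple key/value pairs are
# skipped, just as in the original):
def parse_gtf_attributes(attr_field):
    attrs = {}
    for item in attr_field.strip().rstrip(';').split(';'):
        parts = item.strip().replace('"', '').split(' ')
        if len(parts) == 2:
            attrs[parts[0]] = parts[1]
    return attrs

# parse_gtf_attributes('gene_id "ENSG00000141510"; gene_name "TP53";')
# -> {'gene_id': 'ENSG00000141510', 'gene_name': 'TP53'}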
Example 12
 def sequence(self, flank_size=-1):
     if self.edge_type == 'sequence':
         seq = hg.interval(self.v1.chrom, self.v1.pos,
                           self.v2.pos).sequence()
         if flank_size > 0:
             seq = hg.interval(self.v1.chrom, self.v1.pos - flank_size + 1,
                               self.v1.pos).sequence() + seq + hg.interval(
                                   self.v2.chrom, self.v2.pos,
                                   self.v2.pos + flank_size - 1).sequence()
     else:
          if self.hom is None:
             seq = 'N' * 20
         else:
             seq = self.hom_seq
         if flank_size == -1:
             flank_size = 1000
         if flank_size > 0:
             if self.hom is not None and self.hom > 0:
                 hom = self.hom
             else:
                 hom = 0
             if self.edge_type == 'source':
                 if self.v2.strand == -1:
                     right_seq = hg.interval(
                         self.v2.chrom, self.v2.pos + hom,
                         self.v2.pos + hom + flank_size - 1).sequence()
                     left_seq = ''
                 else:
                     left_seq = hg.interval(
                         self.v2.chrom, self.v2.pos - hom - flank_size + 1,
                         self.v2.pos - hom).sequence()
                     right_seq = ''
             elif self.v1.strand == 1:
                 left_seq = hg.interval(self.v1.chrom,
                                        self.v1.pos - hom - flank_size + 1,
                                        self.v1.pos - hom).sequence()
                 if self.v2.strand == -1:
                     right_seq = hg.interval(
                         self.v2.chrom, self.v2.pos + hom,
                         self.v2.pos + hom + flank_size - 1).sequence()
                 else:
                     right_seq = hg.interval(self.v2.chrom,
                                             self.v2.pos - hom -
                                             flank_size + 1,
                                             self.v2.pos - hom,
                                             strand=-1).sequence()
             else:
                 right_seq = hg.interval(self.v1.chrom, self.v1.pos + hom,
                                         self.v1.pos + hom + flank_size -
                                         1).sequence()
                 if self.v2.strand == -1:
                     left_seq = hg.interval(self.v2.chrom,
                                            self.v2.pos + hom,
                                            self.v2.pos + hom + flank_size -
                                            1,
                                            strand=-1).sequence()
                 else:
                     left_seq = hg.interval(
                         self.v2.chrom, self.v2.pos - hom - flank_size + 1,
                         self.v2.pos - hom).sequence()
         seq = left_seq + seq + right_seq
     return seq
Example 13
else:
  samp_name = args.sname.rsplit("/")[-1]

fname = samp_name

bed_feat_dict = {}
if args.bed_files:
  for i,j in zip(args.bed_files,args.feature_labels):
    print j,i
    #feature name -> chromosome -> ordered list of positions
    bed_list = parse_bed_file(i)
    bed_feat_dict[j] = feat_bed_to_lookup(bed_list)

outer_bar = max(bed_track_height*(len(bed_feat_dict)+2),10)

bed_data = hg19.interval_list([hg19.interval('chr8', 127638302, 127938302, info={'value':int(random.random()*100)}), hg19.interval('chr8', 128716346,128746346, info={'value':int(random.random()*100)})])
bed_data.sort()

args.prefix_name = '/pedigree2/projects/namphuon/programs/CycleViz/COLO320DM'
args.cycles_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/onco_amplicon1_cycles.txt'
args.fpkm_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/colo320dm.fpkm.csv'
args.wgs_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/COLO320_DM_S270/colo320dm.wgs.1000.pileup.log.bed'
cycles_numbers = ['6', '9', '10', '12', '13', '14', '15', '16','19']
args.atac_peak_file = '/pedigree2/projects/namphuon/data/paul_gbm39/unsorted/ATAC-seq/SRC1655_summits_250ext_q1e6_nochrM_merged.bed'
args.atac_file = '/pedigree2/projects/namphuon/results/paul_gbm39/ATAC/COLO320DM.atac.1000.pileup.log.bed'


args.prefix_name = '/pedigree2/projects/namphuon/programs/CycleViz/PC3'
args.cycles_file = '/nucleus/pedigree/projects/extrachromosome/data/turner2017/reconstruction/run14/FF-77_amplicon4_cycles.txt'
args.fpkm_file = '/pedigree2/projects/namphuon/results/paul_gbm39/rnaseq/PC3.fpkm.csv'
args.wgs_file = '/pedigree2/projects/namphuon/results/paul_gbm39/PC3/PC3.wgs.1000.pileup.log.bed'
Example 14
                a.intersection(ci[1]).size()
                for ci in hg.interval_list([a]).intersection(cr)
            ])) or a.size() - sum([
                a.intersection(ci[1]).size()
                for ci in hg.interval_list([a]).intersection(cr)
            ]) > 2000000):
        if (len(hg.interval_list([a]).intersection(cr))) == 0:
            uc_list.append(a)
        else:
            cra = hg.interval_list([a]).intersection(cr)
            cpos = a.start
            for crai in cra:
                if cpos < crai[1].start - 1000000:
                    uc_list.append(
                        hg.interval(a.chrom,
                                    cpos,
                                    crai[1].start - 1000000,
                                    info=a.info))
                cpos = crai[1].end + 1000000
            if a.end > cpos:
                uc_list.append(hg.interval(a.chrom, cpos, a.end, info=a.info))

uc_list = hg.interval_list([
    a for a in uc_list if float(a.info[-1]) *
    a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5
])
uc_merge = uc_list.merge_clusters(extend=300000)
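
# merge_clusters(extend=300000) comes from hg19util and is not shown here; a
# minimal sketch of the idea on plain (start, end) tuples: sort, then merge any
# intervals separated by at most `extend` bases.
def merge_with_gap(intervals, extend=0):
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1] + extend:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# merge_with_gap([(0, 100), (250, 400), (1000, 1100)], extend=200)
# -> [(0, 400), (1000, 1100)]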

with open(outname, "w") as outfile:
    for a in uc_merge:
        if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN:
            outfile.write('\t'.join([
Example 15
from collections import defaultdict
import pysam

import hg19util as hg

f = pysam.AlignmentFile("/pedigree2/projects/namphuon/data/SCC090/pacbio/merged.bam")

segs = defaultdict(lambda: [], {})
readlen = {}

refi = hg.interval_list([hg.interval(i) for i in f.references])

segi = 1

qi = 0
qindex = {}
qlist = []

for l in f.fetch():
    ref = l.reference_name.split(':')[0]
    ref_start = int(l.reference_name.split(':')[1].split('-')[0]) + l.reference_start
    ref_end = int(l.reference_name.split(':')[1].split('-')[0]) + l.reference_end
    qstart = l.query_alignment_start
    qend = l.query_alignment_end
    if l.query_name not in qindex:
        qindex[l.query_name] = qi
        qlist.append(l.query_name)
        qi += 1
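    # For reverse-strand alignments, query_alignment_start/end are reported in
    # reference orientation; flip them with infer_query_length() so qstart/qend
    # are expressed in the read's own 5'->3' coordinates.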
    if l.is_reverse:
        qstart = l.infer_query_length() - l.query_alignment_end
        qend = l.infer_query_length() - l.query_alignment_start
Example 16
for a in bamFile:
    vlist[(a.qname, a.is_read1)].append(a)
    if a.tid == -1 or bamFile.getrname(a.tid) not in hg19refs:
        if a.tid == -1:
            continue
        vreads[bamFile.getrname(a.tid)].add((a.qname, a.is_read1))
        continue
    if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid
                             ) and clean_genomic_cluster(clist):
        clusterList.append(clist)
    if caln is None or a.pos > caln.pos + 300 or caln.tid != a.tid:
        clist = []
    caln = a
    #        if hg.interval(a, bamfile=bamFile).num_unmasked() >= 35:
    if hg.interval(a, bamfile=bamFile).rep_content() <= 3 and a.mapq >= 10:
        clist.append(a)
if caln is not None and (a.pos > caln.pos + 300 or
                         caln.tid != a.tid) and clean_genomic_cluster(clist):
    clusterList.append(clist)

# clusterList.sort(key=cmp_to_key(lambda x, y: hg.interval(bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length()) > hg.interval(bamFile.getrname(y[0].tid), y[0].pos, y[-1].pos + y[-1].infer_query_length())))
clusterList.sort(
    key=lambda x: hg.interval(bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos +
                              x[-1].infer_query_length()),
    reverse=True)

vsuper = {
    v: set([
        v2 for v2 in vreads if v2 != v and len(vreads[v]) < len(vreads[v2])
        and vreads[v].issubset(vreads[v2])
Example 17
                    a.intersection(ci[1]).size()
                    for ci in hg.interval_list([a]).intersection(cr)
                ])) or a.size() - sum([
                    a.intersection(ci[1]).size()
                    for ci in hg.interval_list([a]).intersection(cr)
                ]) > 2000000):
            if (len(hg.interval_list([a]).intersection(cr))) == 0:
                uc_list.append(a)
            else:
                cra = hg.interval_list([a]).intersection(cr)
                cpos = a.start
                for crai in cra:
                    if cpos < crai[1].start - 1000000:
                        uc_list.append(
                            hg.interval(a.chrom,
                                        cpos,
                                        crai[1].start - 1000000,
                                        info=a.info))
                    cpos = crai[1].end + 1000000
                if a.end > cpos:
                    uc_list.append(
                        hg.interval(a.chrom, cpos, a.end, info=a.info))

    uc_list = hg.interval_list([
        a for a in uc_list if float(a.info[1]) *
        a.segdup_uniqueness() > 5.0 and a.rep_content() < 2.5
    ])
    uc_merge = uc_list.merge_clusters(extend=300000)
    all_uc = hg.interval_list([
        a[0] for a in uc_merge if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN
    ])
Example 18
for a in bamFile:
    vlist[(a.qname, a.is_read1)].append(a)
    if a.tid == -1 or bamFile.getrname(a.tid) not in hg19refs:
        if a.tid == -1:
            continue
        vreads[bamFile.getrname(a.tid)].add((a.qname, a.is_read1))
        continue
    if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid
                             ) and clean_genomic_cluster(clist):
        clusterList.append(clist)
    if caln is None or a.pos > caln.pos + 300 or caln.tid != a.tid:
        clist = []
    caln = a
    #        if hg.interval(a, bamfile=bamFile).num_unmasked() >= 35:
    if hg.interval(a, bamfile=bamFile).rep_content() <= 3 and a.mapq >= 10:
        clist.append(a)
if caln is not None and (a.pos > caln.pos + 300 or
                         caln.tid != a.tid) and clean_genomic_cluster(clist):
    clusterList.append(clist)

clusterList.sort(lambda x, y: hg.interval(
    bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length(
    )) > hg.interval(bamFile.getrname(y[0].tid), y[0].pos, y[-1].pos + y[-1].
                     infer_query_length()))

vsuper = {
    v: Set([
        v2 for v2 in vreads if v2 != v and len(vreads[v]) < len(vreads[v2])
        and vreads[v].issubset(vreads[v2])
    ])
Example 19
parser = argparse.ArgumentParser(description="Cycles File")
parser.add_argument('--cycles',
                    dest='cycles_file',
                    help="File listing cycles in amplicon",
                    metavar='FILE',
                    action='store',
                    type=str,
                    nargs=1)
args = parser.parse_args()
cycles_file = args.cycles_file[0]
ll = [l.strip().split() for l in open(cycles_file) if len(l.strip()) > 0]

segments = hg.interval_list([
    hg.interval(l[2], int(l[3]), int(l[4]), info=[int(l[1])]) for l in ll
    if l[0] == 'Segment'
])
for s in segments:
    if s.chrom[:3] == 'chr':
        s.info.append('Human')
    else:
        s.info.append('Viral')
segments.sort()
segment_id_dict = {s.info[0]: s for s in segments}

cycles = []
for c in [l[0].split(';') for l in ll if 'Cycle=' in l[0]]:
    c_dict = {cc.split('=')[0]: cc.split('=')[1] for cc in c}
    new_dict = {}
    new_dict['Cycle'] = int(c_dict['Cycle'])
Example 20
 def merge(self, c1, c2, si1, si2):
     cycle1 = self.cycle_dict[c1]
     cycle2 = self.cycle_dict[c2]
     # check if atmost 1 cycle has source vertex
     if '0' in [s[0] for s in cycle1[2]] and '0' in [s[0] for s in cycle2[2]]:
         raise Exception("Cannot merge 2 cycles with source vertices")
     # if cycle2 has source vertex, exchange c1,c2
     if '0' in [s[0] for s in cycle2[2]]:
         (c1, c2, si1, si2, cycle1, cycle2) = (c2, c1, si2, si1, cycle2, cycle1)
         if si1 == 0 or si1 == len(cycle1[2]) - 1:
             raise Exception("Cannot use source segment for merging")
     # check if segments overlap
     if not self.segment_dict[cycle1[2][si1][0]].intersects(self.segment_dict[cycle2[2][si2][0]]):
         raise Exception("Segments do not overlap" + str(self.segment_dict[cycle1[2][si1][0]]) + " " + str(self.segment_dict[cycle2[2][si2][0]]))
     # cnlist: (merged cn, cycle1cn, cycle2cn)
     if cycle1[1] == 0 or cycle2[1] == 0:
         raise Exception("Cycle copy numbers should be > 0 to merge")
     if cycle1[1] > cycle2[1]:
         cnlist = (cycle2[1], cycle1[1] - cycle2[1], 0.0)
     else:
         cnlist = (cycle1[1], 0.0, cycle2[1] - cycle1[1])
     seg1 = self.segment_dict[cycle1[2][si1][0]]
     seg2 = self.segment_dict[cycle2[2][si2][0]]
     seg1_found = False
     seg2_found = False
     for i in self.segment_list:
         if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end):
             seg1_found = True
             ns1 = i.info[0]
             overlap1 = (ns1, cycle1[2][si1][1])
         elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end):
             seg1_found = True
             ns1 = i.info[0]
             overlap1 = (ns1, cycle1[2][si1][1])
         if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end):
             seg2_found = True
             ns2 = i.info[0]
             overlap2 = (ns2, cycle1[2][si1][1])
         elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end):
             seg2_found = True
             ns2 = i.info[0]
             overlap2 = (ns2, cycle1[2][si1][1])
     if not seg1_found:
         ns1 = self.next_seg_id()
         overlap1 = (ns1, cycle1[2][si1][1])
         if cycle1[2][si1][1] == 1:
             self.segment_dict[ns1] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns1])
         else:
             self.segment_dict[ns1] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns1])
         self.segment_list.append(self.segment_dict[ns1])
     if not seg2_found:
         ns2 = self.next_seg_id()
         overlap2 = (ns2, cycle1[2][si1][1])
         if cycle1[2][si1][1] == 1:
             self.segment_dict[ns2] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns2])
         else:
             self.segment_dict[ns2] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns2])
         self.segment_list.append(self.segment_dict[ns2])
     cycle1_init = cycle1[2][:si1]
      if cycle1[2][si1][1] == -1:
         (overlap1, overlap2, ns1, ns2) = (overlap2, overlap1, ns2, ns1)
     if cycle1[2][si1][1] == cycle2[2][si2][1]:
         cycle2_span = cycle2[2][si2 + 1:] + cycle2[2][:si2]
     else:
         cycle2_span = [(s[0], -1 * s[1]) for s in cycle2[2][:si2][::-1] + cycle2[2][si2 + 1:][::-1]]
     cycle1_final = cycle1[2][si1 + 1:]
     mcycle = cycle1_init + [overlap1] + cycle2_span + [overlap2] + cycle1_final
     mcycle_id = self.next_cycle_id()
     self.cycle_dict[mcycle_id] = (mcycle_id, cnlist[0], mcycle)
     self.cycle_dict[c1] = (c1, cnlist[1], cycle1[2])
     self.cycle_dict[c2] = (c2, cnlist[2], cycle2[2])
     return
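
# The splice performed by merge(): cycle2 is opened at its overlap entry and
# inserted into cycle1 between the two overlap segments, traversed in reverse
# with flipped orientations when the shared segments point opposite ways. A toy
# sketch on plain (segment_id, orientation) tuples:
def splice_cycles(path1, si1, path2, si2, overlap1, overlap2, same_orientation):
    if same_orientation:
        span = path2[si2 + 1:] + path2[:si2]
    else:
        span = [(seg, -orient)
                for seg, orient in path2[:si2][::-1] + path2[si2 + 1:][::-1]]
    return path1[:si1] + [overlap1] + span + [overlap2] + path1[si1 + 1:]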
Example 21
                if a.tid == -1:
                    continue
                vreads[bamFile.getrname(a.tid)].add((a.qname, a.is_read1))
                continue
        if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid) and clean_genomic_cluster(clist):
                clusterList.append(clist)
        if caln is None or a.pos > caln.pos + 300 or caln.tid != a.tid:
                clist = []
        caln = a
#        if hg.interval(a, bamfile=bamFile).num_unmasked() >= 35:
#        if hg.interval(a, bamfile=bamFile).rep_content() <= 3 and a.mapq >= 10:
        clist.append(a)
if caln is not None and (a.pos > caln.pos + 300 or caln.tid != a.tid) and clean_genomic_cluster(clist):
        clusterList.append(clist)

clusterList.sort(lambda x, y: hg.interval(bamFile.getrname(x[0].tid), x[0].pos, x[-1].pos + x[-1].infer_query_length()) > hg.interval(bamFile.getrname(y[0].tid), y[0].pos, y[-1].pos + y[-1].infer_query_length()))

vsuper = {v: Set([v2 for v2 in vreads if v2 != v and len(vreads[v]) < len(vreads[v2]) and vreads[v].issubset(vreads[v2])]) for v in vreads}
vequaldict = {}
vequal = []
for v in vreads:
        inserted = False
        for vset in vequal:
                if vreads[v] == vreads[vset[0]]:
                        vset[1].add(v)
                        vequaldict[v] = vset
                        inserted = True
                        break
        if not inserted:
                vset = (v, Set([v]))
                vequal.append(vset)
Example 22
segments = []
# segments=hg.interval_list(rdAlts.replace('.bed', '_segments.bed'), 'bed')

# bandsfile="karyotype.HK359.EGFR.txt"
# segments = [(l[2], hg.interval(l[1], int(l[4]), int(l[5])).intersection(i), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr'] if hg.interval(l[1], int(l[4]), int(l[5])).intersects(i)]
# segments = [('', hg.interval(l[1], int(l[4]), int(l[5])), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr']]

if args.extendmode == 'VIRAL':
    logging.info("#TIME " + '%.3f\t' % (clock() - TSTART) +
                 "Finding integration sites: " + str(rdList[0]))
    de = bamFileb2b.interval_discordant_edges(rdList)
    old_stdout = sys.stdout
    sys.stdout = mystdout = StringIO()
    amplist = bamFileb2b.interval_hops(rdList, explore=False)
    alist = hg.interval_list(
        [hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos) for e in de] +
        [hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos)
         for e in de] + rdList)
    alist.sort()
    rdList = hg.interval_list([
        i[0] for i in alist.merge_clusters(extend=5000000) if len(
            hg.interval_list([i[0]]).intersection(amplist) +
            hg.interval_list([i[0]]).intersection(rdList)) > 0
    ])
    rdList = hg.interval_list([
        hg.interval(i.chrom, max(0, i.start - 10000),
                    min(i.end + 10000, hg.chrLen[hg.chrNum(i.chrom)]))
        for i in rdList
    ])
    iout = open(outName + '.integration_search.out', 'w')
    iout.write(mystdout.getvalue())
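
# The block above appears to rely on the stdout-capture idiom so that whatever
# interval_hops() prints ends up in the .integration_search.out file. A minimal,
# self-contained sketch of that idiom (the StringIO import location differs
# between Python 2 and 3):
import sys
try:
    from StringIO import StringIO   # Python 2
except ImportError:
    from io import StringIO         # Python 3

def capture_stdout(func, *args, **kwargs):
    """Run func while redirecting anything it prints into a string."""
    old_stdout = sys.stdout
    sys.stdout = buf = StringIO()
    try:
        result = func(*args, **kwargs)
    finally:
        sys.stdout = old_stdout     # always restore, even on error
    return result, buf.getvalue()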
Example 23
    def draw_episome(self, input_files, output_file=None, auto_scale=0):
        cycles_section_top = 30
        cycles_section_size = 0
        space_between_decompositions = 90 / (1 + auto_scale)
        bottoms = []
        for i in range(len(input_files)):
            if i != 0:
                cycles_section_size += space_between_decompositions
            input_content = input_files[i][1]
            intervals, segments, seg_name_to_index_map, cycles, directions, chr_offs, copy_counts, cycles_names = self.readDataFile(
                input_content)

            number_of_element = sum(len(x) for x in cycles)
            cycles_section_size += number_of_element * 10
            for cycle in cycles:
                if cycle[-1] != 0:
                    cycles_section_size += 10
            bottoms.append(cycles_section_top + cycles_section_size)
        tops = [cycles_section_top] + [
            bottom + space_between_decompositions for bottom in bottoms[:-1]
        ]
        cycles_section_bottom = cycles_section_top + cycles_section_size

        # print ('cycle section_top:', cycles_section_top)
        # print ('cycle section_bottom:', cycles_section_bottom)
        for i in range(len(input_files)):
            input_content = input_files[i][1]
            self.file_names.append(
                Text('%s: %s' % (str(i + 1), input_files[i][0]), 0.5,
                     tops[i] - 27))
            # print ('bottom:', bottoms[i])
            # print ('top:', tops[i])
            intervals, segments, seg_name_to_index_map, cycles, directions, chr_offs, copy_counts, cycles_names = self.readDataFile(
                input_content)
            if i == 0:
                self.reconstructed_cycles = [cname for cname in cycles_names]
                self.reconstructed_segments = [
                    segment_count for segment_count in range(len(segments))
                ]

            self.compute_chr_offsets(chr_offs)
            if len(intervals) == 0:
                intervals = self.compute_intervals(segments)
            ilist = hg.interval_list([
                hg.interval(chr_name, start_point, end_point)
                for chr_name, start_point, end_point in intervals
            ])
            maxIntvl = self.findMaxIntervals(segments)
            sortedL = {}
            compact = {}
            span = {}
            for ch in maxIntvl.keys():
                sortedL[ch] = self.makeListOfSegmentEndPoints(segments, ch)
                sortedCopy = list(sortedL[ch])
                mergeL = self.mergeIntervals(sortedCopy)
                compact[ch], span[ch] = self.compactIntervals(
                    mergeL, maxIntvl[ch])

            newsegs = self.convertSegmentCoordinates(segments, span, compact,
                                                     maxIntvl, ilist)

            if i == 0:
                self.drawSections(intervals, ilist, cycles_section_top,
                                  cycles_section_bottom, auto_scale)

            # self.drawAxesLabels(sortedL, compact, maxIntvl, span, chr_offs, bottoms[i])
            # self.drawAxes(span, compact, maxIntvl, chr_offs, bottoms[i])
            # self.drawAxesDottedLines(sortedL, compact, maxIntvl, span, chr_offs, tops[i], bottoms[i])
            self.drawCycles(newsegs, seg_name_to_index_map, cycles, directions,
                            tops[i], chr_offs, i, copy_counts, cycles_names)