def __init__(self, chrom_sizes, regions=None):
    """Collect the genomic regions on which differential peaks are called.

    Args:
        chrom_sizes: Path to a tab-separated file holding a chromosome name
            in the first column and its size in the second. Only read when
            ``regions`` is None.
        regions: Optional path to a whitespace-separated file with
            ``chrom start end`` in the first three columns. When given, these
            regions are used instead of whole chromosomes.

    Terminates the process with exit status 2 when no region was collected.
    """
    # NOTE(review): self.regionset and self.chrom_sizes_dict are used but not
    # created here; presumably they exist before this runs -- confirm.
    self.counter = 0

    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                fields = line.split()
                # A raw line always contains its newline, so emptiness has to
                # be tested after splitting; blank lines are skipped.
                if not fields:
                    continue
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                self.regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                self.chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                self.regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                self.chrom_sizes_dict[chrom] = end

    if not self.regionset.sequences:
        print('something wrong here', file=sys.stderr)
        sys.exit(2)
def __init__(self, chrom, initial, final, name=None, score=None, errors_bp=None, motif=None,
             strand=None, orientation=None, guanine_rate=None, seq=None):
    """*Keyword arguments:*

        - chrom -- Chromosome of the binding site
        - initial -- Binding start position
        - final -- Binding end position
        - name -- The name of this binding site (Default: None)
        - score -- Score of the binding pattern (Default: None)
        - errors_bp -- Error base pair in this binding (Default: None)
        - motif -- The motif for this binding (Default: None)
        - strand -- Accepted for compatibility; it is not stored (Default: None)
        - orientation -- Parallel or antiparallel (Default: None)
        - guanine_rate -- Stored as given unless seq is provided (Default: None)
        - seq -- Sequence object of this region; it must expose a ``seq`` string
          and support ``len()``. When given, guanine_rate is recomputed from it.
    """
    GenomicRegion.__init__(self, chrom=chrom, initial=initial, final=final)

    self.name = name  # RNA name
    self.score = score  # Score for pattern matching
    self.errors_bp = errors_bp
    self.motif = motif
    self.orientation = orientation
    self.seq = seq  # An object (Sequence) not just a string

    # Always define the attribute so that reading it never raises
    # AttributeError when no sequence was supplied.
    self.guanine_rate = guanine_rate
    if seq:
        # Fraction of "G" letters, kept as a string with two decimals.
        self.guanine_rate = "{0:.2f}".format(float(seq.seq.count("G")) / len(seq))
def __init__(self, chrom, pos, ref, alt, qual, filter=None, id=None, info=None, format=None,
             genotype=None, samples=None):
    """Create a variant as a one-position GenomicRegion ``[pos, pos + 1)``.

    Args:
        chrom: Chromosome; stored as a string.
        pos: Position of the variant; converted with ``int()``.
        ref, alt, qual: Stored unchanged.
        filter, id, info, format, genotype, samples: Optional fields, stored
            unchanged (Default: None).

    All fields except chrom and pos are additionally serialised into
    ``self.data``, joined by ``"_$_"``.
    """
    # Convert first: a position read from text is a string and ``pos + 1``
    # would raise TypeError before the conversion took place.
    pos = int(pos)
    GenomicRegion.__init__(self, chrom, pos, pos + 1)

    self.chrom = str(chrom)
    self.pos = pos
    self.id = id
    self.ref = ref
    self.alt = alt
    self.qual = qual
    self.filter = filter
    self.info = info
    self.format = format
    self.genotype = genotype
    self.samples = samples

    # NOTE(review): this stores the bound method itself, not its result;
    # looks like ``str(self)`` may have been intended -- confirm with callers
    # before changing.
    self.name = self.__str__

    fields = [self.id, self.ref, self.alt, self.qual, self.filter, self.info,
              self.format, self.genotype, self.samples]
    self.data = "_$_".join(map(str, fields))
def __init__(self, chrom, initial, final, name=None, score=None, errors_bp=None, motif=None,
             strand=None, orientation=None, guanine_rate=None, seq=None):
    """Initialize

    chrom
        Chromosome of the binding site
    initial
        Binding start position
    final
        Binding end position
    name
        The name of this binding site (Default: None)
    score
        Score of the binding pattern (Default: None)
    errors_bp
        Error base pair in this binding (Default: None)
    motif
        The motif for this binding (Default: None)
    strand
        Accepted for compatibility; it is not stored (Default: None)
    orientation
        Parallel or antiparallel (Default: None)
    guanine_rate
        Stored as given unless seq is provided (Default: None)
    seq
        Sequence object of this region; it must expose a ``seq`` string and
        support ``len()``. When given, guanine_rate is recomputed from it.
    """
    GenomicRegion.__init__(self, chrom=chrom, initial=initial, final=final)

    self.name = name  # RNA name
    self.score = score  # Score for pattern matching
    self.errors_bp = errors_bp
    self.motif = motif
    self.orientation = orientation
    self.seq = seq  # An object (Sequence) not just a string

    # Define the attribute unconditionally; previously it only existed when a
    # sequence was passed and the guanine_rate argument was dropped.
    self.guanine_rate = guanine_rate
    if seq:
        # Fraction of "G" letters, kept as a string with two decimals.
        self.guanine_rate = "{0:.2f}".format(float(seq.seq.count("G")) / len(seq))
def test_match_multiple(self):
    """The CTCF motif read from its PWM file yields one hit per strand on chr1:0-5022."""
    pwm_path = os.path.join(os.path.dirname(__file__),
                            "../../data/motifs/jaspar_vertebrates/",
                            "MA0139.1.CTCF.pwm")
    ctcf = Motif(pwm_path, 1, 0.0001, None)

    # Forward matrix first, reverse complement second; thresholds follow the same order.
    matrices = [ctcf.pssm, ctcf.pssm_rc]
    cutoffs = [ctcf.threshold, ctcf.threshold_rc]

    scanner = scan.Scanner(7)
    scanner.set_motifs(matrices, tools.flat_bg(4), cutoffs)

    region = GenomicRegion("chr1", 0, 5022)
    # Sequence covered by the region
    dna = str(self.genome_file.fetch(region.chrom, region.initial, region.final))

    hits = match_multiple(scanner, [ctcf], dna, region)

    expected = [
        GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
        GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
    ]
    self.assertSequenceEqual(hits.sequences, expected)
def test_match_multiple(self):
    """The CTCF motif taken from the preloaded MotifSet yields one hit per strand on chr1:0-5022."""
    motif_set = MotifSet(preload_motifs="default")
    motif_set = motif_set.filter({'database': ["jaspar_vertebrates"], 'name': ["MA0139.1.CTCF"]},
                                 search="inexact")
    self.assertEqual(len(motif_set), 1)

    ctcf = motif_set.get_motif_list(1, 0.0001)[0]

    # Forward matrix first, reverse complement second; both use motif.threshold.
    cutoffs = [ctcf.threshold, ctcf.threshold]
    matrices = [ctcf.pssm, ctcf.pssm_rc]

    scanner = scan.Scanner(7)
    scanner.set_motifs(matrices, tools.flat_bg(4), cutoffs)

    region = GenomicRegion("chr1", 0, 5022)
    # Sequence covered by the region
    dna = str(self.genome_file.fetch(region.chrom, region.initial, region.final))

    hits = match_multiple(scanner, [ctcf], dna, region)

    expected = [
        GenomicRegion("chr1", 4270, 4289, name="MA0139.1.CTCF", orientation="+"),
        GenomicRegion("chr1", 4180, 4199, name="MA0139.1.CTCF", orientation="-"),
    ]
    self.assertSequenceEqual(hits.sequences, expected)
def __init__(self, chrom, pos, ref, alt, qual, filter = None, id = None, info = None, format = None,
             genotype = None, samples = None):
    """Create a variant as a one-position GenomicRegion ``[pos, pos + 1)``.

    Args:
        chrom: Chromosome; stored as a string.
        pos: Position of the variant; converted with ``int()``.
        ref, alt, qual: Stored unchanged.
        filter, id, info, format, genotype, samples: Optional fields, stored
            unchanged (Default: None).

    All fields except chrom and pos are additionally serialised into
    ``self.data``, joined by ``"_$_"``.
    """
    # The position is converted before any arithmetic so that positions given
    # as text do not fail on ``pos + 1``.
    pos = int(pos)
    GenomicRegion.__init__(self, chrom, pos, pos + 1)

    self.chrom = str(chrom)
    self.pos = pos
    self.id = id
    self.ref = ref
    self.alt = alt
    self.qual = qual
    self.filter = filter
    self.info = info
    self.format = format
    self.genotype = genotype
    self.samples = samples

    # NOTE(review): the bound method is stored, not the string it returns;
    # ``str(self)`` may have been intended -- confirm with callers first.
    self.name = self.__str__

    self.data = "_$_".join(map(str, [self.id, self.ref, self.alt, self.qual, self.filter,
                                     self.info, self.format, self.genotype, self.samples]))
def initialize(name, dims, genome_path, regions, stepsize, binsize, bamfiles, exts,
               inputs, exts_inputs, factors_inputs, chrom_sizes, verbose, no_gc_content,
               tracker, debug, norm_regions, scaling_factors_ip, save_wig):
    """Initialize the MultiCoverageSet.

    The search space is read from ``regions`` (tab-separated
    ``chrom start end``) when it is given, otherwise whole chromosomes are
    taken from ``chrom_sizes`` (tab-separated ``chrom size``). Blank lines in
    either file are ignored. Extension sizes are obtained from
    ``_compute_extension_sizes`` and reported through ``tracker``.

    Returns:
        The MultiCoverageSet built from the collected regions and the
        remaining arguments, which are forwarded unchanged.
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}

    # If the regions option is set, take its values; otherwise use the whole
    # set of chromosomes as the region to search for DPs.
    if regions is not None:
        print("Call DPs on specified regions.", file=sys.stderr)
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # e.g. a trailing empty line; indexing it would fail
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                chrom_sizes_dict[c] = e
    else:
        print("Call DPs on whole genome.", file=sys.stderr)
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    if norm_regions:
        norm_regionset = GenomicRegionSet('norm_regions')
        norm_regionset.read_bed(norm_regions)
    else:
        norm_regionset = None

    regionset.sequences.sort()

    exts, exts_inputs = _compute_extension_sizes(bamfiles, exts, inputs, exts_inputs, verbose)
    tracker.write(text=str(exts).strip('[]'), header="Extension size (rep1, rep2, input1, input2)")

    multi_cov_set = MultiCoverageSet(
        name=name, regions=regionset, dims=dims, genome_path=genome_path,
        binsize=binsize, stepsize=stepsize, rmdup=True,
        path_bamfiles=bamfiles, path_inputs=inputs, exts=exts, exts_inputs=exts_inputs,
        factors_inputs=factors_inputs, chrom_sizes=chrom_sizes, verbose=verbose,
        no_gc_content=no_gc_content, chrom_sizes_dict=chrom_sizes_dict, debug=debug,
        norm_regionset=norm_regionset, scaling_factors_ip=scaling_factors_ip,
        save_wig=save_wig)
    return multi_cov_set
def intersect(gnrsA, gnrsB, overlap_type):
    """Intersect two region sets by delegating to one of the C routines.

    ``overlap_type`` selects the routine: 0 -> ``intersect_overlap_c``,
    1 -> ``intersect_original_c``, 2 -> ``intersect_completely_included_c``.
    Any other value calls nothing, so the result stays empty.

    Returns a GenomicRegionSet named after ``gnrsA`` whose regions take their
    chromosome from ``gnrsA`` (via the index reported by the C routine) and
    their coordinates from the C output arrays.
    """
    n_a = len(gnrsA)
    n_b = len(gnrsB)
    n_out = min(n_a, n_b)

    # Convert the Python-side columns into ctypes arrays
    names_a = [region.chrom for region in gnrsA.sequences]
    c_names_a = (c_char_p * n_a)(*names_a)
    names_b = [region.chrom for region in gnrsB.sequences]
    c_names_b = (c_char_p * n_b)(*names_b)

    starts_a = [region.initial for region in gnrsA.sequences]
    c_starts_a = (c_int * n_a)(*starts_a)
    starts_b = [region.initial for region in gnrsB.sequences]
    c_starts_b = (c_int * n_b)(*starts_b)

    ends_a = [region.final for region in gnrsA.sequences]
    c_ends_a = (c_int * n_a)(*ends_a)
    ends_b = [region.final for region in gnrsB.sequences]
    c_ends_b = (c_int * n_b)(*ends_b)

    # Output buffers, each sized for min(len(A), len(B)) entries
    out_indices = POINTER(c_int)((c_int * n_out)())
    out_starts = POINTER(c_int)((c_int * n_out)())
    out_ends = POINTER(c_int)((c_int * n_out)())
    out_size = c_int()

    # Pick the C routine
    if overlap_type == 0:
        c_routine = intersect_overlap_c
    elif overlap_type == 1:
        c_routine = intersect_original_c
    elif overlap_type == 2:
        c_routine = intersect_completely_included_c
    else:
        c_routine = None

    if c_routine is not None:
        c_routine(c_names_a, c_starts_a, c_ends_a, n_a,
                  c_names_b, c_starts_b, c_ends_b, n_b,
                  pointer(out_indices), pointer(out_starts), pointer(out_ends),
                  byref(out_size))

    result = GenomicRegionSet(gnrsA.name)
    for k in range(out_size.value):
        chrom = names_a[out_indices[k]]
        result.add(GenomicRegion(chrom, out_starts[k], out_ends[k]))
    return result
def __init__(self, chrom, initial, final, name=None, score=None, errors_bp=None, motif=None,
             strand=None, orientation=None, guanine_rate=None, seq=None):
    """*Keyword arguments:*

        - chrom -- Chromosome of the binding site
        - initial -- Binding start position
        - final -- Binding end position
        - name -- The name of this binding site (Default: None)
        - score -- Score of the binding pattern (Default: None)
        - errors_bp -- Error base pair in this binding (Default: None)
        - motif -- The motif for this binding (Default: None)
        - strand -- Accepted for compatibility; it is not stored (Default: None)
        - orientation -- Parallel or antiparallel (Default: None)
        - guanine_rate -- Stored as given unless seq is provided (Default: None)
        - seq -- Sequence object of this region; it must expose a ``seq`` string
          and support ``len()``. When given, guanine_rate is recomputed from it.
    """
    GenomicRegion.__init__(self, chrom=chrom, initial=initial, final=final)

    self.name = name  # RNA name
    self.score = score  # Score for pattern matching
    self.errors_bp = errors_bp
    self.motif = motif
    self.orientation = orientation
    self.seq = seq  # An object (Sequence) not just a string

    # The attribute is created in every case; without this it was missing
    # whenever no sequence was supplied.
    self.guanine_rate = guanine_rate
    if seq:
        # Fraction of "G" letters, kept as a string with two decimals.
        self.guanine_rate = "{0:.2f}".format(
            float(seq.seq.count("G")) / len(seq))
def merge_delete(ext_size, merge, peak_list, pvalue_list):
    """Drop short peaks and optionally merge the remaining ones.

    Args:
        ext_size: Peaks whose length (end - start) does not exceed this value
            are discarded. Half of it is also the distance by which mergeable
            regions are widened before merging and narrowed again afterwards.
        merge: When true, the mergeable '+' and '-' sets are merged.
        peak_list: Sequence of records whose first seven items are
            ``chrom, start, end, c1, c2, strand, ratio``.
        pvalue_list: One value per peak, same order as ``peak_list``.

    Returns:
        A sorted GenomicRegionSet holding mergeable and unmergeable regions.
        Each region stores ``str((c1, c2, pvalue, ratio))`` as data.
    """
    regions_plus = GenomicRegionSet('regions')  # potentially mergeable, '+' strand
    regions_minus = GenomicRegionSet('regions')  # potentially mergeable, '-' strand
    regions_unmergable = GenomicRegionSet('regions')

    # NOTE(review): last_orientation is never reassigned, so every retained
    # peak currently ends up in regions_unmergable and merging has no effect.
    # Presumably it should track the previous peak's strand -- confirm before
    # changing, since that would alter the reported peaks.
    last_orientation = ""

    for i, t in enumerate(peak_list):
        chrom, start, end, c1, c2, strand, ratio = t[0], t[1], t[2], t[3], t[4], t[5], t[6]
        r = GenomicRegion(chrom=chrom, initial=start, final=end, name='',
                          orientation=strand, data=str((c1, c2, pvalue_list[i], ratio)))
        if end - start <= ext_size:
            continue  # too short: deleted

        if strand == '+':
            # Fixed: the original referenced the undefined name region_plus.
            target = regions_plus if last_orientation == '+' else regions_unmergable
            target.add(r)
        elif strand == '-':
            # Fixed: the original referenced the undefined name region_mins.
            target = regions_minus if last_orientation == '-' else regions_unmergable
            target.add(r)

    if merge:
        # Integer half so that coordinates stay integral and the widening is
        # undone by exactly the same amount.
        half = ext_size // 2
        for mergeable in (regions_plus, regions_minus):
            mergeable.extend(half, half)
            mergeable.merge()
            mergeable.extend(-half, -half)
            merge_data(mergeable)

    results = GenomicRegionSet('regions')
    for region_set in (regions_plus, regions_minus, regions_unmergable):
        for el in region_set:
            results.add(el)
    results.sort()

    return results
def rna_associated_gene(rna_regions, name, organism):
    """Return the genes associated with the locus spanned by ``rna_regions``.

    The locus runs from the smallest start (index 1) to the largest end
    (index 2) over all entries, on the chromosome (index 0) and strand
    (index 3) of the first entry. Associated gene names are split on ":",
    names containing ``name`` are dropped, and the rest is joined with ":".
    Returns "." when ``rna_regions`` is empty or no gene remains.
    """
    if not rna_regions:
        return "."

    chrom = rna_regions[0][0]
    start = min([entry[1] for entry in rna_regions])
    end = max([entry[2] for entry in rna_regions])
    strand = rna_regions[0][3]

    locus = GenomicRegionSet("RNA associated genes")
    locus.add(GenomicRegion(chrom=chrom, initial=start, final=end,
                            name=name, orientation=strand))
    associated = locus.gene_association(organism=organism, promoterLength=1000, show_dis=True)

    # Keep every gene whose label does not contain the RNA's own name
    others = set(gene for gene in associated[0].name.split(":") if name not in gene)
    if not others:
        return "."
    return ":".join(others)
def test_cmp(self):
    """Compare a reference region (chrom 1, 10-20) against a series of other regions."""
    reference = GenomicRegion(chrom=1, initial=10, final=20)

    # (chrom, initial, final) of the other region and the relation expected
    # to hold for ``reference <relation> other``
    cases = [
        ((1, 12, 22), "<"),
        ((1, 8, 18), ">"),
        ((1, 10, 12), ">"),
        ((1, 12, 14), "<"),
        (('X', 4, 8), "<"),
        ((1, 10, 18), ">="),
    ]
    for (chrom, initial, final), relation in cases:
        other = GenomicRegion(chrom=chrom, initial=initial, final=final)
        if relation == "<":
            self.assertTrue(reference < other)
        elif relation == ">":
            self.assertTrue(reference > other)
        else:
            self.assertTrue(reference >= other)
def test_len(self):
    """A region with initial=10 and final=20 has length 10."""
    self.assertEqual(len(GenomicRegion(chrom=1, initial=10, final=20)), 10)
def test_overlap(self):
    """Check overlap() for regular, bordering and zero-length regions."""
    base = GenomicRegion(chrom=1, initial=10, final=15)

    # (initial, final, expected overlap with base)
    expectations = [
        # usual cases
        (20, 25, False),
        (0, 5, False),
        (7, 12, True),
        (12, 18, True),
        (12, 14, True),
        # other region within base
        (11, 13, True),
        # border cases: touching without overlapping
        (5, 10, False),
        # here, they overlap
        (5, 11, True),
        # they touch, do not overlap
        (15, 20, False),
        # they overlap in 1 bp (14th)
        (14, 20, True),
    ]
    for initial, final, overlaps in expectations:
        other = GenomicRegion(chrom=1, initial=initial, final=final)
        check = self.assertTrue if overlaps else self.assertFalse
        check(base.overlap(other))

    # a zero-length region never overlaps anything
    for initial, final in ((10, 10), (11, 11), (5, 10)):
        empty = GenomicRegion(chrom=1, initial=10, final=10)
        other = GenomicRegion(chrom=1, initial=initial, final=final)
        self.assertFalse(empty.overlap(other))
def test_extend(self):
    """Check extend() on a fresh region (chrom 1, 10-20) for several left/right amounts."""
    # (left, right, expected initial, expected final or None when unchecked)
    scenarios = [
        (5, 15, 5, 35),      # normal extend
        (-5, -1, 15, 19),    # negative values
        (15, 0, 0, None),    # extend to under zero
        (-50, -50, 0, 60),   # initial and final coordinate change
    ]
    for left, right, want_initial, want_final in scenarios:
        region = GenomicRegion(chrom=1, initial=10, final=20)
        region.extend(left, right)
        self.assertEqual(region.initial, want_initial)
        if want_final is not None:
            self.assertEqual(region.final, want_final)
def initialize(name, genome_path, regions, stepsize, binsize, bam_file_1, bam_file_2, ext_1, ext_2,
               input_1, input_factor_1, ext_input_1, input_2, input_factor_2, ext_input_2, chrom_sizes,
               verbose, norm_strategy, no_gc_content, deadzones,
               factor_input_1, factor_input_2, debug, tracker):
    """Build the DualCoverageSet for two BAM files and their optional inputs.

    The search space is read from ``regions`` (tab-separated
    ``chrom start end``) when given, otherwise whole chromosomes are taken
    from ``chrom_sizes`` (tab-separated ``chrom size``). Blank lines in either
    file are ignored. Every extension size passed as None is computed with
    ``get_extension_size``; when both inputs are the same file and the first
    input's extension was computed here, it is reused for the second input.
    With ``verbose`` the values returned alongside each computed extension
    size are written to ``<name>-read-ext-<suffix>`` files.

    Returns:
        ``(DualCoverageSet, [ext_1, ext_2])``
    """
    regionset = GenomicRegionSet(name)
    chrom_sizes_dict = {}

    # If the regions option is set, take its values; otherwise use the whole
    # set of chromosomes as the region to search for DPs.
    if regions is not None:
        with open(regions) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # e.g. a trailing empty line; indexing it would fail
                    continue
                fields = line.split('\t')
                c, s, e = fields[0], int(fields[1]), int(fields[2])
                regionset.add(GenomicRegion(chrom=c, initial=s, final=e))
                chrom_sizes_dict[c] = e
    else:
        with open(chrom_sizes) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                chrom, end = fields[0], int(fields[1])
                regionset.add(GenomicRegion(chrom=chrom, initial=0, final=end))
                chrom_sizes_dict[chrom] = end

    regionset.sequences.sort()

    # Arguments handed to get_extension_size
    start = 0
    end = 600
    ext_stepsize = 5

    # File suffix -> values returned by get_extension_size. A key is present
    # only when that extension size was computed here, which replaces the
    # former ``'values_x' in locals()`` checks.
    profiles = {}

    # compute extension size
    if [ext_1, ext_2, ext_input_1, ext_input_2].count(None) > 0:
        print("Computing read extension sizes...", file=sys.stderr)
    if ext_1 is None:
        ext_1, profiles['1'] = get_extension_size(bam_file_1, start=start, end=end,
                                                  stepsize=ext_stepsize)
        print("Read extension for first file: %s" % ext_1, file=sys.stderr)
    if ext_2 is None:
        ext_2, profiles['2'] = get_extension_size(bam_file_2, start=start, end=end,
                                                  stepsize=ext_stepsize)
        print("Read extension for second file: %s" % ext_2, file=sys.stderr)
    if input_1 is not None and ext_input_1 is None:
        ext_input_1, profiles['input-1'] = get_extension_size(input_1, start=start, end=end,
                                                              stepsize=ext_stepsize)
        print("Read extension for first input file: %s" % ext_input_1, file=sys.stderr)

    if input_1 is not None and input_2 is not None and input_1 == input_2 \
            and 'input-1' in profiles:
        # Same control file for both conditions: reuse the computed result.
        ext_input_2 = ext_input_1
        profiles['input-2'] = profiles['input-1']
    elif input_2 is not None and ext_input_2 is None:
        ext_input_2, profiles['input-2'] = get_extension_size(input_2, start=start, end=end,
                                                              stepsize=ext_stepsize)
        print("Read extension for second input file: %s" % ext_input_2, file=sys.stderr)

    tracker.write(text=str(ext_1) + "," + str(ext_2), header="Extension size IP1, IP2")
    if input_1 is not None and input_2 is not None:
        tracker.write(text=str(ext_input_1) + "," + str(ext_input_2),
                      header="Extension size Control1, Control2")

    if verbose:
        for suffix in ('1', '2', 'input-1', 'input-2'):
            values = profiles.get(suffix)
            if values is None:
                continue
            with open(name + '-read-ext-' + suffix, 'w') as f:
                for v, i in values:
                    print(i, v, sep='\t', file=f)

    cov_cdp_mpp = DualCoverageSet(
        name=name, region=regionset, genome_path=genome_path, binsize=binsize,
        stepsize=stepsize, rmdup=True,
        file_1=bam_file_1, ext_1=ext_1,
        file_2=bam_file_2, ext_2=ext_2,
        input_1=input_1, ext_input_1=ext_input_1, input_factor_1=input_factor_1,
        input_2=input_2, ext_input_2=ext_input_2, input_factor_2=input_factor_2,
        chrom_sizes=chrom_sizes, verbose=verbose, norm_strategy=norm_strategy,
        no_gc_content=no_gc_content, deadzones=deadzones,
        factor_input_1=factor_input_1, factor_input_2=factor_input_2,
        chrom_sizes_dict=chrom_sizes_dict, debug=debug, tracker=tracker)

    return cov_cdp_mpp, [ext_1, ext_2]
def test_extend(self):
    """Check extend() on a fresh region (chrom 1, 10-20) for several left/right amounts."""
    def extended(left, right):
        # Build the common starting region and apply one extension to it
        region = GenomicRegion(chrom=1, initial=10, final=20)
        region.extend(left, right)
        return region

    # normal extend
    grown = extended(5, 15)
    self.assertEqual(grown.initial, 5)
    self.assertEqual(grown.final, 35)

    # use negative values to extend
    shrunk = extended(-5, -1)
    self.assertEqual(shrunk.initial, 15)
    self.assertEqual(shrunk.final, 19)

    # extend to under zero
    clipped = extended(15, 0)
    self.assertEqual(clipped.initial, 0)

    # extend so that initial and final coordinate change
    flipped = extended(-50, -50)
    self.assertEqual(flipped.initial, 0)
    self.assertEqual(flipped.final, 60)
def test_overlap(self):
    """Check overlap() for regular, bordering and zero-length regions."""
    def other(initial, final):
        # Region on chromosome 1 to compare against
        return GenomicRegion(chrom=1, initial=initial, final=final)

    base = GenomicRegion(chrom=1, initial=10, final=15)

    # (initial, final, expected overlap with base), in the order checked
    table = (
        (20, 25, False),   # usual cases
        (0, 5, False),
        (7, 12, True),
        (12, 18, True),
        (12, 14, True),
        (11, 13, True),    # other region within base
        (5, 10, False),    # border: regions touch, but do not overlap
        (5, 11, True),     # here, they overlap
        (15, 20, False),   # they touch, do not overlap
        (14, 20, True),    # they overlap in 1 bp (14th)
    )
    for initial, final, expected in table:
        outcome = base.overlap(other(initial, final))
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)

    # zero-length region against zero-length and regular regions
    for initial, final in ((10, 10), (11, 11), (5, 10)):
        empty = GenomicRegion(chrom=1, initial=10, final=10)
        self.assertFalse(empty.overlap(other(initial, final)))
def dbd_regions(exons, sig_region, rna_name, output,out_file=False, temp=None, fasta=True):
    """Generate the BED file of significant DBD regions and FASTA file of the sequences.

    Positions in ``sig_region`` are offsets along the concatenated exons; they
    are mapped onto exon coordinates and written as BED. A region that runs
    past the end of one exon is emitted as two pieces named ``*_split1`` and
    ``*_split2``.

    Args:
        exons: Sequence of indexable exon records; index 0 is used as the
            chromosome, 1 and 2 as coordinates and 3 as the strand. When it
            is empty no BED file is written.
        sig_region: Regions exposing ``initial``, ``final`` and ``chrom``.
            Nothing is done when it is empty.
        rna_name: Used in output file names and FASTA headers.
        output: Output directory when ``out_file`` is false, otherwise the
            path of the BED file itself.
        out_file: Selects how ``output`` is interpreted (see above).
        temp: Directory holding ``rna_temp.fa``; only read when ``out_file``
            is true.
        fasta: When true, also write the sequences of ``sig_region`` as FASTA.

    NOTE(review): the nesting below was reconstructed from a collapsed
    source -- confirm against the original layout.
    """
    if len(sig_region) == 0:
        return

    if not exons:
        pass
    else:
        dbd = GenomicRegionSet("DBD")
        # Region -> textual description of the mapped piece(s); it is filled
        # but not read again inside this function.
        dbdmap = {}
        if len(exons) == 1:
            print("## Warning: No information of exons in the given RNA sequence, the DBD position may be problematic. ")
        for rbs in sig_region:
            loop = True

            if exons[0][3] == "-":
                # ``loop`` is cleared at the end of the body, so the while
                # block runs at most once per region.
                while loop:
                    cf = 0  # summed length of the exons visited so far
                    for exon in exons:
                        l = abs(exon[2] - exon[1])
                        tail = cf + l  # offset at which the current exon ends

                        if cf <= rbs.initial <= tail:
                            # The region starts inside this exon
                            dbdstart = exon[2] - rbs.initial + cf

                            if rbs.final <= tail:
                                # ...and ends inside it too: a single piece
                                dbdend = exon[2] - rbs.final + cf
                                if dbdstart > dbdend: dbdstart, dbdend = dbdend, dbdstart
                                dbd.add( GenomicRegion(chrom=exons[0][0],
                                                       initial=dbdstart, final=dbdend,
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.final) ) )
                                dbdmap[str(rbs)] = dbd[-1].toString() + " strand:-"
                                loop = False
                                break
                            elif rbs.final > tail:
                                # The region continues past this exon: emit
                                # the first piece. ``subtract`` is read again
                                # by the split2 branch on a later exon.
                                subtract = l + cf - rbs.initial
                                # NOTE: this swap writes into exon[1], i.e.
                                # it modifies the caller's exon record.
                                if dbdstart > exon[1]: dbdstart, exon[1] = exon[1], dbdstart
                                dbd.add( GenomicRegion(chrom=exons[0][0],
                                                       initial=dbdstart, final=exon[1],
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.initial+subtract)+"_split1" ) )

                        elif rbs.initial < cf and rbs.final <= tail:
                            # The region started in an earlier exon and ends
                            # here: emit the second piece. Relies on
                            # ``subtract`` set by the split1 branch.
                            dbdstart = exon[2]
                            dbdend = exon[2] - rbs.final + rbs.initial + subtract
                            if dbdstart > dbdend: dbdstart, dbdend = dbdend, dbdstart
                            dbd.add( GenomicRegion(chrom=exons[0][0],
                                                   initial=dbdstart, final=dbdend,
                                                   orientation=exons[0][3],
                                                   name=str(cf)+"-"+str(rbs.final)+"_split2" ) )
                            dbdmap[str(rbs)] = dbd[-2].toString() + " & " + dbd[-1].toString() + " strand:-"
                            loop = False
                            break

                        elif rbs.initial > tail:
                            pass

                        cf += l

                    loop = False
            else:
                # Same walk for the other strand, measuring from exon[1]
                # instead of exon[2]; no coordinate swapping is done here.
                while loop:
                    cf = 0  # summed length of the exons visited so far
                    for exon in exons:
                        l = exon[2] - exon[1]
                        tail = cf + l  # offset at which the current exon ends

                        if cf <= rbs.initial <= tail:
                            # The region starts inside this exon
                            dbdstart = exon[1] + rbs.initial - cf

                            if rbs.final <= tail:
                                # ...and ends inside it too: a single piece
                                dbdend = exon[1] + rbs.final -cf
                                dbd.add( GenomicRegion(chrom=exons[0][0],
                                                       initial=dbdstart, final=dbdend,
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.final) ) )
                                dbdmap[str(rbs)] = dbd[-1].toString() + " strand:+"
                                loop = False
                                break
                            elif rbs.final > tail:
                                # First piece, up to the end of this exon
                                subtract = l + cf - rbs.initial
                                dbd.add( GenomicRegion(chrom=exons[0][0],
                                                       initial=dbdstart, final=exon[2],
                                                       orientation=exons[0][3],
                                                       name=str(rbs.initial)+"-"+str(rbs.initial+subtract)+"_split1" ) )

                        elif rbs.initial < cf and rbs.final <= tail:
                            # Second piece; relies on ``subtract`` set by the
                            # split1 branch on an earlier exon.
                            dbdstart = exon[1]
                            dbdend = exon[1] + rbs.final - rbs.initial - subtract
                            dbd.add( GenomicRegion(chrom=exons[0][0],
                                                   initial=dbdstart, final=dbdend,
                                                   orientation=exons[0][3],
                                                   name=str(cf)+"-"+str(rbs.final)+"_split2" ) )
                            dbdmap[str(rbs)] = dbd[-2].toString() + " & " + dbd[-1].toString() + " strand:+"
                            loop = False
                            break

                        elif rbs.initial > tail:
                            pass

                        cf += l

                    loop = False

        if not out_file:
            dbd.write_bed(filename=os.path.join(output, "DBD_"+rna_name+".bed"))
        else:
            dbd.write_bed(filename=output)

    # FASTA
    if fasta:
        if not out_file:
            seq = pysam.Fastafile(os.path.join(output,"rna_temp.fa"))
            fasta_f = os.path.join(output, "DBD_"+rna_name+".fa")
        else:
            seq = pysam.Fastafile(os.path.join(temp,"rna_temp.fa"))
            fasta_f = output+".fa"

        # From here on the name ``fasta`` refers to the open file handle,
        # not to the boolean parameter.
        with open(fasta_f, 'w') as fasta:
            for rbs in sig_region:
                print(">"+ rna_name +":"+str(rbs.initial)+"-"+str(rbs.final), file=fasta)
                s = seq.fetch(rbs.chrom, max(0, rbs.initial), rbs.final)
                # Wrap the sequence at 80 characters per line
                for ss in [s[i:i + 80] for i in range(0, len(s), 80)]:
                    print(ss, file=fasta)
ints = [gr.initial for gr in gnrsB.sequences] initialsB = (c_int * len(ints))(*ints) ints = [gr.final for gr in gnrsA.sequences] finalsA = (c_int * len(ints))(*ints) ints = [gr.final for gr in gnrsB.sequences] finalsB = (c_int * len(ints))(*ints) # Call C-function return jaccardC(chromsA, initialsA, finalsA, len(gnrsA), chromsB, initialsB, finalsB, len(gnrsB)) set1 = GenomicRegionSet("A") set1.add(GenomicRegion("chr1", 0, 10)) set1.add(GenomicRegion("chr1", 15, 20)) set1.add(GenomicRegion("chr1", 30, 45)) print(set1.sequences) set2 = GenomicRegionSet("B") set2.add(GenomicRegion("chr1", 0, 5)) set2.add(GenomicRegion("chr1", 10, 25)) set2.add(GenomicRegion("chr1", 35, 45)) print(set2.sequences) jaccard2 = jaccardIndex(set1, set2) print("jaccard2", jaccard2) def intersect(gnrsA, gnrsB, overlap_type): # Convert to ctypes
def match_single(motif, sequence, genomic_region, unique_threshold=None, normalize_bitscore=True, sort=False):
    """
    Performs motif matching given sequence and the motif.pssm passed as parameter.
    The genomic_region is needed to evaluate the correct binding position.

    Keyword arguments:
    motif -- Motif to search for; its pssm_list, threshold, max, len, name and
             is_palindrome attributes are used.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion; supplies the chromosome and the offset
                      added to match positions.
    unique_threshold -- If this argument is provided, the motif search will be made using a threshold of 0 and
                        then accepting only the motif matches with bitscore/motif_length >= unique_threshold.
    normalize_bitscore -- If True, the score is rescaled to an integer where the
                          threshold maps to 0 and the motif maximum to 1000;
                          otherwise the bitscore is kept (divided by the motif
                          length when unique_threshold is used).
    sort -- If True, the resulting set is sorted before being returned.

    Return:
    A GenomicRegionSet named "mpbs" with one GenomicRegion per accepted match;
    the score is stored as a string in the region's data field.
    """

    # Establishing threshold
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    try:
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list], current_threshold, absolute_threshold=True,
                               both_strands=True)
    except Exception:
        # Fall back to the newer API. ``Exception`` rather than a bare except
        # so that KeyboardInterrupt/SystemExit are not swallowed.
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg, [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            # A match is either an object with pos/score or a (pos, score) pair
            try:
                position = r.pos
                score = r.score
            except AttributeError:
                (position, score) = r

            # Verifying unique threshold acceptance
            if unique_threshold and score / motif.len < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand (reported with a negative position)
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            # Evaluating score (integer between 0 and 1000 -- needed for bigbed transformation)
            if normalize_bitscore:
                # NOTE(review): with unique_threshold set, the thresholds here
                # are per-position values while ``score`` is not divided by
                # motif.len -- confirm this is intended.
                if motif_max > eval_threshold:
                    norm_score = int(((score - eval_threshold) * 1000.0) / (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            else:
                # Keep the original bitscore
                if unique_threshold:
                    norm_score = score / motif.len
                else:
                    norm_score = score

            grs.add(GenomicRegion(genomic_region.chrom, int(p1), int(p2), name=motif.name, orientation=strand,
                                  data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
def call_peaks(bam, csizes, pval, min_reads, binsize, cfile=None):
    '''
    Call peaks on bam file using pvalue and binomial model.

    Args:
        bam: BAM file to call peaks on.
        csizes: Chromosome sizes, turned into the region set to scan.
        pval: Not used inside this function. NOTE(review): presumably the
            significance cut-off is applied elsewhere (filter_bins only
            receives min_reads) -- confirm.
        min_reads: Passed to filter_bins together with each bin's count and
            the per-count binomial test results.
        binsize: Bin size of the second coverage set; its step size is half
            of it.
        cfile: Optional control BAM file. When given, the coverage is
            normalised with norm_igg and the overall coverage is recomputed.

    Returns GenomeRegionSet with peaks (merged, then clustered within the
    extension size), the CoverageSet used for calling, and the binned
    CoverageSet with signal.
    '''
    # make chromsizes region set
    rs = get_chrom_sizes_as_genomicregionset(csizes)

    print("calculating extension sizes...")
    ext, _ = get_extension_size(bam, start=0, end=300, stepsize=5)

    print("calculating coverage...")
    cov = CoverageSet('coverageset', rs)
    cov.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True)

    # second coverage set, binned, returned for output
    cov2 = CoverageSet('coverageset2', rs)
    cov2.coverage_from_bam(bam_file=bam, extension_size=ext, paired_reads=True,
                           binsize=binsize, stepsize=binsize // 2)

    if cfile is not None:
        print(f"Using control file: {cfile}")
        control = CoverageSet('contorl', rs)
        control.coverage_from_bam(bam_file=cfile, extension_size=ext)
        with np.errstate(divide='ignore', invalid='ignore'):
            norm_igg(cov, control)

        # recalc overall coverage: one concatenation instead of a pairwise
        # reduce, which re-copied the growing array for every region
        cov.overall_cov = np.concatenate(
            [cov.coverage[i] for i in range(len(cov.genomicRegions))])

    # total coverage
    total = np.sum(cov.overall_cov)
    # probability of event, a read in a bin, (avg reads/bin )/libsize
    p = np.mean(cov.overall_cov[cov.overall_cov > 0]) / total
    # what is the max coverage
    maxcov = np.max(cov.overall_cov)

    # create dict with the test result for each possible count value
    mc = np.arange(0, maxcov + 1, dtype="object")
    d = {count: binom_test((count, total - count), p=p) for count in mc}

    # create GenomicRegionSet to hold peaks
    res = GenomicRegionSet('identified_peaks')
    print("calculating peaks...")
    # iterate through bins in genome, store peaks
    for i, c in enumerate(cov.overall_cov):
        if filter_bins(c, d, min_reads):
            chrom, start, end = cov.index2coordinates(i, rs)
            res.add(GenomicRegion(chrom, start, end + 1, data=d[c]))

    # merge overlapping peaks
    res.merge()
    # merge peaks within ext dist
    rc = res.cluster(ext)
    return rc, cov, cov2