def test_cache_index(self):
    """
    Walk over a track after the index has been made but has been removed
    from the filesystem.
    """
    values = [(2, 392.0), (3, 408.0), (4, 420.0), (5, 452.0),
              (7, 466.0), (8, 474.0), (9, 479.0)]
    # Every region is expected to carry the same (position, value) pairs.
    b = [(r, p, v) for r in ('1', '13', 'MT') for (p, v) in values]

    # First pass: walk with the index forced.
    walker = wiggelen.walk(open_('b.wig'), force_index=True)
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(b, list(walker))
    assert_raises(StopIteration, next, walker)

    # NOTE(review): presumably removes the index files from disk while
    # keeping the cached index -- confirm against the helper's definition.
    remove_indices(keep_cache=True)

    # Second pass: same track, no forced index; output must be identical.
    walker = wiggelen.walk(open_('b.wig'))
    assert_equal(b, list(walker))
    assert_raises(StopIteration, next, walker)
def test_walk_complex(self):
    """
    Walk over a complex track.

    Nothing is asserted about the items; the test passes as long as
    walking the whole track raises no exception.
    """
    track = open_('complex.wig')
    for _unused in wiggelen.walk(track):
        continue
def test_walk_fixed_step_without_step(self):
    """
    Walk over a fixed step wiggle track without `step` arguments.

    According to the spec, `fixedStep` definitions require the `step`
    argument. However, there seems to be real-world data where it is
    missing and the UCSC Genome Browser can still work with it. So we
    also support it.

    Issue: https://github.com/martijnvermaat/wiggelen/issues/1
    """
    c = [
        ('chr', 1, 64.), ('chr', 2, 64.), ('chr', 3, 65.), ('chr', 4, 66.),
        ('chr', 5, 66.), ('chr', 6, 66.), ('chr', 7, 69.), ('chr', 8, 70.),
        ('chr', 9, 71.), ('chr', 10, 71.), ('chr', 11, 71.), ('chr', 12, 71.),
        ('chr', 13, 71.), ('chr', 14, 71.), ('chr', 15, 71.), ('chr', 16, 71.),
        ('chr', 17, 71.), ('chr', 18, 71.), ('chr', 19, 73.), ('chr', 20, 73.),
        ('chr', 21, 73.), ('chr', 22, 73.), ('chr', 23, 73.), ('chr', 24, 73.),
        ('chr', 25, 73.), ('chr', 26, 74.), ('chr', 27, 75.), ('chr', 28, 75.),
        ('chr', 29, 75.), ('chr', 30, 75.), ('chr', 31, 76.),
    ]
    walker = wiggelen.walk(open_('fixedstep-without-step.wig'))
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(c, list(walker))
    assert_raises(StopIteration, next, walker)
def find_max(regions, wig):
    """
    Find the maximum value, its position and the score sum for each region.

    The wiggle track is walked once from start to end, so `regions` must be
    ordered by position and non-overlapping. The score sum is recomputed
    from the track rather than taken from the incoming tuples, because the
    incoming sum was produced while merging regions and may be off.

    Chromosome names are ignored; positions are compared as plain numbers.

    :arg regions: Sequence of `(start, end, score_sum)` tuples. It is not
        modified.
    :arg wig: Open wiggle track, passed on to `walk`.

    :return: List of `(start, end, score_sum, max_value, max_position)`
        tuples, one per region that was fully passed while walking. If
        several positions share the maximum, the last one is reported. An
        empty `regions` gives an empty list.
    """
    # Iterate instead of popping from the front, so the caller's list is left
    # intact and an empty input does not raise IndexError.
    remaining = iter(regions)
    current = next(remaining, None)
    if current is None:
        return []

    regions_with_max = []
    start, end, _old_score_sum = current
    # None (not -1) marks "nothing recorded yet", so a maximum of exactly -1
    # or lower is still handled correctly.
    max_value = None
    max_position = None
    new_score_sum = 0

    for _chrom, position, value in fill(walk(wig)):
        # Merged regions can span positions without data; those come through
        # with a value of None and must not take part in the sum or the max.
        if value is not None and start <= position <= end:
            new_score_sum += value
            if max_value is None or value >= max_value:
                max_value = value
                max_position = position

        if position >= end and max_value is not None:
            # The end of the region was reached and a max has been recorded.
            regions_with_max.append(
                (start, end, new_score_sum, max_value, max_position))

            # Move on to the next region, or stop when there is none left.
            current = next(remaining, None)
            if current is None:
                break
            start, end, _old_score_sum = current
            max_value = None
            max_position = None
            new_score_sum = 0

    return regions_with_max
def test_store_index(self):
    """
    Walk over a track after the index has been made.
    """
    values = [(2, 392.0), (3, 408.0), (4, 420.0), (5, 452.0),
              (7, 466.0), (8, 474.0), (9, 479.0)]
    # Every region is expected to carry the same (position, value) pairs.
    b = [(r, p, v) for r in ('1', '13', 'MT') for (p, v) in values]

    # First pass: walk with the index forced.
    walker = wiggelen.walk(open_('b.wig'), force_index=True)
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(b, list(walker))
    assert_raises(StopIteration, next, walker)

    # Second pass: same track, no forced index; output must be identical.
    walker = wiggelen.walk(open_('b.wig'))
    assert_equal(b, list(walker))
    assert_raises(StopIteration, next, walker)
def read_wig(filename, norm=False):
    """
    Read a wiggle file into a `{position: value}` dictionary.

    The region (chromosome) name is dropped, so when the same position
    occurs in several regions the value read last wins.

    :arg filename: Path of the wiggle file to read.
    :arg norm: If true, divide every value by the median of all values.

    :return: Dictionary mapping position to value (or to normalised value
        when `norm` is true). Positions whose value is None are kept as
        None and are left out of the median.

    :raises ZeroDivisionError: Presumably when `norm` is true and the
        median is zero (plain Python numbers divided by 0.0) -- confirm.
    """
    # Read everything inside the `with` block so the handle is closed again;
    # the walker is fully consumed by the comprehension before the block ends.
    with open(filename) as handle:
        wig_dict = {
            position: value
            for _region, position, value in fill(walk(handle))
        }

    if not norm:
        return wig_dict

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("Normalizing...")
    # Build a real list (dict views are not accepted as array data on
    # Python 3) and leave out None values, which cannot be averaged.
    defined_values = [value for value in wig_dict.values() if value is not None]
    wig_median = float(np.median(defined_values))
    return {
        position: None if value is None else value / wig_median
        for position, value in wig_dict.items()
    }
def test_walk_single_region(self):
    """
    Walk over a track with a single region.
    """
    c = [('MT', 1, 364.0), ('MT', 6, 435.0), ('MT', 10, 485.0)]
    walker = wiggelen.walk(open_('c.wig'))
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(c, list(walker))
    assert_raises(StopIteration, next, walker)
def test_walk_fixed_step(self):
    """
    Walk over a fixed step wiggle track.
    """
    c = [('chr8', 1, 11), ('chr8', 2, 11), ('chr8', 6, 33),
         ('chr8', 7, 33), ('chr8', 11, 44), ('chr8', 12, 44)]
    walker = wiggelen.walk(open_('fixedstep.wig'))
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(c, list(walker))
    assert_raises(StopIteration, next, walker)
def test_sort_multiple_regions(self):
    """
    Walk over a track with multiple regions and index.
    """
    values = [(2, 392.0), (3, 408.0), (4, 420.0), (5, 452.0),
              (7, 466.0), (8, 474.0), (9, 479.0)]
    # Expected order of regions in the walk: '1', '13', 'MT'.
    b = [(r, p, v) for r in ('1', '13', 'MT') for (p, v) in values]
    walker = wiggelen.walk(open_('b.wig'), force_index=True)
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(b, list(walker))
    assert_raises(StopIteration, next, walker)
def _call_regions(positions, threshold):
    """
    Collect runs of consecutive positions whose value is at least `threshold`.

    :arg positions: Iterable of `(region, position, value)` tuples where a
        value of None marks a position without data.
    :arg threshold: Minimum value for a position to be part of a run.

    :return: List of `(start, end, total_exp)` tuples, `total_exp` being the
        sum of the values in the run.
    """
    called_regions = []
    start = None
    end = None
    total_exp = 0

    for _region, position, value in positions:
        if start is None:
            # Tentatively start a new run at the current position; it is
            # dropped again below if this position does not qualify.
            start = position

        if value is None or value < threshold:
            # No data or below threshold: close the run in progress, if any.
            if total_exp > 0:
                called_regions.append((start, end, total_exp))
            start = None
            end = None
            total_exp = 0
        elif value >= threshold:
            # Value reaches the threshold: extend the current run.
            total_exp += value
            end = position

    if total_exp != 0:
        # The input ended while a run was still open.
        called_regions.append((start, end, total_exp))

    return called_regions


def main(infile, threshold, merge_dist, min_width, strand, outfile):
    """
    Call regions above `threshold` in a wiggle file and write them out.

    Steps: call preliminary regions, drop those narrower than `min_width`,
    merge the remainder with `merge_regions`, recompute per-region max and
    score sum with `find_max`, and hand the result to `write_bed`.

    :arg infile: Path of the wiggle file; it is read twice.
    :arg threshold: Minimum value for a position to be part of a region.
    :arg merge_dist: Passed on to `merge_regions`.
    :arg min_width: Minimum region width (`end - start + 1`) to keep.
    :arg strand: Passed on to `write_bed`.
    :arg outfile: Passed on to `write_bed`.
    """
    # `fill` steps through every position and reports None for positions
    # that are not in the original wig file.
    print("Calling preliminary regions...")
    with open(infile) as wig:
        called_regions = _call_regions(fill(walk(wig)), threshold)

    print("Filtering out regions smaller than minimum width...")
    filtered_regions = [
        called for called in called_regions
        if called[1] - called[0] + 1 >= min_width
    ]

    print("Merging regions...")
    merged_regions = merge_regions(filtered_regions, merge_dist)

    # Second pass over the wig file: find each region's max and redo its
    # score sum.
    print("Finding region max and calculating total score...")
    if merged_regions:
        # Context manager so this second handle is closed as well.
        with open(infile) as wig:
            regions_with_max = find_max(merged_regions, wig)
    else:
        # Nothing survived filtering: skip the walk and write no regions.
        regions_with_max = []

    write_bed(regions_with_max, strand, outfile)
def test_walk_fixed_step_without_step(self):
    """
    Walk over a fixed step wiggle track without `step` arguments.

    According to the spec, `fixedStep` definitions require the `step`
    argument. However, there seems to be real-world data where it is
    missing and the UCSC Genome Browser can still work with it. So we
    also support it.

    Issue: https://github.com/martijnvermaat/wiggelen/issues/1
    """
    c = [('chr', 1, 64.), ('chr', 2, 64.), ('chr', 3, 65.),
         ('chr', 4, 66.), ('chr', 5, 66.), ('chr', 6, 66.),
         ('chr', 7, 69.), ('chr', 8, 70.), ('chr', 9, 71.),
         ('chr', 10, 71.), ('chr', 11, 71.), ('chr', 12, 71.),
         ('chr', 13, 71.), ('chr', 14, 71.), ('chr', 15, 71.),
         ('chr', 16, 71.), ('chr', 17, 71.), ('chr', 18, 71.),
         ('chr', 19, 73.), ('chr', 20, 73.), ('chr', 21, 73.),
         ('chr', 22, 73.), ('chr', 23, 73.), ('chr', 24, 73.),
         ('chr', 25, 73.), ('chr', 26, 74.), ('chr', 27, 75.),
         ('chr', 28, 75.), ('chr', 29, 75.), ('chr', 30, 75.),
         ('chr', 31, 76.)]
    walker = wiggelen.walk(open_('fixedstep-without-step.wig'))
    # Compare the complete output. Zipping the expected items against the
    # walker would stop at the shorter of the two, so a walker that ends
    # early (or yields nothing at all) would go unnoticed.
    assert_equal(c, list(walker))
    assert_raises(StopIteration, next, walker)
return wig_sum if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('bed', help='bed file') parser.add_argument('plus_wig', help='wig file for plus strand') parser.add_argument('minus_wig', help='wig file for minus strand') parser.add_argument('output_name', help='name of output file') args = parser.parse_args() plus_wig_dict = { position: value for region, position, value in fill(walk(open(args.plus_wig))) } minus_wig_dict = { position: value for region, position, value in fill(walk(open(args.minus_wig))) } outfile = open(args.output_name, 'w') with open(args.bed) as infile: for line in infile: fields = line.strip().split() chrom = fields[0] start = int(fields[1]) end = int(fields[2]) name = fields[3]
import sys import wiggelen wig_fname = sys.argv[1] threshold = int(sys.argv[2]) # the lowest coverage of high-coverage region. # Read data frame from output of 'samtools depth'. wig_file = wiggelen.walk(open(wig_fname)) # Set the beginning state 'LOW'. state = 'LOW' reference_name = None # Search and print regions. for reference, position, value in wig_file: if state == 'LOW': if value >= threshold: reference_name = reference beg = position end = position state = 'HIGH' elif state == 'HIGH': # The next reference has started. if reference_name != reference: print(str(reference_name) + '\t' + str(beg) + '\t' + str(end)) state = 'LOW'
ofile = output_area + '/' + base + '.covered_bases.tsv' of = open(ofile, 'wt') L = fbed.readline().strip().split() # assume order of bed and wig both in chromosome position order n0 = 0 n1 = 0 pp1 = 0 pp2 = 0 c0 = 'A' p1 = genome_pos(L[0], L[1]) p2 = genome_pos(L[0], L[2]) n = 0 LT = LT + p2 - p1 for x in wiggelen.walk(fw): n = n + 1 wp = genome_pos(x[0], x[1]) if wp > p2: # write out interval line, read next interval (pp1,pp2,p1,p2), reset n1,n2 of.write('%s\t%s\t%d\t%d\t%d\n' % (L[0], L[1], int(L[2]) - int(L[1]), n0, n1)) if (L[0] != c0): print('%s\t%d' % (L[0], n)) c0 = L[0] N0 = N0 + n0 N1 = N1 + n1 n0 = 0 n1 = x[2] pp1 = p1 pp2 = p2
Modified by: Zaka Yuen, JCSMR, ANU Created on Nov 2019 Tombo can only output methylation scores in a wig format. This script is to: -convert a wiggle format to a text format containing per-site scores for downstream analyses """ # Positive strand r=[] p=[] v=[] with open(snakemake.input[0],'r') as fh: for region, position, value in wiggelen.walk(fh): r.append(region) p.append(position) v.append(value) d_plus = pd.DataFrame(list(zip(r,p,v)), columns =['names', 'start','values']) d_plus["strand"]="+" # Negative strand r_m=[] p_m=[] v_m=[] with open(snakemake.input[1],'r') as fh: for region, position, value in wiggelen.walk(fh): r_m.append(region) p_m.append(position)