def join_regions(regions, min_gap_size):
    """Filter regions, joining those separated by small gaps.

    `regions` is an iterable of ``(chrom, start, end)`` tuples; consecutive
    regions on the same chromosome must not touch or overlap (the gap
    between them must be positive). Neighbouring regions on the same
    chromosome whose gap is smaller than `min_gap_size` are merged into one.

    Yields ``(chrom, start, end)`` tuples. Yields nothing for empty input.
    Raises AssertionError if two consecutive regions on one chromosome leave
    a gap of zero or less.
    """
    regions = iter(regions)
    try:
        prev_chrom, prev_start, prev_end = next(regions)
    except StopIteration:
        # No regions at all: finish quietly instead of letting StopIteration
        # escape from the generator body
        return
    for chrom, start, end in regions:
        if chrom != prev_chrom:
            # New chromosome -- emit the pending region & start over
            yield (prev_chrom, prev_start, prev_end)
            prev_chrom, prev_start, prev_end = chrom, start, end
            continue
        gap = start - prev_end
        assert gap > 0, (
            "Impossible gap between %s %d-%d and %d-%d (=%d)"
            % (chrom, prev_start, prev_end, start, end, gap))
        if gap < min_gap_size:
            # Join with the previous region
            echo("\tJoining %s %d-%d and %d-%d (gap %d)"
                 % (chrom, prev_start, prev_end, start, end, gap))
            prev_end = end
        else:
            # Keep the gap; emit the previous region as-is
            echo("\tKeeping gap between %s %d and %d (size %d)"
                 % (chrom, prev_end, start, gap))
            yield (prev_chrom, prev_start, prev_end)
            prev_chrom, prev_start, prev_end = chrom, start, end
    # A region is always still pending here, whatever its start coordinate,
    # so emit it unconditionally
    yield (prev_chrom, prev_start, prev_end)
def exclude_in_region(exclude_rows, chrom, a_start, a_end, ex_start, ex_end):
    """Subtract excluded intervals from one accessible region.

    `exclude_rows` is an iterator of ``(start, end)`` exclusions that is
    advanced through `next_or_inf` whenever the current exclusion
    (`ex_start`, `ex_end`) ends at or before the part of the region still
    being examined. (`next_or_inf` presumably returns an "infinite" sentinel
    once the iterator is exhausted -- confirm against its definition.)

    Yields zero or more ``(chrom, start, end)`` tuples: the pieces of
    ``a_start..a_end`` not covered by an exclusion (usually just one).

    Note that the updated current exclusion is not handed back to the
    caller; only the position of the shared `exclude_rows` iterator changes.
    """
    # Walk along the region iteratively: one nested generator per exclusion
    # (as a recursive formulation would create) could hit the recursion limit
    # on a region containing many exclusions.
    while True:
        # If we've leapfrogged the excluded area, catch up
        while ex_end <= a_start:
            ex_start, ex_end = next_or_inf(exclude_rows)
        if a_end <= ex_start:
            # Excluded area does not overlap what is left of this region
            yield (chrom, a_start, a_end)
            return
        # Excluded area overlaps this one -> trim this region
        echo("\tExclusion %s:%d-%d overlaps accessible region %d-%d"
             % (chrom, ex_start, ex_end, a_start, a_end))
        if ex_start > a_start:
            # Keep the accessible piece to the left of the exclusion
            yield (chrom, a_start, ex_start)
        if ex_end >= a_end:
            # Exclusion reaches the region's end; nothing more to emit
            return
        # Continue with the part of the region right of the exclusion
        a_start = ex_end
def join_regions(regions, min_gap_size):
    """Filter regions, joining those separated by small gaps.

    Takes an iterable of ``(chrom, start, end)`` tuples in which consecutive
    regions on the same chromosome are separated by a positive gap, and
    merges neighbours whose gap is below `min_gap_size`.

    Yields the resulting ``(chrom, start, end)`` tuples; an empty input
    yields nothing. Raises AssertionError when consecutive regions on one
    chromosome touch or overlap (gap <= 0).
    """
    regions = iter(regions)
    try:
        prev_chrom, prev_start, prev_end = next(regions)
    except StopIteration:
        # Empty input: end the generator cleanly
        return
    for chrom, start, end in regions:
        if chrom == prev_chrom:
            gap = start - prev_end
            assert gap > 0, ("Impossible gap between %s %d-%d and %d-%d (=%d)"
                             % (chrom, prev_start, prev_end, start, end, gap))
            if gap < min_gap_size:
                # Join with the previous region
                echo("\tJoining %s %d-%d and %d-%d (gap %d)"
                     % (chrom, prev_start, prev_end, start, end, gap))
                prev_end = end
                continue
            # Keep the gap; the previous region is emitted as-is below
            echo("\tKeeping gap between %s %d and %d (size %d)"
                 % (chrom, prev_end, start, gap))
        # Either a kept gap or a new chromosome: emit what was pending and
        # start tracking the current region
        yield (prev_chrom, prev_start, prev_end)
        prev_chrom, prev_start, prev_end = chrom, start, end
    # The last pending region must always be emitted, not only when it
    # happens to start at position 0
    yield (prev_chrom, prev_start, prev_end)
def write_bed(rows, fname):
    """Write region coordinates to `fname` in BED format.

    Each row is written as one line of tab-separated fields (every field
    converted with `str`). The number of rows written is reported through
    `ngfrills.echo`.
    """
    with ngfrills.safe_write(fname, False) as outfile:
        # Count from 1 so that an empty `rows` is reported as 0 bins
        n_written = 0
        for n_written, row in enumerate(rows, 1):
            outfile.write("\t".join(map(str, row)) + '\n')
        ngfrills.echo("Wrote", fname, "with", n_written, "bins")
def get_regions(fasta_fname):
    """Find accessible sequence regions (those not masked out with 'N').

    Scans the FASTA file `fasta_fname` line by line and yields one
    ``(chrom, start, end)`` tuple per run of characters other than
    uppercase 'N'. Offsets are 0-based and end-exclusive, counted from the
    start of each sequence with line breaks ignored. Every emitted region
    is also logged via `log_this`.

    Sequence lines are expected to follow a '>' header line; the position
    counter is only initialised when a header is seen.
    """
    with open(fasta_fname) as infile:
        chrom = cursor = run_start = None
        for line in infile:
            if line.startswith('>'):
                # Emit the last chromosome's last run, if any
                if run_start is not None:
                    yield log_this(chrom, run_start, cursor)
                # Start new chromosome
                chrom = line.split(None, 1)[0][1:]
                run_start = None
                cursor = 0
                echo(chrom + ": Scanning for accessible regions")
                continue

            line = line.rstrip()
            if not line:
                # A blank line carries no sequence; without this guard it
                # would open a zero-length "run" after a stretch of N's
                continue

            if 'N' not in line:
                if run_start is None:
                    # Start of a new run of non-N characters
                    run_start = cursor
            elif all(c == 'N' for c in line):
                # Shortcut if the line is all N chars
                if run_start is not None:
                    yield log_this(chrom, run_start, cursor)
                    run_start = None
            else:
                # Slow route: line is a mix of N and non-N chars.
                # Build the array from a list of 1-char strings so that the
                # comparison with 'N' works on both Python 2 and Python 3.
                line_chars = np.array(list(line))
                n_indices = np.where(line_chars == 'N')[0]
                # Plain ints keep the yielded coordinates the same type as
                # on the fast paths above
                first_n = int(n_indices[0])
                last_n = int(n_indices[-1])
                # Emit the first block of non-N chars, if any
                if run_start is not None:
                    yield log_this(chrom, run_start, cursor + first_n)
                elif first_n != 0:
                    yield log_this(chrom, cursor, cursor + first_n)
                # Emit any short intermediate blocks
                gap_mask = np.diff(n_indices) > 1
                if gap_mask.any():
                    # gap_mask has one element fewer than n_indices, so it
                    # must be applied to equally long slices
                    ok_starts = n_indices[:-1][gap_mask] + 1 + cursor
                    ok_ends = n_indices[1:][gap_mask] + cursor
                    for start, end in zip(ok_starts.tolist(),
                                          ok_ends.tolist()):
                        yield log_this(chrom, start, end)
                # Account for any trailing non-N chars
                if last_n + 1 < len(line):
                    run_start = cursor + last_n + 1
                else:
                    run_start = None
            cursor += len(line)
        # Emit the last run if it's accessible (i.e. not a telomere)
        if run_start is not None:
            yield log_this(chrom, run_start, cursor)
def load_cna(fname, reference):
    """Load the copy number array stored in `fname`.

    The file is loaded with `read`. When `reference` is truthy the array is
    first passed through `fix.load_adjust_coverages` together with that
    reference (per the original note, this subtracts the reference values
    to obtain log2 ratios). In either case the result then goes through
    `shift_xx` with ``male_normal=True`` and is returned.
    """
    echo("Processing", fname)
    cnarr = read(fname)
    if reference:
        # Subtract the reference copy number values (to get the log2 ratio)
        cnarr = fix.load_adjust_coverages(cnarr, reference,
                                          False, False, False)
    # NOTE: low-coverage probes are not dropped here; the original author
    # noted that load_adjust_coverages otherwise takes care of that.
    return shift_xx(cnarr, male_normal=True)
def main(args):
    """Run the plotting command for the parsed arguments `args`.

    Loads the reference (``args.reference``, falling back to
    ``args.no_reference``), builds a bias function from it and the first
    sample file, prints a table header, then plots all samples either
    overlaid (``args.batch``) or separately. The figure is saved as PDF to
    ``args.output`` when given, otherwise shown interactively.
    """
    use_ratio = bool(args.reference)
    reference_probes = read(args.reference or args.no_reference)
    bias_func = get_bias_func(args.mode, reference_probes,
                              read(args.filenames[0]))
    print("Sample \tRaw probes \tTrend line \tReduction (%)")

    if not args.batch:
        plot_separate(args.filenames, reference_probes, bias_func, args.mode,
                      use_ratio)
    else:
        plot_overlaid(args.filenames, reference_probes, bias_func, args.mode,
                      use_ratio, args.color)

    if not args.output:
        pyplot.show()
        return
    pyplot.savefig(args.output, format='pdf', bbox_inches=0)
    echo("Wrote", args.output)
def get_regions(fasta_fname):
    """Find accessible sequence regions (those not masked out with 'N').

    Reads the FASTA file `fasta_fname` character by character and yields a
    ``(chrom, start, end)`` tuple for every run of characters other than
    uppercase 'N'. Offsets are 0-based and end-exclusive, counted from the
    start of each sequence with line breaks ignored. Each emitted region is
    logged through `logme`.

    Sequence lines are expected to follow a '>' header line; the position
    counter is only initialised when a header is seen.
    """
    with open(fasta_fname) as infile:
        chrom = cursor = run_start = None
        for line in infile:
            if line.startswith('>'):
                # Emit the last chromosome's last run, if any
                if run_start is not None:
                    yield logme(chrom, run_start, cursor)
                elif chrom is not None:
                    # Only meaningful once a chromosome has actually been
                    # scanned, i.e. not at the very first header
                    echo("\tChromosome ended with a telomere (N's)")
                # Start new chromosome
                chrom = line.split(None, 1)[0][1:]
                run_start = None
                cursor = 0
                echo("Scanning", chrom)
            else:
                seq = line.rstrip()
                for i, char in enumerate(seq):
                    if char == 'N':
                        if run_start is not None:
                            # End of a run; emit the current region
                            yield logme(chrom, run_start, cursor + i)
                            run_start = None
                    elif run_start is None:
                        # Start of a new run of non-N characters
                        run_start = cursor + i
                        echo("\tStarted new run at", run_start)
                # Advance by this line's own length; relying on the loop
                # index would reuse a stale value on an empty line
                cursor += len(seq)
        # Emit the last run if it's accessible (i.e. not a telomere)
        if run_start is not None:
            yield logme(chrom, run_start, cursor)
def exclude_regions(bed_fname, access_rows):
    """Subtract the regions listed in a BED file from the accessible regions.

    `bed_fname` is parsed with `parse_regions` (coordinates only) and
    grouped by chromosome; `access_rows` is an iterable of
    ``(chrom, start, end)`` rows that is grouped the same way.

    Yields ``(chrom, start, end)`` tuples. If the BED file yields no
    exclusions, the rows of `access_rows` are passed through unchanged.
    Both inputs are processed in the order given and are presumably sorted
    by position within each chromosome -- confirm with the callers.
    """
    def remember_last(rows, holder):
        # Pass `rows` through unchanged, recording the latest row handed out
        for row in rows:
            holder[:] = [row]
            yield row

    ex_by_chrom = dict(group_regions_by_chromosome(
        parse_regions(bed_fname, coord_only=True)))
    if not ex_by_chrom:
        # Nothing to exclude -> emit the input regions unmodified
        for row in access_rows:
            yield row
        return

    # Check if each input region overlaps an excluded region
    for chrom, a_rows in group_regions_by_chromosome(access_rows):
        if chrom not in ex_by_chrom:
            echo(chrom + ": No excluded regions")
            for a_start, a_end in a_rows:
                yield (chrom, a_start, a_end)
            continue

        echo(chrom + ": Subtracting excluded regions")
        last_seen = []
        exclude_rows = remember_last(ex_by_chrom[chrom], last_seen)
        ex_start, ex_end = next_or_inf(exclude_rows)
        for a_start, a_end in a_rows:
            # exclude_in_region advances the shared iterator but cannot hand
            # back the exclusion it stopped at. Resume from the most recent
            # exclusion taken from the iterator, so that one reaching into
            # this region is not skipped.
            if last_seen:
                ex_start, ex_end = last_seen[0]
            for row in exclude_in_region(exclude_rows, chrom, a_start, a_end,
                                         ex_start, ex_end):
                yield row
def exclude_regions(bed_fname, access_rows):
    """Remove excluded intervals (from a BED file) from accessible regions.

    The exclusions in `bed_fname` are read with `parse_regions`
    (``coord_only=True``) and grouped per chromosome, as are the
    ``(chrom, start, end)`` rows of `access_rows`.

    Yields ``(chrom, start, end)`` tuples; when there are no exclusions at
    all, `access_rows` is yielded unchanged. Rows are handled in input
    order, which is presumably sorted by position -- confirm with callers.
    """
    def track_current(rows, current):
        # Yield each exclusion as-is while remembering the latest one
        for row in rows:
            current[:] = [row]
            yield row

    ex_by_chrom = dict(group_regions_by_chromosome(
        parse_regions(bed_fname, coord_only=True)))
    if not ex_by_chrom:
        # Nothing to exclude -> emit the input regions unmodified
        for row in access_rows:
            yield row
        return

    for chrom, a_rows in group_regions_by_chromosome(access_rows):
        if chrom in ex_by_chrom:
            echo(chrom + ": Subtracting excluded regions")
            current = []
            exclude_rows = track_current(ex_by_chrom[chrom], current)
            ex_start, ex_end = next_or_inf(exclude_rows)
            for a_start, a_end in a_rows:
                if current:
                    # Start from the exclusion most recently drawn from the
                    # iterator; reusing the first one would make
                    # exclude_in_region skip past an exclusion that still
                    # overlaps this region.
                    ex_start, ex_end = current[0]
                for row in exclude_in_region(exclude_rows, chrom,
                                             a_start, a_end,
                                             ex_start, ex_end):
                    yield row
        else:
            echo(chrom + ": No excluded regions")
            for a_start, a_end in a_rows:
                yield (chrom, a_start, a_end)
def join_regions(regions, min_gap_size):
    """Merge neighbouring regions that are separated by small gaps.

    `regions` is grouped per chromosome with `group_regions_by_chromosome`;
    within each chromosome, consecutive ``(start, end)`` pairs whose gap is
    smaller than `min_gap_size` are fused. Yields ``(chrom, start, end)``
    tuples. A gap of zero or less triggers an AssertionError.
    """
    for chrom, coords in group_regions_by_chromosome(regions):
        echo(chrom + ": Joining over small gaps")
        coord_iter = iter(coords)
        merged_start, merged_end = next(coord_iter)
        for next_start, next_end in coord_iter:
            gap = next_start - merged_end
            assert gap > 0, ("Impossible gap between %s %d-%d and %d-%d (=%d)"
                             % (chrom, merged_start, merged_end,
                                next_start, next_end, gap))
            is_small = gap < min_gap_size
            if not is_small:
                # Gap is large enough to keep: flush the merged region and
                # begin a new one
                echo("\tKeeping gap %s:%d-%d (size %d)"
                     % (chrom, merged_end, next_start, gap))
                yield (chrom, merged_start, merged_end)
                merged_start, merged_end = next_start, next_end
                continue
            # Small gap: extend the merged region over it
            echo("\tJoining %s %d-%d and %d-%d (gap size %d)"
                 % (chrom, merged_start, merged_end,
                    next_start, next_end, gap))
            merged_end = next_end
        # Flush whatever is still pending for this chromosome
        yield (chrom, merged_start, merged_end)
def join_regions(regions, min_gap_size):
    """Collapse regions whose separating gap is below `min_gap_size`.

    Regions are processed chromosome by chromosome (as grouped by
    `group_regions_by_chromosome`). Each chromosome's ``(start, end)`` pairs
    are walked in order; a pair is absorbed into the running region when the
    gap to it is smaller than `min_gap_size`, otherwise the running region
    is yielded as ``(chrom, start, end)`` and a new one begins.

    An AssertionError is raised if a gap is not strictly positive.
    """
    for chrom, coords in group_regions_by_chromosome(regions):
        echo(chrom + ": Joining over small gaps")
        pairs = iter(coords)
        first = next(pairs)
        run_start, run_end = first
        for cur_start, cur_end in pairs:
            gap = cur_start - run_end
            assert gap > 0, (
                "Impossible gap between %s %d-%d and %d-%d (=%d)"
                % (chrom, run_start, run_end, cur_start, cur_end, gap))
            if gap < min_gap_size:
                # Bridge the gap: the running region now ends where the
                # current one does
                joining = (chrom, run_start, run_end, cur_start, cur_end, gap)
                echo("\tJoining %s %d-%d and %d-%d (gap size %d)" % joining)
                run_end = cur_end
            else:
                # Gap stays: hand out the running region, restart from here
                keeping = (chrom, run_end, cur_start, gap)
                echo("\tKeeping gap %s:%d-%d (size %d)" % keeping)
                yield (chrom, run_start, run_end)
                run_start = cur_start
                run_end = cur_end
        yield (chrom, run_start, run_end)
def log_this(chrom, run_start, run_end):
    """Log one accessible region and return it as a tuple.

    Reports the region and its size (``run_end - run_start``) through
    `echo`, then returns ``(chrom, run_start, run_end)`` unchanged.
    """
    size = run_end - run_start
    message = ("\tAccessible region %s:%d-%d (size %d)"
               % (chrom, run_start, run_end, size))
    echo(message)
    return chrom, run_start, run_end
def logme(chrom, run_start, run_end):
    """Log the end of a run and return the region as a tuple.

    Reports the end position and the run length (``run_end - run_start``)
    through `echo`, then returns ``(chrom, run_start, run_end)``.
    """
    run_length = run_end - run_start
    echo("\tEnded run at", run_end, "==>", run_length, 'b')
    region = (chrom, run_start, run_end)
    return region