Esempio n. 1
0
def join_regions(regions, min_gap_size):
    """Filter regions, joining those separated by small gaps.

    Consecutive regions on the same chromosome are merged into one region
    when the gap between them is smaller than `min_gap_size`; otherwise
    each region is emitted unchanged.

    Arguments:
        regions: iterable of (chrom, start, end) rows. Rows of one
            chromosome are expected to be adjacent and in ascending,
            non-overlapping order (a non-positive gap trips the assertion).
        min_gap_size: gaps strictly smaller than this are closed.

    Yields (chrom, start, end) tuples. An empty input yields nothing.
    """
    regions = iter(regions)
    try:
        prev_chrom, prev_start, prev_end = next(regions)
    except StopIteration:
        # No regions at all. StopIteration must not escape a generator
        # (it becomes a RuntimeError under PEP 479), so finish cleanly.
        return
    for chrom, start, end in regions:
        if chrom != prev_chrom:
            # New chromosome -- emit the remainder & reset things
            yield (prev_chrom, prev_start, prev_end)
            prev_chrom, prev_start, prev_end = chrom, start, end
        else:
            gap = start - prev_end
            assert gap > 0, (
                "Impossible gap between %s %d-%d and %d-%d (=%d)" %
                (chrom, prev_start, prev_end, start, end, gap))
            if gap < min_gap_size:
                # Join with the previous region
                echo("\tJoining %s %d-%d and %d-%d (gap %d)" %
                     (chrom, prev_start, prev_end, start, end, gap))
                prev_end = end
            else:
                # Keep the gap; emit the previous region as-is
                echo("\tKeeping gap between %s %d and %d (size %d)" %
                     (chrom, prev_end, start, gap))
                yield (prev_chrom, prev_start, prev_end)
                prev_chrom, prev_start, prev_end = chrom, start, end
    # The loop only emits a region once its successor has been seen, so the
    # final pending region is always still waiting here -- flush it
    # unconditionally (not just when it happens to start at 0).
    yield (prev_chrom, prev_start, prev_end)
Esempio n. 2
0
def exclude_in_region(exclude_rows, chrom, a_start, a_end, ex_start, ex_end):
    """Subtract excluded intervals from one accessible region.

    Walks the exclusions in order and trims or splits the accessible region
    `a_start`-`a_end` wherever an exclusion overlaps it.

    Arguments:
        exclude_rows: iterator over the exclusions following the current
            one; it is advanced in place through `next_or_inf`.
            NOTE(review): `next_or_inf` is presumably expected to return an
            infinite sentinel pair once exhausted -- confirm.
        chrom: chromosome name, copied into every emitted row.
        a_start, a_end: bounds of the accessible region.
        ex_start, ex_end: bounds of the current exclusion.

    Yields zero or more (chrom, start, end) tuples: the pieces of the
    accessible region that are not covered by an exclusion.
    """
    # Iterative on purpose: one loop pass per overlapping exclusion, so a
    # region containing many exclusions cannot hit the recursion limit.
    while True:
        # If we've leapfrogged the excluded area, catch up
        while ex_end <= a_start:
            ex_start, ex_end = next_or_inf(exclude_rows)
        if a_end <= ex_start:
            # Excluded area does not overlap what is left of this region
            yield (chrom, a_start, a_end)
            return
        # Excluded area overlaps this one -> trim this region
        echo("\tExclusion %s:%d-%d overlaps accessible region %d-%d" %
             (chrom, ex_start, ex_end, a_start, a_end))
        if a_start < ex_start:
            # Keep the accessible part to the left of the exclusion
            yield (chrom, a_start, ex_start)
        if a_end <= ex_end:
            # Exclusion covers the rest of this region
            return
        # Carry on with the part to the right of the exclusion
        a_start = ex_end
Esempio n. 3
0
def exclude_in_region(exclude_rows, chrom, a_start, a_end, ex_start, ex_end):
    """Subtract excluded intervals from one accessible region.

    Walks the exclusions in order and trims or splits the accessible region
    `a_start`-`a_end` wherever an exclusion overlaps it.

    Arguments:
        exclude_rows: iterator over the exclusions following the current
            one; it is advanced in place through `next_or_inf`.
            NOTE(review): `next_or_inf` is presumably expected to return an
            infinite sentinel pair once exhausted -- confirm.
        chrom: chromosome name, copied into every emitted row.
        a_start, a_end: bounds of the accessible region.
        ex_start, ex_end: bounds of the current exclusion.

    Yields zero or more (chrom, start, end) tuples: the pieces of the
    accessible region that are not covered by an exclusion.
    """
    # Iterative on purpose: one loop pass per overlapping exclusion, so a
    # region containing many exclusions cannot hit the recursion limit.
    while True:
        # If we've leapfrogged the excluded area, catch up
        while ex_end <= a_start:
            ex_start, ex_end = next_or_inf(exclude_rows)
        if a_end <= ex_start:
            # Excluded area does not overlap what is left of this region
            yield (chrom, a_start, a_end)
            return
        # Excluded area overlaps this one -> trim this region
        echo("\tExclusion %s:%d-%d overlaps accessible region %d-%d"
             % (chrom, ex_start, ex_end, a_start, a_end))
        if a_start < ex_start:
            # Keep the accessible part to the left of the exclusion
            yield (chrom, a_start, ex_start)
        if a_end <= ex_end:
            # Exclusion covers the rest of this region
            return
        # Carry on with the part to the right of the exclusion
        a_start = ex_end
Esempio n. 4
0
def join_regions(regions, min_gap_size):
    """Filter regions, joining those separated by small gaps.

    Consecutive regions on the same chromosome are merged into one region
    when the gap between them is smaller than `min_gap_size`; otherwise
    each region is emitted unchanged.

    Arguments:
        regions: iterable of (chrom, start, end) rows. Rows of one
            chromosome are expected to be adjacent and in ascending,
            non-overlapping order (a non-positive gap trips the assertion).
        min_gap_size: gaps strictly smaller than this are closed.

    Yields (chrom, start, end) tuples. An empty input yields nothing.
    """
    regions = iter(regions)
    try:
        prev_chrom, prev_start, prev_end = next(regions)
    except StopIteration:
        # No regions at all. StopIteration must not escape a generator
        # (it becomes a RuntimeError under PEP 479), so finish cleanly.
        return
    for chrom, start, end in regions:
        if chrom != prev_chrom:
            # New chromosome -- emit the remainder & reset things
            yield (prev_chrom, prev_start, prev_end)
            prev_chrom, prev_start, prev_end = chrom, start, end
        else:
            gap = start - prev_end
            assert gap > 0, ("Impossible gap between %s %d-%d and %d-%d (=%d)"
                             % (chrom, prev_start, prev_end, start, end, gap))
            if gap < min_gap_size:
                # Join with the previous region
                echo("\tJoining %s %d-%d and %d-%d (gap %d)"
                     % (chrom, prev_start, prev_end,
                        start, end, gap))
                prev_end = end
            else:
                # Keep the gap; emit the previous region as-is
                echo("\tKeeping gap between %s %d and %d (size %d)"
                     % (chrom, prev_end, start, gap))
                yield (prev_chrom, prev_start, prev_end)
                prev_chrom, prev_start, prev_end = chrom, start, end
    # The loop only emits a region once its successor has been seen, so the
    # final pending region is always still waiting here -- flush it
    # unconditionally (not just when it happens to start at 0).
    yield (prev_chrom, prev_start, prev_end)
Esempio n. 5
0
def write_bed(rows, fname):
    """Write region coordinates to `fname` in BED format.

    Each row is written as one line with its fields converted to strings
    and separated by tabs. The number of rows written is then logged.

    Arguments:
        rows: iterable of rows; each row is an iterable of fields.
        fname: output file name, opened through `ngfrills.safe_write`.
    """
    with ngfrills.safe_write(fname, False) as outfile:
        # Count from 1 so the total stays 0 (not 1) when `rows` is empty.
        n_rows = 0
        for n_rows, row in enumerate(rows, 1):
            outfile.write("\t".join(map(str, row)) + '\n')
        ngfrills.echo("Wrote", fname, "with", n_rows, "bins")
Esempio n. 6
0
def get_regions(fasta_fname):
    """Find accessible sequence regions (those not masked out with 'N').

    Scans the FASTA file line by line. For every maximal run of non-'N'
    characters it yields the value of `log_this(chrom, start, end)`, where
    `start` and `end` are 0-based offsets into the current sequence and
    `end` is exclusive.

    Arguments:
        fasta_fname: path of the FASTA file to scan.
    """
    with open(fasta_fname) as infile:
        chrom = cursor = run_start = None
        for line in infile:
            if line.startswith('>'):
                # Emit the last chromosome's last run, if any
                if run_start is not None:
                    yield log_this(chrom, run_start, cursor)
                # Start new chromosome
                chrom = line.split(None, 1)[0][1:]
                run_start = None
                cursor = 0
                echo(chrom + ": Scanning for accessible regions")
            else:
                line = line.rstrip()
                if not line:
                    # Blank line: holds no sequence, so it must not start
                    # a (zero-length) run; the cursor would not move anyway
                    continue
                if 'N' in line:
                    if all(c == 'N' for c in line):
                        # Shortcut if the line is all N chars
                        if run_start is not None:
                            yield log_this(chrom, run_start, cursor)
                            run_start = None
                    else:
                        # Slow route: line is a mix of N and non-N chars
                        line_chars = np.array(line, dtype='c')
                        # dtype 'c' holds single bytes, so compare against
                        # a bytes literal (a str never matches on Python 3)
                        n_indices = np.where(line_chars == b'N')[0]
                        # Emit the first block of non-N chars, if any
                        if run_start is not None:
                            yield log_this(chrom, run_start,
                                           cursor + n_indices[0])
                        elif n_indices[0] != 0:
                            yield log_this(chrom, cursor,
                                           cursor + n_indices[0])
                        # Emit any short intermediate blocks
                        gap_mask = np.diff(n_indices) > 1
                        if gap_mask.any():
                            # gap_mask has one element fewer than
                            # n_indices; slice both sides to match it
                            ok_starts = n_indices[:-1][gap_mask] + 1 + cursor
                            ok_ends = n_indices[1:][gap_mask] + cursor
                            for start, end in zip(ok_starts, ok_ends):
                                yield log_this(chrom, start, end)
                        # Account for any tailing non-N chars
                        if n_indices[-1] + 1 < len(line_chars):
                            run_start = cursor + n_indices[-1] + 1
                        else:
                            run_start = None
                else:
                    if run_start is None:
                        # Start of a new run of non-N characters
                        run_start = cursor
                cursor += len(line)
        # Emit the last run if it's accessible (i.e. not a telomere)
        if run_start is not None:
            yield log_this(chrom, run_start, cursor)
Esempio n. 7
0
def get_regions(fasta_fname):
    """Find accessible sequence regions (those not masked out with 'N').

    Scans the FASTA file line by line. For every maximal run of non-'N'
    characters it yields the value of `log_this(chrom, start, end)`, where
    `start` and `end` are 0-based offsets into the current sequence and
    `end` is exclusive.

    Arguments:
        fasta_fname: path of the FASTA file to scan.
    """
    with open(fasta_fname) as infile:
        chrom = cursor = run_start = None
        for line in infile:
            if line.startswith('>'):
                # Emit the last chromosome's last run, if any
                if run_start is not None:
                    yield log_this(chrom, run_start, cursor)
                # Start new chromosome
                chrom = line.split(None, 1)[0][1:]
                run_start = None
                cursor = 0
                echo(chrom + ": Scanning for accessible regions")
            else:
                line = line.rstrip()
                if not line:
                    # Blank line: holds no sequence, so it must not start
                    # a (zero-length) run; the cursor would not move anyway
                    continue
                if 'N' in line:
                    if all(c == 'N' for c in line):
                        # Shortcut if the line is all N chars
                        if run_start is not None:
                            yield log_this(chrom, run_start, cursor)
                            run_start = None
                    else:
                        # Slow route: line is a mix of N and non-N chars
                        line_chars = np.array(line, dtype='c')
                        # dtype 'c' holds single bytes, so compare against
                        # a bytes literal (a str never matches on Python 3)
                        n_indices = np.where(line_chars == b'N')[0]
                        # Emit the first block of non-N chars, if any
                        if run_start is not None:
                            yield log_this(chrom, run_start, cursor + n_indices[0])
                        elif n_indices[0] != 0:
                            yield log_this(chrom, cursor, cursor + n_indices[0])
                        # Emit any short intermediate blocks
                        gap_mask = np.diff(n_indices) > 1
                        if gap_mask.any():
                            # gap_mask has one element fewer than
                            # n_indices; slice both sides to match it
                            ok_starts = n_indices[:-1][gap_mask] + 1 + cursor
                            ok_ends = n_indices[1:][gap_mask] + cursor
                            for start, end in zip(ok_starts, ok_ends):
                                yield log_this(chrom, start, end)
                        # Account for any tailing non-N chars
                        if n_indices[-1] + 1 < len(line_chars):
                            run_start = cursor + n_indices[-1] + 1
                        else:
                            run_start = None
                else:
                    if run_start is None:
                        # Start of a new run of non-N characters
                        run_start = cursor
                cursor += len(line)
        # Emit the last run if it's accessible (i.e. not a telomere)
        if run_start is not None:
            yield log_this(chrom, run_start, cursor)
Esempio n. 8
0
def load_cna(fname, reference):
    """Load the copy number data in `fname` and apply the sex shift.

    When `reference` is truthy, the data is first passed through
    `fix.load_adjust_coverages` together with the reference (subtracting
    the reference values to obtain a log2 ratio). In either case the result
    goes through `shift_xx(..., male_normal=True)` before being returned.
    """
    echo("Processing", fname)
    profile = read(fname)
    if reference:
        # Ratio mode: subtract the reference copy number values
        profile = fix.load_adjust_coverages(profile, reference,
                                            False, False, False)
    # NOTE(review): without a reference no low-coverage probes are dropped
    # here; a filter on params.MIN_BIN_COVERAGE used to be sketched for
    # that case but was disabled.
    return shift_xx(profile, male_normal=True)
Esempio n. 9
0
def load_cna(fname, reference):
    """Load the copy number data in `fname` and apply the sex shift.

    When `reference` is truthy, the data is first passed through
    `fix.load_adjust_coverages` together with the reference (subtracting
    the reference values to obtain a log2 ratio). In either case the result
    goes through `shift_xx(..., male_normal=True)` before being returned.
    """
    echo("Processing", fname)
    profile = read(fname)
    if reference:
        # Ratio mode: subtract the reference copy number values
        profile = fix.load_adjust_coverages(profile, reference,
                                            False, False, False)
    # NOTE(review): without a reference no low-coverage probes are dropped
    # here; a filter on params.MIN_BIN_COVERAGE used to be sketched for
    # that case but was disabled.
    return shift_xx(profile, male_normal=True)
Esempio n. 10
0
def main(args):
    """Plot the given samples, then save the figure as PDF or display it.

    Reads the reference named by `args.reference` (falling back to
    `args.no_reference`), builds the bias function from it and the first
    sample, prints the table header, and draws the samples overlaid
    (`args.batch`) or separately. The figure is written to `args.output`
    when that is set; otherwise it is shown interactively.
    """
    use_ratio = bool(args.reference)
    reference = read(args.reference or args.no_reference)
    first_sample = read(args.filenames[0])
    trend_func = get_bias_func(args.mode, reference, first_sample)

    print("Sample \tRaw probes \tTrend line \tReduction (%)")
    if not args.batch:
        plot_separate(args.filenames, reference, trend_func, args.mode,
                      use_ratio)
    else:
        plot_overlaid(args.filenames, reference, trend_func, args.mode,
                      use_ratio, args.color)

    if not args.output:
        pyplot.show()
        return
    pyplot.savefig(args.output, format='pdf', bbox_inches=0)
    echo("Wrote", args.output)
Esempio n. 11
0
def main(args):
    """Plot the given samples, then save the figure as PDF or display it.

    Reads the reference named by `args.reference` (falling back to
    `args.no_reference`), builds the bias function from it and the first
    sample, prints the table header, and draws the samples overlaid
    (`args.batch`) or separately. The figure is written to `args.output`
    when that is set; otherwise it is shown interactively.
    """
    use_ratio = bool(args.reference)
    reference = read(args.reference or args.no_reference)
    first_sample = read(args.filenames[0])
    trend_func = get_bias_func(args.mode, reference, first_sample)

    print("Sample \tRaw probes \tTrend line \tReduction (%)")
    if not args.batch:
        plot_separate(args.filenames, reference, trend_func, args.mode,
                      use_ratio)
    else:
        plot_overlaid(args.filenames, reference, trend_func, args.mode,
                      use_ratio, args.color)

    if not args.output:
        pyplot.show()
        return
    pyplot.savefig(args.output, format='pdf', bbox_inches=0)
    echo("Wrote", args.output)
Esempio n. 12
0
def get_regions(fasta_fname):
    """Find accessible sequence regions (those not masked out with 'N').

    Scans the FASTA file character by character. For every maximal run of
    non-'N' characters it yields the value of `logme(chrom, start, end)`,
    where `start` and `end` are 0-based offsets into the current sequence
    and `end` is exclusive.

    Arguments:
        fasta_fname: path of the FASTA file to scan.
    """
    with open(fasta_fname) as infile:
        chrom = cursor = run_start = None
        for line in infile:
            if line.startswith('>'):
                # Emit the last chromosome's last run, if any
                if run_start is not None:
                    yield logme(chrom, run_start, cursor)
                elif chrom is not None:
                    # Only meaningful once a chromosome has been scanned;
                    # stay quiet at the very first header
                    echo("\tChromosome ended with a telomere (N's)")
                # Start new chromosome
                chrom = line.split(None, 1)[0][1:]
                run_start = None
                cursor = 0
                echo("Scanning", chrom)
            else:
                seq = line.rstrip()
                for i, char in enumerate(seq):
                    if char == 'N':
                        if run_start is not None:
                            # End of a run; emit the current region
                            yield logme(chrom, run_start, cursor + i)
                            run_start = None
                    else:
                        if run_start is None:
                            # Start of a new run of non-N characters
                            run_start = cursor + i
                            echo("\tStarted new run at", run_start)
                # Advance by the real line length: the loop index would be
                # stale (or unset) after a blank line
                cursor += len(seq)
        # Emit the last run if it's accessible (i.e. not a telomere)
        if run_start is not None:
            yield logme(chrom, run_start, cursor)
Esempio n. 13
0
def get_regions(fasta_fname):
    """Find accessible sequence regions (those not masked out with 'N').

    Scans the FASTA file character by character. For every maximal run of
    non-'N' characters it yields the value of `logme(chrom, start, end)`,
    where `start` and `end` are 0-based offsets into the current sequence
    and `end` is exclusive.

    Arguments:
        fasta_fname: path of the FASTA file to scan.
    """
    with open(fasta_fname) as infile:
        chrom = cursor = run_start = None
        for line in infile:
            if line.startswith('>'):
                # Emit the last chromosome's last run, if any
                if run_start is not None:
                    yield logme(chrom, run_start, cursor)
                elif chrom is not None:
                    # Only meaningful once a chromosome has been scanned;
                    # stay quiet at the very first header
                    echo("\tChromosome ended with a telomere (N's)")
                # Start new chromosome
                chrom = line.split(None, 1)[0][1:]
                run_start = None
                cursor = 0
                echo("Scanning", chrom)
            else:
                seq = line.rstrip()
                for i, char in enumerate(seq):
                    if char == 'N':
                        if run_start is not None:
                            # End of a run; emit the current region
                            yield logme(chrom, run_start, cursor + i)
                            run_start = None
                    else:
                        if run_start is None:
                            # Start of a new run of non-N characters
                            run_start = cursor + i
                            echo("\tStarted new run at", run_start)
                # Advance by the real line length: the loop index would be
                # stale (or unset) after a blank line
                cursor += len(seq)
        # Emit the last run if it's accessible (i.e. not a telomere)
        if run_start is not None:
            yield logme(chrom, run_start, cursor)
Esempio n. 14
0
def _exclusions_from(exclusions, start):
    """Lazily yield `exclusions[start:]` without copying the list."""
    idx = start
    while idx < len(exclusions):
        yield exclusions[idx]
        idx += 1


def exclude_regions(bed_fname, access_rows):
    """Subtract the regions listed in a BED file from the accessible regions.

    Arguments:
        bed_fname: BED file of exclusions, read with
            `parse_regions(bed_fname, coord_only=True)`.
        access_rows: iterable of accessible-region rows, grouped per
            chromosome through `group_regions_by_chromosome`.

    Yields the input rows unmodified when the BED file holds no regions;
    otherwise (chrom, start, end) tuples for the parts of each accessible
    region left after subtraction.

    NOTE(review): assumes that, within a chromosome, both the accessible
    regions and the exclusions are (start, end) pairs sorted by position
    and non-overlapping -- confirm against the callers.
    """
    ex_by_chrom = dict(
        group_regions_by_chromosome(parse_regions(bed_fname, coord_only=True)))
    if not ex_by_chrom:
        # Nothing to exclude -> emit the input regions unmodified
        for row in access_rows:
            yield row
        return

    # Check if each input region overlaps an excluded region
    for chrom, a_rows in group_regions_by_chromosome(access_rows):
        if chrom not in ex_by_chrom:
            echo(chrom + ": No excluded regions")
            for a_start, a_end in a_rows:
                yield (chrom, a_start, a_end)
            continue

        echo(chrom + ": Subtracting excluded regions")
        exclusions = list(ex_by_chrom[chrom])
        # Index of the first exclusion that can still overlap the current
        # or a later accessible region. Tracking it here (rather than
        # sharing one half-consumed iterator plus a stale start/end pair
        # across calls) ensures no exclusion is skipped between regions.
        first = 0
        for a_start, a_end in a_rows:
            while first < len(exclusions) and exclusions[first][1] <= a_start:
                first += 1
            if first == len(exclusions):
                # Every exclusion lies to the left of this region
                yield (chrom, a_start, a_end)
                continue
            ex_start, ex_end = exclusions[first]
            for row in exclude_in_region(
                    _exclusions_from(exclusions, first + 1), chrom,
                    a_start, a_end, ex_start, ex_end):
                yield row
Esempio n. 15
0
def _exclusions_from(exclusions, start):
    """Lazily yield `exclusions[start:]` without copying the list."""
    idx = start
    while idx < len(exclusions):
        yield exclusions[idx]
        idx += 1


def exclude_regions(bed_fname, access_rows):
    """Subtract the regions listed in a BED file from the accessible regions.

    Arguments:
        bed_fname: BED file of exclusions, read with
            `parse_regions(bed_fname, coord_only=True)`.
        access_rows: iterable of accessible-region rows, grouped per
            chromosome through `group_regions_by_chromosome`.

    Yields the input rows unmodified when the BED file holds no regions;
    otherwise (chrom, start, end) tuples for the parts of each accessible
    region left after subtraction.

    NOTE(review): assumes that, within a chromosome, both the accessible
    regions and the exclusions are (start, end) pairs sorted by position
    and non-overlapping -- confirm against the callers.
    """
    ex_by_chrom = dict(group_regions_by_chromosome(
        parse_regions(bed_fname, coord_only=True)))
    if not ex_by_chrom:
        # Nothing to exclude -> emit the input regions unmodified
        for row in access_rows:
            yield row
        return

    # Check if each input region overlaps an excluded region
    for chrom, a_rows in group_regions_by_chromosome(access_rows):
        if chrom not in ex_by_chrom:
            echo(chrom + ": No excluded regions")
            for a_start, a_end in a_rows:
                yield (chrom, a_start, a_end)
            continue

        echo(chrom + ": Subtracting excluded regions")
        exclusions = list(ex_by_chrom[chrom])
        # Index of the first exclusion that can still overlap the current
        # or a later accessible region. Tracking it here (rather than
        # sharing one half-consumed iterator plus a stale start/end pair
        # across calls) ensures no exclusion is skipped between regions.
        first = 0
        for a_start, a_end in a_rows:
            while first < len(exclusions) and exclusions[first][1] <= a_start:
                first += 1
            if first == len(exclusions):
                # Every exclusion lies to the left of this region
                yield (chrom, a_start, a_end)
                continue
            ex_start, ex_end = exclusions[first]
            for row in exclude_in_region(
                    _exclusions_from(exclusions, first + 1), chrom,
                    a_start, a_end, ex_start, ex_end):
                yield row
Esempio n. 16
0
def join_regions(regions, min_gap_size):
    """Merge neighbouring regions whose separating gap is too small.

    Regions are processed one chromosome at a time, as grouped by
    `group_regions_by_chromosome`. A region is fused with its predecessor
    when the gap between them is below `min_gap_size`; otherwise the
    predecessor is emitted as-is. Yields (chrom, start, end) tuples.
    """
    for chrom, coords in group_regions_by_chromosome(regions):
        echo(chrom + ": Joining over small gaps")
        coord_iter = iter(coords)
        run_start, run_end = next(coord_iter)
        for next_start, next_end in coord_iter:
            gap = next_start - run_end
            assert gap > 0, (
                "Impossible gap between %s %d-%d and %d-%d (=%d)"
                % (chrom, run_start, run_end, next_start, next_end, gap))
            if gap >= min_gap_size:
                # Gap is wide enough to keep: flush the pending region and
                # start a new one
                echo("\tKeeping gap %s:%d-%d (size %d)"
                     % (chrom, run_end, next_start, gap))
                yield (chrom, run_start, run_end)
                run_start, run_end = next_start, next_end
                continue
            # Gap is too small: extend the pending region across it
            echo("\tJoining %s %d-%d and %d-%d (gap size %d)"
                 % (chrom, run_start, run_end, next_start, next_end, gap))
            run_end = next_end
        # Flush the region still pending for this chromosome
        yield (chrom, run_start, run_end)
Esempio n. 17
0
def join_regions(regions, min_gap_size):
    """Merge neighbouring regions whose separating gap is too small.

    Regions are processed one chromosome at a time, as grouped by
    `group_regions_by_chromosome`. A region is fused with its predecessor
    when the gap between them is below `min_gap_size`; otherwise the
    predecessor is emitted as-is. Yields (chrom, start, end) tuples.
    """
    for chrom, coords in group_regions_by_chromosome(regions):
        echo(chrom + ": Joining over small gaps")
        coord_iter = iter(coords)
        run_start, run_end = next(coord_iter)
        for next_start, next_end in coord_iter:
            gap = next_start - run_end
            assert gap > 0, (
                "Impossible gap between %s %d-%d and %d-%d (=%d)" %
                (chrom, run_start, run_end, next_start, next_end, gap))
            if gap >= min_gap_size:
                # Gap is wide enough to keep: flush the pending region and
                # start a new one
                echo("\tKeeping gap %s:%d-%d (size %d)" %
                     (chrom, run_end, next_start, gap))
                yield (chrom, run_start, run_end)
                run_start, run_end = next_start, next_end
                continue
            # Gap is too small: extend the pending region across it
            echo("\tJoining %s %d-%d and %d-%d (gap size %d)" %
                 (chrom, run_start, run_end, next_start, next_end, gap))
            run_end = next_end
        # Flush the region still pending for this chromosome
        yield (chrom, run_start, run_end)
Esempio n. 18
0
def log_this(chrom, run_start, run_end):
    """Log one accessible region and return it as (chrom, start, end)."""
    size = run_end - run_start
    message = ("\tAccessible region %s:%d-%d (size %d)"
               % (chrom, run_start, run_end, size))
    echo(message)
    return chrom, run_start, run_end
Esempio n. 19
0
def log_this(chrom, run_start, run_end):
    """Log one accessible region and return it as (chrom, start, end)."""
    size = run_end - run_start
    message = ("\tAccessible region %s:%d-%d (size %d)" %
               (chrom, run_start, run_end, size))
    echo(message)
    return chrom, run_start, run_end
Esempio n. 20
0
def logme(chrom, run_start, run_end):
    """Log where a run ended and its length; return (chrom, start, end)."""
    run_length = run_end - run_start
    echo("\tEnded run at", run_end, "==>", run_length, 'b')
    return chrom, run_start, run_end
Esempio n. 21
0
def logme(chrom, run_start, run_end):
    """Log where a run ended and its length; return (chrom, start, end)."""
    run_length = run_end - run_start
    echo("\tEnded run at", run_end, "==>", run_length, 'b')
    return chrom, run_start, run_end