def interval2genes(interval, skip=('CGH', '-')):
    """Collapse runs of same-named intervals into one region per gene.

    Rows of (chrom, start, end, name) are read from
    ``ngfrills.parse_regions(interval)``. Rows whose name is in `skip` are
    ignored. Consecutive remaining rows that share both chromosome and name
    form one run.

    Yields a tuple (chrom, start, end, name) for each run of more than one
    row, where `start` comes from the run's first row and `end` from its
    last. A run consisting of a single row is not yielded; a notice is
    printed to stderr instead.
    """
    def close_gene(chrom, start, end, name, n_rows):
        # Return the finished gene tuple, or None when nothing is emitted.
        if name is None:
            # No run has been opened yet
            return None
        if n_rows > 1:
            return (chrom, start, end, name)
        print("Single-probe gene is probably CGH:", name,
              file=sys.stderr)
        return None

    gene_chrom = gene_name = None
    gene_start = gene_end = None
    n_rows = 0
    for chrom, start, end, name in ngfrills.parse_regions(interval):
        if name in skip:
            continue
        if chrom == gene_chrom and name == gene_name:
            # Same gene as the previous row: just one more row in the run
            n_rows += 1
        else:
            # A different gene begins here: finish the previous run first
            gene = close_gene(gene_chrom, gene_start, gene_end, gene_name,
                              n_rows)
            if gene is not None:
                yield gene
            gene_chrom, gene_name, gene_start = chrom, name, start
            n_rows = 1
        # The run always ends at the most recent row seen
        gene_end = end
    # Finish whichever run was still open when the input ran out
    gene = close_gene(gene_chrom, gene_start, gene_end, gene_name, n_rows)
    if gene is not None:
        yield gene
Exemple #2
0
def exclude_regions(bed_fname, access_rows):
    """Yield regions from `access_rows`, applying exclusions from a file.

    The exclusions are read with ``parse_regions(bed_fname,
    coord_only=True)`` and grouped per chromosome via
    ``group_regions_by_chromosome``.

    - If no exclusions were read, every row of `access_rows` is yielded
      unchanged.
    - Otherwise `access_rows` is grouped per chromosome too. For a chromosome
      without exclusions, each (start, end) is yielded as
      (chrom, start, end). For a chromosome with exclusions, each (start,
      end) is handed to ``exclude_in_region`` and whatever that yields is
      passed through.
    """
    excluded = dict(
        group_regions_by_chromosome(parse_regions(bed_fname, coord_only=True)))
    if not excluded:
        # Nothing to exclude -> pass the input regions through untouched
        for region in access_rows:
            yield region
        return

    for chrom, spans in group_regions_by_chromosome(access_rows):
        if chrom not in excluded:
            logging.info("%s: No excluded regions", chrom)
            for start, end in spans:
                yield (chrom, start, end)
            continue

        logging.info("%s: Subtracting excluded regions", chrom)
        remaining = iter(excluded[chrom])
        first_ex_start, first_ex_end = next_or_inf(remaining)
        # NOTE(review): the same first exclusion pair is passed for every
        # access region; presumably exclude_in_region advances through the
        # shared `remaining` iterator on its own -- confirm.
        for start, end in spans:
            for kept in exclude_in_region(remaining, chrom, start, end,
                                          first_ex_start, first_ex_end):
                yield kept
Exemple #3
0
def exclude_regions(bed_fname, access_rows):
    """Filter `access_rows` against the exclusion regions in `bed_fname`.

    Exclusion coordinates come from ``parse_regions(bed_fname,
    coord_only=True)`` and are indexed by chromosome. When that index is
    empty, the rows of `access_rows` are yielded exactly as received.
    Otherwise the access rows are grouped by chromosome:

    - chromosomes absent from the index yield (chrom, start, end) for each
      of their (start, end) pairs;
    - chromosomes present in the index yield whatever
      ``exclude_in_region`` produces for each (start, end) pair.
    """
    exclusion_rows = parse_regions(bed_fname, coord_only=True)
    ex_lookup = dict(group_regions_by_chromosome(exclusion_rows))
    if ex_lookup:
        # Compare every input region with its chromosome's exclusions
        for chrom, chrom_rows in group_regions_by_chromosome(access_rows):
            if chrom not in ex_lookup:
                logging.info("%s: No excluded regions", chrom)
                for lo, hi in chrom_rows:
                    yield (chrom, lo, hi)
            else:
                logging.info("%s: Subtracting excluded regions", chrom)
                ex_iter = iter(ex_lookup[chrom])
                ex_bounds = next_or_inf(ex_iter)
                ex_lo, ex_hi = ex_bounds
                # NOTE(review): ex_lo/ex_hi are not refreshed between access
                # regions here; presumably exclude_in_region consumes
                # `ex_iter` to catch up -- confirm.
                for lo, hi in chrom_rows:
                    pieces = exclude_in_region(ex_iter, chrom, lo, hi,
                                               ex_lo, ex_hi)
                    for piece in pieces:
                        yield piece
    else:
        # No exclusions at all: hand the input back untouched
        for row in access_rows:
            yield row
def interval2genes(interval, skip=('CGH', '-')):
    """Merge adjacent rows with the same chromosome and name into genes.

    Reads (chrom, start, end, name) rows from
    ``ngfrills.parse_regions(interval)``, dropping any row whose name is in
    `skip`. Each maximal run of consecutive rows with equal chromosome and
    name is reduced to a single (chrom, start, end, name) tuple spanning the
    first row's start to the last row's end.

    Only runs of at least two rows are yielded. A run of exactly one row is
    reported on stderr and omitted from the output.
    """
    def flush(chrom, start, end, name, count):
        # Emit the finished run, or warn if it had only a single row.
        if name is not None:
            if count <= 1:
                print("Single-probe gene is probably CGH:",
                      name,
                      file=sys.stderr)
            else:
                yield (chrom, start, end, name)

    run_chrom, run_name = None, None
    run_start, run_end = None, None
    count = 0
    rows = ngfrills.parse_regions(interval)
    for chrom, start, end, name in rows:
        if name in skip:
            continue
        starts_new_run = chrom != run_chrom or name != run_name
        if starts_new_run:
            for gene in flush(run_chrom, run_start, run_end, run_name, count):
                yield gene
            # Open a fresh run at this row
            run_chrom, run_name = chrom, name
            run_start = start
            count = 0
        # Grow the current run to include this row
        count += 1
        run_end = end
    # The last run is still open once the rows are exhausted
    for gene in flush(run_chrom, run_start, run_end, run_name, count):
        yield gene