def annotate(chromos, pos, anno):
    """Annotate genomic locations.

    Tests if sites specified by `chromos` and `pos` are annotated by `anno`.

    Parameters
    ----------
    chromos: :class:`numpy.ndarray`
        :class:`numpy.ndarray` with chromosome of sites.
    pos: :class:`numpy.ndarray`
        :class:`numpy.ndarray` with position on chromosome of sites.
    anno: :class:`pandas.DataFrame`
        :class:`pandas.DataFrame` with columns `chromo`, `start`, `end` that
        specify annotated regions.

    Returns
    -------
    :class:`numpy.ndarray`
        Binary :class:`numpy.ndarray` of same length as `chromos` indicating
        if positions are annotated. Element `i` refers to the site
        (`chromos[i]`, `pos[i]`), regardless of how `chromos` is ordered.
        Empty input yields an empty array.
    """
    chromos = np.asarray(chromos)
    pos = np.asarray(pos)
    # Results are scattered back through the per-chromosome mask instead of
    # being concatenated: `np.unique` iterates chromosomes in sorted order,
    # so concatenation only lines up with the input when `chromos` is
    # already sorted and grouped.
    # NOTE(review): assumes `is_in` returns a boolean mask -- confirm.
    idx = np.zeros(len(chromos), dtype=bool)
    for chromo in np.unique(chromos):
        mask = chromos == chromo
        chromo_anno = anno.loc[anno.chromo == chromo]
        idx[mask] = is_in(pos[mask],
                          chromo_anno['start'].values,
                          chromo_anno['end'].values)
    return idx
def test_is_in():
    """Check `annos.is_in` on hand-picked positions and regions."""
    # Regions are given as parallel start/end lists; the first region has
    # identical start and end.
    region_starts = [2, 4, 12, 17]
    region_ends = [2, 8, 15, 18]
    positions = [-1, 2, 2, 3, 4, 8, 15, 16]
    # Positions that coincide with a region start or end (2, 4, 8, 15) are
    # expected to be reported as inside; -1, 3 and 16 fall between regions.
    expected = [False, True, True, False, True, True, True, False]
    npt.assert_array_equal(
        annos.is_in(positions, region_starts, region_ends),
        expected,
    )
def annotate(anno_file, chromo, pos):
    """Flag positions of one chromosome that lie in regions of a file.

    Parameters
    ----------
    anno_file:
        Path passed to `dat.GzipFile`. The file is read as a header-less
        table whose first three columns are chromosome, start and end.
    chromo: str
        Chromosome to select. Chromosome names from the file are upper-cased
        and stripped of `CHR` before comparison, so `chromo` must be given
        in that normalized form.
    pos:
        Positions on `chromo` to test; forwarded to `an.is_in`.

    Returns
    -------
    :class:`numpy.ndarray`
        `int8` array built from the result of `an.is_in` for `pos`.
    """
    handle = dat.GzipFile(anno_file, 'r')
    try:
        anno = pd.read_table(handle, header=None, usecols=[0, 1, 2],
                             dtype={0: 'str', 1: 'int32', 2: 'int32'})
    finally:
        # Release the handle even if parsing fails.
        handle.close()
    anno.columns = ['chromo', 'start', 'end']
    anno.chromo = anno.chromo.str.upper().str.replace('CHR', '')
    # Sort into a new frame rather than in place on the `.loc` selection,
    # which avoids pandas' chained-assignment warning.
    anno = anno.loc[anno.chromo == chromo].sort_values('start')
    # NOTE(review): `join_overlapping` presumably merges overlapping regions
    # of the start-sorted input -- confirm against its definition.
    start, end = an.join_overlapping(anno.start.values, anno.end.values)
    return np.array(an.is_in(pos, start, end), dtype='int8')