Esempio n. 1
0
def unique1d(values):
    """
    Return the unique entries of ``values`` using a hash table.

    The hash table class and the ``_ensure_*`` conversion applied to
    ``values`` are picked from ``values.dtype``; anything that is not a
    float, datetime64, timedelta64 or integer dtype is handled as objects.
    """
    dtype = values.dtype

    if np.issubdtype(dtype, np.floating):
        seen = htable.Float64HashTable(len(values))
        found = seen.unique(_ensure_float64(values))
        return np.array(found, dtype=np.float64)

    # datetime-likes are hashed as int64 and viewed back with nanosecond
    # resolution.  These checks must stay ahead of the signedinteger one:
    # np.timedelta64 is a subclass of np.signedinteger.
    datetime_likes = (
        (np.datetime64, 'M8[ns]'),
        (np.timedelta64, 'm8[ns]'),
    )
    for kind, ns_dtype in datetime_likes:
        if np.issubdtype(dtype, kind):
            seen = htable.Int64HashTable(len(values))
            return seen.unique(_ensure_int64(values)).view(ns_dtype)

    if np.issubdtype(dtype, np.signedinteger):
        seen = htable.Int64HashTable(len(values))
        return seen.unique(_ensure_int64(values))

    if np.issubdtype(dtype, np.unsignedinteger):
        seen = htable.UInt64HashTable(len(values))
        return seen.unique(_ensure_uint64(values))

    # a string hash table is cheaper than the generic object one
    if lib.infer_dtype(values) in ['string']:
        seen = htable.StringHashTable(len(values))
    else:
        seen = htable.PyObjectHashTable(len(values))
    return seen.unique(_ensure_object(values))
Esempio n. 2
0
def test_get_labels_groupby_for_Int64(writable):
    # Check get_labels_groupby on an int64 input whose writeable flag is
    # set from the ``writable`` argument: the expected result labels the
    # negative entries -1 and leaves them out of the uniques.
    values = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
    values.flags.writeable = writable

    labels, uniques = ht.Int64HashTable().get_labels_groupby(values)

    tm.assert_numpy_array_equal(
        labels, np.array([0, 1, -1, 1, 0, -1], dtype=np.intp)
    )
    tm.assert_numpy_array_equal(uniques, np.array([1, 2], dtype=np.int64))
Esempio n. 3
0
    def __init__(self, comp_ids, ngroups, levels, labels):
        """
        Store the inputs, create one Int64HashTable per entry of ``labels``
        (each constructed with ``ngroups``), then populate the tables.
        """
        self.levels, self.labels = levels, labels
        self.comp_ids = comp_ids.astype(np.int64)

        n_keys = len(labels)
        self.k = n_keys
        self.tables = [
            hashtable.Int64HashTable(ngroups) for _ in range(n_keys)
        ]

        self._populate_tables()
Esempio n. 4
0
def get_flattened_list(
    comp_ids: np.ndarray,
    ngroups: int,
    levels: Iterable[Index],
    labels: Iterable[np.ndarray],
) -> List[Tuple]:
    """
    Map compressed group id -> key tuple.

    For every (labels, level) pair a hash table is filled from ``comp_ids``
    and that pair's labels, then queried for each group id in
    ``range(ngroups)`` to pick the matching entry of the level.  The result
    holds one tuple per group id, with one element per level.
    """
    comp_ids = comp_ids.astype(np.int64, copy=False)

    # one column of keys per level, each column ordered by group id
    columns = []
    for codes, level in zip(labels, levels):
        lookup = hashtable.Int64HashTable(ngroups)
        lookup.map(comp_ids, codes.astype(np.int64, copy=False))
        columns.append(
            [level[lookup.get_item(group)] for group in range(ngroups)]
        )

    # transpose the columns into one tuple per group id
    return list(zip(*columns))
Esempio n. 5
0
def compress_group_index(group_index, sort: bool = True):
    """
    Compress ``group_index`` into dense group ids.

    ``group_index`` holds offsets into the cartesian product of all possible
    labels, a space that can be huge.  It is reduced to offsets (comp_ids)
    into the list of labels that actually occur (obs_group_ids).

    Returns the pair ``(comp_ids, obs_group_ids)``, both passed through
    ``ensure_int64``.  With ``sort`` set and at least one observed group,
    the pair is first reordered by ``_reorder_by_uniques``.
    """
    lookup = hashtable.Int64HashTable(len(group_index))

    # note, group labels come out ascending (ie, 1,2,3 etc)
    codes, observed = lookup.get_labels_groupby(ensure_int64(group_index))

    if not sort or len(observed) == 0:
        return ensure_int64(codes), ensure_int64(observed)

    observed, codes = _reorder_by_uniques(observed, codes)
    return ensure_int64(codes), ensure_int64(observed)