Esempio n. 1
0
 def test02(self):
     """Check an axis-less ``sum`` reduction through bcolz.eval against NumPy."""
     nelem = np.prod(self.shape)
     a = np.arange(nelem).reshape(self.shape)
     # `b` is referenced by name from the expression string below.
     b = bcolz.arange(nelem).reshape(self.shape)
     reduced = bcolz.eval("sum(b)")
     if bcolz.defaults.eval_vm != "python":
         self.assertEqual(a.sum(), reduced)
         return
     # With the "python" VM the reference value is the builtin sum() of `a`.
     assert_array_equal(sum(a), reduced, "Arrays are not equal")
Esempio n. 2
0
 def test02b(self):
     """Check a ``sum`` reduction with an ``axis`` through bcolz.eval against NumPy."""
     nelem = np.prod(self.shape)
     a = np.arange(nelem).reshape(self.shape)
     # `b` is referenced by name from the expression strings below.
     b = bcolz.arange(nelem).reshape(self.shape)
     if bcolz.defaults.eval_vm != "python":
         expected, actual = a.sum(axis=1), bcolz.eval("sum(b, axis=1)")
     else:
         # The Python VM does not have support for `axis` param
         expected, actual = sum(a), bcolz.eval("sum(b)")
     assert_array_equal(expected, actual, "Arrays are not equal")
Esempio n. 3
0
 def test01(self):
     """Compare ``eval`` on a NumPy array with ``bcolz.eval`` for ``x*2.+1``."""
     nelem = np.prod(self.shape)
     # `a` and `b` are picked up by name from the expression strings.
     a = np.arange(nelem).reshape(self.shape)
     b = bcolz.arange(nelem).reshape(self.shape)
     assert_array_equal(eval("a*2.+1"), bcolz.eval("b*2.+1"),
                        "Arrays are not equal")
Esempio n. 4
0
    def eval(self, expression, **kwargs):
        """Evaluate `expression` with this table's columns available by name.

        Parameters
        ----------
        expression : string
            An expression such as '2*a+3*b'.  Names in it may refer to
            columns of this table, or to scalars, carrays or NumPy arrays
            found in the calling function's frame.
        kwargs : list of parameters or dictionary
            Forwarded to the first level `eval()` function.  A `depth`
            entry, if present, overrides the default frame depth of 3.

        Returns
        -------
        out : carray object
            The outcome of the expression.  Extra carray constructor
            arguments passed in `kwargs` tailor its properties.

        See Also
        --------
        eval (first level function)

        """
        # Only fill in the frame depth when the caller did not choose one.
        kwargs.setdefault('depth', 3)
        # The columns are handed over as user_dict of the top-level eval.
        return bcolz.eval(expression, user_dict=self.cols, **kwargs)
Esempio n. 5
0
    def eval(self, expression, **kwargs):
        """Run the first level `eval()` on `expression` using the columns.

        Parameters
        ----------
        expression : string
            Something like '2*a+3*b', where 'a' and 'b' name columns of
            this table or scalars, carrays or NumPy arrays taken from the
            calling function's frame.
        kwargs : list of parameters or dictionary
            Any parameter supported by the `eval()` first level function;
            `depth` defaults to 3 when it is not given.

        Returns
        -------
        out : carray object
            The outcome of the expression; carray constructor arguments
            given in `kwargs` shape the result.

        See Also
        --------
        eval (first level function)

        """
        options = dict(kwargs)
        # Separate the frame depth from the remaining pass-through options.
        frame_depth = options.pop('depth', 3)
        return bcolz.eval(expression, depth=frame_depth,
                          user_dict=self.cols, **options)
Esempio n. 6
0
 def test00b(self):
     """Compare ``eval("a>0")`` with ``bcolz.eval("b>0")`` using the numpy out flavor."""
     nelem = np.prod(self.shape)
     # `a` and `b` are picked up by name from the expression strings.
     a = np.arange(nelem).reshape(self.shape)
     b = bcolz.arange(nelem).reshape(self.shape)
     expected = eval("a>0")
     actual = bcolz.eval("b>0", out_flavor='numpy')
     assert_array_equal(expected, actual, "Arrays are not equal")
 def _filtered_index(self, instrument):
     # TODO 确认是否跳过日内涨跌停
     if instrument not in self._index_skip_suspending:
         carray = self._open_minute_file("close", instrument)
         sub_index = bcolz.eval("carray != 0", vm="numexpr")
         index = self._minute_index[:len(sub_index)][sub_index[:]]
         self._index_skip_suspending[instrument] = index
     return self._index_skip_suspending[instrument]
Esempio n. 8
0
 def _calc_group_index(eval_list, factor_set, vm=None):
     factorize_list = []
     for eval_node in eval_list:
         # calculate the cartesian group index for each row
         factor_input = bcolz.eval(eval_node[0], user_dict=factor_set, vm=vm)
         # now factorize the unique groupby combinations
         sub_factor_carray, sub_values = ctable_ext.factorize(factor_input)
         factorize_list.append((sub_factor_carray, sub_values))
     return factorize_list
Esempio n. 9
0
def compute_bcolz(sexpr, clevel, vm):
    """Time one `bcolz.eval` of `sexpr` and print the elapsed seconds.

    `clevel` is passed to `bcolz.cparams` for the output, and `vm` is
    forwarded to `bcolz.eval` as the virtual machine to use.
    """
    # Uncomment the next for disabling threading
    # bcolz.set_nthreads(1)
    #bcolz.blosc_set_nthreads(1)
    print("*** bcolz (using compression clevel = %d):" % clevel)
    # Bind the module-level `cx` to the local name `x`; presumably `sexpr`
    # refers to `x` and bcolz.eval finds it in this frame -- confirm.
    x = cx  # comment this for using numpy arrays in inputs
    t0 = time()
    # The result is kept in a local so it is not released inside the timed span.
    cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, time() - t0,))
Esempio n. 10
0
    def replace_value(self, old, new):
        """
        Overwrite every element of my carray that equals `old` with `new`.

        `new` must be exactly an int, a float or a bool; otherwise a
        ValueError is raised.
        """
        if type(new) not in (int, float, bool):
            raise ValueError("new must be int, float or bool: in %s()" % (method_name()))

        # `ca` is referenced by name inside the expression built below.
        ca = self.carray
        matches = bcolz.eval('ca == ' + str(old))
        self.carray[matches] = new
Esempio n. 11
0
def compute_carray(sexpr, clevel, vm):
    """Time one `bcolz.eval` of `sexpr`; print elapsed seconds and output cratio.

    `clevel` is passed to `bcolz.cparams` for the output, and `vm` is
    forwarded to `bcolz.eval` as the virtual machine to use.
    """
    # Uncomment the next for disabling threading
    # Maybe due to some contention between Numexpr and Blosc?
    # bcolz.set_nthreads(bcolz.ncores//2)
    print("*** carray (using compression clevel = %d):" % clevel)
    if clevel > 0:
        # Only with compression enabled are the module-level cx/cy/cz bound
        # to the local names x/y/z; presumably `sexpr` refers to x, y, z and
        # otherwise resolves them outside this function -- confirm.
        x, y, z = cx, cy, cz
    t0 = time()
    cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, time() - t0,), end="")
    # Compression ratio of the result: uncompressed bytes over compressed bytes.
    print(", cratio (out): %.1f" % (cout.nbytes / float(cout.cbytes)))
Esempio n. 12
0
def compute_bcolz(sexpr, clevel, vm):
    """Time one `bcolz.eval` of `sexpr` and print the elapsed seconds.

    `clevel` is passed to `bcolz.cparams` for the output, and `vm` is
    forwarded to `bcolz.eval` as the virtual machine to use.
    """
    # Uncomment the next for disabling threading
    # bcolz.set_nthreads(1)
    #bcolz.blosc_set_nthreads(1)
    print("*** bcolz (using compression clevel = %d):" % clevel)
    # Bind the module-level `cx` to the local name `x`; presumably `sexpr`
    # refers to `x` and bcolz.eval finds it in this frame -- confirm.
    x = cx  # comment this for using numpy arrays in inputs
    t0 = time()
    # The result is kept in a local so it is not released inside the timed span.
    cout = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    print("Time for bcolz.eval (%s) --> %.3f" % (
        vm,
        time() - t0,
    ))
Esempio n. 13
0
    def replace_list(self, old, new):
        """
        Replace each ith value in old by each ith value in new in my values.

        Every item of `new` must be exactly an int, a float or a bool,
        otherwise a ValueError is raised.  An empty `old` is a no-op.
        """
        # TODO rewrite this method: it takes very long when there are many replacements to perform
        # for instance, replacing CategoryID in userinfo in avito dataset is very sloooow

        if any(type(x) not in (int, float, bool) for x in new):
            raise ValueError("Values in new must be int, float or bool: in %s()" % (method_name()))

        terms = ["ca == " + str(o) for o in old]
        if not terms:
            # Nothing to replace.  Without this guard the condition below
            # would degenerate to the expression "()".
            return

        # `ca` is referenced by name inside the condition string.
        ca = self.carray
        cond = "(" + ") | (".join(terms) + ")"
        mask = bcolz.eval(cond)
        # Positions whose current value is one of the `old` values.
        rep = list(mask.wheretrue())
        if rep:
            pairs = dict(zip(old, new))
            self.carray[rep] = [pairs[o] for o in self.carray[rep]]
Esempio n. 14
0
    rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(rootdir)  # folder should be emtpy
    ct = bz.fromiter(generator, dtype='i4,i4', count=N, rootdir=rootdir)
    ct = bq.open(rootdir)
    # print ct
    ct.flush()
    ct = bq.open(rootdir)

    yield ct

    shutil.rmtree(rootdir)


def gen(N):
    """Yield `N` random ``(counter, value)`` pairs.

    The counter starts at 0 and grows by 0 or 1 (chosen at random) before
    each pair is produced; the value is a random integer in [0, 20].
    """
    counter = 0
    for _ in range(N):
        counter += random.randint(0, 1)
        yield counter, random.randint(0, 20)


# Script entry point: build an on-disk table from random data and time
# `is_in_ordered_subgroups` on it.
if __name__ == '__main__':
    N = int(1e5)  # number of generated rows
    g = gen(N)

    with on_disk_data_cleaner(g) as ct:
        # `f1` is referenced by name inside the expression string below.
        f1 = ct['f1']
        barr = bz.eval("f1 == 1")  # filter
        # Only the subgroup check itself runs inside the `ctime` block.
        with ctime('is_in_ordered_subgroups'):
            result = ct.is_in_ordered_subgroups(basket_col='f0', bool_arr=barr)
Esempio n. 15
0
def eval(expr):
    """Evaluate `expr` with `bcolz.eval` and return its result.

    Defining this name at module level shadows the builtin `eval` there.
    """
    # NOTE(review): names used in `expr` are presumably resolved by
    # bcolz.eval relative to this wrapper's frame rather than the
    # caller's -- confirm before relying on caller-local variables.
    return bcolz.eval(expr)
Esempio n. 16
0
def filter(db, query, user_dict):
    """Evaluate a genotype `query` against the carrays of `db`.

    `user_dict` is updated in place with ``where`` (``np.where``) and with
    one ``<gt_col>__<sample>`` entry per column/sample pair mentioned in
    the rewritten query; it is then used as the evaluation namespace.

    Returns ``[]`` when the query is "False"/None/False, when the result
    has no ``shape`` or when nothing matches; ``None`` when the query uses
    ``any(``, ``all(`` or a single non-leading ``sum(`` (not handled here);
    otherwise an array of 1-based variant ids.

    Raises NoGTIndexException when `load` yields no usable carrays.
    """
    # these should be translated to a bunch or or/and statements within gemini
    # so they are supported, but must be translated before getting here.
    if query == "False" or query is None or query is False:
        return []
    if "any(" in query or "all(" in query or \
       ("sum(" in query and not query.startswith("sum(") and query.count("sum(") == 1):
        return None
    user_dict['where'] = np.where

    if query.startswith("not "):
        # a leading "not " is rewritten to "~" for the expression evaluator
        query = "~" + query[4:]
    if query.startswith("sum("):
        assert query[-1].isdigit()
        # "sum(<expr>) <cmp>" becomes "(<expr>) <cmp>"
        query, sum_cmp = query[4:].rsplit(")", 1)
        query = "(%s) %s" % (query, sum_cmp)

    query = query.replace(".", "__")
    query = " & ".join("(%s)" % token for token in query.split(" and "))
    query = " | ".join("(%s)" % token for token in query.split(" or "))

    conn, metadata = database.get_session_metadata(db)
    samples = get_samples(metadata)
    # convert gt_col[index] to gt_col__sample_name
    # (raw string: "\[" and "\d" are regex escapes, not string escapes)
    patt = r"(%s)\[(\d+)\]" % "|".join((g[0] for g in gt_cols_types))

    def subfn(x):
        """Turn gt_types[1] into gt_types__sample"""
        field, idx = x.groups()
        return "%s__%s" % (field, fix_sample_name(samples[int(idx)]))

    query = re.sub(patt, subfn, query)
    if os.environ.get('GEMINI_DEBUG') == 'TRUE':
        sys.stderr.write(query[:250] + "...\n")
    carrays = load(db, query=query)

    if len(carrays) == 0 or max(len(carrays[c]) for c in carrays) == 0 or \
       any(not any(carrays[c]) for c in carrays):
        # need this 2nd check above because of the place-holders in load()
        raise NoGTIndexException

    # loop through and create a cache of "$gt__$sample"
    for gt_col in carrays:
        if gt_col not in query:
            continue
        for i, sample_array in enumerate(carrays[gt_col]):
            sample = fix_sample_name(samples[i])
            if sample not in query:
                continue
            user_dict["%s__%s" % (gt_col, sample)] = sample_array

    # had to special-case count. it won't quite be as efficient
    if "|count|" in query:
        tokens = query[2:-2].split("|count|")
        icmp = tokens[-1]
        # a list of carrays, so not in memory.
        res = [bcolz.eval(tok, user_dict=user_dict) for tok in tokens[:-1]]
        # in memory after this, but just a single axis array.
        res = np.sum(res, axis=0)
        # `res` is referenced by name inside the expression string.
        res = ne.evaluate('res%s' % icmp)
    else:
        res = bcolz.eval(query, user_dict=user_dict)

    try:
        if res.shape[0] == 1 and len(res.shape) > 1:
            res = res[0]
    except AttributeError:
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []
Esempio n. 17
0
    def make_group_index(self, groupby_cols, bool_arr):
        '''Create unique groups for groupby loop

            The groupby columns are factorized through
            `self.factorize_groupby_cols`; temporary carrays are written
            under directories obtained from `self.create_tmp_rootdir()`.

            Args:
                groupby_cols: the columns to group by (may be empty)
                bool_arr: optional boolean filter, or None for no filtering

            Returns:
                carray: (carray_factor)
                int: (nr_groups) the number of resulting groups
                int: (skip_key)
        '''
        factor_list, values_list = self.factorize_groupby_cols(groupby_cols)

        # create unique groups for groupby loop
        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total
            tmp_rootdir = self.create_tmp_rootdir()
            carray_factor = bcolz.zeros(len(self), dtype='int64', rootdir=tmp_rootdir, mode='w')
            carray_values = ['Total']
        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            carray_factor = factor_list[0]
            carray_values = values_list[0]
        else:
            # multi column groupby
            # first combine the factorized columns to single values
            if self.group_cache_valid(col_list=groupby_cols):
                # there is a group cache that we can use
                col_rootdir = os.path.join(self.rootdir, self.create_group_base_name(groupby_cols))
                col_factor_rootdir = col_rootdir + '.factor'
                carray_factor = bcolz.carray(rootdir=col_factor_rootdir)
                col_values_rootdir = col_rootdir + '.values'
                carray_values = bcolz.carray(rootdir=col_values_rootdir)
            else:
                # create a brand new groupby col combination
                carray_factor, carray_values = \
                    self.create_group_column_factor(factor_list, groupby_cols, cache=self.auto_cache)

        nr_groups = len(carray_values)
        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            # (factor + 1) * bool - 1 keeps the factor where bool is true
            # and yields -1 where it is false
            tmp_rootdir = self.create_tmp_rootdir()
            carray_factor = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': carray_factor, 'bool': bool_arr}, rootdir=tmp_rootdir, mode='w')
            # now check how many unique values there are left
            tmp_rootdir = self.create_tmp_rootdir()
            labels = bcolz.carray([], dtype='int64', expectedlen=len(carray_factor), rootdir=tmp_rootdir, mode='w')
            carray_factor, values = ctable_ext.factorize(carray_factor, labels)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                # the key that stands for the filtered-out (-1) rows
                skip_key = filter_check[0]
            # the new nr of groups depends on the outcome after filtering
            nr_groups = len(values)

        # using nr_groups as a total length might be one one off due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally

        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return carray_factor, nr_groups, skip_key
Esempio n. 18
0
    def make_group_index(self, factor_list, values_list, groupby_cols,
                         array_length, bool_arr):
        """Create unique groups for groupby loop.

        Args:
            factor_list: factorized columns, one per groupby column
            values_list: the factorization values belonging to each
                entry of `factor_list`
            groupby_cols: names of the groupby columns, used as variable
                names in the combined index expression
            array_length: number of rows; only used when there are no
                groupby columns
            bool_arr: optional boolean filter, or None for no filtering

        Returns:
            tuple: (factor_carray, nr_groups, skip_key)
        """
        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total (index 0/zero)
            factor_carray = bcolz.zeros(array_length, dtype='int64')
            values = ['Total']

        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            factor_carray = factor_list[0]
            values = values_list[0]

        else:
            # multi column groupby
            # nb: this might also be cached in the future

            # first combine the factorized columns to single values
            factor_set = dict(zip(groupby_cols, factor_list))

            # create a numexpr expression that calculates the place on
            # a cartesian join index: each column is weighted by the
            # product of the sizes of the columns that follow it
            terms = []
            multiplier = 1
            for col, col_values in zip(reversed(groupby_cols),
                                       reversed(values_list)):
                terms.append(str(multiplier) + '*' + col)
                multiplier *= len(col_values)
            eval_str = ' + '.join(terms)

            # calculate the cartesian group index for each row
            factor_input = bcolz.eval(eval_str, user_dict=factor_set)

            # now factorize the unique groupby combinations
            factor_carray, values = ctable_ext.factorize(factor_input)

        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            factor_carray = bcolz.eval('(factor + 1) * bool - 1',
                                       user_dict={
                                           'factor': factor_carray,
                                           'bool': bool_arr
                                       })
            # now check how many unique values there are left
            factor_carray, values = ctable_ext.factorize(factor_carray)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            # .items() works on both Python 2 and 3, unlike .iteritems()
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                skip_key = filter_check[0]

        # using nr_groups as a total length might be one one off due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally
        nr_groups = len(values)
        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return factor_carray, nr_groups, skip_key
Esempio n. 19
0
def filter(db, query, user_dict):
    """Evaluate a genotype `query` against the carrays loaded from `db`.

    `user_dict` is updated in place with ``where`` (``np.where``) and with
    one ``<gt_col>__<sample>`` entry per loaded column/sample pair; it is
    then used as the evaluation namespace.

    Returns ``None`` when the query uses ``any(``, ``all(`` or a single
    non-leading ``sum(`` (not handled here), ``[]`` when nothing matches,
    otherwise an array of 1-based variant ids.
    """
    # these should be translated to a bunch or or/and statements within gemini
    # so they are supported, but must be translated before getting here.
    if "any(" in query or "all(" in query or \
       ("sum(" in query and not query.startswith("sum(") and query.count("sum(") == 1):
        return None
    user_dict['where'] = np.where

    carrays = load(db)
    if query.startswith("not "):
        # a leading "not " is rewritten to "~" for the expression evaluator
        query = "~" + query[4:]
    if query.startswith("sum("):
        assert query[-1].isdigit()
        # "sum(<expr>) <cmp>" becomes "(<expr>) <cmp>"
        query, sum_cmp = query[4:].rsplit(")", 1)
        query = "(%s) %s" % (query, sum_cmp)

    query = query.replace(".", "__")
    query = " & ".join("(%s)" % token for token in query.split(" and "))
    query = " | ".join("(%s)" % token for token in query.split(" or "))

    conn = sqlite3.connect(db)
    cur = conn.cursor()
    samples = get_samples(cur)
    # convert gt_col[index] to gt_col__sample_name
    # (raw string: "\[" and "\d" are regex escapes, not string escapes)
    patt = r"(%s)\[(\d+)\]" % "|".join(carrays.keys())

    def fix_sample_name(s):
        return s.replace("-", "_").replace(" ", "_")

    def subfn(x):
        """Turn gt_types[1] into gt_types__sample"""
        field, idx = x.groups()
        return "%s__%s" % (field, fix_sample_name(samples[int(idx)]))

    query = re.sub(patt, subfn, query)
    if os.environ.get('GEMINI_DEBUG') == 'TRUE':
        # write() instead of the Python 2 only `print >>` statement
        sys.stderr.write(query + "\n")

    # loop through and create a cache of "$gt__$sample"
    for gt_col in carrays:
        # if not gt_col in query: continue
        for i, sample_array in enumerate(carrays[gt_col]):
            sample = fix_sample_name(samples[i])
            # if not sample in query: continue
            user_dict["%s__%s" % (gt_col, sample)] = sample_array

    # had to special-case count. it won't quite be as efficient
    if "|count|" in query:
        tokens = query[2:-2].split("|count|")
        icmp = tokens[-1]
        # a list of carrays, so not in memory.
        res = [bcolz.eval(tok, user_dict=user_dict) for tok in tokens[:-1]]
        # in memory after this, but just a single axis array.
        res = np.sum(res, axis=0)
        # `res` is referenced by name inside the expression string.
        res = ne.evaluate('res%s' % icmp)
    else:
        res = bcolz.eval(query, user_dict=user_dict)

    if res.shape[0] == 1 and len(res.shape) > 1:
        res = res[0]
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []
Esempio n. 20
0
    def test_pos_basket_01(self):
        """test_pos_basket_01:

             <----- data ----->
            | Basket | Product | Filter | Result |
            |--------|---------|--------|--------|
            | 1      | A       | 0      | 1      |
            | 1      | B       | 1      | 1      |
            | 1      | C       | 0      | 1      |
            | 2      | A       | 0      | 1      |
            | 2      | B       | 1      | 1      |
            | 3      | A       | 0      | 0      |
            | 4      | A       | 0      | 0      |
            | 4      | C       | 0      | 0      |
            | 5      | B       | 1      | 1      |
            | 6      | A       | 0      | 1      |
            | 6      | B       | 1      | 1      |
            | 6      | C       | 0      | 1      |
            | 7      | B       | 1      | 1      |
            | 7      | B       | 1      | 1      |
            | 7      | B       | 1      | 1      |
            | 8      | B       | 1      | 1      |
            | 9      | C       | 0      | 0      |

        Products are stored as codes (A=0, B=1, C=2); the filter column is
        ``product == 1``.
        """

        # -- Data -- one (basket, product) pair per table row
        baskets = [1, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 6, 7, 7, 7, 8, 9]
        products = [0, 1, 2, 0, 1, 0, 0, 2, 1, 0, 1, 2, 1, 1, 1, 1, 2]
        data = np.array(list(zip(baskets, products)), dtype='i8,i8')

        # -- Expectations -- the Filter and Result columns of the table
        expected_filter = [bool(flag) for flag in
                           (0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0)]
        expected_result = [bool(flag) for flag in
                           (1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0)]

        # -- Bcolz --
        with self.on_disk_data_cleaner(data) as ct:
            # `f1` is referenced by name inside the expression string.
            f1 = ct['f1']
            barr = bz.eval("f1 == 1")  # filter
            result = ct.is_in_ordered_subgroups(
                basket_col='f0', bool_arr=barr, _max_len_subgroup=1)

        assert_list_equal(list(barr[:]), expected_filter)
        assert_list_equal(list(result[:]), expected_result)
Esempio n. 21
0
    def make_group_index(self, factor_list, values_list, groupby_cols,
                         array_length, bool_arr):
        """Create unique groups for groupby loop.

        Args:
            factor_list: factorized columns, one per groupby column
            values_list: the factorization values belonging to each
                entry of `factor_list`
            groupby_cols: names of the groupby columns, used as variable
                names in the combined index expression
            array_length: number of rows; only used when there are no
                groupby columns
            bool_arr: optional boolean filter, or None for no filtering

        Returns:
            tuple: (factor_carray, nr_groups, skip_key)
        """
        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total (index 0/zero)
            factor_carray = bcolz.zeros(array_length, dtype='int64')
            values = ['Total']

        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            factor_carray = factor_list[0]
            values = values_list[0]

        else:
            # multi column groupby
            # nb: this might also be cached in the future

            # first combine the factorized columns to single values
            factor_set = dict(zip(groupby_cols, factor_list))

            # create a numexpr expression that calculates the place on
            # a cartesian join index: each column is weighted by the
            # product of the sizes of the columns that follow it
            terms = []
            multiplier = 1
            for col, col_values in zip(reversed(groupby_cols),
                                       reversed(values_list)):
                terms.append(str(multiplier) + '*' + col)
                multiplier *= len(col_values)
            eval_str = ' + '.join(terms)

            # calculate the cartesian group index for each row
            factor_input = bcolz.eval(eval_str, user_dict=factor_set)

            # now factorize the unique groupby combinations
            factor_carray, values = ctable_ext.factorize(factor_input)

        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            factor_carray = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': factor_carray, 'bool': bool_arr})
            # now check how many unique values there are left
            factor_carray, values = ctable_ext.factorize(factor_carray)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            # .items() works on both Python 2 and 3, unlike .iteritems()
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                skip_key = filter_check[0]

        # using nr_groups as a total length might be one one off due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally
        nr_groups = len(values)
        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return factor_carray, nr_groups, skip_key
Esempio n. 22
0
    def make_group_index(self, factor_list, values_list, groupby_cols,
                         array_length, bool_arr):
        '''Create unique groups for groupby loop

            Args:
                factor_list: factorized columns, one per groupby column
                values_list: the factorization values belonging to each
                    entry of factor_list
                groupby_cols: names of the groupby columns, used as
                    variable names in the index expressions
                array_length: number of rows; only used when there are no
                    groupby columns
                bool_arr: optional boolean filter, or None

            Returns:
                carray: (factor_carray)
                int: (nr_groups) the number of resulting groups
                int: (skip_key)
        '''

        def _create_eval_str(groupby_cols, values_list, check_overflow=True):
            """Build the cartesian-index expression(s) for the given columns.

            Returns a list of (expression, columns) pairs.  With
            check_overflow enabled a new expression is started whenever
            the running product of value counts would exceed 4294967295.
            """

            eval_list = []
            eval_str = ''
            col_list = []
            previous_value = 1
            # Sort evaluated columns by length
            col_len_list = [(col, values) for col, values in zip(groupby_cols, values_list)]
            col_len_list.sort(key=lambda x: len(x[1]))
            groupby_cols = [col for col, _ in col_len_list]
            values_list = [values for _, values in col_len_list]

            for col, values \
                    in zip(groupby_cols, values_list):

                # check for overflow
                if check_overflow:
                    # 4294967295 == 2**32 - 1
                    if previous_value * len(values) > 4294967295:
                        # close the current expression and start a new one
                        eval_list.append((eval_str, col_list))
                        # reset
                        eval_str = ''
                        col_list = []
                        previous_value = 1

                if eval_str:
                    eval_str += ' + '
                else:
                    # every expression starts at -2147483648 (== -2**31);
                    # presumably this shifts the index into the signed
                    # 32-bit range -- TODO confirm
                    eval_str += '-2147483648 + '

                eval_str += str(previous_value) + '*' + col
                col_list.append(col)
                previous_value *= len(values)

            eval_list.append((eval_str, col_list))
            return eval_list

        def _calc_group_index(eval_list, factor_set, vm=None):
            """Evaluate and factorize each expression in eval_list.

            Returns one (factor_carray, values) pair per expression.
            """
            factorize_list = []
            for eval_node in eval_list:
                # calculate the cartesian group index for each row
                factor_input = bcolz.eval(eval_node[0], user_dict=factor_set, vm=vm)
                # now factorize the unique groupby combinations
                sub_factor_carray, sub_values = ctable_ext.factorize(factor_input)
                factorize_list.append((sub_factor_carray, sub_values))
            return factorize_list

        def _is_reducible(eval_list):
            """Return True if any expression combines more than one column."""
            for eval_node in eval_list:
                if len(eval_node[1]) > 1:
                    return True
            return False

        def calc_index(groupby_cols, values_list, factor_set, vm=None):
            """Collapse all groupby columns into one (factor_carray, values) pair."""
            # Initialize eval list
            eval_list = _create_eval_str(groupby_cols, values_list)

            # Reduce expression as possible
            while _is_reducible(eval_list):
                del groupby_cols
                del values_list
                # each pass replaces the columns by the factorized results
                # of the current expressions, named g0, g1, ...
                factorize_list = _calc_group_index(eval_list, factor_set)
                factor_set = {'g' + str(i): x[0] for i, x in enumerate(factorize_list)}
                groupby_cols = ['g' + str(i) for i, x in enumerate(factorize_list)]
                values_list = [x[1] for i, x in enumerate(factorize_list)]
                eval_list = _create_eval_str(groupby_cols, values_list)
            # If we have multiple expressions that cannot be reduced anymore, rewrite as a single one and use Python vm
            if len(eval_list) > 1:
                eval_list = _create_eval_str(groupby_cols, values_list, check_overflow=False)
                vm = 'python'

            del groupby_cols
            del values_list

            # Now we have a single expression, factorize it
            return _calc_group_index(eval_list, factor_set, vm=vm)[0]

        # create unique groups for groupby loop
        if len(factor_list) == 0:
            # no columns to groupby over, so directly aggregate the measure
            # columns to 1 total (index 0/zero)
            factor_carray = bcolz.zeros(array_length, dtype='int64')
            values = ['Total']
        elif len(factor_list) == 1:
            # single column groupby, the groupby output column
            # here is 1:1 to the values
            factor_carray = factor_list[0]
            values = values_list[0]
        else:
            # multi column groupby
            # nb: this might also be cached in the future
            # first combine the factorized columns to single values
            factor_set = {x: y for x, y in zip(groupby_cols, factor_list)}
            # create a numexpr expression that calculates the place on
            # a cartesian join index
            factor_carray, values = calc_index(groupby_cols, values_list, factor_set)

        skip_key = None

        if bool_arr is not None:
            # make all non relevant combinations -1
            # (factor + 1) * bool - 1 keeps the factor where bool is true
            # and yields -1 where it is false
            factor_carray = bcolz.eval(
                '(factor + 1) * bool - 1',
                user_dict={'factor': factor_carray, 'bool': bool_arr})
            # now check how many unique values there are left
            factor_carray, values = ctable_ext.factorize(factor_carray)
            # values might contain one value too much (-1) (no direct lookup
            # possible because values is a reversed dict)
            filter_check = \
                [key for key, value in values.items() if value == -1]
            if filter_check:
                # the key that stands for the filtered-out (-1) rows
                skip_key = filter_check[0]

        # using nr_groups as a total length might be one one off due to the skip_key
        # (skipping a row in aggregation)
        # but that is okay normally
        nr_groups = len(values)
        if skip_key is None:
            # if we shouldn't skip a row, set it at the first row after the total number of groups
            skip_key = nr_groups

        return factor_carray, nr_groups, skip_key
Esempio n. 23
0
    def test_pos_basket_01(self):
        """test_pos_basket_01:

             <----- data ----->
            | Basket | Product | Filter | Result |
            |--------|---------|--------|--------|
            | 1      | A       | 0      | 1      |
            | 1      | B       | 1      | 1      |
            | 1      | C       | 0      | 1      |
            | 2      | A       | 0      | 1      |
            | 2      | B       | 1      | 1      |
            | 3      | A       | 0      | 0      |
            | 4      | A       | 0      | 0      |
            | 4      | C       | 0      | 0      |
            | 5      | B       | 1      | 1      |
            | 6      | A       | 0      | 1      |
            | 6      | B       | 1      | 1      |
            | 6      | C       | 0      | 1      |
            | 7      | B       | 1      | 1      |
            | 7      | B       | 1      | 1      |
            | 7      | B       | 1      | 1      |
            | 8      | B       | 1      | 1      |
            | 9      | C       | 0      | 0      |

        Each data row is (basket, product code) with A=0, B=1, C=2; the
        Filter column marks the rows whose product is B.
        """

        # -- Data --
        rows = [
            (1, 0), (1, 1), (1, 2),
            (2, 0), (2, 1),
            (3, 0),
            (4, 0), (4, 2),
            (5, 1),
            (6, 0), (6, 1), (6, 2),
            (7, 1), (7, 1), (7, 1),
            (8, 1),
            (9, 2),
        ]
        data = np.array(rows, dtype='i8,i8')

        # Filter column: product code equals 1 (B).
        expected_filter = [product == 1 for _, product in rows]
        # Result column: baskets 1, 2 and 5-8 are flagged, 3, 4 and 9 are not.
        expected_result = [True] * 5 + [False] * 3 + [True] * 8 + [False]

        # -- Bcolz --
        with self.on_disk_data_cleaner(data) as ct:
            # `f1` is referenced by name inside the expression string.
            f1 = ct['f1']
            barr = bz.eval("f1 == 1")  # filter
            result = ct.is_in_ordered_subgroups(
                basket_col='f0', bool_arr=barr, _max_len_subgroup=1)

        assert_list_equal(list(barr[:]), expected_filter)
        assert_list_equal(list(result[:]), expected_result)
Esempio n. 24
0
# Time re-opening the carray stored under the 'myarray' directory.
t0 = time()
c = bcolz.carray(rootdir='myarray')
print("time open (disk) ->", round(time() - t0, 3))
#print "meta (disk):", c.read_meta()
print("data (disk):", repr(c))

# The sections below time three ways of summing `ac` (defined earlier in
# the script; labelled "memory") and `c` (labelled "disk").

# 1) builtin sum(), which iterates over the container
t0 = time()
print(sum(ac))
print("time sum (memory, iter) ->", round(time() - t0, 3))

t0 = time()
print(sum(c))
print("time sum (disk, iter) ->", round(time() - t0, 3))

# 2) bcolz.eval with a 'sum(...)' expression; `ac` and `c` are referenced
#    by name inside the expression strings
t0 = time()
print(bcolz.eval('sum(ac)'))
print("time sum (memory, eval) ->", round(time() - t0, 3))

t0 = time()
print(bcolz.eval('sum(c)'))
print("time sum (disk, eval) ->", round(time() - t0, 3))

# 3) the container's own sum() method
t0 = time()
print(ac.sum())
print("time sum (memory, method) ->", round(time() - t0, 3))

t0 = time()
print(c.sum())
print("time sum (disk, method) ->", round(time() - t0, 3))

t0 = time()
Esempio n. 25
0
def filter(db, query, user_dict):
    """Evaluate a genotype `query` against the carrays loaded for `db`.

    The query is rewritten into an expression string (``and``/``or`` become
    ``&``/``|``, ``.`` becomes ``__``, ``col[idx]`` becomes
    ``col__samplename``) and evaluated with ``bcolz.eval``.

    Parameters
    ----------
    db : object
        Passed unchanged to ``database.get_session_metadata`` and ``load``.
    query : str, None or False
        The filter expression.
    user_dict : dict
        Namespace handed to ``bcolz.eval``.  It is modified in place: the
        key ``'where'`` is set to ``np.where`` and one ``"<col>__<sample>"``
        entry is added for every column/sample pair named in the query.

    Returns
    -------
    list, None or numpy array
        ``[]`` when `query` is ``"False"``, ``None`` or ``False``, when the
        evaluated result has no ``shape`` attribute, or when nothing matches.
        ``None`` when the query contains ``any(``, ``all(`` or a single
        non-leading ``sum(`` (not handled here).
        Otherwise the matching variant ids, 1-based.

    Raises
    ------
    NoGTIndexException
        When ``load`` returns nothing, only empty arrays, or a column whose
        arrays are all falsy.
    """
    if query == "False" or query is None or query is False:
        return []
    # these should be translated to a bunch or or/and statements within gemini
    # so they are supported, but must be translated before getting here.
    if "any(" in query or "all(" in query or \
       ("sum(" in query and not query.startswith("sum(") and query.count("sum(") == 1):
        return None
    user_dict['where'] = np.where

    if query.startswith("not "):
        # "~" is not to numexpr.
        query = "~" + query[4:]
    if query.startswith("sum("):
        # "sum(<expr>) <cmp>" is rewritten to "(<expr>) <cmp>"; the comparison
        # is expected to end in a digit.
        assert query[-1].isdigit()
        query, sum_cmp = query[4:].rsplit(")", 1)
        query = "(%s) %s" % (query, sum_cmp)

    query = query.replace(".", "__")
    query = " & ".join("(%s)" % token for token in query.split(" and "))
    query = " | ".join("(%s)" % token for token in query.split(" or "))

    conn, metadata = database.get_session_metadata(db)
    samples = get_samples(metadata)
    # convert gt_col[index] to gt_col__sample_name
    # Raw string: "\[" and "\d" are regex escapes, not Python string escapes.
    patt = r"(%s)\[(\d+)\]" % "|".join(g[0] for g in gt_cols_types)

    def subfn(x):
        """Turn gt_types[1] into gt_types__sample"""
        field, idx = x.groups()
        return "%s__%s" % (field, fix_sample_name(samples[int(idx)]))

    query = re.sub(patt, subfn, query)
    if os.environ.get('GEMINI_DEBUG') == 'TRUE':
        sys.stderr.write(query[:250] + "...\n")
    carrays = load(db, query=query)

    if len(carrays) == 0 or max(len(carrays[c]) for c in carrays) == 0 or \
       any(not any(carrays[c]) for c in carrays):
        # need this 2nd check above because of the place-holders in load()
        raise NoGTIndexException

    # loop through and create a cache of "$gt__$sample"
    for gt_col in carrays:
        if gt_col not in query:
            continue
        for i, sample_array in enumerate(carrays[gt_col]):
            sample = fix_sample_name(samples[i])
            if sample not in query:
                continue
            user_dict["%s__%s" % (gt_col, sample)] = sample_array

    # had to special-case count. it won't quite be as efficient
    if "|count|" in query:
        tokens = query[2:-2].split("|count|")
        icmp = tokens[-1]
        # a list of carrays, so not in memory.
        res = [bcolz.eval(tok, user_dict=user_dict) for tok in tokens[:-1]]
        # in memory after this, but just a single axis array.
        res = np.sum(res, axis=0)
        # The expression string names the local `res`, so that name must stay.
        res = ne.evaluate('res%s' % icmp)
    else:
        res = bcolz.eval(query, user_dict=user_dict)

    try:
        if res.shape[0] == 1 and len(res.shape) > 1:
            res = res[0]
    except AttributeError:
        # Result has no `shape` attribute: treated as "no matches".
        return []
    variant_ids, = np.where(res)
    #variant_ids = np.array(list(bcolz.eval(query, user_dict=user_dict,
    #    vm="numexpr").wheretrue()))
    # variant ids are 1-based.
    if len(variant_ids) > 0:
        return 1 + variant_ids
    else:
        return []
Esempio n. 26
0
def on_disk_data_cleaner(generator):
    """Build an on-disk table from `generator`, yield it, then delete it.

    This is a single-``yield`` generator: the code before the yield creates
    the table in a fresh temporary directory, the code after it removes that
    directory.  The removal now runs in a ``finally`` clause so the directory
    is also deleted when the consumer raises at the yield point.

    NOTE(review): the ``__main__`` block uses this in a ``with`` statement,
    but no ``contextmanager`` decorator is visible on this definition in the
    excerpt -- confirm it is applied.

    Parameters
    ----------
    generator : iterable
        Rows passed to ``bz.fromiter`` with ``dtype='i4,i4'``.  The number of
        rows consumed is the module-level ``N`` (``count=N``).

    Yields
    ------
    The table re-opened from disk with ``bq.open(rootdir)``.
    """
    rootdir = tempfile.mkdtemp(prefix='bcolz-')
    # mkdtemp is only used to reserve a unique name; the directory itself is
    # removed again so that the table can be created at that path.
    os.rmdir(rootdir)  # folder should be empty
    ct = bz.fromiter(generator, dtype='i4,i4', count=N, rootdir=rootdir)
    ct = bq.open(rootdir)
    # print ct
    ct.flush()
    ct = bq.open(rootdir)

    try:
        yield ct
    finally:
        # Always clean up the on-disk data, even if the consumer failed.
        shutil.rmtree(rootdir)


def gen(N):
    """Yield `N` random ``(group, value)`` pairs.

    ``group`` starts at 0 and, before every yield, is advanced by a random
    0 or 1, so it never decreases and rises by at most one per row.
    ``value`` is an independent random integer in the range 0..20 inclusive.
    """
    group = 0
    for _ in range(N):
        # randint(0, 1) is either 0 (stay in the group) or 1 (next group).
        group += random.randint(0, 1)
        yield group, random.randint(0, 20)


if __name__ == '__main__':
    # Benchmark driver: build a random on-disk table of N rows and time
    # ct.is_in_ordered_subgroups on it.
    # N is also read as a module-level global by on_disk_data_cleaner
    # (it passes count=N), so it must be set before that call.
    N = int(1e5)
    g = gen(N)

    with on_disk_data_cleaner(g) as ct:
        # `f1` is not used directly; it is the name referenced inside the
        # expression string below.  Presumably bz.eval looks it up in the
        # calling scope -- confirm.
        f1 = ct['f1']
        barr = bz.eval("f1 == 1")  # filter
        # Only the is_in_ordered_subgroups call sits inside the ctime block.
        with ctime('is_in_ordered_subgroups'):
            result = ct.is_in_ordered_subgroups(basket_col='f0', bool_arr=barr)