예제 #1
0
    def test_groupby_01(self):
        """
        test_groupby_01: groupby on a single column must create one group
        per distinct key (single-row groups allowed).
        """
        random.seed(1)

        key_cols = ['f0']
        key_func = lambda x: x[0]
        # bare column names in the agg list, so the default `sum` applies
        measure_cols = ['f4', 'f5', 'f6']
        n = 2000

        # -- Data --
        data = np.fromiter(self.gen_almost_unique_row(n),
                           dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # ctable wants a non-existing directory
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(key_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(key_cols, measure_cols)
        print(result_bcolz)

        # Reference result built with itertools
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_func)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        assert_list_equal(list(result_bcolz['f0']), uniquekeys)
예제 #2
0
파일: toplevel.py 프로젝트: useric/bquery
def open(rootdir, mode='a'):
    # Mirrors bcolz's own toplevel.open:
    # https://github.com/Blosc/bcolz/blob/master/bcolz/toplevel.py#L104-L132
    """
    open(rootdir, mode='a')

    Open a disk-based carray/ctable.
    Use this to open existing bcolz objects as bquery objects so queries
    can be run against them.

    Parameters
    ----------
    rootdir : pathname (string)
        The directory hosting the carray/ctable object.
    mode : the open mode (string)
        Specifies the mode in which the object is opened.  The supported
        values are:

          * 'r' for read-only
          * 'w' for emptying the previous underlying data
          * 'a' for allowing read/write on top of existing data

    Returns
    -------
    out : a carray/ctable object, or IOError if no object is found

    """
    # A ctable rootdir contains a ROOTDIRS marker file; its presence is
    # what distinguishes a ctable from a plain carray on disk.
    if os.path.exists(os.path.join(rootdir, ROOTDIRS)):
        return bquery.ctable(rootdir=rootdir, mode=mode)
    return bquery.carray(rootdir=rootdir, mode=mode)
예제 #3
0
    def test_groupby_01(self):
        """
        test_groupby_01: Test groupby's group creation
                         (groupby single row results into multiple groups)
        """
        random.seed(1)

        groupby_cols = ['f0']
        groupby_lambda = lambda x: x[0]
        agg_list = ['f4', 'f5', 'f6']
        num_rows = 2000

        # -- Data --
        g = self.gen_almost_unique_row(num_rows)
        data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # folder should be empty
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(groupby_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
        # print() as a function: the Python 2 statement form was a
        # SyntaxError under Python 3 and inconsistent with the rest of
        # the file, which already uses print()
        print(result_bcolz)

        # Itertools result
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, groupby_lambda)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        assert_list_equal(list(result_bcolz['f0']), uniquekeys)
예제 #4
0
    def test_groupby_05(self):
        """
        test_groupby_05: group creation without a factor cache
        (default 'sum' aggregation) over several key dtypes.
        """
        random.seed(1)

        key_cols = ['f0']
        key_func = lambda x: x[0]
        measures = ['f1']
        n = 200

        for _dtype in ('i8', 'i4', 'f8', 'S1'):

            # -- Data --
            if _dtype == 'S1':
                pairs = ((str(x % 5), x % 5) for x in range(n))
            else:
                pairs = ((x % 5, x % 5) for x in range(n))

            data = np.fromiter(pairs, dtype=_dtype + ',i8')

            # -- Bcolz --
            print('--> Bcolz')
            self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
            os.rmdir(self.rootdir)  # ctable wants a non-existing directory
            fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
            fact_bcolz.flush()

            # no cache_factor() call: this test exercises the uncached path
            result_bcolz = fact_bcolz.groupby(key_cols, measures)
            print(result_bcolz)

            # Reference result built with itertools
            print('--> Itertools')
            result_itt = self.helper_itt_groupby(data, key_func)
            uniquekeys = result_itt['uniquekeys']
            print(uniquekeys)

            expected = []
            for group in result_itt['groups']:
                total = 0
                for row in group:
                    key = row[0]
                    total += row[1]
                expected.append([key, total])

            assert_list_equal(
                sorted([list(x) for x in result_bcolz]),
                sorted(expected))

            yield self._assert_list_equal, list(result_bcolz['f0']), uniquekeys
예제 #5
0
    def test_groupby_05(self):
        """
        test_groupby_05: groupby without the factorization cache, summing
        one measure column, repeated for multiple key dtypes.
        """
        random.seed(1)

        groupby_cols = ['f0']
        key_of = lambda r: r[0]
        agg_list = ['f1']
        num_rows = 200

        for _dtype in ['i8', 'i4', 'f8', 'S1']:

            # -- Data --
            # string keys need an explicit str() conversion
            if _dtype == 'S1':
                rows = ((str(x % 5), x % 5) for x in range(num_rows))
            else:
                rows = ((x % 5, x % 5) for x in range(num_rows))
            data = np.fromiter(rows, dtype=_dtype + ',i8')

            # -- Bcolz --
            print('--> Bcolz')
            self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
            os.rmdir(self.rootdir)  # directory must not exist yet
            fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
            fact_bcolz.flush()

            result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
            print(result_bcolz)

            # Itertools reference
            print('--> Itertools')
            result_itt = self.helper_itt_groupby(data, key_of)
            uniquekeys = result_itt['uniquekeys']
            print(uniquekeys)

            # per group: [key, sum of f1] — every row in a group shares
            # the same key, so the first row's key is representative
            ref = [[group[0][0], sum(r[1] for r in group)]
                   for group in result_itt['groups']]

            assert_list_equal(
                sorted([list(x) for x in result_bcolz]),
                sorted(ref))

            yield self._assert_list_equal, list(result_bcolz['f0']), uniquekeys
예제 #6
0
    def on_disk_data_cleaner(self, data):
        """Yield an on-disk ctable built from *data*, then remove it."""
        workdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(workdir)  # ctable wants a non-existing directory
        self.rootdir = workdir
        ct = bquery.ctable(data, rootdir=workdir)
        ct.flush()
        # re-open from disk so the caller gets the persisted table
        ct = bquery.open(workdir)

        yield ct

        shutil.rmtree(workdir)
        self.rootdir = None
예제 #7
0
    def on_disk_data_cleaner(self, data):
        """Fixture: build a disk-backed ctable from *data*, clean up afterwards."""
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        # the target directory must not exist when the ctable is created
        os.rmdir(self.rootdir)
        table = bquery.ctable(data, rootdir=self.rootdir)
        table.flush()
        table = bquery.open(self.rootdir)

        yield table

        shutil.rmtree(self.rootdir)
        self.rootdir = None
예제 #8
0
    def uncompress_groupby_to_df(self,
                                 result_tar,
                                 groupby_col_list,
                                 agg_list,
                                 where_terms_list,
                                 aggregate=False):
        """
        Unpack a tarred groupby result and return it as a pandas DataFrame.

        Parameters
        ----------
        result_tar :
            In-memory tar archive containing one sub-tar per shard, each of
            which holds a single bcolz ctable directory.
        groupby_col_list : list
            Columns to group by in the final aggregation (used only when
            ``aggregate`` is True).
        agg_list : list
            Aggregation spec; each entry's third element (``x[2]``) is used
            as both input and output column of the final 'sum' step.
        where_terms_list : list
            Not used in this method; presumably kept for interface symmetry
            with the query call — TODO confirm.
        aggregate : bool
            If True, re-aggregate the concatenated shard results with a
            'sum' groupby; otherwise return the concatenation as-is.

        Returns
        -------
        pandas.DataFrame — empty if the archive contained no shards.

        Raises
        ------
        ValueError
            If the outer tar cannot be created or extracted.
        """
        # uncompress result returned by the groupby and convert it to a Pandas DataFrame
        tmp_dir = None
        try:
            try:
                # NOTE(review): extractall() on an untrusted archive can write
                # outside tmp_dir (path traversal) — confirm result_tar only
                # ever comes from a trusted producer.
                tar_file = TarFile(fileobj=StringIO(result_tar))
                tmp_dir = tempfile.mkdtemp(prefix='tar_dir_')
                tar_file.extractall(tmp_dir)
            except TarError:
                self.logger.exception("Could not create/extract tar.")
                raise ValueError(result_tar)
            # drop the (potentially large) in-memory archive early
            del result_tar
            del tar_file

            ct = None

            # now untar and aggregate the individual shard results
            for i, sub_tar in enumerate(glob.glob(os.path.join(tmp_dir, '*'))):
                new_dir = os.path.join(tmp_dir, 'bcolz_' + str(i))
                rm_file_or_dir(new_dir)
                with tarfile.open(sub_tar, mode='r') as tar_file:
                    tar_file.extractall(new_dir)
                # rm_file_or_dir(sub_tar)
                # each sub-tar is assumed to contain exactly one ctable
                # directory — [0] would raise IndexError otherwise
                ctable_dir = glob.glob(os.path.join(new_dir, '*'))[0]
                new_ct = ctable(rootdir=ctable_dir, mode='a')
                if i == 0:
                    ct = new_ct
                else:
                    # concatenate this shard's rows onto the first shard
                    ct.append(new_ct)

            # aggregate by groupby parameters
            if ct is None:
                # no shards found -> empty result
                result_df = pd.DataFrame()
            elif aggregate:
                new_dir = os.path.join(tmp_dir, 'end_result')
                rm_file_or_dir(new_dir)
                # we can only sum now
                new_agg_list = [[x[2], 'sum', x[2]] for x in agg_list]
                result_ctable = ct.groupby(groupby_col_list,
                                           new_agg_list,
                                           rootdir=new_dir)
                result_df = result_ctable.todataframe()
            else:
                result_df = ct.todataframe()
        finally:
            # always remove the scratch directory, even on failure
            rm_file_or_dir(tmp_dir)

        return result_df
예제 #9
0
    def test_groupby_04(self):
        """
        test_groupby_04: 'sum' aggregation with a composite
        (three-column) groupby key producing multiple groups.
        """
        random.seed(1)

        key_cols = ['f0', 'f1', 'f2']
        key_func = lambda x: [x[0], x[1], x[2]]
        measure_cols = ['f4', 'f5', 'f6']
        measure_func = lambda x: [x[4], x[5], x[6]]
        n = 2000

        # -- Data --
        data = np.fromiter(self.gen_almost_unique_row(n),
                           dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # ctable wants a non-existing directory
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(key_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(key_cols, measure_cols)
        print(result_bcolz)

        # Reference result built with itertools
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_func)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        expected = []
        for group in result_itt['groups']:
            sums = [0, 0, 0]
            for row in group:
                key = key_func(row)
                for j, col in enumerate((4, 5, 6)):
                    sums[j] += row[col]
            expected.append(key + sums)

        assert_list_equal(
            sorted([list(x) for x in result_bcolz]),
            sorted(expected))
예제 #10
0
    def test_groupby_04(self):
        """
        test_groupby_04: grouping over multiple columns and summing the
        measure columns yields one summed row per distinct key triple.
        """
        random.seed(1)

        groupby_cols = ['f0', 'f1', 'f2']
        key_of = lambda r: [r[0], r[1], r[2]]
        agg_list = ['f4', 'f5', 'f6']
        values_of = lambda r: [r[4], r[5], r[6]]
        num_rows = 2000

        # -- Data --
        rows = self.gen_almost_unique_row(num_rows)
        data = np.fromiter(rows, dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # directory must not exist yet
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(groupby_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
        print(result_bcolz)

        # Itertools reference
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_of)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = []
        for group in result_itt['groups']:
            # all rows in a group share the key, so group[0] is representative
            f4 = sum(r[4] for r in group)
            f5 = sum(r[5] for r in group)
            f6 = sum(r[6] for r in group)
            ref.append(key_of(group[0]) + [f4, f5, f6])

        assert_list_equal(
            sorted([list(x) for x in result_bcolz]),
            sorted(ref))
예제 #11
0
    def test_groupby_15(self):
        """
        test_groupby_15: 'std' aggregation matches numpy's np.std
        per group (floating point compared with a tolerance).
        """
        random.seed(1)

        key_cols = ['f0']
        key_func = lambda x: x[0]
        agg_list = [['f4', 'std'], ['f5', 'std'], ['f6', 'std']]
        n = 2000

        # -- Data --
        data = np.fromiter(self.gen_almost_unique_row(n),
                           dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # ctable wants a non-existing directory
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(key_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(key_cols, agg_list)
        print(result_bcolz)

        # Reference result built with itertools
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_func)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = []
        for group in result_itt['groups']:
            cols = {c: [] for c in (4, 5, 6)}
            for row in group:
                for c in cols:
                    cols[c].append(row[c])
            ref.append([np.std(cols[4]), np.std(cols[5]), np.std(cols[6])])

        # drop the leading (text) key column so only floats are compared
        result = [list(x[1:]) for x in result_bcolz]

        assert_allclose(result, ref, rtol=1e-10)
예제 #12
0
    def test_groupby_15(self):
        """
        test_groupby_15: Groupby aggregation type 'std', checked against
        np.std on each group.
        """
        random.seed(1)

        groupby_cols = ['f0']
        groupby_lambda = lambda x: x[0]
        agg_list = [['f4', 'std'], ['f5', 'std'], ['f6', 'std']]
        agg_lambda = lambda x: [x[4], x[5], x[6]]
        num_rows = 2000

        # -- Data --
        gen = self.gen_almost_unique_row(num_rows)
        data = np.fromiter(gen, dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # directory must not exist yet
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(groupby_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list)
        print(result_bcolz)

        # Itertools reference
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, groupby_lambda)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = [
            [np.std([r[4] for r in group]),
             np.std([r[5] for r in group]),
             np.std([r[6] for r in group])]
            for group in result_itt['groups']
        ]

        # remove the first (text) element for floating point comparison
        result = [list(x[1:]) for x in result_bcolz]

        assert_allclose(result, ref, rtol=1e-10)
예제 #13
0
    def test_groupby_07(self):
        """
        test_groupby_07: Groupby type 'count_na'

        f4 contains NA (NaN) entries, so its reference count differs from
        the plain row counts used for f5/f6.
        """
        random.seed(1)

        groupby_cols = ['f0']
        groupby_lambda = lambda x: x[0]
        agg_list = ['f4', 'f5', 'f6']
        num_rows = 1000

        # -- Data --
        g = self.gen_dataset_count_with_NA(num_rows)
        data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # folder should be empty
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(groupby_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list,
                                          agg_method='count_na')
        # print() as a function: the Python 2 statement form was a
        # SyntaxError under Python 3 and inconsistent with the rest of
        # the file, which already uses print()
        print(result_bcolz)

        # Itertools result
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, groupby_lambda)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = []
        for item in result_itt['groups']:
            f4 = 0
            f5 = 0
            f6 = 0
            for row in item:
                f0 = groupby_lambda(row)
                # NaN != NaN, so f4 counts only non-missing values
                if row[4] == row[4]:
                    f4 += 1
                f5 += 1
                f6 += 1
            ref.append([f0, f4, f5, f6])

        assert_list_equal(
            [list(x) for x in result_bcolz], ref)
예제 #14
0
    def test_groupby_07(self):
        """
        test_groupby_07: explicit 'count' aggregation; NaN entries in f4
        are not counted, while f5/f6 count every row.
        """
        random.seed(1)

        key_cols = ['f0']
        key_func = lambda x: x[0]
        agg_list = [['f4', 'count'], ['f5', 'count'], ['f6', 'count']]
        n = 1000

        # -- Data --
        data = np.fromiter(self.gen_dataset_count_with_NA(n),
                           dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # ctable wants a non-existing directory
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(key_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(key_cols, agg_list)
        print(result_bcolz)

        # Reference result built with itertools
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_func)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = []
        for group in result_itt['groups']:
            non_na = 0
            for row in group:
                key = key_func(row)
                # NaN != NaN, so this counts only non-missing f4 values
                if row[4] == row[4]:
                    non_na += 1
            ref.append([key, non_na, len(group), len(group)])

        assert_list_equal(
            [list(x) for x in result_bcolz], ref)
예제 #15
0
    def test_groupby_09(self):
        """
        test_groupby_09: Groupby's type 'sorted_count_distinct'
        """
        # (docstring previously mislabelled this test as test_groupby_08)
        random.seed(1)

        groupby_cols = ['f0']
        groupby_lambda = lambda x: x[0]
        agg_list = ['f4', 'f5', 'f6']
        num_rows = 1000

        # -- Data --
        g = self.gen_dataset_count_with_NA_09(num_rows)
        data = np.fromiter(g, dtype='S1,f8,i8,i4,f8,i8,i4')
        # print() as a function: the Python 2 statement form was a
        # SyntaxError under Python 3 and inconsistent with the rest of
        # the file, which already uses print()
        print('data')
        print(data)

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # folder should be empty
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        result_bcolz = fact_bcolz.groupby(groupby_cols, agg_list,
                                          agg_method='sorted_count_distinct')
        print(result_bcolz)

        # # Itertools result
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, groupby_lambda)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = []

        for n, (u, item) in enumerate(zip(uniquekeys, result_itt['groups'])):
            f4 = len(self._get_unique([x[4] for x in result_itt['groups'][n]]))
            f5 = len(self._get_unique([x[5] for x in result_itt['groups'][n]]))
            f6 = len(self._get_unique([x[6] for x in result_itt['groups'][n]]))
            ref.append([u, f4, f5, f6])
        print(ref)

        assert_list_equal(
            [list(x) for x in result_bcolz], ref)
예제 #16
0
    def test_groupby_09(self):
        """
        test_groupby_09: 'sorted_count_distinct' per aggregation column,
        computed over input pre-sorted by the groupby key.
        """
        random.seed(1)

        key_cols = ['f0']
        key_func = lambda x: x[0]
        distinct = 'sorted_count_distinct'
        agg_list = [['f4', distinct], ['f5', distinct], ['f6', distinct]]
        n = 2000

        # -- Data --
        # rows are sorted by the groupby key here; presumably the
        # 'sorted_*' agg method relies on that ordering
        rows = sorted(self.gen_dataset_count_with_NA_09(n),
                      key=lambda x: x[0])
        data = np.fromiter(rows, dtype='S1,f8,i8,i4,f8,i8,i4')
        print('data')
        print(data)

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # ctable wants a non-existing directory
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        result_bcolz = fact_bcolz.groupby(key_cols, agg_list)
        print(result_bcolz)

        # Reference result built with itertools
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_func)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        ref = []
        for u, group in zip(uniquekeys, result_itt['groups']):
            counts = [len(self._get_unique([row[c] for row in group]))
                      for c in (4, 5, 6)]
            ref.append([u] + counts)
        print(ref)

        assert_list_equal(
            [list(x) for x in result_bcolz], ref)
예제 #17
0
    def test_where_terms00(self):
        """
        test_where_terms00: a '>' filter on one column yields a boolean
        mask selecting values above the threshold.
        """
        threshold = 10000

        # expected result: True exactly where f0 > threshold
        ref_result = bquery.carray(
            np.fromiter((x > threshold for x in range(20000)), dtype='bool'))

        # data to filter on
        data = np.fromiter(((x, x) for x in range(20000)), dtype='i8,i8')

        # apply the filter
        ct = bquery.ctable(data, rootdir=self.rootdir)
        result = ct.where_terms([('f0', '>', threshold)])

        # compare
        assert_array_equal(result, ref_result)
예제 #18
0
    def test_where_terms_04(self):
        """
        test_where_terms04: an 'in' filter with a single-item list selects
        exactly that row.
        """
        include = [0]

        # expected boolean mask: only the included index is True
        expected = np.zeros(20000, dtype=bool)
        expected[include] = True

        # data to filter on
        data = np.fromiter(((x, x) for x in range(20000)), dtype='i8,i8')

        # apply the filter
        ct = bquery.ctable(data, rootdir=self.rootdir)
        result = ct.where_terms([('f0', 'in', include)])

        assert_array_equal(result, expected)
예제 #19
0
    def test_where_terms00(self):
        """
        test_where_terms00: select the rows whose f0 value exceeds 10000.
        """
        n = 20000

        # expected result
        mask = np.fromiter(((x > 10000) for x in range(n)), dtype='bool')
        ref_result = bquery.carray(mask)

        # generate data to filter on
        data = np.fromiter(((i, i) for i in range(n)), dtype='i8,i8')

        # filter data
        ct = bquery.ctable(data, rootdir=self.rootdir)
        terms_filter = [('f0', '>', 10000)]
        result = ct.where_terms(terms_filter)

        # compare
        assert_array_equal(result, ref_result)
예제 #20
0
    def test_where_terms02(self):
        """
        test_where_terms02: a 'not in' filter masks out exactly the
        excluded values.
        """
        exclude = [0, 1, 2, 3, 11, 12, 13]

        # expected mask: everything True except the excluded indices
        expected = np.ones(20000, dtype=bool)
        expected[exclude] = False

        # data to filter on
        data = np.fromiter(((x, x) for x in range(20000)), dtype='i8,i8')

        # apply the filter
        ct = bquery.ctable(data, rootdir=self.rootdir)
        result = ct.where_terms([('f0', 'not in', exclude)])

        assert_array_equal(result, expected)
예제 #21
0
    def test_where_terms_04(self):
        """
        test_where_terms04: an 'in' term whose list holds one item
        produces a mask with a single True entry.
        """
        n = 20000
        include = [0]

        # expected result
        mask = np.zeros(n, dtype=bool)
        for idx in include:
            mask[idx] = True

        # generate data to filter on
        data = np.fromiter(((i, i) for i in range(n)), dtype='i8,i8')

        # filter data
        ct = bquery.ctable(data, rootdir=self.rootdir)
        result = ct.where_terms([('f0', 'in', include)])

        assert_array_equal(result, mask)
예제 #22
0
    def test_where_terms02(self):
        """
        test_where_terms02: rows listed in `exclude` are dropped by a
        'not in' term; all other rows are kept.
        """
        n = 20000
        exclude = [0, 1, 2, 3, 11, 12, 13]

        # expected result: start all-True, clear the excluded positions
        mask = np.ones(n, dtype=bool)
        mask[exclude] = False

        # generate data to filter on
        pairs = ((i, i) for i in range(n))
        data = np.fromiter(pairs, dtype='i8,i8')

        # filter data
        terms = [('f0', 'not in', exclude)]
        result = bquery.ctable(data, rootdir=self.rootdir).where_terms(terms)

        assert_array_equal(result, mask)
예제 #23
0
    def test_factorize_groupby_cols_01(self):
        """
        test_factorize_groupby_cols_01: factorizing a column twice returns
        identical factors and groups (second call served from cache).
        """
        n = 20000
        ref_fact_table = np.arange(n) % 5
        ref_fact_groups = np.arange(5)

        # generate data
        data = np.fromiter(((x, x % 5) for x in range(n)), dtype='i8,i8')
        ct = bquery.ctable(data, rootdir=self.rootdir)

        # factorize the single requested column, f1
        fact_1 = ct.factorize_groupby_cols(['f1'])
        # the second call should be answered from the factorization cache
        fact_2 = ct.factorize_groupby_cols(['f1'])

        assert_array_equal(ref_fact_table, fact_1[0][0])
        assert_array_equal(ref_fact_groups, fact_1[1][0])

        assert_array_equal(fact_1[0][0], fact_2[0][0])
        assert_array_equal(fact_1[1][0], fact_2[1][0])
예제 #24
0
    def test_factorize_groupby_cols_01(self):
        """
        test_factorize_groupby_cols_01: repeated factorization of the same
        column on a fresh on-disk ctable is stable and matches the
        expected factors/groups.
        """
        ref_fact_table = np.arange(20000) % 5
        ref_fact_groups = np.arange(5)

        # generate data
        pairs = ((x, x % 5) for x in range(20000))
        data = np.fromiter(pairs, dtype='i8,i8')
        ct = bquery.ctable(data, rootdir=tempfile.mkdtemp(prefix='bcolz-'),
                           mode='w')

        # factorize the single requested column, f1
        fact_1 = ct.factorize_groupby_cols(['f1'])
        # the second call should be answered from the factorization cache
        fact_2 = ct.factorize_groupby_cols(['f1'])

        assert_array_equal(ref_fact_table, fact_1[0][0])
        assert_array_equal(ref_fact_groups, fact_1[1][0])

        assert_array_equal(fact_1[0][0], fact_2[0][0])
        assert_array_equal(fact_1[1][0], fact_2[1][0])
예제 #25
0
    def test_groupby_02(self):
        """
        test_groupby_02: grouping on three columns yields one group per
        distinct key triple (default 'sum' aggregation).
        """
        random.seed(1)

        key_cols = ['f0', 'f1', 'f2']
        key_func = lambda x: [x[0], x[1], x[2]]
        # bare column names in `agg_list` mean the default `sum` is used
        agg_list = ['f4', 'f5', 'f6']
        n = 2000

        # -- Data --
        data = np.fromiter(self.gen_almost_unique_row(n),
                           dtype='S1,f8,i8,i4,f8,i8,i4')

        # -- Bcolz --
        print('--> Bcolz')
        self.rootdir = tempfile.mkdtemp(prefix='bcolz-')
        os.rmdir(self.rootdir)  # ctable wants a non-existing directory
        fact_bcolz = bquery.ctable(data, rootdir=self.rootdir)
        fact_bcolz.flush()

        fact_bcolz.cache_factor(key_cols, refresh=True)
        result_bcolz = fact_bcolz.groupby(key_cols, agg_list)
        print(result_bcolz)

        # Reference result built with itertools
        print('--> Itertools')
        result_itt = self.helper_itt_groupby(data, key_func)
        uniquekeys = result_itt['uniquekeys']
        print(uniquekeys)

        assert_list_equal(
            sorted([list(x) for x in result_bcolz[key_cols]]),
            sorted(uniquekeys))
예제 #26
0
파일: load.py 프로젝트: useric/bquery
def _add_datetime_features(df, source_col, prefix):
    """Expand a 'YYYY-MM-DD HH:MM:SS' string column into integer features.

    Adds the columns ``<prefix>_date`` (YYYYMMDD), ``<prefix>_year``,
    ``<prefix>_yearmonth``, ``<prefix>_month``, ``<prefix>_time`` (HHMMSS)
    and ``<prefix>_hour`` to *df* in place, then deletes *source_col*.
    Column creation order matches the original inline code so the resulting
    ctable column layout is unchanged.
    """
    # Date part: 'YYYY-MM-DD' -> 'YYYYMMDD' string, then derive the pieces.
    df[prefix + '_date'] = df[source_col].str[0:10].str.replace('-', '')
    df[prefix + '_year'] = df[prefix + '_date'].str[0:4].astype(int)
    df[prefix + '_yearmonth'] = df[prefix + '_date'].str[0:6].astype(int)
    df[prefix + '_month'] = df[prefix + '_date'].str[4:6].astype(int)
    df[prefix + '_date'] = df[prefix + '_date'].astype(int)
    # Time part: 'HH:MM:SS' -> 'HHMMSS' string, then derive the pieces.
    df[prefix + '_time'] = df[source_col].str[11:].str.replace(':', '')
    df[prefix + '_hour'] = df[prefix + '_time'].str[0:2].astype(int)
    df[prefix + '_time'] = df[prefix + '_time'].astype(int)
    del df[source_col]


def create_bcolz_chunks(workdir):
    """Convert each yellow-taxi trip CSV in *workdir* to an on-disk ctable.

    Every ``yellow_tripdata_*.csv`` file is loaded with pandas, its datetime
    columns are split into integer date/time features, and the result is
    written as a bcolz ctable to ``<workdir>taxi_<i>``.  The factorization
    cache is pre-built for the columns typically used for grouping and
    filtering.

    Parameters
    ----------
    workdir : str
        Input directory; expected to end with a path separator (the glob
        below concatenates it directly with the file pattern).

    Raises
    ------
    ValueError
        If no matching CSV files are found.
    """
    file_list = sorted(glob.glob(workdir + 'yellow_tripdata_*.csv'))
    if not file_list:
        raise ValueError('No Files Found')

    for i, filename in enumerate(file_list):
        print(filename)

        rootdir = workdir + 'taxi_' + str(i)

        import_df = pd.read_csv(filename)

        # Normalize column names in a single pass: the input files are
        # inconsistent in casing, padding and the 'tpep_' prefix.
        import_df.columns = [x.lower().strip().replace('tpep_', '')
                             for x in import_df.columns]

        # Constant column so that counting rides is a plain sum aggregation.
        import_df['nr_rides'] = 1

        # Both datetime columns get the identical feature expansion.
        _add_datetime_features(import_df, 'pickup_datetime', 'pickup')
        _add_datetime_features(import_df, 'dropoff_datetime', 'dropoff')

        import_ct = ctable.fromdataframe(import_df,
                                         rootdir=rootdir,
                                         expectedlen=len(import_df),
                                         mode='w')
        # Free the DataFrame before reopening the (potentially large) ctable.
        del import_df

        import_ct.flush()

        import_ct = ctable(rootdir=rootdir, mode='a')

        # Pre-compute factorization caches for the groupby/filter columns.
        import_ct.cache_factor([
            'dropoff_date',
            'dropoff_hour',
            'dropoff_latitude',
            'dropoff_longitude',
            'dropoff_month',
            'dropoff_time',
            'dropoff_year',
            'dropoff_yearmonth',
            'payment_type',
            'pickup_date',
            'pickup_hour',
            'pickup_latitude',
            'pickup_longitude',
            'pickup_month',
            'pickup_time',
            'pickup_year',
            'pickup_yearmonth',
            'ratecodeid',
            'store_and_fwd_flag',
            'vendorid'])
예제 #27
0
    def handle_work(self, msg):
        """Execute a work message: run a (filtered) groupby/selection on an
        on-disk bquery ctable and return the result as a tar archive inside
        ``msg['data']``.

        Expected positional args in *msg*: (filename, groupby_col_list,
        aggregation_list, where_terms_list); kwargs: ``expand_filter_column``
        and ``aggregate`` (default True).  Raises if the ctable rootdir does
        not exist under ``self.data_dir``.
        """
        # Code-execution messages are delegated entirely.
        if msg.isa('execute_code'):
            return self.execute_code(msg)

        # Scratch locations: a directory for the result ctable and a file
        # for the tar archive (fd closed immediately; only the path is used).
        tmp_dir = tempfile.mkdtemp(prefix='result_')
        buf_file_fd, buf_file = tempfile.mkstemp(prefix='tar_')
        os.close(buf_file_fd)

        args, kwargs = msg.get_args_kwargs()
        self.logger.info('doing calc %s' % args)
        filename = args[0]
        groupby_col_list = args[1]
        aggregation_list = args[2]
        where_terms_list = args[3]
        expand_filter_column = kwargs.get('expand_filter_column')
        aggregate = kwargs.get('aggregate', True)

        # create rootdir
        rootdir = os.path.join(self.data_dir, filename)
        if not os.path.exists(rootdir):
            raise Exception('Path %s does not exist' % rootdir)

        ct = bquery.ctable(rootdir=rootdir, mode='r', auto_cache=True)

        # prepare filter
        if not where_terms_list:
            bool_arr = None
        else:
            # quickly verify the where_terms_list
            if not ct.where_terms_factorization_check(where_terms_list):
                # return an empty result because the where terms do not give a result for this ctable
                msg['data'] = ''
                return msg
            # else create the boolean array
            bool_arr = ct.where_terms(where_terms_list, cache=True)

        # expand filter column check
        if expand_filter_column:
            bool_arr = ct.is_in_ordered_subgroups(basket_col=expand_filter_column, bool_arr=bool_arr)

        # retrieve & aggregate if needed
        # NOTE(review): tmp_dir is removed here so the result ctable can
        # create it fresh as its rootdir.
        rm_file_or_dir(tmp_dir)
        if aggregate:
            # aggregate by groupby parameters
            result_ctable = ct.groupby(groupby_col_list, aggregation_list, bool_arr=bool_arr,
                                       rootdir=tmp_dir)
        else:
            # direct result from the ctable; aggregation_list entries are
            # (column, ...) tuples, so x[0] picks the column names
            column_list = groupby_col_list + [x[0] for x in aggregation_list]
            if bool_arr is not None:
                result_ctable = bcolz.fromiter(ct[column_list].where(bool_arr), ct[column_list].dtype, sum(bool_arr),
                                               rootdir=tmp_dir, mode='w')
            else:
                result_ctable = bcolz.fromiter(ct[column_list], ct[column_list].dtype, ct.len,
                                               rootdir=tmp_dir, mode='w')

        # *** clean up temporary files and memory objects
        # filter
        del bool_arr

        # input
        ct.free_cachemem()
        ct.clean_tmp_rootdir()
        del ct

        # save result to archive
        result_ctable.flush()
        result_ctable.free_cachemem()
        with tarfile.open(buf_file, mode='w') as archive:
            archive.add(tmp_dir, arcname=os.path.basename(tmp_dir))
        del result_ctable
        rm_file_or_dir(tmp_dir)

        # create message
        # NOTE(review): the tar archive is binary data, but it is read in
        # text mode ('r') here; on Python 3 this will fail or corrupt the
        # payload — confirm this code targets Python 2, otherwise use 'rb'
        # (and check what type the transport expects in msg['data']).
        with open(buf_file, 'r') as file:
            # add result to message
            msg['data'] = file.read()
        rm_file_or_dir(buf_file)

        return msg
예제 #28
0
# Benchmark script: compare groupby-sum over the same data in pandas and
# cytoolz (over a bquery/bcolz ctable).
# Cycling generators produce a fixed repeating pattern of group labels
# (f0: 2 values, f1: 5 values) and measure values (f2, f3).
ga = itt.cycle(['ES', 'NL'])
gb = itt.cycle(['b1', 'b2', 'b3', 'b4', 'b5'])
gx = itt.cycle([1, 2])
gy = itt.cycle([-1, -2])
rootdir = 'bench-data.bcolz'
if os.path.exists(rootdir):
    shutil.rmtree(rootdir)  # start from a clean on-disk ctable

n_rows = 1000000
print('Rows: ', n_rows)

# -- data
# NOTE(review): `izip` suggests Python 2 (itertools.izip) — confirm; on
# Python 3 this would be the builtin zip.
z = np.fromiter(((a, b, x, y) for a, b, x, y in izip(ga, gb, gx, gy)),
                dtype='S2,S2,i8,i8', count=n_rows)

ct = bquery.ctable(z, rootdir=rootdir, )
print(ct)

# -- pandas --
df = pd.DataFrame(z)
with ctime(message='pandas'):
    result = df.groupby(['f0'])['f2'].sum()
print(result)
# presumably `ctime` stores the elapsed time in the global `t_elapsed` —
# verify against the ctime implementation.
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)
    result = valmap(compose(sum, pluck(2)), r)
예제 #29
0
gx = itt.cycle([1, 2])
gy = itt.cycle([-1, -2])
rootdir = 'bench-data.bcolz'
if os.path.exists(rootdir):
    shutil.rmtree(rootdir)

n_rows = 1000000
print('Rows: ', n_rows)

# -- data
z = np.fromiter(((a, b, x, y) for a, b, x, y in izip(ga, gb, gx, gy)),
                dtype='S2,S2,i8,i8',
                count=n_rows)

ct = bquery.ctable(
    z,
    rootdir=rootdir,
)
print(ct)

# -- pandas --
df = pd.DataFrame(z)
with ctime(message='pandas'):
    result = df.groupby(['f0'])['f2'].sum()
print(result)
t_pandas = t_elapsed

# -- cytoolz --
with ctime(message='cytoolz over bcolz'):
    # In Memory Split-Apply-Combine
    # http://toolz.readthedocs.org/en/latest/streaming-analytics.html?highlight=reduce#split-apply-combine-with-groupby-and-reduceby
    r = cytoolz.groupby(lambda row: row.f0, ct)