Ejemplo n.º 1
0
    def split(self, data_inst, shuffle=True):
        """Yield (train_data, test_data) DTable pairs for each of the k folds.

        Sample ids are collected locally, optionally shuffled, and split by
        sklearn's KFold; each fold's ids are then joined back against the
        input table to recover the full instances.
        """
        header = data_inst.schema.get('header')

        sid_iter, _ = collect_index(data_inst)
        data_sids = np.array([sid for sid, _ in sid_iter])

        if shuffle:
            np.random.shuffle(data_sids)

        splitter = sk_KFold(n_splits=self.n_splits)

        for train_idx, test_idx in splitter.split(data_sids):
            # Materialize each fold's ids as a (sid, 1) table, then join with
            # the original data so only those rows survive the join.
            train_table = eggroll.parallelize(
                [(str(sid), 1) for sid in data_sids[train_idx]],
                include_key=True,
                partition=data_inst._partitions)
            train_data = data_inst.join(train_table, lambda x, y: x)

            test_table = eggroll.parallelize(
                [(str(sid), 1) for sid in data_sids[test_idx]],
                include_key=True,
                partition=data_inst._partitions)
            test_data = data_inst.join(test_table, lambda x, y: x)

            train_data.schema['header'] = header
            test_data.schema['header'] = header
            yield train_data, test_data
Ejemplo n.º 2
0
    def setUp(self):
        """Build list/tuple/ndarray fixtures of 30 rows x 20 ints, both as
        local lists and as parallelized DTables."""
        eggroll.init("test_encrypt_mode_calculator")

        # Row i holds the values 100*i .. 100*i+19.
        self.list_data = [[100 * i + j for j in range(20)]
                          for i in range(30)]
        self.tuple_data = [tuple(row) for row in self.list_data]
        self.numpy_data = [np.array(row, dtype="int") for row in self.list_data]

        self.data_list = eggroll.parallelize(self.list_data,
                                             include_key=False,
                                             partition=10)
        self.data_tuple = eggroll.parallelize(self.tuple_data,
                                              include_key=False,
                                              partition=10)
        self.data_numpy = eggroll.parallelize(self.numpy_data,
                                              include_key=False,
                                              partition=10)
Ejemplo n.º 3
0
    def setUp(self):
        """Create one dense and one sparse Instance table fixture.

        Fix: removed the local ``col_zero`` accumulator — it was filled in
        the sparse loop but never stored on ``self`` or read afterwards
        (dead code).
        """
        eggroll.init("test_instance")

        # 100 dense instances; row i's features are the constant vector i % 16.
        dense_inst = [(i, Instance(features=(i % 16 * np.ones(20))))
                      for i in range(100)]
        self.dense_table = eggroll.parallelize(dense_inst,
                                               include_key=True,
                                               partition=2)

        sparse_inst = []
        for i in range(100):
            indices = []
            data = []
            for j in range(20):
                # Deterministic pseudo-pattern in [0, 16); zeros are skipped
                # so the vector is genuinely sparse.
                val = ((i + 5)**3 + (j + 1)**4) % 16
                if val > 0:
                    indices.append(j)
                    data.append(val)
            sparse_vec = SparseVector(indices, data, 20)
            sparse_inst.append((i, Instance(features=sparse_vec)))

        self.sparse_inst = sparse_inst
        self.sparse_table = eggroll.parallelize(sparse_inst,
                                                include_key=True,
                                                partition=1)
Ejemplo n.º 4
0
    def setUp(self):
        """Prepare dense fixtures (Instance and raw-ndarray variants) plus
        MultivariateStatisticalSummary objects over columns [1, 2, 3]."""
        eggroll.init("test_instance")

        headers = ['x' + str(i) for i in range(20)]
        self.header = headers
        self.eps = 1e-5
        self.count = 100

        dense_inst = []
        dense_not_inst = []
        rows = []
        for i in range(self.count):
            features = i % 16 * np.ones(20)
            dense_inst.append((i, Instance(features=features)))
            dense_not_inst.append((i, features))
            rows.append(features)
        self.dense_inst = dense_inst
        self.dense_not_inst = dense_not_inst
        # Column-major (feature-major) view of the data for per-feature checks.
        self.dense_data_transpose = np.array(rows).transpose()

        self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=5)
        self.dense_not_inst_table = eggroll.parallelize(dense_not_inst, include_key=True, partition=5)
        self.dense_table.schema = {'header': headers}
        self.dense_not_inst_table.schema = {'header': headers}

        col_index = [1, 2, 3]
        self.col_index = col_index
        self.summary_obj = MultivariateStatisticalSummary(self.dense_table, col_index, abnormal_list=[None])
        self.summary_obj_not_inst = MultivariateStatisticalSummary(self.dense_not_inst_table, col_index,
                                                                   abnormal_list=[None])
Ejemplo n.º 5
0
    def setUp(self):
        """Fixtures for hetero-LR gradient tests: encrypted wx values,
        alternating +1/-1 labels, and the expected gradients/loss for a
        size-10 toy problem."""
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)

        size = 10
        encrypt = self.paillier_encrypt.encrypt
        self.wx = eggroll.parallelize([encrypt(i) for i in range(size)])
        self.en_sum_wx_square = eggroll.parallelize(
            [encrypt(np.square(i)) for i in range(size)])
        self.w = list(range(size))
        # Labels alternate +1 / -1 via pow(-1, i % 2).
        self.data_inst = eggroll.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
             for i in range(size)],
            partition=1)

        # Expected fore_gradient values.
        self.fore_gradient_local = [
            -0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75
        ]
        # Expected gradient, without and with the intercept term.
        self.gradient = [1.125] * 10
        self.gradient_fit_intercept = [1.125] * 11

        self.loss = 4.505647
Ejemplo n.º 6
0
    def setUp(self):
        """Random sparse instances plus grad/hess pairs for histogram tests."""
        self.feature_histogram = FeatureHistogram()
        eggroll.init("test_feature_histogram")

        data_insts = []
        for _ in range(1000):
            indices = []
            data = []
            for j in range(10):
                value = random.randint(0, 5)
                # Zero draws are omitted so the vector stays sparse.
                if value != 0:
                    data.append(value)
                    indices.append(j)
            vec = SparseVector(indices, data, shape=10)
            data_insts.append((Instance(features=vec), (1, random.randint(0, 3))))
        self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
        self.data_insts = data_insts
        self.data_bin = eggroll.parallelize(data_insts, include_key=False)

        self.grad_and_hess_list = [(random.random(), random.random())
                                   for _ in range(1000)]
        self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list,
                                                 include_key=False)

        # Each of the 10 features uses split points [0, 1, 2, 3, 4].
        self.bin_split_points = np.array([np.arange(5) for _ in range(10)])
        self.bin_sparse = [0] * 10
Ejemplo n.º 7
0
 def setUp(self):
     """Two keyed tables sharing ids 5, 15, ..., 995 for sampler tests."""
     eggroll.init("test_random_sampler")
     self.data = [(i * 10 + 5, i * i) for i in range(100)]
     self.table = eggroll.parallelize(self.data, include_key=True)
     self.data_to_trans = [(i * 10 + 5, i ** 3) for i in range(100)]
     self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
Ejemplo n.º 8
0
    def setUp(self):
        """Dense and sparse Instance table fixtures.

        Fix: the sparse loop used a local variable named ``dict``, shadowing
        the builtin; renamed it to ``seen`` (a plain membership set, which is
        all the duplicate check needs).
        """
        eggroll.init("test_instance")

        dense_inst = []
        headers = ['x' + str(i) for i in range(20)]
        for i in range(100):
            inst = Instance(features=(i % 16 * np.ones(20)))
            dense_inst.append((i, inst))
        self.dense_table = eggroll.parallelize(dense_inst,
                                               include_key=True,
                                               partition=2)
        self.dense_table.schema = {'header': headers}

        self.sparse_inst = []
        for i in range(100):
            seen = set()
            indices = []
            data = []
            for j in range(20):
                idx = random.randint(0, 29)
                if idx in seen:
                    # Duplicate column draw: skip so each index appears once.
                    continue
                seen.add(idx)
                val = random.random()
                indices.append(idx)
                data.append(val)

            sparse_vec = SparseVector(indices, data, 30)
            self.sparse_inst.append((i, Instance(features=sparse_vec)))

        self.sparse_table = eggroll.parallelize(self.sparse_inst,
                                                include_key=True)
        self.sparse_table.schema = {
            "header": ["fid" + str(i) for i in range(30)]
        }
Ejemplo n.º 9
0
 def setUp(self):
     """Binary labels and uniform-random scores, local and as DTables,
     for the least-absolute-error loss tests."""
     eggroll.init("test_least_abs_error_loss")
     self.lae_loss = LeastAbsoluteErrorLoss()
     self.y_list = [i % 2 for i in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Ejemplo n.º 10
0
 def setUp(self):
     """Binary labels and uniform-random scores, local and as DTables,
     for the log-cosh loss tests."""
     eggroll.init("test_fair_loss")
     self.log_cosh_loss = LogCoshLoss()
     self.y_list = [i % 2 for i in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Ejemplo n.º 11
0
 def setUp(self):
     """Binary labels and uniform-random scores, local and as DTables,
     for the sigmoid binary cross-entropy tests."""
     eggroll.init("test_cross_entropy")
     self.sigmoid_loss = SigmoidBinaryCrossEntropyLoss()
     self.y_list = [i % 2 for i in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Ejemplo n.º 12
0
 def save_eval_result(self, eval_data):
     """Persist a single evaluation result to the table/namespace named in
     the workflow parameters (overwriting any existing table)."""
     wp = self.workflow_param
     eggroll.parallelize([eval_data],
                         include_key=False,
                         name=wp.evaluation_output_table,
                         namespace=wp.evaluation_output_namespace,
                         error_if_exist=False,
                         persistent=True)
Ejemplo n.º 13
0
 def setUp(self):
     """Binary labels and uniform-random scores, local and as DTables,
     for the Huber loss (delta=1) tests."""
     eggroll.init("test_huber_loss")
     self.delta = 1
     self.huber_loss = HuberLoss(self.delta)
     self.y_list = [i % 2 for i in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Ejemplo n.º 14
0
    def setUp(self):
        """Two small CSV-string tables, the args dicts a DataIO component
        consumes, and a Tracking stub."""
        self.table1 = eggroll.parallelize(
            [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")], include_key=True)

        # Second table contains empty / na / null cells for abnormal-value paths.
        self.table2 = eggroll.parallelize(
            [("a", '-1,,na,null,null,2')], include_key=True)
        self.args1 = {"data": {"data_io_0": {"data": self.table1}}}
        self.args2 = {"data": {"data_io_1": {"data": self.table2}}}
        self.tracker = Tracking("jobid", "guest", 9999, "abc", "123")
Ejemplo n.º 15
0
    def setUp(self):
        """Label tables for classification (5 classes) and regression, plus
        the corresponding checker objects."""
        eggroll.init("test_label_checker")

        self.small_label_set = [i % 5 for i in range(100)]
        self.classify_y = eggroll.parallelize(self.small_label_set,
                                              include_key=False)
        self.regression_label = [random.random() for _ in range(100)]
        self.regression_y = eggroll.parallelize(self.regression_label)
        self.classify_checker = ClassifyLabelChecker()
        self.regression_checker = RegressionLabelChecker()
Ejemplo n.º 16
0
 def setUp(self):
     """Binary labels and uniform-random scores, local and as DTables,
     for the Tweedie loss (rho=0.5) tests."""
     eggroll.init("test_fair_loss")
     self.rho = 0.5
     self.tweedie_loss = TweedieLoss(self.rho)
     self.y_list = [i % 2 for i in range(100)]
     self.predict_list = [random.random() for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Ejemplo n.º 17
0
 def setUp(self):
     """5-class labels and random 5-way score vectors for softmax
     cross-entropy tests, local and as DTables."""
     eggroll.init("test_cross_entropy")
     self.softmax_loss = SoftmaxCrossEntropyLoss()
     self.y_list = [i % 5 for i in range(100)]
     self.predict_list = [np.array([random.random() for _ in range(5)])
                          for _ in range(100)]
     self.y = eggroll.parallelize(self.y_list, include_key=False)
     self.predict = eggroll.parallelize(self.predict_list, include_key=False)
Ejemplo n.º 18
0
    def setUp(self):
        """Keyed Instance tables (labels i % 4) for stratified-sampler tests."""
        eggroll.init("test_stratified_sampler")
        self.data = [(i, Instance(label=i % 4, features=i * i))
                     for i in range(1000)]
        self.data_to_trans = [(i, Instance(features=i ** 3))
                              for i in range(1000)]

        self.table = eggroll.parallelize(self.data, include_key=True)
        self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
Ejemplo n.º 19
0
    def setUp(self):
        """Persist two small raw CSV-string tables under distinct
        namespaces for DataIO tests."""
        # Timestamp suffix keeps repeated runs from colliding on job name.
        eggroll.init("test_dataio" + str(int(time.time())))

        self.table = "dataio_table_test"
        self.namespace = "dataio_test"
        rows = [("a", "1,2,-1,0,0,5"), ("b", "4,5,6,0,1,2")]
        eggroll.parallelize(rows, include_key=True).save_as(self.table, self.namespace)

        self.table2 = "dataio_table_test2"
        self.namespace2 = "dataio_test2"
        rows2 = [("a", '-1,,NA,NULL,null,2')]
        eggroll.parallelize(rows2, include_key=True).save_as(self.table2, self.namespace2)
Ejemplo n.º 20
0
 def save_eval_result(self, eval_data):
     """Log the destination, then persist a single evaluation result to the
     table/namespace named in the workflow parameters (overwriting any
     existing table)."""
     wp = self.workflow_param
     LOGGER.info(
         "@ save evaluation result to table with namespace: {0} and name: {1}"
         .format(wp.evaluation_output_namespace,
                 wp.evaluation_output_table))
     eggroll.parallelize([eval_data],
                         include_key=False,
                         name=wp.evaluation_output_table,
                         namespace=wp.evaluation_output_namespace,
                         error_if_exist=False,
                         persistent=True)
Ejemplo n.º 21
0
    def test_sparse_abnormal_data(self):
        """Binning should tolerate an abnormal ('nan') value inside a
        SparseVector whose shape exceeds its populated indices.

        Fixes: removed a dead ``100 * np.random.rand(...)`` draw whose result
        was immediately overwritten on the next line, the unused ``indices``
        list, and the commented-out random-index sampling they supported.
        """
        final_result = []
        numpy_array = []
        # Vector shape is wider than the populated range, so trailing
        # positions are implicit zeros.
        sparse_inst_shape = self.feature_num + 15
        for i in range(self.data_num):
            # Deterministic feature values 0..feature_num-1, with one entry
            # per row replaced by the abnormal marker 'nan'.
            tmp = [ik for ik in range(self.feature_num)]
            tmp[i % self.feature_num] = 'nan'
            data_index = [idx for idx in range(self.feature_num)]
            sparse_inst = SparseVector(data_index,
                                       tmp,
                                       shape=sparse_inst_shape)
            if i == 0:
                aa = sparse_inst.get_data(0, 'a')
                print('in for loop: {}, type: {}'.format(aa, type(aa)))
            inst = Instance(inst_id=i, features=sparse_inst, label=0)
            final_result.append((str(i), inst))

            # Densify the sparse row for the reference numpy table.
            n = 0
            pointer = 0
            tmp_array = []
            while n < sparse_inst_shape:
                if n in data_index:
                    tmp_array.append(tmp[pointer])
                    pointer += 1
                else:
                    tmp_array.append(0)
                n += 1
            numpy_array.append(tmp_array)

        abnormal_value = final_result[0][1].features.get_data(0, 'a')
        print('abnormal_value: {}, type: {}'.format(abnormal_value,
                                                    type(abnormal_value)))
        table = eggroll.parallelize(final_result,
                                    include_key=True,
                                    partition=1)
        header = ['x' + str(i) for i in range(sparse_inst_shape)]
        numpy_table = np.array(numpy_array)
        table.schema = {'header': header}
        self.used_data_set.append(table)

        # 'nan' entries must be excluded from split-point fitting.
        bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
        split_points = bin_obj.fit_split_points(table)
        print('split_points: {}'.format(split_points))
        print(numpy_table)

        trans_result = bin_obj.transform(table,
                                         transform_cols_idx=-1,
                                         transform_type='bin_num')
        trans_result = trans_result.collect()
        print('transform result: ')
        for k, v in trans_result:
            value = v.features.get_all_data()
            value_list = []
            for value_k, value_v in value:
                value_list.append((value_k, value_v))
            print(k, value_list)
Ejemplo n.º 22
0
def save_model_parameters(model_parameters, model_table_name, model_namespace):
    """Persist a dict of model parameters as a keyed, persistent DTable.

    Raises if a table with the same name/namespace already exists
    (error_if_exist=True).
    """
    return parallelize(model_parameters.items(),
                       include_key=True,
                       name=model_table_name,
                       namespace=model_namespace,
                       error_if_exist=True,
                       persistent=True)
Ejemplo n.º 23
0
    def sample_data(data_instance, bin_sample_num=DEFAULT_BIN_SAMPLE_NUM):
        """
        sample data from a dtable

        Parameters
        ----------
        data_instance : DTable
            The input data

        bin_sample_num : int, max number of data to be sample to generate bin split points

        Returns
        -------
        sample_data: list, element is a (id, instance) tuple

        Fixes: corrected the "fsample" typo in the log message and removed
        a no-op comprehension that rebuilt the collected key list element
        by element.
        """
        LOGGER.info("sample data set")

        # Strip values so only the keys travel; collect them locally.
        data_key_none_value = data_instance.mapValues(lambda value: None)
        data_key_none_value_tuple = list(data_key_none_value.collect())

        data_num = len(data_key_none_value_tuple)

        if data_num <= bin_sample_num:
            # Small enough: keep every key.
            data_keys = data_key_none_value_tuple
        else:
            # Uniform sample of bin_sample_num keys without replacement.
            sample_idxs = np.random.choice(data_num,
                                           bin_sample_num,
                                           replace=False)
            data_keys = [data_key_none_value_tuple[idx] for idx in sample_idxs]

        # Join the sampled keys back to fetch the full instances.
        data_key_table = eggroll.parallelize(data_keys, include_key=True)
        sample_data = list(
            data_key_table.join(data_instance, lambda x, y: y).collect())
        return sample_data
Ejemplo n.º 24
0
    def setUp(self):
        """1000 random 20-feature instances keyed by str(i), plus the same
        data as a numpy matrix for cross-checking."""
        self.data_num = 1000
        self.feature_num = 20

        final_result = []
        numpy_array = []
        for i in range(self.data_num):
            features = np.random.rand(self.feature_num)
            final_result.append((str(i),
                                 Instance(inst_id=i, features=features, label=0)))
            numpy_array.append(features)

        table = eggroll.parallelize(final_result,
                                    include_key=True,
                                    partition=10)

        header = ['x' + str(i) for i in range(self.feature_num)]
        self.col_dict = {h: idx for idx, h in enumerate(header)}

        self.table = table
        self.table.schema = {'header': header}
        self.numpy_table = np.array(numpy_array)
        self.cols = [1, 2]
        self.used_data_set = []
Ejemplo n.º 25
0
    def transform(self, instance_table):
        """
        transform instances into features.

        Parameters
        ----------
        :param instance_table: dtable with a collection of (index, instance) pairs
        :return: (dtable of (index, transformed instance) pairs,
                  list of indexes in collection order)
        """
        LOGGER.debug("@ extract representative features from raw input")

        index_tracking_list = []
        features_list = []
        instances_list = []
        for idx, inst in instance_table.collect():
            index_tracking_list.append(idx)
            features_list.append(inst.features)
            instances_list.append(inst)

        # Run the whole batch through the model at once.
        trans_features = self.model.transform(np.array(features_list))

        # Write transformed features back onto the original instances,
        # preserving collection order.
        transformed = []
        for idx, inst, feat in zip(index_tracking_list, instances_list,
                                   trans_features):
            inst.set_feature(feat)
            transformed.append((idx, inst))

        dtable = eggroll.parallelize(transformed,
                                     include_key=True,
                                     partition=instance_table._partitions)
        return dtable, index_tracking_list
Ejemplo n.º 26
0
def create_shared_gradient_table(gradients, index_list):
    """Pair each gradient with its sample index and return them as a
    keyed DTable."""
    keyed_gradients = list(zip(index_list, gradients))
    return eggroll.parallelize(keyed_gradients, include_key=True)
Ejemplo n.º 27
0
    def setUp(self):
        """100 instances whose 3 features are drawn from {1, 2, 3}, wired
        into the args dict a OneHotEncoder component consumes."""
        self.data_num = 100
        self.feature_num = 3
        self.cols = [0, 1, 2]
        self.header = ['x' + str(i) for i in range(self.feature_num)]

        final_result = []
        for i in range(self.data_num):
            values = [np.random.choice([1, 2, 3])
                      for _ in range(self.feature_num)]
            inst = Instance(inst_id=i, features=np.array(values), label=0)
            final_result.append((str(i), inst))

        table = eggroll.parallelize(final_result,
                                    include_key=True,
                                    partition=10)
        table.schema = {"header": self.header}
        self.model_name = 'OneHotEncoder'

        self.table = table

        self.args = {"data": {self.model_name: {"data": table}}}
Ejemplo n.º 28
0
def create_table(data, indexes=None, model_table_name=None, model_namespace=None, persistent=False):
    """Materialize `data` as a DTable.

    Without `indexes`, rows are auto-keyed; with `indexes`, row i is stored
    under key indexes[i]. Fails if the target table already exists
    (error_if_exist=True).
    """
    if indexes is None:
        return parallelize(data, include_key=False,
                           name=model_table_name,
                           namespace=model_namespace,
                           error_if_exist=True,
                           persistent=persistent)

    keyed = {key: data[pos] for pos, key in enumerate(indexes)}
    return parallelize(keyed.items(), include_key=True,
                       name=model_table_name,
                       namespace=model_namespace,
                       error_if_exist=True,
                       persistent=persistent)
Ejemplo n.º 29
0
    def merge_splitinfo(self, splitinfo_guest, splitinfo_host):
        """Pick the best of each (guest, host) split-info pair in parallel
        and return the winners as a plain list."""
        LOGGER.info("merge splitinfo")
        paired = eggroll.parallelize(zip(splitinfo_guest, splitinfo_host),
                                     include_key=False,
                                     partition=self.data_bin._partitions)
        best_table = paired.mapValues(self.find_best_split_guest_and_host)
        return [info for _, info in best_table.collect()]
Ejemplo n.º 30
0
    def mini_batch_index_generator(self, data_inst=None, batch_size=320):
        # Yield one index DTable per mini-batch, rebuilding the cached batch
        # split whenever new data is supplied or the batch size changes.
        #
        # NOTE(review): if data_inst is None but batch_size differs from the
        # cached value, this still calls __mini_batch_data_seperator(None, ...)
        # and later reads data_inst._partitions, which would raise
        # AttributeError — confirm whether the condition should be `and`
        # rather than `or`, or whether callers never pass data_inst=None here.
        if data_inst is not None or batch_size != self.batch_size:
            self.batch_data_sids = self.__mini_batch_data_seperator(data_inst, batch_size)
            self.batch_size = batch_size
        batch_data_sids = self.batch_data_sids

        for bid in range(len(batch_data_sids)):
            index_data = batch_data_sids[bid]
            # Each batch's keyed index data is parallelized with the same
            # partition count as the input table.
            index_table = eggroll.parallelize(index_data, include_key=True, partition=data_inst._partitions)
            yield index_table