Example #1
    def setUp(self):
        eggroll.init("test_instance")
        dense_inst = []
        for i in range(100):
            inst = Instance(features=(i % 16 * np.ones(20)))
            dense_inst.append((i, inst))
        self.dense_table = eggroll.parallelize(dense_inst,
                                               include_key=True,
                                               partition=2)

        sparse_inst = []
        col_zero = []
        for i in range(100):
            indices = []
            data = []
            for j in range(20):
                val = ((i + 5)**3 + (j + 1)**4) % 16
                if val > 0:
                    indices.append(j)
                    data.append(val)
                if j == 0:
                    col_zero.append(val)
            sparse_vec = SparseVector(indices, data, 20)
            inst = Instance(features=sparse_vec)
            sparse_inst.append((i, inst))

        self.sparse_inst = sparse_inst
        self.sparse_table = eggroll.parallelize(sparse_inst,
                                                include_key=True,
                                                partition=1)
Example #2
    def setUp(self):
        eggroll.init("test_instance")

        dense_inst = []
        headers = ['x' + str(i) for i in range(20)]
        for i in range(100):
            inst = Instance(features=(i % 16 * np.ones(20)))
            dense_inst.append((i, inst))
        self.dense_table = eggroll.parallelize(dense_inst,
                                               include_key=True,
                                               partition=2)
        self.dense_table.schema = {'header': headers}

        self.sparse_inst = []
        for i in range(100):
            seen = {}
            indices = []
            data = []
            for j in range(20):
                idx = random.randint(0, 29)
                if idx in seen:
                    continue
                seen[idx] = 1
                val = random.random()
                indices.append(idx)
                data.append(val)

            sparse_vec = SparseVector(indices, data, 30)
            self.sparse_inst.append((i, Instance(features=sparse_vec)))

        self.sparse_table = eggroll.parallelize(self.sparse_inst,
                                                include_key=True)
        self.sparse_table.schema = {
            "header": ["fid" + str(i) for i in range(30)]
        }
Example #3
    def _compensate_set_difference(self, original_data, data_output):
        self.coverage = data_output.count() / original_data.count()
        import copy
        schema = copy.deepcopy(original_data.schema)
        if self.need_label:
            original_data = original_data.mapValues(lambda v: Instance(label="unretrieved", features=[],
                                                                       inst_id=v.inst_id))
        else:
            feature_count = len(self.target_cols)
            features = np.array(["unretrieved"] * feature_count)
            original_data = original_data.mapValues(lambda v: Instance(features=features,
                                                                       inst_id=v.inst_id))
        # LOGGER.debug(f"original data features is {list(original_data.collect())[0][1].features}")
        # LOGGER.debug(f"original data label is {list(original_data.collect())[0][1].label}")

        data_output = original_data.union(data_output, lambda v, u: u)
        # LOGGER.debug(f"data_output features after union is {list(data_output.collect())[0][1].features}")
        # LOGGER.debug(f"data_output label after union is {list(data_output.collect())[0][1].label}")
        if self.need_label:
            schema["label_name"] = "retrieved_value"
            schema["header"] = []
            data_output.schema = schema
        else:
            schema["label_name"] = None
            schema["header"] = self.target_cols
            data_output.schema = schema
        self._sync_coverage(original_data)
        return data_output
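The union above relies on the merge function preferring the retrieved value whenever a key exists on both sides, so the "unretrieved" placeholder survives only for keys missing from data_output. A dict-based illustration of that semantics (a sketch, not the eggroll API):

def union(left, right, merge):
    # keep every key from both sides; on collision, apply merge(left_v, right_v)
    out = dict(left)
    for k, u in right.items():
        out[k] = merge(out[k], u) if k in out else u
    return out

placeholders = {1: "unretrieved", 2: "unretrieved", 3: "unretrieved"}
retrieved = {2: "value-2"}
print(union(placeholders, retrieved, lambda v, u: u))
# {1: 'unretrieved', 2: 'value-2', 3: 'unretrieved'}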
Example #4
    def gen_data(self, data_num, feature_num, partition, is_sparse=False, use_random=False):
        data = []
        shift_iter = 0
        header = [str(i) for i in range(feature_num)]

        for data_key in range(data_num):
            value = data_key % bin_num  # bin_num is assumed to be defined at module/class level
            if value == 0:
                if shift_iter % bin_num == 0:
                    value = bin_num - 1
                shift_iter += 1
            if not is_sparse:
                if not use_random:
                    features = value * np.ones(feature_num)
                else:
                    features = np.random.random(feature_num)
                inst = Instance(inst_id=data_key, features=features, label=data_key % 2)

            else:
                if not use_random:
                    features = value * np.ones(feature_num)
                else:
                    features = np.random.random(feature_num)
                data_index = [x for x in range(feature_num)]
                sparse_inst = SparseVector(data_index, data=features, shape=10 * feature_num)
                inst = Instance(inst_id=data_key, features=sparse_inst, label=data_key % 2)
                header = [str(i) for i in range(feature_num * 10)]

            data.append((data_key, inst))
        result = session.parallelize(data, include_key=True, partition=partition)
        result.schema = {'header': header}
        return result
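A self-contained trace of the value-shifting loop above (here with bin_num = 3, which gen_data assumes is defined externally): every bin_num-th key whose remainder is 0 is promoted to bin_num - 1.

bin_num, shift_iter, values = 3, 0, []
for data_key in range(12):
    value = data_key % bin_num
    if value == 0:
        if shift_iter % bin_num == 0:
            value = bin_num - 1   # promote every bin_num-th zero
        shift_iter += 1
    values.append(value)
print(values)  # [2, 1, 2, 0, 1, 2, 0, 1, 2, 2, 1, 2]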
Example #5
def split_into_guest_host_dtable(X, y, overlap_ratio=0.2, guest_split_ratio=0.5, guest_feature_num=16,
                                 tables_name=None, partition=1):
    data_size = X.shape[0]
    overlap_size = int(data_size * overlap_ratio)
    overlap_indexes = np.array(range(overlap_size))
    guest_size = int((data_size - overlap_size) * guest_split_ratio)

    guest_table_ns = "guest_table_ns"
    guest_table_name = "guest_table_name"
    host_table_ns = "host_table_ns"
    host_table_name = "host_table_name"
    if tables_name is not None:
        guest_table_ns = tables_name["guest_table_ns"]
        guest_table_name = tables_name["guest_table_name"]
        host_table_ns = tables_name["host_table_ns"]
        host_table_name = tables_name["host_table_name"]

    guest_temp = []
    for i in range(0, overlap_size + guest_size):
        guest_temp.append(
            (i, Instance(inst_id=None, weight=1.0, features=X[i, :guest_feature_num].reshape(1, -1), label=y[i, 0])))
    guest_data = table(name=guest_table_name, namespace=guest_table_ns, partition=partition)
    guest_data.put_all(guest_temp)

    host_temp = []
    for i in range(0, overlap_size):
        host_temp.append(
            (i, Instance(inst_id=None, weight=1.0, features=X[i, guest_feature_num:].reshape(1, -1), label=y[i, 0])))
    for i in range(overlap_size + guest_size, len(X)):
        host_temp.append(
            (i, Instance(inst_id=None, weight=1.0, features=X[i, guest_feature_num:].reshape(1, -1), label=y[i, 0])))
    host_data = table(name=host_table_name, namespace=host_table_ns, partition=partition)
    host_data.put_all(host_temp)
    return guest_data, host_data, overlap_indexes
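A plain-numpy sketch of how the row indices are partitioned above: the first overlap_size rows are shared, the next guest_size rows are guest-only, and the remainder is host-only (the guest additionally keeps only the first guest_feature_num columns, the host the rest).

import numpy as np

data_size, overlap_ratio, guest_split_ratio = 10, 0.2, 0.5
overlap_size = int(data_size * overlap_ratio)                     # 2 shared rows
guest_size = int((data_size - overlap_size) * guest_split_ratio)  # 4 guest-only rows
overlap_idx = np.arange(overlap_size)                             # [0 1]
guest_only = np.arange(overlap_size, overlap_size + guest_size)   # [2 3 4 5]
host_only = np.arange(overlap_size + guest_size, data_size)       # [6 7 8 9]
print(overlap_idx, guest_only, host_only)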
Example #6
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.gradient_operator = LogisticGradient()
        self.taylor_operator = TaylorLogisticGradient()

        self.X = np.array([[1, 2, 3, 4, 5],
                           [3, 2, 4, 5, 1],
                           [2, 2, 3, 1, 1]]) / 10
        self.X1 = np.c_[self.X, np.ones(3)]

        self.Y = np.array([[1], [1], [-1]])

        self.values = []
        for idx, x in enumerate(self.X):
            inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
            self.values.append((idx, inst))

        self.values1 = []
        for idx, x in enumerate(self.X1):
            inst = Instance(inst_id=idx, features=x, label=self.Y[idx])
            self.values1.append((idx, inst))

        self.coef = np.array([2, 2.3, 3, 4, 2.1]) / 10
        self.coef1 = np.append(self.coef, [1])
Example #7
    def setUp(self):
        session.init("test_label_checker")

        self.small_label_set = [Instance(label=i % 5) for i in range(100)]
        self.classify_inst = session.parallelize(self.small_label_set, include_key=False)
        self.regression_label = [Instance(label=random.random()) for i in range(100)]
        self.regression_inst = session.parallelize(self.regression_label)
        self.classify_checker = ClassifyLabelChecker()
        self.regression_checker = RegressionLabelChecker()
Example #8
    def _gen_data(self,
                  data_num,
                  feature_num,
                  partition,
                  expect_ratio,
                  is_sparse=False,
                  use_random=False):
        data = []
        shift_iter = 0
        header = [str(i) for i in range(feature_num)]
        # bin_num = 3
        label_count = {}
        # expect_ratio = {
        #     0: (1, 9),
        #     1: (1, 1),
        #     2: (9, 1)
        # }
        bin_num = len(expect_ratio)

        for data_key in range(data_num):
            value = data_key % bin_num
            if value == 0:
                if shift_iter % bin_num == 0:
                    value = bin_num - 1
                shift_iter += 1
            if not is_sparse:
                if not use_random:
                    features = value * np.ones(feature_num)
                else:
                    features = np.random.random(feature_num)
                label = self.__gen_label(value, label_count, expect_ratio)
                inst = Instance(inst_id=data_key,
                                features=features,
                                label=label)

            else:
                if not use_random:
                    features = value * np.ones(feature_num)
                else:
                    features = np.random.random(feature_num)
                data_index = [x for x in range(feature_num)]
                sparse_inst = SparseVector(data_index,
                                           data=features,
                                           shape=10 * feature_num)
                label = self.__gen_label(value, label_count, expect_ratio)
                inst = Instance(inst_id=data_key,
                                features=sparse_inst,
                                label=label)
                header = [str(i) for i in range(feature_num * 10)]

            data.append((data_key, inst))
        result = session.parallelize(data,
                                     include_key=True,
                                     partition=partition)
        result.schema = {'header': header}
        self.table_list.append(result)
        return result
Example #9
    def setUp(self):
        eggroll.init("test_stratified_sampler")
        self.data = []
        self.data_to_trans = []
        for i in range(1000):
            self.data.append((i, Instance(label=i % 4, features=i * i)))
            self.data_to_trans.append((i, Instance(features=i ** 3)))

        self.table = eggroll.parallelize(self.data, include_key=True)
        self.table_trans = eggroll.parallelize(self.data_to_trans, include_key=True)
Example #10
    def _merge_instance(id_map1, id_map2, need_label):
        """
        Re-key by joining two id maps: (a, b) and (a, c) become (c, b).

        :param id_map1: (a, b)
        :param id_map2: (a, c)
        :return: (c, b)
        """
        merge_table = id_map1.join(id_map2, lambda v, u: (u, v))
        if need_label:
            return merge_table.map(lambda k, v: (v[0], Instance(label=v[1], features=[])))
        else:
            return merge_table.map(lambda k, v: (v[0], Instance(features=v[1])))
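A dict-based sketch of the re-keying performed above, where joining (a, b) with (a, c) yields (c, b) (illustration only, not the eggroll join API):

id_map1 = {"a1": "b1", "a2": "b2"}   # key a -> value b
id_map2 = {"a1": "c1", "a2": "c2"}   # key a -> value c
merged = {id_map2[a]: id_map1[a] for a in id_map1.keys() & id_map2.keys()}
print(merged)  # {'c1': 'b1', 'c2': 'b2'}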
Example #11
    def _gen_data(self,
                  data_num,
                  feature_num,
                  partition,
                  expect_split_points,
                  is_sparse=False,
                  use_random=False):
        data = []
        shift_iter = 0
        header = [str(i) for i in range(feature_num)]
        bin_num = len(expect_split_points)

        for data_key in range(data_num):
            value = expect_split_points[data_key % bin_num]
            if value == expect_split_points[-1]:
                if shift_iter % bin_num == 0:
                    value = expect_split_points[0]
                shift_iter += 1
            if not is_sparse:
                if not use_random:
                    features = value * np.ones(feature_num)
                else:
                    features = np.random.random(feature_num)
                inst = Instance(inst_id=data_key,
                                features=features,
                                label=data_key % 2)

            else:
                if not use_random:
                    features = value * np.ones(feature_num)
                else:
                    features = np.random.random(feature_num)
                data_index = [x for x in range(feature_num)]
                sparse_inst = SparseVector(data_index,
                                           data=features,
                                           shape=feature_num)
                inst = Instance(inst_id=data_key,
                                features=sparse_inst,
                                label=data_key % 2)
                header = [str(i) for i in range(feature_num)]

            data.append((data_key, inst))
        result = session.parallelize(data,
                                     include_key=True,
                                     partition=partition)
        result.schema = {'header': header}
        self.table_list.append(result)
        return result
Example #12
    def gen_data(self, data_num, feature_num, partition):
        data = []
        header = [str(i) for i in range(feature_num)]
        # col_2 = np.random.rand(data_num)
        col_data = []
        for _ in range(feature_num - 1):
            # resample until the column mean is nonzero, so that the
            # coefficient of variation std / mean below is well defined
            while True:
                col_1 = np.random.rand(data_num)
                if np.mean(col_1) != 0:
                    break
            col_data.append(col_1)
        col_data.append(10 * np.ones(data_num))

        for key in range(data_num):
            data.append(
                (key,
                 Instance(features=np.array([col[key] for col in col_data]))))

        result = session.parallelize(data,
                                     include_key=True,
                                     partition=partition)
        result.schema = {'header': header}
        self.header = header

        self.coe_list = []
        for col in col_data:
            self.coe_list.append(np.std(col) / np.mean(col))
        return result
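The coe_list above holds each column's coefficient of variation (std / mean); a quick check in plain numpy, including the constant column appended last:

import numpy as np

rand_col = np.random.rand(1000)
const_col = 10 * np.ones(1000)
print(np.std(rand_col) / np.mean(rand_col))    # roughly 0.577 for uniform data
print(np.std(const_col) / np.mean(const_col))  # 0.0 -- a constant column has zero CV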
Example #13
    def _gen_table_data(self):
        if self._dense_table is not None:
            return self._dense_table, self._dense_not_inst_table, self._original_data
        headers = ['x' + str(i) for i in range(self.feature_num)]
        dense_inst = []
        dense_not_inst = []

        original_data = 100 * np.random.random((self.count, self.feature_num))
        # original_data = 100 * np.zeros((self.count, self.feature_num))

        for i in range(self.count):
            features = original_data[i, :]
            inst = Instance(features=features)
            dense_inst.append((i, inst))
            dense_not_inst.append((i, features))

        dense_table = session.parallelize(dense_inst,
                                          include_key=True,
                                          partition=16)
        dense_not_inst_table = session.parallelize(dense_not_inst,
                                                   include_key=True,
                                                   partition=16)
        dense_table.schema = {'header': headers}
        dense_not_inst_table.schema = {'header': headers}
        self._dense_table, self._dense_not_inst_table, self._original_data = \
            dense_table, dense_not_inst_table, original_data
        return dense_table, dense_not_inst_table, original_data
Example #14
    def fit(self, data_inst):
        """
        开始正式处理数据
        """
        self._abnormal_detection(data_inst)
        data_instances = data_inst.mapValues(self.load_data)

        LOGGER.info("开始归一化数据")
        LOGGER.info("开始计算sum_square_x_host的结果")
        sum_square_x_host = data_instances.mapValues(
            lambda x: np.sum(np.power(x.features, 2)))

        LOGGER.info("将sum_square_x_host发送给guest方")
        self.transfer_variable.host_to_guest.remote(obj=sum_square_x_host,
                                                    role=consts.GUEST,
                                                    idx=-1,
                                                    suffix=(0, 0))

        LOGGER.info("从guest方接收结果norm_x")
        self.norm_x = self.transfer_variable.guest_to_host.get(idx=-1,
                                                               suffix=(1, 1))

        LOGGER.info("开始归一化数据")
        self.data_output = data_inst.join(
            self.norm_x[0],
            lambda x, y: Instance(features=np.true_divide(x.features, y),
                                  label=x.label))
        return self.data_output
Example #15
    def fit(self, data_inst):
        """
        正式开始处理数据
        """
        self._abnormal_detection(data_inst)
        data_instances = data_inst.mapValues(self.load_data)

        LOGGER.info("开始归一化数据")
        LOGGER.info("开始计算sum_square_x_guest的结果")
        sum_square_x_guest = data_instances.mapValues(
            lambda x: np.sum(np.power(x.features, 2)))

        LOGGER.info("从host方接收sum_square_x_host的值")
        sum_square_x_host = self.transfer_variable.host_to_guest.get(
            idx=-1, suffix=(0, 0))

        LOGGER.info("开始求平方根norm_x的值")
        self.norm_x = sum_square_x_guest.join(sum_square_x_host[0],
                                              lambda g, h: (g + h)**0.5)

        LOGGER.info("将norm_x发送给对方")
        self.transfer_variable.guest_to_host.remote(self.norm_x,
                                                    role=consts.HOST,
                                                    idx=-1,
                                                    suffix=(1, 1))

        LOGGER.info("正式归一化")
        self.data_output = data_inst.join(
            self.norm_x,
            lambda x, y: Instance(features=np.true_divide(x.features, y),
                                  label=x.label))
        return self.data_output
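A plain-numpy sketch of the joint normalization computed by the two fit() methods above: each party holds a slice of the feature vector, the squared sums are combined into one row norm, and both parties divide by it.

import numpy as np

x_guest = np.array([3.0, 4.0])   # guest's share of one row
x_host = np.array([12.0])        # host's share of the same row
norm = (np.sum(x_guest ** 2) + np.sum(x_host ** 2)) ** 0.5   # 13.0
print(x_guest / norm, x_host / norm)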
Example #16
    def setUp(self):
        # eggroll.init("123")
        self.data_num = 1000
        self.feature_num = 200
        self.bin_num = 10
        final_result = []
        numpy_array = []
        for i in range(self.data_num):
            if 100 < i < 500:
                continue
            tmp = i * np.ones(self.feature_num)
            inst = Instance(inst_id=i, features=tmp, label=i % 2)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)
            numpy_array.append(tmp)
        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=10)

        header = ['x' + str(i) for i in range(self.feature_num)]

        self.table = table
        self.table.schema = {'header': header}

        self.numpy_table = np.array(numpy_array)
        self.cols = [1, 2]
Example #17
def initialize(y):
    y_inst = y.mapValues(
        lambda label: Instance(features=np.asarray([label])))
    y_inst.schema = {"header": ["label"]}
    statistics = MultivariateStatisticalSummary(y_inst, -1)
    mean = statistics.get_mean()["label"]
    return y.mapValues(lambda x: np.asarray([mean])), np.asarray([mean])
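A plain-numpy sketch of what initialize computes: the label mean becomes a constant initial prediction for every sample (the summary object above is just one way to obtain that mean).

import numpy as np

labels = np.array([1.0, 0.0, 1.0, 1.0])
mean = labels.mean()                    # 0.75
init_pred = np.full_like(labels, mean)  # one constant score per sample
print(init_pred, np.asarray([mean]))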
Example #18
def feed_into_dtable(ids, X, y, sample_range, feature_range, tables_name=None, partition=1):
    """
    Create an eggroll table feed with data specified by parameters provided

    parameters
    ----------
    :param ids: 1D numpy array
    :param X: 2D numpy array
    :param y: 2D numpy array
    :param sample_range: a tuple specifies the range of samples to feed into dtable
    :param feature_range: a tuple specifies the range of features to feed into dtable
    :param tables_name: a dictionary specifies table namespace (with key table_ns) and table name (with key table_name)
    :param partition: number of partition used when creating the dtable
    :return: an eggroll dtable
    """

    table_ns = "default_table_namespace"
    table_name = get_timestamp()
    if tables_name is not None:
        table_ns = tables_name["table_ns"]
        table_name = tables_name["table_name"]

    sample_list = []
    for i in range(sample_range[0], sample_range[1]):
        sample_list.append((ids[i], Instance(inst_id=ids[i],
                                             features=X[i, feature_range[0]:feature_range[1]],
                                             label=y[i, 0])))
    data_table = table(name=table_name, namespace=table_ns, partition=partition)
    data_table.put_all(sample_list)
    return data_table
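A hedged usage sketch for feed_into_dtable (it assumes a running eggroll session, as the other snippets on this page do; the shapes and ranges here are illustrative only):

import numpy as np

ids = np.arange(100)                   # 1D ids
X = np.random.rand(100, 8)             # 2D features
y = np.random.randint(0, 2, (100, 1))  # 2D labels, one column
dtable = feed_into_dtable(ids, X, y,
                          sample_range=(0, 50),    # rows 0..49
                          feature_range=(2, 6))    # feature columns 2..5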
Example #19
    def fit(self, data_inst):
        LOGGER.info("begin to make guest data")
        self._init_data(data_inst)

        LOGGER.info("split data into multiple random parts")
        self.secure()

        LOGGER.info("share one random part data to multiple hosts")
        self.sync_share_to_host()

        LOGGER.info("get share of one random part data from multiple hosts")
        self.recv_share_from_host()

        LOGGER.info("begin to get sum of multiple party")
        self.sub_key_sum()

        LOGGER.info("receive host sum from host")
        self.recv_host_sum_from_host()

        self.reconstruct()

        LOGGER.info("success to calculate privacy sum")
        self.secret_sum = self.secret_sum.join(data_inst, lambda s, v: Instance(features=numpy.array(s),
                                                                                inst_id=v.inst_id))

        self.secret_sum.schema = self.output_schema

        data_output = self.secret_sum

        return data_output
Example #20
    def setUp(self):
        # eggroll.init("123")
        self.data_num = 1000
        self.feature_num = 20
        final_result = []
        numpy_array = []
        for i in range(self.data_num):
            tmp = np.random.rand(self.feature_num)
            inst = Instance(inst_id=i, features=tmp, label=0)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)
            numpy_array.append(tmp)
        table = eggroll.parallelize(final_result,
                                    include_key=True,
                                    partition=10)

        header = ['x' + str(i) for i in range(self.feature_num)]
        self.col_dict = {}
        for idx, h in enumerate(header):
            self.col_dict[h] = idx

        self.table = table
        self.table.schema = {'header': header}
        self.numpy_table = np.array(numpy_array)
        self.cols = [1, 2]
        self.used_data_set = []
Example #21
def trans_sparse(instance):
    # convert a dense Instance into a SparseVector-backed one, keeping
    # every index (including zeros) and the original label
    dense_features = instance.features
    indices = [i for i in range(len(dense_features))]
    sparse_features = SparseVector(indices=indices, data=dense_features, shape=len(dense_features))
    return Instance(inst_id=None,
                    features=sparse_features,
                    label=instance.label)
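A small usage sketch for trans_sparse, assuming the same Instance and SparseVector classes imported by the other snippets on this page; every index is kept, including explicit zeros.

import numpy as np

dense = Instance(features=np.array([0.0, 1.5, 0.0, 2.5]), label=1)
sparse = trans_sparse(dense)
print(sorted(sparse.features.get_all_data()))
# expected: [(0, 0.0), (1, 1.5), (2, 0.0), (3, 2.5)]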
Example #22
    def test_sparse_abnormal_data(self):
        final_result = []
        numpy_array = []
        sparse_inst_shape = self.feature_num + 15
        indices = [x for x in range(self.feature_num + 10)]
        for i in range(self.data_num):
            tmp = [ik for ik in range(self.feature_num)]
            tmp[i % self.feature_num] = 'nan'
            # data_index = np.random.choice(indices, self.feature_num, replace=False)
            # data_index = sorted(data_index)
            data_index = [idx for idx in range(self.feature_num)]
            sparse_inst = SparseVector(data_index,
                                       tmp,
                                       shape=sparse_inst_shape)
            if i == 0:
                aa = sparse_inst.get_data(0, 'a')
                print('in for loop: {}, type: {}'.format(aa, type(aa)))
            inst = Instance(inst_id=i, features=sparse_inst, label=0)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)
            n = 0
            pointer = 0
            tmp_array = []
            while n < sparse_inst_shape:
                if n in data_index:
                    tmp_array.append(tmp[pointer])
                    pointer += 1
                else:
                    tmp_array.append(0)
                n += 1
            numpy_array.append(tmp_array)

        abnormal_value = final_result[0][1].features.get_data(0, 'a')
        print('abnormal_value: {}, type: {}'.format(abnormal_value,
                                                    type(abnormal_value)))
        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=1)
        header = ['x' + str(i) for i in range(sparse_inst_shape)]
        numpy_table = np.array(numpy_array)
        table.schema = {'header': header}
        self.used_data_set.append(table)

        bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
        split_points = bin_obj.fit_split_points(table)
        print('split_points: {}'.format(split_points))
        print(numpy_table)

        trans_result = bin_obj.transform(table,
                                         transform_cols_idx=-1,
                                         transform_type='bin_num')
        trans_result = trans_result.collect()
        print('transform result: ')
        for k, v in trans_result:
            value = v.features.get_all_data()
            value_list = []
            for value_k, value_v in value:
                value_list.append((value_k, value_v))
            print(k, value_list)
Example #23
    def to_instance(self, features, label=None):
        if self.header is None and len(features) != 0:
            raise ValueError(
                "features shape {} not equal to header shape 0".format(
                    len(features)))
        elif self.header is not None and len(self.header) != len(features):
            raise ValueError(
                "features shape {} not equal to header shape {}".format(
                    len(features), len(self.header)))

        if self.label_idx is not None:
            if self.label_type == 'int':
                label = int(label)
            elif self.label_type in ["float", "float64"]:
                label = float(label)

        format_features = DenseFeatureTransformer.gen_output_format(
            features,
            self.data_type,
            self.exclusive_data_type_fid_map,
            self.output_format,
            missing_impute=self.missing_impute)

        return Instance(inst_id=None, features=format_features, label=label)
Example #24
    def setUp(self):
        self.feature_histogram = FeatureHistogram()
        eggroll.init("test_feature_histogram")
        data_insts = []
        for i in range(1000):
            indices = []
            data = []
            for j in range(10):
                x = random.randint(0, 5)
                if x != 0:
                    data.append(x)
                    indices.append(j)
            sparse_vec = SparseVector(indices, data, shape=10)
            data_insts.append((Instance(features=sparse_vec), (1, random.randint(0, 3))))
        self.node_map = {0: 0, 1: 1, 2: 2, 3: 3}
        self.data_insts = data_insts
        self.data_bin = eggroll.parallelize(data_insts, include_key=False)

        self.grad_and_hess_list = [(random.random(), random.random()) for i in range(1000)]
        self.grad_and_hess = eggroll.parallelize(self.grad_and_hess_list, include_key=False)

        bin_split_points = []
        for i in range(10):
            bin_split_points.append(np.arange(5))
        self.bin_split_points = np.array(bin_split_points)
        self.bin_sparse = [0 for i in range(10)]
Example #25
    def setUp(self):
        self.data_num = 1000
        self.feature_num = 3
        self.cols = [0, 1, 2, 3]
        self.header = ['x' + str(i) for i in range(self.feature_num)]
        final_result = []

        for i in range(self.data_num):
            tmp = []
            for _ in range(self.feature_num):
                tmp.append(np.random.choice([1, 2, 3, 'test_str']))
            tmp = np.array(tmp)
            inst = Instance(inst_id=i, features=tmp, label=0)
            tmp_pair = (str(i), inst)
            final_result.append(tmp_pair)

        table = session.parallelize(final_result,
                                    include_key=True,
                                    partition=10)
        table.schema = {"header": self.header}
        self.model_name = 'OneHotEncoder'

        self.table = table

        self.args = {"data": {self.model_name: {"data": table}}}
Example #26
    def setUp(self):
        eggroll.init("test_instance")
        dense_inst = []
        dense_not_inst = []
        headers = ['x' + str(i) for i in range(20)]
        self.header = headers
        self.eps = 1e-5
        self.count = 100
        self.dense_data_transpose = []
        for i in range(self.count):
            features = i % 16 * np.ones(20)
            inst = Instance(features=features)
            dense_inst.append((i, inst))
            self.dense_data_transpose.append(features)
            dense_not_inst.append((i, features))
        self.dense_inst = dense_inst
        self.dense_not_inst = dense_not_inst
        self.dense_data_transpose = np.array(self.dense_data_transpose)
        self.dense_data_transpose = self.dense_data_transpose.transpose()

        self.dense_table = eggroll.parallelize(dense_inst, include_key=True, partition=5)
        self.dense_not_inst_table = eggroll.parallelize(dense_not_inst, include_key=True, partition=5)
        self.dense_table.schema = {'header': headers}
        self.dense_not_inst_table.schema = {'header': headers}

        col_index = [1, 2, 3]
        self.col_index = col_index
        self.summary_obj = MultivariateStatisticalSummary(self.dense_table, col_index, abnormal_list=[None])
        self.summary_obj_not_inst = MultivariateStatisticalSummary(self.dense_not_inst_table, col_index,
                                                                   abnormal_list=[None])
Example #27
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)

        size = 10
        self.wx = eggroll.parallelize(
            [self.paillier_encrypt.encrypt(i) for i in range(size)])
        self.en_sum_wx_square = eggroll.parallelize(
            [self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)])
        self.w = [i for i in range(size)]
        self.data_inst = eggroll.parallelize(
            [Instance(features=[1 for _ in range(size)], label=pow(-1, i % 2))
             for i in range(size)],
            partition=1)

        # test fore_gradient
        self.fore_gradient_local = [
            -0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75
        ]
        # test gradient
        self.gradient = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125
        ]
        self.gradient_fit_intercept = [
            1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125,
            1.125, 1.125
        ]

        self.loss = 4.505647
Example #28
    def setUp(self):
        self.paillier_encrypt = PaillierEncrypt()
        self.paillier_encrypt.generate_key()
        # self.hetero_lr_gradient = HeteroLogisticGradient(self.paillier_encrypt)
        self.hetero_lr_gradient = hetero_lr_gradient_and_loss.Guest()

        size = 10
        self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)],
                                         partition=48,
                                         include_key=False)
        # self.en_wx = session.parallelize([self.paillier_encrypt.encrypt(i) for i in range(size)])

        self.en_sum_wx_square = session.parallelize([self.paillier_encrypt.encrypt(np.square(i)) for i in range(size)],
                                                    partition=48,
                                                    include_key=False)
        self.wx = np.array([i for i in range(size)])
        self.w = self.wx / np.array([1 for _ in range(size)])
        self.data_inst = session.parallelize(
            [Instance(features=np.array([1 for _ in range(size)]), label=pow(-1, i % 2)) for i in range(size)],
            partition=48, include_key=False)

        # test fore_gradient
        self.fore_gradient_local = [-0.5, 0.75, 0, 1.25, 0.5, 1.75, 1, 2.25, 1.5, 2.75]
        # test gradient
        self.gradient = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125]
        self.gradient_fit_intercept = [1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125, 1.125]

        self.loss = 4.505647
Example #29
    def test_replace_predict_label(self):
        true_label, predict_label, predict_score, predict_detail, predict_type = 1, 0, 0.1, {"1": 0.1, "0": 0.9}, "train"
        predict_result = Instance(inst_id=0,
                                  features=[true_label, predict_label, predict_score, predict_detail, predict_type])
        r_predict_instance = self.label_transformer_obj.replace_predict_label(predict_result, self.predict_label_encoder)
        r_predict_result = r_predict_instance.features
        c_predict_result = ["yes", "no", predict_score, {"yes": 0.1, "no": 0.9}, predict_type]
        self.assertEqual(r_predict_result, c_predict_result)
Example #30
    def test_instance(self):
        inst = Instance(inst_id=5, weight=2.0, features=[1, 2, 3], label=-5)
        self.assertTrue(inst.inst_id == 5 and abs(inst.weight - 2.0) < 1e-8
                        and inst.features == [1, 2, 3] and inst.label == -5)

        inst.set_weight(3)
        inst.set_label(5)
        inst.set_feature(["yes", "no"])
        self.assertTrue(inst.weight == 3 and inst.label == 5
                        and inst.features == ["yes", "no"])