def _gen_data(self,
                  data_num,
                  feature_num,
                  partition,
                  expect_split_points,
                  is_sparse=False,
                  use_random=False):
        """Build a keyed table of synthetic instances and register it.

        Each instance gets ``feature_num`` features. Unless ``use_random``
        is set, every feature of a row carries the same value, taken from
        ``expect_split_points`` in round-robin order.

        Parameters
        ----------
        data_num : number of instances to generate (keys ``0..data_num-1``)
        feature_num : number of features per instance
        partition : forwarded to ``session.parallelize``
        expect_split_points : sequence of values cycled through by row
        is_sparse : wrap the features in a ``SparseVector`` when truthy
        use_random : draw features from ``np.random.random`` instead

        Returns
        -------
        The parallelized table; it is also appended to ``self.table_list``
        and its ``schema`` is set to ``{'header': [...]}``.
        """
        bin_num = len(expect_split_points)
        header = [str(col) for col in range(feature_num)]

        records = []
        last_point_hits = 0
        for data_key in range(data_num):
            value = expect_split_points[data_key % bin_num]
            # Each time the last split point comes up, a hit counter is
            # checked: when it is a multiple of bin_num the value is swapped
            # for the first split point instead.
            if value == expect_split_points[-1]:
                if last_point_hits % bin_num == 0:
                    value = expect_split_points[0]
                last_point_hits += 1

            if use_random:
                features = np.random.random(feature_num)
            else:
                features = value * np.ones(feature_num)

            if is_sparse:
                # Every position is listed explicitly, so the sparse vector
                # carries the same content as the dense one.
                features = SparseVector(list(range(feature_num)),
                                        data=features,
                                        shape=feature_num)

            records.append((data_key,
                            Instance(inst_id=data_key,
                                     features=features,
                                     label=data_key % 2)))

        result = session.parallelize(records,
                                     include_key=True,
                                     partition=partition)
        result.schema = {'header': header}
        self.table_list.append(result)
        return result
Beispiel #2
0
    def to_instance(param_list, value):
        """Parse one ``label<delim>fid:val<delim>fid:val...`` line into an Instance.

        ``param_list`` is read positionally as
        ``[delimitor, data_type, label_type, output_format, max_fid]``.

        The first column is the label (cast to int/float according to
        ``label_type``, otherwise left as a string). Remaining columns are
        ``fid:val`` pairs. With ``output_format == "dense"`` the result is a
        numpy array of length ``max_fid + 1``; with ``"sparse"`` it is a
        ``SparseVector`` of that shape.

        Raises
        ------
        ValueError : if ``output_format`` is neither "dense" nor "sparse".
        """
        delimitor, data_type, label_type = param_list[0], param_list[1], param_list[2]
        output_format, max_fid = param_list[3], param_list[4]

        if output_format not in ("dense", "sparse"):
            raise ValueError(
                "output format {} is not define".format(output_format))

        label, *pairs = value.split(delimitor, -1)

        if label_type == 'int':
            label = int(label)
        elif label_type in ["float", "float64"]:
            label = float(label)

        fid_value = []
        for pair in pairs:
            fid, val = pair.split(":", -1)
            fid = int(fid)

            if data_type in ["float", "float64"]:
                val = float(val)
            elif data_type in ["int", "int64"]:
                val = int(val)

            fid_value.append((fid, val))

        if output_format == "dense":
            dense = [0] * (max_fid + 1)
            for fid, val in fid_value:
                dense[fid] = val
            features = np.asarray(dense, dtype=data_type)
        else:
            indices = [fid for fid, _ in fid_value]
            data = [val for _, val in fid_value]
            features = SparseVector(indices, data, max_fid + 1)

        return Instance(inst_id=None, features=features, label=label)
Beispiel #3
0
    def data_format_transform(row):
        """Convert ``row.features`` to a ``SparseVector`` in place.

        Rows whose features already have the type name
        ``consts.SPARSE_VECTOR`` are returned untouched. Otherwise entries
        whose absolute value is below ``consts.FLOAT_ZERO`` are dropped and
        the remaining ones are stored by position.

        Returns the same ``row`` object (mutated when a conversion happened).
        """
        if type(row.features).__name__ == consts.SPARSE_VECTOR:
            return row

        feature_shape = row.features.shape[0]
        indices = []
        data = []

        for pos in range(feature_shape):
            entry = row.features[pos]
            # Written as "not (... < ...)" so that NaN entries, for which the
            # comparison is False, are kept exactly as before.
            if not (np.abs(entry) < consts.FLOAT_ZERO):
                indices.append(pos)
                data.append(row.features[pos])

        row.features = SparseVector(indices, data, feature_shape)
        return row
Beispiel #4
0
    def gen_data(self,
                 data_num,
                 feature_num,
                 partition,
                 is_sparse=False,
                 use_random=False):
        """Build a keyed table of synthetic instances.

        NOTE(review): ``bin_num`` is not defined inside this method; it is
        presumably a module-level constant -- confirm against the full file.

        Unless ``use_random`` is set, every feature of row ``k`` equals
        ``k % bin_num``, except that some rows where this is 0 are set to
        ``bin_num - 1`` instead (see the loop). In sparse mode the vector
        shape and the header are ten times ``feature_num`` wide while only
        the first ``feature_num`` positions are populated.

        Returns the parallelized table with ``schema`` set to
        ``{'header': [...]}``.
        """
        header = [str(col) for col in range(feature_num)]

        records = []
        zero_hits = 0
        for data_key in range(data_num):
            value = data_key % bin_num
            # Each time the value is 0 a hit counter is checked: when it is
            # a multiple of bin_num the value becomes bin_num - 1 instead.
            if value == 0:
                if zero_hits % bin_num == 0:
                    value = bin_num - 1
                zero_hits += 1

            if use_random:
                features = np.random.random(feature_num)
            else:
                features = value * np.ones(feature_num)

            if is_sparse:
                features = SparseVector(list(range(feature_num)),
                                        data=features,
                                        shape=10 * feature_num)
                header = [str(col) for col in range(feature_num * 10)]

            records.append((data_key,
                            Instance(inst_id=data_key,
                                     features=features,
                                     label=data_key % 2)))

        result = session.parallelize(records,
                                     include_key=True,
                                     partition=partition)
        result.schema = {'header': header}
        return result
Beispiel #5
0
    def to_instance(param_list, value):
        """Parse one delimited line of tags into a one-hot style Instance.

        ``param_list`` is read positionally as
        ``[delimitor, data_type, with_label, label_type, output_format, tags_dict]``
        where ``tags_dict`` maps a tag string to its feature position.

        When ``with_label`` is truthy the first column is the label (cast to
        int/float according to ``label_type``); all remaining columns are
        tags. Each known tag sets its position to 1. Tags that are absent
        from ``tags_dict`` are skipped: previously they produced a ``None``
        position, which raised ``TypeError`` in dense mode and stored a
        ``None`` index in sparse mode.

        Returns
        -------
        Instance with ``features`` being a numpy array (dense) or a
        ``SparseVector`` (sparse), both of length ``len(tags_dict)``.

        Raises
        ------
        ValueError : if ``output_format`` is neither "dense" nor "sparse".
        """
        delimitor = param_list[0]
        data_type = param_list[1]
        with_label = param_list[2]
        label_type = param_list[3]
        output_format = param_list[4]
        tags_dict = param_list[5]

        if output_format not in ["dense", "sparse"]:
            raise ValueError("output format {} is not define".format(output_format))

        cols = value.split(delimitor, -1)
        start_pos = 0
        label = None

        if with_label:
            start_pos = 1
            label = cols[0]
            if label_type == 'int':
                label = int(label)
            elif label_type in ["float", "float64"]:
                label = float(label)

        # Resolve tags to positions once, ignoring tags unknown to tags_dict.
        positions = [tags_dict[tag] for tag in cols[start_pos:] if tag in tags_dict]

        if output_format == "dense":
            features = [0] * len(tags_dict)
            for pos in positions:
                features[pos] = 1

            features = np.asarray(features, dtype=data_type)
        else:
            # The stored value only depends on data_type, so compute it once.
            one = float(1) if data_type in ["float", "float64"] else 1
            features = SparseVector(positions, [one] * len(positions), len(tags_dict))

        return Instance(inst_id=None,
                        features=features,
                        label=label)
Beispiel #6
0
        def convert(instances):
            """Replace the values of the transform columns by their WOE value.

            Relies on names from the enclosing scope: ``is_sparse``,
            ``transform_cols_idx``, ``abnormal_list``, ``bin_inner_param``,
            ``split_points_dict`` and ``bin_res``.

            Values found in ``abnormal_list`` and columns outside
            ``transform_cols_idx`` are kept unchanged. The ``features``
            attribute of ``instances`` is reassigned and the same object is
            returned.
            """

            def woe_of(col_idx, col_value):
                # Look up the bin of the value, then the WOE of that bin.
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                col_results = bin_res.all_cols_results.get(col_name)
                return col_results.woe_array[bin_num]

            if is_sparse:
                all_data = instances.features.get_all_data()
                data_shape = instances.features.get_shape()
                indice = []
                sparse_value = []
                for col_idx, col_value in all_data:
                    new_value = col_value
                    if col_idx in transform_cols_idx and col_value not in abnormal_list:
                        new_value = woe_of(col_idx, col_value)
                    indice.append(col_idx)
                    sparse_value.append(new_value)
                instances.features = SparseVector(indice, sparse_value, data_shape)
                return instances

            features = instances.features
            assert isinstance(features, np.ndarray)
            transform_cols_idx_set = set(transform_cols_idx)

            for col_idx, col_value in enumerate(features):
                if col_idx not in transform_cols_idx_set:
                    continue
                if col_value in abnormal_list:
                    replacement = col_value
                else:
                    replacement = woe_of(col_idx, col_value)
                features[col_idx] = replacement
            instances.features = features
            return instances
    def join_feature_with_label(inst, leaf_indices, leaf_mapping_list, vec_len, dense):
        """Encode per-tree leaf indices as one concatenated one-hot vector.

        For tree ``t`` the leaf ``leaf_indices[t]`` is mapped through
        ``leaf_mapping_list[t]`` to a local position, which is then shifted
        by the total size of the mappings of all previous trees.

        Returns a new ``Instance`` carrying ``inst.label`` and either a numpy
        vector of length ``vec_len`` (``dense`` truthy) or a ``SparseVector``
        of shape ``vec_len``.
        """
        label = inst.label

        # Global position of the active leaf of every tree.
        positions = []
        offset = 0
        for tree_idx, leaf_idx in enumerate(leaf_indices):
            leaf_mapping = leaf_mapping_list[tree_idx]
            positions.append(leaf_mapping[leaf_idx] + offset)
            offset += len(leaf_mapping)

        if dense:
            vec = np.zeros(vec_len)
            for pos in positions:
                vec[pos] = 1
            return Instance(features=vec, label=label)

        ones = [1] * len(positions)
        return Instance(features=SparseVector(indices=positions, data=ones, shape=vec_len), label=label)
Beispiel #8
0
    def gen_output_format(features,
                          data_type='float',
                          output_format='dense',
                          missing_impute=None):
        """Convert a row of raw feature values to a dense array or SparseVector.

        Parameters
        ----------
        features : indexable sequence of raw values
        data_type : dtype name; 'float'/'float64' and 'int' values are parsed
            and zero entries dropped in sparse mode, other types are stored
            as they are
        output_format : "dense" or "sparse"
        missing_impute : values treated as missing in sparse mode; when None
            the tokens '', 'NULL', 'null' and "NA" are used

        Returns
        -------
        ``np.asarray(features, dtype=data_type)`` for dense output, else a
        ``SparseVector`` of shape ``len(features)`` without missing entries.

        Raises
        ------
        ValueError : if ``output_format`` is neither "dense" nor "sparse".
        """
        if output_format not in ("dense", "sparse"):
            raise ValueError(
                "output format {} is not define".format(output_format))

        if output_format == "dense":
            return np.asarray(features, dtype=data_type)

        # Everything below handles the sparse vector.
        if missing_impute is None:
            missing_tokens = ['', 'NULL', 'null', "NA"]
        else:
            missing_tokens = missing_impute

        indices = []
        data = []
        column_shape = len(features)

        for i in range(column_shape):
            raw = features[i]
            if raw in missing_tokens:
                continue

            if data_type in ['float', 'float64']:
                parsed = float(raw)
                if np.fabs(parsed) < consts.FLOAT_ZERO:
                    continue
            elif data_type in ['int']:
                parsed = int(raw)
                if parsed == 0:
                    continue
            else:
                parsed = raw

            indices.append(i)
            data.append(parsed)

        return SparseVector(indices, data, column_shape)
Beispiel #9
0
    def setUp(self):
        """Create 100 random sparse instances and a table built from them.

        Each instance is built from up to 40 draws of an index in ``[0, 49]``;
        repeated indices are skipped, every new index gets a random value.
        Keys are the stringified row numbers, labels alternate 0/1.
        """
        self.data = []
        for row_id in range(100):
            seen = set()
            indices = []
            values = []
            for _ in range(40):
                idx = random.randint(0, 49)
                if idx in seen:
                    continue
                seen.add(idx)
                indices.append(idx)
                values.append(random.random())

            sparse_vec = SparseVector(indices, values, 50)
            instance = Instance(features=sparse_vec, label=row_id % 2)
            self.data.append((str(row_id), instance))

        self.table = session.parallelize(self.data, include_key=True)
        self.table.schema = {"header": ["fid" + str(col) for col in range(50)]}
Beispiel #10
0
    def data_format_transform(row):
        """
        transform data into sparse format

        Dense rows are copied and their features replaced by a SparseVector
        in which NaN entries are stored as ``NoneType()`` and entries with an
        absolute value below ``consts.FLOAT_ZERO`` are left out.

        Rows that are already sparse are returned as they are unless some
        stored value equals ``NoneType()`` or is NaN; in that case a deep
        copy with those values set to ``NoneType()`` is returned.
        """
        if type(row.features).__name__ == consts.SPARSE_VECTOR:
            sparse_vec = row.features.get_sparse_vector()
            replace_key = [
                key for key in sparse_vec
                if sparse_vec.get(key) == NoneType() or np.isnan(sparse_vec.get(key))
            ]

            if not replace_key:
                return row

            # Never touch the incoming row: patch a copy instead.
            new_row = copy.deepcopy(row)
            new_sparse_vec = new_row.features.get_sparse_vector()
            for key in replace_key:
                new_sparse_vec[key] = NoneType()
            return new_row

        feature_shape = row.features.shape[0]
        indices = []
        data = []

        for pos in range(feature_shape):
            if np.isnan(row.features[pos]):
                stored = NoneType()
            elif np.abs(row.features[pos]) < consts.FLOAT_ZERO:
                continue
            else:
                stored = row.features[pos]
            indices.append(pos)
            data.append(stored)

        new_row = copy.deepcopy(row)
        new_row.features = SparseVector(indices, data, feature_shape)
        return new_row
Beispiel #11
0
    def _convert_sparse_data(instances,
                             bin_inner_param: BinInnerParam,
                             bin_results: BinResults,
                             abnormal_list: list,
                             convert_type: str = 'bin_num'):
        """Return a deep copy of ``instances`` with binned sparse features.

        For every stored (index, value) pair whose index is listed in
        ``bin_inner_param.transform_bin_indexes`` and whose value is not in
        ``abnormal_list``, the value is replaced according to
        ``convert_type``:

        * ``'bin_num'`` -- the bin number from ``BaseBinning.get_bin_num``
        * ``'woe'`` -- the entry of the column's ``woe_array`` for that bin
        * anything else -- the original value

        All other pairs are copied unchanged. The input is not modified.
        """
        instances = copy.deepcopy(instances)
        all_data = instances.features.get_all_data()
        data_shape = instances.features.get_shape()
        transform_cols_idx = bin_inner_param.transform_bin_indexes
        split_points_dict = bin_results.all_split_points

        indice = []
        sparse_value = []
        for col_idx, col_value in all_data:
            new_value = col_value
            if col_idx in transform_cols_idx and col_value not in abnormal_list:
                col_name = bin_inner_param.header[col_idx]
                split_points = split_points_dict[col_name]
                bin_num = BaseBinning.get_bin_num(col_value, split_points)
                if convert_type == 'bin_num':
                    new_value = bin_num
                elif convert_type == 'woe':
                    col_results = bin_results.all_cols_results.get(col_name)
                    new_value = col_results.woe_array[bin_num]
            indice.append(col_idx)
            sparse_value.append(new_value)

        instances.features = SparseVector(indice, sparse_value, data_shape)
        return instances
Beispiel #12
0
    def _convert_sparse_data(instances, transform_cols_idx, split_points_dict, header):
        """Replace values of the transform columns by their bin number.

        Iterates the stored (index, value) pairs of ``instances.features``.
        For indices in ``transform_cols_idx`` the value is replaced by
        ``Binning.get_bin_num(value, split_points_dict[header[index]])``;
        all other pairs are kept. ``instances.features`` is reassigned to a
        new ``SparseVector`` of the same shape and ``instances`` is returned.
        """
        all_data = instances.features.get_all_data()
        data_shape = instances.features.get_shape()

        indice = []
        sparse_value = []
        for col_idx, col_value in all_data:
            new_value = col_value
            if col_idx in transform_cols_idx:
                split_points = split_points_dict[header[col_idx]]
                new_value = Binning.get_bin_num(col_value, split_points)
            indice.append(col_idx)
            sparse_value.append(new_value)

        instances.features = SparseVector(indice, sparse_value, data_shape)
        return instances
Beispiel #13
0
    def gen_output_format(features, data_type='float', output_format='dense'):
        """Convert a row of raw feature values to a dense array or SparseVector.

        Parameters
        ----------
        features : indexable sequence of raw values
        data_type : dtype name; sparse output supports 'float', 'float64'
            and 'int' only
        output_format : "dense" or "sparse"

        Returns
        -------
        ``np.asarray(features, dtype=data_type)`` for dense output. For
        sparse output a ``SparseVector`` of shape ``len(features)`` in which
        the tokens '', 'NULL', 'null', "NA" and zero values are left out.

        Raises
        ------
        ValueError : if ``output_format`` is neither "dense" nor "sparse",
            or if a non-missing value is reached in sparse mode while
            ``data_type`` is not one of the supported types.
        """
        if output_format not in ["dense", "sparse"]:
            raise ValueError("output format %s is not define" %
                             (output_format))

        if output_format == "dense":
            # Convert once; the array used to be built twice with the first
            # result thrown away.
            return np.asarray(features, dtype=data_type)

        # Only the sparse format can reach this point.
        indices = []
        data = []
        column_shape = len(features)

        for i in range(column_shape):
            raw = features[i]
            if raw in ['', 'NULL', 'null', "NA"]:
                continue

            if data_type in ['float', 'float64']:
                parsed = float(raw)
                if np.fabs(parsed) < consts.FLOAT_ZERO:
                    continue
            elif data_type in ['int']:
                parsed = int(raw)
                if parsed == 0:
                    continue
            else:
                raise ValueError("data type %s is not define" % (data_type))

            indices.append(i)
            data.append(parsed)

        return SparseVector(indices, data, column_shape)
Beispiel #14
0
    def test_instance(self):
        """Check shape, counts and lookups of a SparseVector with 9 entries.

        Squares of 1..9 are used as indices and cubes of 1..9 as values in a
        vector of shape 100.
        """
        base = range(1, 10)
        indices = [i * i for i in base]
        data = [i ** 3 for i in base]
        shape = 100

        sparse_data = SparseVector(indices, data, shape)
        self.assertTrue(sparse_data.shape == shape
                        and len(sparse_data.sparse_vec) == 9)
        self.assertTrue(sparse_data.count_zeros() == 91)
        self.assertTrue(sparse_data.count_non_zeros() == 9)

        # Stored positions return their value ...
        for idx, val in zip(indices, data):
            self.assertTrue(sparse_data.get_data(idx) == val)
        # ... all other positions fall back to the supplied default.
        for i in range(100):
            if i not in indices:
                self.assertTrue(sparse_data.get_data(i, i ** 4) == i ** 4)

        expected = dict(zip(indices, data))
        self.assertTrue(dict(sparse_data.get_all_data()) == expected)
Beispiel #15
0
    def to_instance(param_list, value):
        """Parse one delimited line of tags (optionally ``tag:value``) into an Instance.

        ``param_list`` is read positionally as
        ``[delimitor, data_type, tag_with_value, tag_value_delimitor,
        with_label, label_type, output_format, tags_dict]`` where
        ``tags_dict`` maps a tag to its feature position.

        When ``with_label`` is truthy the first column is the label. Every
        other column is a tag, or a tag/value pair split on
        ``tag_value_delimitor`` when ``tag_with_value`` is truthy; plain tags
        get the value 1. Tags missing from ``tags_dict`` are ignored.

        Returns an ``Instance`` whose features are a numpy array (dense) or
        a ``SparseVector`` (sparse) of length ``len(tags_dict)``.

        Raises
        ------
        ValueError : if ``output_format`` is neither "dense" nor "sparse".
        """
        delimitor, data_type = param_list[0], param_list[1]
        tag_with_value, tag_value_delimitor = param_list[2], param_list[3]
        with_label, label_type = param_list[4], param_list[5]
        output_format, tags_dict = param_list[6], param_list[7]

        if output_format not in ("dense", "sparse"):
            raise ValueError(
                "output format {} is not define".format(output_format))

        cols = value.split(delimitor, -1)
        start_pos = 0
        label = None

        if with_label:
            start_pos = 1
            label = cols[0]
            if label_type == 'int':
                label = int(label)
            elif label_type in ["float", "float64"]:
                label = float(label)

        def parse(fea):
            # Split "tag<delim>value" columns; plain tags carry the value 1.
            if tag_with_value:
                _tag, _val = fea.split(tag_value_delimitor, -1)
                return _tag, _val
            return fea, 1

        if output_format == "dense":
            features = [0] * len(tags_dict)
            for fea in cols[start_pos:]:
                _tag, _val = parse(fea)
                if _tag in tags_dict:
                    features[tags_dict[_tag]] = _val

            # The dtype conversion of the collected values happens here.
            features = np.asarray(features, dtype=data_type)
        else:
            indices = []
            data = []
            for fea in cols[start_pos:]:
                _tag, _val = parse(fea)
                if _tag not in tags_dict:
                    continue

                if data_type in ["float", "float64"]:
                    _val = float(_val)
                elif data_type in ["int", "int64", "long"]:
                    _val = int(_val)
                elif data_type == "str":
                    _val = str(_val)

                indices.append(tags_dict[_tag])
                data.append(_val)

            features = SparseVector(indices, data, len(tags_dict))

        return Instance(inst_id=None, features=features, label=label)
Beispiel #16
0
    def gen_data(self, data_num, partition):
        """Build a dense and a sparse table with six synthetic columns.

        Columns, in order:
          0-2: shuffled 0/1 columns with a share of ones of 0.8, 0.799, 0.801
          3:   standard normal samples
          4-5: shuffled columns where a share of 0.2 / 0.79 of the rows is
               0.5 and the rest are standard normal samples

        Returns ``(dense_table, sparse_table)``; both get the same header
        schema and ``self.header`` is set to that header.
        """
        header = [str(i) for i in range(6)]

        def shuffled_flags(mode_ratio):
            # Column of ones (mode) and zeros, shuffled in place.
            mode_num = int(mode_ratio * data_num)
            other_num = data_num - mode_num
            column = np.array([1] * mode_num + [0] * other_num)
            random.shuffle(column)
            return column

        def shuffled_half_mode(mode_ratio, report_count=False):
            # Column of 0.5 (mode) mixed with normal samples, shuffled in place.
            mode_num = int(mode_ratio * data_num)
            other_num = data_num - mode_num
            column = np.array([0.5] * mode_num + list(np.random.randn(other_num)))
            if report_count:
                print("col 0.5 count: {}".format(list(column).count(0.5)))
            random.shuffle(column)
            return column

        col_data = [shuffled_flags(ratio) for ratio in (0.8, 0.799, 0.801)]
        col_data.append(np.random.randn(data_num))
        col_data.append(shuffled_half_mode(0.2, report_count=True))
        col_data.append(shuffled_half_mode(0.79))

        data = []
        data_2 = []
        for key in range(data_num):
            features = np.array([col[key] for col in col_data])
            label = key % 2
            data.append((key, Instance(inst_id=key, features=features, label=label)))

            # Same row again, this time with every position stored explicitly.
            sparse_vec = SparseVector(
                indices=list(range(len(features))),
                data=features,
                shape=len(features))
            data_2.append((key, Instance(inst_id=key, features=sparse_vec, label=label)))

        result = session.parallelize(data,
                                     include_key=True,
                                     partition=partition)
        result_2 = session.parallelize(data_2,
                                       include_key=True,
                                       partition=partition)
        result.schema = {'header': header}
        result_2.schema = {'header': header}

        self.header = header
        return result, result_2
Beispiel #17
0
    def gen_output_format(features,
                          data_type='float',
                          exclusive_data_type_fid_map=None,
                          output_format='dense',
                          missing_impute=None):
        """Convert a row of raw feature values to a dense array or SparseVector.

        Parameters
        ----------
        features : indexable sequence of raw values
        data_type : default dtype name of the columns
        exclusive_data_type_fid_map : optional mapping from column position
            to a dtype name overriding ``data_type`` (dense output only)
        output_format : "dense" or "sparse"
        missing_impute : values treated as missing; when None the tokens
            '', 'NULL', 'null' and "NA" are used

        Returns
        -------
        Dense: a numpy array. For numeric ``data_type`` missing values become
        NaN. With a non-empty ``exclusive_data_type_fid_map`` every raw value
        is cast by the numpy attribute named after its dtype and an object
        array is returned.
        Sparse: a ``SparseVector`` of shape ``len(features)`` where missing
        values are stored as NaN and numeric zeros are left out.

        Raises
        ------
        ValueError : if ``output_format`` is neither "dense" nor "sparse".
        """
        if output_format not in ("dense", "sparse"):
            raise ValueError(
                "output format {} is not define".format(output_format))

        if missing_impute is None:
            missing_tokens = ['', 'NULL', 'null', "NA"]
        else:
            missing_tokens = missing_impute

        if output_format == "dense":
            format_features = copy.deepcopy(features)
            numeric_types = ["int", "int64", "long", "float", "float64", "double"]
            if data_type in numeric_types:
                for i in range(len(features)):
                    if features[i] in missing_tokens:
                        format_features[i] = np.nan

            if not exclusive_data_type_fid_map:
                return np.asarray(format_features, dtype=data_type)

            for fid in range(len(features)):
                dtype = (exclusive_data_type_fid_map[fid]
                         if fid in exclusive_data_type_fid_map else data_type)
                # The cast reads the raw value, not the imputed copy.
                format_features[fid] = getattr(np, dtype)(features[fid])

            return np.asarray(format_features, dtype=object)

        indices = []
        data = []
        column_shape = len(features)

        for i in range(column_shape):
            raw = features[i]
            if raw in missing_tokens:
                stored = np.nan
            elif data_type in ['float', 'float64', "double"]:
                stored = float(raw)
                if np.fabs(stored) < consts.FLOAT_ZERO:
                    continue
            elif data_type in ['int', "int64", "long"]:
                stored = int(raw)
                if stored == 0:
                    continue
            else:
                stored = raw

            indices.append(i)
            data.append(stored)

        return SparseVector(indices, data, column_shape)
Beispiel #18
0
    def convert_instance_to_bin(instance, bin_split_points=None):
        """
        Method use by mapValues Api, convert an instance object's features to bins

        A value is assigned the index of the first split point that is
        greater than or equal to it, or the number of split points when it
        exceeds all of them. Features without split points get bin 0.

        Parameters
        ----------
        instance : Instance Object

        bin_split_points: 2D numpy's ndarray,
            split points of each feature need to binning

        Returns
        -------
        instance: Instance Object, the instance object's features converted to bins
            (the same object that was passed in, with ``features`` replaced)

        """
        features = instance.features
        data_format = type(features).__name__
        sparse_data = data_format != "ndarray"

        if sparse_data:
            feature_shape = features.get_shape()
            feature_values = list(features.get_all_data())
        else:
            feature_shape = features.shape[0]
            feature_values = list(enumerate(features.tolist()))

        indices = []
        bins = []

        for fid, feature_value in feature_values:
            if sparse_data:
                indices.append(fid)

            split_points = bin_split_points[fid]
            point_num = split_points.shape[0]

            if point_num == 0:
                bin_id = 0
            elif point_num <= 20:
                # Few split points: a linear scan is enough.
                bin_id = point_num
                for idx in range(point_num):
                    if feature_value <= split_points[idx]:
                        bin_id = idx
                        break
            elif feature_value <= split_points[0]:
                bin_id = 0
            elif feature_value > split_points[point_num - 1]:
                bin_id = point_num
            else:
                # Binary search for the leftmost split point >= feature_value.
                bin_id = 0
                left, right = 0, point_num - 1
                while left <= right:
                    middle = (left + right) >> 1
                    if feature_value <= split_points[middle]:
                        bin_id = middle
                        right = middle - 1
                    else:
                        left = middle + 1

            bins.append(bin_id)

        if sparse_data:
            instance.features = SparseVector(indices, bins, feature_shape)
        else:
            instance.features = np.array(bins, dtype='int')

        return instance
Beispiel #19
0
    def convert_instance_to_bin(instance, bin_split_points=None):
        """Replace every feature value of ``instance`` by its bin index.

        ``bin_split_points[fid]`` holds the split points of feature ``fid``.
        A value is assigned the index of the first split point that is
        greater than or equal to it, or the number of split points when it
        exceeds all of them. Features without split points get bin 0.

        Dense (ndarray) features become an int ndarray; any other feature
        type is treated as a sparse vector and rebuilt as a ``SparseVector``
        with the same indices and shape. The instance is modified in place
        and returned.
        """
        features = instance.features
        data_format = type(features).__name__
        sparse_data = data_format != "ndarray"

        if sparse_data:
            feature_shape = features.get_shape()
            feature_values = list(features.get_all_data())
        else:
            feature_shape = features.shape[0]
            feature_values = list(enumerate(features.tolist()))

        indices = []
        bins = []

        for fid, feature_value in feature_values:
            if sparse_data:
                indices.append(fid)

            points = bin_split_points[fid]
            total = points.shape[0]

            if total == 0:
                bins.append(0)
                continue

            if total <= 20:
                # Short list of split points: scan from the left.
                bin_id = total
                for idx in range(total):
                    if feature_value <= points[idx]:
                        bin_id = idx
                        break
            elif feature_value <= points[0]:
                bin_id = 0
            elif feature_value > points[total - 1]:
                bin_id = total
            else:
                # Long list: binary search for the leftmost point >= value.
                bin_id = 0
                lo, hi = 0, total - 1
                while lo <= hi:
                    mid = (lo + hi) >> 1
                    if feature_value <= points[mid]:
                        bin_id = mid
                        hi = mid - 1
                    else:
                        lo = mid + 1

            bins.append(bin_id)

        if sparse_data:
            instance.features = SparseVector(indices, bins, feature_shape)
        else:
            instance.features = np.array(bins, dtype='int')

        return instance