Example #1
0
    def test_tag_with_value_sparse_output_format(self):
        """Read "tag:value" rows into sparse vectors and compare every value.

        The expected column of a tag is its rank among the sorted distinct
        tag names found in ``self.data_with_value``. Each value written in a
        raw row must be within ``consts.FLOAT_ZERO`` of what the reader
        stored at that column for the same row.
        """
        param = DataIOParam()
        param.input_format = "tag"
        param.data_type = "float"
        param.tag_with_value = True
        param.tag_value_delimitor = ":"
        param.delimitor = ' '
        param.with_label = False
        param.output_format = "sparse"

        instances = SparseTagReader(param).read_data(
            self.table2, self.namespace)
        features = [inst.features for _, inst in instances.collect()]

        # Collect every distinct tag name (the part before ":").
        tag_names = set()
        for row in self.data_with_value:
            tag_names.update(item.split(":")[0] for item in row[1].split(" "))
        tag_to_idx = {tag: pos for pos, tag in enumerate(sorted(tag_names))}

        for row_no, row in enumerate(self.data_with_value):
            for item in row[1].split(" "):
                column = tag_to_idx.get(item.split(":")[0])
                expected = float(item.split(":")[1])

                self.assertTrue(
                    np.abs(expected - features[row_no].get_data(column))
                    < consts.FLOAT_ZERO)
Example #2
0
 def test_sparse_output_format(self):
     """Check the features produced by the dense reader in sparse mode.

     For the instance keyed ``'a'`` the features must be a ``SparseVector``
     holding 4 stored entries and reporting a shape of 6.
     """
     param = DataIOParam()
     param.output_format = "sparse"
     collected = DenseFeatureReader(param).read_data(
         self.table, self.namespace).collect()
     features = dict(collected)['a'].features
     self.assertTrue(type(features).__name__ == "SparseVector")
     self.assertTrue(len(features.sparse_vec) == 4)
     self.assertTrue(features.shape == 6)
Example #3
0
 def test_with_label(self):
     """Check label extraction when ``with_label`` is on and ``label_idx`` is 2.

     The instance keyed ``'a'`` must carry label ``-1`` and a feature array
     whose first dimension is 5.
     """
     param = DataIOParam()
     param.with_label = True
     param.label_idx = 2
     collected = DenseFeatureReader(param).read_data(
         self.table, self.namespace).collect()
     instance_a = dict(collected)['a']
     label, features = instance_a.label, instance_a.features
     self.assertTrue(label == -1)
     self.assertTrue(features.shape[0] == 5)
Example #4
0
    def test_dense_output_format(self):
        """Check dense output of the tag reader against a 0/1 indicator row.

        Every distinct tag in ``self.data`` is assigned a column by its rank
        in sorted order. For each input row the expected vector holds 1 in
        the columns of the tags present on that row and 0 elsewhere; the
        reader's vector for the same row must match it element-wise within
        ``consts.FLOAT_ZERO``.
        """
        dataio_param = DataIOParam()
        dataio_param.input_format = "tag"
        dataio_param.data_type = 'int'
        dataio_param.delimitor = ' '
        dataio_param.with_label = False
        dataio_param.output_format = "dense"
        reader = SparseTagReader(dataio_param)
        tag_insts = reader.read_data(self.table, self.namespace)
        features = [inst.features for _, inst in tag_insts.collect()]

        tags = set()
        for row in self.data:
            tags.update(row[1].split(" "))
        tag_dict = {tag: idx for idx, tag in enumerate(sorted(tags))}

        for i, row in enumerate(self.data):
            ori_feature = np.zeros(len(tag_dict), dtype='int')
            for tag in row[1].split(" "):
                ori_feature[tag_dict[tag]] = 1

            # Compare with this row's vector only, and require every element
            # to be within tolerance. Reducing with .all() before comparing
            # to the tolerance would degrade the check to bool-vs-float and
            # let mismatches through.
            self.assertTrue(
                (np.abs(ori_feature - features[i]) < consts.FLOAT_ZERO).all())
Example #5
0
    def test_tag_with_value_dense_output_format(self):
        """Check dense output of the tag reader for "tag:value" input rows.

        Every distinct tag name in ``self.data_with_value`` is assigned a
        column by its rank in sorted order. For each input row the expected
        vector holds the row's value in the column of each tag and 0
        elsewhere; the reader's vector for the same row must match it
        element-wise within ``consts.FLOAT_ZERO``.
        """
        dataio_param = DataIOParam()
        dataio_param.input_format = "tag"
        dataio_param.data_type = 'float'
        # The rows are "tag:value" pairs (they are split on ":" below), so
        # the reader is configured the same way as in
        # test_tag_with_value_sparse_output_format.
        dataio_param.tag_with_value = True
        dataio_param.tag_value_delimitor = ":"
        dataio_param.delimitor = ' '
        dataio_param.with_label = False
        dataio_param.output_format = "dense"
        reader = SparseTagReader(dataio_param)
        tag_insts = reader.read_data(self.table2, self.namespace)
        features = [inst.features for _, inst in tag_insts.collect()]

        tags = set()
        for row in self.data_with_value:
            tags.update(item.split(":")[0] for item in row[1].split(" "))
        tag_dict = {tag: idx for idx, tag in enumerate(sorted(tags))}

        for i, row in enumerate(self.data_with_value):
            ori_feature = np.zeros(len(tag_dict), dtype='float64')
            for tag_with_value in row[1].split(" "):
                parts = tag_with_value.split(":")
                ori_feature[tag_dict[parts[0]]] = float(parts[1])

            # Compare with this row's vector only, and require every element
            # to be within tolerance. Reducing with .all() before comparing
            # to the tolerance would degrade the check to bool-vs-float and
            # let mismatches through.
            self.assertTrue(
                (np.abs(ori_feature - features[i]) < consts.FLOAT_ZERO).all())
Example #6
0
    def test_dense_output_format(self):
        """Read sparse-format rows and verify the resulting SparseVector.

        Note: despite the method name, ``output_format`` is set to "sparse"
        here and the features are checked to be a ``SparseVector``.

        For each of the first 100 instances the vector shape must equal
        ``self.max_feature + 1``, the label must equal ``index % 2``, and
        every "fid:value" pair of the raw row must be stored within
        ``consts.FLOAT_ZERO`` of its written value.
        """
        param = DataIOParam()
        param.input_format = "sparse"
        param.delimitor = ' '
        param.output_format = "sparse"
        collected = SparseFeatureReader(param).read_data(
            self.table, self.namespace).collect()
        insts = list(collected)
        for row_no in range(100):
            instance = insts[row_no][1]
            features = instance.features
            self.assertTrue(type(features).__name__ == "SparseVector")
            self.assertTrue(features.get_shape() == self.max_feature + 1)
            self.assertTrue(instance.label == row_no % 2)

            # The first field of the raw row is skipped; the remaining
            # fields are "fid:value" pairs.
            for pair in self.data[row_no][1].split(" ")[1:]:
                fid, val = pair.split(":")

                self.assertTrue(
                    np.fabs(features.get_data(int(fid)) - float(val))
                    < consts.FLOAT_ZERO)
Example #7
0
 def test_missing_value_fill(self):
     """Check that the designated fill value shows up in the sparse output.

     With ``missing_fill`` enabled, ``missing_fill_method = "designated"``
     and ``default_value = 100``, positions 1 through 4 of the features of
     the instance keyed ``'a'`` must all read back as 100.
     """
     fill_value = 100
     param = DataIOParam()
     param.missing_fill = True
     param.with_label = False
     param.output_format = "sparse"
     param.default_value = fill_value
     param.missing_fill_method = "designated"
     param.data_type = 'int'
     collected = DenseFeatureReader(param).read_data(
         self.table2, self.namespace).collect()
     features = dict(collected)['a'].features
     for position in range(1, 5):
         self.assertTrue(features.get_data(position) == fill_value)
Example #8
0
    def test_dense_output_format(self):
        """Read sparse-format rows and verify the dense ndarray output.

        For each of the first 100 instances the features must be an
        ``ndarray`` of length ``self.max_feature + 1``, the label must equal
        ``index % 2``, and the array must match, element-wise within
        ``consts.FLOAT_ZERO``, a vector rebuilt from the raw row's
        "fid:value" pairs (0 where the row has no entry).
        """
        dataio_param = DataIOParam()
        dataio_param.input_format = "sparse"
        dataio_param.delimitor = ' '
        dataio_param.output_format = "dense"
        reader = SparseFeatureReader(dataio_param)
        insts = list(reader.read_data(self.table, self.namespace).collect())
        for i in range(100):
            features = insts[i][1].features
            self.assertTrue(type(features).__name__ == "ndarray")
            self.assertTrue(features.shape[0] == self.max_feature + 1)
            self.assertTrue(insts[i][1].label == i % 2)

            # The first field of the raw row is skipped; the remaining
            # fields are "fid:value" pairs.
            ori_feat = [0] * (self.max_feature + 1)
            for pair in self.data[i][1].split(" ")[1:]:
                fid, val = pair.split(":")
                ori_feat[int(fid)] = float(val)

            ori_feat = np.asarray(ori_feat, dtype=dataio_param.data_type)

            # Apply the tolerance per element, then require all to pass.
            # Reducing with .any() first would compare a bool against the
            # tolerance instead of the actual differences.
            self.assertTrue(
                (np.abs(ori_feat - features) < consts.FLOAT_ZERO).all())
Example #9
0
 def test_dense_output_format(self):
     """Check the dense reader's output with an unmodified ``DataIOParam``.

     The values keyed ``'a'`` and ``'b'`` must be ``Instance`` objects. For
     ``'a'`` the weight must be 1.0 (within ``consts.FLOAT_ZERO``), the
     label must be ``None``, and the features must be a ``float64``
     ``ndarray`` whose first dimension is 6.
     """
     dataio_param = DataIOParam()
     reader = DenseFeatureReader(dataio_param)
     data = reader.read_data(self.table, self.namespace).collect()
     result = dict(data)
     self.assertTrue(type(result['a']).__name__ == "Instance")
     self.assertTrue(type(result['b']).__name__ == "Instance")
     vala = result['a']
     features = vala.features
     weight = vala.weight
     label = vala.label
     self.assertTrue(np.abs(weight - 1.0) < consts.FLOAT_ZERO)
     self.assertTrue(type(features).__name__ == "ndarray")
     # Identity check: "== None" can be overridden by the operand's __eq__.
     self.assertIsNone(label)
     self.assertTrue(features.shape[0] == 6)
     self.assertTrue(features.dtype == "float64")
Example #10
0
    def test_sparse_output_format(self):
        """Read sparse-format rows and compare the stored sparse mapping.

        For each of the first 100 instances the vector shape must equal
        ``self.max_feature + 1``, the label must equal ``index % 2``, and
        ``sparse_vec`` must equal the ``{fid: value}`` mapping rebuilt from
        the raw row's "fid:value" pairs.
        """
        param = DataIOParam()
        param.input_format = "sparse"
        param.delimitor = ' '
        param.default_value = 2**30
        param.output_format = "sparse"
        collected = SparseFeatureReader(param).read_data(
            self.table, self.namespace).collect()
        insts = list(collected)
        for row_no in range(100):
            instance = insts[row_no][1]
            self.assertTrue(
                instance.features.get_shape() == self.max_feature + 1)
            self.assertTrue(instance.label == row_no % 2)

            # The first field of the raw row is skipped; the remaining
            # fields are "fid:value" pairs.
            expected = {}
            for pair in self.data[row_no][1].split(" ")[1:]:
                fid, val = pair.split(":")
                expected[int(fid)] = float(val)

            self.assertTrue(expected == instance.features.sparse_vec)
Example #11
0
    def test_sparse_output_format(self):
        """Check sparse output of the tag reader against a {column: 1} map.

        Every distinct tag in ``self.data`` is assigned a column by its rank
        in sorted order. For each input row, ``sparse_vec`` of the reader's
        output must equal a dict mapping the column of each tag on that row
        to 1.
        """
        param = DataIOParam()
        param.input_format = "tag"
        param.data_type = "int"
        param.delimitor = ' '
        param.with_label = False
        param.output_format = "sparse"

        instances = SparseTagReader(param).read_data(
            self.table, self.namespace)
        features = [inst.features for _, inst in instances.collect()]

        tag_names = set()
        for row in self.data:
            tag_names.update(row[1].split(" "))
        tag_to_idx = {tag: pos for pos, tag in enumerate(sorted(tag_names))}

        for row_no, row in enumerate(self.data):
            expected = {tag_to_idx.get(tag): 1 for tag in row[1].split(" ")}

            self.assertTrue(expected == features[row_no].sparse_vec)