Exemple #1
0
    def split(self, data_inst, shuffle=True):
        """Yield one ``(train_data, test_data)`` pair per fold of ``data_inst``.

        Sample ids are gathered with ``collect_index``, optionally shuffled,
        and partitioned into ``self.n_splits`` folds by ``sk_KFold``
        (presumably scikit-learn's ``KFold``; the import is outside this
        block -- confirm). For every fold, the train and test ids are turned
        into key tables with ``eggroll.parallelize`` and joined against
        ``data_inst`` so that only the matching rows remain.

        Parameters
        ----------
        data_inst :
            Table-like object; this method reads ``schema``, ``_partitions``
            and calls ``join`` on it.
        shuffle : bool, default True
            When true, the ids are shuffled in place with
            ``np.random.shuffle`` (global NumPy random state) before the
            folds are cut.

        Yields
        ------
        tuple
            ``(train_data, test_data)``, both results of ``data_inst.join``
            with ``schema['header']`` set to the header of ``data_inst``.
        """
        header = data_inst.schema.get('header')

        data_sids_iter, _size = collect_index(data_inst)
        sid_list = [sid for sid, _ in data_sids_iter]

        # np.array() wraps every id in a numpy scalar, so remember the native
        # key type and convert back before building the join tables. Forcing
        # str() here would make non-string keys (e.g. int ids) miss every row
        # in the join. Assumes collect_index returns the same keys data_inst
        # is keyed by -- TODO confirm.
        key_type = type(sid_list[0]) if sid_list else None
        data_sids = np.array(sid_list)

        if shuffle:
            np.random.shuffle(data_sids)

        kf = sk_KFold(n_splits=self.n_splits)

        for train, test in kf.split(data_sids):
            # The value 1 is only a placeholder; the join below keeps the
            # value coming from data_inst (lambda x, y: x).
            train_sids_table = [(key_type(x), 1) for x in data_sids[train]]
            test_sids_table = [(key_type(x), 1) for x in data_sids[test]]

            train_table = eggroll.parallelize(train_sids_table,
                                              include_key=True,
                                              partition=data_inst._partitions)
            train_data = data_inst.join(train_table, lambda x, y: x)
            test_table = eggroll.parallelize(test_sids_table,
                                             include_key=True,
                                             partition=data_inst._partitions)
            test_data = data_inst.join(test_table, lambda x, y: x)

            # Copy the header across so each fold carries the same header
            # as the input.
            train_data.schema['header'] = header
            test_data.schema['header'] = header
            yield train_data, test_data
Exemple #2
0
    def split(self, data_inst):
        """Generate ``(train_data, test_data)`` tables for each fold.

        The global NumPy random state is seeded with ``self.random_seed``,
        the sample ids of ``data_inst`` are collected via ``collect_index``
        and, if ``self.shuffle`` is set, shuffled in place. ``sk_KFold``
        (presumably scikit-learn's ``KFold`` -- confirm against the file's
        imports) then cuts the ids into ``self.n_splits`` folds.

        Parameters
        ----------
        data_inst :
            Table-like object; this method reads ``schema``, ``_partitions``
            and calls ``join`` on it.

        Yields
        ------
        tuple
            ``(train_data, test_data)``, each the result of joining
            ``data_inst`` with the fold's id table, with
            ``schema['header']`` set to the header of ``data_inst``.
        """
        np.random.seed(self.random_seed)

        header = data_inst.schema.get('header')

        sid_pairs, _size = collect_index(data_inst)
        sid_list = [sid for sid, _ in sid_pairs]
        # Type of the first id; used to turn numpy scalars back into native
        # keys before they are handed to the join.
        key_type = type(sid_list[0]) if sid_list else None

        all_sids = np.array(sid_list)
        if self.shuffle:
            np.random.shuffle(all_sids)

        def _restrict(keyed_rows):
            # Build a key table from the rows and keep only the rows of
            # data_inst whose key appears in it (the join keeps data_inst's
            # value).
            key_table = session.parallelize(keyed_rows,
                                            include_key=True,
                                            partition=data_inst._partitions)
            return data_inst.join(key_table, lambda x, y: x)

        folds = sk_KFold(n_splits=self.n_splits)
        for train_idx, test_idx in folds.split(all_sids):
            fold_train = all_sids[train_idx]
            fold_test = all_sids[test_idx]

            # The 1 is a placeholder value; only the keys matter.
            train_rows = [(key_type(s), 1) for s in fold_train]
            test_rows = [(key_type(s), 1) for s in fold_test]

            train_data = _restrict(train_rows)
            test_data = _restrict(test_rows)

            for part in (train_data, test_data):
                part.schema['header'] = header
            yield train_data, test_data