def split(self, data_inst, shuffle=True):
    """Yield ``(train_data, test_data)`` pairs, one per K-fold partition.

    The sample ids of ``data_inst`` are gathered through ``collect_index``,
    optionally shuffled with the global NumPy random state, and divided into
    ``self.n_splits`` folds with ``sk_KFold``. For every fold, the train and
    test id sets are turned into small ``(id, 1)`` tables and joined back
    against ``data_inst`` so that only the matching rows are kept.

    Args:
        data_inst: Table-like object exposing ``schema``, ``join`` and
            ``_partitions`` (assumed to be the project's distributed table
            type -- TODO confirm).
        shuffle: When true, shuffle the ids in place before splitting.

    Yields:
        Tuple ``(train_data, test_data)``; both carry the ``'header'``
        schema entry of ``data_inst``.
    """
    header = data_inst.schema.get('header')
    data_sids_iter, _size = collect_index(data_inst)

    sid_list = [sid for sid, _ in data_sids_iter]
    # Remember the native key type. np.array() wraps ids in NumPy scalar
    # types, and forcing them through str() would make non-string ids
    # (e.g. ints) stop matching the keys of data_inst in the join below.
    # Falls back to str (the previous behaviour) when there are no ids.
    key_type = type(sid_list[0]) if sid_list else str

    data_sids = np.array(sid_list)
    if shuffle:
        np.random.shuffle(data_sids)

    kf = sk_KFold(n_splits=self.n_splits)
    for train, test in kf.split(data_sids):
        train_sids_table = [(key_type(x), 1) for x in data_sids[train]]
        test_sids_table = [(key_type(x), 1) for x in data_sids[test]]

        train_table = eggroll.parallelize(train_sids_table,
                                          include_key=True,
                                          partition=data_inst._partitions)
        # The join keeps the value from data_inst for every id present in
        # the id table; the constant 1 is only a placeholder value.
        train_data = data_inst.join(train_table, lambda x, y: x)

        test_table = eggroll.parallelize(test_sids_table,
                                         include_key=True,
                                         partition=data_inst._partitions)
        test_data = data_inst.join(test_table, lambda x, y: x)

        train_data.schema['header'] = header
        test_data.schema['header'] = header
        yield train_data, test_data
def split(self, data_inst):
    """Generate K-fold ``(train_data, test_data)`` pairs from ``data_inst``.

    Seeds the global NumPy random state with ``self.random_seed``, collects
    the sample ids via ``collect_index``, shuffles them when ``self.shuffle``
    is set, and splits them into ``self.n_splits`` folds with ``sk_KFold``.
    Each fold's ids are converted back to the type of the first collected id,
    loaded into an ``(id, 1)`` table and joined against ``data_inst``.

    Args:
        data_inst: Table-like object exposing ``schema``, ``join`` and
            ``_partitions`` (assumed to be the project's distributed table
            type -- TODO confirm).

    Yields:
        Tuple ``(train_data, test_data)``; both carry the ``'header'``
        schema entry of ``data_inst``.
    """
    np.random.seed(self.random_seed)
    header = data_inst.schema.get('header')

    index_iter, _size = collect_index(data_inst)
    collected_ids = [sid for sid, _ in index_iter]
    # Type of the first id; used to undo NumPy's scalar wrapping later on.
    key_type = type(collected_ids[0]) if collected_ids else None

    id_array = np.array(collected_ids)
    if self.shuffle:
        np.random.shuffle(id_array)

    def _restrict(id_pairs):
        # Load the (id, 1) pairs as a table and keep the matching rows.
        id_table = session.parallelize(id_pairs,
                                       include_key=True,
                                       partition=data_inst._partitions)
        return data_inst.join(id_table, lambda x, y: x)

    folds = sk_KFold(n_splits=self.n_splits)
    for train_idx, test_idx in folds.split(id_array):
        train_pairs = [(key_type(sid), 1) for sid in id_array[train_idx]]
        test_pairs = [(key_type(sid), 1) for sid in id_array[test_idx]]

        train_data = _restrict(train_pairs)
        test_data = _restrict(test_pairs)

        train_data.schema['header'] = header
        test_data.schema['header'] = header
        yield train_data, test_data