def main():
    BATCH_SIZE = 32
    NUM_EPOCH = 12
    LR = 0.001
    CLIP = 1
    STEP_SIZE = 4
    GAMMA = 0.1
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')
    dataset = Dataset()
    train_data, valid_data, test_data = dataset.build_dataset()
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device)

    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]
    TRG_PAD_IDX = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM, DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.apply(init_weight)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=LR)
    criterion = CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)
    scheduler = StepLR(optimizer, STEP_SIZE, GAMMA)

    min_valid_loss = 1e10
    for e in range(NUM_EPOCH):
        print("Epoch: {}".format(e + 1))
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        print("Train loss: {}".format(train_loss))
        valid_loss = evaluate(model, valid_iterator, criterion)
        print("Valid loss: {}".format(valid_loss))
        if valid_loss < min_valid_loss:
            torch.save(model.state_dict(), "best_model.pt")
            min_valid_loss = valid_loss
        scheduler.step()  # advance the StepLR schedule once per epoch

def main(fpath):
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')
    dataset = Dataset()
    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]

    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM, DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.load_state_dict(torch.load("best_model.pt"))
    model.to(device)

    with open(fpath, "r") as f:
        sentences = f.readlines()
    translate_sentence(model, sentences, dataset.SRC, dataset.TRG, device)

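# Hedged entry-point sketch (not part of the original snippet): run the translator on a
# file containing one source sentence per line; the script name and invocation are assumptions.
if __name__ == "__main__":
    import sys
    main(sys.argv[1])  # e.g. `python translate.py sentences.txt`
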
class Worktable:
    def __init__(self):
        self.dataset = Dataset()
        self.dataset.get_data()

    def plot_price(self):
        # monthly (mid-month) time axis covering 2014-2019, i.e. 72 points
        time_axis = [2014.0 + 1.0 / 24 + i * 1.0 / 12 for i in range(72)]
        for inc_id in list(self.dataset.val_inc_set):
            price_list = self.dataset.price_dict[inc_id]
            plt.plot(time_axis, price_list)
            plt.xlabel('years')
            plt.ylabel('price per share')
            plt.xlim((2014, 2020))
            plt.title(f'{inc_id}')
            plt.savefig(f'figure/price/{inc_id}.png')
            plt.clf()

def test_build_search_session(self):
    importer = ReflexiveImporter("neural_net_adam")
    dataset = Dataset(folder="../dataset")
    self.session = SearchSession(importer.model, importer.param_dist, dataset,
                                 n_iter=1, cv=3)

class Worktable:
    def __init__(self):
        self.dataset = Dataset()
        self.dataset.get_data()
        self.volatility = None

    def cal_volatility(self):
        # we use e as the base when calculating the log return
        self.volatility = {}
        for inc_id in list(self.dataset.val_inc_set):
            log_return_list = []
            price_list = self.dataset.price_dict[inc_id]
            assert len(price_list) == 72
            for t in range(len(price_list) - 1):
                log_return = math.log(price_list[t + 1]) - math.log(price_list[t])
                log_return_list.append(log_return)
            assert len(log_return_list) == 71
            return_mean = sum(log_return_list) / len(log_return_list)
            vol = 0
            for r in log_return_list:
                vol += (r - return_mean) ** 2
            vol = (vol / (len(log_return_list) - 1)) ** 0.5
            self.volatility[inc_id] = vol

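# Hedged sketch (not part of the original): the loop in cal_volatility computes the
# sample standard deviation of monthly log returns; with numpy the same quantity can be
# obtained as below, assuming `price_list` is a strictly positive price series.
import numpy as np

def sample_volatility(price_list):
    log_returns = np.diff(np.log(np.asarray(price_list, dtype=float)))
    return float(np.std(log_returns, ddof=1))  # ddof=1 matches the (n - 1) divisor above
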
def main(): """Load data, train network, visualize results.""" data_dir = 'data/' trainset = loadmat(data_dir + 'train_32x32.mat') testset = loadmat(data_dir + 'test_32x32.mat') dataset = Dataset(trainset, testset) tf.reset_default_graph() dcgan = DCGAN(dataset) losses, samples = dcgan.train() # samples, losses = dcgan.load_pickle_data() dcgan.view_samples(-1, samples) dcgan.visualize_loss(losses)
def build(self, datadir, test_only=False):
    self.logger.info("Building trainer class %s" % self.__class__.__name__)
    self.logger.info("Loading data from [%s]..." % datadir)
    self.dataset = Dataset.load_ds(datadir, test_only)
    self.logger.info(str(self.dataset))

    # build model, loss, optimizer
    self.logger.info("Constructing model with hparams:\n%s"
                     % json.dumps(self.config['Model'], indent=4))
    self._build_models()

    self.logger.info('Constructing optimizer: %s' % self.config['Trainer']['optimizer'])
    optimizer = getattr(torch.optim, self.config['Trainer']['optimizer'])
    self._opt = optimizer(self._model.parameters(), self.config['Trainer']['lr'])

    params = [(name, p.shape) for name, p in self._model.named_parameters()]
    self.logger.debug('Optimizing parameters: %s' % str(params))

def train_and_evaluate(model, epochs, batches, gpus=[], dual=False,
                       plot_history=False, plot_model=False):
    import keras, tensorflow as tf
    from keras import utils

    if len(gpus) > 0:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpus)

    config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)
    keras.backend.get_session().run(tf.global_variables_initializer())

    if plot_model:
        if dual:
            utils.plot_model(model, to_file='dual_model.png', show_shapes=True)
        else:
            utils.plot_model(model, to_file='single_model.png', show_shapes=True)

    fetcher = DataFetcher()
    current_epochs = 0
    history = None
    data_type = 'split' if dual else 'stack'

    for samples in fetcher.fetch_inf(type=data_type):
        if current_epochs >= epochs:
            break
        if dual:
            (x_train1, x_train2, y_train), (x_test1, x_test2, y_test) = samples
            history = model.fit(
                [x_train1, x_train2], y_train,
                batch_size=batches,
                epochs=EPOCHS_BATCH + current_epochs,
                initial_epoch=current_epochs,
                verbose=1,
                validation_data=([x_test1, x_test2], y_test),
            )
            model.save(DUAL_MODEL_NAME)
        else:
            (x_train, y_train), (x_test, y_test) = samples
            history = model.fit(
                x_train, y_train,
                batch_size=batches,
                epochs=EPOCHS_BATCH + current_epochs,
                initial_epoch=current_epochs,
                verbose=1,
                validation_data=(x_test, y_test),
            )
            model.save(SINGLE_MODEL_NAME)
        current_epochs += EPOCHS_BATCH

    if plot_history:
        import matplotlib.pyplot as plt

        # Plot training & validation accuracy values
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Test'], loc='upper left')
        plt.show()

        # Plot training & validation loss values
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Test'], loc='upper left')
        plt.show()

    dataset = Dataset()
    dataset.load(number=0)
    if dual:
        (x_train1, x_train2, y_train), (x_test1, x_test2, y_test) = dataset.data(type='split')
        score = model.evaluate([x_test1, x_test2], y_test, verbose=0)
        model.save(DUAL_MODEL_NAME)
    else:
        (x_train, y_train), (x_test, y_test) = dataset.data(type='stack')
        score = model.evaluate(x_test, y_test, verbose=0)
        model.save(SINGLE_MODEL_NAME)

    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

# defaults that presumably belong to an `args` Config class in the full script
BATCH = 16
START_LR = 1e-3
STOP_LR = 1e-4
DECAY_OVER = 400000

args.parse_args()

with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = DDSP(**config["model"]).to(device)

dataset = Dataset(config["preprocess"]["out_dir"])
dataloader = torch.utils.data.DataLoader(
    dataset,
    args.BATCH,
    True,
    drop_last=True,
)

mean_loudness, std_loudness = mean_std_loudness(dataloader)
config["data"]["mean_loudness"] = mean_loudness
config["data"]["std_loudness"] = std_loudness

writer = SummaryWriter(path.join(args.ROOT, args.NAME), flush_secs=20)

with open(path.join(args.ROOT, args.NAME, "config.yaml"), "w") as out_config:
    yaml.safe_dump(config, out_config)

def main(args):
    # set up logs and device
    args.save_dir = get_save_dir(args.save_dir, args.name)
    log = get_logger(args.save_dir, args.name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # set random seed
    log.info(f'Using random seed {args.seed}...')
    set_seeds(args.seed)

    # create dataset using torchtext
    log.info(f'Build data fields and {args.bert_variant} tokenizer...')
    dataset = Dataset(args.bert_variant)
    TEXT, LABEL = dataset.get_fields()

    # train:valid:test = 17500:7500:25000
    log.info('Build IMDb dataset using torchtext.datasets...')
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    train_data, valid_data = train_data.split(random_state=random.seed(args.seed))

    # iterators
    train_iterator, valid_iterator, test_iterator = dataset.get_iterators(
        train_data, valid_data, test_data, args.batch_size, device)

    # build LABEL vocabulary
    LABEL.build_vocab(train_data)

    # define model
    log.info('Building model...')
    model = BERTSentiment(args.bert_variant, args.hidden_dim, args.output_dim,
                          args.n_layers, args.bidirectional, args.dropout)

    # optimizer and criterion
    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()

    # place model and criterion on device
    model = model.to(device)
    criterion = criterion.to(device)

    # train set and validation set
    best_valid_loss = float('inf')
    for epoch in range(args.num_epochs):
        start_time = time.time()

        log.info(f'Training, epoch = {epoch}...')
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        log.info(f'Evaluating, epoch = {epoch}...')
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            log.info('Saving best model...')
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), f'{args.save_dir}/{args.model_name}')

    log.info('Model trained and evaluated...')

    # test set
    log.info('Testing...')
    model.load_state_dict(torch.load(f'{args.save_dir}/{args.model_name}'))
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

import os

from preprocess import Dataset
from global_utils import dump, JsonMetricQueueWriter
from .search_session import SearchSession
from .sklearn_args import SklearnSessionParser, SklearnSessionArgs
from reflexive_import import ReflexiveImporter

if __name__ == '__main__':
    parser = SklearnSessionParser()
    args = SklearnSessionArgs(parser)

    dataset = Dataset(args.datafile, args.dataroot)
    dataset.filter(args.labels)
    if args.balance:
        dataset.balance()
    dataset.sample(args.size)

    importer = ReflexiveImporter(
        module_name=args.model,
        var_list=["model", "parameter_distribution"],
        alias_list=["model", "param"])
    session = SearchSession(importer["model"], importer["param"], dataset,
                            args.n_iter, args.cv)
    session.report_args()

    # tune (search for) hyper-parameters
    session.fit()
    session.report_best()
    session.report_result()

    dump(session.search_results, os.path.join(args.output, "search-results.pkl"))

class BaseSessionBuilder:
    def __init__(self, args: TorchSessionArgs):
        self.args = args
        if self.args.verbose:
            print(self.args)

        self.importer = ReflexiveImporter(
            module_name=self.args.model,
            var_list=["builder_class", "model_args", "model_kwargs", "transformer"],
            package_name="pytorch_models",
        )

        self._dataset = None
        self._model = None
        self._device = None
        self._writer = None
        self._session = None

        self._set_device()
        self.static_model_kwargs = dict(
            pretrained_path=self.args.pretrained,
            device=self._device,
        )

    def _seed(self):
        if self.args.seed is not None:
            np.random.seed(self.args.seed)
            if self.args.verbose:
                print("setting numpy random seed to {}".format(self.args.seed))
        elif self.args.verbose:
            print("no random seed specified for numpy")

    def _set_dataset(self):
        if self._dataset is not None:
            return
        self._seed()
        self._dataset = Dataset(
            filename=self.args.datafile,
            folder=self.args.dataroot,
            transformer=self.importer["transformer"],
            normalize=self.args.normalize,
        )
        if self.args.verbose:
            print("dataset loaded, {} classes in total".format(self._dataset.num_classes))
            print("train_shape = {}, test_shape = {}".format(
                self._dataset.train.X.shape, self._dataset.test.X.shape))

        self._dataset.filter(labels=self.args.labels)
        if self.args.balance:
            self._dataset.balance()
        self._dataset.sample(train_size=self.args.size, test_size=self.args.size)
        if self.args.verbose:
            print("dataset downsampled, {} classes in total".format(self._dataset.num_classes))
            print("train_shape = {}, test_shape = {}".format(
                self._dataset.train.X.shape, self._dataset.test.X.shape))

    def _set_model(self):
        if self._model is not None:
            return
        self._set_dataset()
        builder_class = self.importer["builder_class"]  # type: callable
        model_args = self.importer["model_args"]  # type: tuple
        model_kwargs = self.importer["model_kwargs"]  # type: dict
        model_kwargs.update(self.static_model_kwargs)
        model_kwargs.update(dict(num_classes=self._dataset.num_classes))
        model_builder = builder_class(*model_args, **model_kwargs)
        self._model = model_builder()
        if self.args.verbose:
            print("using model", self._model)

    def _set_device(self):
        if self._device is not None:
            return
        self._device = torch.device(
            "cuda" if self.args.cuda or torch.cuda.is_available() else "cpu")
        if self.args.verbose:
            print("using device: {}".format(self._device))

    def _set_writer(self):
        if self._writer is not None:
            return
        self._writer = SummaryWriter(log_dir=self.args.logdir)
        if self.args.verbose:
            print("logging summaries at", self._writer.log_dir)

    def _set_session(self):
        if self._session is not None:
            return
        self._set_dataset()
        self._set_model()
        self._set_device()
        self._set_writer()

    @property
    def dataset(self):
        self._set_dataset()
        return self._dataset

    @property
    def model(self):
        self._set_model()
        return self._model

    @property
    def device(self):
        self._set_device()
        return self._device

    @property
    def writer(self):
        self._set_writer()
        return self._writer

    @property
    def session(self):
        self._set_session()
        return self._session

    def __call__(self, *args, **kwargs):
        return self.session

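# Hedged usage sketch (not part of the original): exercising the builder's lazy properties.
# `TorchSessionParser` is an assumed companion of TorchSessionArgs, not confirmed by the source.
if __name__ == "__main__":
    args = TorchSessionArgs(TorchSessionParser())
    builder = BaseSessionBuilder(args)
    print(builder.device)    # device is resolved eagerly in __init__
    _ = builder.dataset      # dataset loading (and seeding) happen lazily on first access
    _ = builder.model        # model construction pulls in the dataset first
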
class RDataset:
    def __init__(self):
        self.val_inc_set = parameters.full_val_inc_set
        self.net_income_dict = None
        self.count_dict = None
        self.val_inc_count = None
        self.dataset = Dataset()
        self.initial_price = None
        self.return_ratio_dict = None
        self.full_inc_set = dict()
        self.full_count_dict = dict()

    def get_data(self):
        self.dataset.get_data()
        self.initial_price = self.dataset.initial_price
        self.net_income_dict = {}
        self.count_dict = {}
        for y in range(2014, 2020):
            for q in [2, 4]:
                # with open(f"financial_report/U_{y}Q{q}.csv", encoding='big5-hkscs') as f:
                with open(f"financial_report/U_{y}Q{q}.csv") as f:
                    lines = f.readlines()
                err = 0
                # for i in range(38, len(lines)):
                for i in range(len(lines)):
                    line = lines[i].strip()
                    char_list = line.split(',')
                    if char_list[1] and char_list[1][0] == '(':
                        char_list[1] = char_list[1][1:-1]
                    try:
                        inc_id = int(char_list[0])
                        net_income = float(char_list[1])
                        # use the full dicts to check whether a company has complete data
                        self.full_inc_set[inc_id] = self.full_inc_set.get(inc_id, [])
                        self.full_inc_set[inc_id].append(net_income)
                        self.full_count_dict[inc_id] = self.full_count_dict.get(inc_id, 0) + 1
                        if inc_id in self.val_inc_set:
                            self.net_income_dict[inc_id] = self.net_income_dict.get(inc_id, [])
                            self.net_income_dict[inc_id].append(net_income)
                            self.count_dict[inc_id] = self.count_dict.get(inc_id, 0) + 1
                    except:
                        err += 1
        '''
        # print the counts
        print('full_count_dict:', self.full_count_dict)
        count_list = [0 for i in range(13)]
        for v in self.full_count_dict.values():
            count_list[v] += 1
        print('count_list:', count_list)
        new_parameters = set()
        for inc, v in self.full_count_dict.items():
            if v >= 10 and inc >= 1000:
                print(v)
                new_parameters.add(inc)
        print('new_parameters:', new_parameters)
        print(done)
        '''
        self.val_inc_count = 0
        for inc, c in self.count_dict.items():
            if c == 12:
                self.val_inc_count += 1

        # compute income_sum_dict
        self.income_sum_dict = {}
        for inc_id in list(self.val_inc_set):
            print(len(self.net_income_dict[inc_id]))
            assert len(self.net_income_dict[inc_id]) >= 10
            self.income_sum_dict[inc_id] = sum(self.net_income_dict[inc_id])

        print('net_income_dict:', self.net_income_dict)
        print('num of inc:', len(self.net_income_dict.keys()))
        print('count_dict:', self.count_dict)
        print('val_inc_count:', self.val_inc_count)
        print('val_inc_set:', self.val_inc_set)
        print('initial_price:', self.initial_price)
        print('income_sum_dict:', self.income_sum_dict)

    def cal_return_ratio(self):
        self.return_ratio_dict = {}
        for inc_id in list(self.val_inc_set):
            assert len(self.net_income_dict[inc_id]) >= 10
            income_sum = sum(self.net_income_dict[inc_id]) / len(self.net_income_dict[inc_id])
            initial_price = self.initial_price[inc_id]
            return_ratio = income_sum / initial_price
            self.return_ratio_dict[inc_id] = return_ratio
        print('return_ratio_dict:', self.return_ratio_dict)
        print('max_return', max(self.return_ratio_dict.values()))

    def cal_volatility(self):
        self.dataset.cal_volatility()
        print('volatility:', self.dataset.volatility)

    def plot_scatter(self):
        x = []
        y = []
        for inc in list(self.val_inc_set):
            volatility = self.dataset.volatility[inc]
            return_ratio = self.return_ratio_dict[inc]
            x.append(volatility)
            y.append(return_ratio)
        plt.scatter(x, y)
        plt.xlabel('volatility')
        plt.ylabel('mean P2E ratio')
        # Chinese labels?
        plt.title('mean P2E ratio vs. volatility')
        plt.savefig('figure/scatter/new_scatter.png')
        plt.clf()

    def get_corrcoef(self):
        x = []
        y = []
        for inc in list(self.val_inc_set):
            volatility = self.dataset.volatility[inc]
            return_ratio = self.return_ratio_dict[inc]
            if 0.05 <= volatility <= 0.15 and 0 <= return_ratio <= 2:  # remove outliers
                x.append(volatility)
                y.append(return_ratio)
        x = np.array(x)
        y = np.array(y)
        self.corrcoef = np.corrcoef(x, y)[0][1]
        print('correlation coefficient:', self.corrcoef)

    def get_index_data(self):
        self.debt_ratio_dict = dict()
        self.d2n_ratio_dict = dict()
        self.report_score_dict = dict()
        self.cash_ratio_dict = dict()
        self.quick_ratio_dict = dict()
        self.current_ratio_dict = dict()
        self.ipm_dict = dict()
        self.cash_flow_ratio_dict = dict()
        self.dict_list = [
            None, self.debt_ratio_dict, self.d2n_ratio_dict, self.report_score_dict,
            self.cash_ratio_dict, self.quick_ratio_dict, self.current_ratio_dict,
            self.ipm_dict, self.cash_flow_ratio_dict, None
        ]
        with open('index/new_mean.csv', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines[1:]:
            char_list = line.strip().split(',')
            inc_id = int(char_list[0])
            for index in range(1, 10):
                try:
                    self.dict_list[index][inc_id] = float(char_list[index])
                except:
                    continue

    def plot_index_scatter_and_get_corrcoef(self):
        self.xlabel_list = [
            None, 'debt ratio', 'debt-to-net worth ratio', 'financial report score',
            'cash ratio', 'quick ratio', 'current ratio', 'ipm', 'cash_flow_ratio',
            'stability'
        ]
        for index in range(1, 10):
            x_list = []
            y_list = []
            x_dict = self.dict_list[index]
            if x_dict is None:
                continue
            y_dict = self.return_ratio_dict
            for inc in list(self.val_inc_set):
                if inc not in x_dict or inc not in y_dict:
                    print(f'incomplete data :{inc}')
                else:
                    if -1000 <= x_dict[inc] <= 1000:
                        x_list.append(x_dict[inc])
                        y_list.append(y_dict[inc])

            xlabel = self.xlabel_list[index]
            plt.scatter(x_list, y_list)
            plt.xlabel(xlabel)
            plt.ylabel('mean P2E ratio')
            plt.title(f'mean P2E ratio vs. {xlabel}')
            plt.savefig(f'figure/scatter/{xlabel}.png')
            plt.clf()

            # get corrcoef
            x_array = np.array(x_list)
            y_array = np.array(y_list)
            corrcoef = np.corrcoef(x_array, y_array)[0][1]
            with open(f'figure/corrcoef/{xlabel}.txt', 'w+') as f:
                f.write(f'corrcoef: {corrcoef}\n')

class TestDataset(unittest.TestCase):
    def setUp(self):
        self.dataset = Dataset(folder="../dataset")
        self.n_train = len(self.dataset.train)
        self.n_test = len(self.dataset.test)

    def test_sample_size(self):
        self.assertEqual(self.dataset.train.X.shape[0], self.dataset.train.y.shape[0])
        self.assertEqual(self.dataset.test.X.shape[0], self.dataset.test.y.shape[0])

    def test_dimension_size(self):
        self.assertEqual(self.dataset.train.X.shape[1], self.dataset.test.X.shape[1])
        self.assertEqual(len(self.dataset.train.y.shape), 1)
        self.assertEqual(len(self.dataset.test.y.shape), 1)

    def test_type(self):
        self.assertIsInstance(self.dataset.mapping, dict)
        self.assertIsInstance(self.dataset.train.X, np.ndarray)
        self.assertIsInstance(self.dataset.train.y, np.ndarray)
        self.assertIsInstance(self.dataset.test.X, np.ndarray)
        self.assertIsInstance(self.dataset.test.y, np.ndarray)

    def test_sample_train(self):
        self.dataset.sample_train(0.5)
        self.assertAlmostEqual(len(self.dataset.train), self.n_train * 0.5, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_test(0.3)
        self.assertAlmostEqual(len(self.dataset.test), self.n_test * 0.3, delta=1)
        self.dataset.reset_test()

        self.dataset.sample_train(0.2).sample_train(0.5)
        self.assertAlmostEqual(len(self.dataset.train), self.n_train * 0.2 * 0.5, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_test(0.9).sample_test(0.9)
        self.assertAlmostEqual(len(self.dataset.test), self.n_test * 0.9 * 0.9, delta=1)
        self.dataset.reset_test()

        self.dataset.sample_test(3.0)
        self.assertAlmostEqual(len(self.dataset.test), self.n_test, delta=1)
        self.dataset.reset_test()

        self.dataset.sample_train(1000)
        self.assertAlmostEqual(len(self.dataset.train), 1000, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_train(10000000000)
        self.assertAlmostEqual(len(self.dataset.train), self.n_train, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_test(3534)
        self.assertAlmostEqual(len(self.dataset.test), 3534, delta=1)
        self.dataset.reset_test()

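# Minimal runner sketch (not part of the original) so the test case above can be executed
# directly, e.g. `python test_dataset.py`; the file name is an assumption.
if __name__ == "__main__":
    unittest.main()
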