# Apply the same shuffle permutation to the labels as was applied to the inputs.
y_shuffled = y[shuffle_indices]

# Split train/dev sets: hold out the last 10% of the shuffled data.
# max(1, ...) guards tiny datasets — with n_dev_samples == 0 the slices
# below would produce an EMPTY train set (a[:-0] == a[:0]).
# TODO: replace this single holdout split with a proper cross-validation procedure.
n_dev_samples = max(1, int(0.1 * len(y)))
x_train, x_dev = x_shuffled[:-n_dev_samples], x_shuffled[-n_dev_samples:]
y_train, y_dev = y_shuffled[:-n_dev_samples], y_shuffled[-n_dev_samples:]
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

# Tokenize both splits unconditionally. The original code only built
# x_train_tokens inside the vocab-cache-miss branch, so any run that found a
# cached vocab raised NameError at vocab.transform() below. It also passed
# raw (untokenized) x_dev to transform() while train data was tokenized.
x_train_tokens = [tokenize(sample) for sample in tqdm(x_train)]
x_dev_tokens = [tokenize(sample) for sample in tqdm(x_dev)]

# Load the vocabulary from cache when available; otherwise fit it on the
# training tokens only (dev split never influences the vocab) and cache it.
vocab_file = '{}/vocab.pkl'.format(runs_dir)
if os.path.exists(vocab_file):
    with open(vocab_file, 'rb') as fi:
        vocab = cPickle.load(fi)
else:
    vocab = Vocabulary(min_freq=5)
    vocab.fit(x_train_tokens)
    with open(vocab_file, 'wb') as fo:
        cPickle.dump(vocab, fo)

# Persist the integer-encoded splits to HDF5.
# NOTE(review): `f` is left open on purpose — the dataset handles created
# below may be used later in this file; confirm it is closed downstream.
f = h5py.File(dataset_file, 'w')
x_train_dataset = f.create_dataset('x_train', shape=(len(x_train), FLAGS.length), dtype=np.int32)
y_train_dataset = f.create_dataset('y_train', shape=y_train.shape, dtype=np.int32)
x_dev_dataset = f.create_dataset('x_dev', shape=(len(x_dev), FLAGS.length), dtype=np.int32)
y_dev_dataset = f.create_dataset('y_dev', shape=y_dev.shape, dtype=np.int32)
y_train_dataset[:] = y_train
y_dev_dataset[:] = y_dev

# Fix every sequence to FLAGS.length before integer-encoding, so the encoded
# arrays match the dataset shapes declared above.
vocab.max_sequence_length = FLAGS.length
x_train_dataset[:] = vocab.transform(x_train_tokens).astype(np.int32)
# Fix: encode the TOKENIZED dev samples (the original passed raw x_dev).
x_dev_dataset[:] = vocab.transform(x_dev_tokens).astype(np.int32)

# Read the encoded arrays back into memory for downstream use.
x_train = x_train_dataset[:]
x_dev = x_dev_dataset[:]