# コード例 #1 (Code example #1)
         y_shuffled = y[shuffle_indices]
         # Split train/test set
         n_dev_samples = int(0.1 * len(y))
         # TODO: Create a f****n' correct cross validation procedure
         x_train, x_dev = x_shuffled[:-n_dev_samples], x_shuffled[-n_dev_samples:]
         y_train, y_dev = y_shuffled[:-n_dev_samples], y_shuffled[-n_dev_samples:]
 print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
 
 vocab_file = '{}/vocab.pkl'.format(runs_dir)
 if os.path.exists(vocab_file):
     with open(vocab_file, 'rb') as fi:
         vocab = cPickle.load(fi)
 else:
     x_train_tokens = [ tokenize(sample) for sample in tqdm(x_train) ]
     vocab = Vocabulary(min_freq=5)
     vocab.fit(x_train_tokens)
     with open(vocab_file, 'wb') as fo:
         cPickle.dump(vocab, fo)
     
     f = h5py.File(dataset_file, 'w')
     x_train_dataset = f.create_dataset('x_train', shape=(len(x_train), FLAGS.length), dtype=np.int32)
     y_train_dataset = f.create_dataset('y_train', shape=y_train.shape, dtype=np.int32)
     x_dev_dataset = f.create_dataset('x_dev', shape=(len(x_dev), FLAGS.length), dtype=np.int32)
     y_dev_dataset = f.create_dataset('y_dev', shape=y_dev.shape, dtype=np.int32)
     y_train_dataset[:] = y_train
     y_dev_dataset[:] = y_dev
     vocab.max_sequence_length = FLAGS.length
     x_train_dataset[:] = vocab.transform(x_train_tokens).astype(np.int32)
     x_dev_dataset[:] = vocab.transform(x_dev).astype(np.int32)
     x_train = x_train_dataset[:]
     x_dev = x_dev_dataset[:]