def _compute_metrics(cls, editor, examples, num_evaluate_examples, noiser, batch_size=256, edit_dropout=False,
                     draw_samples=False):
    """Compute the weighted-average loss and the mean BLEU of an editor on a sample.

    Args:
        cls: not used in the body.
        editor: object exposing ``loss(batch, draw_samples)`` and ``edit(examples)``.
        examples: pool of examples the evaluation sample is drawn from.
        num_evaluate_examples: size argument forwarded to ``sample_if_large``.
        noiser: callable applied to the sample when ``edit_dropout`` is true.
        batch_size: chunk size used when computing the loss.
        edit_dropout: if true, evaluate on ``noiser(sample)`` rather than the raw sample.
        draw_samples: forwarded unchanged to ``editor.loss``.

    Returns:
        tuple: ``(loss, avg_bleu, edit_traces)``, where ``loss`` is the per-batch
        loss averaged with batch sizes as weights, ``avg_bleu`` is the mean BLEU
        over the sample, and ``edit_traces`` is the second value returned by
        ``editor.edit``.
    """
    # Sampling runs under random_seed(0).
    with random_seed(0):
        sample = sample_if_large(examples, num_evaluate_examples, replace=False)

    noised_sample = noiser(sample) if edit_dropout else sample

    # The sample is split into batches, in case it is too large to fit in GPU memory.
    batch_losses = []
    batch_sizes = []
    for batch in chunks(noised_sample, batch_size):
        batch_sizes.append(len(batch))
        loss_var, _, _ = editor.loss(batch, draw_samples)
        batch_losses.append(loss_var.data[0])

    batch_losses = np.array(batch_losses)
    batch_sizes = np.array(batch_sizes)
    # Weight each batch loss by its size so that a short final batch does not skew the average.
    loss = np.sum(batch_losses * batch_sizes) / np.sum(batch_sizes)

    # outputs is a list(over batches)[ list(over beams) [ list(over tokens) [ unicode ] ] ] object;
    # only the first beam entry of each output is scored.
    outputs, edit_traces = editor.edit(noised_sample)
    bleus = [bleu(ex.target_words, output[0]) for ex, output in izip(noised_sample, outputs)]
    avg_bleu = np.mean(bleus)

    return loss, avg_bleu, edit_traces
def __init__(self, filenames, filename_to_examples, relative_path=True, shuffle=True):
    """Construct the dataset based on the data in the files.

    Args:
        filenames (unicode or list[unicode]): names of the files
        filename_to_examples: a callable that takes a filename and yields Examples
        relative_path: whether to resolve the filename on DataDirectory.root
        shuffle: whether to shuffle the loaded examples (done under random_seed(42))
    """
    # Accept a single name as shorthand for a one-element list.
    if isinstance(filenames, basestring):
        filenames = [filenames]

    self._examples = []
    for name in filenames:
        path = os.path.join(DataDirectory.root, name) if relative_path else name
        self._examples.extend(filename_to_examples(path))

    if shuffle:
        with random_seed(42):
            random.shuffle(self._examples)

    shuffle_label = 'shuffled' if shuffle else 'not shuffled'
    # The unresolved names are logged, not the joined paths.
    logging.info('Read {} examples ({}) from {}'.format(len(self._examples), shuffle_label, filenames))
def emulate_distribution(shape, target_samples, seed=None):
    """Draw normal samples matching the mean and standard deviation of ``target_samples``.

    Args:
        shape: output shape, passed as ``size`` to ``np.random.normal``.
        target_samples: values whose mean and standard deviation are imitated.
        seed: value handed to ``random_seed`` around the draw.

    Returns:
        Array of the requested shape, cast to ``np.float32``.
    """
    target_mean = np.mean(target_samples)
    target_std = np.std(target_samples)
    with random_seed(seed):
        drawn = np.random.normal(loc=target_mean, scale=target_std, size=shape)
    return drawn.astype(np.float32)
def emulate_distribution(shape, target_samples, seed=None):
    """Draw normal samples matching the mean and standard deviation of ``target_samples``.

    Args:
        shape: output shape, passed as ``size`` to ``np.random.normal``.
        target_samples: values whose mean and standard deviation are imitated.
        seed: value handed to ``random_seed`` around the draw.

    Returns:
        The drawn samples, exactly as returned by ``np.random.normal`` (no dtype cast).
    """
    loc, scale = np.mean(target_samples), np.std(target_samples)
    with random_seed(seed):
        return np.random.normal(loc, scale, size=shape)
def _initialize_train_state(cls, config):
    """Set up all the state necessary to begin training.

    Args:
        cls: class providing ``_build_editor``.
        config: configuration object; reads ``seed``, ``editor``, ``num_iter``,
            ``eps``, ``momentum`` and ``optim.learning_rate``.

    Returns:
        TrainState: holds the new editor, its Adam optimizer, a fresh
        ``RandomState()``, and zeroed ``train_steps`` / ``max_grad_norm``.
    """
    # The editor and optimizer are built under random_seed(config.seed).
    with random_seed(config.seed):
        editor = cls._build_editor(config.editor, config.num_iter, config.eps, config.momentum)
        optimizer = optim.Adam(editor.parameters(), lr=config.optim.learning_rate)

    # NOTE(review): RandomState() is created outside the seeded block here — confirm
    # this matches the intended scope of the `with` statement.
    return TrainState(
        editor=editor,
        optimizer=optimizer,
        train_steps=0,
        random_state=RandomState(),
        max_grad_norm=0,
    )
def evaluate(self, step):
    """Evaluate on samples of the train and validation examples and log the results.

    Each sample is drawn under ``random_seed(0)`` with the size argument
    ``self.config.num_evaluate_examples``. Both evaluations are then summarized
    to ``self.workspace.codalab`` and to ``self.tb_logger``.

    Args:
        step: current training step, passed to the evaluation and to every summary call.
    """
    # A single parenthesised argument prints identically under the Python 2 print statement.
    print('Evaluate at step {}'.format(step))

    sample_size = self.config.num_evaluate_examples

    def draw(pool):
        # Each draw gets its own seeded context, so train and valid sampling are independent.
        with random_seed(0):
            return sample_if_large(pool, sample_size, replace=False)

    train_sample = draw(self.train_examples)
    valid_sample = draw(self.valid_examples)

    train_eval = self.evaluate_on_examples(step, train_sample, self.train_visualizer)
    valid_eval = self.evaluate_on_examples(step, valid_sample, self.valid_visualizer)

    # Log to TensorBoard (and the JSON summary), train first, then valid.
    for result in (train_eval, valid_eval):
        result.json_summarize(self.workspace.codalab, step)
        result.tboard_summarize(self.tb_logger, step)
def evaluate_helper(examples, prefix):
    """Evaluate on a sample of ``examples`` and write the results under ``prefix``.

    Reads ``self``, ``step``, ``num_samples``, ``silent_visualizer`` and
    ``full_eval_path`` from the enclosing scope.

    Args:
        examples: pool of examples the evaluation sample is drawn from.
        prefix: label forwarded to every summary call.
    """
    with random_seed(0):
        sample = sample_if_large(examples, num_samples, replace=False)

    evaluation = self.evaluate_on_examples(step=step, examples=sample, visualizer=silent_visualizer)

    # Wrap the 'denoAcc*' stats with BernoulliSequenceStat, for confidence intervals.
    for name, stat in evaluation.stats.items():
        if not name.startswith('denoAcc'):
            continue
        evaluation.stats[name] = BernoulliSequenceStat(stat)

    # Append, so results from earlier calls stay in the file.
    with open(full_eval_path, 'a') as out_file:
        evaluation.summarize(out_file, prefix=prefix)

    evaluation.tboard_summarize(self.tb_logger, step, prefix=prefix)
    evaluation.json_summarize(self.workspace.codalab, step, prefix=prefix)
def __init__(self, filenames, filename_to_examples, relative_path=True, shuffle=True):
    """Construct the dataset based on the data in the files.

    Args:
        filenames (unicode or list[unicode]): names of the files
        filename_to_examples: a callable that takes a filename and yields Examples
        relative_path: whether to resolve the filename on DataDirectory.root
        shuffle: whether to shuffle the loaded examples (done under random_seed(42))
    """
    self._examples = []

    # A bare string is treated as a single filename.
    if isinstance(filenames, basestring):
        filenames = [filenames]

    for raw_name in filenames:
        if relative_path:
            resolved = os.path.join(DataDirectory.root, raw_name)
        else:
            resolved = raw_name
        self._examples.extend(filename_to_examples(resolved))

    if shuffle:
        with random_seed(42):
            random.shuffle(self._examples)

    # The message reports the names as given, not the resolved paths.
    message = 'Read {} examples ({}) from {}'.format(
        len(self._examples),
        'shuffled' if shuffle else 'not shuffled',
        filenames)
    logging.info(message)
def case_sample(examples):
    """Get a random sample of supervised ParseCases.

    The examples are sampled under ``random_seed(0)`` with a size argument of 30,
    then converted with ``examples_to_supervised_cases``.

    Args:
        examples: pool of examples to sample from.

    Returns:
        list: everything yielded by ``examples_to_supervised_cases`` for the sample.
    """
    with random_seed(0):
        chosen = sample_if_large(examples, 30)
    cases = examples_to_supervised_cases(chosen)
    return list(cases)