class RecursiveNNSupervisedEncoder(AbstractEncoder):
    def __init__(self, training_filename: str, hyperparameters: dict, combination_type='eqnet'):
        self.__hyperparameters = hyperparameters
        self.__dataset_extractor = TreeDatasetExtractor(training_filename)
        self.__rng = RandomStreams()

        self.__rnn = RNN(self.__hyperparameters['memory_size'], self.__hyperparameters, self.__rng,
                         self.__dataset_extractor, combination_type=combination_type)
        check_hyperparameters(self.REQUIRED_HYPERPARAMETERS | self.__rnn.required_hyperparameters,
                              self.__hyperparameters)

        target_embeddings = np.random.randn(self.__hyperparameters['memory_size'],
                                            self.__dataset_extractor.num_equivalent_classes) * \
                            10 ** self.__hyperparameters["log_init_scale_embedding"]
        self.__target_embeddings = theano.shared(target_embeddings.astype(theano.config.floatX),
                                                 name="target_embeddings")
        self.__target_embeddings_dropout = dropout(self.__hyperparameters['dropout_rate'], self.__rng,
                                                   self.__target_embeddings, True)
        self.__target_bias = np.log(self.__dataset_extractor.training_empirical_distribution)

        self.__trainable_params = list(self.__rnn.get_params().values()) + [self.__target_embeddings]

        self.__compiled_methods = None
        self.__trained_parameters = None

    REQUIRED_HYPERPARAMETERS = {'log_learning_rate', 'rmsprop_rho', 'momentum', 'minibatch_size', 'grad_clip',
                                'memory_size', 'log_init_scale_embedding', 'dropout_rate',
                                'curriculum_initial_size', 'curriculum_step', 'accuracy_margin'}

    @property
    def rnn(self):
        return self.__rnn

    @property
    def rng(self):
        return self.__rng

    @property
    def hyperparameters(self):
        return self.__hyperparameters

    @property
    def dataset_extractor(self):
        return self.__dataset_extractor

    @property
    def trained_parameters(self):
        params = {}
        param_names = list(self.__rnn.get_params()) + ["target_embeddings"]
        for param, value in zip(param_names, self.__trained_parameters):
            params[param] = value
        return params

    def __get_loss(self, use_dropout: bool, iteration_number=0):
        _, all_node_encodings, additional_objective = self.__rnn.get_encoding(use_dropout, iteration_number)
        target_embeddings = self.__target_embeddings_dropout if use_dropout else self.__target_embeddings
        s = T.dot(all_node_encodings, target_embeddings) + self.__target_bias
        logprobs = log_softmax(s)
        eq_symbol = self.__rnn.get_input_variables().eq_symbol
        targets = T.extra_ops.to_one_hot(eq_symbol.dimshuffle('x'),
                                         self.__dataset_extractor.num_equivalent_classes)
        correct = logprobs[-1, eq_symbol]
        rest = T.max(T.flatten(logprobs[-1, (1 - targets).nonzero()]))
        # Margin objective: penalize when the best incorrect class comes within
        # `accuracy_margin` of the correct class log-probability.
        ll = -T.nnet.relu(rest - correct + self.__hyperparameters['accuracy_margin'])
        return logprobs[-1], ll + additional_objective

    def __compile_train_functions(self):
        iteration_number = T.iscalar(name="iteration_number")
        _, ll = self.__get_loss(True, iteration_number)
        grad = T.grad(ll, self.__trainable_params, add_names=True)
        # Shared accumulators: one per trainable parameter, plus a sample counter.
        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX))
                    for param in self.__trainable_params] + [theano.shared(0, name="sample_count")]

        # grad_accumulate adds one sample's gradient into the accumulators;
        # grad_step applies one update using the averaged gradients and resets them.
        inputs = list(self.__rnn.get_input_variables()) + [iteration_number]
        self.__compiled_methods.grad_accumulate = theano.function(
            inputs=inputs,
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [(grad_acc[-1], grad_acc[-1] + 1)],
            outputs=T.mean(ll))

        normalized_grads = [T.switch(grad_acc[-1] > 0, g / grad_acc[-1].astype(theano.config.floatX), g)
                            for g in grad_acc[:-1]]
        step_updates, ratios = nesterov_rmsprop_multiple(
            self.__trainable_params, normalized_grads,
            learning_rate=10 ** self.__hyperparameters["log_learning_rate"],
            rho=self.__hyperparameters["rmsprop_rho"],
            momentum=self.__hyperparameters["momentum"],
            grad_clip=self.__hyperparameters["grad_clip"],
            output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape).astype(theano.config.floatX))
                             for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))

        self.__compiled_methods.grad_step = theano.function(inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        logprobs, ll = self.__get_loss(False)
        inputs = list(self.__rnn.get_input_variables())
        self.__compiled_methods.ll_and_logprobs = theano.function(
            inputs=inputs, outputs=[T.mean(ll), logprobs])
        self.__compiled_methods.encode = theano.function(
            inputs=self.__rnn.get_input_variables()[:-1],
            outputs=self.__rnn.get_encoding(False)[0])

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            if self.__trained_parameters is not None:
                self.set_parameter_values(self.__trained_parameters)
            self.__compiled_methods = Bunch()
            self.__compile_test_functions()
            self.__compile_train_functions()
            print("Compilation Finished...")

    def set_parameter_values(self, parameter_values: list):
        for param, value in zip(self.__trainable_params, parameter_values):
            param.set_value(value)

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters['memory_size']

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tree = self.__dataset_extractor.convert_tree_to_array(data[1], ignore_eq_symbols=True)[:-1]
        return self.__compiled_methods.encode(*converted_tree)

    def prediction_accuracy(self, dataset_file):
        self.__compile_if_needed()
        data = import_data(dataset_file)
        dataset = list(self.__dataset_extractor.get_dataset_for_encoder(data, return_num_tokens=True))
        correct = 0
        for tree in dataset:
            all_args = list(tree[0])
            ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
            if np.argmax(logprobs) == all_args[-1]:
                correct += 1
        return correct / len(dataset)

    def train(self, training_file, validation_file, max_iter=5000, patience=50, validation_check_limit=2,
              additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()
        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.__dataset_extractor.get_dataset_for_encoder(training_data,
                                                                             return_num_tokens=True))
        validation_set = list(self.__dataset_extractor.get_dataset_for_encoder(import_data(validation_file),
                                                                               return_num_tokens=True))
        print("Num classes: %s" % self.__dataset_extractor.num_equivalent_classes)

        def compute_validation_score() -> float:
            print("Train Accuracy %s" % compute_score(training_set, False, True)[1])
            return compute_score(validation_set)

        def compute_score(dataset, print_score=True, return_accuracy=False) -> float:
            # Get all encodings and score them against the ground-truth equivalence class.
            sum_ll = 0.
            correct = 0
            for tree in dataset:
                all_args = list(tree[0])
                ll, logprobs = self.__compiled_methods.ll_and_logprobs(*all_args)
                sum_ll += ll
                if np.argmax(logprobs) == all_args[-1]:
                    correct += 1
            if print_score:
                print("Accuracy: %s, LL: %s" % (correct / len(dataset) * 100, sum_ll / len(dataset)))
            if return_accuracy:
                return sum_ll / len(dataset), (correct / len(dataset) * 100)
            return correct / len(dataset) * 100

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            ratios = np.zeros(len(self.__trainable_params))
            epochs_not_improved = 0
            historic_data = defaultdict(list)
            # Clump minibatches and disallow minibatches that are smaller than their given size,
            # since they may cause instability.
            current_max_size = self.__hyperparameters['curriculum_initial_size']
            curriculum_step = self.__hyperparameters['curriculum_step']
            for i in range(max_iter):
                # Curriculum: only samples up to the current size limit participate this epoch.
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # Visit the selected samples in random order.
                n_batches = 0
                sum_train_loss = 0
                num_elements = 0
                num_minibatches = max(1, min(int(np.floor(float(len(sample_ordering)) / minibatch_size)), 10))
                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        args = list(training_set[current_idx][0]) + [i]
                        loss = self.__compiled_methods.grad_accumulate(*args)
                        sum_train_loss += loss
                        num_elements += 1
                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    historic_data['validation_score'].append(current_score)
                    if current_score > best_score:
                        best_score = current_score
                        self.__trained_parameters = [p.get_value() for p in self.__trainable_params]
                        print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" % (i, current_score))
                        epochs_not_improved += 1
                    for k in range(len(self.__trainable_params)):
                        print("%s: %.0e" % (self.__trainable_params[k].name, ratios[k] / n_batches))
                    print("Train ll: %s" % (sum_train_loss / num_elements))
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run(historic_data)
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")
        return best_score, historic_data
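# Usage sketch (not part of the original source): a minimal, hypothetical way to train a
# RecursiveNNSupervisedEncoder and encode an expression. File paths and hyperparameter values
# below are placeholders; the full hyperparameter set is dictated by REQUIRED_HYPERPARAMETERS
# plus RNN.required_hyperparameters for the chosen combination_type.
#
# hyperparameters = {
#     'log_learning_rate': -3, 'rmsprop_rho': 0.9, 'momentum': 0.9, 'minibatch_size': 100,
#     'grad_clip': 1, 'memory_size': 64, 'log_init_scale_embedding': -1, 'dropout_rate': 0.1,
#     'curriculum_initial_size': 6.0, 'curriculum_step': 0.25, 'accuracy_margin': 0.5,
#     # ...plus whatever the RNN for this combination_type requires.
# }
# encoder = RecursiveNNSupervisedEncoder('train.json.gz', hyperparameters, combination_type='eqnet')
# best_score, history = encoder.train('train.json.gz', 'validation.json.gz')
# encoder.save('supervised-encoder.pkl')
# vector = encoder.get_encoding(expression_data)  # np.array of length memory_size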
class SequenceGruSiameseEncoder(AbstractEncoder):
    """
    Train a sequence-GRU siamese encoder.
    """

    def __init__(self, training_file, hyperparameters, encoder_type='gru', use_centroid=False):
        """
        :param training_file: path to the training data.
        :type hyperparameters: dict
        """
        self.__hyperparameters = hyperparameters
        self.dataset_extractor = TokenAutoencoderDatasetExtractor(training_file)
        empirical_distribution = get_empirical_distribution(
            self.dataset_extractor.feature_map,
            chain(*self.dataset_extractor.get_nonnoisy_samples(import_data(training_file))))
        self.__encoder = SequenceGruSiameseEncoderModel(self.__hyperparameters["embedding_size"],
                                                        len(self.dataset_extractor.feature_map),
                                                        empirical_distribution,
                                                        self.__hyperparameters["representation_size"],
                                                        self.__hyperparameters, encoder_type=encoder_type,
                                                        use_centroid=use_centroid)
        self.__trained_parameters = None
        self.__compiled_methods = None

    REQUIRED_HYPERPARAMETERS = {'log_learning_rate', 'rmsprop_rho', 'momentum', 'grad_clip', 'minibatch_size',
                                'embedding_size', 'representation_size', 'log_init_noise', 'dropout_rate'}

    def __get_siamese_loss(self, use_dropout, scale_similar=1, scale_dissimilar=1):
        encoder_copy = self.__encoder.copy_full(name="siameseEncoder")
        encoding_1 = self.__encoder.get_encoding()
        encoding_2 = encoder_copy.get_encoding()
        representation_distance = (encoding_1 - encoding_2).norm(2)
        # Similar pairs are penalized by their squared distance; dissimilar pairs by how far they
        # fall inside the margin. Both terms are negated.
        similar_loss = -scale_similar * T.pow(representation_distance, 2)
        margin = self.__hyperparameters['dissimilar_margin']
        dissimilar_loss = -scale_dissimilar * T.pow(T.nnet.relu(margin - representation_distance), 2)
        return dissimilar_loss, similar_loss, encoder_copy, encoding_1, encoding_2

    def __compile_train_functions(self):
        dissimilar_loss, similar_loss, encoder_copy, repr1, repr2 = self.__get_siamese_loss(True)
        wrt_vars = list(self.__encoder.parameters.values())
        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX))
                    for param in wrt_vars] + [theano.shared(0, name="sample_count")]

        grad = T.grad(similar_loss, wrt_vars)
        self.__compiled_methods.grad_siamese_similar = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [(grad_acc[-1], grad_acc[-1] + 1)],
            outputs=[similar_loss, repr1, repr2])

        grad = T.grad(dissimilar_loss, wrt_vars)
        self.__compiled_methods.grad_siamese_dissimilar = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [(grad_acc[-1], grad_acc[-1] + 1)],
            outputs=[dissimilar_loss, repr1, repr2])

        normalized_grads = [T.switch(grad_acc[-1] > 0, g / grad_acc[-1].astype(theano.config.floatX), g)
                            for g in grad_acc[:-1]]
        step_updates, ratios = nesterov_rmsprop_multiple(
            wrt_vars, normalized_grads,
            learning_rate=10 ** self.__hyperparameters["log_learning_rate"],
            rho=self.__hyperparameters["rmsprop_rho"],
            momentum=self.__hyperparameters["momentum"],
            grad_clip=self.__hyperparameters["grad_clip"],
            output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape)) for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))

        self.__compiled_methods.grad_step = theano.function(inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        dissimilar_loss, similar_loss, encoder_copy, _, _ = self.__get_siamese_loss(False)
        self.__compiled_methods.test_similar_loss = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            outputs=similar_loss)
        self.__compiled_methods.test_dissimilar_loss = theano.function(
            inputs=[encoder_copy.input_sequence_variable, self.__encoder.input_sequence_variable],
            outputs=dissimilar_loss)
        self.__compiled_methods.encode = theano.function(inputs=[self.__encoder.input_sequence_variable],
                                                         outputs=self.__encoder.get_encoding())

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            self.__compiled_methods = Bunch()
            self.__compile_train_functions()
            self.__compile_test_functions()
            print("Compilation Finished...")

    def train(self, training_file: str, validation_file: str, max_iter: int = 1000, patience: int = 25,
              validation_check_limit: int = 1, additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()
        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.dataset_extractor.get_dataset_for_encoder(training_data,
                                                                           return_num_tokens=True))
        validation_set = list(self.dataset_extractor.get_dataset_for_encoder(import_data(validation_file),
                                                                             return_num_tokens=True))
        best_score = float('-inf')
        train_x_ent = 0
        epochs_not_improved = 0
        historic_values = []
        trainable_parameters = list(self.__encoder.parameters.values())
        print("Num classes: %s" % self.dataset_extractor.num_equivalence_classes)

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings and score all pairs with the contrastive objective.
            encodings = []
            equivalents = defaultdict(set)
            for i, tree in enumerate(dataset):
                encodings.append(self.__compiled_methods.encode(tree[0]))
                equivalents[tree[2]].add(i)
            encodings = np.array(encodings, dtype=theano.config.floatX)
            distances = pdist(encodings, metric='euclidean')
            is_similar = np.zeros_like(distances, dtype=np.int)
            for equivalence_set in equivalents.values():
                for i, j in permutations(equivalence_set, 2):
                    if i > j:
                        # Index of the (j, i) pair in the condensed distance vector returned by pdist.
                        is_similar[encodings.shape[0] * j - int(j * (j + 1) / 2) + i - 1 - j] = 1
            similar_score = -np.sum(np.power(distances * is_similar, 2))
            margin = self.__hyperparameters['dissimilar_margin']
            differences = margin - distances
            rectified_diffs = differences * (differences > 0)
            dissimilar_score = -np.sum(np.power(rectified_diffs * (1 - is_similar), 2))
            print("Similar Loss: %s Dissimilar Loss: %s" % (-similar_score, -dissimilar_score))
            return similar_score + dissimilar_score

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            sum_similar_loss = 0
            num_similar_loss = 0
            sum_dissimilar_loss = 0
            num_dissimilar_loss = 0
            ratios = np.zeros(len(list(self.__encoder.parameters.values())))
            epochs_not_improved = 0
            # Clump minibatches and disallow minibatches that are smaller than their given size,
            # since they may cause instability.
            num_minibatches = max(1, min(int(np.floor(float(len(training_set)) / minibatch_size)), 2))
            current_max_size = 4.
            curriculum_step = .1
            for i in range(max_iter):
                sample_ordering = []
                for j, tree in enumerate(training_set):
                    if tree[-1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # Visit the selected samples in random order.
                n_batches = 0
                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        # Add siamese gradients, by picking num_examples
                        num_examples = 1  # The max number of examples to pick from TODO: as parameter
                        similar_snippet_idxs = []
                        dissimilar_snippet_idxs = []
                        for l in range(len(sample_ordering)):
                            if l == k:
                                continue
                            other_idx = sample_ordering[l]
                            if training_set[current_idx][2] == training_set[other_idx][2]:
                                similar_snippet_idxs.append(other_idx)
                            else:
                                dissimilar_snippet_idxs.append(other_idx)
                        dissimilar_snippet_idxs = np.array(dissimilar_snippet_idxs)
                        np.random.shuffle(similar_snippet_idxs)

                        for other_idx in similar_snippet_idxs:
                            loss, repr1, repr2 = self.__compiled_methods.grad_siamese_similar(
                                list(training_set[current_idx][0]), list(training_set[other_idx][0]))
                            sum_similar_loss += loss
                            num_similar_loss += 1
                        for other_idx in dissimilar_snippet_idxs:
                            loss, repr1, repr2 = self.__compiled_methods.grad_siamese_dissimilar(
                                training_set[current_idx][0], training_set[other_idx][0])
                            sum_dissimilar_loss += loss
                            num_dissimilar_loss += 1 if loss < 0 else 0
                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    if current_score > best_score:
                        best_score = current_score
                        self.__trained_parameters = [p.get_value()
                                                     for p in list(self.__encoder.parameters.values())]
                        print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" % (i, current_score))
                        epochs_not_improved += 1
                    for k in range(len(list(self.__encoder.parameters.values()))):
                        print("%s: %.0e" % (list(self.__encoder.parameters.values())[k].name,
                                            ratios[k] / n_batches))
                    print("Train sum similar-loss: %s (%s samples)" % (sum_similar_loss, num_similar_loss))
                    print("Train sum dissimilar-loss: %s (%s samples)" % (sum_dissimilar_loss,
                                                                          num_dissimilar_loss))
                    print("Training Set stats: %s" % compute_score(training_set[:500]))
                    historic_values.append({"validation_xent": current_score})
                    sum_similar_loss = 0
                    num_similar_loss = 0
                    sum_dissimilar_loss = 0
                    num_dissimilar_loss = 0
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run()
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")
        return best_score, historic_values

    def __save_current_params_as_best(self):
        self.__trained_parameters = [p.get_value() for p in list(self.__encoder.parameters.values())]

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters["representation_size"]

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tokens = self.dataset_extractor.tokens_to_array(data[0])
        return self.__compiled_methods.encode(converted_tokens)

    def decoder_loss(self, data: tuple, representation: np.array) -> float:
        raise NotImplementedError("An encoder cannot do this operation")
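# Explanatory sketch (not in the original source): both siamese encoders mark "similar" pairs
# directly inside the condensed distance vector returned by scipy.spatial.distance.pdist.
# For m points, pdist lists the pairs (j, i) with j < i in row-major order, and the expression
# used in compute_score, m * j - j * (j + 1) // 2 + i - 1 - j, is the index of that pair.
# The helper below is a hypothetical stand-alone version of that computation.

def _condensed_index(m: int, j: int, i: int) -> int:
    """Index of the pair (j, i), with j < i, in the condensed vector produced by pdist on m points."""
    assert 0 <= j < i < m
    return m * j - j * (j + 1) // 2 + i - 1 - j

# Example: with m = 4 points the condensed order is (0,1), (0,2), (0,3), (1,2), (1,3), (2,3),
# so _condensed_index(4, 1, 3) == 4, i.e. the entry holding the distance between points 1 and 3.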
class RecursiveNNSiameseEncoder(AbstractEncoder):
    def __init__(self, training_filename: str, hyperparameters: dict, combination_type='residual_with_ae'):
        self.__hyperparameters = hyperparameters
        self.__dataset_extractor = TreeDatasetExtractor(training_filename)
        self.__rng = RandomStreams()

        self.__rnn = RNN(self.__hyperparameters['memory_size'], self.__hyperparameters, self.__rng,
                         self.__dataset_extractor, combination_type=combination_type)
        self.__trainable_params = list(self.__rnn.get_params().values())
        check_hyperparameters(self.REQUIRED_HYPERPARAMETERS | self.__rnn.required_hyperparameters,
                              self.__hyperparameters)

        self.__compiled_methods = None
        self.__trained_parameters = None

    @staticmethod
    def get_encoder_from_supervised(supervised_encoder, dissimilar_margin: float):
        siamese = RecursiveNNSiameseEncoder.__new__(RecursiveNNSiameseEncoder)
        siamese.__rng = supervised_encoder.rng
        siamese.__rnn = supervised_encoder.rnn
        siamese.__dataset_extractor = supervised_encoder.dataset_extractor
        siamese.__hyperparameters = supervised_encoder.hyperparameters
        siamese.__hyperparameters['dissimilar_margin'] = dissimilar_margin
        siamese.__trainable_params = list(siamese.__rnn.get_params().values())
        saved_parameters = supervised_encoder.trained_parameters
        # print(saved_parameters)
        # siamese.set_parameter_values([saved_parameters[name] for name in siamese.__rnn.get_params()])
        # Ignore the target embeddings
        siamese.__trained_parameters = [p.get_value() for p in siamese.__trainable_params]
        siamese.__compiled_methods = None
        return siamese

    REQUIRED_HYPERPARAMETERS = {'log_learning_rate', 'rmsprop_rho', 'momentum', 'minibatch_size', 'grad_clip',
                                'memory_size', 'log_init_scale_embedding', 'dropout_rate', 'dissimilar_margin',
                                'curriculum_initial_size', 'curriculum_step', 'max_num_similar_examples',
                                'max_num_dissimilar_examples'}

    def __get_loss(self, use_dropout, iteration_number=0):
        node_encoding1, _, extra_loss1 = self.__rnn.get_encoding(use_dropout, iteration_number)
        node_encoding1 /= node_encoding1.norm(2)
        copy_rnn = self.__rnn.copy_full()
        node_encoding2, _, extra_loss2 = copy_rnn.get_encoding(use_dropout, iteration_number)
        node_encoding2 /= node_encoding2.norm(2)
        distance = (node_encoding1 - node_encoding2).norm(2)
        are_non_equivalent = self.__rnn.get_input_variables().eq_symbol - \
            copy_rnn.get_input_variables().eq_symbol
        margin = self.__hyperparameters['dissimilar_margin']
        # Contrastive objective on the unit-normalized encodings: squared distance for equivalent
        # pairs, squared hinge on (margin - distance) for non-equivalent pairs; both negated.
        siamese_loss = -T.power(T.switch(are_non_equivalent, T.nnet.relu(margin - distance), distance), 2)
        return siamese_loss + extra_loss1 + extra_loss2, copy_rnn

    def __compile_train_functions(self):
        iteration_number = T.iscalar('iteration_number')
        prob_correct, other_rnn = self.__get_loss(True, iteration_number)
        grad = T.grad(prob_correct, self.__trainable_params, add_names=True)
        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX))
                    for param in self.__trainable_params] + [theano.shared(0, name="sample_count")]

        inputs = list(self.__rnn.get_input_variables()) + list(other_rnn.get_input_variables()) + \
            [iteration_number]
        self.__compiled_methods.grad_accumulate = theano.function(
            inputs=inputs,
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] +
                    [(grad_acc[-1], grad_acc[-1])],  # TODO: Remove accumulator if indeed not needed
            outputs=T.mean(prob_correct))

        normalized_grads = [T.switch(grad_acc[-1] > 0, g / grad_acc[-1].astype(theano.config.floatX), g)
                            for g in grad_acc[:-1]]
        step_updates, ratios = nesterov_rmsprop_multiple(
            self.__trainable_params, normalized_grads,
            learning_rate=10 ** self.__hyperparameters["log_learning_rate"],
            rho=self.__hyperparameters["rmsprop_rho"],
            momentum=self.__hyperparameters["momentum"],
            grad_clip=self.__hyperparameters["grad_clip"],
            output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape).astype(theano.config.floatX))
                             for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))

        self.__compiled_methods.grad_step = theano.function(inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        prob_correct, other_rnn = self.__get_loss(False)
        inputs = list(self.__rnn.get_input_variables()) + list(other_rnn.get_input_variables())
        self.__compiled_methods.probability = theano.function(inputs=inputs, outputs=[prob_correct])

        encoding, _, _ = self.__rnn.get_encoding(False)
        encoding /= encoding.norm(2)
        self.__compiled_methods.encode = theano.function(inputs=self.__rnn.get_input_variables()[:-1],
                                                         outputs=encoding)

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            if self.__trained_parameters is not None:
                self.set_parameter_values(self.__trained_parameters)
            self.__compiled_methods = Bunch()
            self.__compile_test_functions()
            self.__compile_train_functions()
            print("Compilation Finished...")

    def set_parameter_values(self, parameter_values: list):
        for param, value in zip(self.__trainable_params, parameter_values):
            param.set_value(value)

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters['memory_size']

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tree = self.__dataset_extractor.convert_tree_to_array(data[1])[:-1]
        return self.__compiled_methods.encode(*converted_tree)

    def train(self, training_file, validation_file, max_iter=1000, patience=25, validation_check_limit=1,
              additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()
        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.__dataset_extractor.get_dataset_for_encoder(training_data,
                                                                             return_num_tokens=True))
        validation_set = list(self.__dataset_extractor.get_dataset_for_encoder(import_data(validation_file),
                                                                               return_num_tokens=True))

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings and score all pairs with the contrastive objective.
            encodings = []
            equivalents = defaultdict(set)
            for i, tree in enumerate(dataset):
                encodings.append(self.__compiled_methods.encode(*tree[0][:-1]))
                equivalents[tree[2]].add(i)
            encodings = np.array(encodings, dtype=theano.config.floatX)
            # Pairwise Euclidean distances between the (unit-normalized) encodings.
            distances = pdist(encodings)
            is_similar = np.zeros_like(distances, dtype=np.int)
            for equivalence_set in equivalents.values():
                for i, j in permutations(equivalence_set, 2):
                    if i > j:
                        # Index of the (j, i) pair in the condensed distance vector returned by pdist.
                        is_similar[encodings.shape[0] * j - int(j * (j + 1) / 2) + i - 1 - j] = 1
            similar_score = -np.sum(np.power(distances * is_similar, 2))
            margin = self.__hyperparameters['dissimilar_margin']
            differences = margin - distances
            rectified_diffs = differences * (differences > 0)
            dissimilar_score = -np.sum(np.power(rectified_diffs * (1 - is_similar), 2))
            print("Similar Loss: %s Dissimilar Loss: %s" % (similar_score, dissimilar_score))
            return similar_score + dissimilar_score

        if self.__trained_parameters is None:
            best_score = float('-inf')
        else:
            best_score = compute_validation_score()
            print("Previous best validation score: %s" % best_score)

        try:
            print("[%s] Training Started..." % time.asctime())
            sum_similar_loss = 0.
            num_similar_loss = 0
            sum_dissimilar_loss = 0.
            num_dissimilar_loss = 0
            ratios = np.zeros(len(self.__trainable_params))
            epochs_not_improved = 0
            historic_data = defaultdict(list)
            # Clump minibatches and disallow minibatches that are smaller than their given size,
            # since they may cause instability.
            num_minibatches = max(1, min(int(np.floor(float(len(training_set)) / minibatch_size)), 10))
            current_max_size = self.__hyperparameters['curriculum_initial_size']
            curriculum_step = self.__hyperparameters['curriculum_step']
            num_examples = self.__hyperparameters['max_num_similar_examples']
            num_dissimilar_examples = self.__hyperparameters['max_num_dissimilar_examples']
            for i in range(max_iter):
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # Visit the selected samples in random order.
                n_batches = 0
                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        # Add siamese gradients, by picking at most num_examples similar and
                        # num_dissimilar_examples dissimilar partners for the current sample.
                        similar_snippet_idxs = []
                        dissimilar_snippet_idxs = []
                        for l in range(len(sample_ordering)):
                            if l == k:
                                continue
                            other_idx = sample_ordering[l]
                            if training_set[current_idx][2] == training_set[other_idx][2]:
                                similar_snippet_idxs.append(other_idx)
                            else:
                                dissimilar_snippet_idxs.append(other_idx)
                        dissimilar_snippet_idxs = np.array(dissimilar_snippet_idxs)
                        np.random.shuffle(similar_snippet_idxs)
                        np.random.shuffle(dissimilar_snippet_idxs)

                        for other_idx in similar_snippet_idxs[:num_examples]:
                            args = list(training_set[current_idx][0]) + list(training_set[other_idx][0]) + [i]
                            loss = self.__compiled_methods.grad_accumulate(*args)
                            sum_similar_loss += loss
                            num_similar_loss += 1
                        for other_idx in dissimilar_snippet_idxs[:num_dissimilar_examples]:
                            args = list(training_set[current_idx][0]) + list(training_set[other_idx][0]) + [i]
                            loss = self.__compiled_methods.grad_accumulate(*args)
                            sum_dissimilar_loss += loss
                            num_dissimilar_loss += 1 if loss < 0 else 0
                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    print("Iteration %s Stats" % i)
                    current_score = compute_validation_score()
                    historic_data['validation_score'].append(current_score)
                    if current_score > best_score:
                        best_score = current_score
                        self.__trained_parameters = [p.get_value() for p in self.__trainable_params]
                        print("At %s validation: current_score=%s [best so far]" % (i, current_score))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_score=%s" % (i, current_score))
                        epochs_not_improved += 1
                    for k in range(len(self.__trainable_params)):
                        print("%s: %.0e" % (self.__trainable_params[k].name, ratios[k] / n_batches))
                    print("Train sum similar-loss: %s (%s samples)" % (sum_similar_loss, num_similar_loss))
                    print("Train sum dissimilar-loss: %s (%s samples)" % (sum_dissimilar_loss,
                                                                          num_dissimilar_loss))
                    # print("Training Set stats: %s" % compute_score(training_set[:500]))
                    sum_similar_loss = 0
                    num_similar_loss = 0
                    sum_dissimilar_loss = 0
                    num_dissimilar_loss = 0
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run(historic_data)
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt):
            print("Interrupted. Exiting training gracefully...")
        return best_score, historic_data
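# Usage sketch (not in the original source): the intended workflow appears to be pre-training a
# RecursiveNNSupervisedEncoder and then fine-tuning it with the siamese objective via
# get_encoder_from_supervised. A hypothetical call sequence, with placeholder file names and margin:
#
# supervised = RecursiveNNSupervisedEncoder('train.json.gz', hyperparameters)
# supervised.train('train.json.gz', 'validation.json.gz')
# siamese = RecursiveNNSiameseEncoder.get_encoder_from_supervised(supervised, dissimilar_margin=0.5)
# # siamese.train also reads 'max_num_similar_examples', 'max_num_dissimilar_examples',
# # 'curriculum_initial_size' and 'curriculum_step' from the shared hyperparameters dict.
# siamese.train('train.json.gz', 'validation.json.gz')
# vector = siamese.get_encoding(expression_data)  # unit-norm np.array of length memory_size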
class SequenceGruSupervisedEncoder(AbstractEncoder):
    """
    Train a sequence-GRU supervised encoder.
    """

    def __init__(self, training_file, hyperparameters, encoder_type='gru', use_centroid=False):
        """
        :param training_file: path to the training data.
        :type hyperparameters: dict
        """
        self.__hyperparameters = hyperparameters
        self.dataset_extractor = TokenAutoencoderDatasetExtractor(training_file)
        empirical_distribution = get_empirical_distribution(
            self.dataset_extractor.feature_map,
            chain(*self.dataset_extractor.get_nonnoisy_samples(import_data(training_file))))
        self.__encoder = SequenceGruSupervisedEncoderModel(self.__hyperparameters["embedding_size"],
                                                           len(self.dataset_extractor.feature_map),
                                                           empirical_distribution,
                                                           self.__hyperparameters["representation_size"],
                                                           self.__hyperparameters, encoder_type=encoder_type,
                                                           use_centroid=use_centroid)
        target_embeddings = np.random.randn(self.__hyperparameters["representation_size"],
                                            self.dataset_extractor.num_equivalence_classes) * \
                            10 ** self.__hyperparameters["log_init_noise"]
        self.__target_embeddings = theano.shared(target_embeddings.astype(theano.config.floatX),
                                                 name="target_embeddings")
        self.__target_embeddings_dropout = dropout(self.__hyperparameters['dropout_rate'], self.__encoder.rng,
                                                   self.__target_embeddings, True)

        self.__trained_parameters = None
        self.__compiled_methods = None

    REQUIRED_HYPERPARAMETERS = {'log_learning_rate', 'rmsprop_rho', 'momentum', 'grad_clip', 'minibatch_size',
                                'embedding_size', 'representation_size', 'log_init_noise', 'dropout_rate'}

    def __get_loss(self, target_class, use_dropout):
        encoding = self.__encoder.get_encoding()
        target_embeddings = self.__target_embeddings_dropout if use_dropout else self.__target_embeddings
        # Class log-probabilities from the unit-normalized encoding; the loss is the
        # log-probability of the target equivalence class.
        logprobs = log_softmax(T.dot(encoding / encoding.norm(2), target_embeddings).dimshuffle('x', 0))[0]
        return logprobs, logprobs[target_class]

    def __compile_train_functions(self):
        target_class = T.iscalar(name="target_class")
        _, ll = self.__get_loss(target_class, True)
        wrt_vars = list(self.__encoder.parameters.values()) + [self.__target_embeddings]
        grad = T.grad(ll, wrt_vars)
        grad_acc = [theano.shared(np.zeros(param.get_value().shape).astype(theano.config.floatX))
                    for param in wrt_vars] + [theano.shared(0, name="sample_count")]
        self.__compiled_methods.grad_accumulate = theano.function(
            inputs=[self.__encoder.input_sequence_variable, target_class],
            updates=[(v, v + g) for v, g in zip(grad_acc, grad)] + [(grad_acc[-1], grad_acc[-1] + 1)],
            outputs=ll)

        normalized_grads = [T.switch(grad_acc[-1] > 0, g / grad_acc[-1].astype(theano.config.floatX), g)
                            for g in grad_acc[:-1]]
        step_updates, ratios = nesterov_rmsprop_multiple(
            wrt_vars, normalized_grads,
            learning_rate=10 ** self.__hyperparameters["log_learning_rate"],
            rho=self.__hyperparameters["rmsprop_rho"],
            momentum=self.__hyperparameters["momentum"],
            grad_clip=self.__hyperparameters["grad_clip"],
            output_ratios=True)
        step_updates.extend([(v, T.zeros(v.shape)) for v in grad_acc[:-1]])  # Set accumulators to 0
        step_updates.append((grad_acc[-1], 0))
        self.__compiled_methods.grad_step = theano.function(inputs=[], updates=step_updates, outputs=ratios)

    def __compile_test_functions(self):
        target_class = T.iscalar(name="target_class")
        logprobs, ll = self.__get_loss(target_class, False)
        self.__compiled_methods.ll_and_logprobs = theano.function(
            inputs=[self.__encoder.input_sequence_variable, target_class],
            outputs=[ll, logprobs])
        self.__compiled_methods.encode = theano.function(inputs=[self.__encoder.input_sequence_variable],
                                                         outputs=self.__encoder.get_encoding())

    def __compile_if_needed(self):
        if self.__compiled_methods is None:
            print("Compiling Methods...")
            self.__compiled_methods = Bunch()
            self.__compile_train_functions()
            self.__compile_test_functions()
            print("Compilation Finished...")

    def train(self, training_file: str, validation_file: str, max_iter: int = 1000, patience: int = 25,
              validation_check_limit: int = 1, semantically_equivalent_noise: bool = False,
              additional_code_to_run=None) -> tuple:
        self.__compile_if_needed()
        minibatch_size = self.__hyperparameters["minibatch_size"]
        training_data = import_data(training_file)
        training_set = list(self.dataset_extractor.get_dataset_for_encoder(training_data,
                                                                           return_num_tokens=True))
        validation_set = list(self.dataset_extractor.get_dataset_for_encoder(import_data(validation_file),
                                                                             return_num_tokens=True))
        best_score = float('-inf')
        train_x_ent = 0
        epochs_not_improved = 0
        historic_values = []
        trainable_parameters = list(self.__encoder.parameters.values()) + [self.__target_embeddings]
        print("Num classes: %s" % self.dataset_extractor.num_equivalence_classes)

        def compute_validation_score() -> float:
            return compute_score(validation_set)

        def compute_score(dataset) -> float:
            # Get all encodings and score them against the ground-truth equivalence class.
            sum_ll = 0.
            correct = 0
            for data in dataset:
                ll, logprobs = self.__compiled_methods.ll_and_logprobs(data[0], data[2])
                sum_ll += ll
                if np.argmax(logprobs) == data[2]:
                    correct += 1
            print("Accuracy: %s" % (correct / len(dataset) * 100))
            return sum_ll / len(dataset)

        # Clump minibatches
        num_minibatches = max(1, min(int(np.floor(float(len(training_set)) / minibatch_size)), 25))

        try:
            print("[%s] Training Started..." % time.asctime())
            ratios = np.zeros(len(trainable_parameters))
            n_batches = 0
            current_max_size = 3.
            curriculum_step = .2
            for i in range(max_iter):
                sample_ordering = []
                for j, tree_data in enumerate(training_set):
                    if tree_data[1] <= current_max_size:
                        sample_ordering.append(j)
                current_max_size += curriculum_step
                sample_ordering = np.array(sample_ordering, dtype=np.int32)
                np.random.shuffle(sample_ordering)  # Visit the selected samples in random order.
                n_batches = 0
                sum_train_loss = 0
                num_elements = 0
                for j in trange(num_minibatches, desc="Minibatch"):
                    for k in trange(j * minibatch_size, min((j + 1) * minibatch_size, len(sample_ordering)),
                                    desc="Sample", leave=False):
                        current_idx = sample_ordering[k]
                        loss = self.__compiled_methods.grad_accumulate(training_set[current_idx][0],
                                                                       training_set[current_idx][2])
                        sum_train_loss += loss
                        num_elements += 1
                    n_batches += 1
                    ratios += self.__compiled_methods.grad_step()

                if i % validation_check_limit == validation_check_limit - 1:
                    current_ll = compute_validation_score()
                    if current_ll > best_score:
                        best_score = current_ll
                        self.__save_current_params_as_best()
                        print("At %s validation: current_ll=%s [best so far]" % (i, current_ll))
                        epochs_not_improved = 0
                    else:
                        print("At %s validation: current_ll=%s" % (i, current_ll))
                        epochs_not_improved += 1
                    for k in range(len(trainable_parameters)):
                        print("%s: %.0e" % (trainable_parameters[k].name, ratios[k] / n_batches))
                    print("Train ll: %s" % (sum_train_loss / num_elements))
                    ratios = np.zeros_like(ratios)
                    if additional_code_to_run is not None:
                        additional_code_to_run()
                if epochs_not_improved >= patience:
                    print("Not improved for %s epochs. Stopping..." % patience)
                    break
            print("[%s] Training Finished..." % time.asctime())
        except (InterruptedError, KeyboardInterrupt, SystemExit):
            print("Interrupted. Exiting training gracefully...")
        return best_score, historic_values

    def __save_current_params_as_best(self):
        self.__trained_parameters = [p.get_value() for p in
                                     list(self.__encoder.parameters.values()) + [self.__target_embeddings]]

    def save(self, filename: str):
        tmp, self.__compiled_methods = self.__compiled_methods, None
        AbstractEncoder.save(self, filename)
        self.__compiled_methods = tmp

    def get_representation_vector_size(self) -> int:
        return self.__hyperparameters["representation_size"]

    def get_encoding(self, data: tuple) -> np.array:
        self.__compile_if_needed()
        converted_tokens = self.dataset_extractor.tokens_to_array(data[0])
        return self.__compiled_methods.encode(converted_tokens)
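# Usage sketch (not in the original source): a hypothetical end-to-end call sequence for the
# token-level supervised encoder. File names and hyperparameter values are placeholders.
#
# hyperparameters = {
#     'log_learning_rate': -3, 'rmsprop_rho': 0.9, 'momentum': 0.9, 'grad_clip': 1,
#     'minibatch_size': 100, 'embedding_size': 32, 'representation_size': 64,
#     'log_init_noise': -2, 'dropout_rate': 0.1,
# }
# encoder = SequenceGruSupervisedEncoder('train.json.gz', hyperparameters, encoder_type='gru')
# best_ll, history = encoder.train('train.json.gz', 'validation.json.gz')
# encoder.save('gru-supervised-encoder.pkl')
# vector = encoder.get_encoding(token_sequence_data)  # np.array of length representation_size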