def train_siamese_triplets_cnn(options_dict):
    """Train and save a Siamese triplets CNN using the specified options."""

    # Preliminary
    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    train_batch_iterator = BatchIteratorTriplets(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorTriplets(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorTriplets(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False
        )

    # Setup model

    logger.info("Building Siamese triplets CNN")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x3 = T.matrix("x3")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")
    x3_indices = T.ivector("x3_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletCNN(
        rng, x1, x2, x3, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]*model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            x3: dev_x[x3_indices],
            },
        mode=theano_mode,
        )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            x3: test_x[x3_indices],
            },
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            x3: train_x[x3_indices],
            },
        mode=theano_mode,
        )

    # Train model
    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(
        distances[matches == True], distances[matches == False]
        )
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")

def train_siamese_triplets_lstm_nn(options_dict):
    """Train and save a Siamese triplets LSTM using the specified options."""

    # Preliminary
    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff_mask(rng, options_dict["data_dir"])
    train_x, train_mask, train_lengths, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_mask, dev_lengths, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_mask, test_lengths, test_matches_vec, test_labels = datasets[2]

    # Make batch iterators
    train_triplet_iterator = BatchIteratorTriplets(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True,
        )
    validate_triplet_iterator = BatchIteratorTriplets(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False,
        )
    test_triplet_iterator = BatchIteratorTriplets(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False,
        )

    # Setup model

    logger.info("Building Siamese triplets LSTM")

    # Symbolic variables
    x1 = tensor.tensor3("x1", dtype=THEANOTYPE)
    x2 = tensor.tensor3("x2", dtype=THEANOTYPE)
    x3 = tensor.tensor3("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)
    x1_indices = tensor.ivector("x1_indices")
    x2_indices = tensor.ivector("x2_indices")
    x3_indices = tensor.ivector("x3_indices")
    l1 = tensor.iscalar("l1")
    l2 = tensor.iscalar("l2")
    l3 = tensor.iscalar("l3")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletBatchLSTMNN(
        rng, x1, x2, x3, m1, m2, m3,
        n_in=39,
        n_lstm_hiddens=options_dict["n_hiddens"],
        mlp_hidden_specs=options_dict["hidden_layer_specs"],
        )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l2_weight"]*model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices].swapaxes(0, 1)[:dev_lengths[x1_indices].max()],
            m1: dev_mask[x1_indices].T[:dev_lengths[x1_indices].max()],
            x2: dev_x[x2_indices].swapaxes(0, 1)[:dev_lengths[x2_indices].max()],
            m2: dev_mask[x2_indices].T[:dev_lengths[x2_indices].max()],
            x3: dev_x[x3_indices].swapaxes(0, 1)[:dev_lengths[x3_indices].max()],
            m3: dev_mask[x3_indices].T[:dev_lengths[x3_indices].max()],
            },
        mode=theano_mode,
        )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices].swapaxes(0, 1)[:test_lengths[x1_indices].max()],
            m1: test_mask[x1_indices].T[:test_lengths[x1_indices].max()],
            x2: test_x[x2_indices].swapaxes(0, 1)[:test_lengths[x2_indices].max()],
            m2: test_mask[x2_indices].T[:test_lengths[x2_indices].max()],
            x3: test_x[x3_indices].swapaxes(0, 1)[:test_lengths[x3_indices].max()],
            m3: test_mask[x3_indices].T[:test_lengths[x3_indices].max()],
            },
        mode=theano_mode,
        )
    # test_model = theano.function(
    #     inputs=[x1_indices, x2_indices, x3_indices],
    #     outputs=outputs,
    #     givens={
    #         l1: test_lengths[x1_indices].max(),
    #         x1: test_x[x1_indices].swapaxes(0, 1)[:l1],
    #         m1: test_mask[x1_indices][:l1],
    #         l2: test_lengths[x2_indices].max(),
    #         x2: test_x[x2_indices].swapaxes(0, 1)[:l2],
    #         m2: test_mask[x2_indices][:l2],
    #         l3: test_lengths[x3_indices].max(),
    #         x3: test_x[x3_indices].swapaxes(0, 1)[:l3],
    #         m3: test_mask[x3_indices][:l3],
    #         },
    #     mode=theano_mode,
    #     )

    # Gradients and training updates
    parameters = model.parameters
    gradients = tensor.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices].swapaxes(0, 1)[:train_lengths[x1_indices].max()],
            m1: train_mask[x1_indices].T[:train_lengths[x1_indices].max()],
            x2: train_x[x2_indices].swapaxes(0, 1)[:train_lengths[x2_indices].max()],
            m2: train_mask[x2_indices].T[:train_lengths[x2_indices].max()],
            x3: train_x[x3_indices].swapaxes(0, 1)[:train_lengths[x3_indices].max()],
            m3: train_mask[x3_indices].T[:train_lengths[x3_indices].max()],
            },
        mode=theano_mode,
        )
    # train_model = theano.function(
    #     inputs=[x1_indices, x2_indices, x3_indices],
    #     outputs=outputs,
    #     updates=updates,
    #     givens={
    #         l1: train_lengths[x1_indices].max(),
    #         x1: train_x[x1_indices].swapaxes(0, 1)[:l1],
    #         m1: train_mask[x1_indices][:l1],
    #         l2: train_lengths[x2_indices].max(),
    #         x2: train_x[x2_indices].swapaxes(0, 1)[:l2],
    #         m2: train_mask[x2_indices][:l2],
    #         l3: train_lengths[x3_indices].max(),
    #         x3: train_x[x3_indices].swapaxes(0, 1)[:l3],
    #         m3: train_mask[x3_indices][:l3],
    #         },
    #     mode=theano_mode,
    #     )

    # Train model
    logger.info("Training Siamese triplets LSTM")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_triplet_iterator=train_triplet_iterator,
        validate_model=validate_model,
        validate_triplet_iterator=validate_triplet_iterator,
        test_model=test_model,
        test_triplet_iterator=test_triplet_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(
        distances[matches == True], distances[matches == False]
        )
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")

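# Note on `train_siamese_triplets_lstm_nn` options (sketch): compared to the
# CNN triplets trainer above, this function loads padded sequences and masks
# via `data_io.load_swbd_same_diff_mask`, reads "n_hiddens" (LSTM hidden-layer
# sizes passed as `n_lstm_hiddens`) instead of "conv_layer_specs", and applies
# only "l2_weight" regularization; the loss is again "hinge_cos" with a
# "margin". Illustrative values only (not settings from this recipe):
#
# options_dict["n_hiddens"] = [256]   # assumed LSTM hidden-layer sizes
# options_dict["l2_weight"] = 0.0
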
def train_mlp(options_dict):
    """Train and save a word classifier MLP."""

    # Preliminary
    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"]
        )
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches
        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]
    n_train_batches = train_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building MLP")

    # Symbolic variables
    i_batch = T.lscalar()   # batch index
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    y = T.ivector("y")      # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    model = mlp.MLP(
        rng, x, d_in, options_dict["d_out"], options_dict["hidden_layer_specs"],
        srng, options_dict["dropout_rates"]
        )
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]*model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: dev_x[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            y: dev_y[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            },
        )
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: test_x[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            y: test_y[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            },
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x: train_x[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            y: train_y[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            },
        )

    # Train model
    logger.info("Training MLP")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645,
        i_layer=options_dict["i_layer_eval"]
        )
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(
        distances[matches == True], distances[matches == False]
        )
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")

def train_cnn(options_dict):
    """Train and save a word classifier CNN."""

    # Preliminary
    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
        # root_logger = logging.getLogger()
        # formatter = root_logger.handlers[0].formatter
        # root_logger.removeHandler(root_logger.handlers[0])
        # file_handler = logging.FileHandler(log_fn, "a")
        # file_handler.setFormatter(formatter)
        # root_logger.addHandler(file_handler)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"]
        )
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches
        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]
    n_train_batches = train_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building CNN")

    # Symbolic variables
    i_batch = T.lscalar()   # batch index
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    y = T.ivector("y")      # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = cnn.CNN(
        rng, x, input_shape, options_dict["conv_layer_specs"],
        options_dict["hidden_layer_specs"], options_dict["d_out"],
        srng, options_dict["dropout_rates"]
        )
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]*model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: dev_x[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            y: dev_y[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            },
        )
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: test_x[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            y: test_y[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            },
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x: train_x[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            y: train_y[i_batch*options_dict["batch_size"]:(i_batch + 1)*options_dict["batch_size"]],
            },
        )

    # Train model
    logger.info("Training CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645,
        i_layer=options_dict["i_layer_eval"]
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(
        distances[matches == True], distances[matches == False]
        )
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")

def train_siamese_cnn(options_dict):
    """Train and save a Siamese CNN using the specified options."""

    # Preliminary
    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    train_batch_iterator = BatchIteratorSameDifferent(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorSameDifferent(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorSameDifferent(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False
        )

    # Setup model

    logger.info("Building Siamese CNN")

    # Symbolic variables
    y = T.ivector("y")      # indicates whether x1 and x2 are the same (1) or different (0)
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseCNN(
        rng, x1, x2, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
    if options_dict["loss"] == "cos_cos2":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos2(y)
        else:
            loss = model.loss_cos_cos2(y)
        error = model.loss_cos_cos2(y)  # doesn't include regularization or dropout
    elif options_dict["loss"] == "cos_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos(y)
        else:
            loss = model.loss_cos_cos(y)
        error = model.loss_cos_cos(y)
    elif options_dict["loss"] == "cos_cos_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos_margin(y)
        else:
            loss = model.loss_cos_cos_margin(y)
        error = model.loss_cos_cos_margin(y)
    elif options_dict["loss"] == "euclidean_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_euclidean_margin(y)
        else:
            loss = model.loss_euclidean_margin(y)
        error = model.loss_euclidean_margin(y)
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]*model.l2

    # Compile test functions
    same_distance = model.cos_same(y)  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff(y)
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            },
        mode=theano_mode,
        )
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Train model
    logger.info("Training Siamese CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(
        distances[matches == True], distances[matches == False]
        )
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")