def load_swbd_same_diff_nopadding(rng, data_dir):
    """Load the train, dev and test same-different sets without padding.

    For each of ``swbd.train.npz``, ``swbd.dev.npz`` and ``swbd.test.npz``
    in `data_dir`, the utterance keys are sorted, shuffled in place with
    `rng.shuffle`, and the per-utterance arrays are stacked row-wise into a
    single matrix. `begins` and `ends` hold, per utterance, the first row and
    one-past-the-last row of that utterance in the stacked matrix.

    Returns a list with one
    ``(shared_x, begins, ends, matches_vec, labels)`` tuple per set, in the
    order train, dev, test.

    Any error raised while stacking or casting the data propagates to the
    caller (previously a bare ``except`` dropped into ``pdb`` and then failed
    with a ``NameError`` on `shared_x`).
    """
    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []
    # `subset` rather than `set`, to avoid shadowing the builtin.
    for subset in ["train", "dev", "test"]:
        npz_fn = path.join(data_dir, "swbd." + subset + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        xs = [npz[i] for i in utt_ids]

        # Row offsets of every utterance inside the stacked matrix:
        # `ends` is the inclusive cumulative length, `begins` the exclusive
        # one (i.e. `ends` shifted right by one with a leading zero).
        ls = np.asarray([len(x) for x in xs], dtype=int)
        base_inds = np.cumsum(ls)
        ends = theano.shared(base_inds, borrow=True)
        base_begins = base_inds - ls
        begins = theano.shared(base_begins, borrow=True)

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)
        matches_vec = samediff.generate_matches_array(labels)

        shared_x = theano.shared(
            np.asarray(np.vstack(xs), dtype=THEANOTYPE), borrow=True
        )

        # Create a tuple for this set and add to `datasets`
        datasets.append((shared_x, begins, ends, matches_vec, labels))

    return datasets
def load_swbd_same_diff(rng, data_dir):
    """Load the train, dev and test sets for same-different training.

    Reads ``swbd.<set>.npz`` from `data_dir` for each set. The archive keys
    ``"width"`` and ``"padding"`` are not treated as utterances. The remaining
    keys are sorted and then shuffled in place with `rng.shuffle`.

    Returns a list of ``(shared_x, matches_vec, labels)`` tuples in the order
    train, dev, test.
    """
    logger.info("Loading same and different pairs: " + data_dir)

    non_utterance_keys = ("width", "padding")
    datasets = []
    for subset in ("train", "dev", "test"):
        npz_fn = path.join(data_dir, "swbd." + subset + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data, drop the bookkeeping entries and shuffle
        npz = np.load(npz_fn)
        utt_ids = [
            key for key in sorted(npz.keys())
            if key not in non_utterance_keys
        ]
        rng.shuffle(utt_ids)
        features = [npz[utt_id] for utt_id in utt_ids]

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)
        matches_vec = samediff.generate_matches_array(labels)

        shared_x = theano.shared(
            np.asarray(features, dtype=THEANOTYPE), borrow=True
        )

        # One tuple per set
        datasets.append((shared_x, matches_vec, labels))

    return datasets
def eval_samediff(segments_dict):
    """Return average precision and precision-recall breakeven.

    Every unordered pair of keys in `segments_dict` (taken in sorted key
    order) is scored with `dtw_cost_func`; the label of a segment is the part
    of its key before the first underscore.

    Returns the tuple ``(ap, prb)`` from `samediff.average_precision`.
    """
    # Generate list of pairs: (k_0, k_1), (k_0, k_2), ..., (k_{m-2}, k_{m-1})
    keys = sorted(segments_dict.keys())
    n_keys = len(keys)
    pairs = [
        (keys[first], keys[second])
        for first in range(0, n_keys - 1)
        for second in range(first + 1, n_keys)
    ]

    print("Calculating distances:")
    costs = np.zeros(len(pairs))
    for i_pair, (utt_id_1, utt_id_2) in enumerate(tqdm(pairs)):
        features_1 = np.array(
            segments_dict[utt_id_1], dtype=np.double, order="c"
        )
        features_2 = np.array(
            segments_dict[utt_id_2], dtype=np.double, order="c"
        )
        costs[i_pair] = dtw_cost_func(features_1, features_2, True)

    # Same-different
    distances_vec = np.asarray(costs)
    labels = [key.split("_")[0] for key in keys]
    matches = samediff.generate_matches_array(labels)
    same_distances = distances_vec[matches == True]
    diff_distances = distances_vec[matches == False]
    ap, prb = samediff.average_precision(
        same_distances, diff_distances, False
    )
    return (ap, prb)
def load_swbd_same_diff(rng, data_dir):
    """Load the train, dev and test sets for same-different training.

    Reads ``swbd.<set>.npz`` from `data_dir` for each set; the keys are
    sorted and then shuffled in place with `rng.shuffle`.

    Returns a list of ``(shared_x, matches_vec, labels)`` tuples in the order
    train, dev, test.
    """
    logger.info("Loading same and different pairs: " + data_dir)

    def load_one(subset):
        # Read one archive and wrap its data in a Theano shared variable.
        npz_fn = path.join(data_dir, "swbd." + subset + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        features = [npz[utt_id] for utt_id in utt_ids]

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)
        matches_vec = samediff.generate_matches_array(labels)

        shared_x = theano.shared(
            np.asarray(features, dtype=THEANOTYPE), borrow=True
        )
        return (shared_x, matches_vec, labels)

    return [load_one(subset) for subset in ("train", "dev", "test")]
def GetTestData(dict):
    """Build padded test data, lengths and the same/different match array.

    `dict` is expected to expose a ``files`` list of keys and item access by
    key (as a loaded ``.npz`` archive does). The label of each entry is the
    part of its key before the first underscore; a key without an underscore
    is used whole as its label.

    Returns ``(data, lengths, matches)`` where `data` and `lengths` come from
    `Padding` and `matches` from `samediff.generate_matches_array`.
    """
    labels = []
    data = []
    for key in dict.files:
        # `partition` keeps the full key when there is no underscore, whereas
        # slicing with `find` (which returns -1) would drop the last character.
        labels.append(key.partition('_')[0])
        data.append(dict[key])
    data, lengths = Padding(data)
    matches = samediff.generate_matches_array(labels)
    return data, lengths, matches
def main(): args = check_argv() print datetime.datetime.now() print "Reading:", args.npz_fn npz = np.load(args.npz_fn) print datetime.datetime.now() # if args.normalize: # print "Normalizing embeddings" # else: print "Ordering embeddings" n_embeds = 0 X = [] ids = [] for label in sorted(npz): ids.append(label) X.append(npz[label]) n_embeds += 1 X = np.array(X) print "No. embeddings:", n_embeds print "Embedding dimensionality:", X.shape[1] print datetime.datetime.now() print "Calculating distances" distances = pdist(X, metric=args.metric) print datetime.datetime.now() print "Getting labels" labels = [] for utt_id in ids: word = "_".join(utt_id.split("_")[:-2]) labels.append(word) if args.mean_ap: print datetime.datetime.now() print "Calculating mean average precision" mean_ap, mean_prb, ap_dict = samediff.mean_average_precision(distances, labels) print "Mean average precision:", mean_ap print "Mean precision-recall breakeven:", mean_prb print datetime.datetime.now() print "Calculating average precision" matches = samediff.generate_matches_array(labels) ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False]) print "Average precision:", ap print "Precision-recall breakeven:", prb print datetime.datetime.now()
def samediff_val(normalise=True):
    """Embed the validation data and score same-word, different-speaker AP.

    Uses names from the surrounding scope (`options_dict`, `val_x`,
    `val_labels`, `val_speakers`, `val_keys`, `val_model_fn`, `encoding`,
    `x`, `x_lengths`). Only the first batch of the iterator is embedded.

    When `normalise` is true the embeddings are mean and variance normalised
    per dimension before cosine distances are computed.

    Returns ``[swdp_prb, -swdp_ap]``.
    """
    # Embed validation
    np.random.seed(options_dict["rnd_seed"])
    val_batch_iterator = batching.SimpleIterator(val_x, len(val_x), False)
    labels = [val_labels[idx] for idx in val_batch_iterator.indices]
    speakers = [val_speakers[idx] for idx in val_batch_iterator.indices]

    saver = tf.train.Saver()
    with tf.Session() as session:
        saver.restore(session, val_model_fn)
        for batch_x_padded, batch_x_lengths in val_batch_iterator:
            feed = {x: batch_x_padded, x_lengths: batch_x_lengths}
            np_z = session.run([encoding], feed_dict=feed)[0]
            break  # single batch

        embed_dict = {
            val_keys[idx]: np_z[position]
            for position, idx in enumerate(val_batch_iterator.indices)
        }

    # Same-different
    if normalise:
        embeddings = (np_z - np_z.mean(axis=0)) / np_z.std(axis=0)
    else:
        embeddings = np_z
    distances = pdist(embeddings, metric="cosine")

    word_matches = samediff.generate_matches_array(labels)
    speaker_matches = samediff.generate_matches_array(speakers)
    same_word_same_speaker = np.logical_and(word_matches, speaker_matches)
    same_word_diff_speaker = np.logical_and(
        word_matches, speaker_matches == False
    )
    sw_ap, sw_prb, swdp_ap, swdp_prb = samediff.average_precision_swdp(
        distances[same_word_same_speaker],
        distances[same_word_diff_speaker],
        distances[word_matches == False],
    )
    return [swdp_prb, -swdp_ap]
def samediff_val(normalise=True):
    """Embed the validation data with `z_mean` and score same-different AP.

    Uses names from the surrounding scope (`options_dict`, `val_x`,
    `val_labels`, `val_model_fn`, `z_mean`, `a`, `a_lengths`). Only the first
    batch of the iterator is embedded.

    When `normalise` is true the embeddings are mean and variance normalised
    per dimension before cosine distances are computed.

    Returns ``[prb, -ap]``.
    """
    # Embed validation
    np.random.seed(options_dict["rnd_seed"])
    val_batch_iterator = batching.SimpleIterator(val_x, len(val_x), False)
    labels = [val_labels[i] for i in val_batch_iterator.indices]

    saver = tf.train.Saver()
    with tf.Session() as session:
        saver.restore(session, val_model_fn)
        for batch_x_padded, batch_x_lengths in val_batch_iterator:
            np_z = session.run(
                [z_mean],
                feed_dict={a: batch_x_padded, a_lengths: batch_x_lengths}
            )[0]
            break  # single batch

    # Same-different: the two branches differ only in the normalisation, so
    # the distance and precision computation is shared.
    if normalise:
        np_z = (np_z - np_z.mean(axis=0)) / np_z.std(axis=0)
    distances = pdist(np_z, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(
        distances[matches == True], distances[matches == False]
    )
    return [prb, -ap]
def main():
    """Score the embeddings in an npz archive on the same-different task.

    Reads `args.npz_fn`, optionally applies per-dimension mean and variance
    normalisation (`args.mvn`), computes pairwise distances with
    `args.metric` (``"kl"`` selects `scipy.stats.entropy`), and prints average
    precision and precision-recall breakeven. The word label of an utterance
    is the part of its key before the first underscore.
    """
    args = check_argv()

    print(datetime.now())

    print("Reading:", args.npz_fn)
    npz = np.load(args.npz_fn)

    print(datetime.now())

    print("Ordering embeddings")
    ids = sorted(npz)
    X = np.array([npz[utt_id] for utt_id in ids])
    n_embeds = len(ids)
    print("No. embeddings:", n_embeds)
    print("Embedding dimensionality:", X.shape[1])

    if args.mvn:
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    print(datetime.now())

    print("Calculating distances")
    if args.metric == "kl":
        import scipy.stats
        metric = scipy.stats.entropy
    else:
        metric = args.metric
    distances = pdist(X, metric=metric)

    print("Getting labels")
    labels = [utt_id.split("_")[0] for utt_id in ids]

    print("Calculating average precision")
    matches = samediff.generate_matches_array(labels)
    same_distances = distances[matches == True]
    diff_distances = distances[matches == False]
    ap, prb = samediff.average_precision(same_distances, diff_distances)
    print("Average precision: {:.4f}".format(ap))
    print("Precision-recall breakeven: {:.4f}".format(prb))

    print(datetime.now())
def main(): args = check_argv() print datetime.datetime.now() print "Reading:", args.npz_fn npz = np.load(args.npz_fn) print datetime.datetime.now() # if args.normalize: # print "Normalizing embeddings" # else: print "Ordering embeddings" n_embeds = 0 X = [] ids = [] for label in sorted(npz): ids.append(label) X.append(npz[label]) n_embeds += 1 X = np.array(X) print "No. embeddings:", n_embeds print "Embedding dimensionality:", X.shape[1] print datetime.datetime.now() print "Calculating distances" metric = args.metric if metric == "kl": import scipy.stats metric = scipy.stats.entropy distances = pdist(X, metric=metric) print "Getting labels" labels = [] for utt_id in ids: word = "_".join(utt_id.split("_")[:-2]) labels.append(word) print "Calculating average precision" matches = samediff.generate_matches_array(labels) ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False]) print "Average precision:", ap print "Precision-recall breakeven:", prb print datetime.datetime.now()
def samediff_val(normalise=False):
    """Embed the validation data with `output` and score same-different AP.

    Uses names from the surrounding scope (`options_dict`, `val_x`,
    `val_labels`, `val_keys`, `val_model_fn`, `output`, `x`). Only the first
    batch of the iterator is embedded.

    When `normalise` is true the embeddings are mean and variance normalised
    per dimension before cosine distances are computed.

    Returns ``[prb, -ap]``.
    """
    # Embed validation
    np.random.seed(options_dict["rnd_seed"])
    val_batch_iterator = batching.LabelledIterator(
        val_x, None, val_x.shape[0], False
    )
    labels = [val_labels[idx] for idx in val_batch_iterator.indices]

    saver = tf.train.Saver()
    with tf.Session() as session:
        saver.restore(session, val_model_fn)
        for batch_x in val_batch_iterator:
            np_z = session.run([output], feed_dict={x: batch_x})[0]
            break  # single batch

        embed_dict = {
            val_keys[idx]: np_z[position]
            for position, idx in enumerate(val_batch_iterator.indices)
        }

    # Same-different
    if normalise:
        embeddings = (np_z - np_z.mean(axis=0))/np_z.std(axis=0)
    else:
        embeddings = np_z
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    same_distances = distances[matches == True]
    diff_distances = distances[matches == False]
    ap, prb = samediff.average_precision(same_distances, diff_distances)
    return [prb, -ap]
def load_swbd_same_diff_mask_old(rng, data_dir, filter_length=None, seq_length=200):
    """Load the train, dev and test sets as zero-padded arrays with masks.

    For each ``swbd.<set>.npz`` in `data_dir` the keys are sorted, shuffled
    in place with `rng.shuffle`, and every utterance is copied into the
    leading rows of a zero array of `seq_length` frames (or of the longest
    utterance when `seq_length` is None). The mask is 1.0 on the frames that
    were filled and 0.0 elsewhere.

    When `filter_length` is given, ``filter_length - 1`` is subtracted from
    the stored lengths (the padded data and mask are left untouched).

    Returns a list of
    ``(shared_x, shared_mask, shared_ls, matches_vec, labels)`` tuples in the
    order train, dev, test.
    """
    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []
    for subset in ("train", "dev", "test"):
        npz_fn = path.join(data_dir, "swbd." + subset + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)

        ls = np.asarray(
            [len(npz[utt_id]) for utt_id in utt_ids], dtype=np.int32
        )
        if seq_length is None:
            max_length = ls.max()
        else:
            max_length = seq_length

        n_utts = len(ls)
        n_dims = npz[utt_ids[0]].shape[1]
        xs = np.zeros((n_utts, max_length, n_dims), dtype=THEANOTYPE)
        mask = np.zeros((n_utts, max_length), dtype=THEANOTYPE)
        for row, (utt_id, length) in enumerate(zip(utt_ids, ls)):
            xs[row, :length] = npz[utt_id]
            mask[row, :length] = 1.0

        # perform adjustment for convlstms, since we perform a convolution
        # first over the time series
        if filter_length is not None:
            ls -= filter_length - 1

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)
        matches_vec = samediff.generate_matches_array(labels)

        shared_x = theano.shared(xs, borrow=True)
        shared_mask = theano.shared(mask, borrow=True)
        shared_ls = theano.shared(ls, borrow=True)

        # One tuple per set
        datasets.append(
            (shared_x, shared_mask, shared_ls, matches_vec, labels)
        )

    return datasets
def load_swbd_same_diff_mask(rng, data_dir, filter_length=None):
    """Load the train, dev and test sets together with per-utterance masks.

    For each set, ``swbd.<set>.npz`` holds the data and ``width.<set>.npz``
    and ``padding.<set>.npz`` hold, per utterance key, the width and the
    leading padding used to build the mask. Each utterance array is
    transposed before being stored. The mask for an utterance is 1.0 on
    ``[padding, padding + width)`` and 0.0 elsewhere.

    `filter_length` is accepted but not used by this function.

    Returns a list of ``(shared_x, shared_m, matches_vec, labels)`` tuples in
    the order train, dev, test.
    """
    logger.info("Loading same and different pairs: " + data_dir)

    datasets = []
    for subset in ("train", "dev", "test"):
        npz_fn = path.join(data_dir, "swbd." + subset + ".npz")
        width_fn = path.join(data_dir, "width." + subset + ".npz")
        padding_fn = path.join(data_dir, "padding." + subset + ".npz")
        logger.info("Reading: " + npz_fn)

        # Load data and shuffle
        npz = np.load(npz_fn)
        widths = np.load(width_fn)
        paddings = np.load(padding_fn)
        utt_ids = sorted(npz.keys())
        rng.shuffle(utt_ids)
        x = [npz[utt_id].T for utt_id in utt_ids]

        # Get labels for each utterance
        labels = swbd_utts_to_labels(utt_ids)
        matches_vec = samediff.generate_matches_array(labels)

        mask = np.zeros((len(x), len(x[0])), dtype=THEANOTYPE)
        for row, utt_id in enumerate(utt_ids):
            start = int(paddings[utt_id])
            stop = start + int(widths[utt_id])
            mask[row, start:stop] = 1.0

        shared_x = theano.shared(np.asarray(x, dtype=THEANOTYPE), borrow=True)
        shared_m = theano.shared(mask, borrow=True)

        # One tuple per set
        datasets.append((shared_x, shared_m, matches_vec, labels))

    return datasets
def train_mlp(options_dict):
    """Train and save a word classifier MLP.

    Steps, in order: create `options_dict["model_dir"]` and configure
    logging; load the labelled data through `data_io.load_swbd_labelled`;
    pickle `options_dict` (with ``"d_out"`` added) to ``options_dict.pkl.gz``;
    build the MLP and compile the Theano train/validate/test functions; train
    with `training.train_fixed_epochs_with_validation`; finally run a
    same-different evaluation on the ``"dev"`` layer outputs and write the
    average precision to ``dev_ap.txt`` in the model directory.

    Keys read from `options_dict` in this function: ``model_dir``,
    ``log_to_file`` (optional), ``rnd_seed``, ``dropout_rates``, ``data_dir``,
    ``min_count``, ``batch_size``, ``hidden_layer_specs``, ``l1_weight``,
    ``l2_weight``, ``learning_rule``, ``n_max_epochs``, ``i_layer_eval``.
    The function mutates `options_dict` by setting ``"d_out"``.

    This is Python 2 code (`print` statement, `xrange`).
    """

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    # The same seed drives both the NumPy generator and, when dropout is
    # enabled, the Theano random stream.
    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        # Yields `[0]`, `[1]`, ..., `[n_batches - 1]`: each item is the
        # single-element argument list for the compiled functions below,
        # which take the batch index as their only input.
        def __init__(self, n_batches):
            self.n_batches = n_batches

        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]

    # NOTE(review): `/` is integer division under Python 2 when both operands
    # are ints, so a trailing partial batch is presumably dropped — confirm
    # that `batch_size` is always an int.
    n_train_batches = train_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    # NOTE(review): 39 * 200 is hard-coded; presumably 39 feature dimensions
    # by 200 frames per token — TODO confirm against the data loader.
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    # One output unit per entry in the word-to-index map
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)  # -1 selects the highest pickle protocol
    f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building MLP")

    # Symbolic variables
    i_batch = T.lscalar()  # batch index
    x = T.matrix("x")  # flattened data of shape (n_data, d_in)
    y = T.ivector("y")  # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    model = mlp.MLP(rng, x, d_in, options_dict["d_out"],
                    options_dict["hidden_layer_specs"], srng,
                    options_dict["dropout_rates"])
    # The training loss uses the dropout variant when dropout is enabled;
    # `error` is always taken from `model.errors`.
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict[
            "l2_weight"] * model.l2

    # Compile test functions
    # Each compiled function takes a batch index and slices the matching rows
    # out of the corresponding data variables via `givens`.
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: dev_x[i_batch * options_dict["batch_size"]:(i_batch + 1) * options_dict["batch_size"]],
            y: dev_y[i_batch * options_dict["batch_size"]:(i_batch + 1) * options_dict["batch_size"]]
        })
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: test_x[i_batch * options_dict["batch_size"]:(i_batch + 1) * options_dict["batch_size"]],
            y: test_y[i_batch * options_dict["batch_size"]:(i_batch + 1) * options_dict["batch_size"]]
        })

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        # NOTE(review): this check disappears under `python -O`, in which case
        # `updates` is undefined below.
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x: train_x[i_batch * options_dict["batch_size"]:(i_batch + 1) * options_dict["batch_size"]],
            y: train_y[i_batch * options_dict["batch_size"]:(i_batch + 1) * options_dict["batch_size"]]
        },
    )

    # Train model

    logger.info("Training MLP")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    # `record_dict` is not used further in this function.
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    # NOTE(review): batch_size=645 is hard-coded; presumably chosen to fit the
    # size of the dev set — TODO confirm.
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"],
        "dev",
        batch_size=645,
        i_layer=options_dict["i_layer_eval"])
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    # Only the average precision is written out; `prb` is discarded.
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
# Script fragment: the enclosing scope and the definitions of `model`,
# `x_train`, `x_test`, `utt_ids_tst`, `N_train`, `N_test`, `batchsize`, `args`
# are outside this view.

# Forward all training data through the model with test=False; the outputs
# are discarded. NOTE(review): presumably this updates batch-normalisation
# statistics (see the commented-out block at the end) — TODO confirm.
for i in xrange(0, N_train, batchsize):
    x_batch = x_train[i : i + batchsize]
    model.forward(x_batch,test=False)

logger.info("Extracting final layer")
save_to = args.save_to
X=[]
# Forward the test items one at a time and keep the softmax output on the CPU.
for i in xrange(0, N_test):
    utt_id = utt_ids_tst[i]  # not used below
    x_batch = x_test[i : i + 1]
    X.append(cuda.to_cpu(F.softmax(model.forward(x_batch,test=True)).data))

# Each appended item came from a one-row batch; index 0 on the middle axis
# drops that batch axis.
X=numpy.asarray(X)[:,0,:]

# NOTE(review): "Calcurating" in the log message below is a typo for
# "Calculating".
logger.info("Calcurating average precision")
start_time = timeit.default_timer()
labels = swbd_utts_to_labels(utt_ids_tst)
distances = pdist(X, metric="cosine")
matches = samediff.generate_matches_array(labels)
ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
end_time = timeit.default_timer()
logger.info("Average precision: %s (processing time: %f [sec])" % (str(ap), end_time-start_time))

# `%` binds tighter than `+`, so ".npz" is appended after formatting; the
# resulting message is the same either way.
logger.info('Saving output layer to %s' % save_to+".npz")
numpy.savez_compressed(save_to, X)

# dataset = load_swbd_dataset(args.dataset, ratio=1)
# x_train, y_train = dataset[0]
# x_test, y_test = dataset[2]
# N_test=x_test.shape[0]
# N_train=x_train.shape[0]
# print "Applying batch normalization"
# for i in xrange(0, N_train, batchsize):
#     x_batch = x_train[i : i + batchsize]
def train_siamese_cnn(options_dict):
    """Train and save a Siamese CNN using the given options.

    Steps, in order: create `options_dict["model_dir"]` and configure
    logging; pickle `options_dict` to ``options_dict.pkl.gz``; load the data
    through `data_io.load_swbd_same_diff`; build the Siamese CNN with the loss
    named by ``options_dict["loss"]``; compile the Theano
    train/validate/test functions; train with
    `training.train_fixed_epochs_with_validation`; finally run a
    same-different evaluation on the ``"dev"`` layer outputs and write the
    average precision to ``dev_ap.txt`` in the model directory.

    Keys read from `options_dict` in this function: ``model_dir``,
    ``log_to_file`` (optional), ``rnd_seed``, ``dropout_rates``, ``data_dir``,
    ``batch_size``, ``n_same_pairs``, ``conv_layer_specs``,
    ``hidden_layer_specs``, ``loss`` (one of ``"cos_cos2"``, ``"cos_cos"``,
    ``"cos_cos_margin"``, ``"euclidean_margin"``), ``l1_weight``,
    ``l2_weight``, ``learning_rule``, ``n_max_epochs``.

    This is Python 2 code (`print` statement).
    """

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    # The same seed drives both the NumPy generator and, when dropout is
    # enabled, the Theano random stream.
    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)  # -1 selects the highest pickle protocol
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    # 39*200 matches the trailing (39, 200) of `input_shape` used below.
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    # Only the training iterator is built with sample_diff_every_epoch=True;
    # the validation and test iterators pass False.
    train_batch_iterator = BatchIteratorSameDifferent(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorSameDifferent(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorSameDifferent(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=False
        )

    # Setup model

    logger.info("Building Siamese CNN")

    # Symbolic variables
    y = T.ivector("y")  # indicates whether x1 and x2 is same (1) or different (0)
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseCNN(
        rng, x1, x2, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
    # For every loss type, `loss` (used for the gradients) is the dropout
    # variant when dropout is enabled, while `error` is always the plain
    # variant.
    if options_dict["loss"] == "cos_cos2":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos2(y)
        else:
            loss = model.loss_cos_cos2(y)
        error = model.loss_cos_cos2(y)  # doesn't include regularization or dropout
    elif options_dict["loss"] == "cos_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos(y)
        else:
            loss = model.loss_cos_cos(y)
        error = model.loss_cos_cos(y)
    elif options_dict["loss"] == "cos_cos_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos_margin(y)
        else:
            loss = model.loss_cos_cos_margin(y)
        error = model.loss_cos_cos_margin(y)
    elif options_dict["loss"] == "euclidean_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_euclidean_margin(y)
        else:
            loss = model.loss_euclidean_margin(y)
        error = model.loss_euclidean_margin(y)
    else:
        # NOTE(review): this check disappears under `python -O`, in which case
        # `loss` and `error` are undefined below.
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]* model.l2

    # Compile test functions
    same_distance = model.cos_same(y)  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff(y)
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    # Each compiled function takes two index vectors and the pair labels; the
    # indices select the rows of the corresponding data variable for each side
    # of the pair via `givens`.
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            },
        mode=theano_mode,
        )
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        # NOTE(review): this check disappears under `python -O`, in which case
        # `updates` is undefined below.
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Train model

    logger.info("Training Siamese CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    # `record_dict` is not used further in this function.
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(options_dict["model_dir"], "dev", batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    # Only the average precision is written out; `prb` is discarded.
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def train_siamese_triplets_lstm_nn(options_dict):
    """Train and save a Siamese triplet LSTM using the specified options.

    The function creates `options_dict["model_dir"]` if needed, configures
    logging, pickles the options next to the model, trains for a fixed number
    of epochs with validation, and finally writes the average precision of a
    same-different evaluation on the "dev" set to "dev_ap.txt".

    Keys read from `options_dict`: "model_dir", "log_to_file" (optional),
    "rnd_seed", "data_dir", "batch_size", "n_same_pairs", "n_hiddens",
    "hidden_layer_specs", "loss", "dropout_rates", "margin", "l2_weight",
    "learning_rule" and "n_max_epochs".

    Raises
    ------
    AssertionError
        If `options_dict["loss"]` or the learning rule type is not supported.
    """

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if options_dict.get("log_to_file") is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        # Single-argument form prints the same text under Python 2 and 3
        print("Writing: " + log_fn)
        root_logger = logging.getLogger()
        if root_logger.handlers:
            # `basicConfig` does nothing while the root logger still has
            # handlers, so detach the first existing one before reconfiguring
            root_logger.removeHandler(root_logger.handlers[0])
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    try:
        pickle.dump(options_dict, f, -1)
    finally:
        # Release the handle even when pickling fails
        f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff_mask(rng, options_dict["data_dir"])
    train_x, train_mask, train_lengths, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_mask, dev_lengths, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_mask, test_lengths, test_matches_vec, test_labels = datasets[2]

    # Make batch iterators; only the training iterator resamples per epoch
    train_triplet_iterator = BatchIteratorTriplets(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True,
        )
    validate_triplet_iterator = BatchIteratorTriplets(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
        )
    test_triplet_iterator = BatchIteratorTriplets(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
        )

    # Setup model

    logger.info("Building Siamese triplets LSTM")

    # Symbolic variables
    x1 = tensor.tensor3("x1", dtype=THEANOTYPE)
    x2 = tensor.tensor3("x2", dtype=THEANOTYPE)
    x3 = tensor.tensor3("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)
    x1_indices = tensor.ivector("x1_indices")
    x2_indices = tensor.ivector("x2_indices")
    x3_indices = tensor.ivector("x3_indices")

    # Build model
    model = siamese.SiameseTripletBatchLSTMNN(
        rng, x1, x2, x3, m1, m2, m3, n_in=39,
        n_lstm_hiddens=options_dict["n_hiddens"],
        mlp_hidden_specs=options_dict["hidden_layer_specs"],
        )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        # Raised explicitly so the check survives `python -O`
        raise AssertionError("Invalid loss: " + options_dict["loss"])

    # Add regularization
    if options_dict["l2_weight"] > 0.0:
        loss = loss + options_dict["l2_weight"] * model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")

    def triplet_givens(data_x, data_mask, data_lengths):
        """Return the `givens` that feed one data set into the three branches."""
        givens = {}
        branches = (
            (x1, m1, x1_indices), (x2, m2, x2_indices), (x3, m3, x3_indices)
            )
        for x, m, indices in branches:
            # Truncate each branch to the longest sequence selected for it
            max_length = data_lengths[indices].max()
            # NOTE(review): presumably the first axis of `data_x`/`data_mask`
            # indexes items and the second indexes time -- confirm in data_io
            givens[x] = data_x[indices].swapaxes(0, 1)[:max_length]
            givens[m] = data_mask[indices].T[:max_length]
        return givens

    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens=triplet_givens(dev_x, dev_mask, dev_lengths),
        mode=theano_mode,
        )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens=triplet_givens(test_x, test_mask, test_lengths),
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = tensor.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        raise AssertionError("Invalid learning rule: " + learning_rule["type"])

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens=triplet_givens(train_x, train_mask, train_lengths),
        mode=theano_mode,
        )

    # Train model

    logger.info("Training Siamese triplets LSTM")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_triplet_iterator=train_triplet_iterator,
        validate_model=validate_model,
        validate_triplet_iterator=validate_triplet_iterator,
        test_model=test_model,
        test_triplet_iterator=test_triplet_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
x_batch = x_train[i:i + batchsize] model.forward(x_batch, test=False) logger.info("Extracting final layer") save_to = args.save_to X = [] for i in xrange(0, N_test): utt_id = utt_ids_tst[i] x_batch = x_test[i:i + 1] X.append( cuda.to_cpu(F.softmax(model.forward(x_batch, test=True)).data)) X = numpy.asarray(X)[:, 0, :] logger.info("Calcurating average precision") start_time = timeit.default_timer() labels = swbd_utts_to_labels(utt_ids_tst) distances = pdist(X, metric="cosine") matches = samediff.generate_matches_array(labels) ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False]) end_time = timeit.default_timer() logger.info("Average precision: %s (processing time: %f [sec])" % (str(ap), end_time - start_time)) logger.info('Saving output layer to %s' % save_to + ".npz") numpy.savez_compressed(save_to, X) # dataset = load_swbd_dataset(args.dataset, ratio=1) # x_train, y_train = dataset[0] # x_test, y_test = dataset[2] # N_test=x_test.shape[0] # N_train=x_train.shape[0] # print "Applying batch normalization"
def main():
    """Run the same-different evaluation on the embeddings in `args.npz_fn`.

    Embeddings are ordered by key, optionally mean-variance normalised
    (`args.mvn`), and compared pairwise with `args.metric`. The word label of
    an embedding is the part of its key before the first underscore. Mean
    average precision is reported only when `args.mean_ap` is set; average
    precision and precision-recall breakeven are always reported.
    """
    args = check_argv()

    print(datetime.now())

    print("Reading:", args.npz_fn)
    archive = np.load(args.npz_fn)

    print(datetime.now())

    print("Ordering embeddings")
    ids = sorted(archive)
    X = np.array([archive[key] for key in ids])
    print("No. embeddings:", len(ids))
    print("Embedding dimensionality:", X.shape[1])
    if args.mvn:
        # Per-dimension mean and variance normalisation
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    print(datetime.now())

    print("Calculating distances")
    distances = pdist(X, metric=args.metric)

    print(datetime.now())

    print("Getting labels")
    labels = [key.split("_")[0] for key in ids]

    if args.mean_ap:
        print(datetime.now())
        print("Calculating mean average precision")
        mean_ap, mean_prb, _ = samediff.mean_average_precision(distances, labels)
        print("Mean average precision:", mean_ap)
        print("Mean precision-recall breakeven:", mean_prb)

    print(datetime.now())

    print("Calculating average precision")
    matches = samediff.generate_matches_array(labels)
    same_distances = distances[matches == True]
    diff_distances = distances[matches == False]
    ap, prb = samediff.average_precision(same_distances, diff_distances)
    print("Average precision:", ap)
    print("Precision-recall breakeven:", prb)

    print(datetime.now())
def main():
    """Evaluate embeddings on same-different, split by speaker identity.

    Embeddings are read from `args.npz_fn`, ordered by key, optionally
    mean-variance normalised (`args.mvn`), and compared pairwise with
    `args.metric`; the value "kl" selects `scipy.stats.entropy` as the metric.
    Keys are split on underscores: the first field is taken as the word label
    and the second as the speaker. Mean average precision is reported only
    when `args.mean_ap` is set. Average precision and precision-recall
    breakeven are reported both overall and for the same-word
    different-speaker (SWDP) case.
    """
    args = check_argv()

    print(datetime.now())

    print("Reading:", args.npz_fn)
    npz = np.load(args.npz_fn)

    print(datetime.now())

    print("Ordering embeddings")
    ids = sorted(npz)
    X = np.array([npz[utt_id] for utt_id in ids])
    print("No. embeddings:", len(ids))
    print("Embedding dimensionality:", X.shape[1])
    if args.mvn:
        # Per-dimension mean and variance normalisation
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    print(datetime.now())

    print("Calculating distances")
    metric = args.metric
    if metric == "kl":
        # Imported here because it is only needed for this metric
        import scipy.stats
        metric = scipy.stats.entropy
    distances = pdist(X, metric=metric)

    print(datetime.now())

    print("Getting labels and speakers")
    labels = []
    speakers = []
    for utt_id in ids:
        fields = utt_id.split("_")
        labels.append(fields[0])
        speakers.append(fields[1])

    if args.mean_ap:
        print(datetime.now())
        print("Calculating mean average precision")
        mean_ap, mean_prb, ap_dict = samediff.mean_average_precision(
            distances, labels
            )
        print("Mean average precision:", mean_ap)
        print("Mean precision-recall breakeven:", mean_prb)

    print(datetime.now())

    print("Calculating average precision")
    word_matches = samediff.generate_matches_array(labels)
    speaker_matches = samediff.generate_matches_array(speakers)
    # `np.sum` counts in native code; the builtin `sum` would step through
    # every pair in Python
    print("No. same-word pairs:", np.sum(word_matches))
    print("No. same-speaker pairs:", np.sum(speaker_matches))

    sw_ap, sw_prb, swdp_ap, swdp_prb = samediff.average_precision_swdp(
        distances[np.logical_and(word_matches, speaker_matches)],
        distances[np.logical_and(word_matches, speaker_matches == False)],
        distances[word_matches == False]
        )
    print("-" * 79)
    print("Average precision: {:.8f}".format(sw_ap))
    print("Precision-recall breakeven: {:.8f}".format(sw_prb))
    print("SWDP average precision: {:.8f}".format(swdp_ap))
    print("SWDP precision-recall breakeven: {:.8f}".format(swdp_prb))
    print("-" * 79)

    print(datetime.now())
def train_cnn(options_dict):
    """Train and save a word classifier CNN.

    The function creates `options_dict["model_dir"]` if needed, configures
    logging, stores the number of word types in `options_dict["d_out"]`,
    pickles the options next to the model, trains for a fixed number of epochs
    with validation, and finally writes the average precision of a
    same-different evaluation on the "dev" set to "dev_ap.txt".

    Keys read from `options_dict`: "model_dir", "log_to_file" (optional),
    "rnd_seed", "dropout_rates", "data_dir", "min_count", "batch_size",
    "conv_layer_specs", "hidden_layer_specs", "l1_weight", "l2_weight",
    "learning_rule", "n_max_epochs" and "i_layer_eval".

    Raises
    ------
    AssertionError
        If the learning rule type is not supported.
    """

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if options_dict.get("log_to_file") is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        # Single-argument form prints the same text under Python 2 and 3
        print("Writing: " + log_fn)
        root_logger = logging.getLogger()
        if root_logger.handlers:
            # `basicConfig` does nothing while the root logger still has
            # handlers, so detach the first existing one before reconfiguring
            root_logger.removeHandler(root_logger.handlers[0])
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"]
        )
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        """Yield `[i_batch]` for every batch index in turn."""

        def __init__(self, n_batches):
            self.n_batches = n_batches

        def __iter__(self):
            for i_batch in range(self.n_batches):
                yield [i_batch]

    batch_size = options_dict["batch_size"]
    # Floor division: a trailing partial batch is dropped, and the count stays
    # an integer even under true division
    n_train_batches = train_x.get_value(borrow=True).shape[0] // batch_size
    n_dev_batches = dev_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_x.get_value(borrow=True).shape[0] // batch_size
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    try:
        pickle.dump(options_dict, f, -1)
    finally:
        # Release the handle even when pickling fails
        f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building CNN")

    # Symbolic variables
    i_batch = T.lscalar()  # batch index
    x = T.matrix("x")      # flattened data of shape (n_data, d_in)
    y = T.ivector("y")     # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    input_shape = (batch_size, 1, 39, 200)
    model = cnn.CNN(
        rng, x, input_shape, options_dict["conv_layer_specs"],
        options_dict["hidden_layer_specs"], options_dict["d_out"], srng,
        options_dict["dropout_rates"]
        )
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = (
            loss + options_dict["l1_weight"]*model.l1
            + options_dict["l2_weight"]*model.l2
            )

    def batch_givens(data_x, data_y):
        """Return the `givens` selecting batch `i_batch` of one data set."""
        begin = i_batch * batch_size
        end = (i_batch + 1) * batch_size
        return {x: data_x[begin:end], y: data_y[begin:end]}

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens=batch_givens(dev_x, dev_y),
        )
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens=batch_givens(test_x, test_y),
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        # Raised explicitly so the check survives `python -O`
        raise AssertionError("Invalid learning rule: " + learning_rule["type"])

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens=batch_givens(train_x, train_y),
        )

    # Train model

    logger.info("Training CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645,
        i_layer=options_dict["i_layer_eval"]
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def train_siamese_triplets_cnn(options_dict):
    """Train and save a Siamese triplet CNN using the specified options.

    The function creates `options_dict["model_dir"]` if needed, configures
    logging, pickles the options next to the model, trains for a fixed number
    of epochs with validation, and finally writes the average precision of a
    same-different evaluation on the "dev" set to "dev_ap.txt".

    Keys read from `options_dict`: "model_dir", "log_to_file" (optional),
    "rnd_seed", "dropout_rates", "data_dir", "batch_size", "n_same_pairs",
    "conv_layer_specs", "hidden_layer_specs", "loss", "margin", "l1_weight",
    "l2_weight", "learning_rule" and "n_max_epochs".

    Raises
    ------
    AssertionError
        If `options_dict["loss"]` or the learning rule type is not supported.
    """

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if options_dict.get("log_to_file") is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        # Single-argument form prints the same text under Python 2 and 3
        print("Writing: " + log_fn)
        root_logger = logging.getLogger()
        if root_logger.handlers:
            # `basicConfig` does nothing while the root logger still has
            # handlers, so detach the first existing one before reconfiguring
            root_logger.removeHandler(root_logger.handlers[0])
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    try:
        pickle.dump(options_dict, f, -1)
    finally:
        # Release the handle even when pickling fails
        f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators; only the training iterator resamples per epoch
    train_batch_iterator = BatchIteratorTriplets(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorTriplets(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorTriplets(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )

    # Setup model

    logger.info("Building Siamese triplets CNN")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x3 = T.matrix("x3")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")
    x3_indices = T.ivector("x3_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletCNN(
        rng, x1, x2, x3, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        # Raised explicitly so the check survives `python -O`
        raise AssertionError("Invalid loss: " + options_dict["loss"])

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = (
            loss + options_dict["l1_weight"] * model.l1
            + options_dict["l2_weight"] * model.l2
            )

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")

    def triplet_givens(data_x):
        """Return the `givens` that feed one data set into the three branches."""
        return {
            x1: data_x[x1_indices],
            x2: data_x[x2_indices],
            x3: data_x[x3_indices],
            }

    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens=triplet_givens(dev_x),
        mode=theano_mode,
        )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens=triplet_givens(test_x),
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        raise AssertionError("Invalid learning rule: " + learning_rule["type"])

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens=triplet_givens(train_x),
        mode=theano_mode,
        )

    # Train model

    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def main():
    """Plot how cosine distances between embeddings relate to their labels.

    Embeddings are read from `args.npz_fn`, ordered by key and mean-variance
    normalised. Keys are split on underscores: the first field is taken as the
    word label and the second as the speaker. Bar plots (mean with standard
    deviation as error bar) are drawn for matching versus non-matching words
    and, among same-word pairs, for matching versus non-matching speakers.
    When `args.pronunciation` is given, distances are also plotted against the
    edit distance between the pronunciations read from
    "lists/<args.pronunciation>/dev.prons".
    """
    args = check_argv()

    print("Reading:", args.npz_fn)
    embeddings = np.load(args.npz_fn)

    print("Ordering embeddings:")
    X = []
    utt_keys = []
    labels = []
    speakers = []
    for utt_key in tqdm(sorted(embeddings)):
        utt_keys.append(utt_key)
        X.append(embeddings[utt_key])
        fields = utt_key.split("_")
        labels.append(fields[0])
        speakers.append(fields[1])
    X = np.array(X)
    print("No. embeddings:", X.shape[0])
    print("Embedding dimensionality:", X.shape[1])

    # Normalise
    X = (X - X.mean(axis=0)) / X.std(axis=0)

    print("Calculating distances")
    distances = pdist(X, metric="cosine")

    # Plot: Matching words
    print("Getting word matches")
    word_matches = samediff.generate_matches_array(labels)
    print("Total no. pairs:", word_matches.shape[0])
    # `np.sum` counts in native code; the builtin `sum` would step through
    # every pair in Python
    print("No. same-word pairs:", np.sum(word_matches))
    same_word_distances = distances[word_matches == True]
    diff_word_distances = distances[word_matches == False]
    distances_pos_avg = np.mean(same_word_distances)
    distances_neg_avg = np.mean(diff_word_distances)
    distances_pos_std = np.std(same_word_distances)
    distances_neg_std = np.std(diff_word_distances)
    plt.figure()
    plt.bar(
        [0, 1], [distances_neg_avg, distances_pos_avg],
        yerr=[distances_neg_std, distances_pos_std]
        )
    plt.xticks([0, 1], ("No", "Yes"))
    plt.xlabel("Matching words")
    plt.ylabel("Cosine distance")
    plt.ylim([0, 1.2])

    # Plot: Same speakers (restricted to same-word pairs)
    print("Getting speaker matches")
    speaker_matches = samediff.generate_matches_array(speakers)
    print("No. same-speaker pairs:", np.sum(speaker_matches))
    same_speaker_distances = distances[
        np.logical_and(word_matches, speaker_matches)
        ]
    diff_speaker_distances = distances[
        np.logical_and(word_matches, speaker_matches == False)
        ]
    distances_pos_avg = np.mean(same_speaker_distances)
    distances_neg_avg = np.mean(diff_speaker_distances)
    distances_pos_std = np.std(same_speaker_distances)
    distances_neg_std = np.std(diff_speaker_distances)
    plt.figure()
    plt.bar(
        [0, 1], [distances_neg_avg, distances_pos_avg],
        yerr=[distances_neg_std, distances_pos_std]
        )
    plt.xticks([0, 1], ("No", "Yes"))
    plt.xlabel("Matching speakers")
    plt.ylabel("Cosine distance")
    plt.ylim([0, 1.2])
    plt.title("Distances between same-word pairs")

    # Plot: Edit distances
    if args.pronunciation is not None:

        # Pronunciations
        pron_fn = path.join("lists", args.pronunciation, "dev.prons")
        print("Reading:", pron_fn)
        pronunciations = read_pronunciations(pron_fn)
        pron_labels = [pronunciations[utt_key] for utt_key in utt_keys]

        # Get distances
        print("Getting edit distances:")
        edit_distances = editdistance_array(pron_labels)

        # Plot distances
        edits = sorted(set(edit_distances))
        averages = []
        stds = []
        for edit in edits:
            distances_at_edit = distances[edit_distances == edit]
            averages.append(np.mean(distances_at_edit))
            stds.append(np.std(distances_at_edit))
        plt.figure()
        plt.bar(edits, averages, yerr=stds)
        plt.ylim([0, 1.2])
        plt.xlabel("Phone edit distance")
        plt.ylabel("Cosine distance")

    plt.show()