def train_siamese_triplets_cnn(options_dict):
    """Train and save a Siamese triplet CNN using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
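    # Each iterator below (presumably) yields index triplets (x1, x2, x3) per
    # batch, where x1 and x2 are tokens of the same word type and x3 is a
    # token of a different type, as required by the triplet hinge loss below.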
    train_batch_iterator = BatchIteratorTriplets(
        rng,
        train_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True)
    validate_batch_iterator = BatchIteratorTriplets(
        rng,
        dev_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False)
    test_batch_iterator = BatchIteratorTriplets(
        rng,
        test_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False)

    # Setup model

    logger.info("Building Siamese triplets CNN")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x3 = T.matrix("x3")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")
    x3_indices = T.ivector("x3_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletCNN(
        rng,
        x1,
        x2,
        x3,
        input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
    )
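    # The triplet hinge cosine loss (roughly) encourages the cosine distance
    # between the embeddings of x1 and x2 (same word) to be at least `margin`
    # smaller than that between x1 and x3 (different words), i.e. something
    # like max(0, margin + cos_dist(x1, x2) - cos_dist(x1, x3)); the exact
    # form is defined in the siamese module.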
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        # Error excludes regularization and dropout
        error = model.loss_hinge_cos(options_dict["margin"])
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict[
            "l2_weight"] * model.l2

    # Compile test functions
    # Track the distances of same and different pairs separately
    same_distance = model.cos_same()
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            x3: dev_x[x3_indices],
        },
        mode=theano_mode,
    )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            x3: test_x[x3_indices],
        },
        mode=theano_mode,
    )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
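    # Adadelta rescales each gradient by a ratio of running RMS estimates of
    # past updates and past gradients (controlled by rho and epsilon), so no
    # global learning rate is needed; the momentum rule is plain SGD with a
    # momentum term. Both are implemented in the training module.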
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            x3: train_x[x3_indices],
        },
        mode=theano_mode,
    )

    # Train model

    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev",
        batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
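    # Same-different evaluation: every pair of word tokens is ranked by the
    # cosine distance between their embeddings, and the average precision of
    # retrieving same-word pairs is reported (higher is better); prb is
    # presumably the precision-recall breakeven point.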
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches],
                                         distances[~matches])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
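

# Example usage of `train_siamese_triplets_cnn` (a sketch with hypothetical
# values; the contents of "conv_layer_specs" and "hidden_layer_specs" follow
# whatever siamese.SiameseTripletCNN expects):
#
#     options_dict = {
#         "model_dir": "models/siamese_triplets_cnn",   # hypothetical path
#         "data_dir": "data/swbd",                      # hypothetical path
#         "rnd_seed": 42,
#         "batch_size": 1024,
#         "n_same_pairs": 100000,
#         "n_max_epochs": 20,
#         "loss": "hinge_cos",
#         "margin": 0.15,
#         "l1_weight": 0.0,
#         "l2_weight": 0.0,
#         "dropout_rates": None,
#         "conv_layer_specs": [...],      # per-layer spec dicts
#         "hidden_layer_specs": [...],    # per-layer spec dicts
#         "learning_rule": {"type": "adadelta", "rho": 0.9, "epsilon": 1e-6},
#         "log_to_file": True,
#     }
#     train_siamese_triplets_cnn(options_dict)

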
def train_siamese_triplets_lstm_nn(options_dict):
    """Train and save a Siamese triplet LSTM using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff_mask(rng, options_dict["data_dir"])
    train_x, train_mask, train_lengths, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_mask, dev_lengths, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_mask, test_lengths, test_matches_vec, test_labels = datasets[2]

    # Make batch iterators
    train_triplet_iterator = BatchIteratorTriplets(
        rng,
        train_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True,
    )
    validate_triplet_iterator = BatchIteratorTriplets(
        rng,
        dev_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
    )
    test_triplet_iterator = BatchIteratorTriplets(
        rng,
        test_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
    )

    # Setup model

    logger.info("Building Siamese triplets LSTM")

    # Symbolic variables
    x1 = tensor.tensor3("x1", dtype=THEANOTYPE)
    x2 = tensor.tensor3("x2", dtype=THEANOTYPE)
    x3 = tensor.tensor3("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)
    x1_indices = tensor.ivector("x1_indices")
    x2_indices = tensor.ivector("x2_indices")
    x3_indices = tensor.ivector("x3_indices")

    # Build model
    model = siamese.SiameseTripletBatchLSTMNN(
        rng,
        x1,
        x2,
        x3,
        m1,
        m2,
        m3,
        n_in=39,
        n_lstm_hiddens=options_dict["n_hiddens"],
        mlp_hidden_specs=options_dict["hidden_layer_specs"],
    )
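    # The model presumably encodes each padded sequence with stacked LSTMs
    # (layer sizes given by "n_hiddens") and passes the final state through an
    # MLP built from "hidden_layer_specs", sharing parameters across the three
    # inputs.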
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l2_weight"] > 0.0:
        loss = loss + options_dict["l2_weight"] * model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
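    # The shared data appear to be stored as (n_data, n_frames, n_feats); the
    # givens below slice out a batch, swap to time-major order
    # (n_frames, batch, n_feats) for the LSTM, and truncate the sequences and
    # masks to the longest length in the batch.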

    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices].swapaxes(0, 1)[: dev_lengths[x1_indices].max()],
            m1: dev_mask[x1_indices].T[: dev_lengths[x1_indices].max()],
            x2: dev_x[x2_indices].swapaxes(0, 1)[: dev_lengths[x2_indices].max()],
            m2: dev_mask[x2_indices].T[: dev_lengths[x2_indices].max()],
            x3: dev_x[x3_indices].swapaxes(0, 1)[: dev_lengths[x3_indices].max()],
            m3: dev_mask[x3_indices].T[: dev_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices].swapaxes(0, 1)[: test_lengths[x1_indices].max()],
            m1: test_mask[x1_indices].T[: test_lengths[x1_indices].max()],
            x2: test_x[x2_indices].swapaxes(0, 1)[: test_lengths[x2_indices].max()],
            m2: test_mask[x2_indices].T[: test_lengths[x2_indices].max()],
            x3: test_x[x3_indices].swapaxes(0, 1)[: test_lengths[x3_indices].max()],
            m3: test_mask[x3_indices].T[: test_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )

    # Gradients and training updates
    parameters = model.parameters
    gradients = tensor.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients, learning_rule["rho"], learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
        )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices].swapaxes(0, 1)[: train_lengths[x1_indices].max()],
            m1: train_mask[x1_indices].T[: train_lengths[x1_indices].max()],
            x2: train_x[x2_indices].swapaxes(0, 1)[: train_lengths[x2_indices].max()],
            m2: train_mask[x2_indices].T[: train_lengths[x2_indices].max()],
            x3: train_x[x3_indices].swapaxes(0, 1)[: train_lengths[x3_indices].max()],
            m3: train_mask[x3_indices].T[: train_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )

    # Train model

    logger.info("Training Siamese triplets LSTM")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_triplet_iterator=train_triplet_iterator,
        validate_model=validate_model,
        validate_triplet_iterator=validate_triplet_iterator,
        test_model=test_model,
        test_triplet_iterator=test_triplet_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
    )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches], distances[~matches])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")


def train_mlp(options_dict):
    """Train and save a word classifier MLP."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
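    # BatchIterator simply yields successive batch indices; the Theano
    # functions below use the index to slice a contiguous batch out of the
    # shared variables.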
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches

        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]

    n_train_batches = train_x.get_value(
        borrow=True).shape[0] // options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(
        borrow=True).shape[0] // options_dict["batch_size"]
    n_test_batches = test_x.get_value(
        borrow=True).shape[0] // options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building MLP")

    # Symbolic variables
    i_batch = T.lscalar()  # batch index
    x = T.matrix("x")  # flattened data of shape (n_data, d_in)
    y = T.ivector("y")  # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    model = mlp.MLP(rng, x, d_in, options_dict["d_out"],
                    options_dict["hidden_layer_specs"], srng,
                    options_dict["dropout_rates"])
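    # The loss is the negative log-likelihood over the word-type targets (the
    # dropout variant when dropout is enabled); model.errors(y) presumably
    # gives the deterministic classification error used for monitoring.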
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict[
            "l2_weight"] * model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x:
            dev_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                  options_dict["batch_size"]],
            y:
            dev_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                  options_dict["batch_size"]]
        })
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x:
            test_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                   options_dict["batch_size"]],
            y:
            test_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                   options_dict["batch_size"]]
        })

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x:
            train_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                    options_dict["batch_size"]],
            y:
            train_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                    options_dict["batch_size"]]
        },
    )

    # Train model

    logger.info("Training MLP")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"],
        "dev",
        batch_size=645,
        i_layer=options_dict["i_layer_eval"])
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches],
                                         distances[~matches])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")


def train_cnn(options_dict):
    """Train and save a word classifier CNN."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None


    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches
        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]
    n_train_batches = train_x.get_value(borrow=True).shape[0] // options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(borrow=True).shape[0] // options_dict["batch_size"]
    n_test_batches = test_x.get_value(borrow=True).shape[0] // options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))


    # Setup model

    logger.info("Building CNN")

    # Symbolic variables
    i_batch = T.lscalar()   # batch index
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    y = T.ivector("y")      # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = cnn.CNN(
        rng, x, input_shape, options_dict["conv_layer_specs"],
        options_dict["hidden_layer_specs"], options_dict["d_out"], srng,
        options_dict["dropout_rates"] 
        )
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict["l2_weight"] * model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: dev_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: dev_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            }
        )
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: test_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: test_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            }
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x: train_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: train_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            },
        )


    # Train model

    logger.info("Training CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )


    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645, i_layer=options_dict["i_layer_eval"]
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches], distances[~matches])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")


def train_siamese_cnn(options_dict):
    """Train and save a Siamese CNN using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))


    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
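    # Each iterator below (presumably) yields (x1_indices, x2_indices, y)
    # batches: index vectors for the two sides of each pair plus a vector y
    # marking same (1) or different (0) pairs.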
    train_batch_iterator = BatchIteratorSameDifferent(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorSameDifferent(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorSameDifferent(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )


    # Setup model

    logger.info("Building Siamese CNN")

    # Symbolic variables
    y = T.ivector("y")      # indicates whether x1 and x2 are the same (1) or different (0)
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseCNN(
        rng, x1, x2, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
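    # All of these losses operate on the pair label y: same pairs are pulled
    # together and different pairs pushed apart, either in cosine distance
    # (the "cos_*" variants) or in Euclidean distance with a margin; the exact
    # forms are defined in the siamese module.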
    if options_dict["loss"] == "cos_cos2":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos2(y)
        else:
            loss = model.loss_cos_cos2(y)
        error = model.loss_cos_cos2(y)  # doesn't include regularization or dropout
    elif options_dict["loss"] == "cos_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos(y)
        else:
            loss = model.loss_cos_cos(y)
        error = model.loss_cos_cos(y)
    elif options_dict["loss"] == "cos_cos_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos_margin(y)
        else:
            loss = model.loss_cos_cos_margin(y)
        error = model.loss_cos_cos_margin(y)
    elif options_dict["loss"] == "euclidean_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_euclidean_margin(y)
        else:
            loss = model.loss_euclidean_margin(y)
        error = model.loss_euclidean_margin(y)
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict["l2_weight"] * model.l2

    # Compile test functions
    same_distance = model.cos_same(y)  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff(y)
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            },
        mode=theano_mode,
        )
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            },
        mode=theano_mode,
        )


    # Train model

    logger.info("Training Siamese CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )


    # Extrinsic evaluation

    # Pass data through model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(options_dict["model_dir"], "dev", batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches], distances[~matches])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")