def __init__(self,
                 context: mx.context.Context,
                 inputs: str,
                 references: str,
                 model: str,
                 max_input_len: Optional[int] = None,
                 beam_size: int = C.DEFAULT_BEAM_SIZE,
                 bucket_width_source: int = 10,
                 length_penalty_alpha: float = 1.0,
                 length_penalty_beta: float = 0.0,
                 softmax_temperature: Optional[float] = None,
                 max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
                 ensemble_mode: str = 'linear',
                 sample_size: int = -1,
                 random_seed: int = 42) -> None:
        self.context = context
        self.max_input_len = max_input_len
        self.max_output_length_num_stds = max_output_length_num_stds
        self.ensemble_mode = ensemble_mode
        self.beam_size = beam_size
        self.batch_size = 16
        self.bucket_width_source = bucket_width_source
        self.length_penalty_alpha = length_penalty_alpha
        self.length_penalty_beta = length_penalty_beta
        self.softmax_temperature = softmax_temperature
        self.model = model
        with data_io.smart_open(inputs) as inputs_fin, data_io.smart_open(references) as references_fin:
            input_sentences = inputs_fin.readlines()
            target_sentences = references_fin.readlines()
            utils.check_condition(len(input_sentences) == len(target_sentences),
                                  "Number of input and reference sentences does not match")
            if sample_size <= 0:
                sample_size = len(input_sentences)
            if sample_size < len(input_sentences):
                # custom random number generator to guarantee the same samples across runs in order to be able to
                # compare metrics across independent runs
                random_gen = random.Random(random_seed)
                self.input_sentences, self.target_sentences = zip(
                    *random_gen.sample(list(zip(input_sentences, target_sentences)),
                                       sample_size))
            else:
                self.input_sentences, self.target_sentences = input_sentences, target_sentences

        logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                    max_input_len if max_input_len is not None else -1,
                    beam_size, model, len(self.input_sentences))

        with data_io.smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
                data_io.smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
            for s in self.target_sentences:
                trg_out.write(s)
            for s in self.input_sentences:
                src_out.write(s)
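# The deterministic subsampling in __init__ above can be isolated into a small standalone
# sketch (not part of the original class): a random.Random instance with a fixed seed
# guarantees that the same subset of sentence pairs is drawn on every run, so metrics
# computed on the subsample stay comparable across independent runs. `random` is assumed
# to be imported as in the surrounding module.
def sample_parallel_pairs(inputs, references, sample_size, seed=42):
    """Deterministically sample aligned (input, reference) pairs."""
    assert len(inputs) == len(references)
    if sample_size <= 0 or sample_size >= len(inputs):
        return list(inputs), list(references)
    random_gen = random.Random(seed)  # fixed seed -> identical sample across runs
    sampled_pairs = random_gen.sample(list(zip(inputs, references)), sample_size)
    sampled_inputs, sampled_references = zip(*sampled_pairs)
    return list(sampled_inputs), list(sampled_references)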
def load_siamese_cnn(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseCNN(
        rng, x1, x2, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        dropout_rates=None,  # dropout is not performed after training
        )

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
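# Hedged usage sketch for the load_* helpers above (the model directory is hypothetical;
# pickle and path are assumed to be imported as elsewhere in this code): in practice the
# options_dict is the one pickled at training time, so it can simply be re-loaded from
# the model directory before rebuilding the network from model.pkl.gz.
def load_trained_siamese_cnn(model_dir):
    options_dict_fn = path.join(model_dir, "options_dict.pkl.gz")
    f = data_io.smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    f.close()
    options_dict["model_dir"] = model_dir  # in case the model was moved since training
    return load_siamese_cnn(options_dict)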
def load_siamese_triplets_lstm_minibatch(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    x2 = tensor.matrix("x2", dtype=THEANOTYPE)
    x3 = tensor.matrix("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    model = siamese.SiameseTripleBatchLSTM(
        rng,
        x1,
        x2,
        x3,
        m1,
        m2,
        m3,
        n_in=39,
        n_hiddens=options_dict["n_hiddens"],
        output_type=options_dict["sequence_output_type"])

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def load_siamese_triplets_lstm_nn_minibatch(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    x2 = tensor.matrix("x2", dtype=THEANOTYPE)
    x3 = tensor.matrix("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    model = siamese.SiameseTripleBatchLSTM(rng, x1, x2, x3, m1, m2, m3, n_in=39, n_hiddens=options_dict["n_hiddens"])

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def load_siamese_triplets_cnn(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x3 = T.matrix("x3")

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletCNN(
        rng,
        x1,
        x2,
        x3,
        input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        dropout_rates=None,  # dropout is not performed after training
    )

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def load_siamese_triplets_lstm(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    x2 = tensor.matrix("x2", dtype=THEANOTYPE)
    x3 = tensor.matrix("x3", dtype=THEANOTYPE)

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    if "sequence_output_type" in options_dict:
        sequence_output_type = options_dict["sequence_output_type"]
    else:
        sequence_output_type = "last"
    model = siamese.SiameseTripletLSTM(
        rng, x1, x2, x3, n_in=39, n_hiddens=options_dict["n_hiddens"], output_type=sequence_output_type)
    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def load_cnn(options_dict):
    
    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    y = T.ivector("y")      # labels

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = cnn.CNN(
        rng, x, input_shape, options_dict["conv_layer_specs"],
        options_dict["hidden_layer_specs"], options_dict["d_out"],
        dropout_rates=None,  # dropout is not performed after training
        )

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def load_mlp(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x = T.matrix("x")  # flattened data of shape (n_data, d_in)
    y = T.ivector("y")  # labels

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    d_in = 39 * 200
    model = mlp.MLP(
        rng,
        x,
        d_in,
        options_dict["d_out"],
        options_dict["hidden_layer_specs"],
        dropout_rates=None  # dropout is not performed after training
    )

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def load_siamese_triplets_lstm_nn(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    x2 = tensor.matrix("x2", dtype=THEANOTYPE)
    x3 = tensor.matrix("x3", dtype=THEANOTYPE)

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    model = siamese.SiameseTripleLSTMNN(
        rng,
        x1,
        x2,
        x3,
        n_in=39,
        n_hiddens=options_dict["n_hiddens"],
        mlp_hidden_specs=options_dict["hidden_layer_specs"])

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def read_and_translate(translator: inference.Translator, output_handler: output_handler.OutputHandler,
                       chunk_size: Optional[int], source: Optional[str] = None,
                       reference: Optional[str] = None,
                       dictionary: Optional[dict] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Path to a file which will be translated line by line; if None, stdin is used.
    :param reference: Path to a reference file.
    :param dictionary: Dictionary to constrain translation.
    """
    source_data = sys.stdin if source is None else data_io.smart_open(source)
    reference_data = None if reference is None else data_io.smart_open(reference)

    batch_size = translator.batch_size
    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a degregation of translation speed. Consider choosing a larger chunk size." % (chunk_size,
                                                                                                           batch_size))

    logger.info("Translating...")

    total_time, total_lines = 0.0, 0
    for chunk, reference_chunk in itertools.zip_longest(
            grouper(source_data, chunk_size),
            grouper(reference_data, chunk_size) if reference_data is not None else [None]):
        chunk_time = translate(output_handler, chunk, translator, total_lines, reference_chunk)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
def main():
    args = check_argv()

    model_fn = path.join(args.model_dir, "model.pkl.gz")
    options_dict_fn = path.join(args.model_dir, "options_dict.pkl.gz")
    record_dict_fn = path.join(args.model_dir, "record_dict.pkl.gz")

    print "Reading:", options_dict_fn
    f = smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    f.close()

    print "Reading:", record_dict_fn
    f = smart_open(record_dict_fn)
    record_dict = pickle.load(f)
    f.close()

    plotting.plot_record_dict(record_dict)

    model = train_mlp.load_mlp(options_dict)

    # Plot some filters
    analyze_layer = 0
    W = model.layers[analyze_layer].W.get_value(borrow=True).T
    plot_fn = path.join(args.model_dir,
                        "filters.layer_" + str(analyze_layer) + ".png")
    image = Image.fromarray(
        plotting.tile_images(W, image_shape=(39, 200), tile_shape=(5, 6)))
    print("Saving: " + plot_fn)
    image.save(plot_fn)
    plt.figure()
    plt.imshow(image, cmap=plt.cm.Greys_r, interpolation="nearest")

    analyze_layer = -1
    W = model.layers[analyze_layer].W.get_value(borrow=True)
    plot_fn = path.join(args.model_dir,
                        "filters.layer_" + str(analyze_layer) + ".png")
    image = Image.fromarray(plotting.array_to_pixels(W))
    image.save(plot_fn)
    print("Saving: " + plot_fn)
    plt.figure()
    plt.imshow(image, cmap=plt.cm.Greys_r, interpolation="nearest")
    # plt.axis("off")

    plt.show()
def main():
    args = check_argv()

    model_fn = path.join(args.model_dir, "model.pkl.gz")
    options_dict_fn = path.join(args.model_dir, "options_dict.pkl.gz")
    record_dict_fn = path.join(args.model_dir, "record_dict.pkl.gz")

    print "Reading:", options_dict_fn
    f = smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    f.close()

    print "Reading:", record_dict_fn
    f = smart_open(record_dict_fn)
    record_dict = pickle.load(f)
    f.close()

    plotting.plot_record_dict(record_dict)

    model = train_mlp.load_mlp(options_dict)

    # Plot some filters
    analyze_layer = 0
    W = model.layers[analyze_layer].W.get_value(borrow=True).T
    plot_fn = path.join(args.model_dir, "filters.layer_" + str(analyze_layer) + ".png")
    image = Image.fromarray(plotting.tile_images(
        W,
        image_shape=(39, 200), tile_shape=(5, 6)
        ))
    print("Saving: " + plot_fn)
    image.save(plot_fn)
    plt.figure()
    plt.imshow(image, cmap=plt.cm.Greys_r, interpolation="nearest")

    analyze_layer = -1
    W = model.layers[analyze_layer].W.get_value(borrow=True)
    plot_fn = path.join(args.model_dir, "filters.layer_" + str(analyze_layer) + ".png")
    image = Image.fromarray(plotting.array_to_pixels(W))
    image.save(plot_fn)
    print("Saving: " + plot_fn)
    plt.figure()
    plt.imshow(image, cmap=plt.cm.Greys_r, interpolation="nearest")
    # plt.axis("off")

    plt.show()
def test_multisaveload():
    rng = numpy.random.RandomState(0)
    x = tensor.matrix("x", dtype=THEANOTYPE)
    n_in = 3
    n_hiddens = [10, 10]
    multi_lstm = MultiLayerLSTM(rng, x, n_in, n_hiddens, output_type="last")
    f0 = theano.function(inputs=[x], outputs=multi_lstm.output)
    save_file = data_io.smart_open("model.pkl.gz", "wb")
    multi_lstm.save(save_file)
    save_file.close()

    # Build a second model with the same architecture and load the saved parameters into it
    multi_lstm1 = MultiLayerLSTM(rng, x, n_in, n_hiddens, output_type="last")
    load_file = data_io.smart_open("model.pkl.gz", "rb")
    multi_lstm1.load(load_file)
    load_file.close()
    f1 = theano.function(inputs=[x], outputs=multi_lstm1.output)

    # After loading, both models should map the same input to the same output
    n_data = 10
    x0 = rng.randn(n_data, n_in).astype(THEANOTYPE)
    numpy.testing.assert_array_almost_equal(f0(x0), f1(x0))
def test_saveload():
    rng = numpy.random.RandomState(0)
    x = tensor.matrix("x", dtype=THEANOTYPE)
    n_in = 3
    n_hidden = 10
    lstm = LSTM(rng, x, n_in, n_hidden, output_type="last")
    n_data = 10
    x0 = rng.randn(n_data, n_in).astype(THEANOTYPE)

    f0 = theano.function(inputs=[x], outputs=lstm.output)
    h0 = f0(x0)
    save_file = data_io.smart_open("model.pkl.gz", "wb")
    lstm.save(save_file)
    save_file.close()

    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    lstm1 = LSTM(rng, x1, n_in, n_hidden, output_type="last")
    load_file = data_io.smart_open("model.pkl.gz", "rb")
    f1 = theano.function(inputs=[x1], outputs=lstm1.output)
    h1 = f1(x0)
    lstm1.load(load_file)
    load_file.close()
    h2 = f1(x0)
    numpy.testing.assert_array_almost_equal(h0, h2)
def build_from_paths(paths: List[str],
                     num_words: int = 50000,
                     min_count: int = 1) -> Dict[str, int]:
    """
    Creates a vocabulary from paths to files in sentence-per-line format. A sentence is just a whitespace-delimited
    list of tokens. Note that special symbols like the beginning-of-sentence (BOS) symbol will be added to the
    vocabulary.

    :param paths: List of paths to files with one sentence per line.
    :param num_words: Maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :return: Word-to-id mapping.
    """
    with ExitStack() as stack:
        logger.info("Building vocabulary from dataset(s): %s", paths)
        files = (stack.enter_context(smart_open(path)) for path in paths)
        return build_vocab(chain(*files), num_words, min_count)
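# Hedged usage sketch for build_from_paths: write a tiny tokenized corpus to a temporary
# file and build a vocabulary from it (the file contents and thresholds are illustrative).
def build_toy_vocab():
    """Illustrative only: build a vocabulary from a tiny temporary corpus."""
    import tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write("the cat sat on the mat\n")
        tmp.write("the dog sat on the log\n")
        corpus_path = tmp.name
    # Special symbols such as BOS are added on top of the observed tokens.
    return build_from_paths([corpus_path], num_words=10, min_count=1)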
    def decode_and_evaluate(self,
                            checkpoint: Optional[int] = None,
                            output_name: str = os.devnull) -> Dict[str, float]:
        """
        Decodes data set and evaluates given a checkpoint.

        :param checkpoint: Checkpoint to load parameters from.
        :param output_name: Filename to write translations to. Defaults to /dev/null.
        :return: Mapping of metric names to scores.
        """
        models, vocab_source, vocab_target = inference.load_models(self.context,
                                                                   self.max_input_len,
                                                                   self.beam_size,
                                                                   self.batch_size,
                                                                   [self.model],
                                                                   [checkpoint],
                                                                   softmax_temperature=self.softmax_temperature,
                                                                   max_output_length_num_stds=self.max_output_length_num_stds)
        translator = inference.Translator(self.context,
                                          self.ensemble_mode,
                                          self.bucket_width_source,
                                          inference.LengthPenalty(self.length_penalty_alpha, self.length_penalty_beta),
                                          models,
                                          vocab_source,
                                          vocab_target)
        trans_wall_time = 0.0
        translations = []
        with data_io.smart_open(output_name, 'w') as output:
            handler = output_handler.StringOutputHandler(output)
            tic = time.time()
            trans_inputs = [translator.make_input(i, line) for i, line in enumerate(self.input_sentences)]
            trans_outputs = translator.translate(trans_inputs)
            trans_wall_time = time.time() - tic
            for trans_input, trans_output in zip(trans_inputs, trans_outputs):
                handler.handle(trans_input, trans_output)
                translations.append(trans_output.translation)
        avg_time = trans_wall_time / len(self.input_sentences)

        # TODO(fhieber): eventually add more metrics (METEOR etc.)
        return {C.BLEU_VAL: evaluate.raw_corpus_bleu(hypotheses=translations,
                                                     references=self.target_sentences,
                                                     offset=0.01),
                C.CHRF_VAL: chrf.corpus_chrf(hypotheses=translations,
                                             references=self.target_sentences,
                                             trim_whitespaces=True),
                C.AVG_TIME: avg_time}
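# Hedged usage sketch for the checkpoint evaluation flow above (the class name
# CheckpointDecoder is taken from the log message in __init__, and all argument values
# are illustrative):
def evaluate_checkpoint(model_dir, checkpoint, dev_src, dev_trg):
    """Illustrative only: score a single checkpoint on a held-out set."""
    decoder = CheckpointDecoder(context=mx.cpu(),
                                inputs=dev_src,
                                references=dev_trg,
                                model=model_dir,
                                sample_size=500)  # decode a fixed 500-sentence sample
    metrics = decoder.decode_and_evaluate(checkpoint=checkpoint)
    return metrics  # e.g. metrics[C.BLEU_VAL], metrics[C.CHRF_VAL], metrics[C.AVG_TIME]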
def load_lstm_mlp(options_dict):
    
    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    model = lstm.MultiLayerLSTMMLP(
        rng, x, 39, options_dict["d_out"], options_dict["n_hiddens"],
        options_dict["hidden_layer_specs"], output_type="last", prefix="lstms")

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def lexicon_iterator(path: str,
                     vocab_source: Dict[str, int],
                     vocab_target: Dict[str, int]) -> Generator[Tuple[int, int, float], None, None]:
    """
    Yields lines from a translation table of format: src, trg, logprob.

    :param path: Path to lexicon file.
    :param vocab_source: Source vocabulary.
    :param vocab_target: Target vocabulary.
    :return: Generator returning tuples (src_id, trg_id, prob).
    """
    assert C.UNK_SYMBOL in vocab_source
    assert C.UNK_SYMBOL in vocab_target
    src_unk_id = vocab_source[C.UNK_SYMBOL]
    trg_unk_id = vocab_target[C.UNK_SYMBOL]
    with smart_open(path) as fin:
        for line in fin:
            src, trg, logprob = line.rstrip("\n").split("\t")
            prob = np.exp(float(logprob))
            src_id = vocab_source.get(src, src_unk_id)
            trg_id = vocab_target.get(trg, trg_unk_id)
            yield src_id, trg_id, prob
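# Hedged sketch built on lexicon_iterator: materialise the generator into a dense
# |V_src| x |V_trg| probability matrix (the max() is a choice made here, not taken from
# the original code, to resolve duplicate ids such as words mapped to UNK).
def load_lexicon_matrix(path: str,
                        vocab_source: Dict[str, int],
                        vocab_target: Dict[str, int]) -> np.ndarray:
    lex = np.zeros((len(vocab_source), len(vocab_target)), dtype=np.float32)
    for src_id, trg_id, prob in lexicon_iterator(path, vocab_source, vocab_target):
        lex[src_id, trg_id] = max(lex[src_id, trg_id], prob)
    return lex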
def load_siamese_triplets_convlstm_minibatch(options_dict):

    model_fn = path.join(options_dict["model_dir"], "model.pkl.gz")

    # Symbolic variables
    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    x2 = tensor.matrix("x2", dtype=THEANOTYPE)
    x3 = tensor.matrix("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Build model
    input_shape = (options_dict["batch_size"], 1, 200, 39)
    model = siamese.SiameseTripletBatchConvLSTM(
        rng, x1, x2, x3, m1, m2, m3, input_shape,
        filter_shape=options_dict["filter_shape"],
        n_lstm_hiddens=options_dict["n_hiddens"],
        n_outputs=options_dict["embedding_dim"],
        output_type=options_dict["sequence_output_type"],
        srng=srng, dropout=options_dict["dropout_rates"],
        use_dropout_regularization=options_dict["use_dropout_regularization"],
        stabilize_activations=options_dict["stabilize_activations"]
    )

    # Load saved parameters
    logger.info("Reading: " + model_fn)
    f = data_io.smart_open(model_fn)
    model.load(f)
    f.close()

    return model
def get_output_handler(output_type: str,
                       output_fname: Optional[str],
                       sure_align_threshold: float) -> 'OutputHandler':
    """

    :param output_type: Type of output handler.
    :param output_fname: Output filename. If none sys.stdout is used.
    :param sure_align_threshold: Threshold to consider an alignment link as 'sure'.
    :raises: ValueError for unknown output_type.
    :return: Output handler.
    """
    output_stream = sys.stdout if output_fname is None else data_io.smart_open(output_fname, mode='w')
    if output_type == C.OUTPUT_HANDLER_TRANSLATION:
        return StringOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_SCORE:
        return StringWithScoreOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENTS:
        return StringWithAlignmentsOutputHandler(output_stream, sure_align_threshold)
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_ALIGNMENT_MATRIX:
        return StringWithAlignmentMatrixOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_BENCHMARK:
        return BenchmarkOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_ALIGN_PLOT:
        return AlignPlotHandler(plot_prefix="align" if output_fname is None else output_fname)
    elif output_type == C.OUTPUT_HANDLER_ALIGN_TEXT:
        return AlignTextHandler(sure_align_threshold)
    elif output_type == C.OUTPUT_HANDLER_ALIGNMENT:
        return AlignmentsOutputHandler(output_stream)
    elif output_type == C.OUTPUT_HANDLER_JOINT:
        return JointOutputHandler(output_stream, mode='hard')
    elif output_type == C.OUTPUT_HANDLER_JOINT_SOFT:
        return JointOutputHandler(output_stream, mode='soft')
    elif output_type == C.OUTPUT_HANDLER_ALIGNMENT_ONE_HOT:
        return StringWithAlignmentOneHotMatrixOutputHandler(output_stream)
    else:
        raise ValueError("unknown output type")
def train_siamese_cnn(options_dict):
    """Train and save a Siamese CNN using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))


    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    train_batch_iterator = BatchIteratorSameDifferent(
        rng, train_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"], sample_diff_every_epoch=True
        )
    validate_batch_iterator = BatchIteratorSameDifferent(
        rng, dev_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )
    test_batch_iterator = BatchIteratorSameDifferent(
        rng, test_matches_vec, options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False
        )


    # Setup model

    logger.info("Building Siamese CNN")

    # Symbolic variables
    y = T.ivector("y")      # indicates whether x1 and x2 is same (1) or different (0)
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseCNN(
        rng, x1, x2, input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
        )
    if options_dict["loss"] == "cos_cos2":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos2(y)
        else:
            loss = model.loss_cos_cos2(y)
        error = model.loss_cos_cos2(y)  # doesn't include regularization or dropout
    elif options_dict["loss"] == "cos_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos(y)
        else:
            loss = model.loss_cos_cos(y)
        error = model.loss_cos_cos(y)
    elif options_dict["loss"] == "cos_cos_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_cos_cos_margin(y)
        else:
            loss = model.loss_cos_cos_margin(y)
        error = model.loss_cos_cos_margin(y)
    elif options_dict["loss"] == "euclidean_margin":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_euclidean_margin(y)
        else:
            loss = model.loss_euclidean_margin(y)
        error = model.loss_euclidean_margin(y)
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]*model.l2

    # Compile test functions
    same_distance = model.cos_same(y)  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff(y)
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            },
        mode=theano_mode,
        )
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            },
        mode=theano_mode,
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, y],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            },
        mode=theano_mode,
        )


    # Train model

    logger.info("Training Siamese CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )


    # Extrinsic evaluation

    # Pass data trough model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(options_dict["model_dir"], "dev", batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
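# Hedged sketch of the kind of options_dict train_siamese_cnn expects. The keys below are
# the ones read in the function body; every value is illustrative, and the *_layer_specs
# placeholders stand in for whatever format siamese.SiameseCNN defines.
example_options_dict = {
    "model_dir": "models/siamese_cnn",   # created if it does not exist
    "data_dir": "data/swbd",             # must contain the same/different data sets
    "rnd_seed": 42,
    "batch_size": 100,
    "n_same_pairs": 100000,
    "n_max_epochs": 20,
    "dropout_rates": None,               # None skips the dropout code paths
    "conv_layer_specs": [],              # placeholder; see siamese.SiameseCNN
    "hidden_layer_specs": [],            # placeholder; see siamese.SiameseCNN
    "loss": "cos_cos2",                  # one of the losses handled above
    "l1_weight": 0.0,
    "l2_weight": 0.0,
    "learning_rule": {"type": "adadelta", "rho": 0.9, "epsilon": 1e-6},
    "log_to_file": False,
}
# train_siamese_cnn(example_options_dict)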
def train_fixed_epochs_with_validation(n_epochs, train_model,
        train_batch_iterator, validate_model, validate_batch_iterator,
        test_model=None, test_batch_iterator=None, save_model_func=None,
        save_model_fn=None, record_dict_fn=None):
    """
    Train for a fixed number of epochs, using validation to decide which model
    to save.

    Parameters
    ----------
    train_model : Theano function
        Should take input from `train_batch_iterator` and output the training
        loss. The function can provide more than one output, which is averaged.
        This is useful for example to output both negative log likelihood (the
        model loss) and zero-one loss (the number of errors).
    train_batch_iterator : generator
        Provides the training batches.
    validate_model : Theano function
        Should take input from `validate_batch_iterator` and output the
        validation loss. The function can provide more than one output (which
        would be averaged), but for the validation only the first output will
        be used (except if `validate_extrinsic` is provided).
    validate_extrinsic : function
        Extrinsic evaluation can be performed using this function. If provided,
        validation is performed on the output of this function instead of using
        the output from `validate_model`.
    save_model_func : function
        If provided, this function is used to save the model to the file
        `save_model_fn` every time a new validation best model is found.
    save_model_fn : str
        The file to which the best model is written.
    record_dict_fn : str
        If provided, the current `record_dict` is saved to this file at the end
        of every epoch.

    Return
    ------
    record_dict : dict
        The dict key describes the statistic being tracked, while the dict
        value is a list of (epoch, statistic) tuples giving the statistic-value
        at a particular epoch.
    """

    record_dict = {}
    record_dict["train_loss"] = []          # each element is (epoch, loss)
    record_dict["validation_loss"] = []     # validation is not necessarily performed every epoch
    if test_model is not None:
        record_dict["test_loss"] = []       # and neither is testing
    # if validate_extrinsic is not None:
    #     record_dict["validation_extrinsic"] = []
    record_dict["epoch_time"] = []

    logger.info(datetime.now())

    # Training epochs
    best_validation_loss0 = np.inf
    test_loss = np.inf
    i_epoch_best = 0
    for i_epoch in xrange(n_epochs):

        # Loop over training batches
        # train_losses = []
        start_time = timeit.default_timer()
        train_losses = [train_model(*batch) for batch in train_batch_iterator]
        # for i_batch in xrange(n_train_batches):
        # for batch in train_batch_iterator()
            # Calculate training loss for this batch and update parameters
            # train_losses.append(train_model(*batch))

        # Validate the model
        validation_losses = [validate_model(*batch) for batch in validate_batch_iterator]
        validation_loss = np.mean(validation_losses, axis=0)
        logger.info("Epoch " + str(i_epoch + 1) + ": "
            "validation loss: " + str(validation_loss)
            )
        record_dict["validation_loss"].append((i_epoch, validation_loss))
        
        # print math.isnan(validation_loss)
        if hasattr(validation_loss, "__len__"):
            validation_loss0 = validation_loss[0]
        else:
            validation_loss0 = validation_loss

        # If this is the best model, test and save
        if validation_loss0 < best_validation_loss0:

            best_validation_loss0 = validation_loss0
            i_epoch_best = i_epoch

            # Test model
            if test_model is not None:
                test_losses = [test_model(*batch) for batch in test_batch_iterator]
                test_loss = np.mean(test_losses, axis=0)
                logger.info("    Test loss: " + str(test_loss))
                record_dict["test_loss"].append((i_epoch, test_loss))

            # Write the best model
            if save_model_func is not None:
                f = smart_open(save_model_fn, "wb")
                save_model_func(f)
                f.close()

        # Training statistics for this epoch
        end_time = timeit.default_timer()
        train_loss = np.mean(train_losses, axis=0)
        epoch_time = end_time - start_time
        # logger.info("Training loss: " + str(train_loss)  # + ", " + 
            # )
        logger.info("Time: %f" % (epoch_time) + " sec, " + 
            "training loss: " + str(train_loss)  # + ", " + 
            )
        record_dict["epoch_time"].append((i_epoch, epoch_time))
        record_dict["train_loss"].append((i_epoch, train_loss))

        if record_dict_fn is not None:
            f = smart_open(record_dict_fn, "wb")
            pickle.dump(record_dict, f, -1)
            f.close()

    total_time = np.sum([i[1] for i in record_dict["epoch_time"]])
    logger.info("Training complete: %f min" % (total_time / 60.))
    logger.info(
        "Best validation epoch: " + str(i_epoch_best + 1) + ", "
        "best validation loss: " + str(best_validation_loss0)
        )
    if test_model is not None:
        logger.info("Test loss: " + str(test_loss))
    if save_model_func is not None:
        logger.info("Best validation model saved: " + save_model_fn)
    if record_dict_fn is not None:
        logger.info("Saved record: " + record_dict_fn)

    logger.info(datetime.now())

    return record_dict
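# Hedged, toy usage sketch for the training loop above: the "models" here are plain
# Python callables standing in for compiled Theano functions, and the batch iterators are
# just lists of argument tuples, so this only illustrates the calling convention.
def _toy_model(x):
    return float(np.sum(x ** 2))  # one scalar "loss" per batch

def run_toy_training(n_epochs=2, n_batches=5):
    batches = [(np.random.randn(4, 3),) for _ in range(n_batches)]
    record = train_fixed_epochs_with_validation(
        n_epochs,
        train_model=_toy_model,
        train_batch_iterator=batches,
        validate_model=_toy_model,
        validate_batch_iterator=batches,
        )
    return record  # record["train_loss"] is a list of (epoch, mean_loss) tuples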
def train_early_stopping(n_train_batches, n_validation_batches, train_model,
        validate_model, test_model=None, n_test_batches=None,
        n_max_epochs=1000, n_batches_validation_frequency=None,
        n_patience=5000, patience_increase_factor=2,
        improvement_threshold=0.995, save_model_func=None, save_model_fn=None,
        record_dict_fn=None, learning_rate_update=None):
    """
    Train model using early stopping, using the provided training function.

    Parameters
    ----------
    n_train_batches : int
        Total number of training batches.
    n_validation_batches : int
        Total number of validation batches.
    train_model : Theano function
        Should take as input a batch index and output the training loss and
        error (e.g. negative log likelihood and zero-one loss).
    validate_model : Theano function
        Should take as input a batch index and output the validation loss and
        error.
    test_model : Theano function
        Should take as input a batch index and output the test loss and error.
        If not provided, testing is not performed over the training iterations.
    n_test_batches : int
        Total number of test batches.
    n_batches_validation_frequency : int
        Number of batches between calculating the validation error; if not
        provided, is set to min(n_train_batches, n_patience / 2) which means
        that at a minimum validation will be performed every epoch (i.e. every
        time after seeing `n_train_batches` batches).
    n_patience : int
        Number of minibatches to consider at a minimum before completing
        training.
    patience_increase_factor : int
        When a new validation minimum is found, the number of seen minibatches
        are multiplied by this factor to give the new minimum number of
        minibatches before stopping.
    improvement_threshold : float
        The minimum relative improvement in validation error required to warrant an
        increase in `n_patience` by `patience_increase_factor`.
    save_model_func : function
        If provided, this function is used to save the model to the file
        `save_model_fn` every time a new validation best model is found.
    save_model_fn : str
        The file to which the current model is written.
    record_dict_fn : str
        If provided, the current `record_dict` is saved to this file at the end
        of every epoch.
    learning_rate_update : Theano function
        If provided, this function is called (without any parameters) at the
        beginning of every epoch to update the learning rate.

    Return
    ------
    record_dict : dict
        The dict key describes the statistic being tracked, while the dict value
        is a list of (epoch, statistic) tuples giving the statistic-value at a
        particular epoch.
    """

    assert (save_model_func is None) or (save_model_fn is not None)
    assert (test_model is None) or (n_test_batches is not None)

    # Set default if not provided
    if n_batches_validation_frequency is None:
        n_batches_validation_frequency = min(n_train_batches, n_patience / 2)

    record_dict = {}
    record_dict["train_loss"] = []          # each element is (epoch, loss)
    record_dict["train_error"] = []
    record_dict["validation_loss"] = []     # validation is not necessarily performed every epoch
    record_dict["validation_error"] = []
    if test_model is not None:
        record_dict["test_loss"] = []       # and neither is testing
        record_dict["test_error"] = []
    record_dict["epoch_time"] = []

    # Training epochs
    i_epoch = 0
    done_looping = False
    best_validation_error = np.inf
    n_batches_best = 0
    i_epoch_best = 0
    while (i_epoch < n_max_epochs) and (not done_looping):

        train_losses = []
        train_errors = []
        start_time = timeit.default_timer()

        if learning_rate_update is not None:
            learning_rate = learning_rate_update(i_epoch)

        # Minibatches
        for i_batch in xrange(n_train_batches):

            # Calculate cost for this minibatch, updating the parameters
            minibatch_train_loss, minibatch_train_errors = train_model(i_batch)
            train_errors.append(minibatch_train_errors)
            train_losses.append(minibatch_train_loss)

            # print train_losses
            # print i_batch, train_model(i_batch)
            # break

            n_seen_batches = i_epoch * n_train_batches + i_batch

            # Use n_seen_batches + 1 to avoid checking very first batch
            if (n_seen_batches + 1) % n_batches_validation_frequency == 0:

                # Validate model
                validation_losses_errors = [validate_model(i) for i in xrange(n_validation_batches)]
                validation_loss = np.mean([i[0] for i in validation_losses_errors])
                validation_error = np.mean([i[1] for i in validation_losses_errors])

                logger.info(
                    "Validation: epoch %i, minibatch %i/%i, loss %f, error %.2f%%" %
                    (i_epoch + 1, i_batch + 1, n_train_batches, validation_loss, validation_error * 100.)
                    )
                record_dict["validation_loss"].append((i_epoch, validation_loss))
                record_dict["validation_error"].append((i_epoch, validation_error))

                # Check validation to see if we have new best model
                if validation_error < best_validation_error:
                    if validation_error < best_validation_error * improvement_threshold:
                        n_patience = max(n_patience, n_seen_batches * patience_increase_factor)
                    best_validation_error = validation_error
                    n_batches_best = n_seen_batches
                    i_epoch_best = i_epoch

                    if test_model is not None:
                        # test_losses = [test_model(i) for i in xrange(n_test_batches)]
                        test_losses_errors = [test_model(i) for i in xrange(n_test_batches)]
                        test_loss = np.mean([i[0] for i in test_losses_errors])
                        test_error = np.mean([i[1] for i in test_losses_errors])

                        logger.info("\tTest: loss %f, error %.2f%%" % (test_loss, test_error * 100.))
                        # logger.info(
                        #     "Test: epoch %i, minibatch %i/%i, error %f%%" %
                        #     (i_epoch + 1, i_batch + 1, n_train_batches, test_loss * 100)
                        #     )
                        record_dict["test_loss"].append((i_epoch, test_loss))
                        record_dict["test_error"].append((i_epoch, test_error))

                    # Write the best model
                    if save_model_func is not None:
                        f = smart_open(save_model_fn, "wb")
                        save_model_func(f)
                        f.close()

            # break

            # Check if training is done
            if n_patience <= n_seen_batches:
                done_looping = True
                break

        end_time = timeit.default_timer()
        epoch_time = end_time - start_time
        record_dict["epoch_time"].append((i_epoch, epoch_time))

        # print train_losses
        # print train_errors
        cur_train_loss = np.mean(train_losses)
        cur_train_error = np.mean(train_errors)
        if learning_rate_update is not None:
            logger.info(
                "Train: lr %f, epoch %i, %f sec/epoch, loss %f, error %.2f%%" % (
                    learning_rate, i_epoch + 1,
                    epoch_time, cur_train_loss, cur_train_error*100.
                    )
                )
        else:
            logger.info(
                "Train: epoch %i, %f sec/epoch, loss %f, error %.2f%%" % (
                    i_epoch + 1,
                    epoch_time, cur_train_loss, cur_train_error*100.
                    )
                )
        record_dict["train_loss"].append((i_epoch, cur_train_loss))
        record_dict["train_error"].append((i_epoch, cur_train_error))

        if record_dict_fn is not None:
            f = smart_open(record_dict_fn, "wb")
            pickle.dump(record_dict, f, -1)
            f.close()

        i_epoch += 1

    total_time = np.sum([i[1] for i in record_dict["epoch_time"]])
    logger.info(
        "Training complete: %d epochs, %f sec/epoch, total time %f min" %
        ( i_epoch, 1. * total_time / i_epoch, total_time / 60. )
        )
    logger.info(
        "Best validation: after seeing %d minibatches in epoch %d, error %.2f%%" %
        (n_batches_best, i_epoch_best + 1, best_validation_error * 100.)
        )
    if test_model is not None:
        logger.info("Test error: %.2f%%" % (test_error * 100.))
    if save_model_func is not None:
        logger.info("Best validation model saved: " + save_model_fn)
    if record_dict_fn is not None:
        logger.info("Saved record: " + record_dict_fn)

    return record_dict
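# The patience rule above can be summarised in a tiny standalone sketch: n_patience is a
# floor on the number of minibatches to process, and it is raised whenever the validation
# error improves by more than the relative improvement_threshold.
def update_patience(n_patience, n_seen_batches, validation_error, best_validation_error,
                    improvement_threshold=0.995, patience_increase_factor=2):
    if validation_error < best_validation_error * improvement_threshold:
        n_patience = max(n_patience, n_seen_batches * patience_increase_factor)
    return n_patience

# e.g. a clear improvement after 4000 of the initial 5000-batch budget extends the
# budget: update_patience(5000, 4000, 0.10, 0.20) == 8000, so training continues.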
def test_siamese_triplet_batch_save_load():
    testdir = "train_siamese_triplets_convlstm_tmp_testdir"
    options_dict = default_options_dict.copy()
    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    if not path.isdir(testdir):
        os.makedirs(testdir)
    model_fn = path.join(testdir, "model.pkl.gz")

    # Symbolic variables
    x1 = tensor.matrix("x1", dtype=THEANOTYPE)
    x2 = tensor.matrix("x2", dtype=THEANOTYPE)
    x3 = tensor.matrix("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)

    # Random number generators
    rng = np.random.RandomState(options_dict["rnd_seed"])

    # Build model
    input_shape = (options_dict["batch_size"], 1, 200, 39)
    model = siamese.SiameseTripletBatchConvLSTM(
        rng, x1, x2, x3, m1, m2, m3, input_shape,
        filter_shape=options_dict["filter_shape"],
        n_lstm_hiddens=options_dict["n_hiddens"],
        output_type=options_dict["sequence_output_type"],
        srng=srng, dropout=options_dict["dropout_rates"])

    run_model = theano.function(
        inputs=[model.input, model.mask],
        outputs=model.output)

    x0 = rng.randn(options_dict["batch_size"], 200, 39)
    m0 = rng.rand(options_dict["batch_size"], 200).T

    y0 = run_model(x0, m0)

    f = data_io.smart_open(model_fn, "wb")
    model.save(f)
    f.close()

    model = siamese.SiameseTripletBatchConvLSTM(
        rng, x1, x2, x3, m1, m2, m3, input_shape,
        filter_shape=options_dict["filter_shape"],
        n_lstm_hiddens=options_dict["n_hiddens"],
        output_type=options_dict["sequence_output_type"],
        srng=srng, dropout=options_dict["dropout_rates"])

    f = data_io.smart_open(model_fn, "rb")
    model.load(f)
    f.close()

    run_model = theano.function(
        inputs=[model.input, model.mask],
        outputs=model.output)

    y1 = run_model(x0, m0)
    
    shutil.rmtree(testdir)
    np.testing.assert_array_almost_equal(y1, y0)
def train_fixed_epochs(n_epochs, train_model, train_batch_iterator,
        test_model=None, test_batch_iterator=None, save_model_func=None,
        save_model_fn=None, record_dict_fn=None):
    """
    Train for a fixed number of epochs.

    Parameters
    ----------
    train_model : Theano function
        Should take input from `train_batch_iterator` and output the training
        loss. The function can provide more than one output, which is averaged.
        This is useful for example to output both negative log likelihood (the
        model loss) and zero-one loss (the number of errors).
    train_batch_iterator : generator
        Provides the training batches.
    save_model_func : function
        If provided, this function is used to save the model to the file
        `save_model_fn` at the end of every epoch.
    save_model_fn : str
        The file to which the best model is written.
    record_dict_fn : str
        If provided, the current `record_dict` is saved to this file at the end
        of every epoch.

    Return
    ------
    record_dict : dict
        The dict key describes the statistic being tracked, while the dict
        value is a list of (epoch, statistic) tuples giving the statistic-value
        at a particular epoch.
    """

    record_dict = {}
    record_dict["train_loss"] = []          # each element is (epoch, loss)
    if test_model is not None:
        record_dict["test_loss"] = []       # testing is not necessarily performed every epoch
    record_dict["epoch_time"] = []

    logger.info(datetime.now())

    # Training epochs
    i_epoch_best = 0
    test_loss = np.inf
    for i_epoch in xrange(n_epochs):

        logger.info("Epoch " + str(i_epoch + 1) + ":")

        # Loop over training batches
        # train_losses = []
        start_time = timeit.default_timer()
        train_losses = [train_model(*batch) for batch in train_batch_iterator]
        # for i_batch in xrange(n_train_batches):
        # for batch in train_batch_iterator()
            # Calculate training loss for this batch and update parameters
            # train_losses.append(train_model(*batch))

        # Test model
        if test_model is not None:
            test_losses = [test_model(*batch) for batch in test_batch_iterator]
            test_loss = np.mean(test_losses, axis=0)
            logger.info("    Test loss: " + str(test_loss))
            record_dict["test_loss"].append((i_epoch, test_loss))

        # Write this model
        if save_model_func is not None:
            f = smart_open(save_model_fn, "wb")
            save_model_func(f)
            f.close()

        # Training statistics for this epoch
        end_time = timeit.default_timer()
        train_loss = np.mean(train_losses, axis=0)
        epoch_time = end_time - start_time
        # logger.info("Training loss: " + str(train_loss)  # + ", " + 
            # )
        logger.info("Time: %f" % (epoch_time) + " sec, " + 
            "training loss: " + str(train_loss)  # + ", " + 
            )
        record_dict["epoch_time"].append((i_epoch, epoch_time))
        record_dict["train_loss"].append((i_epoch, train_loss))

        if record_dict_fn is not None:
            f = smart_open(record_dict_fn, "wb")
            pickle.dump(record_dict, f, -1)
            f.close()

    total_time = np.sum([i[1] for i in record_dict["epoch_time"]])
    logger.info("Training complete: %f min" % (total_time / 60.))
    if test_model is not None:
        logger.info("Test loss: " + str(test_loss))
    if save_model_func is not None:
        logger.info("Model saved: " + save_model_fn)
    if record_dict_fn is not None:
        logger.info("Saved record: " + record_dict_fn)

    logger.info(datetime.now())

    return record_dict
def apply_layers(model_dir, set, batch_size=None, i_layer=-1):

    logger.info(datetime.now())

    # Load the model options
    options_dict_fn = path.join(model_dir, "options_dict.pkl.gz")
    logger.info("Reading: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    # print options_dict
    f.close()

    # Load the dataset
    npz_fn = path.join(options_dict["data_dir"], "swbd." + set + ".npz")
    logger.info("Reading: " + npz_fn)
    npz = numpy.load(npz_fn)
    logger.info("Loaded " + str(len(npz.keys())) + " segments")

    model = train_siamese_triplets_convlstm.load_siamese_triplets_convlstm_minibatch(options_dict)
        

    # Load data into Theano shared variable
    utt_ids = sorted(npz.keys())
    xs = [npz[i] for i in utt_ids]
    ls = numpy.asarray([len(x) for x in xs], dtype=int)
    max_length = 200
    batch_size = options_dict["batch_size"]
    n_batches = (len(ls) - 1) // batch_size + 1  # ceil(len(ls) / batch_size)
    blocked_size = n_batches * batch_size
    xs = numpy.zeros((blocked_size, max_length, npz[utt_ids[0]].shape[1]),
                      dtype=theano.config.floatX)
    mask = numpy.zeros((blocked_size, max_length), dtype=theano.config.floatX)
    for j, i in enumerate(utt_ids):
        xs[j][:ls[j]] = npz[i]
        mask[j][:ls[j]] = 1.0

    logger.info("Formatting into Theano shared variable")

    shared_x = theano.shared(xs, borrow=True)
    shared_mask = theano.shared(mask, borrow=True)

    # Compile function for passing segments through CNN layers
    x = model.input  # input to the tied layers
    x_i = T.lscalar()
    m = model.mask
    normalized_output = model.output
    apply_model = theano.function(
        inputs=[x_i],
        outputs=normalized_output,
        givens={
            x: shared_x[x_i*batch_size:batch_size*(x_i+1)],
            m: shared_mask[batch_size*x_i:batch_size*(x_i+1)].T
            }
        )

    logger.info(datetime.now())

    n_x = len(ls)
    logger.info("Passing data through in model: " + str(n_x))
    embeddings = []
    for x_i in range(n_batches):
        x_embeddings = apply_model(x_i)
        embeddings.extend(x_embeddings)
    embeddings = numpy.vstack(embeddings[:len(ls)])
    logger.info("Outputs shape: " + str(embeddings.shape))

    embeddings_dict = {}

    for embedding_i, embedding in enumerate(embeddings):
        utt_id = utt_ids[embedding_i]
        embeddings_dict[utt_id] = embedding

    logger.info(datetime.now())

    return embeddings_dict
def train_siamese_triplets_lstm_nn(options_dict):
    """Train and save a Siamese CNN using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff_mask(rng, options_dict["data_dir"])
    train_x, train_mask, train_lengths, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_mask, dev_lengths, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_mask, test_lengths, test_matches_vec, test_labels = datasets[2]

    # Make batch iterators
    train_triplet_iterator = BatchIteratorTriplets(
        rng,
        train_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True,
    )
    validate_triplet_iterator = BatchIteratorTriplets(
        rng,
        dev_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
    )
    test_triplet_iterator = BatchIteratorTriplets(
        rng,
        test_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False,
    )

    # Setup model

    logger.info("Building Siamese triplets LSTM")

    # Symbolic variables
    x1 = tensor.tensor3("x1", dtype=THEANOTYPE)
    x2 = tensor.tensor3("x2", dtype=THEANOTYPE)
    x3 = tensor.tensor3("x3", dtype=THEANOTYPE)
    m1 = tensor.matrix("m1", dtype=THEANOTYPE)
    m2 = tensor.matrix("m2", dtype=THEANOTYPE)
    m3 = tensor.matrix("m3", dtype=THEANOTYPE)
    x1_indices = tensor.ivector("x1_indices")
    x2_indices = tensor.ivector("x2_indices")
    x3_indices = tensor.ivector("x3_indices")
    l1 = tensor.iscalar("l1")
    l2 = tensor.iscalar("l2")
    l3 = tensor.iscalar("l3")
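    # x1/x2/x3 are time-major (time, batch, features) tensors for the three
    # items of each triplet (presumably anchor, same-word and different-word;
    # compare model.cos_same and model.cos_diff below); m1/m2/m3 are the
    # matching (time, batch) masks. The l* scalars are only used by the
    # commented-out function variants further down.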

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletBatchLSTMNN(
        rng,
        x1,
        x2,
        x3,
        m1,
        m2,
        m3,
        n_in=39,
        n_lstm_hiddens=options_dict["n_hiddens"],
        mlp_hidden_specs=options_dict["hidden_layer_specs"],
    )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(options_dict["margin"])  # doesn't include regularization or dropout
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l2_weight"] > 0.0:
        loss = loss + options_dict["l2_weight"] * model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
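    # The compiled functions below all use the same givens pattern: the index
    # vectors select triplet items from the shared data, swapaxes(0, 1) makes
    # them time-major for the LSTM, and slicing to the batch's maximum length
    # trims unnecessary padding (the masks are transposed and trimmed alike).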

    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices].swapaxes(0, 1)[: dev_lengths[x1_indices].max()],
            m1: dev_mask[x1_indices].T[: dev_lengths[x1_indices].max()],
            x2: dev_x[x2_indices].swapaxes(0, 1)[: dev_lengths[x2_indices].max()],
            m2: dev_mask[x2_indices].T[: dev_lengths[x2_indices].max()],
            x3: dev_x[x3_indices].swapaxes(0, 1)[: dev_lengths[x3_indices].max()],
            m3: dev_mask[x3_indices].T[: dev_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices].swapaxes(0, 1)[: test_lengths[x1_indices].max()],
            m1: test_mask[x1_indices].T[: test_lengths[x1_indices].max()],
            x2: test_x[x2_indices].swapaxes(0, 1)[: test_lengths[x2_indices].max()],
            m2: test_mask[x2_indices].T[: test_lengths[x2_indices].max()],
            x3: test_x[x3_indices].swapaxes(0, 1)[: test_lengths[x3_indices].max()],
            m3: test_mask[x3_indices].T[: test_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    # test_model = theano.function(
    #     inputs=[x1_indices, x2_indices, x3_indices],
    #     outputs=outputs,
    #     givens={
    #         l1: test_lengths[x1_indices].max(),
    #         x1: test_x[x1_indices].swapaxes(0, 1)[:l1],
    #         m1: test_mask[x1_indices][:l1],
    #         l2: test_lengths[x2_indices].max(),
    #         x2: test_x[x2_indices].swapaxes(0, 1)[:l2],
    #         m2: test_mask[x2_indices][:l2],
    #         l3: test_lengths[x3_indices].max(),
    #         x3: test_x[x3_indices].swapaxes(0, 1)[:l3],
    #         m3: test_mask[x3_indices][:l3],
    #         },
    #     mode=theano_mode,
    #     )

    # Gradients and training updates
    parameters = model.parameters
    gradients = tensor.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients, learning_rule["rho"], learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
        )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices].swapaxes(0, 1)[: train_lengths[x1_indices].max()],
            m1: train_mask[x1_indices].T[: train_lengths[x1_indices].max()],
            x2: train_x[x2_indices].swapaxes(0, 1)[: train_lengths[x2_indices].max()],
            m2: train_mask[x2_indices].T[: train_lengths[x2_indices].max()],
            x3: train_x[x3_indices].swapaxes(0, 1)[: train_lengths[x3_indices].max()],
            m3: train_mask[x3_indices].T[: train_lengths[x3_indices].max()],
        },
        mode=theano_mode,
    )
    # train_model = theano.function(
    #     inputs=[x1_indices, x2_indices, x3_indices],
    #     outputs=outputs,
    #     updates=updates,
    #     givens={
    #         l1: train_lengths[x1_indices].max(),
    #         x1: train_x[x1_indices].swapaxes(0, 1)[:l1],
    #         m1: train_mask[x1_indices][:l1],
    #         l2: train_lengths[x2_indices].max(),
    #         x2: train_x[x2_indices].swapaxes(0, 1)[:l2],
    #         m2: train_mask[x2_indices][:l2],
    #         l3: train_lengths[x3_indices].max(),
    #         x3: train_x[x3_indices].swapaxes(0, 1)[:l3],
    #         m3: train_mask[x3_indices][:l3],
    #         },
    #     mode=theano_mode,
    #     )

    # Train model

    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_triplet_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_triplet_iterator,
        test_model=test_model,
        test_batch_iterator=test_triplet_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through the model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645
    )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
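    # `distances` is in scipy's condensed pairwise form; `matches` is expected
    # to be a boolean vector in the same order marking same-word pairs, so the
    # average precision is computed over same vs. different word pairs.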
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def apply_layers(model_dir, set, batch_size=None, i_layer=-1):

    logger.info(datetime.now())

    # Load the model options
    options_dict_fn = path.join(model_dir, "options_dict.pkl.gz")
    logger.info("Reading: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    # print options_dict
    f.close()

    # Load the dataset
    npz_fn = path.join(options_dict["data_dir"], "swbd." + set + ".npz")
    logger.info("Reading: " + npz_fn)
    npz = numpy.load(npz_fn)
    logger.info("Loaded " + str(len(npz.keys())) + " segments")

    if "siamese_triplets" in options_dict["model_dir"]:
        model = siamese_triplets_lstm.load_siamese_triplets_lstm(options_dict)

    # Load data into Theano shared variable
    utt_ids = sorted(npz.keys())
    xs = [npz[i] for i in utt_ids]
    ls = numpy.asarray([len(x) for x in xs], dtype=int)
    base_inds = numpy.cumsum(ls)
    ends = theano.shared(base_inds, borrow=True)
    base_begins = base_inds.copy()
    base_begins[1:] = base_inds[:-1]
    base_begins[0] = 0
    begins = theano.shared(base_begins, borrow=True)
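    # base_begins/base_inds hold, per utterance, the first and one-past-last
    # row of its frames in the stacked feature matrix below, so each utterance
    # can be passed through the model on its own (variable length, one at a
    # time).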

    logger.info("Formatting into Theano shared variable")

    shared_x = theano.shared(numpy.asarray(
        numpy.vstack(xs), dtype=siamese_triplets_lstm.THEANOTYPE),
                             borrow=True)

    # Compile function for passing segments through CNN layers
    x = model.input  # input to the tied layers
    x_i = T.lscalar()
    normalized_output = model.output
    apply_model = theano.function(inputs=[x_i],
                                  outputs=normalized_output,
                                  givens={x: shared_x[begins[x_i]:ends[x_i]]})

    logger.info(datetime.now())

    n_x = len(ls)
    logger.info("Passing data through in model: " + str(n_x))
    embeddings = []
    for x_i in range(n_x):
        x_embedding = apply_model(x_i)
        embeddings.append(x_embedding)
    embeddings = numpy.vstack(embeddings)
    logger.info("Outputs shape: " + str(embeddings.shape))

    embeddings_dict = {}

    for embedding_i, embedding in enumerate(embeddings):
        utt_id = utt_ids[embedding_i]
        embeddings_dict[utt_id] = embedding

    logger.info(datetime.now())

    return embeddings_dict
def train_mlp(options_dict):
    """Train and save a word classifier MLP."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(
        rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches

        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]
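    # Each batch is yielded as a one-element list so that the training loop's
    # train_model(*batch) call unpacks it into the single batch-index argument.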

    n_train_batches = train_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(
        borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Setup model

    logger.info("Building MLP")

    # Symbolic variables
    i_batch = T.lscalar()  # batch index
    x = T.matrix("x")  # flattened data of shape (n_data, d_in)
    y = T.ivector("y")  # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    model = mlp.MLP(rng, x, d_in, options_dict["d_out"],
                    options_dict["hidden_layer_specs"], srng,
                    options_dict["dropout_rates"])
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict[
            "l2_weight"] * model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x:
            dev_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                  options_dict["batch_size"]],
            y:
            dev_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                  options_dict["batch_size"]]
        })
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x:
            test_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                   options_dict["batch_size"]],
            y:
            test_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                   options_dict["batch_size"]]
        })

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x:
            train_x[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                    options_dict["batch_size"]],
            y:
            train_y[i_batch * options_dict["batch_size"]:(i_batch + 1) *
                    options_dict["batch_size"]]
        },
    )

    # Train model

    logger.info("Training MLP")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through the model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"],
        "dev",
        batch_size=645,
        i_layer=options_dict["i_layer_eval"])
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def main():
    args = check_argv()

    if "," in args.model_basedir:
        directory_list = []
        for model_basedir in args.model_basedir.split(","):
            directory_list += glob.glob(path.join(model_basedir, "*"))
        print directory_list
    else:
        directory_list = glob.glob(path.join(args.model_basedir, "*"))

    # Get results from directories
    results = []  # list of (dir, option_value_dict, performance)
    for d in directory_list:
        if path.isdir(d):
            hash = path.split(d)[-1]
            # print d, hash

            options_dict_fn = path.join(d, "options_dict.pkl.gz")
            if not path.isfile(options_dict_fn):
                continue
            print "Reading:", options_dict_fn
            f = smart_open(options_dict_fn)
            options_dict = pickle.load(f)
            f.close()

            # Data directory filter
            if data_dir_filter is not None:
                if data_dir_filter not in options_dict["data_dir"]:
                    continue

            # Read average precision
            ap_fn = path.join(d, "dev_ap.txt")
            if not path.isfile(ap_fn):
                continue
            with open(ap_fn) as f:
                ap = float(f.readline().strip())

            # Get the options we are interested in
            options = {}
            if "min_count" in options_dict:
                options["min_count"] = options_dict["min_count"]
            else:
                options["min_count"] = None
            if "conv_layer_specs" in options_dict:
                options["n_cnn_units"] = options_dict["conv_layer_specs"][0][
                    "filter_shape"][0]
            else:
                options["n_cnn_units"] = None
            options["n_hidden_units"] = options_dict["hidden_layer_specs"][0][
                "units"]
            options["n_hidden_layers"] = len(
                options_dict["hidden_layer_specs"])
            options["n_hidden_units_final_layer"] = options_dict[
                "hidden_layer_specs"][-1]["units"]
            for key in options_monitor:
                if key in options_dict:
                    options[key] = options_dict[key]
                else:
                    options[key] = None

            results.append((d, options, ap))

    # Try to sort the results according to the option_value_dict
    results = sorted(results, key=lambda i: i[1].values())

    # Present results
    options = results[0][1].keys()
    print "Possible options:", options
    print_options = sorted(options)  # or can give a filtered list here
    print print_options
    print
    print "-" * 39
    print "# Directory\t" + "\t".join(print_options) + "\tDev AP"
    for dir, options, ap in results:
        print dir + "\t" + "\t".join([str(options[i])
                                      for i in print_options]) + "\t" + str(ap)
    print "-" * 39
def apply_layers(model_dir, set, batch_size=None, i_layer=-1):

    logger.info(datetime.now())

    # Load the model options
    options_dict_fn = path.join(model_dir, "options_dict.pkl.gz")
    logger.info("Reading: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    # print options_dict
    f.close()

    # Load the dataset
    npz_fn = path.join(options_dict["data_dir"], "swbd." + set + ".npz")
    logger.info("Reading: " + npz_fn)
    npz = numpy.load(npz_fn)
    logger.info("Loaded " + str(len(npz.keys())) + " segments")

    model = load_model(options_dict)

    # Load data into Theano shared variable
    utt_ids = sorted(npz.keys())
    xs = [npz[i] for i in utt_ids]
    ls = numpy.asarray([len(x) for x in xs], dtype=int)
    base_inds = numpy.cumsum(ls)
    ends = theano.shared(base_inds, borrow=True)
    base_begins = base_inds.copy()
    base_begins[1:] = base_inds[:-1]
    base_begins[0] = 0
    begins = theano.shared(base_begins, borrow=True)
    
    logger.info("Formatting into Theano shared variable")

    shared_x = theano.shared(numpy.asarray(
        numpy.vstack(xs), dtype=siamese_triplets_lstm.THEANOTYPE), borrow=True)

    # Compile function for passing segments through CNN layers
    x = model.input  # input to the tied layers
    x_i = T.lscalar()
    normalized_output = model.output
    apply_model = theano.function(
        inputs=[x_i],
        outputs=normalized_output,
        givens={
            x: shared_x[
                begins[x_i]:ends[x_i]
                ]
            }
        )

    logger.info(datetime.now())

    n_x = len(ls)
    logger.info("Passing data through in model: " + str(n_x))
    embeddings = []
    for x_i in range(n_x):
        x_embedding = apply_model(x_i)
        embeddings.append(x_embedding)
    embeddings = numpy.vstack(embeddings)
    logger.info("Outputs shape: " + str(embeddings.shape))

    embeddings_dict = {}

    for embedding_i, embedding in enumerate(embeddings):
        utt_id = utt_ids[embedding_i]
        embeddings_dict[utt_id] = embedding

    logger.info(datetime.now())

    return embeddings_dict
def main():
    args = check_argv()

    if "," in args.model_basedir:
        directory_list = []
        for model_basedir in args.model_basedir.split(","):
            directory_list += glob.glob(path.join(model_basedir, "*"))
        print directory_list
    else:
        directory_list = glob.glob(path.join(args.model_basedir, "*"))

    # Get results from directories
    results = []  # list of (dir, option_value_dict, performance)
    for d in directory_list:
        if path.isdir(d):
            hash = path.split(d)[-1]
            # print d, hash

            options_dict_fn = path.join(d, "options_dict.pkl.gz")
            if not path.isfile(options_dict_fn):
                continue
            print "Reading:", options_dict_fn
            f = smart_open(options_dict_fn)
            options_dict = pickle.load(f)
            f.close()

            # Data directory filter
            if data_dir_filter is not None:
                if data_dir_filter not in options_dict["data_dir"]:
                    continue

            # Read average precision
            ap_fn = path.join(d, "dev_ap.txt")
            if not path.isfile(ap_fn):
                continue
            with open(ap_fn) as f:
                ap = float(f.readline().strip())

            # Get the options we are interested in
            options = {}
            if "min_count" in options_dict:
                options["min_count"] = options_dict["min_count"]
            else:
                options["min_count"] = None
            if "conv_layer_specs" in options_dict:
                options["n_cnn_units"] = options_dict["conv_layer_specs"][0]["filter_shape"][0]
            else:
                options["n_cnn_units"] = None
            options["n_hidden_units"] = options_dict["hidden_layer_specs"][0]["units"]
            options["n_hidden_layers"] = len(options_dict["hidden_layer_specs"])
            options["n_hidden_units_final_layer"] = options_dict["hidden_layer_specs"][-1]["units"]
            for key in options_monitor:
                if key in options_dict:
                    options[key] = options_dict[key]
                else:
                    options[key] = None

            results.append((d, options, ap))

    # Try to sort the results according to the option_value_dict
    results = sorted(results, key=lambda i:i[1].values())

    # Present results
    options = results[0][1].keys()
    print "Possible options:", options
    print_options = sorted(options)  # or can give a filtered list here
    print print_options
    print
    print "-"*39
    print "# Directory\t" + "\t".join(print_options)  + "\tDev AP"
    for dir, options, ap in results:
        print dir + "\t" + "\t".join([str(options[i]) for i in print_options]) + "\t" + str(ap)
    print "-"*39
def train_cnn(options_dict):
    """Train and save a word classifier CNN."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
        # root_logger = logging.getLogger()
        # formatter = root_logger.handlers[0].formatter
        # root_logger.removeHandler(root_logger.handlers[0])
        # file_handler = logging.FileHandler(log_fn, "a")
        # file_handler.setFormatter(formatter)
        # root_logger.addHandler(file_handler)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None


    # Load and format data

    # Load into shared variables
    datasets, word_to_i_map = data_io.load_swbd_labelled(rng, options_dict["data_dir"], options_dict["min_count"])
    train_x, train_y = datasets[0]
    dev_x, dev_y = datasets[1]
    test_x, test_y = datasets[2]

    # Get batch sizes and iterators
    class BatchIterator(object):
        def __init__(self, n_batches):
            self.n_batches = n_batches
        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]
    n_train_batches = train_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_dev_batches = dev_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    n_test_batches = test_x.get_value(borrow=True).shape[0] / options_dict["batch_size"]
    train_batch_iterator = BatchIterator(n_train_batches)
    validate_batch_iterator = BatchIterator(n_dev_batches)
    test_batch_iterator = BatchIterator(n_test_batches)

    # Flatten data
    d_in = 39*200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))
    d_out = len(word_to_i_map)
    options_dict["d_out"] = d_out

    # Save `options_dict`
    options_dict_fn = path.join(options_dict["model_dir"], "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))


    # Setup model

    logger.info("Building CNN")

    # Symbolic variables
    i_batch = T.lscalar()   # batch index
    x = T.matrix("x")       # flattened data of shape (n_data, d_in)
    y = T.ivector("y")      # labels

    # Build model
    logger.info("No. of word type targets: " + str(options_dict["d_out"]))
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = cnn.CNN(
        rng, x, input_shape, options_dict["conv_layer_specs"],
        options_dict["hidden_layer_specs"], options_dict["d_out"], srng,
        options_dict["dropout_rates"] 
        )
    if options_dict["dropout_rates"] is not None:
        loss = model.dropout_negative_log_likelihood(y)
    else:
        loss = model.negative_log_likelihood(y)
    error = model.errors(y)

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"]*model.l1 + options_dict["l2_weight"]* model.l2

    # Compile test functions
    outputs = [error, loss]
    validate_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: dev_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: dev_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            }
        )
    test_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        givens={
            x: test_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: test_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            }
        )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(
            parameters, gradients, learning_rule["rho"], learning_rule["epsilon"]
            )
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"], learning_rule["momentum"]
            )
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[i_batch],
        outputs=outputs,
        updates=updates,
        givens={
            x: train_x[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]],
            y: train_y[i_batch * options_dict["batch_size"]: (i_batch + 1) * options_dict["batch_size"]]
            },
        )


    # Train model

    logger.info("Training CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
        )


    # Extrinsic evaluation

    # Pass data through the model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev", batch_size=645, i_layer=options_dict["i_layer_eval"]
        )  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True], distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def train_fixed_epochs(n_epochs,
                       train_model,
                       train_batch_iterator,
                       test_model=None,
                       test_batch_iterator=None,
                       save_model_func=None,
                       save_model_fn=None,
                       record_dict_fn=None):
    """
    Train for a fixed number of epochs.

    Parameters
    ----------
    train_model : Theano function
        Should take input from `train_batch_iterator` and output the training
        loss. The function can provide more than one output, which is averaged.
        This is useful for example to output both negative log likelihood (the
        model loss) and zero-one loss (the number of errors).
    train_batch_iterator : generator
        Provides the training batches.
    save_model_func : function
        If provided, this function is used to save the model to the file
        `save_model_fn` at the end of every epoch.
    save_model_fn : str
        The file to which the model is written.
    record_dict_fn : str
        If provided, the current `record_dict` is saved to this file at the end
        of every epoch.

    Return
    ------
    record_dict : dict
        The dict key describes the statistic being tracked, while the dict
        value is a list of (epoch, statistic) tuples giving the statistic-value
        at a particular epoch.
    """

    record_dict = {}
    record_dict["train_loss"] = []  # each element is (epoch, loss)
    if test_model is not None:
        record_dict["test_loss"] = [
        ]  # testing is not necessarily performed every epoch
    record_dict["epoch_time"] = []

    logger.info(datetime.now())

    # Training epochs
    i_epoch_best = 0
    test_loss = np.inf
    for i_epoch in xrange(n_epochs):

        logger.info("Epoch " + str(i_epoch + 1) + ":")

        # Loop over training batches
        # train_losses = []
        start_time = timeit.default_timer()
        train_losses = [train_model(*batch) for batch in train_batch_iterator]
        # for i_batch in xrange(n_train_batches):
        # for batch in train_batch_iterator()
        # Calculate training loss for this batch and update parameters
        # train_losses.append(train_model(*batch))

        # Test model
        if test_model is not None:
            test_losses = [test_model(*batch) for batch in test_batch_iterator]
            test_loss = np.mean(test_losses, axis=0)
            logger.info("    Test loss: " + str(test_loss))
            record_dict["test_loss"].append((i_epoch, test_loss))

        # Write this model
        if save_model_func is not None:
            f = smart_open(save_model_fn, "wb")
            save_model_func(f)
            f.close()

        # Training statistics for this epoch
        end_time = timeit.default_timer()
        train_loss = np.mean(train_losses, axis=0)
        epoch_time = end_time - start_time
        # logger.info("Training loss: " + str(train_loss)  # + ", " +
        # )
        logger.info("Time: %f" % (epoch_time) + " sec, " + "training loss: " +
                    str(train_loss)  # + ", " + 
                    )
        record_dict["epoch_time"].append((i_epoch, epoch_time))
        record_dict["train_loss"].append((i_epoch, train_loss))

        if record_dict_fn is not None:
            f = smart_open(record_dict_fn, "wb")
            pickle.dump(record_dict, f, -1)
            f.close()

    total_time = np.sum([i[1] for i in record_dict["epoch_time"]])
    logger.info("Training complete: %f min" % (total_time / 60.))
    if test_model is not None:
        logger.info("Test loss: " + str(test_loss))
    if save_model_func is not None:
        logger.info("Model saved: " + save_model_fn)
    if record_dict_fn is not None:
        logger.info("Saved record: " + record_dict_fn)

    logger.info(datetime.now())

    return record_dict
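
# A minimal usage sketch (not part of the original code) showing the contract
# train_fixed_epochs expects: any callables and any re-iterable of argument
# lists will do, not only Theano functions; the compiled train_model functions
# elsewhere in this file satisfy the same calling convention. The names
# _demo_train_fixed_epochs, _DemoBatchIterator and _demo_train_model are
# hypothetical.
def _demo_train_fixed_epochs():

    class _DemoBatchIterator(object):
        """Yields one-element argument lists, like the BatchIterator classes above."""
        def __init__(self, n_batches):
            self.n_batches = n_batches
        def __iter__(self):
            for i_batch in xrange(self.n_batches):
                yield [i_batch]

    def _demo_train_model(i_batch):
        # Stand-in "training step": return a fake, decreasing loss for this batch
        return 1.0 / (i_batch + 1)

    # Two epochs over five dummy batches; no test set, no model saving
    return train_fixed_epochs(
        2, _demo_train_model, _DemoBatchIterator(5),
        record_dict_fn=None
        )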
def train_siamese_triplets_cnn(options_dict):
    """Train and save a Siamese CNN using the specified options."""

    # Preliminary

    logger.info(datetime.now())

    if not path.isdir(options_dict["model_dir"]):
        os.makedirs(options_dict["model_dir"])

    if "log_to_file" in options_dict and options_dict["log_to_file"] is True:
        log_fn = path.join(options_dict["model_dir"], "log")
        print "Writing:", log_fn
        root_logger = logging.getLogger()
        if len(root_logger.handlers) > 0:
            root_logger.removeHandler(
                root_logger.handlers[0])  # close open file handler
        logging.basicConfig(filename=log_fn, level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.DEBUG)

    rng = np.random.RandomState(options_dict["rnd_seed"])
    if options_dict["dropout_rates"] is not None:
        srng = RandomStreams(seed=options_dict["rnd_seed"])
    else:
        srng = None

    options_dict_fn = path.join(options_dict["model_dir"],
                                "options_dict.pkl.gz")
    logger.info("Saving options: " + options_dict_fn)
    f = data_io.smart_open(options_dict_fn, "wb")
    pickle.dump(options_dict, f, -1)
    f.close()

    logger.info("Options: " + str(options_dict))

    # Load and format data

    # Load into shared variables
    datasets = data_io.load_swbd_same_diff(rng, options_dict["data_dir"])
    train_x, train_matches_vec, train_labels = datasets[0]
    dev_x, dev_matches_vec, dev_labels = datasets[1]
    test_x, test_matches_vec, test_labels = datasets[2]

    # Flatten data
    d_in = 39 * 200
    train_x = train_x.reshape((-1, d_in))
    dev_x = dev_x.reshape((-1, d_in))
    test_x = test_x.reshape((-1, d_in))

    # Make batch iterators
    train_batch_iterator = BatchIteratorTriplets(
        rng,
        train_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=True)
    validate_batch_iterator = BatchIteratorTriplets(
        rng,
        dev_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False)
    test_batch_iterator = BatchIteratorTriplets(
        rng,
        test_matches_vec,
        options_dict["batch_size"],
        n_same_pairs=options_dict["n_same_pairs"],
        sample_diff_every_epoch=False)

    # Setup model

    logger.info("Building Siamese triplets CNN")

    # Symbolic variables
    x1 = T.matrix("x1")
    x2 = T.matrix("x2")
    x3 = T.matrix("x3")
    x1_indices = T.ivector("x1_indices")
    x2_indices = T.ivector("x2_indices")
    x3_indices = T.ivector("x3_indices")

    # Build model
    input_shape = (options_dict["batch_size"], 1, 39, 200)
    model = siamese.SiameseTripletCNN(
        rng,
        x1,
        x2,
        x3,
        input_shape,
        conv_layer_specs=options_dict["conv_layer_specs"],
        hidden_layer_specs=options_dict["hidden_layer_specs"],
        srng=srng,
        dropout_rates=options_dict["dropout_rates"],
    )
    if options_dict["loss"] == "hinge_cos":
        if options_dict["dropout_rates"] is not None:
            loss = model.dropout_loss_hinge_cos(options_dict["margin"])
        else:
            loss = model.loss_hinge_cos(options_dict["margin"])
        error = model.loss_hinge_cos(
            options_dict["margin"]
        )  # doesn't include regularization or dropout
    else:
        assert False, "Invalid loss: " + options_dict["loss"]

    # Add regularization
    if options_dict["l1_weight"] > 0. or options_dict["l2_weight"] > 0.:
        loss = loss + options_dict["l1_weight"] * model.l1 + options_dict[
            "l2_weight"] * model.l2

    # Compile test functions
    same_distance = model.cos_same()  # track the distances of same and different pairs separately
    diff_distance = model.cos_diff()
    outputs = [error, loss, same_distance, diff_distance]
    theano_mode = theano.Mode(linker="cvm")
    validate_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: dev_x[x1_indices],
            x2: dev_x[x2_indices],
            x3: dev_x[x3_indices],
        },
        mode=theano_mode,
    )
    test_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        givens={
            x1: test_x[x1_indices],
            x2: test_x[x2_indices],
            x3: test_x[x3_indices],
        },
        mode=theano_mode,
    )

    # Gradients and training updates
    parameters = model.parameters
    gradients = T.grad(loss, parameters)
    learning_rule = options_dict["learning_rule"]
    if learning_rule["type"] == "adadelta":
        updates = training.learning_rule_adadelta(parameters, gradients,
                                                  learning_rule["rho"],
                                                  learning_rule["epsilon"])
    elif learning_rule["type"] == "momentum":
        updates = training.learning_rule_momentum(
            parameters, gradients, learning_rule["learning_rate"],
            learning_rule["momentum"])
    else:
        assert False, "Invalid learning rule: " + learning_rule["type"]

    # Compile training function
    train_model = theano.function(
        inputs=[x1_indices, x2_indices, x3_indices],
        outputs=outputs,
        updates=updates,
        givens={
            x1: train_x[x1_indices],
            x2: train_x[x2_indices],
            x3: train_x[x3_indices],
        },
        mode=theano_mode,
    )

    # Train model

    logger.info("Training Siamese triplets CNN")
    record_dict_fn = path.join(options_dict["model_dir"], "record_dict.pkl.gz")
    record_dict = training.train_fixed_epochs_with_validation(
        options_dict["n_max_epochs"],
        train_model=train_model,
        train_batch_iterator=train_batch_iterator,
        validate_model=validate_model,
        validate_batch_iterator=validate_batch_iterator,
        test_model=test_model,
        test_batch_iterator=test_batch_iterator,
        save_model_func=model.save,
        save_model_fn=path.join(options_dict["model_dir"], "model.pkl.gz"),
        record_dict_fn=record_dict_fn,
    )

    # Extrinsic evaluation

    # Pass data through the model
    logger.info("Performing same-different evaluation")
    layers_output_dict = apply_layers.apply_layers(
        options_dict["model_dir"], "dev",
        batch_size=645)  # batch size covers 10965 out of 10966 tokens
    utt_ids = sorted(layers_output_dict.keys())
    embeddings = np.array([layers_output_dict[i] for i in utt_ids])
    labels = data_io.swbd_utts_to_labels(utt_ids)

    # Perform same-different
    distances = pdist(embeddings, metric="cosine")
    matches = samediff.generate_matches_array(labels)
    ap, prb = samediff.average_precision(distances[matches == True],
                                         distances[matches == False])
    logger.info("Validation average precision: " + str(ap))
    ap_fn = path.join(options_dict["model_dir"], "dev_ap.txt")
    with open(ap_fn, "w") as f:
        f.write(str(ap) + "\n")
def apply_layers(model_dir, set, batch_size=None, i_layer=-1):

    logger.info(datetime.now())

    # Load the model options
    options_dict_fn = path.join(model_dir, "options_dict.pkl.gz")
    logger.info("Reading: " + options_dict_fn)
    f = smart_open(options_dict_fn)
    options_dict = pickle.load(f)
    # print options_dict
    f.close()

    # Load the dataset
    npz_fn = path.join(options_dict["data_dir"], "swbd." + set + ".npz")
    logger.info("Reading: " + npz_fn)
    npz = np.load(npz_fn)
    logger.info("Loaded " + str(len(npz.keys())) + " segments")

    # Load the model
    if batch_size is not None:
        options_dict["batch_size"] = batch_size
    else:
        options_dict["batch_size"] = len(npz.keys())
    model = load_model(options_dict)

    # Load data into Theano shared variable
    utt_ids = sorted(npz.keys())
    mats = np.array([npz[i] for i in utt_ids])
    logger.info("Data shape: " + str(mats.shape))
    logger.info("Formatting into Theano shared variable")
    shared_x = theano.shared(np.asarray(mats, dtype=theano.config.floatX), borrow=True)

    # Flatten data
    d_in = 39*200
    shared_x = shared_x.reshape((-1, d_in))
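    # Each segment is stored as a fixed 200-frame by 39-dimensional feature
    # patch (presumably MFCCs with deltas), flattened here into a 7800-dim
    # vector to match the flattened input the model was trained on.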

    # Compile function for passing segments through CNN layers
    x = model.input  # input to the tied layers
    i_batch = T.lscalar()
    layers_output = model.layers[i_layer].output
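    # i_layer selects which layer's activations to return; the default of -1
    # gives the output of the final layer.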
    apply_model = theano.function(
        inputs=[i_batch],
        outputs=layers_output,
        givens={
            x: shared_x[
                i_batch * options_dict["batch_size"] : 
                (i_batch + 1) * options_dict["batch_size"]
                ]
            }
        )

    logger.info(datetime.now())

    n_batches = mats.shape[0]/options_dict["batch_size"]
    logger.info("Passing data through in batches: " + str(n_batches))
    layers_outputs = []
    for i_batch in xrange(n_batches):
        batch_layers_outputs = apply_model(i_batch)
        layers_outputs.append(batch_layers_outputs)
    layers_outputs = np.vstack(layers_outputs)
    logger.info("Outputs shape: " + str(layers_outputs.shape))

    layers_output_dict = {}
    # for i , utt_id in enumerate(utt_ids):
    for i in xrange(layers_outputs.shape[0]):
        utt_id = utt_ids[i]
        layers_output_dict[utt_id] = layers_outputs[i]

    logger.info(datetime.now())

    return layers_output_dict
def train_fixed_epochs_with_validation(n_epochs,
                                       train_model,
                                       train_batch_iterator,
                                       validate_model,
                                       validate_batch_iterator,
                                       test_model=None,
                                       test_batch_iterator=None,
                                       save_model_func=None,
                                       save_model_fn=None,
                                       record_dict_fn=None):
    """
    Train for a fixed number of epochs, using validation to decide which model
    to save.

    Parameters
    ----------
    train_model : Theano function
        Should take input from `train_batch_iterator` and output the training
        loss. The function can provide more than one output, which is averaged.
        This is useful for example to output both negative log likelihood (the
        model loss) and zero-one loss (the number of errors).
    train_batch_iterator : generator
        Provides the training batches.
    validate_model : Theano function
        Should take input from `validate_batch_iterator` and output the
        validation loss. The function can provide more than one output (which
        would be averaged), but only the first output is used for validation.
    save_model_func : function
        If provided, this function is used to save the model to the file
        `save_model_fn` every time a new validation best model is found.
    save_model_fn : str
        The file to which the best model is written.
    record_dict_fn : str
        If provided, the current `record_dict` is saved to this file at the end
        of every epoch.

    Return
    ------
    record_dict : dict
        The dict key describes the statistic being tracked, while the dict
        value is a list of (epoch, statistic) tuples giving the statistic-value
        at a particular epoch.
    """

    record_dict = {}
    record_dict["train_loss"] = []  # each element is (epoch, loss)
    record_dict["validation_loss"] = [
    ]  # validation is not necessarily performed every epoch
    if test_model is not None:
        record_dict["test_loss"] = []  # and neither is testing
    # if validate_extrinsic is not None:
    #     record_dict["validation_extrinsic"] = []
    record_dict["epoch_time"] = []

    logger.info(datetime.now())

    # Training epochs
    best_validation_loss0 = np.inf
    test_loss = np.inf
    i_epoch_best = 0
    for i_epoch in xrange(n_epochs):

        # Loop over training batches
        # train_losses = []
        start_time = timeit.default_timer()
        train_losses = [train_model(*batch) for batch in train_batch_iterator]
        # for i_batch in xrange(n_train_batches):
        # for batch in train_batch_iterator()
        # Calculate training loss for this batch and update parameters
        # train_losses.append(train_model(*batch))

        # Validate the model
        validation_losses = [
            validate_model(*batch) for batch in validate_batch_iterator
        ]
        validation_loss = np.mean(validation_losses, axis=0)
        logger.info("Epoch " + str(i_epoch + 1) + ": "
                    "validation loss: " + str(validation_loss))
        record_dict["validation_loss"].append((i_epoch, validation_loss))

        # print math.isnan(validation_loss)
        if hasattr(validation_loss, "__len__"):
            validation_loss0 = validation_loss[0]
        else:
            validation_loss0 = validation_loss

        # If this is the best model, test and save
        if validation_loss0 < best_validation_loss0:

            best_validation_loss0 = validation_loss0
            i_epoch_best = i_epoch

            # Test model
            if test_model is not None:
                test_losses = [
                    test_model(*batch) for batch in test_batch_iterator
                ]
                test_loss = np.mean(test_losses, axis=0)
                logger.info("    Test loss: " + str(test_loss))
                record_dict["test_loss"].append((i_epoch, test_loss))

            # Write the best model
            if save_model_func is not None:
                f = smart_open(save_model_fn, "wb")
                save_model_func(f)
                f.close()

        # Training statistics for this epoch
        end_time = timeit.default_timer()
        train_loss = np.mean(train_losses, axis=0)
        epoch_time = end_time - start_time
        # logger.info("Training loss: " + str(train_loss)  # + ", " +
        # )
        logger.info("Time: %f" % (epoch_time) + " sec, " + "training loss: " +
                    str(train_loss)  # + ", " + 
                    )
        record_dict["epoch_time"].append((i_epoch, epoch_time))
        record_dict["train_loss"].append((i_epoch, train_loss))

        if record_dict_fn is not None:
            f = smart_open(record_dict_fn, "wb")
            pickle.dump(record_dict, f, -1)
            f.close()

    total_time = np.sum([i[1] for i in record_dict["epoch_time"]])
    logger.info("Training complete: %f min" % (total_time / 60.))
    logger.info("Best validation epoch: " + str(i_epoch_best + 1) + ", "
                "best validation loss: " + str(best_validation_loss0))
    if test_model is not None:
        logger.info("Test loss: " + str(test_loss))
    if save_model_func is not None:
        logger.info("Best validation model saved: " + save_model_fn)
    if record_dict_fn is not None:
        logger.info("Saved record: " + record_dict_fn)

    logger.info(datetime.now())

    return record_dict
def train_early_stopping(n_train_batches,
                         n_validation_batches,
                         train_model,
                         validate_model,
                         test_model=None,
                         n_test_batches=None,
                         n_max_epochs=1000,
                         n_batches_validation_frequency=None,
                         n_patience=5000,
                         patience_increase_factor=2,
                         improvement_threshold=0.995,
                         save_model_func=None,
                         save_model_fn=None,
                         record_dict_fn=None,
                         learning_rate_update=None):
    """
    Train model using early stopping, using the provided training function.

    Parameters
    ----------
    n_train_batches : int
        Total number of training batches.
    n_validation_batches : int
        Total number of validation batches.
    train_model : Theano function
        Should take as input a batch index and output the training loss and
        error (e.g. negative log likelihood and zero-one loss).
    validate_model : Theano function
        Should take as input a batch index and output the validation loss and
        error.
    test_model : Theano function
        Should take as input a batch index and output the test loss and error.
        If not provided, testing is not performed over the training iterations.
    n_test_batches : int
        Total number of test batches.
    n_batches_validation_frequency : int
        Number of batches between calculating the validation error; if not
        provided, is set to min(n_train_batches, n_patience / 2) which means
        that at a minimum validation will be performed every epoch (i.e. every
        time after seeing `n_train_batches` batches).
    n_patience : int
        Number of minibatches to consider at a minimum before completing
        training.
    patience_increase_factor : int
        When a new validation minimum is found, the number of seen minibatches
        are multiplied by this factor to give the new minimum number of
        minibatches before stopping.
    improvement_threshold : float
        The minimum relative improvement in validation error to warrant an
        increase in `n_patience` by `patience_increase_factor`.
    save_model_func : function
        If provided, this function is used to save the model to the file
        `save_model_fn` every time a new validation best model is found.
    save_model_fn : str
        The file to which the current model is written.
    record_dict_fn : str
        If provided, the current `record_dict` is saved to this file at the end
        of every epoch.
    learning_rate_update : Theano function
        If provided, this function is called (without any parameters) at the
        beginning of every epoch to update the learning rate.

    Returns
    -------
    record_dict : dict
        Each dict key names the statistic being tracked, while the dict value
        is a list of (epoch, statistic) tuples giving the value of that
        statistic at a particular epoch.
    """

    assert (save_model_func is None) or (save_model_fn is not None)
    assert (test_model is None) or (n_test_batches is not None)

    # Set default if not provided
    if n_batches_validation_frequency is None:
        n_batches_validation_frequency = min(n_train_batches, n_patience // 2)
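        # e.g. (hypothetical values) n_train_batches=100 and n_patience=5000
        # give min(100, 5000 // 2) = 100, i.e. validation once per epoch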

    record_dict = {}
    record_dict["train_loss"] = []  # each element is (epoch, loss)
    record_dict["train_error"] = []
    record_dict["validation_loss"] = [
    ]  # validation is not necessarily performed every epoch
    record_dict["validation_error"] = []
    if test_model is not None:
        record_dict["test_loss"] = []  # and neither is testing
        record_dict["test_error"] = []
    record_dict["epoch_time"] = []

    # Training epochs
    i_epoch = 0
    done_looping = False
    best_validation_error = np.inf
    n_batches_best = 0
    i_epoch_best = 0
    while (i_epoch < n_max_epochs) and (not done_looping):

        train_losses = []
        train_errors = []
        start_time = timeit.default_timer()

        if learning_rate_update is not None:
            learning_rate = learning_rate_update(i_epoch)

        # Minibatches
        for i_batch in xrange(n_train_batches):

            # Calculate cost for this minibatch, updating the parameters
            minibatch_train_loss, minibatch_train_errors = train_model(i_batch)
            train_errors.append(minibatch_train_errors)
            train_losses.append(minibatch_train_loss)

            n_seen_batches = i_epoch * n_train_batches + i_batch

            # Use n_seen_batches + 1 to avoid checking very first batch
            if (n_seen_batches + 1) % n_batches_validation_frequency == 0:

                # Validate model
                validation_losses_errors = [
                    validate_model(i) for i in xrange(n_validation_batches)
                ]
                validation_loss = np.mean(
                    [i[0] for i in validation_losses_errors])
                validation_error = np.mean(
                    [i[1] for i in validation_losses_errors])

                logger.info(
                    "Validation: epoch %i, minibatch %i/%i, loss %f, error %.2f%%"
                    % (i_epoch + 1, i_batch + 1, n_train_batches,
                       validation_loss, validation_error * 100.))
                record_dict["validation_loss"].append(
                    (i_epoch, validation_loss))
                record_dict["validation_error"].append(
                    (i_epoch, validation_error))

                # Check validation to see if we have new best model
                if validation_error < best_validation_error:
                    if validation_error < best_validation_error * improvement_threshold:
                        n_patience = max(
                            n_patience,
                            n_seen_batches * patience_increase_factor)
                    best_validation_error = validation_error
                    n_batches_best = n_seen_batches
                    i_epoch_best = i_epoch

                    if test_model is not None:
                        test_losses_errors = [
                            test_model(i) for i in xrange(n_test_batches)
                        ]
                        test_loss = np.mean([i[0] for i in test_losses_errors])
                        test_error = np.mean(
                            [i[1] for i in test_losses_errors])

                        logger.info("\tTest: loss %f, error %.2f%%" %
                                    (test_loss, test_error * 100.))
                        record_dict["test_loss"].append((i_epoch, test_loss))
                        record_dict["test_error"].append((i_epoch, test_error))

                    # Write the best model
                    if save_model_func is not None:
                        f = smart_open(save_model_fn, "wb")
                        save_model_func(f)
                        f.close()

            # Check if training is done
            if n_patience <= n_seen_batches:
                done_looping = True
                break

        end_time = timeit.default_timer()
        epoch_time = end_time - start_time
        record_dict["epoch_time"].append((i_epoch, epoch_time))

        cur_train_loss = np.mean(train_losses)
        cur_train_error = np.mean(train_errors)
        if learning_rate_update is not None:
            logger.info(
                "Train: lr %f, epoch %i, %f sec/epoch, loss %f, error %.2f%%" %
                (learning_rate, i_epoch + 1, epoch_time, cur_train_loss,
                 cur_train_error * 100.))
        else:
            logger.info(
                "Train: epoch %i, %f sec/epoch, loss %f, error %.2f%%" %
                (i_epoch + 1, epoch_time, cur_train_loss,
                 cur_train_error * 100.))
        record_dict["train_loss"].append((i_epoch, cur_train_loss))
        record_dict["train_error"].append((i_epoch, cur_train_error))

        if record_dict_fn is not None:
            f = smart_open(record_dict_fn, "wb")
            pickle.dump(record_dict, f, -1)
            f.close()

        i_epoch += 1

    total_time = np.sum([i[1] for i in record_dict["epoch_time"]])
    logger.info(
        "Training complete: %d epochs, %f sec/epoch, total time %f min" %
        (i_epoch, 1. * total_time / i_epoch, total_time / 60.))
    logger.info(
        "Best validation: after seeing %d minibatches in epoch %d, error %.2f%%"
        % (n_batches_best, i_epoch_best + 1, best_validation_error * 100.))
    if test_model is not None:
        logger.info("Test error: %.2f%%" % (test_error * 100.))
    if save_model_func is not None:
        logger.info("Best validation model saved: " + save_model_fn)
    if record_dict_fn is not None:
        logger.info("Saved record: " + record_dict_fn)

    return record_dict
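

# Minimal usage sketch (not part of the original module): the compiled Theano
# train/validate functions are replaced by hypothetical stand-ins that return
# (loss, error) tuples, purely to illustrate the expected calling convention.
def _example_train_early_stopping():
    def dummy_train_model(i_batch):
        # Hypothetical loss/error values that shrink with the batch index
        return 1.0 / (1 + i_batch), 0.5 / (1 + i_batch)

    def dummy_validate_model(i_batch):
        return 0.9 / (1 + i_batch), 0.4 / (1 + i_batch)

    return train_early_stopping(
        n_train_batches=10,
        n_validation_batches=5,
        train_model=dummy_train_model,
        validate_model=dummy_validate_model,
        n_max_epochs=3,
        n_patience=50,
        )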