Example 1
def score(data: str, load_from: str, batch_size: int, **kwargs):
    """Scores a text using a trained language model. See argument description in `bin/romanesco`."""

    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, 'vocab.json'))

    raw_data = reader.read(data, vocab)

    # the last two return values are the final and initial RNN states, needed to
    # carry the state across batches while scoring
    inputs, targets, loss, _, _, _, current_state, init_state = define_computation_graph(
        vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, MODEL_FILENAME))

        # start from an all-zero RNN state (cell and hidden state for every layer)
        _current_state = np.zeros((NUM_LAYERS, 2, batch_size, STATE_SIZE))
        total_loss = 0.0
        total_iter = 0
        for x, y in reader.iterate(raw_data, batch_size, NUM_STEPS):
            # fetch the final state and feed it back in for the next batch
            batch_loss, _current_state = session.run(
                [loss, current_state],
                feed_dict={
                    inputs: x,
                    targets: y,
                    init_state: _current_state
                })
            total_loss += batch_loss
            total_iter += 1
        perplexity = np.exp(total_loss / total_iter)
        return perplexity
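These snippets are excerpts from a larger module and assume the surrounding imports (os, logging, numpy as np, tensorflow as tf for TensorFlow 1.x) as well as project helpers such as Vocabulary, reader and define_computation_graph. The array fed as init_state above has shape (NUM_LAYERS, 2, batch_size, STATE_SIZE); the sketch below illustrates how such a placeholder is typically split into per-layer LSTM state tuples inside a graph definition. It is an assumption for illustration, not the actual define_computation_graph, and the layer and state sizes are made up.

import tensorflow as tf

NUM_LAYERS = 2      # assumed values, for illustration only
STATE_SIZE = 512
batch_size = 32

# placeholder matching the zero array fed as `init_state` above; the dimension
# of size 2 holds the cell state (c) and the hidden state (h) of each layer
init_state = tf.placeholder(tf.float32, [NUM_LAYERS, 2, batch_size, STATE_SIZE])

# one (c, h) tuple per layer, as expected by a multi-layer LSTM cell
per_layer = tf.unstack(init_state, axis=0)
rnn_state = tuple(
    tf.nn.rnn_cell.LSTMStateTuple(layer[0], layer[1]) for layer in per_layer)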
Example 2
def train(data: str,
          epochs: int = C.NUM_EPOCHS,
          batch_size: int = C.BATCH_SIZE,
          hidden_size: int = C.HIDDEN_SIZE,
          embedding_size: int = C.EMBEDDING_SIZE,
          vocab_max_size: int = C.VOCAB_SIZE,
          save_to: str = C.MODEL_PATH,
          log_to: str = C.LOGS_PATH,
          num_steps: int = C.NUM_STEPS,
          **kwargs):
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create vocabulary to map words to ids
    vocab = Vocabulary()
    vocab.build(data, max_size=vocab_max_size)
    vocab.save(os.path.join(save_to, C.VOCAB_FILENAME))

    # convert training data to list of word ids
    raw_data = reader.read(data, vocab)

    # define computation graph
    inputs, targets, loss, train_step, _, summary = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=batch_size,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs for TensorBoard
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())
        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            for x, y in reader.iterate(raw_data, batch_size, num_steps):
                batch_loss, _, s = session.run(
                    [loss, train_step, summary],
                    feed_dict={
                        inputs: x,
                        targets: y
                    })
                summary_writer.add_summary(s, total_iter)
                total_loss += batch_loss
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f",
                         epoch, perplexity)
            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))
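The perplexity logged after each epoch is the exponentiated mean of the per-batch losses, assuming `loss` is an average per-token cross-entropy in nats. A tiny worked example with made-up numbers:

import numpy as np

batch_losses = [5.1, 4.7, 4.4]   # made-up average per-token losses
perplexity = np.exp(sum(batch_losses) / len(batch_losses))
print('%.2f' % perplexity)       # ~113.7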
Example 3
def train(data: str, epochs: int, batch_size: int, vocab_max_size: int,
          save_to: str, log_to: str, **kwargs):
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # create vocabulary to map words to ids
    vocab = Vocabulary()
    vocab.build(data, max_size=vocab_max_size)
    vocab.save(os.path.join(save_to, VOCAB_FILENAME))

    # convert training data to list of word ids
    raw_data = reader.read(data, vocab)

    # define computation graph
    (inputs, targets, loss, train_step, _, summary,
     current_state, init_state) = define_computation_graph(vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs for TensorBoard
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())
        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            # start each epoch from an all-zero RNN state (cell and hidden per layer)
            _current_state = np.zeros((NUM_LAYERS, 2, batch_size, STATE_SIZE))
            total_loss = 0.0
            total_iter = 0
            for x, y in reader.iterate(raw_data, batch_size, NUM_STEPS):
                # feed the previous final state back in so the RNN state is carried
                # across batches within an epoch (truncated BPTT)
                batch_loss, _, _current_state, s = session.run(
                    [loss, train_step, current_state, summary],
                    feed_dict={
                        inputs: x,
                        targets: y,
                        init_state: _current_state
                    })
                summary_writer.add_summary(s, total_iter)
                total_loss += batch_loss
                total_iter += 1
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f",
                         epoch, perplexity)
            saver.save(session, os.path.join(save_to, MODEL_FILENAME))
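`reader.iterate` is not shown on this page; the loops above assume it yields (x, y) batches of shape (batch_size, num_steps), with the targets being the inputs shifted by one token. A rough sketch of such an iterator, as an assumption for illustration rather than romanesco's actual reader:

import numpy as np

def iterate(raw_data, batch_size, num_steps):
    """Yield (inputs, targets) batches of shape (batch_size, num_steps)."""
    data = np.array(raw_data, dtype=np.int32)
    batch_len = len(data) // batch_size
    data = data[:batch_size * batch_len].reshape(batch_size, batch_len)
    for i in range((batch_len - 1) // num_steps):
        x = data[:, i * num_steps:(i + 1) * num_steps]
        y = data[:, i * num_steps + 1:(i + 1) * num_steps + 1]  # shifted by one token
        yield x, y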
Example 4
def score(data: str,
          load_from: str = C.MODEL_PATH,
          batch_size: int = C.BATCH_SIZE,
          hidden_size: int = C.HIDDEN_SIZE,
          embedding_size: int = C.EMBEDDING_SIZE,
          num_steps: int = C.NUM_STEPS,
          **kwargs):
    """Scores a text using a trained language model. See argument description in `bin/romanesco`."""

    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, C.VOCAB_FILENAME))

    raw_data = reader.read(data, vocab)
    data_length = len(raw_data)

    if data_length < num_steps:
        logging.warning(
            "Input data is shorter than `num_steps`. Reducing `num_steps` to fit the data."
        )
        num_steps = data_length - 1

    if data_length < batch_size * num_steps:
        logging.warning(
            "Input data is shorter than `batch_size` * `num_steps`. Setting batch size to 1."
        )
        batch_size = 1

    inputs, targets, loss, _, _, _ = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=batch_size,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        total_loss = 0.0
        total_iter = 0
        for x, y in reader.iterate(raw_data, batch_size, num_steps):
            batch_loss = session.run(loss, feed_dict={inputs: x, targets: y})
            total_loss += batch_loss
            total_iter += 1
        perplexity = np.exp(total_loss / total_iter)
        return perplexity
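A hypothetical call of the score() function above on a held-out text; the paths are illustrative only, and the actual command-line entry point is the bin/romanesco script referenced in the docstrings:

test_perplexity = score(data='data/test.txt', load_from='model', batch_size=1)
print('Perplexity on test data: %.2f' % test_perplexity)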