Example #1
def score(data: str, load_from: str, batch_size: int, **kwargs):
    """Scores a text using a trained language model. See argument description in `bin/romanesco`."""

    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, 'vocab.json'))

    raw_data = reader.read(data, vocab)

    inputs, targets, loss, _, _, _, current_state, init_state = define_computation_graph(
        vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, MODEL_FILENAME))

        _current_state = np.zeros((NUM_LAYERS, 2, batch_size, STATE_SIZE))
        total_loss = 0.0
        total_iter = 0
        for x, y in reader.iterate(raw_data, batch_size, NUM_STEPS):
            # fetch the updated RNN state so it is propagated across batches
            l, _current_state = session.run([loss, current_state],
                                            feed_dict={
                                                inputs: x,
                                                targets: y,
                                                init_state: _current_state
                                            })
            total_loss += l
            total_iter += 1
        perplexity = np.exp(total_loss / total_iter)
        return perplexity
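
A minimal call sketch for the variant above. The file and folder names are illustrative assumptions; `load_from` must point to a directory containing the `vocab.json` and the model checkpoint written during training.

# hypothetical paths: a trained model in 'model/' and a tokenized test set 'test.txt'
ppl = score(data='test.txt', load_from='model', batch_size=1)
print('Test perplexity: {:.2f}'.format(ppl))
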
Example #2
def train(data: str,
          epochs: int = C.NUM_EPOCHS,
          batch_size: int = C.BATCH_SIZE,
          hidden_size: int = C.HIDDEN_SIZE,
          embedding_size: int = C.EMBEDDING_SIZE,
          vocab_max_size: int = C.VOCAB_SIZE,
          save_to: str = C.MODEL_PATH,
          log_to: str = C.LOGS_PATH,
          num_steps: int = C.NUM_STEPS,
          **kwargs):
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create vocabulary to map words to ids
    vocab = Vocabulary()
    vocab.build(data, max_size=vocab_max_size)
    vocab.save(os.path.join(save_to, C.VOCAB_FILENAME))

    # convert training data to list of word ids
    raw_data = reader.read(data, vocab)

    # define computation graph
    inputs, targets, loss, train_step, _, summary = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=batch_size,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())
        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            for x, y in reader.iterate(raw_data, batch_size, num_steps):
                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict={
                                          inputs: x,
                                          targets: y
                                      })
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f",
                         epoch, perplexity)
            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))
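
A hedged usage sketch for this variant, assuming a training file `train.txt` exists and that the folders behind `save_to` and `log_to` have already been created (unlike Example #3, this function does not create them itself); all other settings fall back to the `C.*` defaults.

# hypothetical call with a few explicit hyperparameters
train(data='train.txt',
      epochs=5,
      batch_size=32,
      save_to='model',
      log_to='logs')
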
Example #3
def train(data: str, epochs: int, batch_size: int, vocab_max_size: int,
          save_to: str, log_to: str, **kwargs):
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # create vocabulary to map words to ids
    vocab = Vocabulary()
    vocab.build(data, max_size=vocab_max_size)
    vocab.save(os.path.join(save_to, VOCAB_FILENAME))

    # convert training data to list of word ids
    raw_data = reader.read(data, vocab)

    # define computation graph
    inputs, targets, loss, train_step, _, summary, current_state, init_state = define_computation_graph(
        vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())
        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to,
                                               graph=tf.get_default_graph())
        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            _current_state = np.zeros((NUM_LAYERS, 2, batch_size, STATE_SIZE))
            total_loss = 0.0
            total_iter = 0
            for x, y in reader.iterate(raw_data, batch_size, NUM_STEPS):

                l, _, _current_state, s = session.run(
                    [loss, train_step, current_state, summary],
                    feed_dict={
                        inputs: x,
                        targets: y,
                        init_state: _current_state
                    })
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f",
                         epoch, perplexity)
            saver.save(session, os.path.join(save_to, MODEL_FILENAME))
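
A usage sketch for the stateful variant above. Every argument is required here (there are no `C.*` defaults in the signature); the values are assumptions for illustration, and missing output folders are created by the function itself.

# hypothetical call: small vocabulary cap, output folders created on demand
train(data='train.txt',
      epochs=3,
      batch_size=16,
      vocab_max_size=10000,
      save_to='model',
      log_to='logs')
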
Example #4
def score(data: str,
          load_from: str = C.MODEL_PATH,
          batch_size: int = C.BATCH_SIZE,
          hidden_size: int = C.HIDDEN_SIZE,
          embedding_size: int = C.EMBEDDING_SIZE,
          num_steps: int = C.NUM_STEPS,
          **kwargs):
    """Scores a text using a trained language model. See argument description in `bin/romanesco`."""

    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, C.VOCAB_FILENAME))

    raw_data = reader.read(data, vocab)
    data_length = len(raw_data)

    if data_length < num_steps:
        logging.warning(
            "Length of input data is shorter than NUM_STEPS. Will try to reduce NUM_STEPS."
        )
        num_steps = data_length - 1

    if data_length < batch_size * num_steps:
        logging.warning(
            "Length of input data is shorter than BATCH_SIZE * NUM_STEPS. Will try to set batch size to 1."
        )
        batch_size = 1

    inputs, targets, loss, _, _, _ = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=batch_size,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        total_loss = 0.0
        total_iter = 0
        for x, y in reader.iterate(raw_data, batch_size, num_steps):
            l = session.run([loss], feed_dict={inputs: x, targets: y})
            total_loss += l[0]
            total_iter += 1
        perplexity = np.exp(total_loss / total_iter)
        return perplexity
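
A sketch of scoring a short file with this variant; the file name is an assumption. If the input holds fewer than `batch_size * num_steps` tokens, the fallbacks above shrink `num_steps` and force `batch_size` to 1 before the graph is built.

# hypothetical input file; hidden_size, embedding_size and num_steps fall back to the C.* defaults
ppl = score(data='short.txt', load_from='model')
print('Perplexity: %.2f' % ppl)
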
Example #5
def sample(length: int, load_from: str, first_symbol: str = None, **kwargs):
    """Generates a text by sampling from a trained language model. See argument
    description in `bin/romanesco`."""

    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, 'vocab.json'))

    inputs, targets, _, _, logits, _ = define_computation_graph(vocab.size, 1)

    saver = tf.train.Saver()

    sampled_sequence = []

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, MODEL_FILENAME))

        if first_symbol:
            try:
                sampled_symbol = vocab.get_id(first_symbol)
            except KeyError:
                logging.error('Unknown symbol `{0}`. Try with another start symbol.'.format(first_symbol))
                sys.exit(0)
        else:
            sampled_symbol = vocab.get_random_id()

        x = np.zeros(NUM_STEPS, dtype=int) # padding with zeros (UNK)
        y = np.zeros(NUM_STEPS, dtype=int) # we don't care about gold targets here

        UNK_ID = vocab.get_id(UNK)

        for _ in range(length):
            sampled_sequence.append(sampled_symbol)
            x = np.roll(x, -1)
            x[NUM_STEPS - 1] = sampled_symbol
            l = session.run([logits], feed_dict={inputs: [x], targets: [y]})
            next_symbol_logits = l[0][0][-1] # first returned session variable, first batch, last symbol
            next_symbol_probs = softmax(next_symbol_logits)
            # avoid generating unknown words
            sampled_symbol = UNK_ID
            while sampled_symbol == UNK_ID: # TODO: avoid infinite loop
                sampled_symbol = np.random.choice(range(vocab.size), p=next_symbol_probs)

    words = vocab.get_words(sampled_sequence)
    return ' '.join(words).replace(' ' + EOS + ' ', '\n') # OPTIMIZE: remove <eos> at the very end
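
A minimal sampling sketch for this variant. The start symbol is an assumption and must be present in the vocabulary, otherwise the function logs an error and exits.

# sample 50 symbols, primed with a hypothetical start word
text = sample(length=50, load_from='model', first_symbol='the')
print(text)
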
Example #6
def sample(length: int = C.SAMPLE_LENGTH,
           load_from: str = C.MODEL_PATH,
           first_symbols: List[str] = [],
           hidden_size: int = C.HIDDEN_SIZE,
           embedding_size: int = C.EMBEDDING_SIZE,
           num_steps: int = C.NUM_STEPS,
           **kwargs):
    """Generates a text by sampling from a trained language model. See argument
    description in `bin/romanesco`."""

    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, C.VOCAB_FILENAME))

    inputs, targets, _, _, logits, _ = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=1,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    sampled_sequence = []

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        if first_symbols != []:
            try:
                first_symbol_ids = [
                    vocab.get_id(symbol, strict=True)
                    for symbol in first_symbols
                ]
            except KeyError:
                logging.error(
                    'Unknown first symbol. Try with other first symbols.')
                sys.exit(0)
        else:
            # if no prime text, then just sample a single symbol
            first_symbol_ids = [vocab.get_random_id()]

        x = np.zeros(num_steps, dtype=int)  # padding with zeros (UNK)
        y = np.zeros(num_steps, dtype=int)  # we don't care about gold targets here

        UNK_ID = vocab.get_id(C.UNK)

        sampled_symbol = first_symbol_ids.pop(0)

        for _ in range(length):
            sampled_sequence.append(sampled_symbol)
            x = np.roll(x, -1)
            x[num_steps - 1] = sampled_symbol
            l = session.run([logits], feed_dict={inputs: [x], targets: [y]})
            next_symbol_logits = l[0][0][-1]  # first returned session variable, first batch, last symbol
            next_symbol_probs = softmax(next_symbol_logits)

            try:
                sampled_symbol = first_symbol_ids.pop(0)
            # list of priming symbols is exhausted
            except IndexError:
                # avoid generating unknown words
                sampled_symbol = UNK_ID
                while sampled_symbol == UNK_ID:  # TODO: avoid infinite loop
                    sampled_symbol = np.random.choice(range(vocab.size),
                                                      p=next_symbol_probs)

    words = vocab.get_words(sampled_sequence)

    for index, word in enumerate(words):
        if word == C.EOS:
            words[index] = "\n"

    return ' '.join(words)
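
A usage sketch for the priming variant above; the priming words are assumptions and must all be in the vocabulary (`strict=True`), otherwise the function exits. Once the priming list is exhausted, the remaining symbols are drawn from the model's distribution.

# hypothetical priming sequence; remaining symbols are sampled from the model
text = sample(length=100, first_symbols=['once', 'upon'], load_from='model')
print(text)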