Example #1
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embeddings.shape[1:],
                                         dtype='float32',
                                         data=embeddings[0, :, :, :])

                sentence_id += 1
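A minimal usage sketch for this dumper; the file names below are placeholders for bilm-style vocab/options/weights/dataset files:

import h5py

# Hypothetical paths -- substitute real vocab/options/weights/dataset files.
dump_bilm_embeddings('vocab.txt', 'dataset.txt',
                     'options.json', 'weights.hdf5',
                     'elmo_embeddings.hdf5')

# Each sentence is stored under its line index with shape
# (n_lm_layers, n_tokens, dim), matching the loop above.
with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
    print(fin['0'][...].shape)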
Example #2
def load_batcher(data_params, cuda):
    languages, lang_names = [], []
    # Load the data into languages
    data_dir = data_params['data_dir']
    for w in data_params['languages']:
        lang = Language(
            name=w['name'],
            cuda=cuda,
            mode=data_params['mode'],
            mean_center=data_params['mean_center'],
            unit_norm=data_params['unit_norm']
        )
        lang_names.append(w['name'])
        lang.load(w['filename'], data_dir, max_freq=data_params['max_freq'])
        languages.append(lang)
    batcher = Batcher(languages)
    if 'supervised' in data_params:
        filename = data_params['supervised']['fname']
        random = data_params['supervised']['random']
        max_count = data_params['supervised']['max_count']
        if data_params["data_dir"] == "./muse_data/":
            sup_dir_name = os.path.join(data_dir, "crosslingual", "dictionaries")
        elif data_params["data_dir"] == "./vecmap_data/":
            sup_dir_name = os.path.join(data_dir, "dictionaries")
        else:
            raise ValueError(
                "Unsupported data_dir %r; cannot locate the supervised "
                "dictionaries" % data_params["data_dir"])
        batcher.load_from_supervised(
            filename, lang_names[0], lang_names[1],
            sup_dir_name, random=random, max_count=max_count)
    return batcher
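For reference, a data_params dictionary of roughly this shape would satisfy every key the loader reads; all values below are illustrative placeholders, not the project's actual config:

data_params = {
    'data_dir': './muse_data/',   # must match one of the two branches above
    'mode': 'seq',                # forwarded to Language; valid values depend on that class
    'mean_center': True,
    'unit_norm': True,
    'max_freq': 200000,
    'languages': [
        {'name': 'en', 'filename': 'wiki.en.vec'},
        {'name': 'es', 'filename': 'wiki.es.vec'},
    ],
    'supervised': {               # optional; enables dictionary-based batches
        'fname': 'en-es.0-5000.txt',
        'random': False,
        'max_count': 5000,
    },
}
batcher = load_batcher(data_params, cuda=False)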
Example #3
def dump_embeddings_from_dynamic_bilm(option_file,
                                      weight_file,
                                      word_file,
                                      char_file,
                                      data_file,
                                      output_file,
                                      sent_vec=False,
                                      sent_vec_type='last',
                                      cell_reset=False):
    """
    Get elmo embeddings
    """

    with open(option_file, 'r') as fin:
        options = json.load(fin)

    # add one so that 0 is the mask value
    options['char_cnn']['n_characters'] += 1

    max_word_length = options['char_cnn']['max_characters_per_token']
    batcher = Batcher(word_file, char_file, max_word_length)

    # 1D: batch_size, 2D: time_steps, 3D: max_characters_per_token
    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = DynamicLanguageModel(options, weight_file, cell_reset=cell_reset)
    ops = model(ids_placeholder)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        print('Computing ELMo...')
        sentence_id = 0
        with open(data_file, 'r') as fin, h5py.File(output_file, 'w') as fout:
            for line in fin:
                if (sentence_id + 1) % 100 == 0:
                    print("%d" % (sentence_id + 1), flush=True, end=" ")

                sentence = line.rstrip().split()
                char_ids = batcher.batch_sentences([sentence])

                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})

                # 1D: 3(ELMo layers), 2D: n_words, 3D: vector dim
                embeddings = embeddings[0, :, :, :]
                if sent_vec:
                    embeddings = np.mean(embeddings, axis=1)
                    if sent_vec_type == 'last':
                        embeddings = embeddings[-1]
                    else:
                        embeddings = np.mean(embeddings, axis=0)
                else:
                    # 1D: n_words, 2D: 3(ELMo layers), 3D: vector dim
                    embeddings = np.transpose(embeddings, (1, 0, 2))

                fout.create_dataset(name=str(sentence_id), data=embeddings)
                sentence_id += 1
        print('Finished')
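A usage sketch showing both output modes; every path below is a placeholder:

# Per-token vectors: one dataset of shape (n_words, 3, dim) per sentence.
dump_embeddings_from_dynamic_bilm('options.json', 'weights.hdf5',
                                  'words.txt', 'chars.txt',
                                  'sentences.txt', 'elmo_tokens.hdf5')

# One vector per sentence: token-averaged, taken from the top LM layer.
dump_embeddings_from_dynamic_bilm('options.json', 'weights.hdf5',
                                  'words.txt', 'chars.txt',
                                  'sentences.txt', 'elmo_sents.hdf5',
                                  sent_vec=True, sent_vec_type='last')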
Example #4
    def __init__(self, model_file_path):
        model_name = re.findall(r'train_\d+', model_file_path)[0] + '_' + \
                     re.findall(r'model_\d+_\d+\.\d+', model_file_path)[0]
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        self.model = Model(model_file_path, is_eval=True)
Example #5
def main(unused_argv):
  if len(unused_argv) != 1: # raise an error if extra, unrecognized flags were passed
    raise Exception("Problem with flags: %s" % unused_argv)

  tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
  tf.logging.info('Starting seq2seq_attention in %s mode...', (PARAMS.mode))

  # Change log_root to PARAMS.log_root/PARAMS.exp_name and create the dir if necessary
  PARAMS.log_root = os.path.join(PARAMS.log_root, PARAMS.exp_name)
  if not os.path.exists(PARAMS.log_root):
    if PARAMS.mode == "train":
      os.makedirs(PARAMS.log_root)
    else:
      raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (PARAMS.log_root))

  vocab = Vocab(PARAMS.vocab_path, PARAMS.vocab_size) # create a vocabulary

  # If in decode mode, set batch_size = beam_size
  # Reason: in decode mode, we decode one example at a time.
  # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
  if PARAMS.mode == 'decode':
    PARAMS.batch_size = PARAMS.beam_size

  # If single_pass=True, check we're in decode mode
  if PARAMS.single_pass and PARAMS.mode != 'decode':
    raise Exception("The single_pass flag should only be True in decode mode")

  # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
  hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen']
  hps_dict = {}
  for key, val in PARAMS.__flags.items(): # for each flag
    if key in hparam_list: # if it's in the list
      hps_dict[key] = val # add it to the dict
  hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

  # Create a batcher object that will create minibatches of data
  batcher = Batcher(PARAMS.data_path, vocab, hps, single_pass=PARAMS.single_pass)

  tf.set_random_seed(111) # a seed value for randomness

  if hps.mode == 'train':
    print("creating model...")
    model = AttHistCopyModel(hps, vocab)
    setup_training(model, batcher)
  elif hps.mode == 'eval':
    model = AttHistCopyModel(hps, vocab)
    run_eval(model, batcher, vocab)
  elif hps.mode == 'decode':
    decode_model_hps = hps  # This will be the hyperparameters for the decoder model
    decode_model_hps = hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
    model = AttHistCopyModel(decode_model_hps, vocab)
    decoder = BeamSearchDecoder(model, batcher, vocab)
    decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
  else:
    raise ValueError("The 'mode' flag must be one of train/eval/decode")
Example #6
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))

        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)

        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #7
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    '''
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    '''
    with open(options_file, 'r') as fin:
        options = json.load(fin)
    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.placeholder('int32',
                                     shape=(None, None, max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)  # look up the actual token string for this id
            char_ids = batcher.batch_sentences(
                [[token]])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(embedding_op,
                                        feed_dict={ids_placeholder: char_ids})

    with h5py.File(outfile, 'w') as fout:
        ds = fout.create_dataset('embedding',
                                 embeddings.shape,
                                 dtype='float32',
                                 data=embeddings)
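Per the docstring, the dumped file can later be passed as the embedding_weight_file when constructing a BidirectionalLanguageModel; reading it back directly is also simple (a sketch, with a placeholder file name):

import h5py

with h5py.File('token_embeddings.hdf5', 'r') as fin:
    token_embeddings = fin['embedding'][...]   # shape (n_tokens, embed_dim)
print(token_embeddings.shape)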
Example #8
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = re.findall(r'train_\d+', model_file_path)[0] + '_' + \
                     re.findall(r'model_\d+_\d+\.\d+', model_file_path)[0]
        print('o MODEL NAME: ', model_name)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:  #  and counter <= 100 # 11490
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass  # no [STOP] token in the output; keep all decoded words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 10 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()
            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # The batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # Prepare decoder batch
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = paddle.to_tensor(latest_tokens)
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (paddle.stack(all_state_h, 0).unsqueeze(0),
                     paddle.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = paddle.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = paddle.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            log_probs = paddle.log(final_dist)
            topk_log_probs, topk_ids = paddle.topk(log_probs,
                                                   config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                # for each of the top 2*beam_size hypotheses
                for j in range(config.beam_size * 2):
                    new_beam = h.extend(token=topk_ids[i, j].numpy()[0],
                                        log_prob=topk_log_probs[i, j].numpy()[0],
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if (len(beams) == config.beam_size
                        or len(results) == config.beam_size):
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
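The search relies on a small hypothesis container providing extend, latest_token, and avg_log_prob. A minimal sketch of such a Beam class, inferred from the calls above rather than taken from the project:

class Beam(object):
    def __init__(self, tokens, log_probs, state, context, coverage):
        self.tokens = tokens        # token ids decoded so far
        self.log_probs = log_probs  # log prob of each decoded token
        self.state = state          # (h, c) decoder state for this hypothesis
        self.context = context      # attention context vector
        self.coverage = coverage    # coverage vector, or None

    def extend(self, token, log_prob, state, context, coverage):
        # Return a new hypothesis with one more token appended.
        return Beam(tokens=self.tokens + [token],
                    log_probs=self.log_probs + [log_prob],
                    state=state, context=context, coverage=coverage)

    @property
    def latest_token(self):
        return self.tokens[-1]

    @property
    def avg_log_prob(self):
        # Length-normalized score used by sort_beams(); one common choice.
        return sum(self.log_probs) / len(self.tokens)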
Example #9
class Trainer(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(
            config.train_data_path,
            self.vocab,
            mode='train',
            batch_size=config.batch_size,
            single_pass=False)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))

        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)

        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'encoder': self.model.encoder.state_dict(),
            'decoder': self.model.decoder.state_dict(),
            'reduce_state': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }
        model_save_dir = os.path.join(self.model_dir, 'model_%06d_%.8f' %
                                      (iter, running_avg_loss))
        for k in state:
            model_save_path = os.path.join(model_save_dir, '%s.params' % k)
            paddle.save(state[k], model_save_path)
        return model_save_dir

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        assert len(params) == 31
        self.optimizer = Adagrad(
            parameters=params,
            learning_rate=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc,
            epsilon=1.0e-10,
            grad_clip=paddle.nn.ClipGradByGlobalNorm(
                clip_norm=config.max_grad_norm))

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            start_iter = int(model_file_path.split('_')[-2])
            start_loss = float(
                model_file_path.split('_')[-1].replace(os.sep, ''))

            if not config.is_coverage:
                self.optimizer.set_state_dict(
                    paddle.load(
                        os.path.join(model_file_path, 'optimizer.params')))

        return start_iter, start_loss

    def train_one_batch(self, batch, iter):

        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)

        self.optimizer.clear_gradients()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]

            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = \
                self.model.decoder(y_t_1, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                                   c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)

            target = target_batch[:, di]
            add_index = paddle.arange(0, target.shape[0])
            new_index = paddle.stack([add_index, target], axis=1)
            gold_probs = paddle.gather_nd(final_dist, new_index).squeeze()
            step_loss = -paddle.log(gold_probs + config.eps)

            if config.is_coverage:
                step_coverage_loss = paddle.sum(
                    paddle.minimum(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = paddle.sum(paddle.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = paddle.mean(batch_avg_loss)

        loss.backward()
        self.optimizer.minimize(loss)

        return loss.numpy()[0]

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            iter += 1
            print(
                'global step %d/%d, step loss: %.8f, running avg loss: %.8f, speed: %.2f step/s'
                % (iter, n_iters, loss, running_avg_loss,
                   1.0 / (time.time() - start)))
            start = time.time()
            if iter % 5000 == 0 or iter == 1000:
                model_save_dir = self.save_model(running_avg_loss, iter)
                print(
                    'Saved model for iter %d with running avg loss %.8f to directory: %s'
                    % (iter, running_avg_loss, model_save_dir))
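A minimal driver for this trainer, assuming the config module imported above; the iteration count is illustrative only:

if __name__ == '__main__':
    trainer = Trainer()
    # Pass a saved model directory as model_file_path to resume training.
    trainer.trainIters(n_iters=500000, model_file_path=None)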
Example #10
    def test_batch_sentences(self):
        batcher = Batcher(os.path.join(DATA_FIXTURES, 'vocab_test.txt'), 50)
        sentences = [['The', 'first', 'sentence'], ['Second', '.']]
        x_char_ids = batcher.batch_sentences(sentences)

        self.assertTrue((x_char_ids == self._expected_char_ids).all())
Example #11
    print('Success rate: %d / %d' % (success_rate, len(train_sentences)))


if __name__ == "__main__":

    config = json.load(open('config.json', 'r'))
    data_path = '/dev/shm/coco/'
    #data_path = 'coco/'
    train_dir = 'summaries/Caption_training' + datetime.datetime.strftime(
        datetime.datetime.today(), '%d%m%Y%H%M%S')

    vocab = Vocab('vocab')
    model = CaptioningNetwork(config, vocab)

    batcher = Batcher(data_path, config, vocab)

    tf.set_random_seed(111)

    # Setup training
    tf.logging.info('Building graph...')
    model.build_graph()

    # print(tf.GraphKeys.GLOBAL_VARIABLES)
    # print(tf.GraphKeys.TRAINABLE_VARIABLES)

    # Feed forward test
    # with sess:
    #     sess.run(...)
    #     output_shape = ...
    #     print('Feed forward OK! Output shape: %s' % str(output_shape))
Example #12
    def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only):
        # create the Batcher
        vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
        batcher = Batcher(vocab_file, 50)

        # load the model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(
            options_file, weight_file, max_batch_size=4)
        bilm_ops = model(character_ids)

        weighted_ops = []
        for k in range(2):
            ops = weight_layers(str(k), bilm_ops, l2_coef=l2_coef,
                                do_layer_norm=do_layer_norm,
                                use_top_only=use_top_only)
            weighted_ops.append(ops)

        # initialize
        self.sess.run(tf.global_variables_initializer())

        n_expected_trainable_weights = 2 * (1 + int(not use_top_only))
        self.assertEqual(len(tf.trainable_variables()),
                         n_expected_trainable_weights)
        # and one regularizer per weighted layer
        n_expected_reg_losses = 2 * int(not use_top_only)
        self.assertEqual(
            len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            n_expected_reg_losses,
        )

        # Set the variables.
        weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])],
                   [np.array([0.2, 0.4, 0.6]), np.array([0.88])]]
        for k in range(2):
            with tf.variable_scope('', reuse=True):
                if not use_top_only:
                    W = tf.get_variable('{}_ELMo_W'.format(k))
                    _ = self.sess.run([W.assign(weights[k][0])])
                gamma = tf.get_variable('{}_ELMo_gamma'.format(k))
                _ = self.sess.run([gamma.assign(weights[k][1])])

        # make some data
        sentences = [
            ['The', 'first', 'sentence', '.'],
            ['The', 'second'],
            ['Third']
        ]
        X_chars = batcher.batch_sentences(sentences)

        ops = model(character_ids)
        lm_embeddings, mask, weighted0, weighted1 = self.sess.run(
            [ops['lm_embeddings'], ops['mask'],
             weighted_ops[0]['weighted_op'], weighted_ops[1]['weighted_op']],
            feed_dict={character_ids: X_chars}
        )
        actual_elmo = [weighted0, weighted1]

        # check the mask first
        expected_mask = [[True, True, True, True],
                         [True, True, False, False],
                         [True, False, False, False]]
        self.assertTrue((expected_mask == mask).all())

        # Now compute the actual weighted layers
        for k in range(2):
            normed_weights = np.exp(weights[k][0] + 1.0 / 3) / np.sum(
                                  np.exp(weights[k][0] + 1.0 / 3))
            # masked layer normalization
            expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1]))
            if not use_top_only:
                for j in range(3):  # number of LM layers
                    if do_layer_norm:
                        mean = np.mean(lm_embeddings[:, j, :, :][mask])
                        std = np.std(lm_embeddings[:, j, :, :][mask])
                        normed_lm_embed = (lm_embeddings[:, j, :, :] - mean) / (
                            std + 1E-12)
                        expected_elmo += normed_weights[j] * normed_lm_embed
                    else:
                        expected_elmo += normed_weights[j] * lm_embeddings[
                                                                    :, j, :, :]
            else:
                expected_elmo += lm_embeddings[:, -1, :, :]

            # the scale parameter
            expected_elmo *= weights[k][1]
            self.assertTrue(
                np.allclose(expected_elmo, actual_elmo[k], atol=1e-6)
            )
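The expected values in this test are the standard ELMo scalar mix: softmax-normalized layer weights (the constant 1/3 added inside the exponent cancels in the softmax), a weighted sum over the optionally layer-normalized LM layers, scaled by gamma. A compact NumPy restatement of that expectation (a sketch, not the library's weight_layers implementation):

import numpy as np

def scalar_mix(lm_embeddings, mask, w, gamma, do_layer_norm=False):
    # lm_embeddings: (batch, n_layers, n_tokens, dim); mask: (batch, n_tokens) bool
    s = np.exp(w) / np.sum(np.exp(w))          # softmax over the layer weights
    mixed = np.zeros_like(lm_embeddings[:, 0, :, :])
    for j in range(lm_embeddings.shape[1]):
        layer = lm_embeddings[:, j, :, :]
        if do_layer_norm:
            # statistics over unmasked positions only, as in the test above
            mean, std = layer[mask].mean(), layer[mask].std()
            layer = (layer - mean) / (std + 1e-12)
        mixed += s[j] * layer
    return gamma * mixed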
Example #13
    def test_bilm(self):
        sentences, expected_lm_embeddings = _load_sentences_embeddings()

        # create the Batcher
        vocab_file = os.path.join(FIXTURES, 'vocab_test.txt')
        batcher = Batcher(vocab_file, 50)
        # load the model
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(options_file,
                                           weight_file,
                                           max_batch_size=4)

        # get the ops to compute embeddings
        ops = model(character_ids)

        # initialize
        self.sess.run(tf.global_variables_initializer())

        # We shouldn't have any trainable variables
        self.assertEqual(len(tf.trainable_variables()), 0)

        # will run 10 batches of 3 sentences
        for i in range(10):
            # make a batch of sentences
            batch_sentences = []
            for k in range(3):
                sentence = sentences[k][i].strip().split()
                batch_sentences.append(sentence)

            X = batcher.batch_sentences(batch_sentences)
            lm_embeddings, lengths = self.sess.run(
                [ops['lm_embeddings'], ops['lengths']],
                feed_dict={character_ids: X})
            # the true (unpadded) lengths of the sentences
            actual_lengths = [len(sent) for sent in batch_sentences]

            self.assertEqual(actual_lengths, list(lengths))

            # get the expected embeddings and compare!
            expected_y = [expected_lm_embeddings[k][i] for k in range(3)]
            for k in range(3):
                self.assertTrue(
                    np.allclose(lm_embeddings[k, 2, :lengths[k], :],
                                expected_y[k],
                                atol=1.0e-6))

        # Finally, check that the states are being updated properly.
        # All batches were size=3, so last element of states should always
        # be zero.
        third_states = []
        for direction in ['forward', 'backward']:
            states = self.sess.run(
                model._graphs[character_ids].lstm_init_states[direction])
            for i in range(2):
                for state in states[i]:
                    self.assertTrue(np.sum(np.abs(state[-1, :])) < 1e-7)
                    third_states.append(state[2, :])

        # Run a batch with size=2, the third state should not have been updated
        _ = self.sess.run(
            ops['lm_embeddings'],
            feed_dict={character_ids: np.ones((2, 5, 50), dtype=np.int32)})
        k = 0
        for direction in ['forward', 'backward']:
            states = self.sess.run(
                model._graphs[character_ids].lstm_init_states[direction])
            for i in range(2):
                for state in states[i]:
                    self.assertTrue(
                        np.allclose(third_states[k], state[2, :], atol=1e-6))
                    k += 1