Example #1
def test_session(batch_size=1, target_epoch='best', beam_decoding=False):
    """ Executes a quick test session on the SAE model by sampling a small quantity of items from the test set
    and using the model to first compress them into a meaningful representation and subsequently reconstruct them. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare the batch size, if left unspecified in test options
    if train_opt.batch_size is None:
        train_opt.batch_size = batch_size
    # Load test data
    test_data = load_pickle(test_pickle)
    # Build model graph
    autoencoder = SeqAE(vocab, test_opt,
                        'seq_ae' + '_{:s}'.format(train_opt.train_id))
    # Declare saver object for restoring learned model parameters
    test_saver = tf.train.Saver()

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize model interface containing inference methods
        interface = SeqAEInterface(autoencoder, vocab, test_sess, test_opt)
        # Sample candidate sentences from the test set
        samples = np.random.choice(test_data, test_opt.num_samples).tolist()
        while max([len(sample.split()) for sample in samples]) > 100:
            samples = np.random.choice(test_data,
                                       test_opt.num_samples).tolist()
        # Initialize a loader object to pre-process the sampled sentences
        sample_loader = DataServer(samples, vocab, test_opt)
        samples_read = 0

        print('Sampled sentences:')
        for i, s in enumerate(samples):
            print('{:d}: {:s}'.format(i, s))
        print('-' * 10 + '\n')

        if not beam_decoding:
            # Perform greedy encoding-decoding
            print('Greedy decoding:')
            for i, sample_data in enumerate(sample_loader):
                _, enc_input, dec_input = sample_data
                generated = interface.greedy_generation(enc_input, dec_input)
                for j in range(test_opt.batch_size):
                    print('Encoded: {:s}\nDecoded: {:s}\n'.format(
                        samples[samples_read + j], generated[j]))
                samples_read += test_opt.batch_size
        else:
            # Perform encoding-decoding with beam-search (limited use for reconstruction)
            assert (
                test_opt.batch_size == 1
            ), 'Beam search not defined for batches with more than one element.'
            print('Beam search decoding:')
            for i, sample_data in enumerate(sample_loader):
                _, enc_input, _ = sample_data
                print('Encoded: {:s}'.format(samples[i]))
                interface.beam_generation(enc_input, print_results=True)

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('Auto-encoder evaluation completed!')
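The load_model helper used throughout these examples is defined elsewhere in the repository. Below is a minimal sketch of what such a helper might look like, assuming checkpoints are written with tf.train.Saver under save_dir and that 'best' maps to the most recently written checkpoint (both assumptions, not taken from the source).

import os

import tensorflow as tf


def load_model(session, saver, save_dir, target_epoch):
    """ Hypothetical sketch of the checkpoint-restoring helper used above. """
    if target_epoch == 'best':
        # Assumption: the best model corresponds to the newest checkpoint in save_dir
        ckpt_path = tf.train.latest_checkpoint(save_dir)
    else:
        # Assumption: epoch-specific checkpoints follow a 'model-<epoch>' naming scheme
        ckpt_path = os.path.join(save_dir, 'model-{}'.format(target_epoch))
    saver.restore(session, ckpt_path)
    print('Restored model parameters from {}'.format(ckpt_path))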
Example #2
def train_session():
    """ Executes a training session on the SAE model. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()
    # Load data
    train_data = load_pickle(train_pickle)
    valid_data = load_pickle(valid_pickle)
    # Construct the model graph
    sent_sim_class = SentSimClassifier(vocab, train_opt, 'ssc')
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    # Declare OP for initializing model variables
    init_op = tf.global_variables_initializer()

    # During domain adaptation, restore learned SSC parameters with the exception of the embeddings,
    # which are extracted from the pre-trained IDGAN-internal LM
    restored_vars = [var for var in all_vars if 'embedding_table' not in var.name]
    pre_train_saver = tf.train.Saver(restored_vars)
    embeddings_ssc_keys = [var.name for var in all_vars if var not in restored_vars and 'optimization' not in var.name]
    embedding_lm_keys = list()
    # Handle scoping discrepancies between SSC and LM checkpoints, to make LM variables compatible with the SSC graph
    for k in embeddings_ssc_keys:
        k = k.replace('ssc', 'cog_lm')
        k = k.replace('encoder/embeddings', 'embeddings')
        k = k.replace('Adam', 'optimizer')
        k = k.split(':')[0]
        embedding_lm_keys.append(k)
    embeddings_dir = os.path.join(train_opt.root_dir, 'cognitive_language_model/src/checkpoints/')
    embeddings_epoch = 'best'
    # Map SSC embedding variables to LM embedding variables,
    # so that the former may be initialized with values extracted from the latter
    embeddings_dict = {embedding_lm_keys[i]: [v for v in tf.global_variables() if v.name == embeddings_ssc_keys[i]][0]
                       for i in range(len(embedding_lm_keys))}
    # Declare saver object for initializing the SSC's embedding table with embeddings learned by IDGAN's LM
    embeddings_saver = tf.train.Saver(embeddings_dict)
    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        if train_opt.pre_train:
            # Initialize variables
            train_sess.run(init_op)
        else:
            # Restore pre-trained model parameters for domain adaptation (sans embedding table)
            load_model(train_sess, pre_train_saver, os.path.join(train_opt.save_dir, 'pre_training'), 'best')
            # Restore embedding parameters from the specified LM checkpoint
            load_model(train_sess, embeddings_saver, embeddings_dir, embeddings_epoch)

        # Initialize SSC trainer
        trainer = SentSimClassTrainer(vocab, train_opt, sent_sim_class, train_sess, train_data, valid_data)
        # Train model (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_model()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info('Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
        int(elapsed // 3600), int((elapsed % 3600)) // 60, elapsed % 60))
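The embeddings_dict construction above maps checkpoint-internal LM variable names to SSC graph variables, so that a single tf.train.Saver can restore one under the name of the other. A stripped-down sketch of that name-mapping pattern follows; the scope, shape, and checkpoint path are illustrative, not the actual IDGAN ones.

import tensorflow as tf

# Variable living in the current graph under the classifier's scope
with tf.variable_scope('ssc/encoder/embeddings'):
    embedding_table = tf.get_variable('embedding_table', shape=[10000, 300])

# Keys are variable names as stored in the checkpoint, values are variables in this graph
mapped_saver = tf.train.Saver(
    {'cog_lm/embeddings/embedding_table': embedding_table})

with tf.Session() as sess:
    # Restores the LM's embedding matrix directly into the SSC's embedding variable
    mapped_saver.restore(sess, tf.train.latest_checkpoint('checkpoints/'))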
Example #3
def score_corpus():
    """ Executes a session during which the source corpus is annotated with sentence-wise model perplexity scores. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare path leading to corpus to be scored
    scored_path = os.path.join(data_dir, '{:s}.txt'.format(scored_name))
    ppx_scores = list()
    # Load data
    full_data = load_pickle(full_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    sort_saver = tf.train.Saver()
    # Time the duration of the scoring process
    starting_time = time.time()

    with tf.Session(config=config) as sort_sess:
        # Load learned model parameters
        load_model(sort_sess, sort_saver, test_opt.save_dir, 'best')
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, sort_sess, test_opt)
        # Run the scoring loop
        pos = 0
        with codecs.open(scored_path, 'w') as in_file:
            while pos < len(full_data) - 1:
                # Fill a single batch of sentences to be scored
                try:
                    batch = full_data[pos:pos + test_opt.batch_size]
                    pos += test_opt.batch_size
                except IndexError:
                    batch = full_data[pos:len(full_data) - 1]
                    pos = len(full_data) - 1
                # Get sentence-wise model perplexity scores
                batch_ppx = interface.get_sequence_perplexity(batch)
                # Write the scored sentences to file
                for i in range(len(batch)):
                    sentence_ppx = batch_ppx[i, :].tolist()[0]
                    scored_sent = '{:s}\t{:.4f}\n'.format(
                        batch[i], sentence_ppx)
                    in_file.write(scored_sent)
                    # Keep track of corpus-wide statistics
                    ppx_scores.append(sentence_ppx)

    # Archive corpus statistics
    with open(lm_notes, 'w') as notes_file:
        notes_file.write(
            '------------ Scored Corpus Statistics -------------\n')
        notes_file.write('Metric\tMean\tMedian\n')
        notes_file.write('Sentence Perplexity\t{:.4f}\t{:.4f}\n'.format(
            np.mean(ppx_scores), np.median(ppx_scores)))

    # Report scoring duration
    elapsed = time.time() - starting_time
    print('Scoring took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
        int(elapsed // 3600),
        int((elapsed % 3600)) // 60, elapsed % 60))
Example #4
def test_session(target_epoch='best'):
    """ Evaluates the accuracy of the learned SSC model by using it to predict the similarity score of
    sentence pairs contained within the specified test set. """
    # Clear the default graph
    tf.reset_default_graph()
    # Load data
    test_data = load_pickle(test_pickle)
    # Build model graph
    sent_sim_class = SentSimClassifier(vocab, test_opt, 'ssc')
    # Declare saver
    test_saver = tf.train.Saver()
    save_dir = train_opt.save_dir

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, save_dir, target_epoch)
        # Initialize model interface
        interface = SentSimClassInterface(sent_sim_class, vocab, test_sess, test_opt)
        # Initialize a loader object to pre-process and serve items drawn from the source corpus
        sample_loader = DataServer(test_data, vocab, test_opt)
        # Evaluate model's performance on a withheld test corpus to estimate its capacity for generalization beyond
        # seen data
        # Track prediction accuracy and the divergence of predicted similarity scores from target values
        total_error = 0.0
        total_differential = 0.0
        total_items = 0

        for i, test_batch in enumerate(sample_loader):
            # Obtain model predictions for the current test batch
            predictions, prediction_error = interface.infer_step(test_batch)
            total_error += np.sum(np.abs(prediction_error))
            try:
                for j in range(test_opt.batch_size):
                    cj = total_items + j
                    differential = np.abs(np.subtract(float(test_data[1][cj]), predictions[j][0]))
                    total_differential += differential
                    # Report model prediction and error
                    print('Sentence 1: {:s}\nSentence 2: {:s}\n'
                          'True score: {:.4f} | Model Prediction: {:.4f} | Differential: {:.4f}'
                          .format(test_data[0][cj][0], test_data[0][cj][1], float(test_data[1][cj]), predictions[j][0],
                                  differential))
                    print('-' * 10)
                total_items += test_opt.batch_size
            except IndexError:
                break
        # Report test corpus statistics
        print('Total model error: {:.4f} | Average model error: {:.4f} | Average prediction error: {:.4f}'.format(
            total_error, total_error / total_items, total_differential / total_items))

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('Sentence similarity classifier evaluation completed!')
Example #5
def test_to_file(batch_size=1, target_epoch='best', beam_decoding=True):
    """ Executes a comprehensive test session on the entire test corpus;
    output is written to file for the calculation of the achieved corpus-wide ID reduction and
    the BLEU score between source sentences and their ID-reduced translations. """
    def _reconstruct_input(input_array):
        """ Reconstructs input sentences from numpy arrays; used to derive an accurate representation of the
        pre-processed, encoded sequences. """
        # Convert input array to list of lists of word indices
        input_idx = [
            np.squeeze(array).tolist()
            for array in np.split(input_array, input_array.shape[0], axis=0)
        ]
        # Translate indices into corresponding word tokens; truncated after sentence-final <EOS>
        input_boundaries = [
            idx_list.index(vocab.eos_id)
            if vocab.eos_id in idx_list else len(idx_list)
            for idx_list in input_idx
        ]
        input_sentences = [[
            vocab.index_to_word[idx]
            for idx in input_idx[j][:input_boundaries[j]]
        ] for j in range(len(input_idx))]
        input_sentences = [
            ' '.join(word_list) + '.' for word_list in input_sentences
        ]
        return input_sentences

    assert (test_opt.batch_size == 1), \
        'Function is defined for a batch size of 1 due to the nature of beam search implementation.'

    # Clear the default graph
    tf.reset_default_graph()
    # Declare the batch size
    if train_opt.batch_size is None:
        train_opt.batch_size = batch_size
    # Load test data from the high-ID corpus
    source_test_data = load_pickle(pickle_paths[2])
    # Build model graph
    seq_gan = IDGAN(opts, vocab, 'IDGAN')
    # Declare saver object for restoring learned model parameters
    test_saver = tf.train.Saver()

    # Declare paths pointing to locations of output files (i.e. reference and translations sets for BLEU)
    encoded_path = os.path.join(
        test_opt.out_dir,
        'source_encoded_test_corpus_beam_{:s}.txt'.format(str(beam_decoding)))
    decoded_path = os.path.join(
        test_opt.out_dir,
        'source_decoded_test_corpus_beam_{:s}.txt'.format(str(beam_decoding)))

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize the model interface containing inference methods
        interface = IDGANInterface(seq_gan, vocab, test_sess, test_opt)
        # Initialize a loader object to pre-process the test corpus
        test_loader = DataServer(source_test_data, vocab, test_opt)

        with open(encoded_path, 'w') as enc_file:
            with open(decoded_path, 'w') as dec_file:
                if not beam_decoding:
                    # Perform greedy ID-reduction on the test corpus
                    print('Greedy decoding:')
                    for s_id, test_items in enumerate(test_loader):
                        enc_labels, enc_inputs, dec_inputs = test_items
                        generated = interface.greedy_generation(
                            enc_labels, enc_inputs, dec_inputs)
                        enc_file.write(
                            _reconstruct_input(enc_labels)[0] + '\n')
                        dec_file.write(generated[0] + '\n')
                        if s_id % 10 == 0 and s_id > 0:
                            print(
                                '{:d} sentences written to file.'.format(s_id))

                else:
                    # Perform ID-reduction with beam-search decoding on the test corpus
                    assert (
                        test_opt.batch_size == 1
                    ), 'Beam search not defined for batches with more than one element.'
                    print('Beam search decoding:')
                    for s_id, test_items in enumerate(test_loader):
                        enc_labels, enc_input, _ = test_items
                        generated = interface.beam_generation(
                            enc_labels, enc_input, print_results=False)
                        # Write best beam result only
                        enc_file.write(
                            _reconstruct_input(enc_labels)[0] + '\n')
                        dec_file.write(generated[0][0] + '\n')
                        if s_id % 10 == 0 and s_id > 0:
                            print(
                                '{:d} sentences written to file.'.format(s_id))

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('IDGAN documented evaluation completed!')
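As the docstring notes, the two files written here serve as the reference and hypothesis sets for BLEU. One way to compute that score afterwards is sketched below with NLTK; this is not part of the code above, and the file names merely follow the naming scheme used in test_to_file.

from nltk.translate.bleu_score import corpus_bleu

with open('source_encoded_test_corpus_beam_True.txt') as ref_file, \
        open('source_decoded_test_corpus_beam_True.txt') as hyp_file:
    # corpus_bleu expects one list of references (each a token list) per hypothesis
    references = [[line.split()] for line in ref_file]
    hypotheses = [line.split() for line in hyp_file]

print('Corpus BLEU: {:.4f}'.format(corpus_bleu(references, hypotheses)))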
Example #6
def test_session(batch_size=1, target_epoch='best', beam_decoding=False):
    """ Executes a quick test session on the IDGAN system by sampling a small quantity of items from the test set
    and using the model to first compress them into a meaningful representation and subsequently reconstruct them;
    the evaluation process focuses exclusively on the translator SAE. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare the batch size, if left unspecified in test options
    if train_opt.batch_size is None:
        train_opt.batch_size = batch_size
    # Load data
    source_test_data = load_pickle(pickle_paths[2])
    # Build system graph
    seq_gan = IDGAN(opts, vocab, 'IDGAN')
    # Declare saver object for restoring learned IDGAN parameters
    test_saver = tf.train.Saver()

    # Initiate testing session
    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize system interface containing inference methods
        interface = IDGANInterface(seq_gan, vocab, test_sess, test_opt)
        # Sample candidate sentences from the test set
        samples = np.random.choice(source_test_data,
                                   test_opt.num_samples).tolist()
        while max([len(sample.split()) for sample in samples]) > 10:
            samples = np.random.choice(source_test_data,
                                       test_opt.num_samples).tolist()
        # Initialize a loader object to pre-process the sampled sentences
        sample_loader = DataServer(samples, vocab, test_opt)
        samples_read = 0

        print('Sampled sentences:')
        for s_id, s in enumerate(samples):
            print('{:d}: {:s}'.format(s_id, s))
        print('-' * 10 + '\n')

        if not beam_decoding:
            # Perform greedy ID-reduction on the sampled sentences
            print('Greedy decoding:')
            for _, sample_data in enumerate(sample_loader):
                enc_labels, enc_inputs, dec_inputs = sample_data
                generated = interface.greedy_generation(
                    enc_labels, enc_inputs, dec_inputs)
                for j in range(test_opt.batch_size):
                    print('Encoded: {:s}\nDecoded: {:s}\n'.format(
                        samples[samples_read + j], generated[j]))
                samples_read += test_opt.batch_size
        else:
            # Perform ID-reduction with beam-search decoding on the sampled sentences
            assert (
                test_opt.batch_size == 1
            ), 'Beam search not defined for batches with more than one element.'
            print('Beam search decoding:')
            for i, sample_data in enumerate(sample_loader):
                enc_labels, enc_input, _ = sample_data
                print('Encoded: {:s}'.format(samples[i]))
                interface.beam_generation(enc_labels,
                                          enc_input,
                                          print_results=True)

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('IDGAN evaluation completed!')
Example #7
def train_session():
    """ Executes a training session on the IDGAN system. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()

    # Load data
    source_train_data = load_pickle(pickle_paths[0])
    source_valid_data = load_pickle(pickle_paths[1])
    target_train_data = load_pickle(pickle_paths[3])
    target_valid_data = load_pickle(pickle_paths[4])

    # Construct the system graph (component-specific graphs are constructed within the IDGAN graph)
    seq_gan = IDGAN(opts, vocab, 'IDGAN')

    # Initialize IDGAN's component models with pre-trained parameters
    # Declare paths pointing to checkpoints containing desired parameter values
    component_ckpt_dir = os.path.join(train_opt.local_dir,
                                      'checkpoints/components')
    lm_dir = os.path.join(component_ckpt_dir, 'lm')
    source_encoder_dir = os.path.join(component_ckpt_dir, 'source')
    if train_opt.cross_dec:
        source_decoder_dir = os.path.join(component_ckpt_dir, 'source_decoder')
    else:
        source_decoder_dir = os.path.join(component_ckpt_dir,
                                          'source')  # NO crossing
    target_dir = os.path.join(component_ckpt_dir, 'target')
    chosen_epoch = 'best'

    # Isolate parameters to be loaded into the IDGAN system
    # Excludes optimization variables as well as variables connected to the training of 'frozen' IDGAN components
    # Get lists of variables contained within component checkpoint files
    lm_vars_plus_optimization = get_ckpt_vars(lm_dir)
    source_encoder_vars_plus_optimization = get_ckpt_vars(source_encoder_dir)
    source_decoder_vars_plus_optimization = get_ckpt_vars(source_decoder_dir)
    target_vars_plus_optimization = get_ckpt_vars(target_dir)
    # Exclude training-specific variables from IDGAN initialization
    lm_vars = [
        var_name for var_name in lm_vars_plus_optimization
        if 'optimization' not in var_name
    ]
    # To enable the 'crossed decoder' training condition, separate the encoder and decoder variables
    # of the SAE pre-trained on the high-ID corpus ('translator SAE' within IDGAN)
    source_encoder_vars = \
        [var_name for var_name in source_encoder_vars_plus_optimization if 'optimization' not in var_name]
    source_encoder_vars = [
        var_name for var_name in source_encoder_vars if 'encoder' in var_name
    ]
    source_decoder_vars = \
        [var_name for var_name in source_decoder_vars_plus_optimization if 'optimization' not in var_name]
    source_decoder_vars = [
        var_name for var_name in source_decoder_vars if 'decoder' in var_name
    ]
    target_vars = [
        var_name for var_name in target_vars_plus_optimization
        if 'optimization' not in var_name
    ]
    # Obtain list of all variables which have to be initialized (either randomly or from pre-trained values)
    # within the IDGAN system
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

    # Check for matches between variables found within the pre-trained checkpoints and IDGAN variables
    lm_parameters = [
        var for var in all_vars if var.name.split(':')[0] in lm_vars
    ]
    source_encoder_parameters = [
        var for var in all_vars
        if var.name.split(':')[0] in source_encoder_vars
    ]
    source_decoder_parameters = [
        var for var in all_vars
        if var.name.split(':')[0] in source_decoder_vars
    ]
    target_parameters = [
        var for var in all_vars if var.name.split(':')[0] in target_vars
    ]
    # Load matching variables from corresponding checkpoints
    loaded_parameters = lm_parameters + source_encoder_parameters + source_decoder_parameters + target_parameters
    # Rest is initialized randomly
    initialized_parameters = [
        var for var in all_vars if var not in loaded_parameters
    ]

    # Initialize saver objects tasked with loading in the pre-trained parameters
    lm_saver = tf.train.Saver(lm_parameters)
    source_encoder_saver = tf.train.Saver(source_encoder_parameters)
    source_decoder_saver = tf.train.Saver(source_decoder_parameters)
    target_saver = tf.train.Saver(target_parameters)
    # Declare random initialization OP
    init_op = tf.variables_initializer(initialized_parameters)

    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        # Load pre-trained parameters into the IDGAN graph
        load_model(train_sess, lm_saver, lm_dir, chosen_epoch)
        load_model(train_sess, source_encoder_saver, source_encoder_dir,
                   chosen_epoch)
        load_model(train_sess, source_decoder_saver, source_decoder_dir,
                   chosen_epoch)
        load_model(train_sess, target_saver, target_dir, chosen_epoch)
        # Initialize the rest
        train_sess.run(init_op)

        # Initialize IDGAN interface and trainer, used for inference and training steps, respectively
        interface = IDGANInterface(seq_gan, vocab, train_sess, test_opt)
        trainer = IDGANTrainer(vocab,
                               train_opt,
                               seq_gan,
                               train_sess,
                               source_train_data,
                               source_valid_data,
                               target_train_data,
                               target_valid_data,
                               test_opt,
                               interface,
                               verbose=True)
        # Train system (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_gan()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info(
        'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
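The get_ckpt_vars helper used above to list the variables stored in each component checkpoint is not shown in these examples. A minimal sketch of one possible implementation, assuming each directory contains standard TensorFlow checkpoint files:

import tensorflow as tf


def get_ckpt_vars(ckpt_dir):
    """ Hypothetical sketch: returns the names of all variables stored in the
    newest checkpoint found in ckpt_dir (shapes are discarded). """
    ckpt_path = tf.train.latest_checkpoint(ckpt_dir)
    reader = tf.train.NewCheckpointReader(ckpt_path)
    return list(reader.get_variable_to_shape_map().keys())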
Example #8
def test_session(target_epoch='best',
                 calculate_er=False,
                 generate=True,
                 gen_cycles=1):
    """ Executes a quick test session on the language model by sampling a small quantity of items from the test set
    and scoring them along various metrics. """
    # Tests are defined for a batch size of 1
    assert (
        test_opt.batch_size == 1), 'Model tests require batch size to equal 1.'
    # Clear the default graph
    tf.reset_default_graph()
    # Load data
    test_data = load_pickle(test_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    test_saver = tf.train.Saver()

    with tf.Session(config=config) as test_sess:
        # Load learned model parameters
        load_model(test_sess, test_saver, test_opt.save_dir, target_epoch)
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, test_sess, test_opt)
        # Sample sentences to be forwarded to the model
        samples = np.random.choice(test_data, test_opt.num_samples).tolist()

        print('Sampled sentences:')
        for i, s in enumerate(samples):
            print('{:d}: {:s}'.format(i, s))
        print('-' * 10 + '\n')

        # Get sentence probabilities
        print('Probabilities:')
        for i, s in enumerate(samples):
            total, prob_array, _ = interface.get_probability(s)
            # Mask <EOS> and <PAD> tag values
            cut_off = len(s.split())
            print('{:d}: {:s} | Total probability: {:.10f}'.format(
                i, s, total[0][0]))
            print('Per-word probabilities:')
            print('\t'.join(s.split()))
            print('\t'.join(
                ['{:.4}'.format(score) for score in prob_array[0][:cut_off]]))
        print('-' * 10 + '\n')

        # Get sentence log-probabilities
        print('Log-probabilities:')
        for i, s in enumerate(samples):
            total, prob_array, _ = interface.get_log_probability(s)
            cut_off = len(s.split())
            print('{:d}: {:s} | Total log-probability: {:.4f}'.format(
                i, s, total[0][0]))
            print('Per-word log-probabilities:')
            print('\t'.join(s.split()))
            print('\t'.join(['{:.4}'.format(score)
                             for score in prob_array[0]][:cut_off]))
        print('-' * 10 + '\n')

        # Get surprisal
        print('Surprisal and UID:')
        for i, s in enumerate(samples):
            total_s, s_array, norm_s, total_ud, ud_array, norm_ud = interface.get_surprisal(
                s)
            cut_off = len(s.split())
            tabbed_sent = '\t'.join(s.split())
            print(
                '{:d}: {:s} | Total surprisal: {: .4f} | Normalized surprisal: {: .4f}'
                .format(i, s, total_s[0][0], norm_s[0][0]))
            print('Per-word surprisal:')
            print(tabbed_sent)
            print('\t'.join(
                ['{: .4}'.format(score) for score in s_array[0][:cut_off]]))
            print(
                '{:d}: {:s} | Absolute UID: {:.4f} | Normalized UID: {: .4f}'.
                format(i, s, total_ud[0][0], norm_ud[0][0]))
            print('Per-word UID:')
            print(tabbed_sent)
            print('\t'.join(
                ['{: .4}'.format(score) for score in ud_array[0][:cut_off]]))
        print('-' * 10 + '\n')

        # Get approximate entropy reduction (computationally expensive!)
        if calculate_er:
            print('Approximate entropy reduction:')
            for i, s in enumerate(samples):
                total, array, norm = interface.get_entropy_reduction(s)
                cut_off = len(s.split())
                print(
                    '{:d}: {:s} | Total ER: {: .4f} | Normalized ER: {: .4f}'.
                    format(i, s, total[0][0], norm[0][0]))
                print('Per-word ER:')
                print('\t'.join(s.split()))
                print('\t'.join(
                    ['{: .4}'.format(score) for score in array[0][:cut_off]]))
            print('-' * 10 + '\n')

            # Get cognitive load score (weighted sum of normalized surprisal and entropy reduction scores)
            print('Combined cognitive load:')
            for i, s in enumerate(samples):
                total, array, norm = interface.get_cognitive_load(s)
                cut_off = len(s.split())
                print(
                    '{:d}: {:s} | Total CL: {: .4f} | Normalized CL: {: .4f}'.
                    format(i, s, total[0][0], norm[0][0]))
                print('Per-word CL:')
                print('\t'.join(s.split()))
                print('\t'.join(
                    ['{: .4}'.format(score) for score in array[0][:cut_off]]))
            print('-' * 10 + '\n')

        # Get model perplexity for the entire test set
        print('Model perplexity: {: .4f}'.format(
            interface.get_model_perplexity(test_data)[0][0]))
        print('-' * 10 + '\n')

        # Evaluate generative capability of the trained LM
        if generate:
            # Generate greedily from scratch
            print('Sentences generated from scratch:')
            for _ in range(gen_cycles):
                interface.generate(prefix=None, print_results=True)
            print('-' * 10 + '\n')

            # Generate greedily from some sentence prefix (i.e. a sentence completion test)
            print('Sentences generated from prefix:')
            for i, s in enumerate(samples):
                sent_list = s.split(' ')
                # Generate a sentence prefix of random length (at most 1/2 of the source sentence);
                # max() guards against very short sentences, for which randint(1, len // 2) would be ill-defined
                cut_off = np.random.randint(1, max(2, len(sent_list) // 2))
                prefix = ' '.join(sent_list[:cut_off])
                print('Prefix: {:s} | Source: {:s}'.format(prefix, s))
                generated = interface.generate(prefix=prefix,
                                               print_results=False)
                for tpl in generated:
                    print('{:s} | Probability: {:.10f}'.format(tpl[0], tpl[1]))
                print('\n')

    print('-' * 10 + '\n')
    print('=' * 10 + '\n')
    print('Language model evaluation completed!')
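For orientation, per-word surprisal is simply the negative log-probability of a word given its left context, and UID-style measures quantify how evenly that surprisal is spread over the sentence. A toy sketch of these relationships under assumed conventions (the interface's actual definitions may differ):

import numpy as np


def surprisal_stats(log_probs):
    """ log_probs: per-word log-probabilities of one sentence (natural log assumed). """
    per_word = [-lp for lp in log_probs]
    total = sum(per_word)
    normalized = total / len(per_word)  # length-normalized surprisal
    # One common uniformity measure: variance of per-word surprisal around its mean
    uid_divergence = float(np.var(per_word))
    return per_word, total, normalized, uid_divergence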
Example #9
def annotate_corpus():
    """ Executes a session during which the shrunk source corpus is annotated with ID-relevant measures. """
    # Clear the default graph
    tf.reset_default_graph()
    # Declare path leading to corpus to be annotated
    annotated_path = os.path.join(
        data_dir, '{:s}_annotated.txt'.format(train_name.split('_')[0]))
    # Values assigned per sentence are tracked for subsequent computation of corpus-wide statistics
    corpus_stats = {
        'Total_surprisal': list(),
        'Per_word_surprisal': list(),
        'Normalized_surprisal': list(),
        'Total_UID_divergence': list(),
        'Per_word_UID_divergence': list(),
        'Normalized_UID_divergence': list()
    }

    # Load data
    full_data = load_pickle(full_pickle)
    # Build model graph
    cog_lm = CogLM(vocab, test_opt, 'cog_lm')
    # Declare saver object for restoring learned model parameters
    annotate_saver = tf.train.Saver()
    # Time annotation duration
    starting_time = time.time()

    with tf.Session(config=config) as annotate_sess:
        # Load learned model parameters
        load_model(annotate_sess, annotate_saver, test_opt.save_dir, 'best')
        # Initialize LM interface
        interface = CogLMInterface(cog_lm, vocab, annotate_sess, test_opt)
        # Run the annotation loop
        pos = 0
        with codecs.open(annotated_path, 'w') as in_file:
            while pos < len(full_data) - 1:
                # Fill a single batch of sentences to be annotated
                try:
                    batch = full_data[pos:pos + test_opt.batch_size]
                    pos += test_opt.batch_size
                except IndexError:
                    batch = full_data[pos:len(full_data) - 1]
                    pos = len(full_data) - 1
                # Obtain ID-values via LM's interface
                total_surp, per_word_surp, norm_surp, total_uiddiv, per_word_uiddiv, norm_uiddiv = \
                    interface.get_surprisal(batch)
                # Write annotated sentences to file
                for i in range(len(batch)):
                    # For per-word annotations, exclude values associated with <EOS> and <PAD> tags
                    cut_off = len(batch[i].split())
                    item_ts = total_surp[i, :].tolist()[0]
                    # Surprisal
                    item_pws_floats = per_word_surp[i, :].tolist()[:cut_off]
                    item_pws = ';'.join(
                        ['{:.4f}'.format(pws) for pws in item_pws_floats])
                    item_ns = norm_surp[i, :].tolist()[0]
                    item_tu = total_uiddiv[i, :].tolist()[0]
                    # UID divergence
                    item_pwu_floats = per_word_uiddiv[i, :].tolist()[:cut_off]
                    item_pwu = ';'.join(
                        ['{:.4f}'.format(pwu) for pwu in item_pwu_floats])
                    item_nu = norm_uiddiv[i, :].tolist()[0]
                    # Construct annotated sample
                    scored_sent = '{:s}\t{:.4f}\t{:s}\t{:.4f}\t{:.4f}\t{:s}\t{:.4f}\n'. \
                        format(batch[i], item_ts, item_pws, item_ns, item_tu, item_pwu, item_nu)
                    # Write to file
                    in_file.write(scored_sent)
                    # Update corpus stats dictionary
                    corpus_stats['Total_surprisal'].append(item_ts)
                    corpus_stats['Per_word_surprisal'].extend(item_pws_floats)
                    corpus_stats['Normalized_surprisal'].append(item_ns)
                    corpus_stats['Total_UID_divergence'].append(item_tu)
                    corpus_stats['Per_word_UID_divergence'].extend(
                        item_pwu_floats)
                    corpus_stats['Normalized_UID_divergence'].append(item_nu)

    # Archive corpus statistics
    with open(lm_notes, 'a') as notes_file:
        notes_file.write('\n')
        notes_file.write(
            '------------ Annotated Corpus Statistics -------------\n')
        notes_file.write('Metric\tMean\tMedian\tLowest\tHighest\n')
        for k, v in corpus_stats.items():
            notes_file.write('{:s}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n'.format(
                k, np.mean(v), np.median(v), np.min(v), np.max(v)))

    # Report scoring duration
    elapsed = time.time() - starting_time
    print(
        'Annotation took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
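The annotated corpus written above is plain tab-separated text with one sentence per line. A small reader for it is sketched below, assuming the column order defined by the format string in annotate_corpus(); the helper name is hypothetical.

import codecs


def read_annotated_corpus(annotated_path):
    """ Hypothetical reader for the annotated corpus file produced above. """
    records = list()
    with codecs.open(annotated_path, 'r', encoding='utf-8') as annotated_file:
        for line in annotated_file:
            sent, ts, pws, ns, tu, pwu, nu = line.rstrip('\n').split('\t')
            records.append({
                'sentence': sent,
                'total_surprisal': float(ts),
                'per_word_surprisal': [float(x) for x in pws.split(';')],
                'normalized_surprisal': float(ns),
                'total_uid_divergence': float(tu),
                'per_word_uid_divergence': [float(x) for x in pwu.split(';')],
                'normalized_uid_divergence': float(nu)
            })
    return records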
Example #10
def train_session():
    """ Executes a training session on the SAE model. """
    # Clear the default graph within which the model graph is constructed
    tf.reset_default_graph()
    # Load data
    train_data = load_pickle(train_pickle)
    valid_data = load_pickle(valid_pickle)
    # Construct the model graph
    sae_name = 'seq_ae' + '_{:s}'.format(train_opt.train_id)
    autoencoder = SeqAE(vocab, train_opt, sae_name)
    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    all_init_op = tf.global_variables_initializer()

    # Extract pre-trained word embeddings from the IDGAN-internal LM and use them to initialize the SAE
    initialized_vars = [
        var for var in all_vars
        if 'embedding_table' not in var.name or 'optimization' in var.name
    ]
    embeddings_sae_keys = [
        var.name for var in all_vars if var not in initialized_vars
    ]
    embedding_lm_keys = list()
    # Handle scoping discrepancies between SAE graph and LM checkpoints,
    # to make LM variables compatible with the instantiated graph
    for k in embeddings_sae_keys:
        k = k.replace(sae_name, 'cog_lm')
        k = k.split(':')[0]
        embedding_lm_keys.append(k)
    embeddings_dir = os.path.join(train_opt.root_dir,
                                  'cognitive_language_model/src/checkpoints/')
    embeddings_epoch = 'best'
    # Map SAE embedding variables to LM embedding variables,
    # so that the former may be initialized with values extracted from the latter
    embeddings_dict = {
        embedding_lm_keys[i]:
        [v for v in tf.global_variables()
         if v.name == embeddings_sae_keys[i]][0]
        for i in range(len(embedding_lm_keys))
    }
    # Declare saver object for initializing SAE's embedding table with embeddings learned by IDGAN's LM
    embeddings_saver = tf.train.Saver(embeddings_dict)
    # Declare OP for initializing other SAE parameters randomly
    no_embeds_init_op = tf.variables_initializer(initialized_vars)
    # Time training duration
    starting_time = time.time()

    with tf.Session(config=config) as train_sess:
        # Initialize variables
        if train_opt.is_local:
            # No pre-trained embeddings are loaded for experiments on the toy set
            train_sess.run(all_init_op)
        else:
            load_model(train_sess, embeddings_saver, embeddings_dir,
                       embeddings_epoch)
            train_sess.run(no_embeds_init_op)

        # Initialize SAE interface and trainer, used for inference and training steps, respectively
        interface = SeqAEInterface(autoencoder, vocab, train_sess, test_opt)
        trainer = SeqAETrainer(vocab, train_opt, autoencoder, train_sess,
                               train_data, valid_data, test_opt, interface)
        # Train model (either for a predefined number of epochs or until early stopping)
        print('+++TRAINING+++')
        trainer.train_model()

    # Report training duration
    elapsed = time.time() - starting_time
    logging.info(
        'Training took {:d} hours, {:d} minutes, and {:.4f} seconds.'.format(
            int(elapsed // 3600),
            int((elapsed % 3600)) // 60, elapsed % 60))
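When restores from pre-trained checkpoints are mixed with partial random initialization, as in the two training sessions above, a quick sanity check can confirm that no variable was left out. A small sketch, intended to run inside the session right after the restore and init steps (not part of the original code):

# Names of any variables that are still uninitialized after restoring/initializing
uninitialized = train_sess.run(tf.report_uninitialized_variables())
if uninitialized.size > 0:
    print('Warning: uninitialized variables remain: {}'.format(uninitialized))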