Example #1
def main():
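    # Load a trained dialogue model, draw samples for every context listed in
    # args.context, and write the tab-separated samples to args.output.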
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state['compute_training_updates'] = False

    model = DialogEncoderDecoder(state)

    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)
    if args.diverse_beam_search:
        sampler = search.DiverseBeamSampler(model, args.gamma)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    contexts = [[]]
    lines = open(args.context, "r").readlines()
    if len(lines):
        contexts = [x.strip() for x in lines]

    print('Sampling started...')
    context_samples, context_costs = sampler.sample(contexts,
                                                    n_samples=args.n_samples,
                                                    n_turns=args.n_turns,
                                                    ignore_unk=args.ignore_unk,
                                                    verbose=args.verbose,
                                                    return_words=True)
    print('Sampling finished.')
    print('Saving to file...')

    # Write to output file
    print type(context_samples)
    print type(context_samples[0])
    print context_samples[0]
    output_handle = open(args.output, "w")
    for context_sample in context_samples:
        print >> output_handle, '\t'.join(context_sample)
    output_handle.close()
    print('Saving to file finished.')
    print('All done!')
Example #2
def main():
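    # Interactive chat loop: load an Ubuntu HRED model and repeatedly sample a
    # reply to the user's input (previous turns are discarded, so the bot is
    # effectively memoryless).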
    args = parse_args()
    state = prototype_ubuntu_HRED()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    state['dictionary'] = "/home/ml/rlowe1/UbuntuData/Dataset.dict.pkl"

    logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    model = DialogEncoderDecoder(state) 
    
    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")
  
    # Start chat loop
    utterances = collections.deque()
    
    while True:
       var = raw_input("User - ")

       # Drop all previous utterances so the model has no memory of earlier turns.
       # Keeping some of them here would give the bot conversational context.
       while len(utterances) > 0:
           utterances.popleft()
         
       current_utterance = [ model.end_sym_utterance ] + ['<first_speaker>'] + var.split() + [ model.end_sym_utterance ]
       utterances.append(current_utterance)
         
       #TODO Sample a random reply. To spice it up, we could pick the longest reply or the reply with the fewest placeholders...
       seqs = list(itertools.chain(*utterances))

       #TODO Retrieve only replies which are generated for second speaker...
       sentences = sample(model, \
            seqs=[seqs], ignore_unk=args.ignore_unk, \
            sampler=sampler, n_samples=5)

       if len(sentences) == 0:
           raise ValueError("Generation error, no sentences were produced!")

       utterances.append(sentences[0][0].split())

       reply = sentences[0][0].encode('utf-8')
       print "AI - ", remove_speaker_tokens(reply)
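
# Note: remove_speaker_tokens is used above but not shown here. A minimal sketch
# of such a helper, assuming replies carry tags like <first_speaker> or <at>
# (illustrative only, not the project's actual implementation):
def remove_speaker_tokens(reply):
    # Strip speaker/placeholder tags and collapse any leftover whitespace.
    for tag in ['<first_speaker>', '<second_speaker>', '<third_speaker>', '<at>']:
        reply = reply.replace(tag, '')
    return ' '.join(reply.split())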
Example #3
def main():
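    # Interactive demo: load a trained model and chat turn by turn, conditioning
    # the sampler on the last four utterances of the running context.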
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"
    timing_path = args.model_prefix + "_timing.npz"

    with open(state_path, 'r') as src:
        state.update(cPickle.load(src))
    with open(timing_path, 'r') as src:
        timings = dict(numpy.load(src))

    state['compute_training_updates'] = False

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    print "\nLoaded previous state, model, timings:"
    print "state:"
    print state
    print "timings:"
    print timings

    print "\nBuilding model..."
    model = DialogEncoderDecoder(state)

    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")
    print "built.\n"

    context = []
    while True:
        line = raw_input("user: ")
        context.append("<first_speaker> <at> " + line + " </s> ")
        print "context: ", [' '.join(context[-4:])]
        context_samples, context_costs = sampler.sample(
            [' '.join(context[-4:])],
            ignore_unk=args.ignore_unk,
            verbose=args.verbose,
            return_words=True)

        print "bot:", context_samples
        context.append(context_samples[0][0] + " </s> ")
        print "cost:", context_costs
Example #4
def main():
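    # Batch sampling: read tab-separated contexts from args.context, sample
    # responses, and write them tab-separated to args.output.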
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    model = DialogEncoderDecoder(state)

    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    contexts = [[]]
    lines = open(args.context, "r").readlines()
    if len(lines):
        contexts = [x.strip().split('\t') for x in lines]

    context_samples, context_costs = sampler.sample(contexts,
                                                    n_samples=args.n_samples,
                                                    n_turns=args.n_turns,
                                                    ignore_unk=args.ignore_unk,
                                                    verbose=args.verbose)

    # Write to output file
    output_handle = open(args.output, "w")
    for context_sample in context_samples:
        print >> output_handle, '\t'.join(context_sample)
    output_handle.close()
Example #5
def main(args):
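    # Training loop for a SessionEncoderDecoder: compile the train/eval
    # functions, iterate over batches, sample periodically, validate every
    # 'valid_freq' steps, and stop early when patience runs out.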
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state = eval(args.prototype)()
    timings = init_timings()

    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)
        state_file = args.resume + '_state.pkl'
        timings_file = args.resume + '_timing.npz'
        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")
            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)
        else:
            raise Exception("Cannot resume, cannot find files!")

    logger.info("State:\n{}".format(pprint.pformat(state)))
    logger.info("Timings:\n{}".format(pprint.pformat(timings)))

    model = SessionEncoderDecoder(state)
    rng = model.rng

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.info("Loading previous model")
            load(model, filename)
        else:
            raise Exception("Cannot resume, cannot find model file!")
    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.info("Compile trainer")
    train_batch = model.build_train_function()

    logger.info("Visualizing")
    pydotprint(train_batch, 'visualize.png')

    logger.info("Compile eval")
    eval_batch = model.build_eval_function()
    random_sampler = search.RandomSampler(model)

    logger.info("Load data")
    train_data, valid_data = get_batch_iterator(rng, state)
    train_data.start()

    # Start looping through the dataset
    step = 0
    patience = state['patience']
    start_time = time.time()
    train_cost = 0
    train_done = 0
    ex_done = 0
    while step < state['loop_iters'] and patience >= 0:
        # Sample stuff
        if step % 200 == 0:
            for param in model.params:
                print "%s = %.4f" % (param.name, numpy.sum(param.get_value()**2)**0.5)
            samples, costs = random_sampler.sample([[]],
                                                   n_samples=1,
                                                   n_turns=3)
            print "Sampled : {}".format(samples[0])

        # Training phase
        batch = train_data.next()
        # Training iterator exhausted
        if not batch:
            logger.debug("Got None...")
            break
        c = train_batch(batch['x'], batch['y'], batch['max_length'],
                        batch['x_mask'])
        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            continue

        train_cost += c
        train_done += batch['num_preds']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time
            h, m, s = ConvertTimedelta(this_time - start_time)
            print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f" % (h, m, s,\
                                                                             state['time_stop'] - (time.time() - start_time)/60.,\
                                                                             step, \
                                                                             batch['x'].shape[1], \
                                                                             batch['max_length'], \
                                                                             float(train_cost/train_done))
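        # Periodic validation: run over the whole validation set, track the best
        # cost so far, and decrease patience when the cost stops improving.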
        if valid_data is not None and\
            step % state['valid_freq'] == 0 and step > 1:
            valid_data.start()
            valid_cost = 0
            valid_done = 0
            logger.debug("[VALIDATION START]")
            while True:
                batch = valid_data.next()
                # Validation data exhausted
                if not batch:
                    break
                c = eval_batch(batch['x'], batch['y'], batch['max_length'],
                               batch['x_mask'])
                if numpy.isinf(c) or numpy.isnan(c):
                    continue
                valid_cost += c
                valid_done += batch['num_preds']
            logger.debug("[VALIDATION END]")
            valid_cost /= valid_done
            if len(timings["valid"]) == 0 or valid_cost < numpy.min(
                    numpy.array(timings["valid"])):
                patience = state['patience']
                # Saving model if decrease in validation cost
                save(model, timings)
            elif valid_cost >= timings["valid"][-1] * state['cost_threshold']:
                patience -= 1

            print "** validation error = %.4f, patience = %d" % (
                float(valid_cost), patience)
            timings["train"].append(train_cost / train_done)
            timings["valid"].append(valid_cost)

            # Reset train cost and train done
            train_cost = 0
            train_done = 0
        step += 1
    logger.debug("All done, exiting...")
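
# ConvertTimedelta is used above but not defined here. A minimal sketch under the
# assumption that it splits an elapsed number of seconds into (hours, minutes,
# seconds) for the progress printout:
def ConvertTimedelta(td_seconds):
    secs = int(td_seconds)
    return secs // 3600, (secs % 3600) // 60, secs % 60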
Example #6
def main(args):
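    # Training loop for a (V)HRED-style DialogEncoderDecoder with optional latent
    # Gaussian / piecewise variables: supports automatic restarts, NCE training,
    # periodic sampling, detailed parameter inspection, and validation-based
    # early stopping.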
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state = eval(args.prototype)()
    timings = init_timings()

    auto_restarting = False
    if args.auto_restart:
        assert not args.save_every_valid_iteration
        assert len(args.resume) == 0

        directory = state['save_dir']
        if not directory[-1] == '/':
            directory = directory + '/'

        auto_resume_postfix = state['prefix'] + '_auto_model.npz'

        if os.path.exists(directory):
            directory_files = [
                f for f in listdir(directory) if isfile(join(directory, f))
            ]
            resume_filename = ''
            for f in directory_files:
                if len(f) > len(auto_resume_postfix):
                    if f[len(f) - len(auto_resume_postfix
                                      ):len(f)] == auto_resume_postfix:
                        if len(resume_filename) > 0:
                            print('ERROR: FOUND MULTIPLE MODELS IN DIRECTORY:',
                                  directory)
                            assert False
                        else:
                            resume_filename = directory + f[
                                0:len(f) - len('__auto_model.npz')]

            if len(resume_filename) > 0:
                logger.debug("Found model to automatically resume: %s" %
                             resume_filename)
                auto_restarting = True
                # Setup training to automatically resume training with the model found
                args.resume = resume_filename + '__auto'
                # Disable training from reinitialization any parameters
                args.reinitialize_decoder_parameters = False
                args.reinitialize_latent_variable_parameters = False
            else:
                logger.debug(
                    "Could not find any model to automatically resume...")

    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)

        state_file = args.resume + '_state.pkl'
        timings_file = args.resume + '_timing.npz'

        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")

            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)

            # Increment seed to make sure we get newly shuffled batches when training on large datasets
            state['seed'] = state['seed'] + 10

        else:
            raise Exception("Cannot resume, cannot find files!")

    logger.debug("State:\n{}".format(pprint.pformat(state)))
    logger.debug("Timings:\n{}".format(pprint.pformat(timings)))

    if args.force_train_all_wordemb == True:
        state['fix_pretrained_word_embeddings'] = False

    model = DialogEncoderDecoder(state)
    rng = model.rng

    valid_rounds = 0
    save_model_on_first_valid = False

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")

            parameter_strings_to_ignore = []
            if args.reinitialize_decoder_parameters:
                parameter_strings_to_ignore += ['Wd_']
                parameter_strings_to_ignore += ['bd_']

                save_model_on_first_valid = True
            if args.reinitialize_latent_variable_parameters:
                parameter_strings_to_ignore += ['latent_utterance_prior']
                parameter_strings_to_ignore += [
                    'latent_utterance_approx_posterior'
                ]
                parameter_strings_to_ignore += ['kl_divergence_cost_weight']
                parameter_strings_to_ignore += ['latent_dcgm_encoder']

                save_model_on_first_valid = True

            load(model, filename, parameter_strings_to_ignore)
        else:
            raise Exception("Cannot resume, cannot find model file!")

        if 'run_id' not in model.state:
            raise Exception(
                'Backward compatibility not ensured! (need run_id in state)')

    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.debug("Compile trainer")
    if not state["use_nce"]:
        if ('add_latent_gaussian_per_utterance'
                in state) and (state["add_latent_gaussian_per_utterance"]):
            logger.debug(
                "Training using variational lower bound on log-likelihood")
        else:
            logger.debug("Training using exact log-likelihood")

        train_batch = model.build_train_function()
    else:
        logger.debug("Training with noise contrastive estimation")
        train_batch = model.build_nce_function()

    eval_batch = model.build_eval_function()

    gamma_bounding = model.build_gamma_bounding_function()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)
    train_data.start()

    # Start looping through the dataset
    step = 0
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_kl_divergence_cost = 0
    train_posterior_gaussian_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0
    is_end_of_batch = True
    start_validation = False

    batch = None

    while (step < state['loop_iters']
           and (time.time() - start_time) / 60. < state['time_stop']
           and patience >= 0):

        # Flush to log files
        sys.stderr.flush()
        sys.stdout.flush()

        ### Sampling phase
        if step % 200 == 0:
            # First generate stochastic samples
            for param in model.params:
                print("%s = %.4f" %
                      (param.name, numpy.sum(param.get_value()**2)**0.5))

            samples, costs = random_sampler.sample([[]],
                                                   n_samples=1,
                                                   n_turns=3)
            print("Sampled : {}".format(samples[0]))

        ### Training phase
        batch = train_data.next()

        # Training iterator exhausted
        if not batch:
            logger.debug("Got None...")
            break

        logger.debug("[TRAIN] - Got batch %d,%d" %
                     (batch['x'].shape[1], batch['max_length']))

        x_data = batch['x']
        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_reset = batch['x_reset']
        ran_gaussian_const_utterance = batch['ran_var_gaussian_constutterance']
        ran_uniform_const_utterance = batch['ran_var_uniform_constutterance']

        ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

        is_end_of_batch = False
        if numpy.sum(numpy.abs(x_reset)) < 1:
            # Print when we reach the end of an example (e.g. the end of a dialogue or a document)
            # Knowing when the training procedure reaches the end is useful for diagnosing training problems
            # print('END-OF-BATCH EXAMPLE!')
            is_end_of_batch = True

        if state['use_nce']:
            y_neg = rng.choice(size=(10, max_length, x_data.shape[1]),
                               a=model.idim,
                               p=model.noise_probs).astype('int32')
            c, kl_divergence_cost, posterior_gaussian_mean_variance = train_batch(
                x_data, x_data_reversed, y_neg, max_length, x_cost_mask,
                x_reset, ran_gaussian_const_utterance,
                ran_uniform_const_utterance, ran_decoder_drop_mask)
        else:

            latent_piecewise_utterance_variable_approx_posterior_alpha = 0.0
            latent_piecewise_utterance_variable_prior_alpha = 0.0
            kl_divergences_between_piecewise_prior_and_posterior = 0.0
            kl_divergences_between_gaussian_prior_and_posterior = 0.0
            latent_piecewise_posterior_sample = 0.0
            posterior_gaussian_mean_variance = 0.0

            if model.add_latent_piecewise_per_utterance and model.add_latent_gaussian_per_utterance:
                c, kl_divergence_cost, posterior_gaussian_mean_variance, latent_piecewise_utterance_variable_approx_posterior_alpha, latent_piecewise_utterance_variable_prior_alpha, kl_divergences_between_piecewise_prior_and_posterior, kl_divergences_between_gaussian_prior_and_posterior, latent_piecewise_posterior_sample = train_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                    ran_gaussian_const_utterance, ran_uniform_const_utterance,
                    ran_decoder_drop_mask)
            elif model.add_latent_gaussian_per_utterance:
                c, kl_divergence_cost, posterior_gaussian_mean_variance, kl_divergences_between_gaussian_prior_and_posterior = train_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                    ran_gaussian_const_utterance, ran_uniform_const_utterance,
                    ran_decoder_drop_mask)
            elif model.add_latent_piecewise_per_utterance:
                c, kl_divergence_cost, kl_divergences_between_piecewise_prior_and_posterior = train_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                    ran_gaussian_const_utterance, ran_uniform_const_utterance,
                    ran_decoder_drop_mask)
            else:
                c = train_batch(x_data, x_data_reversed, max_length,
                                x_cost_mask, x_reset,
                                ran_gaussian_const_utterance,
                                ran_uniform_const_utterance,
                                ran_decoder_drop_mask)
                kl_divergence_cost = 0.0

        gamma_bounding()

        # Print batch statistics
        print('cost_sum', c)
        print('cost_mean', c / float(numpy.sum(x_cost_mask)))

        if model.add_latent_piecewise_per_utterance or model.add_latent_gaussian_per_utterance:
            print('kl_divergence_cost_sum', kl_divergence_cost)
            print(
                'kl_divergence_cost_mean', kl_divergence_cost /
                float(len(numpy.where(x_data == model.eos_sym)[0])))

        if model.add_latent_gaussian_per_utterance:
            print('posterior_gaussian_mean_variance',
                  posterior_gaussian_mean_variance)
            print(
                'kl_divergences_between_gaussian_prior_and_posterior',
                numpy.sum(kl_divergences_between_gaussian_prior_and_posterior),
                numpy.min(kl_divergences_between_gaussian_prior_and_posterior),
                numpy.max(kl_divergences_between_gaussian_prior_and_posterior))

        if model.add_latent_piecewise_per_utterance:
            print(
                'kl_divergences_between_piecewise_prior_and_posterior',
                numpy.sum(
                    kl_divergences_between_piecewise_prior_and_posterior),
                numpy.min(
                    kl_divergences_between_piecewise_prior_and_posterior),
                numpy.max(
                    kl_divergences_between_piecewise_prior_and_posterior))

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            gc.collect()
            continue

        train_cost += c
        train_kl_divergence_cost += kl_divergence_cost
        train_posterior_gaussian_mean_variance += posterior_gaussian_mean_variance

        train_done += batch['num_preds']
        train_dialogues_done += batch['num_dialogues']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time

            # Keep track of training cost for the last 'train_freq' batches.
            current_train_cost = train_cost / train_done
            if prev_train_done >= 1 and abs(train_done - prev_train_done) > 0:
                current_train_cost = float(
                    train_cost - prev_train_cost) / float(train_done -
                                                          prev_train_done)

            if numpy.isinf(c) or numpy.isnan(c):
                current_train_cost = 0

            prev_train_cost = train_cost
            prev_train_done = train_done

            h, m, s = ConvertTimedelta(this_time - start_time)

            # We need to catch exceptions due to high numbers in exp
            try:
                print(".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_kl_divergence_cost = %.8f acc_mean_posterior_variance = %.8f" % (h, m, s,\
                                 state['time_stop'] - (time.time() - start_time)/60.,\
                                 step, \
                                 batch['x'].shape[1], \
                                 batch['max_length'], \
                                 float(train_cost/train_done), \
                                 math.exp(float(train_cost/train_done)), \
                                 current_train_cost, \
                                 math.exp(current_train_cost), \
                                 float(train_misclass)/float(train_done), \
                                 float(train_kl_divergence_cost/train_done), \
                                 float(train_posterior_gaussian_mean_variance/train_dialogues_done)))
            except:
                pass

        ### Inspection phase
        if (step % 20 == 0):
            if model.add_latent_gaussian_per_utterance and model.add_latent_piecewise_per_utterance:
                try:
                    print('posterior_gaussian_mean_combination',
                          model.posterior_mean_combination.W.get_value())

                except:
                    pass

                print(
                    'latent_piecewise_utterance_variable_approx_posterior_alpha',
                    numpy.mean(
                        latent_piecewise_utterance_variable_approx_posterior_alpha
                    ),
                    latent_piecewise_utterance_variable_approx_posterior_alpha)

                print(
                    'latent_piecewise_utterance_variable_prior_alpha',
                    numpy.mean(
                        latent_piecewise_utterance_variable_prior_alpha),
                    latent_piecewise_utterance_variable_prior_alpha)

                print(
                    'latent_piecewise_utterance_variable_alpha_diff',
                    (latent_piecewise_utterance_variable_approx_posterior_alpha
                     - latent_piecewise_utterance_variable_prior_alpha))

                print('latent_piecewise_posterior_sample',
                      numpy.min(latent_piecewise_posterior_sample),
                      numpy.max(latent_piecewise_posterior_sample),
                      latent_piecewise_posterior_sample[0, 0, :])
                print('ran_uniform_const_utterance',
                      numpy.min(ran_uniform_const_utterance),
                      numpy.max(ran_uniform_const_utterance),
                      ran_uniform_const_utterance[0, 0, :])

            if model.utterance_decoder_gating.upper(
            ) == 'GRU' and model.decoder_bias_type.upper() == 'ALL':
                Wd_s_q = model.utterance_decoder.Wd_s_q.get_value()
                Wd_s_q_len = Wd_s_q.shape[0]
                print('model.utterance_decoder Wd_s_q full',
                      numpy.mean(numpy.abs(Wd_s_q)), numpy.mean(Wd_s_q**2))

                if model.add_latent_gaussian_per_utterance and model.add_latent_piecewise_per_utterance:
                    Wd_s_q_gaussian = Wd_s_q[
                        Wd_s_q_len - 2 *
                        model.latent_piecewise_per_utterance_dim:Wd_s_q_len -
                        model.latent_piecewise_per_utterance_dim, :]
                    Wd_s_q_piecewise = Wd_s_q[
                        Wd_s_q_len -
                        model.latent_piecewise_per_utterance_dim:Wd_s_q_len, :]

                    print('model.utterance_decoder Wd_s_q gaussian',
                          numpy.mean(numpy.abs(Wd_s_q_gaussian)),
                          numpy.mean(Wd_s_q_gaussian**2))
                    print('model.utterance_decoder Wd_s_q piecewise',
                          numpy.mean(numpy.abs(Wd_s_q_piecewise)),
                          numpy.mean(Wd_s_q_piecewise**2))

                    print(
                        'model.utterance_decoder Wd_s_q piecewise/gaussian',
                        numpy.mean(numpy.abs(Wd_s_q_piecewise)) /
                        numpy.mean(numpy.abs(Wd_s_q_gaussian)),
                        numpy.mean(Wd_s_q_piecewise**2) /
                        numpy.mean(Wd_s_q_gaussian**2))

                elif model.add_latent_gaussian_per_utterance:
                    Wd_s_q_piecewise = Wd_s_q[
                        Wd_s_q_len -
                        model.latent_piecewise_per_utterance_dim:Wd_s_q_len, :]

                    print('model.utterance_decoder Wd_s_q piecewise',
                          numpy.mean(numpy.abs(Wd_s_q_piecewise)),
                          numpy.mean(Wd_s_q_piecewise**2))

                elif model.add_latent_piecewise_per_utterance:
                    Wd_s_q_gaussian = Wd_s_q[
                        Wd_s_q_len -
                        model.latent_piecewise_per_utterance_dim:Wd_s_q_len, :]

                    print('model.utterance_decoder Wd_s_q gaussian',
                          numpy.mean(numpy.abs(Wd_s_q_gaussian)),
                          numpy.mean(Wd_s_q_gaussian**2))

            if model.utterance_decoder_gating.upper(
            ) == 'BOW' and model.decoder_bias_type.upper() == 'ALL':
                Wd_bow_W_in = model.utterance_decoder.Wd_bow_W_in.get_value()
                Wd_bow_W_in_len = Wd_bow_W_in.shape[0]
                print('model.utterance_decoder Wd_bow_W_in full',
                      numpy.mean(numpy.abs(Wd_bow_W_in)),
                      numpy.mean(Wd_bow_W_in**2))

                if model.add_latent_gaussian_per_utterance and model.add_latent_piecewise_per_utterance:
                    Wd_bow_W_in_gaussian = Wd_bow_W_in[
                        Wd_bow_W_in_len -
                        2 * model.latent_piecewise_per_utterance_dim:
                        Wd_bow_W_in_len -
                        model.latent_piecewise_per_utterance_dim, :]
                    Wd_bow_W_in_piecewise = Wd_bow_W_in[
                        Wd_bow_W_in_len - model.
                        latent_piecewise_per_utterance_dim:Wd_bow_W_in_len, :]

                    print('model.utterance_decoder Wd_bow_W_in gaussian',
                          numpy.mean(numpy.abs(Wd_bow_W_in_gaussian)),
                          numpy.mean(Wd_bow_W_in_gaussian**2))
                    print('model.utterance_decoder Wd_bow_W_in piecewise',
                          numpy.mean(numpy.abs(Wd_bow_W_in_piecewise)),
                          numpy.mean(Wd_bow_W_in_piecewise**2))

                    print(
                        'model.utterance_decoder Wd_bow_W_in piecewise/gaussian',
                        numpy.mean(numpy.abs(Wd_bow_W_in_piecewise)) /
                        numpy.mean(numpy.abs(Wd_bow_W_in_gaussian)),
                        numpy.mean(Wd_bow_W_in_piecewise**2) /
                        numpy.mean(Wd_bow_W_in_gaussian**2))

                elif model.add_latent_gaussian_per_utterance:
                    Wd_bow_W_in_piecewise = Wd_bow_W_in[
                        Wd_bow_W_in_len - model.
                        latent_piecewise_per_utterance_dim:Wd_bow_W_in_len, :]

                    print('model.utterance_decoder Wd_bow_W_in piecewise',
                          numpy.mean(numpy.abs(Wd_bow_W_in_piecewise)),
                          numpy.mean(Wd_bow_W_in_piecewise**2))

                elif model.add_latent_piecewise_per_utterance:
                    Wd_bow_W_in_gaussian = Wd_bow_W_in[
                        Wd_bow_W_in_len - model.
                        latent_piecewise_per_utterance_dim:Wd_bow_W_in_len, :]

                    print('model.utterance_decoder Wd_bow_W_in gaussian',
                          numpy.mean(numpy.abs(Wd_bow_W_in_gaussian)),
                          numpy.mean(Wd_bow_W_in_gaussian**2))

        ### Evaluation phase
        if valid_data is not None and\
            step % state['valid_freq'] == 0 and step > 1:
            start_validation = True

        # Only start validation loop once it's time to validate and once all previous batches have been reset
        if start_validation and is_end_of_batch:
            start_validation = False
            valid_data.start()
            valid_cost = 0
            valid_kl_divergence_cost = 0
            valid_posterior_gaussian_mean_variance = 0

            valid_wordpreds_done = 0
            valid_dialogues_done = 0

            logger.debug("[VALIDATION START]")

            while True:
                batch = valid_data.next()

                # Validation finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" %
                             (batch['x'].shape[1], batch['max_length']))

                x_data = batch['x']
                x_data_reversed = batch['x_reversed']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']

                x_reset = batch['x_reset']
                ran_gaussian_const_utterance = batch[
                    'ran_var_gaussian_constutterance']
                ran_uniform_const_utterance = batch[
                    'ran_var_uniform_constutterance']

                ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

                posterior_gaussian_mean_variance = 0.0

                c, c_list, kl_divergence_cost = eval_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                    ran_gaussian_const_utterance, ran_uniform_const_utterance,
                    ran_decoder_drop_mask)

                # Reshape into a matrix, where rows are validation samples and columns are tokens
                # Note that we use max_length-1 because we don't get a cost for the first token
                # (the first token is always assumed to be eos)
                c_list = c_list.reshape((batch['x'].shape[1], max_length - 1),
                                        order=(1, 0))
                c_list = numpy.sum(c_list, axis=1)

                words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
                c_list = c_list / words_in_dialogues

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                valid_kl_divergence_cost += kl_divergence_cost
                valid_posterior_gaussian_mean_variance += posterior_gaussian_mean_variance

                # Print batch statistics
                print('valid_cost', valid_cost)
                print('valid_kl_divergence_cost sample', kl_divergence_cost)
                print('posterior_gaussian_mean_variance',
                      posterior_gaussian_mean_variance)

                valid_wordpreds_done += batch['num_preds']
                valid_dialogues_done += batch['num_dialogues']

            logger.debug("[VALIDATION END]")

            valid_cost /= max(1.0, valid_wordpreds_done)
            valid_kl_divergence_cost /= max(1.0, valid_wordpreds_done)
            valid_posterior_gaussian_mean_variance /= max(
                1.0, valid_dialogues_done)

            if (len(timings["valid_cost"]) == 0) \
                or (valid_cost < numpy.min(timings["valid_cost"])) \
                or (save_model_on_first_valid and valid_rounds == 0):
                patience = state['patience']

                # Save model if there is decrease in validation cost
                save(model, timings, train_data)
                print('best valid_cost', valid_cost)
            elif valid_cost >= timings["valid_cost"][-1] * state[
                    'cost_threshold']:
                patience -= 1

            if args.save_every_valid_iteration:
                save(model, timings, train_data, '_' + str(step) + '_')
            if args.auto_restart:
                save(model, timings, train_data, '_auto_')

            # We need to catch exceptions due to high numbers in exp
            try:
                print(
                    "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid kldiv cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d"
                    %
                    (float(valid_cost), float(
                        math.exp(valid_cost)), float(valid_kl_divergence_cost),
                     float(valid_posterior_gaussian_mean_variance), patience))
            except:
                try:
                    print("** valid cost (NLL) = %.4f, patience = %d" %
                          (float(valid_cost), patience))
                except:
                    pass

            timings["train_cost"].append(train_cost / train_done)
            timings["train_kl_divergence_cost"].append(
                train_kl_divergence_cost / train_done)
            timings["train_posterior_gaussian_mean_variance"].append(
                train_posterior_gaussian_mean_variance / train_dialogues_done)
            timings["valid_cost"].append(valid_cost)
            timings["valid_kl_divergence_cost"].append(
                valid_kl_divergence_cost)
            timings["valid_posterior_gaussian_mean_variance"].append(
                valid_posterior_gaussian_mean_variance)

            # Reset train cost, train misclass and train done metrics
            train_cost = 0
            train_done = 0
            prev_train_cost = 0
            prev_train_done = 0

            # Count number of validation rounds done so far
            valid_rounds += 1

        step += 1

    logger.debug("All done, exiting...")
Example #7
def main(args):
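    # Variational HRED training loop with an optional secondary dataset, BLEU /
    # recall evaluators prepared for the validation data, and periodic
    # gradient-variance diagnostics for the latent Gaussian variables.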
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state = eval(args.prototype)()
    timings = init_timings()

    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)

        state_file = args.resume + '_state.pkl'
        timings_file = args.resume + '_timing.npz'

        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")

            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)

            # Increment seed to make sure we get newly shuffled batches when training on large datasets
            state['seed'] = state['seed'] + 10

        else:
            raise Exception("Cannot resume, cannot find files!")

    logger.debug("State:\n{}".format(pprint.pformat(state)))
    logger.debug("Timings:\n{}".format(pprint.pformat(timings)))

    if args.force_train_all_wordemb == True:
        state['fix_pretrained_word_embeddings'] = False

    model = DialogEncoderDecoder(state)
    rng = model.rng

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")

            parameter_strings_to_ignore = []
            if args.reinitialize_decoder_parameters:
                parameter_strings_to_ignore += ['latent_utterance_prior']
                parameter_strings_to_ignore += [
                    'latent_utterance_approx_posterior'
                ]
            if args.reinitialize_variational_parameters:
                parameter_strings_to_ignore += ['Wd_']
                parameter_strings_to_ignore += ['bd_']
                parameter_strings_to_ignore += ['variational_cost_weight']

            load(model, filename, parameter_strings_to_ignore)
        else:
            raise Exception("Cannot resume, cannot find model file!")

        if 'run_id' not in model.state:
            raise Exception(
                'Backward compatibility not ensured! (need run_id in state)')

    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.debug("Compile trainer")
    if not state["use_nce"]:
        if ('add_latent_gaussian_per_utterance'
                in state) and (state["add_latent_gaussian_per_utterance"]):
            logger.debug(
                "Training using variational lower bound on log-likelihood")
        else:
            logger.debug("Training using exact log-likelihood")

        train_batch = model.build_train_function()
    else:
        logger.debug("Training with noise contrastive estimation")
        train_batch = model.build_nce_function()

    eval_batch = model.build_eval_function()

    if model.add_latent_gaussian_per_utterance:
        eval_grads = model.build_eval_grads()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)
    train_data.start()

    use_secondary_data = False
    if ('secondary_train_dialogues'
            in state) and (len(state['secondary_train_dialogues']) > 0):
        logger.debug("Load secondary data")
        use_secondary_data = True
        secondary_train_data = get_secondary_train_iterator(state)
        secondary_train_data.start()
        secondary_rng = numpy.random.RandomState(state['seed'])

    # Build the data structures for Bleu evaluation
    if 'bleu_evaluation' in state:
        bleu_eval_n_1 = BleuEvaluator(n=1)
        bleu_eval_n_2 = BleuEvaluator(n=2)
        bleu_eval_n_3 = BleuEvaluator(n=3)
        bleu_eval_n_4 = BleuEvaluator(n=4)
        jaccard_eval = JaccardEvaluator()
        recall_at_1_eval = RecallEvaluator(n=1)
        recall_at_5_eval = RecallEvaluator(n=5)
        mrr_at_5_eval = MRREvaluator(n=5)
        tfidf_cs_at_1_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 1)
        tfidf_cs_at_5_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 5)

        samples = open(state['bleu_evaluation'], 'r').readlines()
        n = state['bleu_context_length']

        contexts = []
        targets = []
        for x in samples:
            sentences = x.strip().split('\t')
            assert len(sentences) > n
            contexts.append(sentences[:n])
            targets.append(sentences[n:])

    # Start looping through the dataset
    step = 0
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_variational_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0
    is_end_of_batch = True
    start_validation = False
    training_on_secondary_dataset = False

    batch = None

    while (step < state['loop_iters']
           and (time.time() - start_time) / 60. < state['time_stop']
           and patience >= 0):

        # Sample stuff
        if step % 200 == 0:
            # First generate stochastic samples
            for param in model.params:
                print "%s = %.4f" % (param.name, numpy.sum(param.get_value()**2)**0.5)

            samples, costs = random_sampler.sample([[]],
                                                   n_samples=1,
                                                   n_turns=3)
            print "Sampled : {}".format(samples[0])

        # Training phase

        # If we are training on a primary and secondary dataset, sample at random from either of them
        if is_end_of_batch:
            if use_secondary_data and (secondary_rng.uniform() >
                                       state['secondary_proportion']):
                training_on_secondary_dataset = True
            else:
                training_on_secondary_dataset = False

        if training_on_secondary_dataset:
            batch = secondary_train_data.next()
        else:
            batch = train_data.next()

        # Training iterator exhausted
        if not batch:
            logger.debug("Got None...")
            break

        logger.debug("[TRAIN] - Got batch %d,%d" %
                     (batch['x'].shape[1], batch['max_length']))

        x_data = batch['x']
        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_semantic = batch['x_semantic']
        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']
        ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

        is_end_of_batch = False
        if numpy.sum(numpy.abs(x_reset)) < 1:
            print 'END-OF-BATCH EXAMPLE!'
            is_end_of_batch = True

        if state['use_nce']:
            y_neg = rng.choice(size=(10, max_length, x_data.shape[1]),
                               a=model.idim,
                               p=model.noise_probs).astype('int32')
            c, variational_cost, posterior_mean_variance = train_batch(
                x_data, x_data_reversed, y_neg, max_length, x_cost_mask,
                x_semantic, x_reset, ran_cost_utterance, ran_decoder_drop_mask)
        else:
            c, variational_cost, posterior_mean_variance = train_batch(
                x_data, x_data_reversed, max_length, x_cost_mask, x_semantic,
                x_reset, ran_cost_utterance, ran_decoder_drop_mask)

        print 'cost_sum', c
        print 'cost_mean', c / float(numpy.sum(x_cost_mask))
        print 'variational_cost_sum', variational_cost
        print 'variational_cost_mean', variational_cost / float(
            len(numpy.where(x_data == model.eos_sym)[0]))
        print 'posterior_mean_variance', posterior_mean_variance

        #if variational_cost > 2:
        #    print 'x_data', x_data
        #    print 'x_data_reversed', x_data_reversed
        #    print 'max_length', max_length
        #    print 'x_cost_mask', x_cost_mask
        #    print 'x_semantic', x_semantic
        #    print 'x_reset', x_reset
        #    print 'ran_cost_utterance', ran_cost_utterance[0:3, 0:3, 0:3]

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            gc.collect()
            continue

        train_cost += c
        train_variational_cost += variational_cost
        train_posterior_mean_variance += posterior_mean_variance

        train_done += batch['num_preds']
        train_dialogues_done += batch['num_dialogues']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time

            # Keep track of training cost for the last 'train_freq' batches.
            current_train_cost = train_cost / train_done
            if prev_train_done >= 1:
                current_train_cost = float(
                    train_cost - prev_train_cost) / float(train_done -
                                                          prev_train_done)

            prev_train_cost = train_cost
            prev_train_done = train_done

            h, m, s = ConvertTimedelta(this_time - start_time)
            print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_variational_cost = %.8f acc_mean_posterior_variance = %.8f" % (h, m, s,\
                             state['time_stop'] - (time.time() - start_time)/60.,\
                             step, \
                             batch['x'].shape[1], \
                             batch['max_length'], \
                             float(train_cost/train_done), \
                             math.exp(float(train_cost/train_done)), \
                             current_train_cost, \
                             math.exp(current_train_cost), \
                             float(train_misclass)/float(train_done), \
                             float(train_variational_cost/train_done), \
                             float(train_posterior_mean_variance/train_dialogues_done))


        if valid_data is not None and\
            step % state['valid_freq'] == 0 and step > 1:
            start_validation = True

        # Evaluate gradient variance every 200 steps

        if (step % 200 == 0) and (model.add_latent_gaussian_per_utterance):
            k_eval = 10

            softmax_costs = numpy.zeros((k_eval), dtype='float32')
            var_costs = numpy.zeros((k_eval), dtype='float32')
            gradients_wrt_softmax = numpy.zeros(
                (k_eval, model.qdim_decoder, model.qdim_decoder),
                dtype='float32')
            for k in range(0, k_eval):
                batch = add_random_variables_to_batch(model.state, model.rng,
                                                      batch, None, False)
                ran_cost_utterance = batch['ran_var_constutterance']
                ran_decoder_drop_mask = batch['ran_decoder_drop_mask']
                softmax_cost, var_cost, grads_wrt_softmax, grads_wrt_variational_cost = eval_grads(
                    x_data, x_data_reversed, max_length, x_cost_mask,
                    x_semantic, x_reset, ran_cost_utterance,
                    ran_decoder_drop_mask)
                softmax_costs[k] = softmax_cost
                var_costs[k] = var_cost
                gradients_wrt_softmax[k, :, :] = grads_wrt_softmax

            print 'mean softmax_costs', numpy.mean(softmax_costs)
            print 'std softmax_costs', numpy.std(softmax_costs)

            print 'mean var_costs', numpy.mean(var_costs)
            print 'std var_costs', numpy.std(var_costs)

            print 'mean gradients_wrt_softmax', numpy.mean(
                numpy.abs(numpy.mean(gradients_wrt_softmax,
                                     axis=0))), numpy.mean(
                                         gradients_wrt_softmax, axis=0)
            print 'std gradients_wrt_softmax', numpy.mean(
                numpy.std(gradients_wrt_softmax,
                          axis=0)), numpy.std(gradients_wrt_softmax, axis=0)

            print 'std greater than mean', numpy.where(
                numpy.std(gradients_wrt_softmax, axis=0) > numpy.abs(
                    numpy.mean(gradients_wrt_softmax, axis=0)))[0].shape[0]

            Wd_s_q = model.utterance_decoder.Wd_s_q.get_value()

            print 'Wd_s_q all', numpy.sum(numpy.abs(Wd_s_q)), numpy.mean(
                numpy.abs(Wd_s_q))
            print 'Wd_s_q latent', numpy.sum(
                numpy.abs(
                    Wd_s_q[(Wd_s_q.shape[0] -
                            state['latent_gaussian_per_utterance_dim']
                            ):Wd_s_q.shape[0], :])), numpy.mean(
                                numpy.abs(Wd_s_q[(
                                    Wd_s_q.shape[0] -
                                    state['latent_gaussian_per_utterance_dim']
                                ):Wd_s_q.shape[0], :]))

            print 'Wd_s_q ratio', (numpy.sum(
                numpy.abs(Wd_s_q[(Wd_s_q.shape[0] -
                                  state['latent_gaussian_per_utterance_dim']
                                  ):Wd_s_q.shape[0], :])) /
                                   numpy.sum(numpy.abs(Wd_s_q)))

        #print 'tmp_normalizing_constant_a', tmp_normalizing_constant_a
        #print 'tmp_normalizing_constant_b', tmp_normalizing_constant_b
        #print 'tmp_c', tmp_c.shape, tmp_c
        #print 'tmp_d', tmp_d.shape, tmp_d

        #print 'grads_wrt_softmax', grads_wrt_softmax.shape, numpy.sum(numpy.abs(grads_wrt_softmax)), numpy.abs(grads_wrt_softmax[0:5,0:5])
        #print 'grads_wrt_variational_cost', grads_wrt_variational_cost.shape, numpy.sum(numpy.abs(grads_wrt_variational_cost)), numpy.abs(grads_wrt_variational_cost[0:5,0:5])

        # Only start validation loop once it's time to validate and once all previous batches have been reset
        if start_validation and is_end_of_batch:
            start_validation = False
            valid_data.start()
            valid_cost = 0
            valid_variational_cost = 0
            valid_posterior_mean_variance = 0

            valid_wordpreds_done = 0
            valid_dialogues_done = 0

            # Prepare variables for plotting histogram over word-perplexities and mutual information
            valid_data_len = valid_data.data_len
            valid_cost_list = numpy.zeros((valid_data_len, ))
            valid_pmi_list = numpy.zeros((valid_data_len, ))

            # Prepare variables for printing the training examples the model performs best and worst on
            valid_extrema_setsize = min(state['track_extrema_samples_count'],
                                        valid_data_len)
            valid_extrema_samples_to_print = min(
                state['print_extrema_samples_count'], valid_extrema_setsize)

            max_stored_len = 160  # Maximum number of tokens to store for dialogues with highest and lowest validation errors
            valid_lowest_costs = numpy.ones((valid_extrema_setsize, )) * 1000
            valid_lowest_dialogues = numpy.ones(
                (valid_extrema_setsize, max_stored_len)) * 1000
            valid_highest_costs = numpy.ones(
                (valid_extrema_setsize, )) * (-1000)
            valid_highest_dialogues = numpy.ones(
                (valid_extrema_setsize, max_stored_len)) * (-1000)

            logger.debug("[VALIDATION START]")

            while True:
                batch = valid_data.next()

                # Train finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" %
                             (batch['x'].shape[1], batch['max_length']))

                x_data = batch['x']
                x_data_reversed = batch['x_reversed']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']
                x_semantic = batch['x_semantic']

                x_reset = batch['x_reset']
                ran_cost_utterance = batch['ran_var_constutterance']
                ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

                c, c_list, variational_cost, posterior_mean_variance = eval_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask,
                    x_semantic, x_reset, ran_cost_utterance,
                    ran_decoder_drop_mask)

                # Reshape into a matrix, where rows are validation samples and columns are tokens
                # Note that we use max_length-1 because we don't get a cost for the first token
                # (the first token is always assumed to be eos)
                c_list = c_list.reshape((batch['x'].shape[1], max_length - 1),
                                        order=(1, 0))
                c_list = numpy.sum(c_list, axis=1)

                words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
                c_list = c_list / words_in_dialogues

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                valid_variational_cost += variational_cost
                valid_posterior_mean_variance += posterior_mean_variance

                print 'valid_cost', valid_cost
                print 'valid_variational_cost sample', variational_cost
                print 'posterior_mean_variance', posterior_mean_variance

                valid_wordpreds_done += batch['num_preds']
                valid_dialogues_done += batch['num_dialogues']

            logger.debug("[VALIDATION END]")

            valid_cost /= valid_wordpreds_done
            valid_variational_cost /= valid_wordpreds_done
            valid_posterior_mean_variance /= valid_dialogues_done

            if len(timings["valid_cost"]) == 0 or valid_cost < numpy.min(
                    timings["valid_cost"]):
                patience = state['patience']
                # Saving model if decrease in validation cost
                save(model, timings)
                print 'best valid_cost', valid_cost
            elif valid_cost >= timings["valid_cost"][-1] * state[
                    'cost_threshold']:
                patience -= 1

            if args.save_every_valid_iteration:
                save(model, timings, '_' + str(step) + '_')

            print "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid variational cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % (
                float(valid_cost), float(
                    math.exp(valid_cost)), float(valid_variational_cost),
                float(valid_posterior_mean_variance), patience)
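            # Word-perplexity above is exp of the per-word NLL; e.g. a valid cost of 3.0
            # corresponds to a perplexity of roughly exp(3.0) ~= 20.1.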

            timings["train_cost"].append(train_cost / train_done)
            timings["train_variational_cost"].append(train_variational_cost /
                                                     train_done)
            timings["train_posterior_mean_variance"].append(
                train_posterior_mean_variance / train_dialogues_done)
            timings["valid_cost"].append(valid_cost)
            timings["valid_variational_cost"].append(valid_variational_cost)
            timings["valid_posterior_mean_variance"].append(
                valid_posterior_mean_variance)

            # Reset train cost and train done counters
            train_cost = 0
            train_done = 0
            prev_train_cost = 0
            prev_train_done = 0

        step += 1

    logger.debug("All done, exiting...")
Beispiel #8
0
def main():
    #### Load the dictionary and build token <-> index maps
    raw_dict = cPickle.load(open('./Data/Dataset.dict.pkl', 'r'))
    str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in raw_dict])
    idx_to_str = dict([(tok_id, tok) for tok, tok_id, _, _ in raw_dict])
    #########

    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    model = DialogEncoderDecoder(state)

    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    contexts = [[]]
    lines = open(args.context, "r").readlines()
    if len(lines):
        contexts = [x.strip() for x in lines]
    #contexts = cPickle.load(open('./Data/Test.dialogues.pkl', 'r'))
    print('Sampling started...')
    context_samples, context_costs, att_weights, att_context = sampler.sample(
        contexts,
        n_samples=args.n_samples,
        n_turns=args.n_turns,
        ignore_unk=args.ignore_unk,
        verbose=args.verbose)
    print('Sampling finished.')
    print('Saving to file...')

    # Write to output file
    output_handle = open(args.output, "w")
    for context_sample in context_samples:
        print >> output_handle, '\t'.join(context_sample)
    outline = ''
    #for att_weight in att_weights:
    #for att_in in att_weight:
    #print >> output_handle, str(att_in)
    print "number of weights:" + str(len(att_weights))
    #for i in range(len(att_weights)):
    #outline = att_weights[0]
    cPickle.dump(att_weights,
                 open('Data/beam_search_2000_2_weight.pkl', 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(att_context,
                 open('Data/beam_search_2000_2_context.pkl', 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
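    # A minimal sketch of how the dumped attention data could be inspected afterwards
    # (assuming the pickles hold lists of array-like attention weights/contexts):
    #   att_w = cPickle.load(open('Data/beam_search_2000_2_weight.pkl', 'rb'))
    #   att_c = cPickle.load(open('Data/beam_search_2000_2_context.pkl', 'rb'))
    #   print len(att_w), numpy.asarray(att_w[0]).shape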
    #for i in range(len(att_context)):
    #print att_context[i]
    #print numpy.array(att_weights[0])
    #print type(att_weights[0])
    #aa = numpy.array(att_weights[0])
    #size  = aa.shape[1]
    #bb = aa.reshape(5,5,size/5)
    #print bb.shape

    output_handle.close()
    print('Saving to file finished.')
    print('All done!')
Beispiel #9
0
def main(args):
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state = eval(args.prototype)()
    timings = init_timings()

    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)

        state_file = args.resume + '_state.pkl'
        timings_file = args.resume + '_timing.npz'

        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")

            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)
        else:
            raise Exception("Cannot resume, cannot find files!")

    logger.debug("State:\n{}".format(pprint.pformat(state)))
    logger.debug("Timings:\n{}".format(pprint.pformat(timings)))

    model = DialogEncoderDecoder(state)
    rng = model.rng

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")
            load(model, filename)
        else:
            raise Exception("Cannot resume, cannot find model file!")

        if 'run_id' not in model.state:
            raise Exception(
                'Backward compatibility not ensured! (need run_id in state)')
    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.debug("Compile trainer")
    if not state["use_nce"]:
        train_batch = model.build_train_function()
    else:
        train_batch = model.build_nce_function()

    eval_batch = model.build_eval_function()
    eval_misclass_batch = model.build_eval_misclassification_function()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, \
    test_data = get_batch_iterator(rng, state)

    train_data.start()

    # Build the data structures for Bleu evaluation
    if 'bleu_evaluation' in state:
        bleu_eval = BleuEvaluator()
        jaccard_eval = JaccardEvaluator()
        recall_at_1_eval = RecallEvaluator(n=1)
        recall_at_5_eval = RecallEvaluator(n=5)
        mrr_at_5_eval = MRREvaluator(n=5)
        tfidf_cs_at_1_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 1)
        tfidf_cs_at_5_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 5)

        samples = open(state['bleu_evaluation'], 'r').readlines()
        n = state['bleu_context_length']

        contexts = []
        targets = []
        for x in samples:
            sentences = x.strip().split('\t')
            assert len(sentences) > n
            contexts.append(sentences[:n])
            targets.append(sentences[n:])
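        # For example, a hypothetical line 'hi\thow are you ?\tfine thanks' with n = 2
        # appends ['hi', 'how are you ?'] to contexts and ['fine thanks'] to targets.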

    # Start looping through the dataset
    step = 0
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_misclass = 0
    train_done = 0
    ex_done = 0

    while (step < state['loop_iters']
           and (time.time() - start_time) / 60. < state['time_stop']
           and patience >= 0):

        # Sample stuff
        if step % 200 == 0:
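            # Print the L2 norm of every model parameter as a rough health check on the weights;
            # a sudden jump here usually signals diverging training.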
            for param in model.params:
                print "%s = %.4f" % (param.name, numpy.sum(param.get_value()**
                                                           2)**0.5)

            samples, costs = random_sampler.sample([[]],
                                                   n_samples=1,
                                                   n_turns=3)
            print "Sampled : {}".format(samples[0])

        # Training phase
        batch = train_data.next()

        # Train finished
        if not batch:
            # Restart training
            logger.debug("Got None...")
            break

        logger.debug("[TRAIN] - Got batch %d,%d" %
                     (batch['x'].shape[1], batch['max_length']))

        x_data = batch['x']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']

        if state['use_nce']:
            y_neg = rng.choice(size=(10, max_length, x_data.shape[1]),
                               a=model.idim,
                               p=model.noise_probs).astype('int32')
            c = train_batch(x_data, y_neg, max_length, x_cost_mask)
        else:
            c = train_batch(x_data, max_length, x_cost_mask)

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            continue

        train_cost += c

        # Compute word-error rate
        miscl = eval_misclass_batch(x_data, max_length, x_cost_mask)
        if numpy.isinf(miscl) or numpy.isnan(miscl):
            logger.warn("Got NaN misclassification .. skipping")
            continue

        train_misclass += miscl

        train_done += batch['num_preds']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time
            h, m, s = ConvertTimedelta(this_time - start_time)
            print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f acc_mean_word_error = %.4f " % (h, m, s,\
                                                                 state['time_stop'] - (time.time() - start_time)/60.,\
                                                                 step, \
                                                                 batch['x'].shape[1], \
                                                                 batch['max_length'], \
                                                                 float(train_cost/train_done), \
                                                                 math.exp(float(train_cost/train_done)), \
                                                                 float(train_misclass)/float(train_done))




        if valid_data is not None and\
            step % state['valid_freq'] == 0 and step > 1:
            valid_data.start()
            valid_cost = 0
            valid_misclass = 0
            valid_empirical_mutual_information = 0
            valid_wordpreds_done = 0
            valid_triples_done = 0

            # Prepare variables for plotting histogram over word-perplexities and mutual information
            valid_data_len = valid_data.data_len
            valid_cost_list = numpy.zeros((valid_data_len, ))
            valid_pmi_list = numpy.zeros((valid_data_len, ))

            # Prepare variables for printing the validation examples the model performs best and worst on
            valid_extrema_setsize = min(state['track_extrema_samples_count'],
                                        valid_data_len)
            valid_extrema_samples_to_print = min(
                state['print_extrema_samples_count'], valid_extrema_setsize)

            valid_lowest_costs = numpy.ones((valid_extrema_setsize, )) * 1000
            valid_lowest_triples = numpy.ones(
                (valid_extrema_setsize, state['seqlen'])) * 1000
            valid_highest_costs = numpy.ones(
                (valid_extrema_setsize, )) * (-1000)
            valid_highest_triples = numpy.ones(
                (valid_extrema_setsize, state['seqlen'])) * (-1000)

            logger.debug("[VALIDATION START]")

            while True:
                batch = valid_data.next()
                # Validation finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" %
                             (batch['x'].shape[1], batch['max_length']))

                x_data = batch['x']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']

                c, c_list = eval_batch(x_data, max_length, x_cost_mask)
                c_list = c_list.reshape((batch['x'].shape[1], max_length),
                                        order=(1, 0))
                c_list = numpy.sum(c_list, axis=1)

                words_in_triples = numpy.sum(x_cost_mask, axis=0)
                c_list = c_list / words_in_triples

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                nxt = min((valid_triples_done + batch['x'].shape[1]),
                          valid_data_len)
                triples_in_batch = nxt - valid_triples_done
                valid_cost_list[(nxt - triples_in_batch):nxt] = numpy.exp(
                    c_list[0:triples_in_batch])
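                # valid_cost_list stores exp(per-word NLL), i.e. the word perplexity of each
                # triple, which is later plotted as a histogram.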

                # Store best and worst validation costs
                con_costs = np.concatenate(
                    [valid_lowest_costs, c_list[0:triples_in_batch]])
                con_triples = np.concatenate(
                    [valid_lowest_triples, x_data[:, 0:triples_in_batch].T],
                    axis=0)
                con_indices = con_costs.argsort()[0:valid_extrema_setsize][::1]
                valid_lowest_costs = con_costs[con_indices]
                valid_lowest_triples = con_triples[con_indices]

                con_costs = np.concatenate(
                    [valid_highest_costs, c_list[0:triples_in_batch]])
                con_triples = np.concatenate(
                    [valid_highest_triples, x_data[:, 0:triples_in_batch].T],
                    axis=0)
                con_indices = con_costs.argsort(
                )[-valid_extrema_setsize:][::-1]
                valid_highest_costs = con_costs[con_indices]
                valid_highest_triples = con_triples[con_indices]

                # Compute word-error rate
                miscl = eval_misclass_batch(x_data, max_length, x_cost_mask)
                if numpy.isinf(miscl) or numpy.isnan(miscl):
                    continue

                valid_misclass += miscl

                # Compute empirical mutual information
                if state['compute_mutual_information'] == True:
                    # Compute marginal log-likelihood of last utterance in triple:
                    # We approximate it with the marginal log-probability of the utterance being observed first in the triple
                    x_data_last_utterance = batch['x_last_utterance']
                    x_cost_mask_last_utterance = batch['x_mask_last_utterance']
                    marginal_last_utterance_loglikelihood, marginal_last_utterance_loglikelihood_list = eval_batch(
                        x_data_last_utterance, max_length,
                        x_cost_mask_last_utterance)
                    marginal_last_utterance_loglikelihood_list = marginal_last_utterance_loglikelihood_list.reshape(
                        (batch['x'].shape[1], max_length), order=(1, 0))
                    marginal_last_utterance_loglikelihood_list = numpy.sum(
                        marginal_last_utterance_loglikelihood_list, axis=1)
                    # If we wanted to normalize histogram plots by utterance length, we should enable this:
                    #words_in_last_utterance = numpy.sum(x_cost_mask_last_utterance, axis=0)
                    #marginal_last_utterance_loglikelihood_list = marginal_last_utterance_loglikelihood_list / words_in_last_utterance

                    # Compute marginal log-likelihood of first utterances in triple by masking the last utterance
                    x_cost_mask_first_utterances = x_cost_mask - x_cost_mask_last_utterance
                    marginal_first_utterances_loglikelihood, marginal_first_utterances_loglikelihood_list = eval_batch(
                        x_data, max_length, x_cost_mask_first_utterances)

                    marginal_first_utterances_loglikelihood_list = marginal_first_utterances_loglikelihood_list.reshape(
                        (batch['x'].shape[1], max_length), order=(1, 0))
                    marginal_first_utterances_loglikelihood_list = numpy.sum(
                        marginal_first_utterances_loglikelihood_list, axis=1)

                    # If we wanted to normalize histogram plots by utterance length, we should enable this:
                    #words_in_first_utterances = numpy.sum(x_cost_mask_first_utterances, axis=0)
                    #marginal_first_utterances_loglikelihood_list = marginal_first_utterances_loglikelihood_list / words_in_first_utterances

                    # Compute empirical mutual information and pointwise empirical mutual information
                    valid_empirical_mutual_information += -c + marginal_first_utterances_loglikelihood + marginal_last_utterance_loglikelihood
                    valid_pmi_list[(nxt - triples_in_batch):nxt] = (
                        -c_list * words_in_triples +
                        marginal_first_utterances_loglikelihood_list +
                        marginal_last_utterance_loglikelihood_list
                    )[0:triples_in_batch]
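                    # Assuming eval_batch returns summed negative log-likelihoods (as its use for
                    # valid_cost suggests), the quantity accumulated above is
                    #   -NLL(triple) + NLL(first utterances) + NLL(last utterance)
                    #   = log p(triple) - log p(first) - log p(last),
                    # i.e. a pointwise-mutual-information style estimate between the context and
                    # the final utterance.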

                valid_wordpreds_done += batch['num_preds']
                valid_triples_done += batch['x'].shape[1]

            logger.debug("[VALIDATION END]")

            valid_cost /= valid_wordpreds_done
            valid_misclass /= float(valid_wordpreds_done)
            valid_empirical_mutual_information /= float(valid_triples_done)

            if len(timings["valid_cost"]
                   ) == 0 or valid_cost < timings["valid_cost"][-1]:
                patience = state['patience']
                # Save the model if the validation cost decreased
                save(model, timings)
            elif valid_cost >= timings["valid_cost"][-1] * state[
                    'cost_threshold']:
                patience -= 1

            print "** valid cost = %.4f, valid word-perplexity = %.4f, valid mean word-error = %.4f, valid emp. mutual information = %.4f, patience = %d" % (
                float(valid_cost), float(
                    math.exp(valid_cost)), float(valid_misclass),
                valid_empirical_mutual_information, patience)

            timings["train_cost"].append(train_cost / train_done)
            timings["train_misclass"].append(
                float(train_misclass) / float(train_done))
            timings["valid_cost"].append(valid_cost)
            timings["valid_misclass"].append(valid_misclass)
            timings["valid_emi"].append(valid_empirical_mutual_information)

            # Reset train cost, train misclass and train done
            train_cost = 0
            train_misclass = 0
            train_done = 0

            # Plot histogram over validation costs
            try:
                pylab.figure()
                bins = range(0, 50, 1)
                pylab.hist(valid_cost_list, bins=bins, normed=1, histtype='bar')
                pylab.savefig(model.state['save_dir'] + '/' +
                              model.state['run_id'] + "_" +
                              model.state['prefix'] +
                              'Valid_WordPerplexities_' + str(step) + '.png')
            except:
                pass

            # Print a random subset of the tracked validation samples with the highest and lowest word log-likelihood
            if state['track_extrema_validation_samples'] == True:
                print " highest word log-likelihood valid samples: "
                np.random.shuffle(valid_lowest_triples)
                for i in range(valid_extrema_samples_to_print):
                    print "      Sample: {}".format(" ".join(
                        model.indices_to_words(
                            numpy.ravel(valid_lowest_triples[i, :]))))

                print " lowest word log-likelihood valid samples: "
                np.random.shuffle(valid_highest_triples)
                for i in range(valid_extrema_samples_to_print):
                    print "      Sample: {}".format(" ".join(
                        model.indices_to_words(
                            numpy.ravel(valid_highest_triples[i, :]))))

            # Plot histogram over empirical pointwise mutual informations
            if state['compute_mutual_information'] == True:
                try:
                    pylab.figure()
                    bins = range(0, 100, 1)
                    pylab.hist(valid_pmi_list, bins=bins, normed=1, histtype='bar')
                    pylab.savefig(model.state['save_dir'] + '/' +
                                  model.state['run_id'] + "_" +
                                  model.state['prefix'] + 'Valid_PMI_' +
                                  str(step) + '.png')
                except:
                    pass

        if 'bleu_evaluation' in state and \
            step % state['valid_freq'] == 0 and step > 1:

            # Compute samples with beam search
            logger.debug(
                "Executing beam search to get targets for bleu, jaccard etc.")
            samples, costs = beam_sampler.sample(contexts,
                                                 n_samples=5,
                                                 ignore_unk=True)
            logger.debug("Finished beam search.")

            assert len(samples) == len(contexts)
            #print 'samples', samples

            # Bleu evaluation
            bleu = bleu_eval.evaluate(samples, targets)

            print "** bleu score = %.4f " % bleu[0]
            timings["valid_bleu"].append(bleu[0])

            # Jaccard evaluation
            jaccard = jaccard_eval.evaluate(samples, targets)

            print "** jaccard score = %.4f " % jaccard
            timings["valid_jaccard"].append(jaccard)

            # Recall evaluation
            recall_at_1 = recall_at_1_eval.evaluate(samples, targets)

            print "** recall@1 score = %.4f " % recall_at_1
            timings["valid_recall_at_1"].append(recall_at_1)

            recall_at_5 = recall_at_5_eval.evaluate(samples, targets)

            print "** recall@5 score = %.4f " % recall_at_5
            timings["valid_recall_at_5"].append(recall_at_5)

            # MRR evaluation (equivalent to mean average precision when each context has a single target)
            mrr_at_5 = mrr_at_5_eval.evaluate(samples, targets)

            print "** mrr@5 score = %.4f " % mrr_at_5
            timings["valid_mrr_at_5"].append(mrr_at_5)

            # TF-IDF cosine similarity evaluation
            tfidf_cs_at_1 = tfidf_cs_at_1_eval.evaluate(samples, targets)

            print "** tfidf-cs@1 score = %.4f " % tfidf_cs_at_1
            timings["tfidf_cs_at_1"].append(tfidf_cs_at_1)

            tfidf_cs_at_5 = tfidf_cs_at_5_eval.evaluate(samples, targets)

            print "** tfidf-cs@5 score = %.4f " % tfidf_cs_at_5
            timings["tfidf_cs_at_5"].append(tfidf_cs_at_5)

        step += 1

    logger.debug("All done, exiting...")
Beispiel #10
0
def train(args, state=None, commands=None):
    if commands:

        def shall_train():
            return commands['train']

        def shall_save():
            return commands['save']

        def shall_abort():
            return commands['abort']

        def saving_done():
            commands['save'] = False

    #logging.basicConfig(level = logging.DEBUG,
    #                    format = "%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if not state:
        state = eval(args.prototype)()

    timings = init_timings()

    auto_restarting = False
    if args.auto_restart:
        assert not args.save_every_valid_iteration
        assert len(args.resume) == 0

        directory = state['save_dir']
        if not directory[-1] == '/':
            directory = directory + '/'

        auto_resume_postfix = state['prefix'] + '_auto_model.npz'

        if os.path.exists(directory):
            directory_files = [
                f for f in listdir(directory) if isfile(join(directory, f))
            ]
            resume_filename = ''
            for f in directory_files:
                if len(f) > len(auto_resume_postfix):
                    if f[len(f) - len(auto_resume_postfix
                                      ):len(f)] == auto_resume_postfix:
                        if len(resume_filename) > 0:
                            print 'ERROR: FOUND MULTIPLE MODELS IN DIRECTORY:', directory
                            assert False
                        else:
                            resume_filename = directory + f[
                                0:len(f) - len('__auto_model.npz')]

            if len(resume_filename) > 0:
                logger.debug("Found model to automatically resume: %s" %
                             resume_filename)
                auto_restarting = True
                # Setup training to automatically resume training with the model found
                args.resume = resume_filename
                # Make sure the resumed training does not reinitialize any parameters
                args.reinitialize_decoder_parameters = False
                args.reinitialize_latent_variable_parameters = False
            else:
                logger.debug(
                    "Could not find any model to automatically resume...")

    step = 0

    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)

        state_file = args.resume + '_state.pkl'
        if commands:
            if commands['state_path']:
                state_file = commands['state_path']

        timings_file = args.resume + '_timing.npz'

        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")

            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)

            step = timings['step'][0]

            # Increment seed to make sure we get newly shuffled batches when training on large datasets
            state['seed'] = state['seed'] + 10

        else:
            raise Exception("Cannot resume, cannot find files!")

    logger.debug("State:\n{}".format(pprint.pformat(state)))
    logger.debug("Timings:\n{}".format(pprint.pformat(timings)))

    if args.force_train_all_wordemb == True:
        state['fix_pretrained_word_embeddings'] = False

    if state['test_values_enabled']:
        train_data, \
        valid_data, = get_train_iterator(state)
        train_data.start()
        state['batch_iterator'] = train_data

    if not commands:
        model = DialogEncoderDecoder(state)

    if commands:
        commands['timings'] = timings

        if commands['resume_path']:
            model = commands['resume_path'][0]
            timings = commands['resume_path'][1]

            for key, value in timings.iteritems():
                timings[key] = list(value)

            step = timings['step'][0]
        else:
            model = DialogEncoderDecoder(state)

    rng = model.rng

    valid_rounds = 0
    save_model_on_first_valid = False
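    # save_model_on_first_valid is enabled below when decoder or latent-variable parameters
    # are deliberately re-initialised, presumably so that a checkpoint containing the fresh
    # parameters is written at the first validation round regardless of the cost.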

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")

            parameter_strings_to_ignore = []
            if args.reinitialize_decoder_parameters:
                parameter_strings_to_ignore += ['Wd_']
                parameter_strings_to_ignore += ['bd_']

                save_model_on_first_valid = True
            if args.reinitialize_latent_variable_parameters:
                parameter_strings_to_ignore += ['latent_utterance_prior']
                parameter_strings_to_ignore += [
                    'latent_utterance_approx_posterior'
                ]
                parameter_strings_to_ignore += ['kl_divergence_cost_weight']
                parameter_strings_to_ignore += ['latent_dcgm_encoder']

                save_model_on_first_valid = True

            load(model, filename, parameter_strings_to_ignore)
        else:
            raise Exception("Cannot resume, cannot find model file!")

        if 'run_id' not in model.state:
            print 'Backward compatibility not ensured! (need run_id in state)'

    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.debug("Compile trainer")
    if not state["use_nce"]:
        if ('add_latent_gaussian_per_utterance'
                in state) and (state["add_latent_gaussian_per_utterance"]):
            logger.debug(
                "Training using variational lower bound on log-likelihood")
        else:
            logger.debug("Training using exact log-likelihood")

        train_batch = model.build_train_function()
    else:
        logger.debug("Training with noise contrastive estimation")
        train_batch = model.build_nce_function()

    eval_batch = model.build_eval_function()

    if model.add_latent_gaussian_per_utterance:
        eval_grads = model.build_eval_grads()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)
    train_data.start()

    # Start looping through the dataset
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_kl_divergence_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0
    is_end_of_batch = True
    start_validation = False

    batch = None

    import theano.tensor
    word = 'what'
    word_idx = model.words_to_indices([word])
    initial_sum = theano.tensor.sum(model.W_emb[word_idx]).eval()
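    # Debug probe (appears to be a development leftover): the sum of the embedding of a single
    # word is recorded so one can later check whether W_emb actually stays fixed while the
    # pretrained-embedding mask is zeroed (see the commented print statements below).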

    if 'fix_W_emb_steps' in state:
        model.W_emb_pretrained_mask.set_value(
            numpy.zeros(model.W_emb_pretrained_mask.shape.eval(),
                        dtype='float32'))

    #for idx in xrange(10):
    #print theano.tensor.sum(model.W_emb[word_idx]).eval()

    total_token_time = 0
    num_tokens_processed = 0

    while (step < state['loop_iters']
           and (time.time() - start_time) / 60. < state['time_stop']
           and patience >= 0):

        timings['step'] = [step]

        if 'save_at_first_iter' in state and step == 1:
            save2(model, timings, commands)

        #print 'init: ',initial_sum
        #print 'changed to: ',theano.tensor.sum(model.W_emb[word_idx]).eval()

        if 'fix_W_emb_steps' in state:
            if state['fix_W_emb_steps'] < step:
                model.W_emb_pretrained_mask.set_value(
                    numpy.ones(model.W_emb_pretrained_mask.shape.eval(),
                               dtype='float32'))
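        # When 'fix_W_emb_steps' is set, the mask is zeroed before the training loop and
        # re-enabled here once the step count passes the threshold, which presumably freezes
        # the pretrained word embeddings for the first 'fix_W_emb_steps' updates and lets
        # them train afterwards.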

        if commands:

            commands['timings'] = timings

            if not shall_train():
                logging.debug('...training paused')
                wait_until(shall_train)

            if shall_save():
                logging.debug('...saving model (from command)')
                save2(model, timings, commands)
                saving_done()

            if shall_abort():
                break

        ### Sampling phase
        if step % 200 == 0:
            # First generate stochastic samples
            for param in model.params:
                print "%s = %.4f" % (param.name, numpy.sum(param.get_value()**
                                                           2)**0.5)

            samples, costs = random_sampler.sample([[]],
                                                   n_samples=1,
                                                   n_turns=3)
            print "Sampled : {}".format(samples[0])

            if commands:
                commands['output'] = samples[0]

        ### Training phase
        batch = train_data.next()

        # Train finished
        if not batch:
            # Restart training
            logger.debug("Got None...")
            break

        logger.debug("[TRAIN] [STEP %d]- Got batch %d,%d" %
                     (step + 1, batch['x'].shape[1], batch['max_length']))

        x_data = batch['x']

        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']
        ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

        is_end_of_batch = False
        if numpy.sum(numpy.abs(x_reset)) < 1:
            # Print when we reach the end of an example (e.g. the end of a dialogue or a document)
            # Knowing when the training procedure reaches the end is useful for diagnosing training problems
            #print 'END-OF-BATCH EXAMPLE!'
            is_end_of_batch = True

        token_time = time.time()

        if state['use_nce']:
            y_neg = rng.choice(size=(10, max_length, x_data.shape[1]),
                               a=model.idim,
                               p=model.noise_probs).astype('int32')
            c, kl_divergence_cost, posterior_mean_variance = train_batch(
                x_data, x_data_reversed, y_neg, max_length, x_cost_mask,
                x_reset, ran_cost_utterance, ran_decoder_drop_mask)
        else:
            c, kl_divergence_cost, posterior_mean_variance = train_batch(
                x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                ran_cost_utterance, ran_decoder_drop_mask)

        total_token_time += (time.time() - token_time)
        num_tokens_processed += (batch['x'].shape[1] * batch['max_length'])

        print '%.3f words/s' % (num_tokens_processed / total_token_time)

        if commands:
            token_time = time.time() - token_time
            commands['timings'] = timings
            commands['token_time'] += token_time
            commands['num_tokens_processed'] += (batch['x'].shape[1] *
                                                 batch['max_length'])

        # Print batch statistics
        print 'cost_sum', c
        print 'cost_mean', c / float(numpy.sum(x_cost_mask))
        print 'kl_divergence_cost_sum', kl_divergence_cost
        print 'kl_divergence_cost_mean', kl_divergence_cost / float(
            len(numpy.where(x_data == model.eos_sym)[0]))
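        # The divisor above counts end-of-utterance tokens, i.e. roughly one latent variable
        # per utterance, so this is approximately the KL cost per utterance.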
        print 'posterior_mean_variance', posterior_mean_variance

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            gc.collect()
            continue

        train_cost += c
        train_kl_divergence_cost += kl_divergence_cost
        train_posterior_mean_variance += posterior_mean_variance

        train_done += batch['num_preds']
        train_dialogues_done += batch['num_dialogues']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time

            # Keep track of training cost for the last 'train_freq' batches.
            current_train_cost = train_cost / train_done

            if prev_train_done >= 1 and abs(train_done - prev_train_done) > 0:
                current_train_cost = float(
                    train_cost - prev_train_cost) / float(train_done -
                                                          prev_train_done)

            if numpy.isinf(c) or numpy.isnan(c):
                current_train_cost = 0

            prev_train_cost = train_cost
            prev_train_done = train_done

            h, m, s = ConvertTimedelta(this_time - start_time)

            # We need to catch exceptions due to high numbers in exp
            try:
                print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_kl_divergence_cost = %.8f acc_mean_posterior_variance = %.8f" % (h, m, s,\
                                 state['time_stop'] - (time.time() - start_time)/60.,\
                                 step, \
                                 batch['x'].shape[1], \
                                 batch['max_length'], \
                                 float(train_cost/train_done), \
                                 math.exp(float(train_cost/train_done)), \
                                 current_train_cost, \
                                 math.exp(current_train_cost), \
                                 float(train_misclass)/float(train_done), \
                                 float(train_kl_divergence_cost/train_done), \
                                 float(train_posterior_mean_variance/train_dialogues_done))
            except:
                pass

            #timings['train_progress'].append(math.exp(float(train_cost/train_done)))
            timings['train_progress'].append(math.exp(current_train_cost))

        ### Inspection phase
        # Evaluate gradient variance every 200 steps for GRU decoder
        if state['utterance_decoder_gating'].upper() == "GRU":
            if (step % 200 == 0) and (model.add_latent_gaussian_per_utterance):
                k_eval = 10
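                # Re-sample the per-utterance latent variables k_eval times on the same batch
                # and measure the spread of the resulting gradients w.r.t. the softmax; when
                # the std exceeds the absolute mean for most entries, the gradient estimate is
                # likely dominated by sampling noise (a rough diagnostic, not a formal test).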

                softmax_costs = numpy.zeros((k_eval), dtype='float32')
                var_costs = numpy.zeros((k_eval), dtype='float32')
                gradients_wrt_softmax = numpy.zeros(
                    (k_eval, model.qdim_decoder, model.qdim_decoder),
                    dtype='float32')
                for k in range(0, k_eval):
                    batch = add_random_variables_to_batch(
                        model.state, model.rng, batch, None, False)
                    ran_cost_utterance = batch['ran_var_constutterance']
                    ran_decoder_drop_mask = batch['ran_decoder_drop_mask']
                    softmax_cost, var_cost, grads_wrt_softmax, grads_wrt_kl_divergence_cost = eval_grads(
                        x_data, x_data_reversed, max_length, x_cost_mask,
                        x_reset, ran_cost_utterance, ran_decoder_drop_mask)
                    softmax_costs[k] = softmax_cost
                    var_costs[k] = var_cost
                    gradients_wrt_softmax[k, :, :] = grads_wrt_softmax

                print 'mean softmax_costs', numpy.mean(softmax_costs)
                print 'std softmax_costs', numpy.std(softmax_costs)

                print 'mean var_costs', numpy.mean(var_costs)
                print 'std var_costs', numpy.std(var_costs)

                print 'mean gradients_wrt_softmax', numpy.mean(
                    numpy.abs(numpy.mean(gradients_wrt_softmax,
                                         axis=0))), numpy.mean(
                                             gradients_wrt_softmax, axis=0)
                print 'std gradients_wrt_softmax', numpy.mean(
                    numpy.std(gradients_wrt_softmax,
                              axis=0)), numpy.std(gradients_wrt_softmax,
                                                  axis=0)

                print 'std greater than mean', numpy.where(
                    numpy.std(gradients_wrt_softmax, axis=0) > numpy.abs(
                        numpy.mean(gradients_wrt_softmax, axis=0)))[0].shape[0]

                Wd_s_q = model.utterance_decoder.Wd_s_q.get_value()

                print 'Wd_s_q all', numpy.sum(numpy.abs(Wd_s_q)), numpy.mean(
                    numpy.abs(Wd_s_q))
                print 'Wd_s_q latent', numpy.sum(
                    numpy.abs(Wd_s_q[(
                        Wd_s_q.shape[0] -
                        state['latent_gaussian_per_utterance_dim']
                    ):Wd_s_q.shape[0], :])), numpy.mean(
                        numpy.abs(
                            Wd_s_q[(Wd_s_q.shape[0] -
                                    state['latent_gaussian_per_utterance_dim']
                                    ):Wd_s_q.shape[0], :]))

                print 'Wd_s_q ratio', (numpy.sum(
                    numpy.abs(
                        Wd_s_q[(Wd_s_q.shape[0] -
                                state['latent_gaussian_per_utterance_dim']
                                ):Wd_s_q.shape[0], :])) /
                                       numpy.sum(numpy.abs(Wd_s_q)))

                if 'latent_gaussian_linear_dynamics' in state:
                    if state['latent_gaussian_linear_dynamics']:
                        prior_Wl_linear_dynamics = model.latent_utterance_variable_prior_encoder.Wl_linear_dynamics.get_value(
                        )
                        print 'prior_Wl_linear_dynamics', numpy.sum(
                            numpy.abs(prior_Wl_linear_dynamics)), numpy.mean(
                                numpy.abs(
                                    prior_Wl_linear_dynamics)), numpy.std(
                                        numpy.abs(prior_Wl_linear_dynamics))

                        approx_posterior_Wl_linear_dynamics = model.latent_utterance_variable_approx_posterior_encoder.Wl_linear_dynamics.get_value(
                        )
                        print 'approx_posterior_Wl_linear_dynamics', numpy.sum(
                            numpy.abs(approx_posterior_Wl_linear_dynamics)
                        ), numpy.mean(
                            numpy.abs(approx_posterior_Wl_linear_dynamics)
                        ), numpy.std(
                            numpy.abs(approx_posterior_Wl_linear_dynamics))

                #print 'grads_wrt_softmax', grads_wrt_softmax.shape, numpy.sum(numpy.abs(grads_wrt_softmax)), numpy.abs(grads_wrt_softmax[0:5,0:5])
                #print 'grads_wrt_kl_divergence_cost', grads_wrt_kl_divergence_cost.shape, numpy.sum(numpy.abs(grads_wrt_kl_divergence_cost)), numpy.abs(grads_wrt_kl_divergence_cost[0:5,0:5])

        ### Evaluation phase
        if valid_data is not None and\
            step % state['valid_freq'] == 0 and step > 1:
            start_validation = True

        # Only start validation loop once it's time to validate and once all previous batches have been reset
        if start_validation and is_end_of_batch:
            start_validation = False
            valid_data.start()
            valid_cost = 0
            valid_kl_divergence_cost = 0
            valid_posterior_mean_variance = 0

            valid_wordpreds_done = 0
            valid_dialogues_done = 0

            logger.debug("[VALIDATION START]")

            while True:
                batch = valid_data.next()

                # Validation finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" %
                             (batch['x'].shape[1], batch['max_length']))

                x_data = batch['x']
                x_data_reversed = batch['x_reversed']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']

                x_reset = batch['x_reset']
                ran_cost_utterance = batch['ran_var_constutterance']
                ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

                c, kl_term, c_list, kl_term_list, posterior_mean_variance = eval_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                    ran_cost_utterance, ran_decoder_drop_mask)

                # Reshape into matrix, where rows are validation samples and columns are tokens
                # Note that we use max_length-1 because we don't get a cost for the first token
                # (the first token is always assumed to be eos)
                c_list = c_list.reshape((batch['x'].shape[1], max_length - 1),
                                        order=(1, 0))
                c_list = numpy.sum(c_list, axis=1)

                words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
                c_list = c_list / words_in_dialogues

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                valid_kl_divergence_cost += kl_term
                valid_posterior_mean_variance += posterior_mean_variance

                # Print batch statistics
                print 'valid_cost', valid_cost
                print 'valid_kl_divergence_cost sample', kl_term
                print 'posterior_mean_variance', posterior_mean_variance

                valid_wordpreds_done += batch['num_preds']
                valid_dialogues_done += batch['num_dialogues']

            logger.debug("[VALIDATION END]")

            valid_cost /= valid_wordpreds_done
            valid_kl_divergence_cost /= valid_wordpreds_done
            valid_posterior_mean_variance /= valid_dialogues_done

            # We need to catch exceptions due to high numbers in exp
            try:
                print "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid kldiv cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % (
                    float(valid_cost), float(
                        math.exp(valid_cost)), float(valid_kl_divergence_cost),
                    float(valid_posterior_mean_variance), patience)
            except:
                try:
                    print "** valid cost (NLL) = %.4f, patience = %d" % (
                        float(valid_cost), patience)
                except:
                    pass

            timings["train_cost"].append(train_cost / train_done)
            timings["train_kl_divergence_cost"].append(
                train_kl_divergence_cost / train_done)
            timings["train_posterior_mean_variance"].append(
                train_posterior_mean_variance / train_dialogues_done)
            timings["valid_cost"].append(valid_cost)
            timings["valid_perplexity"].append(float(math.exp(valid_cost)))
            timings["valid_kl_divergence_cost"].append(
                valid_kl_divergence_cost)
            timings["valid_posterior_mean_variance"].append(
                valid_posterior_mean_variance)


            if (len(timings["valid_cost"]) == 0) \
                or (valid_cost < numpy.min(timings["valid_cost"])) \
                or (save_model_on_first_valid and valid_rounds == 0):
                patience = state['patience']

                # Save the model if there is a decrease in validation cost
                if commands:
                    save2(model, timings, commands)
                else:
                    save(model, timings)
                print 'best valid_cost', valid_cost
            elif valid_cost >= timings["valid_cost"][-1] * state[
                    'cost_threshold']:
                patience -= 1
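            # Note: valid_cost was already appended to timings['valid_cost'] a few lines above,
            # so the '<' comparison tests valid_cost against a list that already contains it and
            # the patience test compares it against itself times cost_threshold; as written, a
            # save appears to be triggered only by the save_model_on_first_valid clause. If
            # saving on every new best cost is intended, the check would have to run before the
            # appends (as in the earlier training loops).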

            if args.save_every_valid_iteration:
                if commands:
                    save2(model, timings, commands)
                else:
                    save(model, timings, '_' + str(step) + '_')
            if args.auto_restart:
                if commands:
                    save2(model, timings, commands)
                else:
                    save(model, timings, '_auto_')

            # Reset train cost and train done counters
            train_cost = 0
            train_done = 0
            prev_train_cost = 0
            prev_train_done = 0

            # Count number of validation rounds done so far
            valid_rounds += 1

        step += 1

    logger.debug("All done, exiting...")
Beispiel #11
0
def train2(model_manager, state, model=None, random_seed=True):

    for k, v in state.iteritems():
        print k, '=', v

    if not model:
        model = DialogEncoderDecoder(state)
        step = 0
    else:
        step = model.timings['step']

    logger.debug("Compile trainer")
    if not state["use_nce"]:
        if ('add_latent_gaussian_per_utterance'
                in state) and (state["add_latent_gaussian_per_utterance"]):
            logger.debug(
                "Training using variational lower bound on log-likelihood")
        else:
            logger.debug("Training using exact log-likelihood")

        train_batch = model.build_train_function()
    else:
        logger.debug("Training with noise contrastive estimation")
        train_batch = model.build_nce_function()

    eval_batch = model.build_eval_function()

    if model.add_latent_gaussian_per_utterance:
        eval_grads = model.build_eval_grads()

    random_sampler = search.RandomSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)
    train_data.start()

    # Start looping through the dataset
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_kl_divergence_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    valid_rounds = 0

    ex_done = 0
    is_end_of_batch = True
    start_validation = False

    batch = None

    if random_seed:
        rng = numpy.random.RandomState()
    else:
        rng = model.rng
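    # With random_seed=True a fresh, unseeded RandomState is used (e.g. for NCE noise
    # sampling), so runs are not bit-for-bit reproducible; otherwise the model's own seeded
    # rng is reused.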

    timings = {'step': step}
    total_token_time = 0
    num_tokens_processed = 0

    while (step < state['loop_iters']
           and (time.time() - start_time) / 60. < state['time_stop']
           and patience >= 0):

        if 'save_at_iter' in state and step == state['save_at_iter']:
            save3(model, timings, model_manager)

        timings['step'] = step

        ### Sampling phase
        if step % 200 == 0:
            # First generate stochastic samples
            for param in model.params:
                print "%s = %.4f" % (param.name, numpy.sum(param.get_value()**
                                                           2)**0.5)

            samples, costs = random_sampler.sample([[]],
                                                   n_samples=1,
                                                   n_turns=3)
            print "Sampled : {}".format(samples[0])

        ### Training phase
        batch = train_data.next()

        # Train finished
        if not batch:
            # Restart training
            logger.debug("Got no batch to train with...")
            break

        logger.debug("[TRAIN] [STEP %d]- Got batch %d,%d" %
                     (step + 1, batch['x'].shape[1], batch['max_length']))

        x_data = batch['x']

        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']
        ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

        is_end_of_batch = False
        if numpy.sum(numpy.abs(x_reset)) < 1:
            # Print when we reach the end of an example (e.g. the end of a dialogue or a document)
            # Knowing when the training procedure reaches the end is useful for diagnosing training problems
            # print 'END-OF-BATCH EXAMPLE!'
            is_end_of_batch = True
        token_time = time.time()

        if state['use_nce']:
            y_neg = rng.choice(size=(10, max_length, x_data.shape[1]),
                               a=model.idim,
                               p=model.noise_probs).astype('int32')
            c, kl_divergence_cost, posterior_mean_variance = train_batch(
                x_data, x_data_reversed, y_neg, max_length, x_cost_mask,
                x_reset, ran_cost_utterance, ran_decoder_drop_mask)
        else:
            c, kl_divergence_cost, posterior_mean_variance = train_batch(
                x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                ran_cost_utterance, ran_decoder_drop_mask)

        total_token_time += (time.time() - token_time)
        num_tokens_processed += (batch['x'].shape[1] * batch['max_length'])

        print '%.3f words/s' % (num_tokens_processed / total_token_time)

        # Print batch statistics
        print 'cost_sum', c
        print 'cost_mean', c / float(numpy.sum(x_cost_mask))
        print 'kl_divergence_cost_sum', kl_divergence_cost
        print 'kl_divergence_cost_mean', kl_divergence_cost / float(
            len(numpy.where(x_data == model.eos_sym)[0]))
        print 'posterior_mean_variance', posterior_mean_variance

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            gc.collect()
            continue

        train_cost += c
        train_kl_divergence_cost += kl_divergence_cost
        train_posterior_mean_variance += posterior_mean_variance

        train_done += batch['num_preds']
        train_dialogues_done += batch['num_dialogues']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time

            # Keep track of training cost for the last 'train_freq' batches.
            current_train_cost = train_cost / train_done

            if prev_train_done >= 1 and abs(train_done - prev_train_done) > 0:
                current_train_cost = float(
                    train_cost - prev_train_cost) / float(train_done -
                                                          prev_train_done)

            if numpy.isinf(c) or numpy.isnan(c):
                current_train_cost = 0

            prev_train_cost = train_cost
            prev_train_done = train_done

            h, m, s = ConvertTimedelta(this_time - start_time)

            # We need to catch exceptions due to high numbers in exp
            try:
                print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_kl_divergence_cost = %.8f acc_mean_posterior_variance = %.8f" % (
                h, m, s, \
                state['time_stop'] - (time.time() - start_time) / 60., \
                step, \
                batch['x'].shape[1], \
                batch['max_length'], \
                float(train_cost / train_done), \
                math.exp(float(train_cost / train_done)), \
                current_train_cost, \
                math.exp(current_train_cost), \
                float(train_misclass) / float(train_done), \
                float(train_kl_divergence_cost / train_done), \
                float(train_posterior_mean_variance / train_dialogues_done))
            except:
                pass

            # timings['train_progress'].append(math.exp(float(train_cost/train_done)))
            #timings['train_progress'].append(math.exp(current_train_cost))

        ### Inspection phase
        # Evaluate gradient variance every 200 steps for GRU decoder
        if state['utterance_decoder_gating'].upper() == "GRU":
            if (step % 200 == 0) and (model.add_latent_gaussian_per_utterance):
                k_eval = 10

                softmax_costs = numpy.zeros((k_eval), dtype='float32')
                var_costs = numpy.zeros((k_eval), dtype='float32')
                gradients_wrt_softmax = numpy.zeros(
                    (k_eval, model.qdim_decoder, model.qdim_decoder),
                    dtype='float32')
                for k in range(0, k_eval):
                    batch = add_random_variables_to_batch(
                        model.state, model.rng, batch, None, False)
                    ran_cost_utterance = batch['ran_var_constutterance']
                    ran_decoder_drop_mask = batch['ran_decoder_drop_mask']
                    softmax_cost, var_cost, grads_wrt_softmax, grads_wrt_kl_divergence_cost = eval_grads(
                        x_data, x_data_reversed, max_length, x_cost_mask,
                        x_reset, ran_cost_utterance, ran_decoder_drop_mask)
                    softmax_costs[k] = softmax_cost
                    var_costs[k] = var_cost
                    gradients_wrt_softmax[k, :, :] = grads_wrt_softmax

                print 'mean softmax_costs', numpy.mean(softmax_costs)
                print 'std softmax_costs', numpy.std(softmax_costs)

                print 'mean var_costs', numpy.mean(var_costs)
                print 'std var_costs', numpy.std(var_costs)

                print 'mean gradients_wrt_softmax', numpy.mean(
                    numpy.abs(numpy.mean(gradients_wrt_softmax,
                                         axis=0))), numpy.mean(
                                             gradients_wrt_softmax, axis=0)
                print 'std gradients_wrt_softmax', numpy.mean(
                    numpy.std(gradients_wrt_softmax,
                              axis=0)), numpy.std(gradients_wrt_softmax,
                                                  axis=0)

                print 'std greater than mean', numpy.where(
                    numpy.std(gradients_wrt_softmax, axis=0) > numpy.abs(
                        numpy.mean(gradients_wrt_softmax, axis=0)))[0].shape[0]

                Wd_s_q = model.utterance_decoder.Wd_s_q.get_value()

                print 'Wd_s_q all', numpy.sum(numpy.abs(Wd_s_q)), numpy.mean(
                    numpy.abs(Wd_s_q))
                print 'Wd_s_q latent', numpy.sum(
                    numpy.abs(Wd_s_q[(
                        Wd_s_q.shape[0] -
                        state['latent_gaussian_per_utterance_dim']
                    ):Wd_s_q.shape[0], :])), numpy.mean(
                        numpy.abs(
                            Wd_s_q[(Wd_s_q.shape[0] -
                                    state['latent_gaussian_per_utterance_dim']
                                    ):Wd_s_q.shape[0], :]))

                print 'Wd_s_q ratio', (numpy.sum(
                    numpy.abs(
                        Wd_s_q[(Wd_s_q.shape[0] -
                                state['latent_gaussian_per_utterance_dim']
                                ):Wd_s_q.shape[0], :])) /
                                       numpy.sum(numpy.abs(Wd_s_q)))
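                # The ratio above measures what fraction of Wd_s_q's total absolute mass
                # sits in its last latent_gaussian_per_utterance_dim rows, i.e. how much
                # the decoder's input weights rely on the latent Gaussian variable relative
                # to the rest of its input.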

                if 'latent_gaussian_linear_dynamics' in state:
                    if state['latent_gaussian_linear_dynamics']:
                        prior_Wl_linear_dynamics = model.latent_utterance_variable_prior_encoder.Wl_linear_dynamics.get_value(
                        )
                        print 'prior_Wl_linear_dynamics', numpy.sum(
                            numpy.abs(prior_Wl_linear_dynamics)), numpy.mean(
                                numpy.abs(
                                    prior_Wl_linear_dynamics)), numpy.std(
                                        numpy.abs(prior_Wl_linear_dynamics))

                        approx_posterior_Wl_linear_dynamics = model.latent_utterance_variable_approx_posterior_encoder.Wl_linear_dynamics.get_value(
                        )
                        print 'approx_posterior_Wl_linear_dynamics', numpy.sum(
                            numpy.abs(approx_posterior_Wl_linear_dynamics)
                        ), numpy.mean(
                            numpy.abs(approx_posterior_Wl_linear_dynamics)
                        ), numpy.std(
                            numpy.abs(approx_posterior_Wl_linear_dynamics))

                        # print 'grads_wrt_softmax', grads_wrt_softmax.shape, numpy.sum(numpy.abs(grads_wrt_softmax)), numpy.abs(grads_wrt_softmax[0:5,0:5])
                        # print 'grads_wrt_kl_divergence_cost', grads_wrt_kl_divergence_cost.shape, numpy.sum(numpy.abs(grads_wrt_kl_divergence_cost)), numpy.abs(grads_wrt_kl_divergence_cost[0:5,0:5])

        ### Evaluation phase
        if valid_data is not None and \
                                step % state['valid_freq'] == 0 and step > 1:
            start_validation = True

        # Only start validation loop once it's time to validate and once all previous batches have been reset
        if start_validation and is_end_of_batch:
            start_validation = False
            valid_data.start()
            valid_cost = 0
            valid_kl_divergence_cost = 0
            valid_posterior_mean_variance = 0

            valid_wordpreds_done = 0
            valid_dialogues_done = 0

            logger.debug("[VALIDATION START]")

            while True:
                batch = valid_data.next()

                # Validation finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" %
                             (batch['x'].shape[1], batch['max_length']))

                x_data = batch['x']
                x_data_reversed = batch['x_reversed']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']

                x_reset = batch['x_reset']
                ran_cost_utterance = batch['ran_var_constutterance']
                ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

                c, kl_term, c_list, kl_term_list, posterior_mean_variance = eval_batch(
                    x_data, x_data_reversed, max_length, x_cost_mask, x_reset,
                    ran_cost_utterance, ran_decoder_drop_mask)

                # Reshape into matrix, where rows are validation samples and columns are tokens
                # Note that we use max_length-1 because we don't get a cost for the first token
                # (the first token is always assumed to be eos)
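                # Illustrative example (hypothetical numbers): with 3 dialogues in the batch
                # and max_length == 5, eval_batch returns 3*(5-1) = 12 per-token costs; after
                # the reshape each row holds one dialogue's 4 token costs, and dividing by
                # words_in_dialogues below yields a per-word NLL for each dialogue.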
                c_list = c_list.reshape((batch['x'].shape[1], max_length - 1),
                                        order=(1, 0))
                c_list = numpy.sum(c_list, axis=1)

                words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
                c_list = c_list / words_in_dialogues

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                valid_kl_divergence_cost += kl_term
                valid_posterior_mean_variance += posterior_mean_variance

                # Print batch statistics
                print 'valid_cost', valid_cost
                print 'valid_kl_divergence_cost sample', kl_term
                print 'posterior_mean_variance', posterior_mean_variance

                valid_wordpreds_done += batch['num_preds']
                valid_dialogues_done += batch['num_dialogues']

            logger.debug("[VALIDATION END]")

            valid_cost /= valid_wordpreds_done
            valid_kl_divergence_cost /= valid_wordpreds_done
            valid_posterior_mean_variance /= valid_dialogues_done

            # We need to catch exceptions due to high numbers in exp
            try:
                print "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid kldiv cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % (
                    float(valid_cost), float(
                        math.exp(valid_cost)), float(valid_kl_divergence_cost),
                    float(valid_posterior_mean_variance), patience)
            except:
                try:
                    print "** valid cost (NLL) = %.4f, patience = %d" % (
                        float(valid_cost), patience)
                except:
                    pass
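
            # math.exp overflows (OverflowError) once its argument exceeds roughly 709.78
            # (the log of the largest double), which is why the perplexity print above falls
            # back to printing only the raw NLL when the validation cost is still very large.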

            timings["train_cost"].append(train_cost / train_done)
            timings["train_kl_divergence_cost"].append(
                train_kl_divergence_cost / train_done)
            timings["train_posterior_mean_variance"].append(
                train_posterior_mean_variance / train_dialogues_done)
            timings["valid_cost"].append(valid_cost)
            timings["valid_perplexity"].append(float(math.exp(valid_cost)))
            timings["valid_kl_divergence_cost"].append(
                valid_kl_divergence_cost)
            timings["valid_posterior_mean_variance"].append(
                valid_posterior_mean_variance)

            save3(model, timings, model_manager)

            # Reset train cost, train misclass and train done metrics
            train_cost = 0
            train_done = 0
            prev_train_cost = 0
            prev_train_done = 0

            # Count number of validation rounds done so far
            valid_rounds += 1

        step += 1

    logger.debug("All done, exiting...")
Beispiel #12
0
def main(args):     
    logging.basicConfig(level = logging.DEBUG,
                        format = "%(asctime)s: %(name)s: %(levelname)s: %(message)s")
     
    state = eval(args.prototype)() 
    timings = init_timings() 
    
    args.resume = 'Que26/models/1448530885.38_testmodel__225000'
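    # Note: the assignment above hard-codes the checkpoint prefix to resume from,
    # overriding whatever value was passed in through args.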
    
    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)
        
        state_file = args.resume + '_state.pkl'
        timings_file = args.resume + '_timing.npz'
        
        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")
            
            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)

            # Increment seed to make sure we get newly shuffled batches when training on large datasets
            state['seed'] = state['seed'] + 10

        else:
            raise Exception("Cannot resume, cannot find files!")

    #logger.debug("State:\n{}".format(pprint.pformat(state)))
    #logger.debug("Timings:\n{}".format(pprint.pformat(timings)))
 
    if args.force_train_all_wordemb == True:
        state['fix_pretrained_word_embeddings'] = False
        
    # Load dictionary
    raw_dict = cPickle.load(open(state['dictionary'], 'r'))
    # Dictionaries to convert str to idx and vice-versa
    str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in raw_dict])  # each dictionary entry has four fields: (token, token id, word frequency, document frequency)
    idx_to_str = dict([(tok_id, tok) for tok, tok_id, freq, _ in raw_dict])
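    # Illustrative entry (hypothetical values): a raw_dict item such as ('hello', 42, 1017, 523)
    # would map 'hello' -> 42 in str_to_idx and 42 -> 'hello' in idx_to_str.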

    model = DialogEncoderDecoder(state)
    rng = model.rng 
    
    
    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")

            parameter_strings_to_ignore = []
            if args.reinitialize_decoder_parameters:
                parameter_strings_to_ignore += ['latent_utterance_prior']
                parameter_strings_to_ignore += ['latent_utterance_approx_posterior']
            if args.reinitialize_variational_parameters:
                parameter_strings_to_ignore += ['Wd_']
                parameter_strings_to_ignore += ['bd_']

            load(model, filename, parameter_strings_to_ignore)
        else:
            raise Exception("Cannot resume, cannot find model file!")
        
        if 'run_id' not in model.state:
            raise Exception('Backward compatibility not ensured! (need run_id in state)')           

    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID
    
    
    logger.debug("Compile trainer")

    test_batch = model.build_test_function()  # test function

    if model.add_latent_gaussian_per_utterance:
        eval_grads = model.build_eval_grads()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model) 

    logger.debug("Load data")
    train_data, valid_data = get_train_iterator(state)
    
    

    # Start looping through the dataset
    step = 0
    patience = state['patience'] 
    start_time = time.time()
     
    train_cost = 0
    train_variational_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0

    batch = None

    valid_data.start()
    valid_cost = 0
    valid_variational_cost = 0
    valid_posterior_mean_variance = 0

    valid_wordpreds_done = 0
    valid_dialogues_done = 0


    # Prepare variables for plotting histogram over word-perplexities and mutual information
    valid_data_len = valid_data.data_len
    valid_cost_list = numpy.zeros((valid_data_len,))
    valid_pmi_list = numpy.zeros((valid_data_len,))

    # Prepare variables for printing the training examples the model performs best and worst on
    valid_extrema_setsize = min(state['track_extrema_samples_count'], valid_data_len)
    valid_extrema_samples_to_print = min(state['print_extrema_samples_count'], valid_extrema_setsize)

    max_stored_len = 160 # Maximum number of tokens to store for dialogues with highest and lowest validation errors
    valid_lowest_costs = numpy.ones((valid_extrema_setsize,))*1000
    valid_lowest_dialogues = numpy.ones((valid_extrema_setsize,max_stored_len))*1000
    valid_highest_costs = numpy.ones((valid_extrema_setsize,))*(-1000)
    valid_highest_dialogues = numpy.ones((valid_extrema_setsize,max_stored_len))*(-1000)
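    # The sentinel initialisations above ensure the first real batches displace the
    # placeholder entries: any cost below 1000 can enter the "lowest" set and any cost
    # above -1000 can enter the "highest" set.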

    logger.debug("[VALIDATION START]")
    DocMtrix = []
    NNN = 0
    while True:
        NNN += 1
        if NNN > 50:
            break
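        # NNN caps this loop at 50 validation batches, presumably just to keep the
        # document-vector extraction below quick rather than running a full validation pass.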
        batch = valid_data.next()

        # Train finished
        if not batch:
            break
         
        logger.debug("[VALID] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length']))
        
        if batch['max_length'] == state['max_grad_steps']:
            continue

        x_data = batch['x']
        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_semantic = batch['x_semantic']
        x_semantic_nonempty_indices = numpy.where(x_semantic >= 0)

        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']

        Gen_tar, Tar_Y, DocV = test_batch(x_data, x_data_reversed, max_length, x_cost_mask, x_semantic, x_reset, ran_cost_utterance)
        
        DocMtrix.append(DocV)
        print ''.join([idx_to_str[id_of_w] for id_of_w in list(x_data.T)[0]])
        # Reshape into matrix, where rows are validation samples and columns are tokens
        # Note that we use max_length-1 because we don't get a cost for the first token
        # (the first token is always assumed to be eos)
        #c_list = c_list.reshape((batch['x'].shape[1],max_length-1), order=(1,0))
        #c_list = numpy.sum(c_list, axis=1)
        
        #words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
        #c_list = c_list / words_in_dialogues
        #print 'Original: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Tar_Y.T)[0]]) #'',join([idx_to_str[id_of_w] for id_of_w in Tar_Y])
        #print 'Generations: ',''.join([idx_to_str[id_of_w] for id_of_w in list(Gen_tar.T)[0]])
        #print 'Test:', type(test1), test1
        #raw_input()
        """
        if numpy.isinf(c) or numpy.isnan(c):
            continue
        
        valid_cost += c
        valid_variational_cost += variational_cost
        valid_posterior_mean_variance += posterior_mean_variance

        print 'valid_cost', valid_cost
        print 'Original: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Tar_Y.T)[0]]) #'',join([idx_to_str[id_of_w] for id_of_w in Tar_Y])
        print 'Generations: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Gen_tar.T)[0]])
        #print 'valid_variational_cost', valid_variational_cost
        #print 'posterior_mean_variance', posterior_mean_variance


        valid_wordpreds_done += batch['num_preds']
        valid_dialogues_done += batch['num_dialogues']
        """
    logger.debug("[VALIDATION END]") 
    DocM = numpy.row_stack(DocMtrix)
    simM = cosine_similarity(DocM, DocM)
    meanV = numpy.mean(DocM, axis=1)
    print simM
    print meanV
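    # simM is an N x N matrix of pairwise cosine similarities between the stacked
    # document vectors collected above (N = number of batches processed);
    # cosine_similarity is assumed to be imported at module level, presumably
    # from sklearn.metrics.pairwise.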
    """