def main():
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    model = SessionEncoderDecoder(state)
    sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    lines = open(args.context, "r").readlines()
    contexts = [x.strip().split('\t') for x in lines]

    context_samples, context_costs = sampler.sample(
        contexts,
        n_samples=args.n_samples,
        ignore_unk=args.ignore_unk,
        verbose=args.verbose)

    # Write to output file
    output_handle = open(args.context + "_HED_" + model.run_id + ".gen", "w")
    for context_sample in context_samples:
        print >> output_handle, '\t'.join(context_sample)
    output_handle.close()
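# The sampling scripts in this file call a parse_args() helper that is not
# shown in this section. A minimal sketch of what it would need to provide,
# inferred from the attribute accesses above (args.model_prefix, args.context,
# args.n_samples, args.ignore_unk, args.verbose); the exact flag names and
# defaults in the original may differ.
import argparse

def parse_args():
    parser = argparse.ArgumentParser("Sample responses for a file of contexts")
    parser.add_argument("model_prefix",
                        help="Path prefix of the trained model (without _state.pkl / _model.npz)")
    parser.add_argument("context",
                        help="File with one tab-separated dialogue context per line")
    parser.add_argument("--n_samples", type=int, default=1,
                        help="Number of samples to generate per context")
    parser.add_argument("--ignore_unk", action="store_true",
                        help="Disallow the unknown-word token during sampling")
    parser.add_argument("--verbose", action="store_true")
    return parser.parse_args()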
def __init__(self, model_prefix, dict_file, name):
    # Load the HRED model.
    self.name = name
    state_path = '%s_state.pkl' % model_prefix
    model_path = '%s_model.npz' % model_prefix

    state = prototype_state()
    with open(state_path, 'r') as handle:
        state.update(cPickle.load(handle))
    state['dictionary'] = dict_file

    print 'Building %s model...' % name
    self.model = DialogEncoderDecoder(state)
    print 'Building sampler...'
    self.sampler = search.BeamSampler(self.model)
    print 'Loading model...'
    self.model.load(model_path)
    print 'Model built (%s).' % name

    # Pick the speaker token matching the corpus the model was trained on.
    self.speaker_token = '<first_speaker>'
    if name == 'reddit':
        self.speaker_token = '<speaker_1>'

    # Tokens to strip from generated replies before returning them.
    self.remove_tokens = ['<first_speaker>', '<at>', '<second_speaker>']
    for i in range(0, 10):
        self.remove_tokens.append('<speaker_%d>' % i)
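# A hedged usage sketch of the wrapper class whose __init__ is shown above.
# The class name `HREDAgent` and both paths are placeholders; only the
# constructor signature (model_prefix, dict_file, name) comes from the code.
if __name__ == '__main__':
    bot = HREDAgent('/path/to/TwitterModel',         # hypothetical model prefix
                    '/path/to/Dataset.dict-5k.pkl',  # hypothetical dictionary
                    name='twitter')
    # name == 'reddit' would switch the prefix token to '<speaker_1>'
    print bot.speaker_token  # -> '<first_speaker>'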
def main():
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    model = DialogEncoderDecoder(state)
    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    logger.info("This model uses " + model.decoder_bias_type + " bias type")

    #sampler = search.RandomSampler(model)
    sampler = search.BeamSampler(model)

    # Start chat loop
    utterances = collections.deque()

    while (True):
        var = raw_input("User - ")

        # Keeping zero past utterances means the model has no memory of the
        # conversation; increasing the history length works fine as well.
        while len(utterances) > 0:
            utterances.popleft()

        current_utterance = [model.end_sym_sentence] + ['<first_speaker>'] + var.split() + [model.end_sym_sentence]
        utterances.append(current_utterance)

        #TODO Sample a random reply. To spice it up, we could pick the longest reply or the reply with the fewest placeholders...
        seqs = list(itertools.chain(*utterances))

        #TODO Retrieve only replies which are generated for the second speaker...
        sentences = sample(model, \
            seqs=[seqs], ignore_unk=args.ignore_unk, \
            sampler=sampler, n_samples=5)

        if len(sentences) == 0:
            raise ValueError("Generation error, no sentences were produced!")

        utterances.append(sentences[0][0].split())

        reply = sentences[0][0].encode('utf-8')
        print "AI - ", remove_speaker_tokens(reply)
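# remove_speaker_tokens() is called by the chat loops in this file but is not
# defined in this section. A minimal sketch of what it plausibly does, assuming
# the token vocabulary seen elsewhere here ('<first_speaker>',
# '<second_speaker>', '<speaker_N>', '<at>'); the original may differ.
import re

def remove_speaker_tokens(reply):
    # Strip speaker/placeholder tokens and collapse the leftover whitespace.
    cleaned = re.sub(r'<(first_speaker|second_speaker|at|speaker_\d+)>', '', reply)
    return ' '.join(cleaned.split())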
def main():
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state['compute_training_updates'] = False

    model = DialogEncoderDecoder(state)

    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)
    if args.diverse_beam_search:
        sampler = search.DiverseBeamSampler(model, args.gamma)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    contexts = [[]]
    lines = open(args.context, "r").readlines()
    if len(lines):
        contexts = [x.strip() for x in lines]

    print('Sampling started...')
    context_samples, context_costs = sampler.sample(contexts,
                                                    n_samples=args.n_samples,
                                                    n_turns=args.n_turns,
                                                    ignore_unk=args.ignore_unk,
                                                    verbose=args.verbose,
                                                    return_words=True)
    print('Sampling finished.')
    print('Saving to file...')

    # Write to output file
    print type(context_samples)
    print type(context_samples[0])
    print context_samples[0]
    output_handle = open(args.output, "w")
    for context_sample in context_samples:
        print >> output_handle, '\t'.join(context_sample)
    output_handle.close()
    print('Saving to file finished.')
    print('All done!')
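# A hedged sketch of the input the script above expects: args.context is a
# plain-text file with one dialogue context per line. The file name and the
# exact tokens are assumptions based on the rest of this file (</s> as the
# utterance separator, <first_speaker>/<second_speaker> as speaker prefixes).
with open('contexts.txt', 'w') as handle:  # hypothetical file name
    handle.write('<first_speaker> hello , how are you ? </s>\n')
    handle.write('<first_speaker> what time is it ? </s> <second_speaker> about noon . </s>\n')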
def main():
    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"
    timing_path = args.model_prefix + "_timing.npz"

    with open(state_path, 'r') as src:
        state.update(cPickle.load(src))
    with open(timing_path, 'r') as src:
        timings = dict(numpy.load(src))

    state['compute_training_updates'] = False

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    print "\nLoaded previous state, model, timings:"
    print "state:"
    print state
    print "timings:"
    print timings

    print "\nBuilding model..."
    model = DialogEncoderDecoder(state)
    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")
    print "built.\n"

    context = []
    while True:
        line = raw_input("user: ")
        context.append("<first_speaker> <at> " + line + " </s> ")
        print "context: ", [' '.join(context[-4:])]
        context_samples, context_costs = sampler.sample(
            [' '.join(context[-4:])],
            ignore_unk=args.ignore_unk,
            verbose=args.verbose,
            return_words=True)
        print "bot:", context_samples
        context.append(context_samples[0][0] + " </s> ")
        print "cost:", context_costs
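# The chat loop above always conditions on the last four utterances
# (context[-4:]). The same sliding-window idea as a small standalone helper;
# the window size of 4 is this script's choice, not a model requirement.
def build_context_window(context, window=4):
    # Join the most recent `window` utterances into a single input string.
    return ' '.join(context[-window:])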
def main(args): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") state = eval(args.prototype)() timings = init_timings() auto_restarting = False if args.auto_restart: assert not args.save_every_valid_iteration assert len(args.resume) == 0 directory = state['save_dir'] if not directory[-1] == '/': directory = directory + '/' auto_resume_postfix = state['prefix'] + '_auto_model.npz' if os.path.exists(directory): directory_files = [ f for f in listdir(directory) if isfile(join(directory, f)) ] resume_filename = '' for f in directory_files: if len(f) > len(auto_resume_postfix): if f[len(f) - len(auto_resume_postfix ):len(f)] == auto_resume_postfix: if len(resume_filename) > 0: print('ERROR: FOUND MULTIPLE MODELS IN DIRECTORY:', directory) assert False else: resume_filename = directory + f[ 0:len(f) - len('__auto_model.npz')] if len(resume_filename) > 0: logger.debug("Found model to automatically resume: %s" % resume_filename) auto_restarting = True # Setup training to automatically resume training with the model found args.resume = resume_filename + '__auto' # Disable training from reinitialization any parameters args.reinitialize_decoder_parameters = False args.reinitialize_latent_variable_parameters = False else: logger.debug( "Could not find any model to automatically resume...") if args.resume != "": logger.debug("Resuming %s" % args.resume) state_file = args.resume + '_state.pkl' timings_file = args.resume + '_timing.npz' if os.path.isfile(state_file) and os.path.isfile(timings_file): logger.debug("Loading previous state") state = cPickle.load(open(state_file, 'r')) timings = dict(numpy.load(open(timings_file, 'r'))) for x, y in timings.items(): timings[x] = list(y) # Increment seed to make sure we get newly shuffled batches when training on large datasets state['seed'] = state['seed'] else: raise Exception("Cannot resume, cannot find files!") logger.debug("State:\n{}".format(pprint.pformat(state))) logger.debug("Timings:\n{}".format(pprint.pformat(timings))) if args.force_train_all_wordemb == True: state['fix_pretrained_word_embeddings'] = False model = DialogEncoderDecoder(state) rng = model.rng valid_rounds = 0 save_model_on_first_valid = False if args.resume != "": filename = args.resume + '_model.npz' if os.path.isfile(filename): logger.debug("Loading previous model") parameter_strings_to_ignore = [] if args.reinitialize_decoder_parameters: parameter_strings_to_ignore += ['Wd_'] parameter_strings_to_ignore += ['bd_'] save_model_on_first_valid = True if args.reinitialize_latent_variable_parameters: parameter_strings_to_ignore += ['latent_utterance_prior'] parameter_strings_to_ignore += [ 'latent_utterance_approx_posterior' ] parameter_strings_to_ignore += ['kl_divergence_cost_weight'] parameter_strings_to_ignore += ['latent_dcgm_encoder'] save_model_on_first_valid = True load(model, filename, parameter_strings_to_ignore) else: raise Exception("Cannot resume, cannot find model file!") if 'run_id' not in model.state: raise Exception( 'Backward compatibility not ensured! 
(need run_id in state)') else: # assign new run_id key model.state['run_id'] = RUN_ID logger.debug("Compile trainer") if not state["use_nce"]: if ('add_latent_gaussian_per_utterance' in state) and (state["add_latent_gaussian_per_utterance"]): logger.debug( "Training using variational lower bound on log-likelihood") else: logger.debug("Training using exact log-likelihood") train_batch = model.build_train_function() else: logger.debug("Training with noise contrastive estimation") train_batch = model.build_nce_function() eval_batch = model.build_eval_function() gamma_bounding = model.build_gamma_bounding_function() random_sampler = search.RandomSampler(model) beam_sampler = search.BeamSampler(model) logger.debug("Load data") train_data, \ valid_data, = get_train_iterator(state) train_data.start() # Start looping through the dataset step = 0 patience = state['patience'] start_time = time.time() train_cost = 0 train_kl_divergence_cost = 0 train_posterior_gaussian_mean_variance = 0 train_misclass = 0 train_done = 0 train_dialogues_done = 0.0 prev_train_cost = 0 prev_train_done = 0 ex_done = 0 is_end_of_batch = True start_validation = False batch = None while (step < state['loop_iters'] and (time.time() - start_time) / 60. < state['time_stop'] and patience >= 0): # Flush to log files sys.stderr.flush() sys.stdout.flush() ### Sampling phase if step % 200 == 0: # First generate stochastic samples for param in model.params: print("%s = %.4f" % (param.name, numpy.sum(param.get_value()**2)**0.5)) samples, costs = random_sampler.sample([[]], n_samples=1, n_turns=3) print("Sampled : {}".format(samples[0])) ### Training phase batch = train_data.next() # Train finished if not batch: # Restart training logger.debug("Got None...") break logger.debug("[TRAIN] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length'])) x_data = batch['x'] x_data_reversed = batch['x_reversed'] max_length = batch['max_length'] x_cost_mask = batch['x_mask'] x_reset = batch['x_reset'] ran_gaussian_const_utterance = batch['ran_var_gaussian_constutterance'] ran_uniform_const_utterance = batch['ran_var_uniform_constutterance'] ran_decoder_drop_mask = batch['ran_decoder_drop_mask'] is_end_of_batch = False if numpy.sum(numpy.abs(x_reset)) < 1: # Print when we reach the end of an example (e.g. 
the end of a dialogue or a document) # Knowing when the training procedure reaches the end is useful for diagnosing training problems # print('END-OF-BATCH EXAMPLE!') is_end_of_batch = True if state['use_nce']: y_neg = rng.choice(size=(10, max_length, x_data.shape[1]), a=model.idim, p=model.noise_probs).astype('int32') c, kl_divergence_cost, posterior_gaussian_mean_variance = train_batch( x_data, x_data_reversed, y_neg, max_length, x_cost_mask, x_reset, ran_gaussian_const_utterance, ran_uniform_const_utterance, ran_decoder_drop_mask) else: latent_piecewise_utterance_variable_approx_posterior_alpha = 0.0 latent_piecewise_utterance_variable_prior_alpha = 0.0 kl_divergences_between_piecewise_prior_and_posterior = 0.0 kl_divergences_between_gaussian_prior_and_posterior = 0.0 latent_piecewise_posterior_sample = 0.0 posterior_gaussian_mean_variance = 0.0 if model.add_latent_piecewise_per_utterance and model.add_latent_gaussian_per_utterance: c, kl_divergence_cost, posterior_gaussian_mean_variance, latent_piecewise_utterance_variable_approx_posterior_alpha, latent_piecewise_utterance_variable_prior_alpha, kl_divergences_between_piecewise_prior_and_posterior, kl_divergences_between_gaussian_prior_and_posterior, latent_piecewise_posterior_sample = train_batch( x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_gaussian_const_utterance, ran_uniform_const_utterance, ran_decoder_drop_mask) elif model.add_latent_gaussian_per_utterance: c, kl_divergence_cost, posterior_gaussian_mean_variance, kl_divergences_between_gaussian_prior_and_posterior = train_batch( x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_gaussian_const_utterance, ran_uniform_const_utterance, ran_decoder_drop_mask) elif model.add_latent_piecewise_per_utterance: c, kl_divergence_cost, kl_divergences_between_piecewise_prior_and_posterior = train_batch( x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_gaussian_const_utterance, ran_uniform_const_utterance, ran_decoder_drop_mask) else: c = train_batch(x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_gaussian_const_utterance, ran_uniform_const_utterance, ran_decoder_drop_mask) kl_divergence_cost = 0.0 gamma_bounding() # Print batch statistics print('cost_sum', c) print('cost_mean', c / float(numpy.sum(x_cost_mask))) if model.add_latent_piecewise_per_utterance or model.add_latent_gaussian_per_utterance: print('kl_divergence_cost_sum', kl_divergence_cost) print( 'kl_divergence_cost_mean', kl_divergence_cost / float(len(numpy.where(x_data == model.eos_sym)[0]))) if model.add_latent_gaussian_per_utterance: print('posterior_gaussian_mean_variance', posterior_gaussian_mean_variance) print( 'kl_divergences_between_gaussian_prior_and_posterior', numpy.sum(kl_divergences_between_gaussian_prior_and_posterior), numpy.min(kl_divergences_between_gaussian_prior_and_posterior), numpy.max(kl_divergences_between_gaussian_prior_and_posterior)) if model.add_latent_piecewise_per_utterance: print( 'kl_divergences_between_piecewise_prior_and_posterior', numpy.sum( kl_divergences_between_piecewise_prior_and_posterior), numpy.min( kl_divergences_between_piecewise_prior_and_posterior), numpy.max( kl_divergences_between_piecewise_prior_and_posterior)) if numpy.isinf(c) or numpy.isnan(c): logger.warn("Got NaN cost .. 
skipping") gc.collect() continue train_cost += c train_kl_divergence_cost += kl_divergence_cost train_posterior_gaussian_mean_variance += posterior_gaussian_mean_variance train_done += batch['num_preds'] train_dialogues_done += batch['num_dialogues'] this_time = time.time() if step % state['train_freq'] == 0: elapsed = this_time - start_time # Keep track of training cost for the last 'train_freq' batches. current_train_cost = train_cost / train_done if prev_train_done >= 1 and abs(train_done - prev_train_done) > 0: current_train_cost = float( train_cost - prev_train_cost) / float(train_done - prev_train_done) if numpy.isinf(c) or numpy.isnan(c): current_train_cost = 0 prev_train_cost = train_cost prev_train_done = train_done h, m, s = ConvertTimedelta(this_time - start_time) # We need to catch exceptions due to high numbers in exp try: print(".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_kl_divergence_cost = %.8f acc_mean_posterior_variance = %.8f" % (h, m, s,\ state['time_stop'] - (time.time() - start_time)/60.,\ step, \ batch['x'].shape[1], \ batch['max_length'], \ float(train_cost/train_done), \ math.exp(float(train_cost/train_done)), \ current_train_cost, \ math.exp(current_train_cost), \ float(train_misclass)/float(train_done), \ float(train_kl_divergence_cost/train_done), \ float(train_posterior_gaussian_mean_variance/train_dialogues_done))) except: pass ### Inspection phase if (step % 20 == 0): if model.add_latent_gaussian_per_utterance and model.add_latent_piecewise_per_utterance: try: print('posterior_gaussian_mean_combination', model.posterior_mean_combination.W.get_value()) except: pass print( 'latent_piecewise_utterance_variable_approx_posterior_alpha', numpy.mean( latent_piecewise_utterance_variable_approx_posterior_alpha ), latent_piecewise_utterance_variable_approx_posterior_alpha) print( 'latent_piecewise_utterance_variable_prior_alpha', numpy.mean( latent_piecewise_utterance_variable_prior_alpha), latent_piecewise_utterance_variable_prior_alpha) print( 'latent_piecewise_utterance_variable_alpha_diff', (latent_piecewise_utterance_variable_approx_posterior_alpha - latent_piecewise_utterance_variable_prior_alpha)) print('latent_piecewise_posterior_sample', numpy.min(latent_piecewise_posterior_sample), numpy.max(latent_piecewise_posterior_sample), latent_piecewise_posterior_sample[0, 0, :]) print('ran_uniform_const_utterance', numpy.min(ran_uniform_const_utterance), numpy.max(ran_uniform_const_utterance), ran_uniform_const_utterance[0, 0, :]) if model.utterance_decoder_gating.upper( ) == 'GRU' and model.decoder_bias_type.upper() == 'ALL': Wd_s_q = model.utterance_decoder.Wd_s_q.get_value() Wd_s_q_len = Wd_s_q.shape[0] print('model.utterance_decoder Wd_s_q full', numpy.mean(numpy.abs(Wd_s_q)), numpy.mean(Wd_s_q**2)) if model.add_latent_gaussian_per_utterance and model.add_latent_piecewise_per_utterance: Wd_s_q_gaussian = Wd_s_q[ Wd_s_q_len - 2 * model.latent_piecewise_per_utterance_dim:Wd_s_q_len - model.latent_piecewise_per_utterance_dim, :] Wd_s_q_piecewise = Wd_s_q[ Wd_s_q_len - model.latent_piecewise_per_utterance_dim:Wd_s_q_len, :] print('model.utterance_decoder Wd_s_q gaussian', numpy.mean(numpy.abs(Wd_s_q_gaussian)), numpy.mean(Wd_s_q_gaussian**2)) print('model.utterance_decoder Wd_s_q piecewise', numpy.mean(numpy.abs(Wd_s_q_piecewise)), numpy.mean(Wd_s_q_piecewise**2)) print( 'model.utterance_decoder Wd_s_q piecewise/gaussian', 
numpy.mean(numpy.abs(Wd_s_q_piecewise)) / numpy.mean(numpy.abs(Wd_s_q_gaussian)), numpy.mean(Wd_s_q_piecewise**2) / numpy.mean(Wd_s_q_gaussian**2)) elif model.add_latent_gaussian_per_utterance: Wd_s_q_piecewise = Wd_s_q[ Wd_s_q_len - model.latent_piecewise_per_utterance_dim:Wd_s_q_len, :] print('model.utterance_decoder Wd_s_q piecewise', numpy.mean(numpy.abs(Wd_s_q_piecewise)), numpy.mean(Wd_s_q_piecewise**2)) elif model.add_latent_piecewise_per_utterance: Wd_s_q_gaussian = Wd_s_q[ Wd_s_q_len - model.latent_piecewise_per_utterance_dim:Wd_s_q_len, :] print('model.utterance_decoder Wd_s_q gaussian', numpy.mean(numpy.abs(Wd_s_q_gaussian)), numpy.mean(Wd_s_q_gaussian**2)) if model.utterance_decoder_gating.upper( ) == 'BOW' and model.decoder_bias_type.upper() == 'ALL': Wd_bow_W_in = model.utterance_decoder.Wd_bow_W_in.get_value() Wd_bow_W_in_len = Wd_bow_W_in.shape[0] print('model.utterance_decoder Wd_bow_W_in full', numpy.mean(numpy.abs(Wd_bow_W_in)), numpy.mean(Wd_bow_W_in**2)) if model.add_latent_gaussian_per_utterance and model.add_latent_piecewise_per_utterance: Wd_bow_W_in_gaussian = Wd_bow_W_in[ Wd_bow_W_in_len - 2 * model.latent_piecewise_per_utterance_dim: Wd_bow_W_in_len - model.latent_piecewise_per_utterance_dim, :] Wd_bow_W_in_piecewise = Wd_bow_W_in[ Wd_bow_W_in_len - model. latent_piecewise_per_utterance_dim:Wd_bow_W_in_len, :] print('model.utterance_decoder Wd_bow_W_in gaussian', numpy.mean(numpy.abs(Wd_bow_W_in_gaussian)), numpy.mean(Wd_bow_W_in_gaussian**2)) print('model.utterance_decoder Wd_bow_W_in piecewise', numpy.mean(numpy.abs(Wd_bow_W_in_piecewise)), numpy.mean(Wd_bow_W_in_piecewise**2)) print( 'model.utterance_decoder Wd_bow_W_in piecewise/gaussian', numpy.mean(numpy.abs(Wd_bow_W_in_piecewise)) / numpy.mean(numpy.abs(Wd_bow_W_in_gaussian)), numpy.mean(Wd_bow_W_in_piecewise**2) / numpy.mean(Wd_bow_W_in_gaussian**2)) elif model.add_latent_gaussian_per_utterance: Wd_bow_W_in_piecewise = Wd_bow_W_in[ Wd_bow_W_in_len - model. latent_piecewise_per_utterance_dim:Wd_bow_W_in_len, :] print('model.utterance_decoder Wd_bow_W_in piecewise', numpy.mean(numpy.abs(Wd_bow_W_in_piecewise)), numpy.mean(Wd_bow_W_in_piecewise**2)) elif model.add_latent_piecewise_per_utterance: Wd_bow_W_in_gaussian = Wd_bow_W_in[ Wd_bow_W_in_len - model. 
latent_piecewise_per_utterance_dim:Wd_bow_W_in_len, :] print('model.utterance_decoder Wd_bow_W_in gaussian', numpy.mean(numpy.abs(Wd_bow_W_in_gaussian)), numpy.mean(Wd_bow_W_in_gaussian**2)) ### Evaluation phase if valid_data is not None and\ step % state['valid_freq'] == 0 and step > 1: start_validation = True # Only start validation loop once it's time to validate and once all previous batches have been reset if start_validation and is_end_of_batch: start_validation = False valid_data.start() valid_cost = 0 valid_kl_divergence_cost = 0 valid_posterior_gaussian_mean_variance = 0 valid_wordpreds_done = 0 valid_dialogues_done = 0 logger.debug("[VALIDATION START]") while True: batch = valid_data.next() # Validation finished if not batch: break logger.debug("[VALID] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length'])) x_data = batch['x'] x_data_reversed = batch['x_reversed'] max_length = batch['max_length'] x_cost_mask = batch['x_mask'] x_reset = batch['x_reset'] ran_gaussian_const_utterance = batch[ 'ran_var_gaussian_constutterance'] ran_uniform_const_utterance = batch[ 'ran_var_uniform_constutterance'] ran_decoder_drop_mask = batch['ran_decoder_drop_mask'] posterior_gaussian_mean_variance = 0.0 c, c_list, kl_divergence_cost = eval_batch( x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_gaussian_const_utterance, ran_uniform_const_utterance, ran_decoder_drop_mask) # Rehape into matrix, where rows are validation samples and columns are tokens # Note that we use max_length-1 because we don't get a cost for the first token # (the first token is always assumed to be eos) c_list = c_list.reshape((batch['x'].shape[1], max_length - 1), order=(1, 0)) c_list = numpy.sum(c_list, axis=1) words_in_dialogues = numpy.sum(x_cost_mask, axis=0) c_list = c_list / words_in_dialogues if numpy.isinf(c) or numpy.isnan(c): continue valid_cost += c valid_kl_divergence_cost += kl_divergence_cost valid_posterior_gaussian_mean_variance += posterior_gaussian_mean_variance # Print batch statistics print('valid_cost', valid_cost) print('valid_kl_divergence_cost sample', kl_divergence_cost) print('posterior_gaussian_mean_variance', posterior_gaussian_mean_variance) valid_wordpreds_done += batch['num_preds'] valid_dialogues_done += batch['num_dialogues'] logger.debug("[VALIDATION END]") valid_cost /= max(1.0, valid_wordpreds_done) valid_kl_divergence_cost /= max(1.0, valid_wordpreds_done) valid_posterior_gaussian_mean_variance /= max( 1.0, valid_dialogues_done) if (len(timings["valid_cost"]) == 0) \ or (valid_cost < numpy.min(timings["valid_cost"])) \ or (save_model_on_first_valid and valid_rounds == 0): patience = state['patience'] # Save model if there is decrease in validation cost save(model, timings, train_data) print('best valid_cost', valid_cost) elif valid_cost >= timings["valid_cost"][-1] * state[ 'cost_threshold']: patience -= 1 if args.save_every_valid_iteration: save(model, timings, train_data, '_' + str(step) + '_') if args.auto_restart: save(model, timings, train_data, '_auto_') # We need to catch exceptions due to high numbers in exp try: print( "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid kldiv cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % (float(valid_cost), float( math.exp(valid_cost)), float(valid_kl_divergence_cost), float(valid_posterior_gaussian_mean_variance), patience)) except: try: print("** valid cost (NLL) = %.4f, patience = %d" % (float(valid_cost), patience)) except: pass 
timings["train_cost"].append(train_cost / train_done) timings["train_kl_divergence_cost"].append( train_kl_divergence_cost / train_done) timings["train_posterior_gaussian_mean_variance"].append( train_posterior_gaussian_mean_variance / train_dialogues_done) timings["valid_cost"].append(valid_cost) timings["valid_kl_divergence_cost"].append( valid_kl_divergence_cost) timings["valid_posterior_gaussian_mean_variance"].append( valid_posterior_gaussian_mean_variance) # Reset train cost, train misclass and train done metrics train_cost = 0 train_done = 0 prev_train_cost = 0 prev_train_done = 0 # Count number of validation rounds done so far valid_rounds += 1 step += 1 logger.debug("All done, exiting...")
MODEL_PREFIX = '/home/ml/mnosew1/SavedModels/Twitter/1489857182.98_TwitterModel'
state_path = '%s_state.pkl' % MODEL_PREFIX
model_path = '%s_model.npz' % MODEL_PREFIX

state = prototype_state()
with open(state_path, 'r') as handle:
    state.update(cPickle.load(handle))
#state['dictionary'] = '/home/ml/mnosew1/data/twitter/hred_bpe/Dataset.dict.pkl'
state['dictionary'] = '/home/ml/mnosew1/SavedModels/Twitter/Dataset.dict-5k.pkl'

print 'Building model...'
model = DialogEncoderDecoder(state)
print 'Building sampler...'
sampler = search.BeamSampler(model)
print 'Loading model...'
model.load(model_path)
print 'Model built.'

HISTORY = []


@app.route('/hred', methods=['POST'])
def hred_response():
    print 'Generating HRED response...'
    text = request.json['result']['resolvedQuery']
    text = text.replace("'", " '")
    context = '<first_speaker> %s </s>' % text.strip().lower()
    HISTORY.append(context)
    print 'History:', HISTORY
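# A hedged sketch of how the /hred endpoint above could be exercised. The
# request body mirrors the one field the handler reads
# (request.json['result']['resolvedQuery']); the host and port are assumptions
# about how the Flask app is served.
import json
import urllib2

payload = {'result': {'resolvedQuery': "what's up?"}}
req = urllib2.Request('http://localhost:5000/hred',
                      data=json.dumps(payload),
                      headers={'Content-Type': 'application/json'})
response = urllib2.urlopen(req)
print response.read()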
def main():
    args = parse_args()
    state = prototype_ubuntu_HRED()  #prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))
    state['dictionary'] = "/home/ml/rlowe1/UbuntuData/Dataset.dict.pkl"

    # MODIFIED: logging must be configured before any logger object is
    # constructed, so the original basicConfig call was replaced with the one below.
    # logging.basicConfig(level=getattr(logging, state['level']), format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logging.basicConfig(
        format='[%(asctime)s][%(levelname)s][%(filename)s][%(lineno)d] - %(message)s',
        datefmt='%d/%m/%Y %H:%M:%S',
        filename='/home/2016/pparth2/Desktop/gods/Goal-Oriented_Dialogue_Systems/Gods-master/agents/hred/log.chat',
        filemode='a',
        level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    model = DialogEncoderDecoder(state)
    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    logger.info("This model uses " + model.decoder_bias_type + " bias type")

    #sampler = search.RandomSampler(model)
    sampler = search.BeamSampler(model)

    # Start chat loop
    utterances = collections.deque()

    while (True):
        var = raw_input("User - ")

        # Keeping zero past utterances means the model has no memory of the
        # conversation; increasing the history length works fine as well.
        while len(utterances) > 0:
            utterances.popleft()

        current_utterance = [model.end_sym_utterance] + ['<first_speaker>'] + var.split() + [model.end_sym_utterance]
        utterances.append(current_utterance)

        #TODO Sample a random reply. To spice it up, we could pick the longest reply or the reply with the fewest placeholders...
        seqs = list(itertools.chain(*utterances))

        #TODO Retrieve only replies which are generated for the second speaker...
        sentences = sample(model, \
            seqs=[seqs], ignore_unk=args.ignore_unk, \
            sampler=sampler, n_samples=1)

        if len(sentences) == 0:
            raise ValueError("Generation error, no sentences were produced!")

        utterances.append(sentences[0][0].split())

        reply = sentences[0][0].encode('utf-8')
        print "AI - ", remove_speaker_tokens(reply)
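# The chat loops above call a module-level sample() helper that is not shown
# in this section. A minimal sketch of a wrapper with the observed call
# signature, simply delegating to the sampler; the original helper may do more
# (e.g. filtering or detokenizing the candidates).
def sample(model, seqs, ignore_unk=False, sampler=None, n_samples=1):
    sentences, costs = sampler.sample(seqs,
                                      n_samples=n_samples,
                                      ignore_unk=ignore_unk)
    return sentences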
def main(args): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") state = eval(args.prototype)() timings = init_timings() if args.resume != "": logger.debug("Resuming %s" % args.resume) state_file = args.resume + '_state.pkl' timings_file = args.resume + '_timing.npz' if os.path.isfile(state_file) and os.path.isfile(timings_file): logger.debug("Loading previous state") state = cPickle.load(open(state_file, 'r')) timings = dict(numpy.load(open(timings_file, 'r'))) for x, y in timings.items(): timings[x] = list(y) # Increment seed to make sure we get newly shuffled batches when training on large datasets state['seed'] = state['seed'] + 10 else: raise Exception("Cannot resume, cannot find files!") logger.debug("State:\n{}".format(pprint.pformat(state))) logger.debug("Timings:\n{}".format(pprint.pformat(timings))) if args.force_train_all_wordemb == True: state['fix_pretrained_word_embeddings'] = False model = DialogEncoderDecoder(state) rng = model.rng if args.resume != "": filename = args.resume + '_model.npz' if os.path.isfile(filename): logger.debug("Loading previous model") parameter_strings_to_ignore = [] if args.reinitialize_decoder_parameters: parameter_strings_to_ignore += ['latent_utterance_prior'] parameter_strings_to_ignore += [ 'latent_utterance_approx_posterior' ] if args.reinitialize_variational_parameters: parameter_strings_to_ignore += ['Wd_'] parameter_strings_to_ignore += ['bd_'] parameter_strings_to_ignore += ['variational_cost_weight'] load(model, filename, parameter_strings_to_ignore) else: raise Exception("Cannot resume, cannot find model file!") if 'run_id' not in model.state: raise Exception( 'Backward compatibility not ensured! (need run_id in state)') else: # assign new run_id key model.state['run_id'] = RUN_ID logger.debug("Compile trainer") if not state["use_nce"]: if ('add_latent_gaussian_per_utterance' in state) and (state["add_latent_gaussian_per_utterance"]): logger.debug( "Training using variational lower bound on log-likelihood") else: logger.debug("Training using exact log-likelihood") train_batch = model.build_train_function() else: logger.debug("Training with noise contrastive estimation") train_batch = model.build_nce_function() eval_batch = model.build_eval_function() if model.add_latent_gaussian_per_utterance: eval_grads = model.build_eval_grads() random_sampler = search.RandomSampler(model) beam_sampler = search.BeamSampler(model) logger.debug("Load data") train_data, \ valid_data, = get_train_iterator(state) train_data.start() use_secondary_data = False if ('secondary_train_dialogues' in state) and (len(state['secondary_train_dialogues']) > 0): logger.debug("Load secondary data") use_secondary_data = True secondary_train_data = get_secondary_train_iterator(state) secondary_train_data.start() secondary_rng = numpy.random.RandomState(state['seed']) # Build the data structures for Bleu evaluation if 'bleu_evaluation' in state: bleu_eval_n_1 = BleuEvaluator(n=1) bleu_eval_n_2 = BleuEvaluator(n=2) bleu_eval_n_3 = BleuEvaluator(n=3) bleu_eval_n_4 = BleuEvaluator(n=4) jaccard_eval = JaccardEvaluator() recall_at_1_eval = RecallEvaluator(n=1) recall_at_5_eval = RecallEvaluator(n=5) mrr_at_5_eval = MRREvaluator(n=5) tfidf_cs_at_1_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 1) tfidf_cs_at_5_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 5) samples = open(state['bleu_evaluation'], 'r').readlines() n = state['bleu_context_length'] contexts = [] targets = [] for x in samples: sentences = 
x.strip().split('\t') assert len(sentences) > n contexts.append(sentences[:n]) targets.append(sentences[n:]) # Start looping through the dataset step = 0 patience = state['patience'] start_time = time.time() train_cost = 0 train_variational_cost = 0 train_posterior_mean_variance = 0 train_misclass = 0 train_done = 0 train_dialogues_done = 0.0 prev_train_cost = 0 prev_train_done = 0 ex_done = 0 is_end_of_batch = True start_validation = False training_on_secondary_dataset = False batch = None while (step < state['loop_iters'] and (time.time() - start_time) / 60. < state['time_stop'] and patience >= 0): # Sample stuff if step % 200 == 0: # First generate stochastic samples for param in model.params: print "%s = %.4f" % (param.name, numpy.sum(param.get_value()** 2)**0.5) samples, costs = random_sampler.sample([[]], n_samples=1, n_turns=3) print "Sampled : {}".format(samples[0]) # Training phase # If we are training on a primary and secondary dataset, sample at random from either of them if is_end_of_batch: if use_secondary_data and (secondary_rng.uniform() > state['secondary_proportion']): training_on_secondary_dataset = True else: training_on_secondary_dataset = False if training_on_secondary_dataset: batch = secondary_train_data.next() else: batch = train_data.next() # Train finished if not batch: # Restart training logger.debug("Got None...") break logger.debug("[TRAIN] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length'])) x_data = batch['x'] x_data_reversed = batch['x_reversed'] max_length = batch['max_length'] x_cost_mask = batch['x_mask'] x_semantic = batch['x_semantic'] x_reset = batch['x_reset'] ran_cost_utterance = batch['ran_var_constutterance'] ran_decoder_drop_mask = batch['ran_decoder_drop_mask'] is_end_of_batch = False if numpy.sum(numpy.abs(x_reset)) < 1: print 'END-OF-BATCH EXAMPLE!' is_end_of_batch = True if state['use_nce']: y_neg = rng.choice(size=(10, max_length, x_data.shape[1]), a=model.idim, p=model.noise_probs).astype('int32') c, variational_cost, posterior_mean_variance = train_batch( x_data, x_data_reversed, y_neg, max_length, x_cost_mask, x_semantic, x_reset, ran_cost_utterance, ran_decoder_drop_mask) else: c, variational_cost, posterior_mean_variance = train_batch( x_data, x_data_reversed, max_length, x_cost_mask, x_semantic, x_reset, ran_cost_utterance, ran_decoder_drop_mask) print 'cost_sum', c print 'cost_mean', c / float(numpy.sum(x_cost_mask)) print 'variational_cost_sum', variational_cost print 'variational_cost_mean', variational_cost / float( len(numpy.where(x_data == model.eos_sym)[0])) print 'posterior_mean_variance', posterior_mean_variance #if variational_cost > 2: # print 'x_data', x_data # print 'x_data_reversed', x_data_reversed # print 'max_length', max_length # print 'x_cost_mask', x_cost_mask # print 'x_semantic', x_semantic # print 'x_reset', x_reset # print 'ran_cost_utterance', ran_cost_utterance[0:3, 0:3, 0:3] if numpy.isinf(c) or numpy.isnan(c): logger.warn("Got NaN cost .. skipping") gc.collect() continue train_cost += c train_variational_cost += variational_cost train_posterior_mean_variance += posterior_mean_variance train_done += batch['num_preds'] train_dialogues_done += batch['num_dialogues'] this_time = time.time() if step % state['train_freq'] == 0: elapsed = this_time - start_time # Keep track of training cost for the last 'train_freq' batches. 
current_train_cost = train_cost / train_done if prev_train_done >= 1: current_train_cost = float( train_cost - prev_train_cost) / float(train_done - prev_train_done) prev_train_cost = train_cost prev_train_done = train_done h, m, s = ConvertTimedelta(this_time - start_time) print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_variational_cost = %.8f acc_mean_posterior_variance = %.8f" % (h, m, s,\ state['time_stop'] - (time.time() - start_time)/60.,\ step, \ batch['x'].shape[1], \ batch['max_length'], \ float(train_cost/train_done), \ math.exp(float(train_cost/train_done)), \ current_train_cost, \ math.exp(current_train_cost), \ float(train_misclass)/float(train_done), \ float(train_variational_cost/train_done), \ float(train_posterior_mean_variance/train_dialogues_done)) if valid_data is not None and\ step % state['valid_freq'] == 0 and step > 1: start_validation = True # Evaluate gradient variance every 200 steps if (step % 200 == 0) and (model.add_latent_gaussian_per_utterance): k_eval = 10 softmax_costs = numpy.zeros((k_eval), dtype='float32') var_costs = numpy.zeros((k_eval), dtype='float32') gradients_wrt_softmax = numpy.zeros( (k_eval, model.qdim_decoder, model.qdim_decoder), dtype='float32') for k in range(0, k_eval): batch = add_random_variables_to_batch(model.state, model.rng, batch, None, False) ran_cost_utterance = batch['ran_var_constutterance'] ran_decoder_drop_mask = batch['ran_decoder_drop_mask'] softmax_cost, var_cost, grads_wrt_softmax, grads_wrt_variational_cost = eval_grads( x_data, x_data_reversed, max_length, x_cost_mask, x_semantic, x_reset, ran_cost_utterance, ran_decoder_drop_mask) softmax_costs[k] = softmax_cost var_costs[k] = var_cost gradients_wrt_softmax[k, :, :] = grads_wrt_softmax print 'mean softmax_costs', numpy.mean(softmax_costs) print 'std softmax_costs', numpy.std(softmax_costs) print 'mean var_costs', numpy.mean(var_costs) print 'std var_costs', numpy.std(var_costs) print 'mean gradients_wrt_softmax', numpy.mean( numpy.abs(numpy.mean(gradients_wrt_softmax, axis=0))), numpy.mean( gradients_wrt_softmax, axis=0) print 'std gradients_wrt_softmax', numpy.mean( numpy.std(gradients_wrt_softmax, axis=0)), numpy.std(gradients_wrt_softmax, axis=0) print 'std greater than mean', numpy.where( numpy.std(gradients_wrt_softmax, axis=0) > numpy.abs( numpy.mean(gradients_wrt_softmax, axis=0)))[0].shape[0] Wd_s_q = model.utterance_decoder.Wd_s_q.get_value() print 'Wd_s_q all', numpy.sum(numpy.abs(Wd_s_q)), numpy.mean( numpy.abs(Wd_s_q)) print 'Wd_s_q latent', numpy.sum( numpy.abs( Wd_s_q[(Wd_s_q.shape[0] - state['latent_gaussian_per_utterance_dim'] ):Wd_s_q.shape[0], :])), numpy.mean( numpy.abs(Wd_s_q[( Wd_s_q.shape[0] - state['latent_gaussian_per_utterance_dim'] ):Wd_s_q.shape[0], :])) print 'Wd_s_q ratio', (numpy.sum( numpy.abs(Wd_s_q[(Wd_s_q.shape[0] - state['latent_gaussian_per_utterance_dim'] ):Wd_s_q.shape[0], :])) / numpy.sum(numpy.abs(Wd_s_q))) #print 'tmp_normalizing_constant_a', tmp_normalizing_constant_a #print 'tmp_normalizing_constant_b', tmp_normalizing_constant_b #print 'tmp_c', tmp_c.shape, tmp_c #print 'tmp_d', tmp_d.shape, tmp_d #print 'grads_wrt_softmax', grads_wrt_softmax.shape, numpy.sum(numpy.abs(grads_wrt_softmax)), numpy.abs(grads_wrt_softmax[0:5,0:5]) #print 'grads_wrt_variational_cost', grads_wrt_variational_cost.shape, numpy.sum(numpy.abs(grads_wrt_variational_cost)), numpy.abs(grads_wrt_variational_cost[0:5,0:5]) # Only 
start validation loop once it's time to validate and once all previous batches have been reset if start_validation and is_end_of_batch: start_validation = False valid_data.start() valid_cost = 0 valid_variational_cost = 0 valid_posterior_mean_variance = 0 valid_wordpreds_done = 0 valid_dialogues_done = 0 # Prepare variables for plotting histogram over word-perplexities and mutual information valid_data_len = valid_data.data_len valid_cost_list = numpy.zeros((valid_data_len, )) valid_pmi_list = numpy.zeros((valid_data_len, )) # Prepare variables for printing the training examples the model performs best and worst on valid_extrema_setsize = min(state['track_extrema_samples_count'], valid_data_len) valid_extrema_samples_to_print = min( state['print_extrema_samples_count'], valid_extrema_setsize) max_stored_len = 160 # Maximum number of tokens to store for dialogues with highest and lowest validation errors valid_lowest_costs = numpy.ones((valid_extrema_setsize, )) * 1000 valid_lowest_dialogues = numpy.ones( (valid_extrema_setsize, max_stored_len)) * 1000 valid_highest_costs = numpy.ones( (valid_extrema_setsize, )) * (-1000) valid_highest_dialogues = numpy.ones( (valid_extrema_setsize, max_stored_len)) * (-1000) logger.debug("[VALIDATION START]") while True: batch = valid_data.next() # Train finished if not batch: break logger.debug("[VALID] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length'])) x_data = batch['x'] x_data_reversed = batch['x_reversed'] max_length = batch['max_length'] x_cost_mask = batch['x_mask'] x_semantic = batch['x_semantic'] x_reset = batch['x_reset'] ran_cost_utterance = batch['ran_var_constutterance'] ran_decoder_drop_mask = batch['ran_decoder_drop_mask'] c, c_list, variational_cost, posterior_mean_variance = eval_batch( x_data, x_data_reversed, max_length, x_cost_mask, x_semantic, x_reset, ran_cost_utterance, ran_decoder_drop_mask) # Rehape into matrix, where rows are validation samples and columns are tokens # Note that we use max_length-1 because we don't get a cost for the first token # (the first token is always assumed to be eos) c_list = c_list.reshape((batch['x'].shape[1], max_length - 1), order=(1, 0)) c_list = numpy.sum(c_list, axis=1) words_in_dialogues = numpy.sum(x_cost_mask, axis=0) c_list = c_list / words_in_dialogues if numpy.isinf(c) or numpy.isnan(c): continue valid_cost += c valid_variational_cost += variational_cost valid_posterior_mean_variance += posterior_mean_variance print 'valid_cost', valid_cost print 'valid_variational_cost sample', variational_cost print 'posterior_mean_variance', posterior_mean_variance valid_wordpreds_done += batch['num_preds'] valid_dialogues_done += batch['num_dialogues'] logger.debug("[VALIDATION END]") valid_cost /= valid_wordpreds_done valid_variational_cost /= valid_wordpreds_done valid_posterior_mean_variance /= valid_dialogues_done if len(timings["valid_cost"]) == 0 or valid_cost < numpy.min( timings["valid_cost"]): patience = state['patience'] # Saving model if decrease in validation cost save(model, timings) print 'best valid_cost', valid_cost elif valid_cost >= timings["valid_cost"][-1] * state[ 'cost_threshold']: patience -= 1 if args.save_every_valid_iteration: save(model, timings, '_' + str(step) + '_') print "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid variational cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % ( float(valid_cost), float( math.exp(valid_cost)), float(valid_variational_cost), float(valid_posterior_mean_variance), 
patience) timings["train_cost"].append(train_cost / train_done) timings["train_variational_cost"].append(train_variational_cost / train_done) timings["train_posterior_mean_variance"].append( train_posterior_mean_variance / train_dialogues_done) timings["valid_cost"].append(valid_cost) timings["valid_variational_cost"].append(valid_variational_cost) timings["valid_posterior_mean_variance"].append( valid_posterior_mean_variance) # Reset train cost, train misclass and train done train_cost = 0 train_done = 0 prev_train_cost = 0 prev_train_done = 0 step += 1 logger.debug("All done, exiting...")
def main():
    #### yawa add: load token <-> index dictionaries
    raw_dict = cPickle.load(open('./Data/Dataset.dict.pkl', 'r'))
    str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in raw_dict])
    idx_to_str = dict([(tok_id, tok) for tok, tok_id, _, _ in raw_dict])
    #########

    args = parse_args()
    state = prototype_state()

    state_path = args.model_prefix + "_state.pkl"
    model_path = args.model_prefix + "_model.npz"

    with open(state_path) as src:
        state.update(cPickle.load(src))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    model = DialogEncoderDecoder(state)

    sampler = search.RandomSampler(model)
    if args.beam_search:
        sampler = search.BeamSampler(model)

    if os.path.isfile(model_path):
        logger.debug("Loading previous model")
        model.load(model_path)
    else:
        raise Exception("Must specify a valid model path")

    contexts = [[]]
    lines = open(args.context, "r").readlines()
    if len(lines):
        contexts = [x.strip() for x in lines]
    #contexts = cPickle.load(open('./Data/Test.dialogues.pkl', 'r'))

    print('Sampling started...')
    context_samples, context_costs, att_weights, att_context = sampler.sample(
        contexts,
        n_samples=args.n_samples,
        n_turns=args.n_turns,
        ignore_unk=args.ignore_unk,
        verbose=args.verbose)
    print('Sampling finished.')
    print('Saving to file...')

    # Write to output file
    output_handle = open(args.output, "w")
    for context_sample in context_samples:
        print >> output_handle, '\t'.join(context_sample)

    print "number of weights:" + str(len(att_weights))

    # Dump the attention weights and attended contexts for later inspection
    cPickle.dump(att_weights,
                 open('Data/beam_search_2000_2_weight.pkl', 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(att_context,
                 open('Data/beam_search_2000_2_context.pkl', 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)

    output_handle.close()
    print('Saving to file finished.')
    print('All done!')
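# A hedged sketch of how the attention pickles written above could be inspected
# afterwards; converting an entry to a numpy array is an assumption about the
# stored structure (the commented-out reshape experiments above suggest nested
# per-beam weight lists).
import cPickle
import numpy

att_weights = cPickle.load(open('Data/beam_search_2000_2_weight.pkl', 'rb'))
print 'number of weight entries:', len(att_weights)
if len(att_weights) > 0:
    print 'first entry shape:', numpy.array(att_weights[0]).shape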
def main(args): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") state = eval(args.prototype)() timings = init_timings() if args.resume != "": logger.debug("Resuming %s" % args.resume) state_file = args.resume + '_state.pkl' timings_file = args.resume + '_timing.npz' if os.path.isfile(state_file) and os.path.isfile(timings_file): logger.debug("Loading previous state") state = cPickle.load(open(state_file, 'r')) timings = dict(numpy.load(open(timings_file, 'r'))) for x, y in timings.items(): timings[x] = list(y) else: raise Exception("Cannot resume, cannot find files!") logger.debug("State:\n{}".format(pprint.pformat(state))) logger.debug("Timings:\n{}".format(pprint.pformat(timings))) model = DialogEncoderDecoder(state) rng = model.rng if args.resume != "": filename = args.resume + '_model.npz' if os.path.isfile(filename): logger.debug("Loading previous model") load(model, filename) else: raise Exception("Cannot resume, cannot find model file!") if 'run_id' not in model.state: raise Exception( 'Backward compatibility not ensured! (need run_id in state)') else: # assign new run_id key model.state['run_id'] = RUN_ID logger.debug("Compile trainer") if not state["use_nce"]: train_batch = model.build_train_function() else: train_batch = model.build_nce_function() eval_batch = model.build_eval_function() eval_misclass_batch = model.build_eval_misclassification_function() random_sampler = search.RandomSampler(model) beam_sampler = search.BeamSampler(model) logger.debug("Load data") train_data, \ valid_data, \ test_data = get_batch_iterator(rng, state) train_data.start() # Build the data structures for Bleu evaluation if 'bleu_evaluation' in state: bleu_eval = BleuEvaluator() jaccard_eval = JaccardEvaluator() recall_at_1_eval = RecallEvaluator(n=1) recall_at_5_eval = RecallEvaluator(n=5) mrr_at_5_eval = MRREvaluator(n=5) tfidf_cs_at_1_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 1) tfidf_cs_at_5_eval = TFIDF_CS_Evaluator(model, train_data.data_len, 5) samples = open(state['bleu_evaluation'], 'r').readlines() n = state['bleu_context_length'] contexts = [] targets = [] for x in samples: sentences = x.strip().split('\t') assert len(sentences) > n contexts.append(sentences[:n]) targets.append(sentences[n:]) # Start looping through the dataset step = 0 patience = state['patience'] start_time = time.time() train_cost = 0 train_misclass = 0 train_done = 0 ex_done = 0 while (step < state['loop_iters'] and (time.time() - start_time) / 60. < state['time_stop'] and patience >= 0): # Sample stuff if step % 200 == 0: for param in model.params: print "%s = %.4f" % (param.name, numpy.sum(param.get_value()** 2)**0.5) samples, costs = random_sampler.sample([[]], n_samples=1, n_turns=3) print "Sampled : {}".format(samples[0]) # Training phase batch = train_data.next() # Train finished if not batch: # Restart training logger.debug("Got None...") break logger.debug("[TRAIN] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length'])) x_data = batch['x'] max_length = batch['max_length'] x_cost_mask = batch['x_mask'] if state['use_nce']: y_neg = rng.choice(size=(10, max_length, x_data.shape[1]), a=model.idim, p=model.noise_probs).astype('int32') c = train_batch(x_data, y_neg, max_length, x_cost_mask) else: c = train_batch(x_data, max_length, x_cost_mask) if numpy.isinf(c) or numpy.isnan(c): logger.warn("Got NaN cost .. 
skipping") continue train_cost += c # Compute word-error rate miscl = eval_misclass_batch(x_data, max_length, x_cost_mask) if numpy.isinf(c) or numpy.isnan(c): logger.warn("Got NaN misclassification .. skipping") continue train_misclass += miscl train_done += batch['num_preds'] this_time = time.time() if step % state['train_freq'] == 0: elapsed = this_time - start_time h, m, s = ConvertTimedelta(this_time - start_time) print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f acc_mean_word_error = %.4f " % (h, m, s,\ state['time_stop'] - (time.time() - start_time)/60.,\ step, \ batch['x'].shape[1], \ batch['max_length'], \ float(train_cost/train_done), \ math.exp(float(train_cost/train_done)), \ float(train_misclass)/float(train_done)) if valid_data is not None and\ step % state['valid_freq'] == 0 and step > 1: valid_data.start() valid_cost = 0 valid_misclass = 0 valid_empirical_mutual_information = 0 valid_wordpreds_done = 0 valid_triples_done = 0 # Prepare variables for plotting histogram over word-perplexities and mutual information valid_data_len = valid_data.data_len valid_cost_list = numpy.zeros((valid_data_len, )) valid_pmi_list = numpy.zeros((valid_data_len, )) # Prepare variables for printing the training examples the model performs best and worst on valid_extrema_setsize = min(state['track_extrema_samples_count'], valid_data_len) valid_extrema_samples_to_print = min( state['print_extrema_samples_count'], valid_extrema_setsize) valid_lowest_costs = numpy.ones((valid_extrema_setsize, )) * 1000 valid_lowest_triples = numpy.ones( (valid_extrema_setsize, state['seqlen'])) * 1000 valid_highest_costs = numpy.ones( (valid_extrema_setsize, )) * (-1000) valid_highest_triples = numpy.ones( (valid_extrema_setsize, state['seqlen'])) * (-1000) logger.debug("[VALIDATION START]") while True: batch = valid_data.next() # Train finished if not batch: break logger.debug("[VALID] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length'])) x_data = batch['x'] max_length = batch['max_length'] x_cost_mask = batch['x_mask'] c, c_list = eval_batch(x_data, max_length, x_cost_mask) c_list = c_list.reshape((batch['x'].shape[1], max_length), order=(1, 0)) c_list = numpy.sum(c_list, axis=1) words_in_triples = numpy.sum(x_cost_mask, axis=0) c_list = c_list / words_in_triples if numpy.isinf(c) or numpy.isnan(c): continue valid_cost += c nxt = min((valid_triples_done + batch['x'].shape[1]), valid_data_len) triples_in_batch = nxt - valid_triples_done valid_cost_list[(nxt - triples_in_batch):nxt] = numpy.exp( c_list[0:triples_in_batch]) # Store best and worst validation costs con_costs = np.concatenate( [valid_lowest_costs, c_list[0:triples_in_batch]]) con_triples = np.concatenate( [valid_lowest_triples, x_data[:, 0:triples_in_batch].T], axis=0) con_indices = con_costs.argsort()[0:valid_extrema_setsize][::1] valid_lowest_costs = con_costs[con_indices] valid_lowest_triples = con_triples[con_indices] con_costs = np.concatenate( [valid_highest_costs, c_list[0:triples_in_batch]]) con_triples = np.concatenate( [valid_highest_triples, x_data[:, 0:triples_in_batch].T], axis=0) con_indices = con_costs.argsort( )[-valid_extrema_setsize:][::-1] valid_highest_costs = con_costs[con_indices] valid_highest_triples = con_triples[con_indices] # Compute word-error rate miscl = eval_misclass_batch(x_data, max_length, x_cost_mask) if numpy.isinf(c) or numpy.isnan(c): continue valid_misclass += miscl # Compute empirical mutual information if state['compute_mutual_information'] == True: # 
Compute marginal log-likelihood of last utterance in triple: # We approximate it with the margina log-probabiltiy of the utterance being observed first in the triple x_data_last_utterance = batch['x_last_utterance'] x_cost_mask_last_utterance = batch['x_mask_last_utterance'] marginal_last_utterance_loglikelihood, marginal_last_utterance_loglikelihood_list = eval_batch( x_data_last_utterance, max_length, x_cost_mask_last_utterance) marginal_last_utterance_loglikelihood_list = marginal_last_utterance_loglikelihood_list.reshape( (batch['x'].shape[1], max_length), order=(1, 0)) marginal_last_utterance_loglikelihood_list = numpy.sum( marginal_last_utterance_loglikelihood_list, axis=1) # If we wanted to normalize histogram plots by utterance length, we should enable this: #words_in_last_utterance = numpy.sum(x_cost_mask_last_utterance, axis=0) #marginal_last_utterance_loglikelihood_list = marginal_last_utterance_loglikelihood_list / words_in_last_utterance # Compute marginal log-likelihood of first utterances in triple by masking the last utterance x_cost_mask_first_utterances = x_cost_mask - x_cost_mask_last_utterance marginal_first_utterances_loglikelihood, marginal_first_utterances_loglikelihood_list = eval_batch( x_data, max_length, x_cost_mask_first_utterances) marginal_first_utterances_loglikelihood_list = marginal_first_utterances_loglikelihood_list.reshape( (batch['x'].shape[1], max_length), order=(1, 0)) marginal_first_utterances_loglikelihood_list = numpy.sum( marginal_first_utterances_loglikelihood_list, axis=1) # If we wanted to normalize histogram plots by utterance length, we should enable this: #words_in_first_utterances = numpy.sum(x_cost_mask_first_utterances, axis=0) #marginal_first_utterances_loglikelihood_list = marginal_first_utterances_loglikelihood_list / words_in_first_utterances # Compute empirical mutual information and pointwise empirical mutual information valid_empirical_mutual_information += -c + marginal_first_utterances_loglikelihood + marginal_last_utterance_loglikelihood valid_pmi_list[(nxt - triples_in_batch):nxt] = ( -c_list * words_in_triples + marginal_first_utterances_loglikelihood_list + marginal_last_utterance_loglikelihood_list )[0:triples_in_batch] valid_wordpreds_done += batch['num_preds'] valid_triples_done += batch['x'].shape[1] logger.debug("[VALIDATION END]") valid_cost /= valid_wordpreds_done valid_misclass /= float(valid_wordpreds_done) valid_empirical_mutual_information /= float(valid_triples_done) if len(timings["valid_cost"] ) == 0 or valid_cost < timings["valid_cost"][-1]: patience = state['patience'] # Saving model if decrease in validation cost save(model, timings) elif valid_cost >= timings["valid_cost"][-1] * state[ 'cost_threshold']: patience -= 1 print "** valid cost = %.4f, valid word-perplexity = %.4f, valid mean word-error = %.4f, valid emp. 
mutual information = %.4f, patience = %d" % ( float(valid_cost), float( math.exp(valid_cost)), float(valid_misclass), valid_empirical_mutual_information, patience) timings["train_cost"].append(train_cost / train_done) timings["train_misclass"].append( float(train_misclass) / float(train_done)) timings["valid_cost"].append(valid_cost) timings["valid_misclass"].append(valid_misclass) timings["valid_emi"].append(valid_empirical_mutual_information) # Reset train cost, train misclass and train done train_cost = 0 train_misclass = 0 train_done = 0 # Plot histogram over validation costs try: pylab.figure() bins = range(0, 50, 1) pylab.hist(valid_cost_list, normed=1, histtype='bar') pylab.savefig(model.state['save_dir'] + '/' + model.state['run_id'] + "_" + model.state['prefix'] + 'Valid_WordPerplexities_' + str(step) + '.png') except: pass # Print 5 of 10% validation samples with highest log-likelihood if state['track_extrema_validation_samples'] == True: print " highest word log-likelihood valid samples: " np.random.shuffle(valid_lowest_triples) for i in range(valid_extrema_samples_to_print): print " Sample: {}".format(" ".join( model.indices_to_words( numpy.ravel(valid_lowest_triples[i, :])))) print " lowest word log-likelihood valid samples: " np.random.shuffle(valid_highest_triples) for i in range(valid_extrema_samples_to_print): print " Sample: {}".format(" ".join( model.indices_to_words( numpy.ravel(valid_highest_triples[i, :])))) # Plot histogram over empirical pointwise mutual informations if state['compute_mutual_information'] == True: try: pylab.figure() bins = range(0, 100, 1) pylab.hist(valid_pmi_list, normed=1, histtype='bar') pylab.savefig(model.state['save_dir'] + '/' + model.state['run_id'] + "_" + model.state['prefix'] + 'Valid_PMI_' + str(step) + '.png') except: pass if 'bleu_evaluation' in state and \ step % state['valid_freq'] == 0 and step > 1: # Compute samples with beam search logger.debug( "Executing beam search to get targets for bleu, jaccard etc.") samples, costs = beam_sampler.sample(contexts, n_samples=5, ignore_unk=True) logger.debug("Finished beam search.") assert len(samples) == len(contexts) #print 'samples', samples # Bleu evaluation bleu = bleu_eval.evaluate(samples, targets) print "** bleu score = %.4f " % bleu[0] timings["valid_bleu"].append(bleu[0]) # Jaccard evaluation jaccard = jaccard_eval.evaluate(samples, targets) print "** jaccard score = %.4f " % jaccard timings["valid_jaccard"].append(jaccard) # Recall evaluation recall_at_1 = recall_at_1_eval.evaluate(samples, targets) print "** recall@1 score = %.4f " % recall_at_1 timings["valid_recall_at_1"].append(recall_at_1) recall_at_5 = recall_at_5_eval.evaluate(samples, targets) print "** recall@5 score = %.4f " % recall_at_5 timings["valid_recall_at_5"].append(recall_at_5) mrr_at_5 = mrr_at_5_eval.evaluate(samples, targets) # MRR evaluation (equivalent to mean average precision) print "** mrr@5 score = %.4f " % mrr_at_5 timings["valid_mrr_at_5"].append(mrr_at_5) # TF-IDF cosine similarity evaluation tfidf_cs_at_1 = tfidf_cs_at_1_eval.evaluate(samples, targets) print "** tfidf-cs@1 score = %.4f " % tfidf_cs_at_1 timings["tfidf_cs_at_1"].append(tfidf_cs_at_1) tfidf_cs_at_5 = tfidf_cs_at_5_eval.evaluate(samples, targets) print "** tfidf-cs@5 score = %.4f " % tfidf_cs_at_5 timings["tfidf_cs_at_5"].append(tfidf_cs_at_5) step += 1 logger.debug("All done, exiting...")
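# The validation blocks in the training loops above all share the same
# patience-based early-stopping scheme. The idea distilled into a minimal
# standalone sketch; the threshold value is illustrative (state['cost_threshold']
# in the originals), with values slightly above 1.0 tolerating small regressions.
def update_patience(valid_costs, new_cost, patience, initial_patience,
                    cost_threshold=1.003):
    # A new best validation cost restores full patience ...
    if len(valid_costs) == 0 or new_cost < min(valid_costs):
        patience = initial_patience
    # ... while a clear regression over the previous round consumes one unit.
    elif new_cost >= valid_costs[-1] * cost_threshold:
        patience -= 1
    valid_costs.append(new_cost)
    return patience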
def train(args, state=None, commands=None):
    if commands:
        def shall_train():
            return commands['train']

        def shall_save():
            return commands['save']

        def shall_abort():
            return commands['abort']

        def saving_done():
            commands['save'] = False

    #logging.basicConfig(level = logging.DEBUG,
    #                    format = "%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    if not state:
        state = eval(args.prototype)()
    timings = init_timings()

    auto_restarting = False
    if args.auto_restart:
        assert not args.save_every_valid_iteration
        assert len(args.resume) == 0

        directory = state['save_dir']
        if not directory[-1] == '/':
            directory = directory + '/'

        auto_resume_postfix = state['prefix'] + '_auto_model.npz'

        if os.path.exists(directory):
            directory_files = [f for f in listdir(directory) if isfile(join(directory, f))]
            resume_filename = ''
            for f in directory_files:
                if len(f) > len(auto_resume_postfix):
                    if f[len(f) - len(auto_resume_postfix):len(f)] == auto_resume_postfix:
                        if len(resume_filename) > 0:
                            print 'ERROR: FOUND MULTIPLE MODELS IN DIRECTORY:', directory
                            assert False
                        else:
                            resume_filename = directory + f[0:len(f) - len('__auto_model.npz')]

            if len(resume_filename) > 0:
                logger.debug("Found model to automatically resume: %s" % resume_filename)
                auto_restarting = True
                # Set up training to automatically resume with the model found
                args.resume = resume_filename
                # Prevent training from reinitializing any parameters
                args.reinitialize_decoder_parameters = False
                args.reinitialize_latent_variable_parameters = False
            else:
                logger.debug("Could not find any model to automatically resume...")

    step = 0
    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)

        state_file = args.resume + '_state.pkl'
        if commands:
            if commands['state_path']:
                state_file = commands['state_path']
        timings_file = args.resume + '_timing.npz'

        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")

            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)
            step = timings['step'][0]

            # Increment seed to make sure we get newly shuffled batches when training on large datasets
            state['seed'] = state['seed'] + 10
        else:
            raise Exception("Cannot resume, cannot find files!")

    logger.debug("State:\n{}".format(pprint.pformat(state)))
    logger.debug("Timings:\n{}".format(pprint.pformat(timings)))

    if args.force_train_all_wordemb == True:
        state['fix_pretrained_word_embeddings'] = False

    if state['test_values_enabled']:
        train_data, \
        valid_data, = get_train_iterator(state)
        train_data.start()
        state['batch_iterator'] = train_data

    if not commands:
        model = DialogEncoderDecoder(state)
    if commands:
        commands['timings'] = timings
        if commands['resume_path']:
            model = commands['resume_path'][0]
            timings = commands['resume_path'][1]
            for key, value in timings.iteritems():
                timings[key] = list(value)
            step = timings['step'][0]
        else:
            model = DialogEncoderDecoder(state)
    rng = model.rng

    valid_rounds = 0
    save_model_on_first_valid = False

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")

            parameter_strings_to_ignore = []
            if args.reinitialize_decoder_parameters:
                parameter_strings_to_ignore += ['Wd_']
                parameter_strings_to_ignore += ['bd_']
                save_model_on_first_valid = True
            if args.reinitialize_latent_variable_parameters:
                parameter_strings_to_ignore += ['latent_utterance_prior']
                parameter_strings_to_ignore += ['latent_utterance_approx_posterior']
                parameter_strings_to_ignore += ['kl_divergence_cost_weight']
                parameter_strings_to_ignore += ['latent_dcgm_encoder']
                save_model_on_first_valid = True

            load(model, filename, parameter_strings_to_ignore)
        else:
            raise Exception("Cannot resume, cannot find model file!")

    if 'run_id' not in model.state:
        print 'Backward compatibility not ensured! (need run_id in state)'
    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.debug("Compile trainer")
    if not state["use_nce"]:
        if ('add_latent_gaussian_per_utterance' in state) and (state["add_latent_gaussian_per_utterance"]):
            logger.debug("Training using variational lower bound on log-likelihood")
        else:
            logger.debug("Training using exact log-likelihood")

        train_batch = model.build_train_function()
    else:
        logger.debug("Training with noise contrastive estimation")
        train_batch = model.build_nce_function()

    eval_batch = model.build_eval_function()

    if model.add_latent_gaussian_per_utterance:
        eval_grads = model.build_eval_grads()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)
    train_data.start()

    # Start looping through the dataset
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_kl_divergence_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0
    is_end_of_batch = True
    start_validation = False

    batch = None

    import theano.tensor
    word = 'what'
    word_idx = model.words_to_indices([word])
    initial_sum = theano.tensor.sum(model.W_emb[word_idx]).eval()
    if 'fix_W_emb_steps' in state:
        model.W_emb_pretrained_mask.set_value(numpy.zeros(model.W_emb_pretrained_mask.shape.eval(), dtype='float32'))
    #for idx in xrange(10):
    #    print theano.tensor.sum(model.W_emb[word_idx]).eval()

    total_token_time = 0
    num_tokens_processed = 0
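    # The NCE branch in the loop below needs negative samples: for each of the
    # 10 noise sequences it draws one token id per position from the model's
    # noise distribution. A minimal commented sketch of that sampling step,
    # with vocab_size and noise_probs as hypothetical stand-ins for model.idim
    # and model.noise_probs:
    #
    #   rng = numpy.random.RandomState(1234)
    #   vocab_size = 20000
    #   noise_probs = numpy.ones(vocab_size) / vocab_size  # e.g. a flat noise distribution
    #   y_neg = rng.choice(size=(10, max_length, batch_size), a=vocab_size, p=noise_probs).astype('int32')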
    while (step < state['loop_iters'] and (time.time() - start_time) / 60. < state['time_stop'] and patience >= 0):
        timings['step'] = [step]

        if 'save_at_first_iter' in state and step == 1:
            save2(model, timings, commands)

        #print 'init: ', initial_sum
        #print 'changed to: ', theano.tensor.sum(model.W_emb[word_idx]).eval()
        if 'fix_W_emb_steps' in state:
            if state['fix_W_emb_steps'] < step:
                model.W_emb_pretrained_mask.set_value(numpy.ones(model.W_emb_pretrained_mask.shape.eval(), dtype='float32'))

        if commands:
            commands['timings'] = timings
            if not shall_train():
                logging.debug('...training paused')
                wait_until(shall_train)
            if shall_save():
                logging.debug('...saving model (from command)')
                save2(model, timings, commands)
                saving_done()
            if shall_abort():
                break

        ### Sampling phase
        if step % 200 == 0:
            # First generate stochastic samples
            for param in model.params:
                print "%s = %.4f" % (param.name, numpy.sum(param.get_value()**2)**0.5)

            samples, costs = random_sampler.sample([[]], n_samples=1, n_turns=3)
            print "Sampled : {}".format(samples[0])
            if commands:
                commands['output'] = samples[0]

        ### Training phase
        batch = train_data.next()

        # Train finished
        if not batch:
            # Restart training
            logger.debug("Got None...")
            break

        logger.debug("[TRAIN] [STEP %d] - Got batch %d,%d" % (step + 1, batch['x'].shape[1], batch['max_length']))

        x_data = batch['x']
        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']
        ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

        is_end_of_batch = False
        if numpy.sum(numpy.abs(x_reset)) < 1:
            # Print when we reach the end of an example (e.g. the end of a dialogue or a document)
            # Knowing when the training procedure reaches the end is useful for diagnosing training problems
            #print 'END-OF-BATCH EXAMPLE!'
            is_end_of_batch = True

        token_time = time.time()

        if state['use_nce']:
            y_neg = rng.choice(size=(10, max_length, x_data.shape[1]), a=model.idim, p=model.noise_probs).astype('int32')
            c, kl_divergence_cost, posterior_mean_variance = train_batch(x_data, x_data_reversed, y_neg, max_length, x_cost_mask, x_reset, ran_cost_utterance, ran_decoder_drop_mask)
        else:
            c, kl_divergence_cost, posterior_mean_variance = train_batch(x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_cost_utterance, ran_decoder_drop_mask)

        # Accumulate the elapsed time of the update (not the absolute timestamp),
        # so that the words/s estimate below is correct
        token_time = time.time() - token_time
        total_token_time += token_time
        num_tokens_processed += (batch['x'].shape[1] * batch['max_length'])
        print '%.3f words/s' % (num_tokens_processed / total_token_time)

        if commands:
            commands['timings'] = timings
            commands['token_time'] += token_time
            commands['num_tokens_processed'] += (batch['x'].shape[1] * batch['max_length'])

        # Print batch statistics
        print 'cost_sum', c
        print 'cost_mean', c / float(numpy.sum(x_cost_mask))
        print 'kl_divergence_cost_sum', kl_divergence_cost
        print 'kl_divergence_cost_mean', kl_divergence_cost / float(len(numpy.where(x_data == model.eos_sym)[0]))
        print 'posterior_mean_variance', posterior_mean_variance

        if numpy.isinf(c) or numpy.isnan(c):
            logger.warn("Got NaN cost .. skipping")
            gc.collect()
            continue

        train_cost += c
        train_kl_divergence_cost += kl_divergence_cost
        train_posterior_mean_variance += posterior_mean_variance

        train_done += batch['num_preds']
        train_dialogues_done += batch['num_dialogues']

        this_time = time.time()
        if step % state['train_freq'] == 0:
            elapsed = this_time - start_time

            # Keep track of training cost for the last 'train_freq' batches
            current_train_cost = train_cost / train_done
            if prev_train_done >= 1 and abs(train_done - prev_train_done) > 0:
                current_train_cost = float(train_cost - prev_train_cost) / float(train_done - prev_train_done)

            if numpy.isinf(c) or numpy.isnan(c):
                current_train_cost = 0

            prev_train_cost = train_cost
            prev_train_done = train_done

            h, m, s = ConvertTimedelta(this_time - start_time)

            # We need to catch exceptions due to high numbers in exp
            try:
                print ".. %.2d:%.2d:%.2d %4d mb # %d bs %d maxl %d acc_cost = %.4f acc_word_perplexity = %.4f cur_cost = %.4f cur_word_perplexity = %.4f acc_mean_word_error = %.4f acc_mean_kl_divergence_cost = %.8f acc_mean_posterior_variance = %.8f" % (h, m, s,\
                    state['time_stop'] - (time.time() - start_time)/60.,\
                    step, \
                    batch['x'].shape[1], \
                    batch['max_length'], \
                    float(train_cost/train_done), \
                    math.exp(float(train_cost/train_done)), \
                    current_train_cost, \
                    math.exp(current_train_cost), \
                    float(train_misclass)/float(train_done), \
                    float(train_kl_divergence_cost/train_done), \
                    float(train_posterior_mean_variance/train_dialogues_done))
            except:
                pass

            #timings['train_progress'].append(math.exp(float(train_cost/train_done)))
            timings['train_progress'].append(math.exp(current_train_cost))

        ### Inspection phase
        # Evaluate gradient variance every 200 steps for GRU decoder
        if state['utterance_decoder_gating'].upper() == "GRU":
            if (step % 200 == 0) and (model.add_latent_gaussian_per_utterance):
                k_eval = 10

                softmax_costs = numpy.zeros((k_eval), dtype='float32')
                var_costs = numpy.zeros((k_eval), dtype='float32')
                gradients_wrt_softmax = numpy.zeros((k_eval, model.qdim_decoder, model.qdim_decoder), dtype='float32')

                for k in range(0, k_eval):
                    batch = add_random_variables_to_batch(model.state, model.rng, batch, None, False)
                    ran_cost_utterance = batch['ran_var_constutterance']
                    ran_decoder_drop_mask = batch['ran_decoder_drop_mask']
                    softmax_cost, var_cost, grads_wrt_softmax, grads_wrt_kl_divergence_cost = eval_grads(x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_cost_utterance, ran_decoder_drop_mask)
                    softmax_costs[k] = softmax_cost
                    var_costs[k] = var_cost
                    gradients_wrt_softmax[k, :, :] = grads_wrt_softmax

                print 'mean softmax_costs', numpy.mean(softmax_costs)
                print 'std softmax_costs', numpy.std(softmax_costs)

                print 'mean var_costs', numpy.mean(var_costs)
                print 'std var_costs', numpy.std(var_costs)

                print 'mean gradients_wrt_softmax', numpy.mean(numpy.abs(numpy.mean(gradients_wrt_softmax, axis=0))), numpy.mean(gradients_wrt_softmax, axis=0)
                print 'std gradients_wrt_softmax', numpy.mean(numpy.std(gradients_wrt_softmax, axis=0)), numpy.std(gradients_wrt_softmax, axis=0)

                print 'std greater than mean', numpy.where(numpy.std(gradients_wrt_softmax, axis=0) > numpy.abs(numpy.mean(gradients_wrt_softmax, axis=0)))[0].shape[0]

                Wd_s_q = model.utterance_decoder.Wd_s_q.get_value()

                print 'Wd_s_q all', numpy.sum(numpy.abs(Wd_s_q)), numpy.mean(numpy.abs(Wd_s_q))
                print 'Wd_s_q latent', numpy.sum(numpy.abs(Wd_s_q[(Wd_s_q.shape[0] - state['latent_gaussian_per_utterance_dim']):Wd_s_q.shape[0], :])), numpy.mean(numpy.abs(Wd_s_q[(Wd_s_q.shape[0] - state['latent_gaussian_per_utterance_dim']):Wd_s_q.shape[0], :]))

                print 'Wd_s_q ratio', (numpy.sum(numpy.abs(Wd_s_q[(Wd_s_q.shape[0] - state['latent_gaussian_per_utterance_dim']):Wd_s_q.shape[0], :])) / numpy.sum(numpy.abs(Wd_s_q)))

                if 'latent_gaussian_linear_dynamics' in state:
                    if state['latent_gaussian_linear_dynamics']:
                        prior_Wl_linear_dynamics = model.latent_utterance_variable_prior_encoder.Wl_linear_dynamics.get_value()
                        print 'prior_Wl_linear_dynamics', numpy.sum(numpy.abs(prior_Wl_linear_dynamics)), numpy.mean(numpy.abs(prior_Wl_linear_dynamics)), numpy.std(numpy.abs(prior_Wl_linear_dynamics))

                        approx_posterior_Wl_linear_dynamics = model.latent_utterance_variable_approx_posterior_encoder.Wl_linear_dynamics.get_value()
                        print 'approx_posterior_Wl_linear_dynamics', numpy.sum(numpy.abs(approx_posterior_Wl_linear_dynamics)), numpy.mean(numpy.abs(approx_posterior_Wl_linear_dynamics)), numpy.std(numpy.abs(approx_posterior_Wl_linear_dynamics))

                #print 'grads_wrt_softmax', grads_wrt_softmax.shape, numpy.sum(numpy.abs(grads_wrt_softmax)), numpy.abs(grads_wrt_softmax[0:5, 0:5])
                #print 'grads_wrt_kl_divergence_cost', grads_wrt_kl_divergence_cost.shape, numpy.sum(numpy.abs(grads_wrt_kl_divergence_cost)), numpy.abs(grads_wrt_kl_divergence_cost[0:5, 0:5])

        ### Evaluation phase
        if valid_data is not None and step % state['valid_freq'] == 0 and step > 1:
            start_validation = True

        # Only start validation loop once it's time to validate and once all previous batches have been reset
        if start_validation and is_end_of_batch:
            start_validation = False
            valid_data.start()
            valid_cost = 0
            valid_kl_divergence_cost = 0
            valid_posterior_mean_variance = 0

            valid_wordpreds_done = 0
            valid_dialogues_done = 0

            logger.debug("[VALIDATION START]")

            while True:
                batch = valid_data.next()

                # Validation finished
                if not batch:
                    break

                logger.debug("[VALID] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length']))

                x_data = batch['x']
                x_data_reversed = batch['x_reversed']
                max_length = batch['max_length']
                x_cost_mask = batch['x_mask']
                x_reset = batch['x_reset']
                ran_cost_utterance = batch['ran_var_constutterance']
                ran_decoder_drop_mask = batch['ran_decoder_drop_mask']

                c, kl_term, c_list, kl_term_list, posterior_mean_variance = eval_batch(x_data, x_data_reversed, max_length, x_cost_mask, x_reset, ran_cost_utterance, ran_decoder_drop_mask)

                # Reshape into matrix, where rows are validation samples and columns are tokens
                # Note that we use max_length-1 because we don't get a cost for the first token
                # (the first token is always assumed to be eos)
                c_list = c_list.reshape((batch['x'].shape[1], max_length - 1), order=(1, 0))
                c_list = numpy.sum(c_list, axis=1)

                words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
                c_list = c_list / words_in_dialogues

                if numpy.isinf(c) or numpy.isnan(c):
                    continue

                valid_cost += c
                valid_kl_divergence_cost += kl_term
                valid_posterior_mean_variance += posterior_mean_variance

                # Print batch statistics
                print 'valid_cost', valid_cost
                print 'valid_kl_divergence_cost sample', kl_term
                print 'posterior_mean_variance', posterior_mean_variance

                valid_wordpreds_done += batch['num_preds']
                valid_dialogues_done += batch['num_dialogues']

            logger.debug("[VALIDATION END]")

            valid_cost /= valid_wordpreds_done
            valid_kl_divergence_cost /= valid_wordpreds_done
            valid_posterior_mean_variance /= valid_dialogues_done

            # We need to catch exceptions due to high numbers in exp
            try:
                print "** valid cost (NLL) = %.4f, valid word-perplexity = %.4f, valid kldiv cost (per word) = %.8f, valid mean posterior variance (per word) = %.8f, patience = %d" % (float(valid_cost), float(math.exp(valid_cost)), float(valid_kl_divergence_cost), float(valid_posterior_mean_variance), patience)
            except:
                try:
                    print "** valid cost (NLL) = %.4f, patience = %d" % (float(valid_cost), patience)
                except:
                    pass

            # Check for a new best model *before* appending the current cost to the
            # timings, so the comparison is against previous validation rounds only
            if (len(timings["valid_cost"]) == 0) \
                    or (valid_cost < numpy.min(timings["valid_cost"])) \
                    or (save_model_on_first_valid and valid_rounds == 0):
                patience = state['patience']

                # Save model if there is a decrease in validation cost
                if commands:
                    save2(model, timings, commands)
                else:
                    save(model, timings)
                print 'best valid_cost', valid_cost
            elif valid_cost >= timings["valid_cost"][-1] * state['cost_threshold']:
                patience -= 1

            if args.save_every_valid_iteration:
                if commands:
                    save2(model, timings, commands)
                else:
                    save(model, timings, '_' + str(step) + '_')
            if args.auto_restart:
                if commands:
                    save2(model, timings, commands)
                else:
                    save(model, timings, '_auto_')

            timings["train_cost"].append(train_cost / train_done)
            timings["train_kl_divergence_cost"].append(train_kl_divergence_cost / train_done)
            timings["train_posterior_mean_variance"].append(train_posterior_mean_variance / train_dialogues_done)
            timings["valid_cost"].append(valid_cost)
            timings["valid_perplexity"].append(float(math.exp(valid_cost)))
            timings["valid_kl_divergence_cost"].append(valid_kl_divergence_cost)
            timings["valid_posterior_mean_variance"].append(valid_posterior_mean_variance)

            # Reset train cost and train done metrics
            train_cost = 0
            train_done = 0
            prev_train_cost = 0
            prev_train_done = 0

            # Count number of validation rounds done so far
            valid_rounds += 1

        step += 1

    logger.debug("All done, exiting...")
def main(args):
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    state = eval(args.prototype)()
    timings = init_timings()

    # NOTE: hard-coded checkpoint to analyse; this overrides any --resume argument
    args.resume = 'Que26/models/1448530885.38_testmodel__225000'
    if args.resume != "":
        logger.debug("Resuming %s" % args.resume)

        state_file = args.resume + '_state.pkl'
        timings_file = args.resume + '_timing.npz'

        if os.path.isfile(state_file) and os.path.isfile(timings_file):
            logger.debug("Loading previous state")

            state = cPickle.load(open(state_file, 'r'))
            timings = dict(numpy.load(open(timings_file, 'r')))
            for x, y in timings.items():
                timings[x] = list(y)

            # Increment seed to make sure we get newly shuffled batches when training on large datasets
            state['seed'] = state['seed'] + 10
        else:
            raise Exception("Cannot resume, cannot find files!")

    #logger.debug("State:\n{}".format(pprint.pformat(state)))
    #logger.debug("Timings:\n{}".format(pprint.pformat(timings)))

    if args.force_train_all_wordemb == True:
        state['fix_pretrained_word_embeddings'] = False

    # Load dictionary
    raw_dict = cPickle.load(open(state['dictionary'], 'r'))

    # Dictionaries to convert str to idx and vice-versa
    # Each dictionary entry has four fields: (token, token id, word frequency, document frequency)
    str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in raw_dict])
    idx_to_str = dict([(tok_id, tok) for tok, tok_id, freq, _ in raw_dict])

    model = DialogEncoderDecoder(state)
    rng = model.rng

    if args.resume != "":
        filename = args.resume + '_model.npz'
        if os.path.isfile(filename):
            logger.debug("Loading previous model")

            parameter_strings_to_ignore = []
            if args.reinitialize_decoder_parameters:
                parameter_strings_to_ignore += ['Wd_']
                parameter_strings_to_ignore += ['bd_']
            if args.reinitialize_variational_parameters:
                parameter_strings_to_ignore += ['latent_utterance_prior']
                parameter_strings_to_ignore += ['latent_utterance_approx_posterior']

            load(model, filename, parameter_strings_to_ignore)
        else:
            raise Exception("Cannot resume, cannot find model file!")

    if 'run_id' not in model.state:
        raise Exception('Backward compatibility not ensured! (need run_id in state)')
    else:
        # assign new run_id key
        model.state['run_id'] = RUN_ID

    logger.debug("Compile trainer")
    # Compile the test function
    test_batch = model.build_test_function()

    if model.add_latent_gaussian_per_utterance:
        eval_grads = model.build_eval_grads()

    random_sampler = search.RandomSampler(model)
    beam_sampler = search.BeamSampler(model)

    logger.debug("Load data")
    train_data, \
    valid_data, = get_train_iterator(state)

    # Start looping through the dataset
    step = 0
    patience = state['patience']
    start_time = time.time()

    train_cost = 0
    train_variational_cost = 0
    train_posterior_mean_variance = 0
    train_misclass = 0
    train_done = 0
    train_dialogues_done = 0.0

    prev_train_cost = 0
    prev_train_done = 0

    ex_done = 0

    batch = None

    valid_data.start()
    valid_cost = 0
    valid_variational_cost = 0
    valid_posterior_mean_variance = 0

    valid_wordpreds_done = 0
    valid_dialogues_done = 0

    # Prepare variables for plotting histogram over word-perplexities and mutual information
    valid_data_len = valid_data.data_len
    valid_cost_list = numpy.zeros((valid_data_len,))
    valid_pmi_list = numpy.zeros((valid_data_len,))

    # Prepare variables for printing the training examples the model performs best and worst on
    valid_extrema_setsize = min(state['track_extrema_samples_count'], valid_data_len)
    valid_extrema_samples_to_print = min(state['print_extrema_samples_count'], valid_extrema_setsize)

    max_stored_len = 160  # Maximum number of tokens to store for dialogues with highest and lowest validation errors
    valid_lowest_costs = numpy.ones((valid_extrema_setsize,)) * 1000
    valid_lowest_dialogues = numpy.ones((valid_extrema_setsize, max_stored_len)) * 1000
    valid_highest_costs = numpy.ones((valid_extrema_setsize,)) * (-1000)
    valid_highest_dialogues = numpy.ones((valid_extrema_setsize, max_stored_len)) * (-1000)

    logger.debug("[VALIDATION START]")

    DocMtrix = []
    NNN = 0
    while True:
        # Only analyse the first 50 validation batches
        NNN += 1
        if NNN > 50:
            break
        batch = valid_data.next()

        # Validation finished
        if not batch:
            break

        logger.debug("[VALID] - Got batch %d,%d" % (batch['x'].shape[1], batch['max_length']))

        if batch['max_length'] == state['max_grad_steps']:
            continue

        x_data = batch['x']
        x_data_reversed = batch['x_reversed']
        max_length = batch['max_length']
        x_cost_mask = batch['x_mask']
        x_semantic = batch['x_semantic']
        x_semantic_nonempty_indices = numpy.where(x_semantic >= 0)
        x_reset = batch['x_reset']
        ran_cost_utterance = batch['ran_var_constutterance']

        Gen_tar, Tar_Y, DocV = test_batch(x_data, x_data_reversed, max_length, x_cost_mask, x_semantic, x_reset, ran_cost_utterance)
        DocMtrix.append(DocV)
        print ''.join([idx_to_str[id_of_w] for id_of_w in list(x_data.T)[0]])

        # Reshape into matrix, where rows are validation samples and columns are tokens
        # Note that we use max_length-1 because we don't get a cost for the first token
        # (the first token is always assumed to be eos)
        #c_list = c_list.reshape((batch['x'].shape[1], max_length-1), order=(1, 0))
        #c_list = numpy.sum(c_list, axis=1)
        #words_in_dialogues = numpy.sum(x_cost_mask, axis=0)
        #c_list = c_list / words_in_dialogues

        #print 'Original: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Tar_Y.T)[0]])
        #print 'Generations: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Gen_tar.T)[0]])
        #print 'Test:', type(test1), test1
        #raw_input()
        """
        if numpy.isinf(c) or numpy.isnan(c):
            continue

        valid_cost += c
        valid_variational_cost += variational_cost
        valid_posterior_mean_variance += posterior_mean_variance

        print 'valid_cost', valid_cost
        print 'Original: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Tar_Y.T)[0]])
        print 'Generations: ', ''.join([idx_to_str[id_of_w] for id_of_w in list(Gen_tar.T)[0]])
        #print 'valid_variational_cost', valid_variational_cost
        #print 'posterior_mean_variance', posterior_mean_variance

        valid_wordpreds_done += batch['num_preds']
        valid_dialogues_done += batch['num_dialogues']
        """

    logger.debug("[VALIDATION END]")

    DocM = numpy.row_stack(DocMtrix)
    simM = cosine_similarity(DocM, DocM)
    meanV = numpy.mean(DocM, axis=1)
    print simM
    print meanV
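The final analysis stacks one document vector per validation batch into DocM and compares all pairs with a cosine-similarity matrix (cosine_similarity is presumably sklearn.metrics.pairwise.cosine_similarity, imported elsewhere in the script). The same matrix can be computed with plain numpy; a small sketch with hypothetical document vectors:

import numpy

def cosine_similarity_matrix(M):
    # Normalize each row to unit length; the Gram matrix of the result holds the cosines
    norms = numpy.sqrt((M ** 2).sum(axis=1, keepdims=True))
    M_unit = M / numpy.maximum(norms, 1e-8)  # guard against all-zero rows
    return numpy.dot(M_unit, M_unit.T)

# Three hypothetical 4-dimensional document vectors
DocM = numpy.array([[0.2, 0.1, 0.0, 0.7],
                    [0.2, 0.1, 0.1, 0.6],
                    [0.9, 0.0, 0.1, 0.0]])
simM = cosine_similarity_matrix(DocM)
print simM  # simM[i, j] in [-1, 1]; the diagonal is 1 up to rounding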