def get_merge_activations(self):
    '''
    Build the merge-activation model from self.args.checkpoint and run
    generate_activations() on the 'val' split.

    A VisualWordDataGenerator is created for self.args.checkpoint_dataset
    and its vocabulary is loaded from the checkpoint. The inputs are fed
    to the model as a list, in which the order of the elements is
    _crucial_.
    '''
    generator = VisualWordDataGenerator(
        self.args,
        input_dataset=self.args.checkpoint_dataset,
        hsn=self.args.hidden_size)
    self.data_generator = generator
    generator.set_vocabulary(self.args.checkpoint)
    self.vocab_len = len(generator.index2word)

    # ick: the source-vector size is only read from the generator when
    # source-language vectors are in use, otherwise it is zero.
    hsn_size = generator.hsn_size if self.use_sourcelang else 0

    builder = models.OneLayerLSTM(self.args.hidden_size,
                                  self.vocab_len,
                                  self.args.dropin,
                                  self.args.optimiser,
                                  self.args.l2reg,
                                  hsn_size=hsn_size,
                                  weights=self.args.checkpoint,
                                  gru=self.args.gru)
    self.model = builder.buildMergeActivations(
        use_image=self.use_image,
        use_sourcelang=self.use_sourcelang)

    self.generate_activations('val')
def prepare_datagenerator(self):
    '''
    Create the data generator for self.args.dataset and load the
    vocabulary stored with the best checkpoint.

    Sets self.data_gen, self.args.checkpoint (the path returned by
    find_best_checkpoint()), self.vocab_len, self.index2word and
    self.word2index.
    '''
    generator = VisualWordDataGenerator(self.args, self.args.dataset)
    self.data_gen = generator

    best_checkpoint = self.find_best_checkpoint()
    self.args.checkpoint = best_checkpoint
    generator.set_vocabulary(best_checkpoint)

    # Keep direct references to the lookup tables on the instance.
    self.vocab_len = len(generator.index2word)
    self.index2word = generator.index2word
    self.word2index = generator.word2index
def get_hidden_activations(self):
    '''
    Build the hidden-state extraction model from the best checkpoint and
    generate activations for the 'train' and 'val' splits.

    Sets self.data_generator, self.args.checkpoint, self.vocab_len and
    self.fhs. When self.args.use_predicted_tokens is set and images are
    in use, a second full Keras model (self.full_model) is also built,
    unrolled for self.args.generation_timesteps.
    '''
    self.data_generator = VisualWordDataGenerator(self.args,
                                                  self.args.dataset)
    self.args.checkpoint = self.find_best_checkpoint()
    self.data_generator.set_vocabulary(self.args.checkpoint)
    self.vocab_len = len(self.data_generator.index2word)

    # Number of timesteps the model is unrolled for: the generation
    # length when feeding back predicted tokens, otherwise the longest
    # sequence known to the data generator.
    if self.args.use_predicted_tokens:
        t = self.args.generation_timesteps
    else:
        t = self.data_generator.max_seq_len

    m = models.NIC(self.args.embed_size, self.args.hidden_size,
                   self.vocab_len,
                   self.args.dropin,
                   self.args.optimiser, self.args.l2reg,
                   weights=self.args.checkpoint,
                   gru=self.args.gru,
                   t=t)
    self.fhs = m.buildHSNActivations(use_image=self.use_image)

    # Truthiness test instead of the original `== False` comparison.
    if self.args.use_predicted_tokens and not self.args.no_image:
        gen_m = models.NIC(self.args.embed_size, self.args.hidden_size,
                           self.vocab_len,
                           self.args.dropin,
                           self.args.optimiser, self.args.l2reg,
                           weights=self.args.checkpoint,
                           gru=self.args.gru,
                           t=self.args.generation_timesteps)
        self.full_model = gen_m.buildKerasModel(use_image=self.use_image)

    self.new_generate_activations('train')
    self.new_generate_activations('val')
def generationModel(self):
    '''
    Load the best checkpoint into a OneLayerLSTM model, then generate
    sentences, compute BLEU and compute perplexity.

    The val split is used unless self.args.test is set. The inputs are
    fed to the model as a list, in which the order of the elements is
    _crucial_.
    '''
    generator = VisualWordDataGenerator(self.args, self.args.dataset)
    self.data_gen = generator
    self.args.checkpoint = self.find_best_checkpoint()
    generator.set_vocabulary(self.args.checkpoint)
    self.vocab_len = len(generator.index2word)
    self.index2word = generator.index2word
    self.word2index = generator.word2index

    # HACK FIXME unexpected problem with input_data: the source vector
    # size is hard-coded instead of being read from the data generator.
    self.hsn_size = 256 if self.use_sourcelang else 0

    builder = models.OneLayerLSTM(self.args.hidden_size,
                                  self.vocab_len,
                                  self.args.dropin,
                                  self.args.optimiser,
                                  self.args.l2reg,
                                  hsn_size=self.hsn_size,
                                  weights=self.args.checkpoint,
                                  gru=self.args.gru)
    self.model = builder.buildKerasModel(
        use_sourcelang=self.use_sourcelang,
        use_image=self.use_image)

    # Same arguments for every stage: checkpoint directory + split flag.
    for stage in (self.generate_sentences,
                  self.bleu_score,
                  self.calculate_pplx):
        stage(self.args.checkpoint, val=not self.args.test)
def prepare_datagenerator(self):
    '''
    Initialise the data generator and its datastructures, unless a valid
    data generator was already stored in self.data_generator.

    The vocabulary is loaded from self.args.existing_vocab when that is
    a non-empty string, otherwise it is extracted by the data generator.
    The resulting vocabulary size is stored in self.V.
    '''
    # Initialise the data generator if it has not yet been initialised.
    # Identity test: `== None` would invoke a user-defined __eq__.
    if self.data_generator is None:
        self.data_generator = VisualWordDataGenerator(self.args,
                                                      self.args.dataset)

    # Extract the working vocabulary from the training dataset
    if self.args.existing_vocab != "":
        self.data_generator.set_vocabulary(self.args.existing_vocab)
    else:
        self.data_generator.extract_vocabulary()

    self.V = self.data_generator.get_vocab_size()
def prepare_datagenerator(self):
    '''
    Initialise the data generator and its datastructures, unless a valid
    data generator was already stored in self.data_generator.

    Reads self.args.dataset and self.args.existing_vocab; writes
    self.data_generator (only when it was None) and self.V.
    '''
    # Initialise the data generator if it has not yet been initialised
    # (`is None` rather than `== None`: compare identity with the
    # singleton).
    if self.data_generator is None:
        self.data_generator = VisualWordDataGenerator(self.args,
                                                      self.args.dataset)

    # Extract the working vocabulary from the training dataset, unless
    # an existing vocabulary was supplied.
    if self.args.existing_vocab != "":
        self.data_generator.set_vocabulary(self.args.existing_vocab)
    else:
        self.data_generator.extract_vocabulary()

    self.V = self.data_generator.get_vocab_size()
class Sweep(object):
    '''
    Random hyperparameter search over the learning rate, the L2
    regularisation strength and the input dropout probability.
    '''

    def __init__(self, args):
        '''
        Initialise the model and set Theano debugging model if
        self.args.debug is true
        '''
        self.args = args
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.data_generator = None
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    @staticmethod
    def _sample_log_uniform(lower, upper):
        '''
        Sample 10**x with x drawn uniformly between the two bounds.

        When `lower` is integer-valued the bounds are taken to be
        exponents already; otherwise they are taken to be plain values
        and are converted with log10 first.
        '''
        if lower == ceil(lower):
            # you provided an exponent, we'll search in log-space
            return 10 ** uniform(lower, upper)
        # you provided a specific number
        return 10 ** uniform(log10(lower), log10(upper))

    def random_sweep(self):
        '''
        Start randomly sweeping through hyperparameter ranges. This
        currently only supports sweeping through the L2 regularisation
        strength, the learning rate, and the dropout probability.

        One line per run (final loss, final val_loss and the sampled
        values) is appended to ../logs/sweeper-<run_string>.log.
        '''
        model = GroundedTranslation(self.args, datagen=self.data_generator)

        log_path = "../logs/sweeper-%s.log" % self.args.run_string
        with open(log_path, "w") as handle:
            handle.write("{:3} | {:10} | {:10} | {:10} | {:10} | {:10} \n".format(
                "Run", "loss", "val_loss", "lr", "reg", "dropin"))

        for sweep in range(self.args.num_sweeps):
            # randomly sample a learning rate and an L2 regularisation
            # (the draw order lr, l2, dropout is kept so that seeded
            # runs stay reproducible)
            lr = self._sample_log_uniform(self.args.min_lr,
                                          self.args.max_lr)
            l2 = self._sample_log_uniform(self.args.min_l2,
                                          self.args.max_l2)
            drop_in = uniform(self.args.min_dropin, self.args.max_dropin)

            # modify the arguments that will be used to create the graph
            model.args.lr = lr
            model.args.l2reg = l2
            model.args.dropin = drop_in

            logger.info("Setting learning rate to: %.5e", lr)
            logger.info("Setting l2reg to: %.5e", l2)
            logger.info("Setting dropout to: %f", drop_in)

            # initialise and compile a new model
            losses = model.train_model()

            # The log is only opened for the write itself, so no handle
            # is left dangling if training raises.
            with open(log_path, "a") as handle:
                handle.write(
                    "{:3d} | {:5.5f} | {:5.5f} | {:5e} | {:5e} | {:5.4f} \n".format(
                        sweep,
                        losses.history['loss'][-1],
                        losses.history['val_loss'][-1],
                        lr, l2, drop_in))

    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its datastructures, unless a
        valid data generator was already stored in self.data_generator.
        '''
        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(self.args,
                                                          self.args.dataset)

        # Extract the working vocabulary from the training dataset
        if self.args.existing_vocab != "":
            self.data_generator.set_vocabulary(self.args.existing_vocab)
        else:
            self.data_generator.extract_vocabulary()

        self.V = self.data_generator.get_vocab_size()
class ExtractMergeActivations:
    '''
    Extract the merge-layer activations of a trained model and append
    them to a CSV file for t-SNE visualisation.
    '''

    def __init__(self, args):
        '''
        Store the parsed arguments and derive which inputs are in use.
        Theano is switched to an unoptimised, verbose configuration when
        args.debug is set.
        '''
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        # maybe use_sourcelang isn't applicable here?
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def get_merge_activations(self):
        '''
        Build the merge-activation model from self.args.checkpoint and
        generate activations for the 'val' split.

        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        '''
        self.data_generator = VisualWordDataGenerator(
            self.args,
            input_dataset=self.args.checkpoint_dataset,
            hsn=self.args.hidden_size)
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)

        if not self.use_sourcelang:
            hsn_size = 0
        else:
            hsn_size = self.data_generator.hsn_size  # ick

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin,
                                self.args.optimiser, self.args.l2reg,
                                hsn_size=hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)
        self.model = m.buildMergeActivations(
            use_image=self.use_image,
            use_sourcelang=self.use_sourcelang)

        self.generate_activations('val')

    def _merge_features(self, inputs):
        '''Run the model on `inputs`; return element 0 of every prediction.'''
        feats = self.model.predict(inputs,
                                   batch_size=self.args.batch_size,
                                   verbose=1)
        # we want the merge features
        return [f[0] for f in feats]

    def generate_activations(self, split):
        '''
        Generate merge state activations for `split` ('train' or 'val')
        and serialise them with serialise_to_csv(). Any other value of
        `split` only produces the initial log message.
        '''
        # The original literal used a backslash continuation that
        # embedded the source indentation in the logged message.
        logger.info("Generating merge state activations "
                    "from this model for %s\n", split)

        if split == 'train':
            # WARNING: this walks over the *entirety of the training
            # data*, so should not be used on non-toy training data.
            batch_start = 0
            batch_end = 0
            for train_input, _, _ in self.data_generator.yield_training_batch(
                    self.args.big_batch_size,
                    self.use_sourcelang,
                    self.use_image):
                hidden_states = self._merge_features(train_input)
                batch_end += len(hidden_states)

                # Note: serialisation happens over training batches too,
                # so only one big batch is held in memory at a time.
                self.serialise_to_csv(split, hidden_states,
                                      batch_start, batch_end)
                batch_start = batch_end

        elif split == 'val':
            val_input, _ = self.data_generator.get_data_by_split(
                'val', self.use_sourcelang, self.use_image)
            logger.info("Generating merge activations from this model for val\n")

            hidden_states = self._merge_features(val_input)

            # now serialise the hidden representations to CSV
            self.serialise_to_csv(split, hidden_states)

    def serialise_to_csv(self, split, hidden_states,
                         batch_start=None, batch_end=None):
        '''
        Serialise the hidden representations from generate_activations
        into a CSV for t-SNE visualisation.

        Rows are appended to "<run_string>-initial_hidden_features", or
        to "<run_string>-multilingual_initial_hidden_features" when
        source vectors are in use. `batch_start` and `batch_end` are
        accepted for interface compatibility and are not used.
        '''
        logger.info("Serialising merge state features from %s to csv",
                    split)
        fhf_str = "%s-initial_hidden_features" % self.args.run_string

        if self.args.source_vectors is not None:
            fhf_str = "%s-multilingual_initial_hidden_features" % self.args.run_string

        # `with` guarantees the file is closed even if savetxt raises.
        with open(fhf_str, 'a') as f:
            for h in hidden_states:
                np.savetxt(f, h, delimiter=',', newline=',')
                f.write("\n")
class GroundedTranslationGenerator:
    '''
    Generate descriptions with a dual beam search from the best model
    checkpoint and score them (BLEU, optionally MultEval, perplexity).

    Alongside the regular beam, a second beam is kept that is seeded
    with every candidate whose newest word is accepted by the keep
    function built from self.args.keep_file.
    '''

    def __init__(self, args):
        '''
        Store the arguments, prepare the data generator / vocabulary and
        open the dataset HDF5 file read-only.
        '''
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.model = None
        self.prepare_datagenerator()

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warning("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def prepare_datagenerator(self):
        '''
        Create the data generator and load the vocabulary stored with
        the best checkpoint (see find_best_checkpoint()).
        '''
        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def generate(self):
        '''
        Entry point for this module.
        Loads up a data generator to get the relevant image / source
        features. Builds the relevant model, given the command-line
        arguments. Generates sentences for the images in the val / test
        data. Calculates BLEU and PPLX, unless requested.

        Returns the BLEU score, or the first value returned by
        multeval_scores() when --multeval is set, or None when scoring
        is disabled with --without_scores.
        '''
        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = self.data_gen.hsn_size
        else:
            self.hsn_size = 0

        if self.model is None:
            self.build_model(generate=True)

        use_val = not self.args.test
        self.generate_sentences(self.args.checkpoint, val=use_val)

        # Explicit default so the return below is always bound.
        score = None
        if not self.args.without_scores:
            score = self.bleu_score(self.args.checkpoint, val=use_val)
            if self.args.multeval:
                score, _, _ = self.multeval_scores(self.args.checkpoint,
                                                   val=use_val)
            if not self.args.no_pplx:
                # Perplexity needs the model unrolled over the full
                # sequence length rather than the generation length.
                self.build_model(generate=False)
                self.calculate_pplx(self.args.checkpoint, val=use_val)
        return score

    ###########################################################################
    # Helper functions for generate_sentences()

    def get_keep_func(self):
        "Builds a keep function, given a JSON file with info on what to keep."
        with open(self.args.keep_file) as f:
            d = json.load(f)
        whole_word = set(d['WHOLEWORD'])
        # str.startswith / str.endswith accept a tuple of alternatives.
        prefixes = tuple(d['STARTSWITH'])
        suffixes = tuple(d['ENDSWITH'])

        def keep_func(word):
            "Function to determine which words to keep in the beam."
            return (word in whole_word
                    or word.startswith(prefixes)
                    or word.endswith(suffixes))

        # Return the function:
        return keep_func

    def get_candidates(self, t, beams, structs, keep_func):
        """
        Get candidate beams containing the next word.
        If the next word is one that should be kept according to
        keep_func, the beam is also added to kept_candidates and
        self.found_negation is set.

        Returns (candidates, kept_candidates); every candidate is a list
        [log probability, word indices].
        """
        # Store the candidates produced at timestep t, will be
        # pruned at the end of the timestep
        candidates = []
        kept_candidates = []

        # we take a view of the datastructures, which means we're only
        # ever generating a prediction for the next word. This saves a
        # lot of cycles.
        preds = self.model.predict(structs, verbose=0)

        # The last indices in preds are the predicted words
        next_word_probs = preds['output'][:, t-1]
        sorted_indices = np.argsort(-next_word_probs, axis=1)

        # Each instance in structs is holding the history of a
        # beam, and so there is a direct connection between the
        # index of a beam in beams and the index of an instance in
        # structs.
        for beam_idx, b in enumerate(beams):
            # the beam_width most probable next words for this beam
            top_words = sorted_indices[beam_idx][:self.args.beam_width]
            for word_index in top_words:
                word_prob = next_word_probs[beam_idx][word_index]
                # Add the log probability of the predicted word to the
                # previous log probability of the sequence, and append
                # the predicted word to the sequence of words
                current_word = self.index2word[word_index]
                updated_beam = [b[0] + math.log(word_prob),
                                b[1] + [word_index]]
                candidates.append(updated_beam)
                if keep_func(current_word):
                    logger.info("WORD KEPT: %s", current_word)
                    self.found_negation = True
                    kept_candidates.append(updated_beam)
        return candidates, kept_candidates

    def prune(self, candidates, max_beam_width, category='regular beams'):
        """
        Prune the candidates, so that we are left with max_beam_width
        beams. Also return beams that are finished as a separate list.

        `candidates` is expected to be sorted best-first. The caller is
        responsible for shrinking its own beam width by the number of
        finished beams returned here.
        """
        beams = candidates[:max_beam_width]  # prune the beams
        finished = []
        pruned = []
        for b in beams:
            # If a top candidate emitted an EOS token then
            # a) add it to the list of finished sequences
            # b) remove it from the beams and decrease the
            # maximum size of the beams.
            if b[1][-1] == self.word2index["<E>"]:
                finished.append(b)
                if max_beam_width >= 1:
                    max_beam_width -= 1
            else:
                pruned.append(b)

        beams = pruned[:max_beam_width]

        if self.args.verbose:
            logger.info("Pruned beams (%s)", category)
            logger.info("---")
            for b in beams:
                logger.info("%s(%f)",
                            " ".join([self.index2word[x] for x in b[1]]),
                            b[0])
        return beams, finished

    def get_structs(self, beams, data, max_beam_width):
        "Get structs for the next round."
        structs = self.make_duplicate_matrices(data, max_beam_width)

        # Rewrite the 1-hot word features with the
        # so-far-predicted tokens in a beam.
        for bidx, b in enumerate(beams):
            for idx, w in enumerate(b[1]):
                structs['text'][bidx, idx+1, w] = 1.
        return structs

    def log_finished(self, finished, category='regular beams'):
        "Log the Length-normalised samples."
        logger.info("Length-normalised samples(%s)", category)
        logger.info("---")
        for f in finished:
            logger.info("%s(%f)",
                        " ".join([self.index2word[x] for x in f[1]]),
                        f[0])

    @staticmethod
    def _length_normalise(beams):
        '''
        Return new [score / length, words] beams, sorted best-first.

        Normalise the probabilities by the length of the sequences as
        suggested by Graves (2012) http://arxiv.org/abs/1211.3711
        New lists are built instead of updating in place: a beam object
        can be shared between the regular and the selected beam and must
        not be divided twice, and the initial beam is an immutable tuple.
        '''
        normalised = [[b[0] / max(len(b[1]), 1), b[1]] for b in beams]
        normalised.sort(reverse=True)
        return normalised

    ###########################################################################

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to
        the first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        Writes one sentence per line to <filepath>/<prefix>Generated and
        a JSON file mapping each ident to [sentence, found_negation].
        Raises AssertionError when --beam_width is not larger than 1.

        TODO: duplicated method with generate.py
        """
        # Explicit check rather than `assert`, which is stripped under -O.
        if not self.args.beam_width > 1:
            raise AssertionError('Beam size too small. Cannot use dual beam search.')

        neg_counter = 0
        ident_desc_dict = dict()
        keep_func = self.get_keep_func()

        prefix = "val" if val else "test"
        logger.info("Generating %s descriptions", prefix)

        start_gen = self.args.generate_from_N_words  # Default 0
        start_gen = start_gen + 1  # include BOS

        generator = self.data_gen.generation_generator(prefix, batch_size=1)

        # we are going to beam search for the most probably sentence.
        # let's do this one sentence at a time to make the logging output
        # easier to understand
        with codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                         'utf-8') as handle:
            for seen, data in enumerate(generator, start=1):
                # Blank out everything after the first start_gen words so
                # the reference text cannot leak into the model. (The
                # original also collected those prefix words, but always
                # overwrote them with the best beam before use.)
                data['text'] = self.reset_text_arrays(data['text'],
                                                      start_gen)

                max_beam_width = self.args.beam_width
                neg_max_beam_width = self.args.beam_width
                structs = self.make_duplicate_matrices(data, max_beam_width)
                # Built as soon as the selected beam has members.
                neg_structs = None

                # A beam is a 2-tuple with the probability of the sequence
                # and the words in that sequence. Start with empty beams
                beams = [(0.0, [])]
                neg_beams = []
                # collects beams that are in the top candidates and
                # emitted a <E> token.
                finished = []
                neg_finished = []
                candidates = []
                neg_candidates = []

                # Flag variable. Is set to True once the first negation
                # is found.
                self.found_negation = False

                for t in range(start_gen, self.args.generation_timesteps):
                    # Ensure that kept_candidates is there. (And that
                    # previous results are removed.)
                    kept_candidates = []

                    ########################################################
                    # GET CANDIDATES
                    if max_beam_width > 0:
                        candidates, kept_candidates = self.get_candidates(
                            t, beams, structs, keep_func)
                        candidates.sort(reverse=True)
                    if self.found_negation:
                        if neg_beams:
                            # don't add neg_kc: don't add examples twice.
                            neg_c, _ = self.get_candidates(
                                t, neg_beams, neg_structs, keep_func)
                        else:
                            # Nothing to extend yet: skip the prediction
                            # (neg_structs may not exist at this point).
                            neg_c = []
                        neg_candidates = kept_candidates + neg_c
                        neg_candidates.sort(reverse=True)

                    ########################################################
                    # LOG NEW CANDIDATES
                    if self.args.verbose:
                        logger.info("Candidates in the beam")
                        logger.info("---")
                        if max_beam_width > 0:
                            logger.info("REGULAR BEAM:")
                            for c in candidates:
                                logger.info(
                                    "%s (%f)",
                                    " ".join([self.index2word[x]
                                              for x in c[1]]),
                                    c[0])
                        if self.found_negation:
                            logger.info("SEPARATE BEAM:")
                            for c in neg_candidates:
                                logger.info(
                                    "%s (%f)",
                                    " ".join([self.index2word[x]
                                              for x in c[1]]),
                                    c[0])

                    ########################################################
                    # PRUNE
                    # prune() only shrinks its own local copy of the
                    # width, so the number of newly finished beams is
                    # subtracted here; otherwise the stop decision below
                    # can never trigger.
                    beams, finished_this_round = self.prune(
                        candidates, max_beam_width,
                        category='regular beams')
                    finished.extend(finished_this_round)
                    max_beam_width -= len(finished_this_round)
                    if self.found_negation:
                        neg_beams, finished_this_round = self.prune(
                            neg_candidates, neg_max_beam_width,
                            category='selected beams')
                        neg_finished.extend(finished_this_round)
                        neg_max_beam_width -= len(finished_this_round)

                    ########################################################
                    # STOP DECISION
                    if self.found_negation:
                        if neg_max_beam_width == 0:
                            # We have sampled neg_max_beam_width sequences
                            # with an <E> token so stop the beam search.
                            break
                    elif max_beam_width == 0:
                        # We have sampled max_beam_width sequences with an
                        # <E> token so stop the beam search.
                        break

                    ########################################################
                    # UPDATE STRUCTS
                    # Reproduce the structs for the beam search so we can
                    # keep track of the state of each beam
                    if max_beam_width > 0:
                        structs = self.get_structs(
                            beams=beams, data=data,
                            max_beam_width=max_beam_width)
                    if neg_beams:
                        neg_structs = self.get_structs(
                            beams=neg_beams, data=data,
                            max_beam_width=neg_max_beam_width)

                ############################################################
                # WRAPPING UP
                # If none of the sentences emitted an <E> token while
                # decoding, add the final beams into the final candidates
                if len(finished) == 0:
                    finished.extend(beams)
                # Do the same for the neg beams.
                if self.found_negation and len(neg_finished) == 0:
                    neg_finished.extend(neg_beams)

                finished = self._length_normalise(finished)
                neg_finished = self._length_normalise(neg_finished)

                ############################################################
                # LOG FINISHED
                if self.args.verbose:
                    self.log_finished(finished, category='regular beams')
                    if self.found_negation:
                        # The original logged `finished` here as well.
                        self.log_finished(neg_finished,
                                          category='selected beams')

                # Emit the best length-normalised sequence; the selected
                # beam takes precedence once a kept word was found.
                if self.found_negation:
                    best_beam = neg_finished[0]
                else:
                    best_beam = finished[0]
                best_words = [self.index2word[x] for x in best_beam[1]]
                generated_sentence = ' '.join(
                    itertools.takewhile(lambda n: n != "<E>", best_words))

                # The description data in the JSON file is stored together
                # with a flag indicating whether or not there is a
                # negation in the sentence.
                ident_desc_dict[data['ident']] = [generated_sentence,
                                                  self.found_negation]
                handle.write(generated_sentence + "\n")

                if self.args.verbose:
                    logger.info("%s (%f)", generated_sentence, best_beam[0])

                if self.found_negation:
                    neg_counter += 1

                # Hacky way to break out of the generator; `>=` so that
                # an overshoot cannot loop forever.
                if seen >= self.data_gen.split_sizes[prefix]:
                    break

        # Filename for the JSON data: folder, split ('val' or 'test'),
        # kind of generation and the beam width used.
        json_path = "%s/%s_dual_beam_search_%s.json" % (
            filepath, prefix, str(self.args.beam_width))

        # Write the JSON data.
        with codecs.open(json_path, 'w', 'utf-8') as f:
            json.dump(ident_desc_dict, f)

        logger.info("Total number of kept sentences: %s", str(neg_counter))

    def calculate_pplx(self, path, val=True):
        """
        Compute the perplexity of the model over the val / test split and
        write it to <path>/<prefix>PPLX. Returns the perplexity.

        Splits the input data into batches of self.args.batch_size to
        reduce the memory footprint of holding all of the data in RAM.
        Targets whose word is "<P>" are excluded.
        """
        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0

        seen = 0
        for data in self.data_gen.fixed_generator(prefix):
            # Take the targets out of the input dict before predicting.
            Y_target = data.pop('output')

            preds = self.model.predict(data,
                                       verbose=0,
                                       batch_size=self.args.batch_size)

            for i in range(Y_target.shape[0]):
                for t in range(Y_target.shape[1]):
                    target_idx = np.argmax(Y_target[i, t])
                    if self.index2word[target_idx] != "<P>":
                        log_p = math.log(preds['output'][i, t, target_idx],
                                         2)
                        sum_logprobs += -log_p
                        y_len += 1

            seen += data['text'].shape[0]
            # Hacky way to break out of the generator; `>=` guards
            # against stepping over the exact split size.
            if seen >= self.data_gen.split_sizes[prefix]:
                break

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        with open("%s/%sPPLX" % (path, prefix), "w") as handle:
            handle.write("%f\n" % pplx)
        return pplx

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """
        Return a copy of text_arrays in which everything after the first
        `fixed_words` timesteps is zero, so we cannot accidentally pass
        it into the model. Helper function for generate_sentences().
        """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def make_duplicate_matrices(self, generator_data, k):
        '''
        Prepare K duplicates of the input data for a given instance
        yielded by the data generator.
        Helper function for the beam search decoder in
        generate_sentences().

        Returns a dict with 'text' plus 'img' and / or 'source',
        depending on which inputs are in use; returns None when neither
        images nor source vectors are in use.
        '''
        keys = ['text']
        if self.use_image:
            keys.append('img')
        if self.use_sourcelang:
            keys.append('source')
        if len(keys) == 1:
            # Neither extra input is enabled.
            return None

        # np.array() copies the k views into one fresh array, so the
        # duplicates never share memory with the generator's data.
        return {key: np.array([generator_data[key][0, :, :]
                               for _ in range(k)])
                for key in keys}

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run
        ID of the best checkpoint ("Best Metric" when --best_pplx is set,
        otherwise "Best loss"). Then do an ls-style function in the
        directory and return the exact path to the best model. Assumes
        only one matching prefix in the model checkpoints directory.

        When no checkpoint matches, the returned path ends in "None".
        '''
        with open("%s/summary" % self.args.model_checkpoints) as summary:
            summary_data = [x.replace("\n", "") for x in summary]

        best_id = None
        target = "Best Metric" if self.args.best_pplx else "Best loss"
        for line in summary_data:
            if line.startswith(target):
                # No break: the last matching line wins.
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            for c in os.listdir(self.args.model_checkpoints):
                if c.startswith(best_id):
                    checkpoint = c
                    break
        if checkpoint is None:
            logger.warning("No checkpoint found for '%s' in %s",
                           target, self.args.model_checkpoints)
        logger.info("Best checkpoint: %s/%s",
                    self.args.model_checkpoints, checkpoint)
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.

        Runs multi-bleu.perl through the shell and returns the score
        parsed from the first line of <directory>/<prefix>BLEU.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        # NOTE(review): the command is interpolated into a shell string;
        # `directory` must not contain shell metacharacters.
        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)],
            shell=True)
        with open("%s/%sBLEU" % (directory, prefix)) as bleufile:
            bleudata = bleufile.readline()
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        bleu = float(bleuscore.lstrip())
        return bleu

    @staticmethod
    def _parse_multeval_avg(line):
        '''Parse a "RESULT: baseline: <METRIC>: AVG: x.y" line, truncating to two decimals.'''
        value = line.split(":")[4].replace("\n", "").strip()
        lr = value.split(".")
        return float(lr[0] + "." + lr[1][0:2])

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to evaluate with Meteor, TER, and BLEU?

        Runs multeval.sh inside MULTEVAL_DIR and returns the tuple
        (meteor, bleu, ter) parsed from its output. Raises ValueError
        when one of the three result lines is missing.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        with cd(MULTEVAL_DIR):
            # stdout and stderr share one descriptor (`2>&1`); the
            # original opened the file twice, letting the two streams
            # overwrite each other.
            command = ('./multeval.sh eval --refs ../%s/%s_reference.* '
                       '--hyps-baseline ../%s/%sGenerated '
                       '--meteor.language de '
                       '--threads 4 '
                       '> multevaloutput 2>&1'
                       % (directory, prefix, directory, prefix))
            subprocess.check_call([command], shell=True)
            with open("multevaloutput") as handle:
                multdata = handle.readlines()

        scores = {}
        for line in multdata:
            for metric in ("BLEU", "METEOR", "TER"):
                if line.startswith("RESULT: baseline: %s: AVG:" % metric):
                    scores[metric] = self._parse_multeval_avg(line)

        missing = [m for m in ("METEOR", "BLEU", "TER") if m not in scores]
        if missing:
            raise ValueError("multeval output has no result line for: %s"
                             % ", ".join(missing))

        mmeteor, mbleu, mter = scores["METEOR"], scores["BLEU"], scores["TER"]
        logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f",
                    mmeteor, mbleu, mter)

        return mmeteor, mbleu, mter

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for split we are generating outputs
        for, and write them to <directory>/<prefix>_reference.ref<N>,
        one file per reference index.

        Helper function for bleu_score().
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in range(len(references[0])):
            ref_path = '%s/%s_reference.ref%d' % (directory, prefix, refid)
            # `with` flushes and closes each reference file right away.
            with codecs.open(ref_path, 'w', 'utf-8') as ref_file:
                ref_file.write('\n'.join([x[refid] for x in references]))

    def build_model(self, generate=False):
        '''
        Build a Keras model and store it in self.model.
        Helper function for generate().

        The model is unrolled for --generation_timesteps when `generate`
        is true, otherwise for the data generator's max_seq_len. An MRNN
        is built when --mrnn is set, otherwise a NIC model.
        '''
        if generate:
            t = self.args.generation_timesteps
        else:
            t = self.data_gen.max_seq_len

        # Both model classes are constructed with the same arguments.
        model_cls = models.MRNN if self.args.mrnn else models.NIC
        m = model_cls(self.args.embed_size, self.args.hidden_size,
                      self.vocab_len,
                      self.args.dropin,
                      self.args.optimiser, self.args.l2reg,
                      hsn_size=self.hsn_size,
                      weights=self.args.checkpoint,
                      gru=self.args.gru,
                      clipnorm=self.args.clipnorm,
                      t=t)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)
class Sweep(object):
    '''
    Random search over the learning rate, the L2 regularisation strength
    and the input dropout probability of a GroundedTranslation model.
    '''

    def __init__(self, args):
        '''
        Store the arguments, prepare the data generator, and switch Theano
        into its debugging configuration if self.args.debug is true.
        '''
        self.args = args
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.data_generator = None
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    @staticmethod
    def _sample_log_uniform(low, high):
        '''
        Sample a value log-uniformly between two bounds.

        If `low` is a whole number, both bounds are treated as base-10
        exponents; otherwise they are treated as literal values and are
        converted to exponents first.
        '''
        if low == ceil(low):
            # you provided an exponent, we'll search in log-space
            return 10**uniform(low, high)
        # you provided a specific number
        return 10**uniform(log10(low), log10(high))

    def random_sweep(self):
        '''
        Start randomly sweeping through hyperparameter ranges. This
        currently only supports sweeping through the L2 regularisation
        strength, the learning rate, and the dropout probability.

        One row per run (final training loss, final validation loss and the
        sampled hyperparameters) is appended to
        ../logs/sweeper-<run_string>.log.
        '''
        model = GroundedTranslation(self.args, datagen=self.data_generator)

        log_path = "../logs/sweeper-%s.log" % self.args.run_string
        with open(log_path, "w") as handle:
            handle.write(
                "{:3} | {:10} | {:10} | {:10} | {:10} | {:10} \n".format(
                    "Run", "loss", "val_loss", "lr", "reg", "dropin"))

        for sweep in range(self.args.num_sweeps):
            # randomly sample a learning rate, an L2 regularisation strength
            # and a dropout probability (in this order, so the sequence of
            # random draws is unchanged)
            lr = self._sample_log_uniform(self.args.min_lr,
                                          self.args.max_lr)
            l2 = self._sample_log_uniform(self.args.min_l2,
                                          self.args.max_l2)
            drop_in = uniform(self.args.min_dropin, self.args.max_dropin)

            # modify the arguments that will be used to create the graph
            model.args.lr = lr
            model.args.l2reg = l2
            model.args.dropin = drop_in

            logger.info("Setting learning rate to: %.5e", lr)
            logger.info("Setting l2reg to: %.5e", l2)
            logger.info("Setting dropout to: %f", drop_in)

            # initialise and compile a new model
            losses = model.train_model()

            # Only open the log once there is something to write, so the
            # handle cannot leak if training raises.
            with open(log_path, "a") as handle:
                handle.write(
                    "{:3d} | {:5.5f} | {:5.5f} | {:5e} | {:5e} | {:5.4f} \n"
                    .format(sweep, losses.history['loss'][-1],
                            losses.history['val_loss'][-1], lr, l2, drop_in))

    def prepare_datagenerator(self):
        '''
        Initialise the data generator and its vocabulary, unless a data
        generator has already been assigned to self.data_generator.

        The vocabulary is loaded from self.args.existing_vocab when that is
        a non-empty string, and extracted by the data generator otherwise.
        The resulting vocabulary size is stored in self.V.
        '''
        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(
                self.args, self.args.dataset)

        # Extract the working vocabulary from the training dataset
        if self.args.existing_vocab != "":
            self.data_generator.set_vocabulary(self.args.existing_vocab)
        else:
            self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()
class GroundedTranslationGenerator:
    '''
    Generate descriptions from the best checkpoint of a trained model and
    score them with BLEU (multi-bleu.perl) and perplexity.
    '''

    def __init__(self, args):
        '''
        Store the arguments, open the HDF5 dataset, and switch Theano into
        its debugging configuration if args.debug is true.
        '''
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warning("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def generationModel(self):
        '''
        Entry point: load the vocabulary of the best checkpoint, build a
        OneLayerLSTM model with the checkpoint weights, then generate
        sentences and compute BLEU and perplexity on the val split (or the
        test split if self.args.test is set).
        '''
        self.data_gen = VisualWordDataGenerator(self.args, self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = 256
        else:
            self.hsn_size = 0

        m = models.OneLayerLSTM(self.args.hidden_size, self.vocab_len,
                                self.args.dropin, self.args.optimiser,
                                self.args.l2reg, hsn_size=self.hsn_size,
                                weights=self.args.checkpoint,
                                gru=self.args.gru)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)

        use_val = not self.args.test
        self.generate_sentences(self.args.checkpoint, val=use_val)
        self.bleu_score(self.args.checkpoint, val=use_val)
        self.calculate_pplx(self.args.checkpoint, val=use_val)

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        The decoded sentences are written, one per line, to
        <filepath>/<prefix>Generated, where prefix is "val" or "test".

        TODO: beam search
        TODO: duplicated method with generate.py
        """
        prefix = "val" if val else "test"
        logger.info("Generating %s descriptions", prefix)
        start_gen = self.args.generate_from_N_words + 1  # include BOS

        # prepare the datastructures for generation (no batching over val)
        arrays = self.make_generation_arrays(
            prefix, start_gen, generation=self.args.use_predicted_tokens)
        N_sents = arrays[0].shape[0]
        logger.debug("Input arrays %d", len(arrays))
        logger.debug("Instances %d", len(arrays[0]))

        # Seed each sentence with the fixed input words (BOS plus any gold
        # words). The first entry is stripped again when writing, so the
        # seed must actually be stored or the first generated word is lost.
        complete_sentences = [
            [self.index2word[np.argmax(arrays[0][i, t])]
             for t in range(start_gen)]  # minimum 1
            for i in range(N_sents)
        ]

        if N_sents > 0:
            # Only peek at an instance when there is one to look at.
            logger.debug(complete_sentences[0])
            logger.debug(self.index2word[np.argmax(arrays[0][0, 0])])

        for t in range(start_gen, self.args.generation_timesteps):
            # we take a view of the datastructures, which means we're only
            # ever generating a prediction for the next word. This saves a
            # lot of cycles.
            preds = self.model.predict([arr[:, 0:t] for arr in arrays],
                                       verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds[:, -1], axis=1)
            for i, word_idx in enumerate(next_word_indices):
                # update array[0]/sentence-so-far with generated words.
                arrays[0][i, t, word_idx] = 1.
                complete_sentences[i].append(self.index2word[word_idx])

        # save each sentence until it hits the first end-of-string token
        with codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                         'utf-8') as handle:
            for s in complete_sentences:
                handle.write(' '.join(itertools.takewhile(
                    lambda n: n != "<E>", s[1:])) + "\n")

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU scoring checkpoint (or the best PPLX checkpoint if
        self.args.best_pplx is set). Then do an ls-style function in the
        directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        If no matching checkpoint is found the returned path ends in "None".
        '''
        with open("%s/summary" % self.args.model_checkpoints) as summary:
            summary_data = [x.replace("\n", "") for x in summary.readlines()]

        best_id = None
        target = "Best PPLX" if self.args.best_pplx else "Best BLEU"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            for c in os.listdir(self.args.model_checkpoints):
                if c.startswith(best_id):
                    checkpoint = c
                    break

        logger.info("Best checkpoint: %s/%s" % (self.args.model_checkpoints,
                                                checkpoint))
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def yield_chunks(self, len_split_indices, batch_size):
        '''
        Yield (start, end) index pairs that walk over len_split_indices
        items in steps of batch_size. `end` is start + batch_size - 1 and is
        not clamped, so for the last chunk it can point past the final item.
        '''
        for i in range(0, len_split_indices, batch_size):
            # yield split_indices[i:i+batch_size]
            yield (i, i + batch_size - 1)

    def make_generation_arrays(self, prefix, fixed_words, generation=False):
        """
        Create arrays that are used as input for generation.

        Returns a deep copy of the input data for the split, in which every
        word position from `fixed_words` onwards has been zeroed. The
        `generation` argument is currently unused.
        """
        # Y_target is unused
        input_data, _ = self.data_gen.get_data_by_split(
            prefix, self.use_sourcelang, self.use_image)

        # Replace input words (input_data[0]) with zeros for generation,
        # except for the first args.generate_from_N_words
        # NOTE: this will include padding and BOS steps (fixed_words has been
        # incremented accordingly already in generate_sentences().)
        logger.info("Initialising with the first %d gold words (incl BOS)",
                    fixed_words)
        gen_input_data = deepcopy(input_data)
        gen_input_data[0][:, fixed_words:, :] = 0
        return gen_input_data

    def calculate_pplx(self, directory, val=True):
        """
        Without batching. Robust against multiple descriptions/image,
        since it uses data_generator.get_data_by_split input.

        Computes 2 ** (mean negative log2 probability of every target token
        that is not "<P>"), writes it to <directory>/<prefix>PPLX and
        returns it.
        """
        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0
        input_data, Y_target = self.data_gen.get_data_by_split(
            prefix, self.use_sourcelang, self.use_image)

        if self.args.debug:
            tic = time.time()

        preds = self.model.predict(input_data, verbose=0)

        if self.args.debug:
            logger.info("Forward pass took %f", time.time() - tic)

        for t in range(Y_target.shape[1]):
            for i in range(Y_target.shape[0]):
                target_idx = np.argmax(Y_target[i, t])
                # padding tokens do not count towards the perplexity
                if self.index2word[target_idx] != "<P>":
                    log_p = math.log(preds[i, t, target_idx], 2)
                    sum_logprobs += -log_p
                    y_len += 1

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        with open("%s/%sPPLX" % (directory, prefix), "w") as handle:
            handle.write("%f\n" % pplx)
        return pplx

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for val, training subsection.

        Writes one file per reference index,
        <directory>/<prefix>_reference.ref<N>, holding the N-th reference of
        every instance, one per line.
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in range(len(references[0])):
            path = '%s/%s_reference.ref%d' % (directory, prefix, refid)
            # Close each file explicitly so the text is flushed before the
            # external BLEU script reads it.
            with codecs.open(path, 'w', 'utf-8') as ref_file:
                ref_file.write('\n'.join([x[refid] for x in references]))

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.

        Runs multi-bleu.perl over the generated sentences and tees its
        output into <directory>/<prefix>BLEU.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)],
            shell=True)
class GroundedTranslationGenerator:
    '''
    Generate descriptions from the best checkpoint of a trained model,
    either by arg-max decoding or by beam search, and score them with BLEU,
    optionally MultEval (Meteor / BLEU / TER), and perplexity.
    '''

    def __init__(self, args):
        '''
        Store the arguments, prepare the data generator and vocabulary,
        open the HDF5 dataset, and switch Theano into its debugging
        configuration if args.debug is true.
        '''
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.model = None
        self.prepare_datagenerator()

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warning("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def prepare_datagenerator(self):
        '''
        Create the data generator, locate the best checkpoint (stored in
        self.args.checkpoint) and load the vocabulary saved with it.
        '''
        self.data_gen = VisualWordDataGenerator(self.args, self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def generate(self):
        '''
        Entry point for this module.
        Loads up a data generator to get the relevant image / source features.
        Builds the relevant model, given the command-line arguments.
        Generates sentences for the images in the val / test data.
        Calculates BLEU and PPLX, unless requested.

        Returns the BLEU score, or the Meteor score when --multeval is set,
        or None when neither score was computed.
        '''
        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = self.data_gen.hsn_size
        else:
            self.hsn_size = 0

        if self.model is None:
            self.build_model(generate=True)

        use_val = not self.args.test
        self.generate_sentences(self.args.checkpoint, val=use_val)

        # Stays None when every scorer is switched off, so the final
        # return never touches an unbound name.
        score = None
        if not self.args.without_scores:
            score = self.bleu_score(self.args.checkpoint, val=use_val)
        if self.args.multeval:
            score, _, _ = self.multeval_scores(self.args.checkpoint,
                                               val=use_val)
        if not self.args.no_pplx:
            # perplexity is computed over full-length gold sequences
            self.build_model(generate=False)
            self.calculate_pplx(self.args.checkpoint, val=use_val)
        return score

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        Uses beam search when --beam_width is greater than 1 and arg-max
        decoding otherwise. The sentences are written, one per line, to
        <filepath>/<prefix>Generated.

        TODO: duplicated method with generate.py
        """
        prefix = "val" if val else "test"
        logger.info("Generating %s descriptions", prefix)
        start_gen = self.args.generate_from_N_words + 1  # include BOS
        if self.args.beam_width > 1:
            self._generate_beam_search(filepath, prefix, start_gen)
        else:
            self._generate_argmax(filepath, prefix, start_gen)

    def _generate_beam_search(self, filepath, prefix, start_gen):
        '''
        Beam search decoder, one instance at a time. Helper function for
        generate_sentences().
        '''
        end_token = self.word2index["<E>"]
        generator = self.data_gen.generation_generator(prefix, batch_size=1)
        seen = 0
        with codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                         'utf-8') as handle:
            # we are going to beam search for the most probable sentence.
            # let's do this one sentence at a time to make the logging
            # output easier to understand
            for data in generator:
                # Zero every word after the fixed prefix so gold words
                # cannot leak into the model.
                # NOTE(review): only the predicted words are written out;
                # gold prefix words from --generate_from_N_words are not.
                text = self.reset_text_arrays(data[0]['text'], start_gen)
                data[0]['text'] = text

                max_beam_width = self.args.beam_width
                structs = self.make_duplicate_matrices(data[0],
                                                       max_beam_width)

                # A beam is a 2-element list with the log probability of
                # the sequence and the words in that sequence. Lists (not
                # tuples) so the score can be normalised in place below.
                beams = [[0.0, []]]
                # collects beams that are in the top candidates and
                # emitted a <E> token.
                finished = []
                for t in range(start_gen, self.args.generation_timesteps):
                    # Store the candidates produced at timestep t, will be
                    # pruned at the end of the timestep
                    candidates = []

                    preds = self.model.predict(structs, verbose=0)

                    # The distribution over the next word for every beam
                    next_word_probs = preds[:, t - 1]
                    sorted_indices = np.argsort(-next_word_probs, axis=1)

                    # Each instance in structs is holding the history of a
                    # beam, and so there is a direct connection between the
                    # index of a beam in beams and the index of an instance
                    # in structs.
                    for beam_idx, b in enumerate(beams):
                        # get the sorted predictions for the beam_idx'th beam
                        beam_predictions = sorted_indices[beam_idx]
                        for top_idx in range(self.args.beam_width):
                            word_index = beam_predictions[top_idx]
                            word_prob = next_word_probs[beam_idx][word_index]
                            # Extend the beam with the top_idx'th word and
                            # add its log probability to the beam's score.
                            candidates.append([
                                b[0] + math.log(word_prob),
                                b[1] + [word_index]
                            ])

                    candidates.sort(reverse=True)
                    if self.args.verbose:
                        logger.info("Candidates in the beam")
                        logger.info("---")
                        for c in candidates:
                            logger.info(
                                " ".join([self.index2word[x] for x in c[1]])
                                + " (%f)" % c[0])

                    beams = candidates[:max_beam_width]  # prune the beams
                    pruned = []
                    for b in beams:
                        # If a top candidate emitted an EOS token then
                        # a) add it to the list of finished sequences
                        # b) remove it from the beams and decrease the
                        # maximum size of the beams.
                        if b[1][-1] == end_token:
                            finished.append(b)
                            if max_beam_width >= 1:
                                max_beam_width -= 1
                        else:
                            pruned.append(b)

                    beams = pruned[:max_beam_width]
                    if self.args.verbose:
                        logger.info("Pruned beams")
                        logger.info("---")
                        for b in beams:
                            logger.info(
                                " ".join([self.index2word[x] for x in b[1]])
                                + "(%f)" % b[0])

                    if max_beam_width == 0:
                        # We have sampled max_beam_width sequences with an
                        # <E> token so stop the beam search.
                        break

                    # Reproduce the structs for the beam search so we can
                    # keep track of the state of each beam
                    structs = self.make_duplicate_matrices(
                        data[0], max_beam_width)

                    # Rewrite the 1-hot word features with the
                    # so-far-predicted tokens in a beam. Predicted words
                    # start right after the fixed prefix of start_gen words.
                    for bidx, b in enumerate(beams):
                        for idx, w in enumerate(b[1]):
                            structs['text'][bidx, start_gen + idx, w] = 1.

                # If none of the sentences emitted an <E> token while
                # decoding, add the final beams into the final candidates
                if len(finished) == 0:
                    finished.extend(beams)

                # Normalise the probabilities by the length of the sequences
                # as suggested by Graves (2012) http://arxiv.org/abs/1211.3711
                for f in finished:
                    if f[1]:  # an empty sequence has nothing to normalise
                        f[0] = f[0] / len(f[1])

                finished.sort(reverse=True)
                if self.args.verbose:
                    logger.info("Length-normalised samples")
                    logger.info("---")
                    for f in finished:
                        logger.info(
                            " ".join([self.index2word[x] for x in f[1]]) +
                            "(%f)" % f[0])

                # Emit the highest (log) probability sequence, clipped at
                # the first end-of-string token
                best_beam = finished[0]
                best_words = [self.index2word[x] for x in best_beam[1]]
                decoded_str = ' '.join(itertools.takewhile(
                    lambda n: n != "<E>", best_words))
                handle.write(decoded_str + "\n")
                if self.args.verbose:
                    logger.info("%s (%f)", decoded_str, best_beam[0])

                seen += text.shape[0]
                if seen >= self.data_gen.split_sizes[prefix]:
                    # Hacky way to break out of the generator
                    break

    def _generate_argmax(self, filepath, prefix, start_gen):
        '''
        Greedy (arg-max) decoder over batches. Helper function for
        generate_sentences().
        '''
        generator = self.data_gen.generation_generator(prefix)
        seen = 0
        with codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                         'utf-8') as handle:
            for data in generator:
                text = deepcopy(data[0]['text'])
                # Seed every sentence in the batch with its first start_gen
                # words (minimum 1, the BOS token).
                complete_sentences = [
                    [self.index2word[np.argmax(text[i, t])]
                     for t in range(start_gen)]
                    for i in range(text.shape[0])
                ]

                text = self.reset_text_arrays(text, start_gen)
                data[0]['text'] = text

                for t in range(start_gen, self.args.generation_timesteps):
                    logger.debug("Input token: %s",
                                 self.index2word[np.argmax(text[0, t - 1])])
                    preds = self.model.predict(data[0], verbose=0)

                    # Look at the last indices for the words.
                    next_word_indices = np.argmax(preds[:, t - 1], axis=1)
                    logger.debug("Predicted token: %s",
                                 self.index2word[next_word_indices[0]])
                    for i, word_idx in enumerate(next_word_indices):
                        # update the sentence-so-far with generated words.
                        text[i, t, word_idx] = 1.
                        complete_sentences[i].append(
                            self.index2word[word_idx])

                sys.stdout.flush()
                # print/extract each sentence until it hits the first
                # end-of-string token
                for s in complete_sentences:
                    if self.args.verbose:
                        logger.info(
                            "%s", ' '.join(itertools.takewhile(
                                lambda n: n != "<E>", s)))
                    decoded_str = ' '.join(itertools.takewhile(
                        lambda n: n != "<E>", s[1:]))
                    handle.write(decoded_str + "\n")

                seen += text.shape[0]
                if seen >= self.data_gen.split_sizes[prefix]:
                    # Hacky way to break out of the generator
                    break

    def calculate_pplx(self, path, val=True):
        """
        Splits the input data into batches of self.args.batch_size to
        reduce the memory footprint of holding all of the data in RAM.

        Computes 2 ** (mean negative log2 probability of every target token
        that is not "<P>"), writes it to <path>/<prefix>PPLX and returns it.
        """
        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0

        generator = self.data_gen.generation_generator(prefix)
        seen = 0
        for data in generator:
            Y_target = deepcopy(data[1]['output'])
            del data[1]['output']

            preds = self.model.predict(data[0], verbose=0,
                                       batch_size=self.args.batch_size)

            for i in range(Y_target.shape[0]):
                for t in range(Y_target.shape[1]):
                    target_idx = np.argmax(Y_target[i, t])
                    # padding tokens do not count towards the perplexity
                    if self.index2word[target_idx] != "<P>":
                        log_p = math.log(preds[i, t, target_idx], 2)
                        sum_logprobs += -log_p
                        y_len += 1

            seen += data[0]['text'].shape[0]
            if seen >= self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        with open("%s/%sPPLX" % (path, prefix), "w") as handle:
            handle.write("%f\n" % pplx)
        return pplx

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """
        Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Returns a deep copy in which every position from `fixed_words`
        onwards along the second axis is zero; the input is not modified.

        Helper function for generate_sentences().
        """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def make_duplicate_matrices(self, generator_data, k):
        '''
        Prepare K duplicates of the input data for a given instance yielded
        by the data generator.

        Only the first instance of each array is duplicated. The result is
        a dictionary with a 'text' entry, plus 'img' when images are used
        and 'src' when source-language features are used.

        Helper function for the beam search decoder in
        generate_sentences().
        '''
        words = generator_data['text']
        # Make a deep copy of the word_feats structures
        # so the arrays will never be shared
        dupes = {
            'text': np.array([deepcopy(words[0, :, :]) for _ in range(k)])
        }
        if self.use_image:
            img = generator_data['img']
            dupes['img'] = np.array([img[0, :, :] for _ in range(k)])
        if self.use_sourcelang:
            source = generator_data['src']
            dupes['src'] = np.array([source[0, :, :] for _ in range(k)])
        return dupes

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID of
        the best scoring checkpoint ("Best loss" if self.args.best_pplx is
        set, "Best Metric" otherwise). Then do an ls-style function in the
        directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        If no matching checkpoint is found the returned path ends in "None".
        '''
        with open("%s/summary" % self.args.model_checkpoints) as summary:
            summary_data = [x.replace("\n", "") for x in summary.readlines()]

        best_id = None
        target = "Best loss" if self.args.best_pplx else "Best Metric"
        for line in summary_data:
            if line.startswith(target):
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            for c in os.listdir(self.args.model_checkpoints):
                if c.startswith(best_id):
                    checkpoint = c
                    break

        logger.info("Best checkpoint: %s/%s" % (self.args.model_checkpoints,
                                                checkpoint))
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.

        Runs multi-bleu.perl, tees its output into
        <directory>/<prefix>BLEU and returns the BLEU score parsed from the
        first line of that file.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        subprocess.check_call([
            'perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
            % (directory, prefix, directory, prefix, directory, prefix)
        ], shell=True)
        with open("%s/%sBLEU" % (directory, prefix)) as bleu_file:
            bleudata = bleu_file.readline()
        # the score is whatever follows the "=" in the first
        # comma-separated field of the line
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        return float(bleuscore.lstrip())

    @staticmethod
    def _parse_multeval_average(line):
        '''
        Return the AVG value of a MultEval "RESULT: baseline: X: AVG:" line
        as a float truncated to two decimal places.
        '''
        value = line.split(":")[4].replace("\n", "").strip()
        whole, _, fraction = value.partition(".")
        return float(whole + "." + fraction[0:2])

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to evaluate with Meteor, TER, and BLEU?

        Runs multeval.sh inside MULTEVAL_DIR and parses the averaged
        scores from its output. Returns a (Meteor, BLEU, TER) tuple.
        Raises ValueError if any of the three RESULT lines is missing.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        with cd(MULTEVAL_DIR):
            # stderr is duplicated onto stdout: redirecting both streams to
            # the same file separately lets them overwrite each other.
            subprocess.check_call([
                './multeval.sh eval --refs ../%s/%s_reference.* '
                '--hyps-baseline ../%s/%sGenerated '
                '--meteor.language %s '
                '--threads 4 '
                '> multevaloutput 2>&1'
                % (directory, prefix, directory, prefix,
                   self.args.meteor_lang)
            ], shell=True)
            with open("multevaloutput") as handle:
                multdata = handle.readlines()

        scores = {}
        for line in multdata:
            for metric in ("BLEU", "METEOR", "TER"):
                if line.startswith("RESULT: baseline: %s: AVG:" % metric):
                    scores[metric] = self._parse_multeval_average(line)

        missing = [m for m in ("METEOR", "BLEU", "TER") if m not in scores]
        if missing:
            raise ValueError("multeval output has no RESULT line for: %s"
                             % ", ".join(missing))

        mmeteor, mbleu, mter = (scores["METEOR"], scores["BLEU"],
                                scores["TER"])
        logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f", mmeteor,
                    mbleu, mter)

        return mmeteor, mbleu, mter

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for split we are generating outputs for.

        Writes one file per reference index,
        <directory>/<prefix>_reference.ref<N>, holding the N-th reference of
        every instance, one per line.

        Helper function for bleu_score().
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in range(len(references[0])):
            path = '%s/%s_reference.ref%d' % (directory, prefix, refid)
            # Close each file explicitly so the text is flushed before the
            # external scoring scripts read it.
            with codecs.open(path, 'w', 'utf-8') as ref_file:
                ref_file.write('\n'.join([x[refid] for x in references]))

    def build_model(self, generate=False):
        '''
        Build a Keras model and store it in self.model. The number of
        timesteps is --generation_timesteps when `generate` is true and the
        data generator's max_seq_len otherwise. An MRNN is built when
        --mrnn is set, a NIC model otherwise.

        Helper function for generate().
        '''
        if generate:
            t = self.args.generation_timesteps
        else:
            t = self.data_gen.max_seq_len

        # both architectures take exactly the same constructor arguments
        model_class = models.MRNN if self.args.mrnn else models.NIC
        m = model_class(self.args.embed_size, self.args.hidden_size,
                        self.vocab_len, self.args.dropin,
                        self.args.optimiser, self.args.l2reg,
                        hsn_size=self.hsn_size,
                        weights=self.args.checkpoint,
                        gru=self.args.gru,
                        clipnorm=self.args.clipnorm,
                        t=t)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)
class ExtractFinalHiddenStateActivations:
    """
    Run a trained model over the dataset splits and serialise, for every
    instance, the hidden state at the end of its description into the H5
    dataset held by the data generator.

    The hidden states are stored under the name held in
    `self.h5_dataset_str`, which is built from the token source
    (gold/predicted), the encoder type (mt_enc/vis_enc) and the hidden size.
    """

    def __init__(self, args):
        """
        Store the run arguments and derive the H5 dataset name.

        `args` is the parsed command-line namespace; note that
        `args.generate_from_N_words` is overwritten with 0 here.
        """
        self.args = args
        self.args.generate_from_N_words = 0  # Default 0
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0
        # Fallback timestep when a sequence never emits "<E>"
        # (consistent with models.py).
        self.MAX_HT = self.args.generation_timesteps - 1

        # maybe use_sourcelang isn't applicable here?
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

        self.source_type = "predicted" if self.args.use_predicted_tokens else "gold"
        self.source_encoder = "mt_enc" if self.args.no_image else "vis_enc"
        self.source_dim = self.args.hidden_size
        self.h5_dataset_str = "%s-hidden_feats-%s-%d" % (self.source_type,
                                                         self.source_encoder,
                                                         self.source_dim)
        logger.info("Serialising into %s", self.h5_dataset_str)

    def get_hidden_activations(self):
        """
        Build the activation-extraction model from the best checkpoint and
        serialise hidden states for the train, val and test splits.

        In the model, we will merge the VGG image representation with
        the word embeddings. We need to feed the data as a list, in which
        the order of the elements in the list is _crucial_.
        """
        self.data_generator = VisualWordDataGenerator(self.args,
                                                      self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_generator.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_generator.index2word)

        # Predicted sequences are unrolled for generation_timesteps; gold
        # sequences use the data generator's maximum sequence length.
        if self.args.use_predicted_tokens:
            t = self.args.generation_timesteps
        else:
            t = self.data_generator.max_seq_len

        m = models.NIC(self.args.embed_size, self.args.hidden_size,
                       self.vocab_len,
                       self.args.dropin,
                       self.args.optimiser, self.args.l2reg,
                       weights=self.args.checkpoint,
                       gru=self.args.gru,
                       t=t)
        self.fhs = m.buildHSNActivations(use_image=self.use_image)

        # A second, full model is only needed to decode the predicted tokens.
        if self.args.use_predicted_tokens and self.args.no_image == False:
            gen_m = models.NIC(self.args.embed_size, self.args.hidden_size,
                               self.vocab_len,
                               self.args.dropin,
                               self.args.optimiser, self.args.l2reg,
                               weights=self.args.checkpoint,
                               gru=self.args.gru,
                               t=self.args.generation_timesteps)
            self.full_model = gen_m.buildKerasModel(use_image=self.use_image)

        for split in ('train', 'val', 'test'):
            self.new_generate_activations(split)

    def _first_eos_index(self, target):
        """
        Return the first timestep of `target` whose argmax maps to "<E>" in
        the vocabulary, or None when no such timestep exists.
        """
        index2word = self.data_generator.index2word
        for widx, warr in enumerate(target):
            if index2word[np.argmax(warr)] == "<E>":
                return widx
        return None

    def new_generate_activations(self, split):
        """
        Generate and serialise final-timestep hidden state activations
        into --dataset, one generator batch at a time.

        The hidden state of an instance is taken at the first "<E>" in its
        target sequence, or at `self.MAX_HT` when there is no "<E>".
        Splits other than 'train', 'val' and 'test' are ignored.

        TODO: we should be able to serialise predicted final states instead
        of gold-standard final states for val and test data.
        """
        logger.info("%s: extracting final hidden state activations from this model",
                    split)

        # Prepare the data generator based on whether we're going to work with
        # the gold standard input tokens or the automatically predicted tokens
        if self.args.use_predicted_tokens:
            the_generator = self.data_generator.generation_generator(split=split)
        else:
            the_generator = self.data_generator.fixed_generator(split=split)

        if split not in ('train', 'val', 'test'):
            return

        processed = 0
        for data in the_generator:
            if self.args.use_predicted_tokens:
                tokens = self.get_predicted_tokens(data)
                data['text'] = self.set_text_arrays(tokens, data['text'])

            # We extract the FHS from either the oracle or predicted tokens
            hsn = self.fhs.predict({'text': data['text'],
                                    'img': data['img']},
                                   batch_size=self.args.batch_size,
                                   verbose=1)

            # Pick the final hidden state on a sentence-by-sentence basis.
            hidden_states = []
            for idx, h in enumerate(hsn['rnn']):
                eos_idx = self._first_eos_index(data['output'][idx])
                if eos_idx is None:
                    eos_idx = self.MAX_HT
                else:
                    logger.debug(eos_idx)
                hidden_states.append(h[eos_idx])

            # Serialisation happens batch-by-batch so that the hidden states
            # of a whole split are never held in memory at once.
            self.to_h5_indices(split, data['indices'], hidden_states)

            processed += len(hidden_states)
            logger.info("Processed %d instances", processed)
            # The generator is not assumed to terminate by itself; stop once
            # the whole split has been seen.
            if processed >= self.data_generator.split_sizes[split]:
                break

    def get_predicted_tokens(self, data):
        """
        We're not going to work with the gold standard input tokens.
        Instead we're going to automatically predict them and then extract
        the final hidden state from the inferred data.

        `data['text']` is replaced by an array that holds the first
        generate_from_N_words + 1 gold tokens followed by the arg-max
        decoded tokens. Returns one token list per instance, cut before the
        first "<E>".

        Helper function used by new_generate_activations().
        """
        index2word = self.data_generator.index2word

        # We are going to arg max decode a sequence.
        start_gen = self.args.generate_from_N_words + 1  # include BOS

        text = deepcopy(data['text'])
        n_instances = text.shape[0]

        # Append the first start_gen words to the complete_sentences list
        # for each instance in the batch.
        complete_sentences = [[] for _ in range(n_instances)]
        for t in range(start_gen):  # minimum 1
            for i in range(n_instances):
                complete_sentences[i].append(index2word[np.argmax(text[i, t])])

        # Blank out everything after the fixed prefix so that no gold token
        # can leak into the decoder.
        data['text'] = self.reset_text_arrays(text, start_gen)

        for t in range(start_gen, self.args.generation_timesteps):
            logger.debug("Input token: %s",
                         index2word[np.argmax(data['text'][0, t - 1])])
            preds = self.full_model.predict(data, verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds['output'][:, t - 1], axis=1)
            logger.debug("Predicted token: %s",
                         index2word[next_word_indices[0]])

            # update the sentence-so-far with the generated words.
            for i, word_idx in enumerate(next_word_indices):
                data['text'][i, t, word_idx] = 1.
                complete_sentences[i].append(index2word[word_idx])

        # extract each sentence until it hits the first end-of-string token
        return [list(itertools.takewhile(lambda n: n != "<E>", s))
                for s in complete_sentences]

    def set_text_arrays(self, predicted_tokens, text_arrays):
        """
        Set the values of the text tokens in the text arrays based on the
        tokens predicted by the model.

        Returns a copy of `text_arrays` in which, for instance i and token
        position j, the entry for `predicted_tokens[i][j]` is set to 1.
        `text_arrays` itself is left untouched.

        Helper function used by new_generate_activations()
        """
        word2index = self.data_generator.word2index
        new_arrays = deepcopy(text_arrays)
        # Write into the rows of the copy; writing into rows of text_arrays
        # would silently modify the caller's data.
        for row, toks in zip(new_arrays, predicted_tokens):
            for tidx, tok in enumerate(toks):
                row[tidx, word2index[tok]] = 1
        return new_arrays

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """
        Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Returns a copy of `text_arrays` in which every timestep from
        `fixed_words` onwards is zero.

        Helper function for generate_sentences().
        """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def generate_activations(self, split):
        """
        Generate and serialise final-timestep hidden state activations
        into --dataset.

        For 'train' the data is processed in big batches; for 'val' and
        'test' the whole split is processed at once.

        TODO: we should be able to serialise predicted final states instead
        of gold-standard final states for val and test data.
        """
        logger.info("%s: extracting final hidden state activations from this model",
                    split)
        use_predictions = (self.args.use_predicted_tokens is True and
                           self.args.no_image is False)

        if split == 'train':
            batches = self.data_generator.yield_training_batch(
                self.args.big_batch_size,
                self.use_sourcelang,
                self.use_image,
                return_keys=True)
            for train_input, trainY, indicator, keys in batches:
                if use_predictions:
                    # Reset the word indices and then generate the
                    # descriptions of the images from scratch
                    fixed_words = self.args.generate_from_N_words + 1
                    train_input[0][:, fixed_words:, :] = 0
                    predicted_words = self.generate_sentences(split,
                                                              arrays=train_input)
                    self.sentences_to_h5_keys(split, keys, predicted_words)

                    # TODO: code duplication from make_generation_arrays
                    pred_inputs = deepcopy(train_input)
                    tokens = pred_inputs[0]
                    # NOTE(review): only timestep `fixed_words` is reset
                    # here, not `fixed_words:` -- confirm this is intended.
                    tokens[:, fixed_words, :] = 0  # reset the inputs
                    for prediction, words in zip(predicted_words, tokens):
                        for idx, t in enumerate(prediction):
                            words[idx, self.data_generator.word2index[t]] = 1.
                    trainY = self.data_generator.get_target_descriptions(tokens)

                # We extract the FHS from the (oracle or regenerated)
                # training input tokens
                hsn = self.fhs.predict(train_input,
                                       batch_size=self.args.batch_size,
                                       verbose=1)
                logger.info(len(hsn))

                hidden_states = []
                for idx, h in enumerate(hsn):
                    # get final_hidden index on a sentence-by-sentence
                    # basis by searching for the first <E> in each trainY
                    eos_idx = self._first_eos_index(trainY[idx])
                    if eos_idx is None:
                        # Same fallback as new_generate_activations()
                        # rather than a hard-coded timestep.
                        eos_idx = self.MAX_HT
                    hidden_states.append(h[eos_idx])
                logger.info(len(hidden_states))

                # Note: serialisation happens over training batches too.
                # KEYS ARE OVER IMAGES NOT DESCRIPTIONS
                # THIS WILL BREAK IF THERE ARE MULTIPLE DESCRIPTIONS/IMAGE
                self.serialise_to_h5_keys(split, keys, hidden_states)

        elif split == 'val' or split == "test":
            # TODO: get keys and do serialise_to_h5 with keys.
            inputs, Ys = self.data_generator.get_data_by_split(split,
                                                               self.use_sourcelang,
                                                               self.use_image)

            # We can extract the FHS from either oracle or predicted word
            # sequences for val / test data.
            if use_predictions:
                predicted_words = self.generate_sentences(split)
                self.sentences_to_h5(split, predicted_words)
                inputs, Ys = self.make_generation_arrays(
                    split,
                    self.args.generate_from_N_words,
                    predicted_tokens=predicted_words)

            hsn = self.fhs.predict(inputs,
                                   batch_size=self.args.batch_size,
                                   verbose=1)

            hidden_states = []
            for idx, h in enumerate(hsn):
                # get final_hidden index on a sentence-by-sentence
                # basis by searching for the first <E> in each target
                eos_idx = self._first_eos_index(Ys[idx])
                # NOTE(review): sequences without <E> contribute no hidden
                # state here, unlike the training branch -- confirm that the
                # serialised keys stay aligned in that case.
                if eos_idx is not None:
                    logger.debug("Sentence length %d", eos_idx)
                    hidden_states.append(h[eos_idx])

            # now serialise the hidden representations in the h5
            self.serialise_to_h5(split, len(hidden_states[0]), hidden_states)

    def make_generation_arrays(self, prefix, fixed_words,
                               predicted_tokens=None):
        """
        Create arrays that are used as input for generation / activation.

        With `predicted_tokens`, returns `(inputs, targets)` where the word
        inputs carry the predicted tokens and the targets are recomputed
        from them. Without, returns only the inputs, with every word after
        the first `fixed_words` timesteps zeroed.
        """
        if predicted_tokens is not None:
            input_data, targets = self.data_generator.get_data_by_split(
                prefix, self.use_sourcelang, self.use_image)
            logger.info("Initialising generation arrays with predicted tokens")
            gen_input_data = deepcopy(input_data)
            tokens = gen_input_data[0]
            # NOTE(review): only timestep `fixed_words` is reset here, not
            # `fixed_words:` -- confirm this is intended.
            tokens[:, fixed_words, :] = 0  # reset the inputs
            # targets takes part in the zip so that the number of processed
            # instances is bounded by it as well.
            for prediction, words, _tgt in zip(predicted_tokens, tokens, targets):
                for idx, t in enumerate(prediction):
                    words[idx, self.data_generator.word2index[t]] = 1.
            targets = self.data_generator.get_target_descriptions(tokens)
            return gen_input_data, targets

        # Replace input words (input_data[0]) with zeros for generation,
        # except for the first args.generate_from_N_words
        # NOTE: this will include padding and BOS steps (fixed_words has been
        # incremented accordingly already in generate_sentences().)
        input_data = self.data_generator.get_generation_data_by_split(
            prefix, self.use_sourcelang, self.use_image)
        logger.info("Initialising with the first %d gold words (incl BOS)",
                    fixed_words)
        gen_input_data = deepcopy(input_data)
        gen_input_data[0][:, fixed_words:, :] = 0
        return gen_input_data

    def generate_sentences(self, split, arrays=None):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        `arrays[0]` is updated in place with the generated words. Returns
        one token list per sentence.

        TODO: beam search
        TODO: duplicated method with generate.py and Callbacks.py
        """
        logger.info("%s: generating descriptions", split)
        index2word = self.data_generator.index2word

        start_gen = self.args.generate_from_N_words + 1  # include BOS

        # prepare the datastructures for generation (no batching over val)
        # `is None`: an equality test against None is not a reliable check
        # for containers of arrays.
        if arrays is None:
            arrays = self.make_generation_arrays(split, start_gen)
        N_sents = arrays[0].shape[0]

        complete_sentences = [[] for _ in range(N_sents)]
        for t in range(start_gen):  # minimum 1
            for i in range(N_sents):
                complete_sentences[i].append(
                    index2word[np.argmax(arrays[0][i, t])])

        for t in range(start_gen, self.args.generation_timesteps):
            # we take a view of the datastructures, which means we're only
            # ever generating a prediction for the next word. This saves a
            # lot of cycles.
            preds = self.full_model.predict([arr[:, 0:t] for arr in arrays],
                                            verbose=0)

            # Look at the last indices for the words.
            next_word_indices = np.argmax(preds[:, -1], axis=1)
            # update array[0]/sentence-so-far with generated words.
            for i, word_idx in enumerate(next_word_indices):
                arrays[0][i, t, word_idx] = 1.
                complete_sentences[i].append(index2word[word_idx])

        # extract each sentence until it hits the first end-of-string token
        return [list(itertools.takewhile(lambda n: n != "<E>", s))
                for s in complete_sentences]

    def to_h5_indices(self, split, indices, hidden_states):
        """
        Serialise one hidden state per entry of `indices`.

        Each entry of `indices` is indexed as (identifier, description
        index); `hidden_states[i]` belongs to `indices[i]`.
        """
        hsn_shape = len(hidden_states[0])
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        for idx, data_key in enumerate(indices):
            ident = data_key[0]
            desc_idx = data_key[1]
            self.data_generator.set_source_features(split, ident,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape, desc_idx)

    def serialise_to_h5_keys(self, split, data_keys, hidden_states,
                             batch_start=None, batch_end=None):
        """
        Serialise one hidden state per key in `data_keys`.

        `batch_start` and `batch_end` are accepted for compatibility with
        callers that pass batch offsets; they are not used because the keys
        already identify the instances.
        """
        hsn_shape = len(hidden_states[0])
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        for idx, data_key in enumerate(data_keys):
            self.data_generator.set_source_features(split, data_key,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape)

    def sentences_to_h5(self, split, sentences):
        """
        Save the predicted sentences into the h5 dataset object.
        This is useful for subsequently (i.e. in a different program)
        extracting LM-only final hidden states from predicted sentences.
        Specifically, this can be compared to generating LM-only hidden
        states over gold-standard tokens.

        Keys are the zero-padded positions "000000", "000001", ...; the
        first token of every sentence is dropped before saving.
        """
        logger.info("Serialising sentences from %s to H5", split)
        for idx, sentence in enumerate(sentences):
            self.data_generator.set_predicted_description(split,
                                                          "%06d" % idx,
                                                          sentence[1:])

    def sentences_to_h5_keys(self, split, data_keys, sentences):
        """
        Save `sentences[i]` as the predicted description of `data_keys[i]`.
        """
        logger.info("Serialising sentences from %s to H5", split)
        for idx, data_key in enumerate(data_keys):
            self.data_generator.set_predicted_description(split, data_key,
                                                          sentences[idx])

    def serialise_to_h5(self, split, hsn_shape, hidden_states,
                        batch_start=None, batch_end=None):
        """
        Serialise the hidden representations from generate_activations
        into the h5 dataset.
        This assumes one hidden_state per image key, which is maybe not
        appropriate if there are multiple descriptions/image.

        Keys are zero-padded integers: range(batch_start, batch_end) when
        `batch_start` is given, otherwise range(len(hidden_states)).
        """
        logger.info("Serialising final hidden state features from %s to H5",
                    split)
        if batch_start is not None:
            logger.info("Start at %d, end at %d", batch_start, batch_end)
            data_keys = ["%06d" % x for x in range(batch_start, batch_end)]
            assert len(hidden_states) == len(data_keys),\
                "keys: %d hidden %d; start %d end %d" % (len(data_keys),
                                                         len(hidden_states),
                                                         batch_start,
                                                         batch_end)
        else:
            data_keys = ["%06d" % x for x in range(len(hidden_states))]

        for idx, data_key in enumerate(data_keys):
            self.data_generator.set_source_features(split, data_key,
                                                    self.h5_dataset_str,
                                                    hidden_states[idx],
                                                    hsn_shape)

    def find_best_checkpoint(self):
        """
        Read the summary file from the directory and scrape out the run ID of
        the highest BLEU scoring checkpoint (or the best perplexity one when
        --best_pplx is set). Then do an ls-style function in the directory
        and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        When nothing matches, a warning is logged and the returned path ends
        in "None".
        """
        target = "Best PPLX" if self.args.best_pplx else "Best BLEU"
        best_id = None
        with open("%s/summary" % self.args.model_checkpoints) as handle:
            for line in handle:
                # The last matching line in the summary wins.
                if line.startswith(target):
                    best_id = "%03d" % int(line.split(":")[1].split("|")[0])

        checkpoint = None
        if best_id is not None:
            # Sorted so that the choice does not depend on directory order.
            candidates = sorted(os.listdir(self.args.model_checkpoints))
            checkpoint = next((c for c in candidates
                               if c.startswith(best_id)), None)
        if checkpoint is None:
            logger.warning("No checkpoint found for '%s' in %s",
                           target, self.args.model_checkpoints)
        return "%s/%s" % (self.args.model_checkpoints, checkpoint)
class GroundedTranslation(object):
    """
    Train a description-generation model (models.MRNN or models.NIC) on the
    data served by a VisualWordDataGenerator.
    """

    def __init__(self, args, datagen=None):
        """
        Initialise the model and set Theano debugging model if
        self.args.debug is true. Prepare the data generator if necessary.

        `args` is the parsed command-line namespace; `datagen` is an
        optional, already constructed data generator.
        """
        self.args = args
        self.data_generator = datagen
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.log_run_arguments()
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    def train_model(self):
        """
        Initialise the data generator to process the data in a memory-friendly
        manner. Then build the Keras model, given the user-specified arguments
        (or the initial defaults). Train the model for self.args.max_epochs
        and return the training and validation losses.

        The losses object contains a history variable. The history variable is
        a dictionary with a list of training and validation losses:

        losses.history.['loss']
        losses.history.['val_loss']
        """
        # The source-language feature size only matters when source vectors
        # are used.
        if self.use_sourcelang:
            hsn_size = self.data_generator.hsn_size  # ick
        else:
            hsn_size = 0

        # Both architectures take exactly the same constructor arguments.
        model_class = models.MRNN if self.args.mrnn else models.NIC
        m = model_class(self.args.embed_size, self.args.hidden_size, self.V,
                        self.args.dropin,
                        self.args.optimiser, self.args.l2reg,
                        hsn_size=hsn_size,
                        weights=self.args.init_from_checkpoint,
                        gru=self.args.gru,
                        clipnorm=self.args.clipnorm,
                        t=self.data_generator.max_seq_len,
                        lr=self.args.lr)

        model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                  use_image=self.use_image)

        callbacks = CompilationOfCallbacks(self.data_generator.word2index,
                                           self.data_generator.index2word,
                                           self.args,
                                           self.args.dataset,
                                           self.data_generator,
                                           use_sourcelang=self.use_sourcelang,
                                           use_image=self.use_image)

        train_generator = self.data_generator.random_generator('train')
        train_size = self.data_generator.split_sizes['train']
        val_generator = self.data_generator.fixed_generator('val')
        val_size = self.data_generator.split_sizes['val']

        losses = model.fit_generator(generator=train_generator,
                                     samples_per_epoch=train_size,
                                     nb_epoch=self.args.max_epochs,
                                     verbose=1,
                                     callbacks=[callbacks],
                                     nb_worker=1,
                                     validation_data=val_generator,
                                     nb_val_samples=val_size)

        return losses

    def prepare_datagenerator(self):
        """
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init() function.

        Sets self.V to the vocabulary size reported by the data generator.
        """
        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(self.args,
                                                          self.args.dataset)

            # Extract the working vocabulary from the training dataset
            # NOTE(review): the vocabulary set-up is taken to belong to the
            # freshly created generator only, as the docstring describes --
            # confirm against callers that pass their own generator.
            if self.args.existing_vocab != "":
                self.data_generator.set_vocabulary(self.args.existing_vocab)
            else:
                self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()

    def log_run_arguments(self):
        """
        Save the command-line arguments, along with the method defaults,
        used to parameterise this run.
        """
        logger.info("Run arguments:")
        # items() exists on both Python 2 and 3, unlike iteritems().
        for arg, value in vars(self.args).items():
            logger.info("%s: %s", arg, str(value))
class GroundedTranslation(object):
    """
    Train a description-generation model (models.MRNN or models.NIC) on the
    data served by a VisualWordDataGenerator.
    """

    def __init__(self, args, datagen=None):
        """
        Initialise the model and set Theano debugging model if
        self.args.debug is true. Prepare the data generator if necessary.

        `args` is the parsed command-line namespace; `datagen` is an
        optional, already constructed data generator.
        """
        self.args = args
        self.data_generator = datagen
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.log_run_arguments()
        self.prepare_datagenerator()

        if self.args.debug:
            theano.config.optimizer = 'fast_compile'
            theano.config.exception_verbosity = 'high'

    def train_model(self):
        """
        Initialise the data generator to process the data in a memory-friendly
        manner. Then build the Keras model, given the user-specified arguments
        (or the initial defaults). Train the model for self.args.max_epochs
        and return the training and validation losses.

        The losses object contains a history variable. The history variable is
        a dictionary with a list of training and validation losses:

        losses.history.['loss']
        losses.history.['val_loss']
        """
        # The source-language feature size only matters when source vectors
        # are used.
        if self.use_sourcelang:
            hsn_size = self.data_generator.hsn_size  # ick
        else:
            hsn_size = 0

        # Both architectures take exactly the same constructor arguments.
        model_class = models.MRNN if self.args.mrnn else models.NIC
        m = model_class(self.args.embed_size, self.args.hidden_size, self.V,
                        self.args.dropin,
                        self.args.optimiser, self.args.l2reg,
                        hsn_size=hsn_size,
                        weights=self.args.init_from_checkpoint,
                        gru=self.args.gru,
                        clipnorm=self.args.clipnorm,
                        t=self.data_generator.max_seq_len,
                        lr=self.args.lr)

        model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                  use_image=self.use_image)

        callbacks = CompilationOfCallbacks(self.data_generator.word2index,
                                           self.data_generator.index2word,
                                           self.args,
                                           self.args.dataset,
                                           self.data_generator,
                                           use_sourcelang=self.use_sourcelang,
                                           use_image=self.use_image)

        train_generator = self.data_generator.random_generator('train')
        train_size = self.data_generator.split_sizes['train']
        val_generator = self.data_generator.fixed_generator('val')
        val_size = self.data_generator.split_sizes['val']

        losses = model.fit_generator(generator=train_generator,
                                     samples_per_epoch=train_size,
                                     nb_epoch=self.args.max_epochs,
                                     verbose=1,
                                     callbacks=[callbacks],
                                     nb_worker=1,
                                     validation_data=val_generator,
                                     nb_val_samples=val_size)

        return losses

    def prepare_datagenerator(self):
        """
        Initialise the data generator and its datastructures, unless a valid
        data generator was already passed into the
        GroundedTranslation.__init() function.

        Sets self.V to the vocabulary size reported by the data generator.
        """
        # Initialise the data generator if it has not yet been initialised
        if self.data_generator is None:
            self.data_generator = VisualWordDataGenerator(self.args,
                                                          self.args.dataset)

            # Extract the working vocabulary from the training dataset
            # NOTE(review): the vocabulary set-up is taken to belong to the
            # freshly created generator only, as the docstring describes --
            # confirm against callers that pass their own generator.
            if self.args.existing_vocab != "":
                self.data_generator.set_vocabulary(self.args.existing_vocab)
            else:
                self.data_generator.extract_vocabulary()
        self.V = self.data_generator.get_vocab_size()

    def log_run_arguments(self):
        """
        Save the command-line arguments, along with the method defaults,
        used to parameterise this run.
        """
        logger.info("Run arguments:")
        # items() exists on both Python 2 and 3, unlike iteritems().
        for arg, value in vars(self.args).items():
            logger.info("%s: %s", arg, str(value))
class ExtractFinalHiddenStateActivations: def __init__(self, args): self.args = args self.args.generate_from_N_words = 0 # Default 0 self.vocab = dict() self.unkdict = dict() self.counter = 0 self.maxSeqLen = 0 self.MAX_HT = self.args.generation_timesteps - 1 # consistent with models.py # maybe use_sourcelang isn't applicable here? self.use_sourcelang = args.source_vectors is not None self.use_image = not args.no_image if self.args.debug: theano.config.optimizer = 'None' theano.config.exception_verbosity = 'high' self.source_type = "predicted" if self.args.use_predicted_tokens else "gold" self.source_encoder = "mt_enc" if self.args.no_image else "vis_enc" self.source_dim = self.args.hidden_size self.h5_dataset_str = "%s-hidden_feats-%s-%d" % ( self.source_type, self.source_encoder, self.source_dim) logger.info("Serialising into %s" % self.h5_dataset_str) def get_hidden_activations(self): ''' In the model, we will merge the VGG image representation with the word embeddings. We need to feed the data as a list, in which the order of the elements in the list is _crucial_. 
''' self.data_generator = VisualWordDataGenerator(self.args, self.args.dataset) self.args.checkpoint = self.find_best_checkpoint() self.data_generator.set_vocabulary(self.args.checkpoint) self.vocab_len = len(self.data_generator.index2word) t = self.args.generation_timesteps if self.args.use_predicted_tokens else self.data_generator.max_seq_len m = models.NIC(self.args.embed_size, self.args.hidden_size, self.vocab_len, self.args.dropin, self.args.optimiser, self.args.l2reg, weights=self.args.checkpoint, gru=self.args.gru, t=t) self.fhs = m.buildHSNActivations(use_image=self.use_image) if self.args.use_predicted_tokens and self.args.no_image == False: gen_m = models.NIC(self.args.embed_size, self.args.hidden_size, self.vocab_len, self.args.dropin, self.args.optimiser, self.args.l2reg, weights=self.args.checkpoint, gru=self.args.gru, t=self.args.generation_timesteps) self.full_model = gen_m.buildKerasModel(use_image=self.use_image) self.new_generate_activations('train') self.new_generate_activations('val') self.new_generate_activations('test') def new_generate_activations(self, split): ''' Generate and serialise final-timestep hidden state activations into --dataset. TODO: we should be able to serialise predicted final states instead of gold-standard final states for val and test data. 
''' logger.info( "%s: extracting final hidden state activations from this model", split) # Prepare the data generator based on whether we're going to work with # the gold standard input tokens or the automatically predicted tokens if self.args.use_predicted_tokens: the_generator = self.data_generator.generation_generator( split=split) else: the_generator = self.data_generator.fixed_generator(split=split) counter = 0 hidden_states = [] batch_start = 0 batch_end = 0 for data in the_generator: if self.args.use_predicted_tokens: tokens = self.get_predicted_tokens(data) data[0]['text'] = self.set_text_arrays(tokens, data[0]['text']) # We extract the FHS from either the oracle input tokens hsn = self.fhs.predict( { 'text': data[0]['text'], 'img': data[0]['img'] }, batch_size=self.args.batch_size, verbose=1) for idx, h in enumerate(hsn): # get final_hidden index on a sentence-by-sentence # basis by searching for the first <E> in each trainY eos = False for widx, warr in enumerate(data[1]['output'][idx]): w = np.argmax(warr) if self.data_generator.index2word[w] == "<E>": final_hidden = h[widx] hidden_states.append(final_hidden) eos = True logger.debug(widx) break if not eos: final_hidden = h[self.MAX_HT] hidden_states.append(final_hidden) batch_end += 1 # Note: serialisation happens over training batches too. 
# now serialise the hidden representations in the h5 self.to_h5_indices(split, data[0]['indices'], hidden_states) batch_start = batch_end counter += len(hidden_states) hidden_states = [] logger.info("Processed %d instances" % counter) if batch_end >= self.data_generator.split_sizes[split]: break # elif split == 'val' or split == "test": # hidden_states = [] # batch_start = 0 # batch_end = 0 # for data in the_generator: # if self.args.use_predicted_tokens: # tokens = self.get_predicted_tokens(data) # data['text'] = self.set_text_arrays(tokens, data['text']) # # # We extract the FHS from either the oracle input tokens # hsn = self.fhs.predict({'text': data['text'], # 'img': data['img']}, # batch_size=self.args.batch_size, # verbose=1) # # for idx, h in enumerate(hsn['rnn']): # # get final_hidden index on a sentence-by-sentence # # basis by searching for the first <E> in each trainY # eos = False # for widx, warr in enumerate(data['output'][idx]): # w = np.argmax(warr) # if self.data_generator.index2word[w] == "<E>": # final_hidden = h[widx] # hidden_states.append(final_hidden) # eos = True # break # if not eos: # final_hidden = h[self.MAX_HT] # hidden_states.append(final_hidden) # batch_end += 1 # # # Note: serialisation happens over training batches too. # # now serialise the hidden representations in the h5 # self.to_h5_indices(split, data['indices'], hidden_states) # # batch_start = batch_end # counter += len(hidden_states) # hidden_states = [] # logger.info("Processed %d instances" % counter) # if batch_end >= self.data_generator.split_sizes[split]: # break

def get_predicted_tokens(self, data):
    """
    We're not going to work with the gold standard input tokens.
    Instead we're going to automatically predict them and then extract
    the final hidden state from the inferred data.

    Arg-max decodes every instance in `data` for
    self.args.generation_timesteps steps, keeping the first
    self.args.generate_from_N_words + 1 (BOS) input tokens fixed.

    NOTE: data['text'] is replaced in the caller's dictionary by a new
    array holding the fixed prefix followed by the predicted one-hot
    tokens; the array that was originally stored there is not modified.

    Returns one list of word strings per instance, each cut before the
    first "<E>" token.

    Helper function used by new_generate_activations().
    """
    index2word = self.data_generator.index2word

    # We are going to arg max decode a sequence.
    start_gen = self.args.generate_from_N_words + 1  # include BOS

    gold_text = data['text']
    # Collect the first start_gen words of each instance in the batch;
    # the generated words are appended to these lists below.
    complete_sentences = [
        [index2word[np.argmax(gold_text[i, t])] for t in range(start_gen)]
        for i in range(gold_text.shape[0])]

    # reset_text_arrays() zeroes a deep copy, so no extra copy is needed
    # to keep the gold array intact.
    data['text'] = self.reset_text_arrays(gold_text, start_gen)

    for t in range(start_gen, self.args.generation_timesteps):
        logger.debug("Input token: %s",
                     index2word[np.argmax(data['text'][0, t - 1])])
        preds = self.full_model.predict(data, verbose=0)

        # Look at the last indices for the words.
        next_word_indices = np.argmax(preds['output'][:, t - 1], axis=1)
        logger.debug("Predicted token: %s",
                     index2word[next_word_indices[0]])

        # update the sentence-so-far with the generated words.
        for i, word_index in enumerate(next_word_indices):
            data['text'][i, t, word_index] = 1.
            complete_sentences[i].append(index2word[word_index])

    # extract each sentence until it hits the first end-of-string token
    return [list(itertools.takewhile(lambda n: n != "<E>", s))
            for s in complete_sentences]


def set_text_arrays(self, predicted_tokens, text_arrays):
    """
    Set the values of the text tokens in the text arrays based on the
    tokens predicted by the model.

    predicted_tokens: one sequence of word strings per instance.
    text_arrays: the per-instance one-hot structures; this argument is
        left untouched (earlier revisions wrote through views into it,
        which defeated the deep copy taken here).

    Returns a deep copy of text_arrays in which, for every instance,
    position [token index, word2index[token]] is set to 1.

    Helper function used by new_generate_activations()
    """
    word2index = self.data_generator.word2index
    new_arrays = deepcopy(text_arrays)
    # Iterating over new_arrays yields the per-instance rows of the copy,
    # so the assignments below never reach the caller's data.
    for toks, struct in zip(predicted_tokens, new_arrays):
        for tidx, t in enumerate(toks):
            struct[tidx, word2index[t]] = 1
    return new_arrays


def reset_text_arrays(self, text_arrays, fixed_words=1):
    """
    Reset the values in the text data structure to zero so we cannot
    accidentally pass them into the model.

    Works on a deep copy: every timestep from index `fixed_words` onwards
    is zeroed and the copy is returned; text_arrays is not modified.

    Helper function for generate_sentences().
    """
    reset_arrays = deepcopy(text_arrays)
    reset_arrays[:, fixed_words:, :] = 0
    return reset_arrays

# def make_generation_arrays(self, prefix, fixed_words,
#                            predicted_tokens=None):
#     '''
#     Create arrays that are used as input for generation / activation.
#     '''
#
#
#     if predicted_tokens is not None:
#         input_data, targets = self.data_generator.get_data_by_split(prefix,
#                                    self.use_sourcelang, self.use_image)
#         logger.info("Initialising generation arrays with predicted tokens")
#         gen_input_data = deepcopy(input_data)
#         tokens = gen_input_data[0]
#         tokens[:, fixed_words, :] = 0 # reset the inputs
#         for prediction, words, tgt in zip(predicted_tokens, tokens, targets):
#             for idx, t in enumerate(prediction):
#                 words[idx, self.data_generator.word2index[t]] = 1.
#         targets = self.data_generator.get_target_descriptions(tokens)
#         return gen_input_data, targets
#
#     else:
#         # Replace input words (input_data[0]) with zeros for generation,
#         # except for the first args.generate_from_N_words
#         # NOTE: this will include padding and BOS steps (fixed_words has been
#         # incremented accordingly already in generate_sentences().)
#         input_data = self.data_generator.get_generation_data_by_split(prefix,
#                                    self.use_sourcelang, self.use_image)
#         logger.info("Initialising with the first %d gold words (incl BOS)",
#                     fixed_words)
#         gen_input_data = deepcopy(input_data)
#         gen_input_data[0][:, fixed_words:, :] = 0
#         return gen_input_data
#
# def generate_sentences(self, split, arrays=None):
#     """
#     Generates descriptions of images for --generation_timesteps
#     iterations through the LSTM. Each input description is clipped to
#     the first <BOS> token, or, if --generate_from_N_words is set, to the
#     first N following words (N + 1 BOS token).
#     This process can be additionally conditioned
#     on source language hidden representations, if provided by the
#     --source_vectors parameter.
#     The output is clipped to the first EOS generated, if it exists.
#
#     TODO: beam search
#     TODO: duplicated method with generate.py and Callbacks.py
#     """
#     logger.info("%s: generating descriptions", split)
#
#     start_gen = self.args.generate_from_N_words  # Default 0
#     start_gen = start_gen + 1  # include BOS
#
#     # prepare the datastructures for generation (no batching over val)
#     if arrays == None:
#         arrays = self.make_generation_arrays(split, start_gen)
#     N_sents = arrays[0].shape[0]
#
#     complete_sentences = [[] for _ in range(N_sents)]
#     for t in range(start_gen):  # minimum 1
#         for i in range(N_sents):
#             w = np.argmax(arrays[0][i, t])
#             complete_sentences[i].append(self.data_generator.index2word[w])
#
#     for t in range(start_gen, self.args.generation_timesteps):
#         # we take a view of the datastructures, which means we're only
#         # ever generating a prediction for the next word. This saves a
#         # lot of cycles.
#         preds = self.full_model.predict([arr[:, 0:t] for arr in arrays],
#                                         verbose=0)
#
#         # Look at the last indices for the words.
#         next_word_indices = np.argmax(preds[:, -1], axis=1)
#         # update array[0]/sentence-so-far with generated words.
#         for i in range(N_sents):
#             arrays[0][i, t, next_word_indices[i]] = 1.
#         next_words = [self.data_generator.index2word[x] for x in next_word_indices]
#         for i in range(len(next_words)):
#             complete_sentences[i].append(next_words[i])
#
#     # extract each sentence until it hits the first end-of-string token
#     pruned_sentences = []
#     for s in complete_sentences:
#         pruned_sentences.append([x for x
#                                  in itertools.takewhile(
#                                      lambda n: n != "<E>", s)])
#     return pruned_sentences

def to_h5_indices(self, split, indices, hidden_states):
    '''
    Hand each hidden state to the data generator for storage under
    self.h5_dataset_str.

    split: name of the data split, forwarded to set_source_features().
    indices: sequence of keys; element 0 of a key is passed on as the
        instance identifier and element 1 as the description index.
    hidden_states: sequence aligned with `indices`; the length of its
        first element is passed on as the feature size.
    '''
    logger.info("Serialising final hidden state features from %s to H5",
                split)
    if len(indices) == 0:
        # Nothing to store; also avoids indexing an empty hidden_states.
        return
    hsn_shape = len(hidden_states[0])
    for idx, data_key in enumerate(indices):
        ident = data_key[0]
        desc_idx = data_key[1]
        self.data_generator.set_source_features(split, ident,
                                                self.h5_dataset_str,
                                                hidden_states[idx],
                                                hsn_shape, desc_idx)

# def serialise_to_h5_keys(self, split, data_keys, hidden_states):
#     hsn_shape = len(hidden_states[0])
#     fhf_str = "final_hidden_features"
#     logger.info("Serialising final hidden state features from %s to H5",
#                 split)
#     for idx, data_key in enumerate(data_keys):
#         self.data_generator.set_source_features(split, data_key,
#                                                 self.h5_dataset_str,
#                                                 hidden_states[idx],
#                                                 hsn_shape)
#         #try:
#         #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
#         #        fhf_str, (hsn_shape,), dtype='float32')
#         #except RuntimeError:
#         #    # the dataset already exists, retrieve it into RAM and then overwrite it
#         #    del self.data_generator.dataset[split][data_key][fhf_str]
#         #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
#         #        fhf_str, (hsn_shape,), dtype='float32')
#         #try:
#         #    hsn_data[:] = hidden_states[idx]
#         #except IndexError:
#         #    raise IndexError("data_key %s of %s; index idx %d, len hidden %d" % (
#         #        data_key, len(data_keys), idx, len(hidden_states)))
#         #    break
#
# def sentences_to_h5(self, split, sentences):
#     '''
#     Save the predicted sentences into the h5 dataset object.
#     This is useful for subsequently (i.e. in a different program)
#     extracting LM-only final hidden states from predicted sentences.
#     Specifically, this can be compared to generating LM-only hidden
#     states over gold-standard tokens.
#     '''
#     idx = 0
#     logger.info("Serialising sentences from %s to H5", split)
#     data_keys = self.data_generator.dataset[split]
#     if split == 'val' and self.args.small_val:
#         data_keys = ["%06d" % x for x in range(len(sentences))]
#     else:
#         data_keys = ["%06d" % x for x in range(len(sentences))]
#     for data_key in data_keys:
#         self.data_generator.set_predicted_description(split, data_key,
#                                                       sentences[idx][1:])
#         idx += 1
#
# def sentences_to_h5_keys(self, split, data_keys, sentences):
#     logger.info("Serialising sentences from %s to H5",
#                 split)
#     for idx, data_key in enumerate(data_keys):
#         self.data_generator.set_predicted_description(split, data_key,
#                                                       sentences[idx])
#
# def serialise_to_h5(self, split, hsn_shape, hidden_states,
#                     batch_start=None, batch_end=None):
#     """ Serialise the hidden representations from generate_activations
#     into the h5 dataset.
#     This assumes one hidden_state per image key, which is maybe not
#     appropriate if there are multiple descriptions/image.
#     """
#     idx = 0
#     logger.info("Serialising final hidden state features from %s to H5",
#                 split)
#     if batch_start is not None:
#         logger.info("Start at %d, end at %d", batch_start, batch_end)
#         data_keys = ["%06d" % x for x in range(batch_start, batch_end)]
#         assert len(hidden_states) == len(data_keys),\
#             "keys: %d hidden %d; start %d end %d" % (len(data_keys),
#                                                      len(hidden_states), batch_start,
#                                                      batch_end)
#     else:
#         data_keys = self.data_generator.dataset[split]
#         if split == 'val' and self.args.small_val:
#             data_keys = ["%06d" % x for x in range(len(hidden_states))]
#         else:
#             data_keys = ["%06d" % x for x in range(len(hidden_states))]
#     for data_key in data_keys:
#         self.data_generator.set_source_features(split, data_key,
#                                                 self.h5_dataset_str,
#                                                 hidden_states[idx],
#                                                 hsn_shape)
#         #try:
#         #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
#         #        fhf_str, (hsn_shape,), dtype='float32')
#         #except RuntimeError:
#         #    # the dataset already exists, retrieve it into RAM and then overwrite it
#         #    del self.data_generator.dataset[split][data_key][fhf_str]
#         #    hsn_data = self.data_generator.dataset[split][data_key].create_dataset(
#         #        fhf_str, (hsn_shape,), dtype='float32')
#         #try:
#         #    hsn_data[:] = hidden_states[idx]
#         #except IndexError:
#         #    raise IndexError("data_key %s of %s; index idx %d, len hidden %d" % (
#         #        data_key, len(data_keys),
#         #        idx, len(hidden_states)))
#         #    break
#         idx += 1

def find_best_checkpoint(self):
    '''
    Read the summary file from the directory and scrape out the run ID
    of the best checkpoint ("Best loss" line when self.args.best_pplx is
    set, "Best Metric" line otherwise). Then do an ls-style scan of the
    directory and return the exact path to the best model.

    Assumes only one matching prefix in the model checkpoints directory.
    If several summary lines match, the last one wins.

    Returns "<model_checkpoints>/<entry>"; when no summary line or no
    directory entry matches, <entry> is the string "None".
    '''
    # The context manager closes the summary file once it has been read.
    with open("%s/summary" % self.args.model_checkpoints) as handle:
        summary_data = [x.replace("\n", "") for x in handle]

    best_id = None
    target = "Best loss" if self.args.best_pplx else "Best Metric"
    for line in summary_data:
        if line.startswith(target):
            # expected layout: "<target>: <run id> | ..."
            best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

    checkpoint = None
    if best_id is not None:
        for c in os.listdir(self.args.model_checkpoints):
            if c.startswith(best_id):
                checkpoint = c
                break
    return "%s/%s" % (self.args.model_checkpoints, checkpoint)
def train_model(self):
    '''
    In the model, we will merge the word embeddings with
    the VGG image representation (if used)
    and the source-language multimodal vectors (if used).
    We need to feed the data as a list, in which the order of the elements
    in the list is _crucial_.

    Builds the vocabulary and a OneLayerLSTM model, then fits the model
    one "big batch" at a time. The training loop only ends here when
    --predefined_epochs is set and max_epochs has been reached;
    otherwise the program exits from the early-stopping callback.
    '''

    self.log_run_arguments()

    self.data_generator = VisualWordDataGenerator(
        self.args, self.args.dataset)
    self.data_generator.extract_vocabulary()

    self.V = self.data_generator.get_vocab_size()

    # Keras doesn't do batching of val set, so
    # assume val data is small enough to get all at once.
    # val_input is the list passed to model.fit()
    # val_input can contain image, source features as well (or not)
    # (only loaded, and only used below, when enable_val_pplx is off)
    if not self.args.enable_val_pplx:
        val_input, valY = self.data_generator.get_data_by_split(
            'val', self.use_sourcelang, self.use_image)

    if not self.use_sourcelang:
        hsn_size = 0
    else:
        hsn_size = self.data_generator.hsn_size  # ick

    m = models.OneLayerLSTM(self.args.hidden_size, self.V,
                            self.args.dropin,
                            self.args.optimiser, self.args.l2reg,
                            hsn_size=hsn_size,
                            weights=self.args.init_from_checkpoint,
                            gru=self.args.gru)

    model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                              use_image=self.use_image)

    callbacks = CompilationOfCallbacks(self.data_generator.word2index,
                                       self.data_generator.index2word,
                                       self.args,
                                       self.args.dataset,
                                       self.data_generator,
                                       use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)

    big_batch_size = self.args.big_batch_size
    if big_batch_size > 0:
        if self.args.small:
            num_descriptions = SMALL_NUM_DESCRIPTIONS
        else:
            num_descriptions = self.data_generator.split_sizes['train']
        # float() keeps the division from flooring before ceil() when
        # both operands are ints (Python 2 semantics); previously only
        # the non-small branch did this.
        batches = int(ceil(float(num_descriptions) / big_batch_size))
    else:  # if big_batch_size == 0, reset to training set size.
        big_batch_size = self.data_generator.split_sizes['train']
        batches = 1

    # for epoch in range(self.args.epochs):
    epoch = 0
    while True:
        # the program will exit with sys.exit(0) in
        # Callbacks.early_stop_decision(). Do not put any clean-up
        # after this loop. It will NEVER be executed!
        batch = 1
        for train_input, trainY, indicator in\
            self.data_generator.yield_training_batch(big_batch_size,
                                                     self.use_sourcelang,
                                                     self.use_image):

            if self.args.predefined_epochs:
                logger.info("Epoch %d/%d, big-batch %d/%d", epoch+1,
                            self.args.max_epochs, batch, batches)
            else:
                logger.info("Epoch %d, big-batch %d/%d", epoch+1,
                            batch, batches)

            if indicator is True:
                # let's test on the val after training on these batches
                model.fit(train_input,
                          trainY,
                          validation_data=None if
                          self.args.enable_val_pplx
                          else (val_input, valY),
                          callbacks=[callbacks],
                          nb_epoch=1,
                          verbose=1,
                          batch_size=self.args.batch_size,
                          shuffle=True)
            else:
                model.fit(train_input,
                          trainY,
                          nb_epoch=1,
                          verbose=1,
                          batch_size=self.args.batch_size,
                          shuffle=True)
            batch += 1
        epoch += 1
        if self.args.predefined_epochs and epoch >= self.args.max_epochs:
            # stop training because we've exceeded self.args.max_epochs
            break
class GroundedTranslationGenerator:
    '''
    Generates descriptions for the val or test split with the best saved
    checkpoint and, unless disabled on the command line, scores them with
    BLEU, multeval and perplexity.
    '''

    def __init__(self, args):
        '''
        args: the parsed command-line arguments. Sets up the data
        generator, resolves the best checkpoint into args.checkpoint and
        opens the H5 dataset read-only.
        '''
        self.args = args
        self.vocab = dict()
        self.unkdict = dict()
        self.counter = 0
        self.maxSeqLen = 0

        # consistent with models.py
        self.use_sourcelang = args.source_vectors is not None
        self.use_image = not args.no_image
        self.model = None

        self.prepare_datagenerator()

        # this results in two file handlers for dataset (here and
        # data_generator)
        if not self.args.dataset:
            logger.warning("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % self.args.dataset, "r")

        if self.args.debug:
            theano.config.optimizer = 'None'
            theano.config.exception_verbosity = 'high'

    def prepare_datagenerator(self):
        '''
        Create the data generator, locate the best checkpoint (stored in
        self.args.checkpoint) and load the vocabulary that goes with it.
        '''
        self.data_gen = VisualWordDataGenerator(self.args,
                                                self.args.dataset)
        self.args.checkpoint = self.find_best_checkpoint()
        self.data_gen.set_vocabulary(self.args.checkpoint)
        self.vocab_len = len(self.data_gen.index2word)
        self.index2word = self.data_gen.index2word
        self.word2index = self.data_gen.word2index

    def generate(self):
        '''
        Entry point for this module.
        Loads up a data generator to get the relevant image / source features.
        Builds the relevant model, given the command-line arguments.
        Generates sentences for the images in the val / test data.
        Calculates BLEU and PPLX, unless requested.

        Returns the BLEU score (or the first multeval score when
        --multeval is set), or None when scoring is disabled.
        '''
        if self.use_sourcelang:
            # HACK FIXME unexpected problem with input_data
            self.hsn_size = self.data_gen.hsn_size
        else:
            self.hsn_size = 0

        if self.model is None:
            self.build_model(generate=True)

        use_val = not self.args.test
        self.generate_sentences(self.args.checkpoint, val=use_val)

        score = None
        if not self.args.without_scores:
            score = self.bleu_score(self.args.checkpoint, val=use_val)
            if self.args.multeval:
                score, _, _ = self.multeval_scores(self.args.checkpoint,
                                                   val=use_val)
            if not self.args.no_pplx:
                # perplexity needs a model unrolled to the data's maximum
                # sequence length rather than the generation timesteps
                self.build_model(generate=False)
                self.calculate_pplx(self.args.checkpoint, val=use_val)
        return score

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token, or, if --generate_from_N_words is set, to the
        first N following words (N + 1 BOS token).
        This process can be additionally conditioned
        on source language hidden representations, if provided by the
        --source_vectors parameter.
        The output is clipped to the first EOS generated, if it exists.

        One decoded sentence per line is written to
        "<filepath>/<split>Generated". Beam search is used when
        --beam_width > 1, arg-max decoding otherwise.

        TODO: duplicated method with generate.py
        """
        prefix = "val" if val else "test"
        logger.info("Generating %s descriptions", prefix)
        start_gen = self.args.generate_from_N_words + 1  # include BOS

        # The context manager closes the output even if decoding fails.
        with codecs.open("%s/%sGenerated" % (filepath, prefix), "w",
                         'utf-8') as handle:
            if self.args.beam_width > 1:
                self._write_beam_sentences(handle, prefix, start_gen)
            else:
                self._write_argmax_sentences(handle, prefix, start_gen)

    def _write_beam_sentences(self, handle, prefix, start_gen):
        ''' Beam-search decode the split and write one line per instance. '''
        generator = self.data_gen.generation_generator(prefix, batch_size=1)
        seen = 0
        # we are going to beam search for the most probably sentence.
        # let's do this one sentence at a time to make the logging output
        # easier to understand
        for data in generator:
            text = self.reset_text_arrays(data[0]['text'], start_gen)
            data[0]['text'] = text

            finished = self._beam_search(data[0], start_gen)

            # Emit the highest length-normalised log probability sequence
            best_beam = finished[0]
            best_words = [self.index2word[x] for x in best_beam[1]]
            decoded_str = ' '.join(
                itertools.takewhile(lambda n: n != "<E>", best_words))
            handle.write(decoded_str + "\n")
            if self.args.verbose:
                logger.info("%s (%f)", decoded_str, best_beam[0])

            seen += text.shape[0]
            # Compare against the split being decoded (this used to be
            # hard-wired to 'val'); >= also stops if a batch overshoots.
            if seen >= self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

    def _beam_search(self, inputs, start_gen):
        '''
        Beam-search decode the single instance in `inputs`.

        Returns the finished hypotheses as [score, word indices] lists,
        best first, where score is the summed log probability divided by
        the number of generated words.
        '''
        eos_index = self.word2index["<E>"]
        max_beam_width = self.args.beam_width
        structs = self.make_duplicate_matrices(inputs, max_beam_width)

        # A beam is a 2-element list with the log probability of the
        # sequence and the words in that sequence. Start with an empty
        # beam. (A list, not a tuple: the score is rewritten in place
        # when it is length-normalised below.)
        beams = [[0.0, []]]
        # collects beams that are in the top candidates and
        # emitted a <E> token.
        finished = []
        for t in range(start_gen, self.args.generation_timesteps):
            # Store the candidates produced at timestep t, will be
            # pruned at the end of the timestep
            candidates = []

            preds = self.model.predict(structs, verbose=0)

            # Position t-1 of the output holds the distribution over the
            # word at timestep t.
            next_word_probs = preds[:, t-1]
            sorted_indices = np.argsort(-next_word_probs, axis=1)

            # Each instance in structs is holding the history of a
            # beam, and so there is a direct connection between the
            # index of a beam in beams and the index of an instance in
            # structs.
            for beam_idx, b in enumerate(beams):
                # get the sorted predictions for the beam_idx'th beam
                beam_predictions = sorted_indices[beam_idx]
                for top_idx in range(self.args.beam_width):
                    word_index = beam_predictions[top_idx]
                    word_prob = next_word_probs[beam_idx][word_index]
                    # math.log() raises on 0, which an underflowed
                    # probability can produce.
                    if word_prob > 0:
                        log_prob = math.log(word_prob)
                    else:
                        log_prob = float("-inf")
                    # For the beam_idxth beam, add the log probability
                    # of the top_idxth predicted word to the previous
                    # log probability of the sequence, and append the
                    # top_idxth predicted word to the sequence of words
                    candidates.append([b[0] + log_prob,
                                       b[1] + [word_index]])

            candidates.sort(reverse=True)
            if self.args.verbose:
                logger.info("Candidates in the beam")
                logger.info("---")
                for c in candidates:
                    logger.info(" ".join([self.index2word[x] for x in c[1]])
                                + " (%f)" % c[0])

            beams = candidates[:max_beam_width]  # prune the beams
            pruned = []
            for b in beams:
                # If a top candidate emitted an EOS token then
                # a) add it to the list of finished sequences
                # b) remove it from the beams and decrease the
                # maximum size of the beams.
                if b[1][-1] == eos_index:
                    finished.append(b)
                    if max_beam_width >= 1:
                        max_beam_width -= 1
                else:
                    pruned.append(b)

            beams = pruned[:max_beam_width]
            if self.args.verbose:
                logger.info("Pruned beams")
                logger.info("---")
                for b in beams:
                    logger.info(" ".join([self.index2word[x] for x in b[1]])
                                + "(%f)" % b[0])

            if max_beam_width == 0:
                # We have sampled max_beam_width sequences with an <E>
                # token so stop the beam search.
                break

            # Reproduce the structs for the beam search so we can keep
            # track of the state of each beam
            structs = self.make_duplicate_matrices(inputs, max_beam_width)

            # Rewrite the 1-hot word features with the
            # so-far-predicted tokens in a beam. The first generated
            # word belongs at timestep start_gen, directly after the
            # fixed prefix (previously idx+1, which is only correct
            # when start_gen == 1).
            for bidx, b in enumerate(beams):
                for idx, w in enumerate(b[1]):
                    structs['text'][bidx, start_gen + idx, w] = 1.

        # If none of the sentences emitted an <E> token while
        # decoding, add the final beams into the final candidates
        if len(finished) == 0:
            finished.extend(beams)

        # Normalise the probabilities by the length of the sequences
        # as suggested by Graves (2012) http://arxiv.org/abs/1211.3711
        for f in finished:
            if f[1]:
                f[0] = f[0] / len(f[1])

        finished.sort(reverse=True)

        if self.args.verbose:
            logger.info("Length-normalised samples")
            logger.info("---")
            for f in finished:
                logger.info(" ".join([self.index2word[x] for x in f[1]])
                            + "(%f)" % f[0])
        return finished

    def _write_argmax_sentences(self, handle, prefix, start_gen):
        ''' Arg-max decode the split batch by batch and write the output. '''
        # We are going to arg max decode a sequence.
        generator = self.data_gen.generation_generator(prefix)
        seen = 0
        for data in generator:
            gold_text = data[0]['text']
            # Collect the first start_gen words of each instance in the
            # batch; generated words are appended below.
            complete_sentences = [
                [self.index2word[np.argmax(gold_text[i, t])]
                 for t in range(start_gen)]
                for i in range(gold_text.shape[0])]

            # reset_text_arrays() returns a zeroed deep copy
            text = self.reset_text_arrays(gold_text, start_gen)
            data[0]['text'] = text

            for t in range(start_gen, self.args.generation_timesteps):
                logger.debug("Input token: %s",
                             self.index2word[np.argmax(text[0, t-1])])
                preds = self.model.predict(data[0], verbose=0)

                # Look at the last indices for the words.
                next_word_indices = np.argmax(preds[:, t-1], axis=1)
                logger.debug("Predicted token: %s",
                             self.index2word[next_word_indices[0]])
                # update the sentence-so-far with generated words.
                for i, word_index in enumerate(next_word_indices):
                    text[i, t, word_index] = 1.
                    complete_sentences[i].append(self.index2word[word_index])

            sys.stdout.flush()

            # print/extract each sentence until it hits the first
            # end-of-string token
            for s in complete_sentences:
                if self.args.verbose:
                    # log the sentence being written (a stale index used
                    # to select the wrong sentence here)
                    logger.info("%s", ' '.join(
                        itertools.takewhile(lambda n: n != "<E>", s)))
                # s[1:] drops the leading BOS token from the output
                decoded_str = ' '.join(
                    itertools.takewhile(lambda n: n != "<E>", s[1:]))
                handle.write(decoded_str + "\n")

            seen += text.shape[0]
            if seen >= self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

    def calculate_pplx(self, path, val=True):
        """ Splits the input data into batches of self.args.batch_size to
        reduce the memory footprint of holding all of the data in RAM.

        Accumulates -log2 p(target) over every target token other than
        "<P>", writes 2 ** (mean) to "<path>/<split>PPLX" and returns it.
        """
        prefix = "val" if val else "test"
        logger.info("Calculating pplx over %s data", prefix)
        sum_logprobs = 0
        y_len = 0

        generator = self.data_gen.generation_generator(prefix)
        seen = 0
        for data in generator:
            # take the targets out of the batch before predicting
            Y_target = data[1].pop('output')

            preds = self.model.predict(data[0],
                                       verbose=0,
                                       batch_size=self.args.batch_size)

            for i in range(Y_target.shape[0]):
                for t in range(Y_target.shape[1]):
                    target_idx = np.argmax(Y_target[i, t])
                    target_tok = self.index2word[target_idx]
                    if target_tok != "<P>":
                        log_p = math.log(preds[i, t, target_idx], 2)
                        sum_logprobs += -log_p
                        y_len += 1

            seen += data[0]['text'].shape[0]
            if seen >= self.data_gen.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break

        norm_logprob = sum_logprobs / y_len
        pplx = math.pow(2, norm_logprob)
        logger.info("PPLX: %.4f", pplx)
        with open("%s/%sPPLX" % (path, prefix), "w") as handle:
            handle.write("%f\n" % pplx)
        return pplx

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero so we cannot
        accidentally pass them into the model.

        Works on a deep copy: every timestep from index `fixed_words`
        onwards is zeroed; text_arrays itself is not modified.

        Helper function for generate_sentences().
        """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays

    def make_duplicate_matrices(self, generator_data, k):
        '''
        Prepare K duplicates of the input data for a given instance yielded by
        the data generator.

        Only the first instance of generator_data is duplicated. The
        result always has a 'text' entry, plus 'img' when images are
        used and 'src' when source-language features are used. Every
        entry is a freshly allocated array, so the duplicates never
        share memory with generator_data.

        Helper function for the beam search decoder in generation_sentences().
        '''
        def repeat_first(array):
            # np.array() copies the k references into one new array
            return np.array([array[0, :, :]] * k)

        dupes = {'text': repeat_first(generator_data['text'])}
        if self.use_image:
            dupes['img'] = repeat_first(generator_data['img'])
        if self.use_sourcelang:
            dupes['src'] = repeat_first(generator_data['src'])
        return dupes

    def find_best_checkpoint(self):
        '''
        Read the summary file from the directory and scrape out the run ID
        of the best checkpoint ("Best loss" line when self.args.best_pplx
        is set, "Best Metric" line otherwise). Then do an ls-style scan
        of the directory and return the exact path to the best model.

        Assumes only one matching prefix in the model checkpoints directory.
        If several summary lines match, the last one wins. When nothing
        matches, the returned path ends in "/None".
        '''
        # The context manager closes the summary file once it is read.
        with open("%s/summary" % self.args.model_checkpoints) as handle:
            summary_data = [x.replace("\n", "") for x in handle]

        best_id = None
        target = "Best loss" if self.args.best_pplx else "Best Metric"
        for line in summary_data:
            if line.startswith(target):
                # expected layout: "<target>: <run id> | ..."
                best_id = "%03d" % (int(line.split(":")[1].split("|")[0]))

        checkpoint = None
        if best_id is not None:
            for c in os.listdir(self.args.model_checkpoints):
                if c.startswith(best_id):
                    checkpoint = c
                    break
        best_path = "%s/%s" % (self.args.model_checkpoints, checkpoint)
        logger.info("Best checkpoint: %s", best_path)
        return best_path

    def bleu_score(self, directory, val=True):
        '''
        PPLX is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.

        Runs multi-bleu.perl on "<directory>/<split>Generated" and
        returns the score parsed from the first line of its output.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        # NOTE(review): `directory` is interpolated into a shell command
        # unquoted; paths containing spaces or shell metacharacters will
        # break (or alter) the command.
        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated | tee %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)],
            shell=True)
        with open("%s/%sBLEU" % (directory, prefix)) as handle:
            bleudata = handle.readline()
        # first comma-separated field, text after its "=" sign
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        return float(bleuscore.lstrip())

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to evaluate with Meteor, TER, and BLEU?

        Runs multeval.sh inside MULTEVAL_DIR and parses the
        "RESULT: baseline: <METRIC>: AVG:" lines of its output.
        Returns (meteor, bleu, ter), each truncated to two decimals.
        Raises ValueError when one of the three lines is missing.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, val)

        with cd(MULTEVAL_DIR):
            # "> file 2>&1" sends both streams through one descriptor;
            # redirecting each stream to the file separately opens it
            # twice and the two writers overwrite each other.
            subprocess.check_call(
                ['./multeval.sh eval --refs ../%s/%s_reference.* '
                 '--hyps-baseline ../%s/%sGenerated '
                 '--meteor.language %s '
                 '--threads 4 '
                 '> multevaloutput 2>&1'
                 % (directory, prefix, directory, prefix,
                    self.args.meteor_lang)],
                shell=True)
            with open("multevaloutput") as handle:
                multdata = handle.readlines()

        scores = {}
        for line in multdata:
            for metric in ("BLEU", "METEOR", "TER"):
                if line.startswith("RESULT: baseline: %s: AVG:" % metric):
                    value = line.split(":")[4].replace("\n", "").strip()
                    # keep two decimals by truncation, not rounding
                    lr = value.split(".")
                    scores[metric] = float(lr[0] + "." + lr[1][0:2])

        missing = [m for m in ("METEOR", "BLEU", "TER") if m not in scores]
        if missing:
            raise ValueError("multeval output has no result line for: %s"
                             % ", ".join(missing))

        mmeteor, mbleu, mter = scores["METEOR"], scores["BLEU"], scores["TER"]
        logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f",
                    mmeteor, mbleu, mter)
        return mmeteor, mbleu, mter

    def extract_references(self, directory, val=True):
        """
        Get reference descriptions for split we are generating outputs for.

        Writes one file per reference index,
        "<directory>/<split>_reference.ref<N>", holding the N-th
        reference of every instance, one per line.

        Helper function for bleu_score().
        """
        prefix = "val" if val else "test"
        references = self.data_gen.get_refs_by_split_as_list(prefix)

        for refid in range(len(references[0])):
            ref_path = '%s/%s_reference.ref%d' % (directory, prefix, refid)
            # close (and so flush) each file before the scorers read it
            with codecs.open(ref_path, 'w', 'utf-8') as handle:
                handle.write('\n'.join([x[refid] for x in references]))

    def build_model(self, generate=False):
        '''
        Build a Keras model and store it in self.model.

        generate: when True the model is unrolled for
        --generation_timesteps steps, otherwise for the data generator's
        maximum sequence length.

        Helper function for generate().
        '''
        if generate:
            t = self.args.generation_timesteps
        else:
            t = self.data_gen.max_seq_len

        # Both architectures take exactly the same constructor arguments.
        model_class = models.MRNN if self.args.mrnn else models.NIC
        m = model_class(self.args.embed_size, self.args.hidden_size,
                        self.vocab_len,
                        self.args.dropin,
                        self.args.optimiser, self.args.l2reg,
                        hsn_size=self.hsn_size,
                        weights=self.args.checkpoint,
                        gru=self.args.gru,
                        clipnorm=self.args.clipnorm,
                        t=t)

        self.model = m.buildKerasModel(use_sourcelang=self.use_sourcelang,
                                       use_image=self.use_image)