def aggregate(self, predicted_begins, predicted_ends, contexts_text, q_ids):
    """Collect the predicted answer string for every question in a batch."""
    batch_size = predicted_begins.shape[0]
    for i in range(batch_size):
        # Detokenize the predicted [begin, end) span of the context.
        predicted_answer = detokenize(
            map(vec2str,
                contexts_text[i][predicted_begins[i]:predicted_ends[i]]))
        q_id = vec2str(q_ids[i])
        self.predictions[q_id] = predicted_answer
def retrieve(self, batch):
    """Retrieves all definitions for a batch of word sequences.

    TODO: definitions of phrases, phrasal verbs, etc.

    Returns
    -------
    defs
        A list of word definitions, each definition is a list of words.
    def_map
        A list of triples (batch_index, time_step, def_index). Maps
        words to their respective definitions from `defs`.

    """
    definitions = []
    def_map = []
    word_def_indices = {}

    for seq_pos, sequence in enumerate(batch):
        for word_pos, word in enumerate(sequence):
            if isinstance(word, numpy.ndarray):
                word = vec2str(word)
            if not word:
                continue
            self._debug_info['N_words'] += 1
            word_id = self._vocab_text.word_to_id(word)
            if (self._exclude_top_k and word_id != self._vocab_text.unk
                    and word_id < self._exclude_top_k):
                self._debug_info['N_excluded_words'] += 1
                continue

            if word not in word_def_indices:
                word_def_indices[word] = []
                # The first time a word is encountered in a batch
                word_defs = self._dictionary.get_definitions(word)
                if self._max_def_per_word < len(word_defs):
                    if self._with_too_many_defs == 'random':
                        word_defs = self._rng.choice(
                            word_defs, self._max_def_per_word, replace=False)
                    else:
                        # (rizar): if there are too many definitions for a
                        # word, maybe let's just accept that it's a
                        # "semantic prime"?
                        word_defs = []

                # Debug info
                self._debug_info['N_distinct_words'] += 1
                self._debug_info['N_missed_distinct_words'] += (
                    len(word_defs) == 0)
                # End of debug info

                for i, def_ in enumerate(word_defs):
                    self._debug_info['N_def'] += 1
                    if self._with_too_long_defs == 'drop':
                        if len(def_) > self._max_def_length:
                            self._debug_info['N_dropped_def'] += 1
                            continue
                    elif self._with_too_long_defs == 'crop':
                        def_ = def_[0:self._max_def_length]
                    else:
                        raise NotImplementedError()
                    final_def_ = []
                    if self._add_bod_eod:
                        final_def_.append(self._vocab_def.bod)
                    for token in def_:
                        final_def_.append(self._vocab_def.word_to_id(token))
                    if self._add_bod_eod:
                        final_def_.append(self._vocab_def.eod)
                    word_def_indices[word].append(len(definitions))
                    definitions.append(final_def_)

            # Debug info
            self._debug_info['N_queried_words'] += 1
            if len(word_def_indices[word]) == 0:
                self._debug_info['N_missed_words'] += 1
                if len(self._debug_info['missed_word_sample']) == 10000:
                    self._debug_info['missed_word_sample'][
                        numpy.random.randint(10000)] = word
                else:
                    self._debug_info['missed_word_sample'].append(word)
            # End of debug info

            for def_index in word_def_indices[word]:
                def_map.append((seq_pos, word_pos, def_index))
    return definitions, def_map
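# A minimal sketch of the (defs, def_map) contract above, using hypothetical
# ids (bod/eod and the token ids are made up, not the output of a real
# dictionary). For a batch of two sequences in which only "cat" has
# definitions and occurs at positions (0, 1) and (1, 3), `retrieve` would
# return something like:
#
#   defs    = [[bod, 7, 12, 4, eod],    # first definition of "cat"
#              [bod, 7, 9, eod]]        # second definition of "cat"
#   def_map = [(0, 1, 0), (0, 1, 1),    # sequence 0, position 1 -> defs 0, 1
#              (1, 3, 0), (1, 3, 1)]    # sequence 1, position 3 -> same defs
#
# Definitions are fetched once per distinct word in a batch and shared
# between occurrences through `def_map`.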
def test_vec2str():
    # Trailing zeros are padding and should be stripped by vec2str.
    vector = map(ord, 'abc') + [0, 0]
    assert vec2str(vector) == 'abc'
def perform(self, node, inputs, output_storage):
    words = inputs[0]
    # Flatten all leading axes so that each row is one character-encoded word.
    words_flat = words.reshape(-1, words.shape[-1])
    word_counts = numpy.array(
        [self._vocab.word_freq(vec2str(word)) for word in words_flat])
    # Restore the leading axes: one frequency per word.
    output_storage[0][0] = word_counts.reshape(words.shape[:-1])
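# Shape contract of `perform` above, illustrated on the reshape alone (a
# sketch assuming the usual encoding of words as fixed-length rows of
# character codes; no vocabulary lookup involved):
#
#   >>> import numpy
#   >>> words = numpy.zeros((2, 5, 10), dtype='int64')  # (batch, time, chars)
#   >>> words.reshape(-1, words.shape[-1]).shape        # one row per word
#   (10, 10)
#   >>> words.shape[:-1]                                # output shape
#   (2, 5)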
def generate_embeddings(config, tar_path, part, dest_path, format_,
                        average=False, encoder_embeddings=None, **kwargs):
    """Generate embeddings for all the definitions, average them and
    serialize, OR, if encoder_embeddings is set, serialize the model's
    encoder embeddings.

    config: name of the config of the model
    tar_path: tar path of the model parameters
    part: part of the dataset (should be either 'train', 'valid', 'test'
        or 'all')
    dest_path: directory where the serialized embeddings will be written
    format_: either 'dict' or 'glove'
    encoder_embeddings: None, 'only', 'mixed' or 'if_missing'
        - None: don't include encoder embeddings
        - 'only': don't read any data, just serialize the encoder embeddings
        - 'mixed': add the encoder embeddings to the list of definition
          embeddings
        - 'if_missing': add the encoder embeddings when there is no
          corresponding definition
    average: if True, multi-prototype embeddings will be averaged
    """
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    c = config
    data, model = initialize_data_and_model(c, train_phase=False)
    words = T.ltensor3('words')
    words_mask = T.matrix('words_mask')
    keys = T.lmatrix('keys')
    n_identical_keys = T.lvector('n_identical_keys')
    sym_args = [words, words_mask]

    if format_ not in ['dict', 'glove']:
        raise ValueError("format should be either: dict, glove")
    if not c['encoder'] and encoder_embeddings != 'only':
        raise ValueError('Error: this model does not have an encoder.')

    if use_keys(c):
        sym_args.append(keys)
    if use_n_identical_keys(c):
        sym_args.append(n_identical_keys)

    costs = model.apply(*sym_args, train_phase=False)
    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))

    if encoder_embeddings:
        if encoder_embeddings == 'only' and not c['encoder']:
            embeddings_array = model.get_def_embeddings_params('key').eval()
        else:
            embeddings_array = model.get_def_embeddings_params('main').eval()
        entries = model.get_embeddings_entries()
        enc_embeddings = {e: np.asarray(a)
                          for e, a in zip(entries, embeddings_array)}
        if encoder_embeddings == 'only':
            serialize_embeddings(enc_embeddings, format_, dest_path,
                                 "encoder_embeddings")
            return 0

    embeddings_var, = VariableFilter(name='embeddings')(cg)
    compute = {"embeddings": embeddings_var}
    if c['proximity_coef'] != 0:
        prox_var, = VariableFilter(name='proximity_term')(cg)
        compute["proximity_term"] = prox_var

    print "sym args", sym_args
    predict_f = theano.function(sym_args, compute)
    batch_size = 256  # size of test_unseen
    stream = data.get_stream(part, batch_size=batch_size,
                             max_length=c['max_length'],
                             remove_keys=False,
                             remove_n_identical_keys=False)
    raw_data = []  # list of dicts containing the inputs and computed outputs
    i = 0
    vocab = model._vocab
    print "start computing"
    embeddings = defaultdict(list)
    for input_data in stream.get_epoch_iterator(as_dict=True):
        if i % 10 == 0:
            print "iteration:", i
        words = input_data['words']
        words_mask = input_data['words_mask']
        keys = input_data['keys']
        n_identical_keys = input_data['n_identical_keys']
        args = [words, words_mask]
        if use_keys(c):
            args.append(keys)
        if use_n_identical_keys(c):
            args.append(n_identical_keys)

        to_save = predict_f(*args)
        for k, h in zip(keys, to_save['embeddings']):
            key = vec2str(k)
            if encoder_embeddings == 'if_missing':
                try:
                    del enc_embeddings[key]
                except KeyError:
                    pass
            embeddings[key].append(h)
        i += 1

    if encoder_embeddings in ['mixed', 'if_missing']:
        for k, e in enc_embeddings.iteritems():
            embeddings[k].append(e)

    if encoder_embeddings == 'mixed':
        prefix_fname = 'mix_e_'
    elif encoder_embeddings == 'if_missing':
        prefix_fname = 'if_mis_e_'
    else:
        prefix_fname = ''

    # combine:
    if average:
        mean_embeddings = {}
        for k in embeddings.keys():
            mean_embeddings[k] = np.mean(np.asarray(embeddings[k]), axis=0)
        serialize_embeddings(mean_embeddings, format_, dest_path,
                             prefix_fname + "mean_embeddings")
    else:
        serialize_embeddings(embeddings, format_, dest_path,
                             prefix_fname + "embeddings")
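# A hypothetical invocation (config name and paths are placeholders, not part
# of the repository):
#
#   generate_embeddings('c_defs', 'training_state.tar', 'all', 'embs/',
#                       'glove', average=True)
#
# would average the definition embeddings of each word and hand them to
# `serialize_embeddings` under the name "mean_embeddings"; with
# encoder_embeddings='mixed' the name gains the 'mix_e_' prefix computed
# above.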
def evaluate_extractive_qa(config, tar_path, part, num_examples, dest_path,
                           qids=None, dataset=None):
    if not dest_path:
        dest_path = os.path.join(os.path.dirname(tar_path),
                                 'predictions.json')
    log_path = os.path.splitext(dest_path)[0] + '_log.json'
    if qids:
        qids = qids.split(',')
    if dataset:
        dataset = SQuADDataset(dataset, ('all',))

    c = config
    data, qam = initialize_data_and_model(c)
    costs = qam.apply_with_default_vars()
    cg = Model(costs)

    with open(tar_path) as src:
        cg.set_parameter_values(load_parameters(src))
    predicted_begins, = VariableFilter(name='predicted_begins')(cg)
    predicted_ends, = VariableFilter(name='predicted_ends')(cg)
    compute = {'begins': predicted_begins, 'ends': predicted_ends}
    if c['coattention']:
        d2q_att_weights, = VariableFilter(name='d2q_att_weights')(cg)
        q2d_att_weights, = VariableFilter(name='q2d_att_weights')(cg)
        compute.update({'d2q': d2q_att_weights, 'q2d': q2d_att_weights})
    compute['costs'] = costs
    predict_func = theano.function(qam.input_vars.values(), compute)
    logger.debug("Ready to evaluate")

    done_examples = 0
    num_correct = 0

    def print_stats():
        print('EXACT MATCH RATIO: {}'.format(
            num_correct / float(done_examples)))

    predictions = {}
    log = {}
    stream = data.get_stream(part, batch_size=1,
                             shuffle=(part == 'train'),
                             raw_text=True, q_ids=True, dataset=dataset)
    for example in stream.get_epoch_iterator(as_dict=True):
        if done_examples == num_examples:
            break
        q_id = vec2str(example['q_ids'][0])
        if qids and q_id not in qids:
            continue
        example['contexts_text'] = [
            map(vec2str, example['contexts_text'][0])
        ]
        example['questions_text'] = [
            map(vec2str, example['questions_text'][0])
        ]

        feed = dict(example)
        del feed['q_ids']
        del feed['contexts_text']
        del feed['questions_text']
        del feed['contexts_text_mask']
        result = predict_func(**feed)
        correct_answer_span = slice(example['answer_begins'][0],
                                    example['answer_ends'][0])
        predicted_answer_span = slice(result['begins'][0], result['ends'][0])
        correct_answer = example['contexts_text'][0][correct_answer_span]
        answer = example['contexts_text'][0][predicted_answer_span]
        is_correct = correct_answer_span == predicted_answer_span
        context = example['contexts_text'][0]
        question = example['questions_text'][0]
        context_def_map = example['contexts_def_map']

        # pretty print
        outcome = 'correct' if is_correct else 'wrong'
        print('#{}'.format(done_examples))
        print(u"CONTEXT:", detokenize(context))
        print(u"QUESTION:", detokenize(question))
        print(u"RIGHT ANSWER: {}".format(detokenize(correct_answer)))
        print(
            u"ANSWER (span=[{}, {}], {}):".format(predicted_answer_span.start,
                                                  predicted_answer_span.stop,
                                                  outcome),
            detokenize(answer))
        print(u"COST: {}".format(float(result['costs'][0])))
        print(u"DEFINITIONS AVAILABLE FOR:")
        for pos in set(context_def_map[:, 1]):
            print(context[pos])
        print()

        # update statistics
        done_examples += 1
        num_correct += is_correct

        # save the results
        predictions[q_id] = detokenize(answer)
        log_entry = {
            'context': context,
            'question': question,
            'answer': answer,
            'correct_answer': correct_answer,
            'cost': float(result['costs'][0])
        }
        if c['coattention']:
            log_entry['d2q'] = cPickle.dumps(result['d2q'][0])
            log_entry['q2d'] = cPickle.dumps(result['q2d'][0])
        log[q_id] = log_entry

        if done_examples % 100 == 0:
            print_stats()
    print_stats()

    with open(log_path, 'w') as dst:
        json.dump(log, dst, indent=2, sort_keys=True)
    with open(dest_path, 'w') as dst:
        json.dump(predictions, dst, indent=2, sort_keys=True)
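# The resulting predictions file maps question ids to detokenized answer
# strings (the id and answer below are hypothetical):
#
#   {
#     "56be4db0acb8001400a502ec": "Denver Broncos"
#   }
#
# which is the {q_id: answer} format the official SQuAD evaluation script
# consumes.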