def send(self):
    '''
    Produce the next outgoing event of this agent, or None to stay silent.

    Order of decisions:
      1. stay silent if the previous utterance carried an entity and the
         environment forbids consecutive entity utterances;
      2. select the already matched item, if there is one;
      3. otherwise decode a token sequence and turn it into either a
         selection (SELECT marker followed by an 'item-<id>' token) or a
         text message.
    '''
    # Don't send consecutive utterances with entities
    if self.sent_entity and not self.env.consecutive_entity:
        return None

    if self.matched_item is not None:
        return self.select(self.matched_item)

    # A single decoding attempt; a failed decode means nothing is sent.
    decoded = self.decode()
    if decoded is None:
        return None

    # Remember whether this utterance mentions an entity (used by the
    # consecutive-entity guard on the next call).
    self.sent_entity = True if self._has_entity(decoded) else False

    # Track canonical forms of everything we have mentioned so far.
    for tok in decoded:
        if is_entity(tok):
            self.mentioned_entities.add(tok[1][0])

    # Turn entity tuples into surface strings.
    if self.env.realizer is None:
        surface = [tok[0] if is_entity(tok) else tok for tok in decoded]
    else:
        surface = self.env.realizer.realize_entity(decoded)

    is_selection = (len(surface) > 1 and surface[0] == markers.SELECT
                    and surface[1].startswith('item-'))
    if is_selection:
        # Token has the form 'item-<index into kb.items>'.
        item_id = int(surface[1].split('-')[1])
        self.selected_items.add(item_id)
        return self.select(self.kb.items[item_id])

    words = self.naturalize(surface)
    return self.message(self.attach_punct(' '.join(words)))
def process_utterance(self, utterance, stage=None):
    '''
    Return a copy of utterance in which every entity is replaced by one of
    its forms; non-entity tokens are passed through untouched.

    stage: when None the 'canonical' form is used, otherwise the form named
    by self.entity_forms[stage].
    '''
    processed = []
    for token in utterance:
        if not is_entity(token):
            processed.append(token)
        elif stage is None:
            processed.append(self.get_entity_form(token, 'canonical'))
        else:
            # Looked up per entity, so an utterance without entities never
            # touches self.entity_forms.
            processed.append(
                self.get_entity_form(token, self.entity_forms[stage]))
    return processed
def read_utterance(self, tokens, stage=None):
    '''
    Record the node ids of the entities mentioned in tokens, adding graph
    nodes for entities that are not known yet.

    tokens: from batch['encoder/decoder_tokens']; entities are represented as
    (surface_form, (canonical_form, type)), i.e. output of entitylink.
    stage: accepted but not used here.

    Appends one list of node ids (entities only, in order of appearance) to
    self.entities. Returns nothing.
    '''
    linked = [tok[1] for tok in tokens if is_entity(tok)]

    unseen = set()
    for entity in linked:
        if not self.nodes.has(entity):
            unseen.add(entity)
    if unseen:
        self.add_entity_nodes(unseen)

    # Resolve ids only after the new nodes have been added.
    self.entities.append([self.nodes.to_ind(entity) for entity in linked])
def build_vocab(dialogues, special_symbols=None, entity_forms=None):
    '''
    Build a Vocabulary from the tokens of the given dialogues.

    dialogues: dialogues that still hold tokens (is_int must be False).
    special_symbols: extra symbols added after all words; None means none.
    entity_forms: names of the entity forms to add for every entity token;
        the 'graph' form is skipped. None means no forms.

    Prints the vocabulary size and returns the Vocabulary.
    '''
    # None replaces the former mutable [] defaults so that no list object is
    # shared between calls.
    if special_symbols is None:
        special_symbols = []
    if entity_forms is None:
        entity_forms = []

    vocab = Vocabulary(offset=0, unk=True)

    def _add_entity(entity):
        for entity_form in entity_forms:
            # If copy entity embedding from the graph embedding, don't need
            # entity in vocab
            if entity_form != 'graph':
                word = Preprocessor.get_entity_form(entity, entity_form)
                vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turns in dialogue.token_turns:
            for turn in turns:
                for token in chain.from_iterable(turn):
                    if is_entity(token):
                        _add_entity(token)
                    else:
                        vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols)
    # Single pre-formatted argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print('Vocabulary size: %s' % vocab.size)
    return vocab
def receive(self, event):
    '''
    Consume an event from the partner and feed it to the encoder.

    'select' events: if the selected item matches one of ours it is stored
    in self.matched_item and nothing is encoded; otherwise the item is
    encoded as SELECT followed by its entities.
    'message' events: the utterance is preprocessed and its encoding version
    is encoded; an empty message is ignored.

    Raises ValueError for any other event action.
    '''
    # Reset status
    self.sent_entity = False

    # Parse utterance
    if event.action == 'select':
        self.matched_item = self._match(event.data)
        if self.matched_item is not None:
            # Got a match; we're done.
            return
        entity_tokens = [markers.SELECT] + \
            self.env.preprocessor.item_to_entities(
                event.data, self.kb.attributes)
    elif event.action == 'message':
        processed = self.env.preprocessor.process_event(
            event,
            self.kb,
            mentioned_entities=self.mentioned_entities,
            known_kb=False)
        # Empty message. This check has to come before any indexing of the
        # result; a debug print used to index it first and crashed on None.
        if processed is None:
            return
        # Take the encoding version of sequence
        entity_tokens = processed[0]
    else:
        raise ValueError('Unknown event action %s.' % event.action)

    for token in entity_tokens:
        if is_entity(token):
            self.mentioned_entities.add(token[1][0])

    entity_tokens += [markers.EOS]
    self.encode(entity_tokens)
def process_event(self, e, kb, mentioned_entities=None, known_kb=True):
    '''
    Turn an event into a pair of token lists: (encoding, decoding).

    'select': the encoding side is SELECT followed by the item's entities,
    the decoding side is SELECT followed by the item-id entity (id is taken
    with respect to kb).
    'message': the text is tokenized, entity-linked and non-entity tokens
    are passed through normalize_number; both sides start out as equal but
    independent lists. Returns None when no tokens remain.

    Raises ValueError for any other action.
    '''
    if e.action == 'select':
        # Convert an item to item-id (wrt to the speaker)
        item_id = self.get_item_id(kb, e.data)
        # We use the entities to represent the item during encoding and
        # item-id during decoding
        encoding = [markers.SELECT] + self.item_to_entities(
            e.data, kb.attributes)
        decoding = [markers.SELECT, item_to_entity(item_id)]
        return (encoding, decoding)

    if e.action != 'message':
        raise ValueError('Unknown event action.')

    # Lower, tokenize, link entity
    linked = self.lexicon.link_entity(
        tokenize(e.data),
        kb=kb,
        mentioned_entities=mentioned_entities,
        known_kb=known_kb)
    entity_tokens = [
        tok if is_entity(tok) else normalize_number(tok) for tok in linked
    ]
    if not entity_tokens:
        return None
    # NOTE: have two copies because we might change it given
    # decoding/encoding
    return (entity_tokens, list(entity_tokens))
def text_to_int(self, utterance, stage=None):
    '''
    Convert an utterance to a list of integer ids.

    With a stage, self.setting[stage] decides whether entities are indexed
    through the entity map (shifted by the vocabulary size) or, like every
    other token, through the vocabulary. Without a stage the entity map is
    always used and the preprocessor is called without a stage.
    '''
    if stage is None:
        use_entity_map = True
        tokens = self.preprocessor.process_utterance(utterance)
    else:
        use_entity_map = self.setting[stage]
        tokens = self.preprocessor.process_utterance(utterance, stage)

    if not use_entity_map:
        return [self.vocab.to_ind(tok) for tok in tokens]

    # Entity ids live right after the vocabulary ids.
    offset = self.vocab.size
    ids = []
    for tok in tokens:
        if is_entity(tok):
            ids.append(self.entity_map.to_ind(tok) + offset)
        else:
            ids.append(self.vocab.to_ind(tok))
    return ids
def _process_target_tokens(self, tokens): ''' TODO: for now evaluate against canonical entities. In future, evaluate against actual utterances. ''' targets = [token[1] if is_entity(token) else token for token in tokens] targets = [x for x in targets if x not in (markers.PAD,)] return targets
def _pred_to_token(self, preds):
    '''
    Convert predicted ids to token sequences.

    When the environment uses copying, the predictions first go through
    self.graph.copy_preds. Every entity x in the decoded sequences is then
    wrapped as (x[0], x); other tokens are left as they are.
    '''
    env = self.env
    if env.copy:
        preds = self.graph.copy_preds(preds, env.vocab.size)

    decoded, _ = pred_to_token(preds, env.stop_symbol, env.remove_symbols,
                               env.textint_map)

    wrapped = []
    for sequence in decoded:
        wrapped.append(
            [(tok[0], tok) if is_entity(tok) else tok for tok in sequence])
    return wrapped
def update_ngram_counts(counts, utterance):
    '''
    Add the 1-, 2- and 3-grams of utterance to counts and return counts.

    Entities are replaced by '<type>' placeholders first. counts is indexed
    as counts[n][ngram_tuple]; counts[n] is only touched when the utterance
    actually contains an n-gram of that order.
    '''
    tokens = []
    for x in utterance:
        tokens.append(('<%s>' % x[1][1]) if is_entity(x) else x)

    # Orders are processed one after another: unigrams, bigrams, trigrams.
    for n in (1, 2, 3):
        for start in range(len(tokens) - n + 1):
            counts[n][tuple(tokens[start:start + n])] += 1
    return counts
def eval(self, kb, utterance):
    '''
    Scan an utterance for facts about entities and evaluate them.

    utterance: a list of tokens and entities represented as a tuple
    (surface_form, (canonical_form, type))

    Every entity whose type is not 'item' counts as a fact (inc_fact). Two
    patterns are recognised around it:
      - "<number> ent1 and ent2" -> eval_joint on those four tokens
      - "<number> ent"           -> eval_single on those two tokens
    A pattern directly followed by "in those" is counted as a coreference
    instead; anything that does not fit is counted as undecided.
    '''
    N = len(utterance)
    i = 0
    while i < N:
        token = utterance[i]
        if not (is_entity(token) and token[1][1] != 'item'):
            i += 1
            continue

        self.inc_fact()
        if i + 1 < N and utterance[i + 1] == 'and':
            # number ent1 and ent2
            if i - 1 < 0 or i + 3 > N:
                self.inc_undecided()
                i += 1
            else:
                start, end = i - 1, i + 3
                if not is_entity(utterance[i + 2]):
                    self.inc_undecided()
                    # Move on; without this the loop revisited the same
                    # entity forever.
                    i += 1
                elif end + 1 < N and utterance[end:end + 2] == ['in', 'those']:
                    self.inc_coref()
                    i = end + 2
                else:
                    self.eval_joint(kb, utterance[start:end])
                    i = end
        # NOTE(review): the joint pattern above accepts i - 1 == 0 while
        # this one requires i - 1 > 0, so "<number> ent" at the very start
        # of an utterance is counted as undecided. Looks like an
        # off-by-one; left unchanged, confirm the intent.
        elif i - 1 > 0:
            # number ent
            start, end = i - 1, i + 1
            if end + 1 < N and utterance[end:end + 2] == ['in', 'those']:
                self.inc_coref()
                i = end + 2
            else:
                self.eval_single(kb, utterance[start:end])
                i = end
        else:
            self.inc_undecided()
            i += 1
def receive(self, event):
    '''
    Update the agent state from a partner event.

    'select': remember the item of our KB that equals the selected one.
    'message': translate the text to English, link entities, record them as
    mentioned, and update item and entity weights (negatively when the
    utterance contains a negation word). Other actions only reset
    self.sent_entity.
    '''
    self.sent_entity = False

    if event.action == 'select':
        # No early exit: the last equal item wins, as before.
        for item in self.kb.items:
            if item == event.data:
                self.matched_item = item
        return

    if event.action != 'message':
        return

    # handle bilingual. translate into English no matter what source lang is
    translation = translate_client.translate(
        event.data, target_language='en')
    english_text = translation['translatedText']
    entity_tokens = self.lexicon.link_entity(
        tokenize(english_text),
        kb=self.kb,
        mentioned_entities=self.mentioned_entities,
        known_kb=False)

    entities = []
    for token in entity_tokens:
        if is_entity(token):
            self.mentioned_entities.add(token[1][0])
            entities.append(token[1])

    if self.is_question(entity_tokens):
        self.asked_entities = entities

    # Update item weights
    if len(entities) > 0:
        negation_words = ('no', 'none', "don't", 'zero')
        negative = any(tok in negation_words for tok in entity_tokens)
        direct_weight = -10. if negative else 1.

        self.update_item_weights(entities, direct_weight)
        row_entities, col_entities = self.get_related_entity(entities)
        self.update_entity_weights(entities, direct_weight)
        self.update_entity_weights(row_entities, -1. if negative else 2.)
        self.update_entity_weights(col_entities, 1. if negative else 0.5)
def add_utterance(self, agent, utterances, prepend):
    '''
    Append a pair of utterances (indexed 0 and 1) to the dialogue.

    prepend: when true, the entities of the decoder utterance followed by
    the EOE marker are inserted, in place, at the front of that utterance.

    Consecutive utterances of the same agent are grouped into one turn; a
    change of speaker starts a new turn.
    '''
    # Prepend entities to decoder utterances
    if prepend:
        decoder_utterance = utterances[self.DEC]
        prefix = [tok for tok in decoder_utterance if is_entity(tok)]
        prefix.append(markers.EOE)
        decoder_utterance[:0] = prefix

    same_speaker = len(self.agents) > 0 and agent == self.agents[-1]
    if not same_speaker:
        self.agents.append(agent)

    for side in (0, 1):
        if same_speaker:
            # Same agent talking: extend the current turn
            self.token_turns[side][-1].append(utterances[side])
        else:
            self.token_turns[side].append([utterances[side]])
def _process_example(self, ex):
    '''
    Convert example to turn-based dialogue.

    Events are processed in order against the speaker's KB; events that
    yield no utterance are skipped. Canonical forms of the entities in the
    first (encoding) utterance are accumulated and handed to the processing
    of later events.
    '''
    kbs = ex.scenario.kbs
    dialogue = Dialogue(kbs, ex.uuid)

    seen_entities = set()
    for event in ex.events:
        utterances = self.process_event(event, kbs[event.agent],
                                        seen_entities)
        if not utterances:
            continue
        dialogue.add_utterance(event.agent, utterances)
        seen_entities.update(
            tok[1][0] for tok in utterances[0] if is_entity(tok))
    return dialogue
def get_linguistic_template(template_summary_map, utterance):
    '''
    Count the template of an utterance in template_summary_map.

    The template is the utterance with every entity replaced by '<type>',
    stored as a tuple. Both the template's own counter and the 'total'
    counter (which must already exist in the map) are incremented by one.
    Empty utterances are ignored.
    '''
    if len(utterance) == 0:
        return

    key = tuple(
        '<%s>' % get_entity_type(token) if is_entity(token) else token
        for token in utterance)

    if key not in template_summary_map:
        template_summary_map[key] = 0.
    template_summary_map['total'] += 1.
    template_summary_map[key] += 1.
def _process_example(self, ex, wizardkb):
    '''
    Convert example to turn-based dialogue, with an extra wizard KB.

    A KB built from the scenario attributes and wizardkb is appended to the
    scenario's own kbs list (the list is modified in place) before the
    events are processed. Events that yield no utterance are skipped;
    canonical forms of the entities in the first (encoding) utterance are
    accumulated and handed to the processing of later events.
    '''
    scenario_kbs = ex.scenario.kbs
    scenario_attributes = ex.scenario.attributes
    # append wizard kb
    scenario_kbs.append(KB.from_dict(scenario_attributes, wizardkb))

    dialogue = Dialogue(scenario_kbs, ex.uuid)
    seen_entities = set()
    for event in ex.events:
        utterances = self.process_event(event, scenario_kbs[event.agent],
                                        seen_entities)
        if not utterances:
            continue
        dialogue.add_utterance(event.agent, utterances)
        seen_entities.update(
            tok[1][0] for tok in utterances[0] if is_entity(tok))
    return dialogue
def get_stats(chat, agent_id, preprocessor):
    '''
    Collect per-agent statistics for a single chat.

    chat: raw chat, converted with Example.from_dict.
    agent_id: only events of this agent are taken into account.
    preprocessor: provides process_event for 'message' events.

    Returns a dict with the sums of 'num_select', 'num_utterance',
    'num_entity' and the 'SA_*' speech-act counters, the mean of
    'utterance_len', and 'vocab_size' (number of distinct non-entity tokens).
    '''
    ex = Example.from_dict(None, chat)
    kbs = ex.scenario.kbs
    mentioned_entities = set()
    stats = {}
    vocab = set()
    for i, event in enumerate(ex.events):
        if agent_id != event.agent:
            continue
        if event.action == 'select':
            utterance = []
            logstats.update_summary_map(stats, {'num_select': 1})
        elif event.action == 'message':
            utterance = preprocessor.process_event(event, kbs[event.agent],
                                                   mentioned_entities)
            # Skip empty utterances
            if not utterance:
                continue
            else:
                # Keep the first (encoding) version of the utterance
                utterance = utterance[0]
                for token in utterance:
                    if is_entity(token):
                        logstats.update_summary_map(stats, {'num_entity': 1})
                        mentioned_entities.add(token[1][0])
                    else:
                        vocab.add(token)
                # NOTE(review): the nesting of this statement was not
                # recoverable from the source formatting; it is placed in
                # the message branch by analogy with analyze_strategy, so
                # select events do not contribute a length of 0 - confirm.
                logstats.update_summary_map(stats,
                                            {'utterance_len': len(utterance)})
        # A throw-away summary map is passed: only the returned speech act
        # is used here.
        speech_act = get_speech_act(defaultdict(int), event, utterance)
        if speech_act[0] in ('inform', 'ask', 'answer'):
            logstats.update_summary_map(stats, {'SA_' + speech_act[0]: 1})
        logstats.update_summary_map(stats, {'num_utterance': 1})

    # Flatten the summary map entries into plain numbers
    new_stats = {}
    for k in stats:
        if k in ('num_select', 'num_utterance', 'num_entity'):
            new_stats[k] = stats[k]['sum']
        elif k in ('utterance_len', ):
            new_stats[k] = stats[k]['mean']
        elif k.startswith('SA_'):
            new_stats[k] = stats[k]['sum']
    new_stats['vocab_size'] = len(vocab)
    return new_stats
def check_fact(summary_map, tokens, kb):
    '''
    Simple fact checker: each utterance is converted to a list of numbers
    and entities and we assume that the number describes the following
    entities, which will cause some false negatives.

    For every number that is followed by at least one entity, 'correct' is
    recorded in summary_map as 1 when the number equals
    count_kb_entity(kb, entities) and as 0 otherwise. Entities that appear
    before any number are ignored.
    '''
    num_items = len(kb.items)

    # Each claim is (number, [canonical entities that follow it]).
    claims = []
    for token in tokens:
        if not is_entity(token):
            number = to_number(token, num_items)
            if number:
                claims.append((number, []))
            continue
        if claims:
            # Represent entity as its canonical form
            claims[-1][1].append(token[1][0])

    for n, entities in claims:
        if not entities:
            continue
        correct = 1 if n == count_kb_entity(kb, entities) else 0
        logstats.update_summary_map(summary_map, {'correct': correct})
return_entities=True, agent=agent, uuid=scenario_uuid) for c in candidate_annotation: # Entity, Span, Type fout.write(c[1][0] + "\t" + c[0] + "\t" + c[1][1] + "\n") preprocessor = Preprocessor(schema, lexicon, 'canonical', 'canonical', 'canonical') for raw in examples: ex = Example.from_dict(None, raw) kbs = ex.scenario.kbs mentioned_entities = set() for i, event in enumerate(ex.events): if event.action == 'message': utterance = preprocessor.process_event(event, kbs[event.agent], mentioned_entities) # Skip empty utterances if utterance: utterance = utterance[0] for token in utterance: if is_entity(token): span, entity = token entity, type_ = entity # Entity, Span, Type fout.write(entity + "\t" + span + "\t" + type_ + "\n") fout.close()
def is_inform(tokens):
    '''Return True when at least one token is an entity, else False.'''
    return any(is_entity(token) for token in tokens)
def analyze_strategy(all_chats, scenario_db, preprocessor, text_output, lm):
    '''
    Aggregate dialogue statistics over all completed chats.

    all_chats: raw chats, converted with Example.from_dict(scenario_db, raw).
    preprocessor: provides process_event for 'message' events.
    text_output: path of a file that receives one line per message (entities
        replaced by their type), or None to write nothing.
    lm: optional language model with a score(text) method; when falsy the
        returned 'lm_score' is -1.

    Chats without an outcome or with reward 0 are skipped.
    Returns a dict of summary statistics (see the return statement).
    Raises ValueError on an event whose action is neither 'select' nor
    'message'.
    '''
    # NOTE(review): fout is only closed on the normal path; an exception in
    # the loop below leaves the file open.
    fout = open(text_output, 'w') if text_output is not None else None
    speech_act_summary_map = defaultdict(int)
    kb_strategy_summary_map = {}
    dialog_summary_map = {}
    fact_summary_map = {}
    utterance_counts = defaultdict(lambda: defaultdict(int))
    ngram_counts = defaultdict(lambda: defaultdict(int))
    template_summary_map = {'total': 0.}
    speech_act_sequence_summary_map = {'total': 0.}
    alpha_stats = []
    num_items_stats = []
    # The next two accumulators are updated below but are not part of the
    # returned dict.
    num_attrs_mentioned = 0.
    most_mentioned_attrs = 0.
    entity_mention_summary_map = {}

    total_events = 0
    total_dialogues = 0.

    lm_summary_map = {}
    for raw in all_chats:
        ex = Example.from_dict(scenario_db, raw)
        kbs = ex.scenario.kbs

        if ex.outcome is None or ex.outcome["reward"] == 0:
            continue  # skip incomplete dialogues

        total_dialogues += 1.
        dialog = []
        mentioned_entities = set()
        for i, event in enumerate(ex.events):
            if event.action == 'select':
                utterance = []
            elif event.action == 'message':
                utterance = preprocessor.process_event(event, kbs[event.agent],
                                                       mentioned_entities)
                # Skip empty utterances
                if not utterance:
                    continue
                else:
                    # Keep the first (encoding) version of the utterance
                    utterance = utterance[0]
                    for token in utterance:
                        if is_entity(token):
                            mentioned_entities.add(token[1][0])
                    logstats.update_summary_map(
                        dialog_summary_map,
                        {'utterance_length': len(utterance)})
                    check_fact(fact_summary_map, utterance, kbs[event.agent])
                    if lm:
                        logstats.update_summary_map(lm_summary_map, {
                            'score':
                            lm.score(' '.join(entity_to_type(utterance)))
                        })
                    update_ngram_counts(ngram_counts, utterance)
                    if fout:
                        fout.write('%s\n' %
                                   (' '.join(entity_to_type(utterance))))
            else:
                raise ValueError('Unknown event action %s.' % event.action)

            # Reached for select events and non-empty messages
            total_events += 1
            speech_act = get_speech_act(speech_act_summary_map, event,
                                        utterance)
            get_linguistic_template(template_summary_map, utterance)
            entities = [x[1] for x in utterance if is_entity(x)]
            dialog.append((event.agent, speech_act, entities, utterance))

        # Per-dialogue statistics
        get_dialog_stats(dialog_summary_map, utterance_counts, dialog)
        get_speech_act_histograms(speech_act_sequence_summary_map, dialog)
        get_entity_mention(entity_mention_summary_map, dialog, kbs)

        orders, mentioned_attrs, most_mentioned_label = get_kb_strategy(
            kbs, dialog)
        orders = tuple(orders)
        most_mentioned_attrs += alpha_labels_to_values[most_mentioned_label]

        # Strategies are bucketed by length, then counted per exact order
        if len(orders) not in kb_strategy_summary_map.keys():
            kb_strategy_summary_map[len(orders)] = {}
        if orders not in kb_strategy_summary_map[len(orders)].keys():
            kb_strategy_summary_map[len(orders)][orders] = 0.0
        kb_strategy_summary_map[len(orders)][tuple(orders)] += 1.0

        alphas = ex.scenario.alphas
        # NOTE(review): both operands are ints, so under Python 2 this is a
        # floor division - confirm whether a ratio was intended.
        num_attrs_mentioned += len(orders) / len(alphas)

        first_mentioned_label = NO_ALPHA_MENTION
        if len(orders) > 0:
            first_mentioned_label = orders[0]

        if len(mentioned_attrs) > 0:
            first_mentioned_type, first_mentioned_attr, first_agent = mentioned_attrs[
                0]
            update_item_stats(num_items_stats, first_mentioned_type,
                              first_mentioned_attr, kbs[first_agent])

            # NOTE(review): nested here because first_agent is only bound in
            # this branch; the nesting was not recoverable from the source
            # formatting - confirm.
            if first_mentioned_label != NO_ALPHA_MENTION:
                update_alpha_stats(alpha_stats, kbs[first_agent],
                                   first_mentioned_label)
            # print "First mentioned attribute alpha:", first_mentioned, alpha_labels_to_values[first_mentioned]

    if fout:
        fout.close()
    # Summarize stats
    # NOTE(review): total is 0.0 when no event was counted, which makes the
    # divisions by total below fail.
    total = float(total_events)
    kb_strategy_totals = {
        k1: sum(v2 for v2 in v1.values())
        for k1, v1 in kb_strategy_summary_map.items()
    }
    dialog_stats = {
        k: dialog_summary_map[k]['mean']
        for k in dialog_summary_map
    }
    dialog_stats['entity_type_token_ratio'] = dialog_summary_map[
        'num_entity_type_per_dialog']['sum'] / float(
            dialog_summary_map['num_entity_per_dialog']['sum'])

    # Unigram keys are 1-tuples; unwrap them
    unigram_counts = {k[0]: v for k, v in ngram_counts[1].iteritems()}
    dialog_stats['vocab_size'] = len(unigram_counts)
    dialog_stats['unigram_entropy'] = count_to_entropy(unigram_counts)
    # Share of events whose speech-act key has more than one element
    multi_speech_act = sum([
        speech_act_summary_map[k] for k in speech_act_summary_map
        if len(k) > 1
    ]) / total

    return {
        'speech_act': {
            k: speech_act_summary_map[k] / total
            for k in speech_act_summary_map.keys()
        },
        'kb_strategy': {
            k1: {
                ", ".join(k2): v2 / kb_strategy_totals[k1]
                for k2, v2 in v1.items()
            }
            for k1, v1 in kb_strategy_summary_map.items()
        },
        'dialog_stats': dialog_stats,
        'lm_score': -1 if not lm else lm_summary_map['score']['mean'],
        'utterance_counts': utterance_counts,
        'ngram_counts': ngram_counts,
        'linguistic_templates': template_summary_map,
        'speech_act_sequences': speech_act_sequence_summary_map,
        'correct': fact_summary_map['correct']['mean'],
        'entity_mention': {
            k: np.mean(v)
            for k, v in entity_mention_summary_map['first'].iteritems()
        },
        'multi_speech_act': multi_speech_act,
        'alpha_stats': alpha_stats,
        'num_items_stats': num_items_stats
    }
def get_entity(x):
    '''Return the entities contained in x, in their original order.'''
    found = []
    for element in x:
        if is_entity(element):
            found.append(element)
    return found
def entity_to_type(tokens):
    '''
    Return tokens with every entity replaced by a '<type>' placeholder built
    from x[1][1]; other tokens are kept as they are.
    '''
    converted = []
    for x in tokens:
        if is_entity(x):
            converted.append('<%s>' % x[1][1])
        else:
            converted.append(x)
    return converted
def get_entity_type(entity):
    '''
    Return the type of an entity given as (surface, (canonical, type)), or
    None when the argument is not an entity.
    '''
    if is_entity(entity):
        # Strict unpacking: both levels must have exactly two elements.
        _surface, (_canonical, kind) = entity
        return kind
    return None
def get_entities(self, utterance):
    '''
    Map every token of utterance to an entity index.

    Entities are looked up in self.entity_map; all other tokens become -1.
    The result has one entry per token of utterance.
    '''
    # The comprehension used to iterate an undefined name (`tokens`) and
    # raised NameError; it now walks the utterance argument.
    return [
        self.entity_map.to_ind(token) if is_entity(token) else -1
        for token in utterance
    ]
def _has_entity(self, tokens): for token in tokens: if is_entity(token): return True return False
def realize_entity(self, entity_tokens):
    '''
    Return entity_tokens with every entity replaced by the result of
    self._realize_entity(token[1]); other tokens are kept as they are.
    '''
    realized = []
    for token in entity_tokens:
        if is_entity(token):
            realized.append(self._realize_entity(token[1]))
        else:
            realized.append(token)
    return realized