def process_utterance(self, utterance, stage=None):
    '''
    Input: utterance is a list of tokens; stage is 'encoding', 'decoding', or 'target'.
    Output: stage is usually provided. Based on the combination of model_type and
    stage, we choose whether or not to summarize the utterance: models with "sum"
    summarize it down to selected keywords, models with "seq" keep the full sequence.
    '''
    if stage is None:
        return [self.get_entity_form(x, 'canonical') if is_entity(x) else x
                for x in utterance]
    if stage == 'encoding':
        summary = self.summarize(utterance) if self.model in ("sum2sum", "sum2seq") else utterance
    elif stage in ('decoding', 'target'):
        if self.model == "sum2sum":
            summary = self.summarize(utterance)
        elif self.model == "sum2seq":
            summary = self.summarize(utterance)
            summary.append(markers.END_SUM)
            summary.extend(utterance)
        else:
            summary = utterance
    return [self.get_entity_form(x, self.entity_forms[stage]) if is_entity(x) else x
            for x in summary]
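# A minimal standalone sketch of the sum/seq dispatch above. summarize() and
# the END_SUM string are stand-ins, not the repo's implementations:
END_SUM = '</sum>'

def summarize(utterance, keywords={'hiking', 'piano'}):
    # toy keyword summarizer for illustration only
    return [t for t in utterance if t in keywords]

def stage_tokens(utterance, model, stage):
    if stage == 'encoding':
        return summarize(utterance) if model in ('sum2sum', 'sum2seq') else utterance
    # decoding/target: sum2seq emits summary + END_SUM + full sequence
    if model == 'sum2sum':
        return summarize(utterance)
    if model == 'sum2seq':
        return summarize(utterance) + [END_SUM] + utterance
    return utterance

print(stage_tokens(['i', 'like', 'hiking'], 'sum2seq', 'decoding'))
# -> ['hiking', '</sum>', 'i', 'like', 'hiking']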
def __init__(self, model, vocab, temperature=1, max_length=100, cuda=False):
    super(LFSampler, self).__init__(model, vocab, temperature=temperature,
            max_length=max_length, cuda=cuda)
    self.price_actions = map(self.vocab.to_ind,
            ('init-price', 'counter-price', markers.OFFER))
    self.prices = set([id_ for w, id_ in self.vocab.word_to_ind.iteritems()
            if is_entity(w)])
    self.price_list = list(self.prices)
    self.eos = self.vocab.to_ind(markers.EOS)
    # TODO: fix the hard coding
    actions = set([w for w in self.vocab.word_to_ind
            if not (is_entity(w) or w in category_markers or w in sequence_markers
                    or w in (vocab.UNK, '</sum>', '<slot>', '</slot>'))])
    self.actions = map(self.vocab.to_ind, actions)
def send(self):
    # Don't send consecutive utterances with entities
    if self.sent_entity and not self.env.consecutive_entity:
        return None
    if self.matched_item is not None:
        return self.select(self.matched_item)
    # Single decoding attempt; the loop makes it easy to add retries
    for i in xrange(1):
        tokens = self.decode()
        if tokens is not None:
            break
    if tokens is None:
        return None
    self.sent_entity = self._has_entity(tokens)
    for token in tokens:
        if is_entity(token):
            self.mentioned_entities.add(token[1][0])
    if self.env.realizer is None:
        tokens = [x if not is_entity(x) else x[0] for x in tokens]
    else:
        tokens = self.env.realizer.realize_entity(tokens)
    if len(tokens) > 1 and tokens[0] == markers.SELECT and tokens[1].startswith('item-'):
        item_id = int(tokens[1].split('-')[1])
        self.selected_items.add(item_id)
        item = self.kb.items[item_id]
        return self.select(item)
    tokens = self.naturalize(tokens)
    s = self.attach_punct(' '.join(tokens))
    return self.message(s)
def process_utterance(self, utterance, stage=None):
    if stage is None:
        return [self.get_entity_form(x, 'canonical') if is_entity(x) else x
                for x in utterance]
    else:
        return [self.get_entity_form(x, self.entity_forms[stage]) if is_entity(x) else x
                for x in utterance]
def process_utterance(self, utterance, stage=None):
    # Input: utterance is a list of tokens; stage is 'encoding', 'decoding', or 'target'
    if stage is None:
        return [self.get_entity_form(x, 'canonical') if is_entity(x) else x
                for x in utterance]
    else:
        return [self.get_entity_form(x, self.entity_forms[stage]) if is_entity(x) else x
                for x in utterance]
def _treebank_to_liwc_token(self, tokens):
    '''
    In the LIWC dictionary, "'re", "n't" etc. are not separated.
    '''
    new_tokens = []
    for token in tokens:
        if not is_entity(token) and (token.startswith("'") or token == "n't") \
                and len(new_tokens) > 0 and not is_entity(new_tokens[-1]):
            new_tokens[-1] += token
        else:
            new_tokens.append(token)
    return new_tokens
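# Self-contained sketch of the same merge (a hypothetical standalone helper;
# entities are stubbed as tuples so is_entity() is just an isinstance check):
def treebank_to_liwc(tokens, is_entity=lambda x: isinstance(x, tuple)):
    merged = []
    for token in tokens:
        if not is_entity(token) and (token.startswith("'") or token == "n't") \
                and merged and not is_entity(merged[-1]):
            merged[-1] += token
        else:
            merged.append(token)
    return merged

print(treebank_to_liwc(["we", "'re", "sure", "it", "is", "n't"]))
# -> ["we're", 'sure', 'it', "isn't"]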
def combine_repeated_entity(self, entity_tokens):
    is_entity = lambda x: not isinstance(x, basestring)
    prev_entity = None
    max_dist = 1
    cache = []
    combined_entity_tokens = []
    for i, token in enumerate(entity_tokens):
        if is_entity(token):
            if prev_entity is not None and token[0] != prev_entity[0] and \
                    token[1] == prev_entity[1] and len(cache) <= max_dist:
                surface = '%s %s %s' % (prev_entity[0], ' '.join(cache), token[0])
                combined_entity_tokens[-1] = (surface, prev_entity[1])
            else:
                combined_entity_tokens.extend(cache)
                combined_entity_tokens.append(token)
            prev_entity = token
            cache = []
        elif prev_entity is None:
            combined_entity_tokens.append(token)
        else:
            cache.append(token)
    combined_entity_tokens.extend(cache)
    return combined_entity_tokens
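# Worked example for the function above: two mentions with different surface
# forms but the same canonical entity, separated by at most max_dist tokens,
# collapse into one token. Entity tokens here follow the
# (surface_form, (canonical_form, type)) convention used elsewhere in the repo;
# the concrete values are illustrative.
#
#   tokens = [('ucla', ('ucla', 'school')), 'or', ('UCLA', ('ucla', 'school'))]
#   combine_repeated_entity(tokens)
#   # -> [('ucla or UCLA', ('ucla', 'school'))]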
def receive(self, event):
    #self.log.write('receive event:%s\n' % str(event.to_dict()))
    # Reset status
    self.sent_entity = False
    # Parse utterance
    if event.action == 'select':
        self.matched_item = self._match(event.data)
        if self.matched_item is None:
            entity_tokens = [markers.SELECT] + self.env.preprocessor.item_to_entities(event.data, self.kb.attributes)
        else:
            # Got a match; we're done.
            return
    elif event.action == 'message':
        entity_tokens = self.env.preprocessor.process_event(event, self.kb,
                mentioned_entities=self.mentioned_entities, known_kb=False)
        # Empty message
        if entity_tokens is None:
            return
        else:
            # Take the encoding version of the sequence
            entity_tokens = entity_tokens[0]
            print entity_tokens
    else:
        raise ValueError('Unknown event action %s.' % event.action)
    for token in entity_tokens:
        if is_entity(token):
            self.mentioned_entities.add(token[1][0])
    entity_tokens += [markers.EOS]
    self.encode(entity_tokens)
def is_valid_action(self, action_tokens):
    if not action_tokens:
        return False
    if action_tokens[0] in self.price_actions and \
            not (len(action_tokens) > 1 and is_entity(action_tokens[1])):
        return False
    return True
def _add_utterance(self, agent, utterance, lf=None):
    # Same agent talking
    if len(self.agents) > 0 and agent == self.agents[-1]:
        new_turn = False
    else:
        new_turn = True
    utterance = self._insert_markers(agent, utterance, new_turn)
    entities = [x if is_entity(x) else None for x in utterance]
    if lf:
        lf = self._insert_markers(agent, self.lf_to_tokens(self.kb, lf), new_turn)
    else:
        lf = []
    if new_turn:
        self.agents.append(agent)
        role = self.agent_to_role[agent]
        self.roles.append(role)
        self.token_turns.append(utterance)
        self.entities.append(entities)
        self.lfs.append(lf)
    else:
        self.token_turns[-1].extend(utterance)
        self.entities[-1].extend(entities)
        self.lfs[-1].extend(lf)
def send(self):
    tokens = self.generate()
    if tokens is None:
        return None
    self.dialogue.add_utterance(self.agent, list(tokens))
    if len(tokens) > 1 and tokens[0] == markers.OFFER and is_entity(tokens[1]):
        try:
            price = self.builder.get_price_number(tokens[1], self.kb)
            return self.offer({'price': price})
        except ValueError:
            pass
    tokens = self.builder.entity_to_str(tokens, self.kb)
    if len(tokens) > 0:
        if tokens[0] == markers.ACCEPT:
            return self.accept()
        elif tokens[0] == markers.REJECT:
            return self.reject()
        elif tokens[0] == markers.QUIT:
            return self.quit()
    s = self.attach_punct(' '.join(tokens))
    return self.message(s)
def calculate_lengths(self, preds):
    total_len = len(preds)
    # TODO: membership tests don't work with the Marker class yet, so marker
    # tokens are not counted for now.
    #marker_len = len([x for x in preds if x in markers])
    marker_len = 0
    entity_len = len([x for x in preds if is_entity(x)])
    keyword_len = total_len - marker_len - entity_len
    return (total_len, keyword_len, marker_len, entity_len)
def process_event(self, e, kb, mentioned_entities=None, known_kb=True):
    '''
    Convert an event to two lists of tokens and entities, one for encoding
    and one for decoding.
    '''
    if e.action == 'message':
        # Lowercase, tokenize, and link entities
        entity_tokens = self.lexicon.link_entity(tokenize(e.data), kb=kb,
                mentioned_entities=mentioned_entities, known_kb=known_kb)
        entity_tokens = [normalize_number(x) if not is_entity(x) else x
                for x in entity_tokens]
        if entity_tokens:
            # NOTE: keep two copies because the sequence may be changed
            # differently for encoding vs. decoding
            return (entity_tokens, copy.copy(entity_tokens))
        else:
            return None
    elif e.action == 'select':
        # Convert an item to item-id (wrt the speaker)
        item_id = self.get_item_id(kb, e.data)
        # Use the entities to represent the item during encoding
        # and the item-id during decoding
        return ([markers.SELECT] + self.item_to_entities(e.data, kb.attributes),
                [markers.SELECT, item_to_entity(item_id)])
    else:
        raise ValueError('Unknown event action.')
def build_vocab(dialogues, special_symbols=[], entity_forms=[]):
    vocab = Vocabulary(offset=0, unk=True)

    def _add_entity(entity):
        for entity_form in entity_forms:
            # If the entity embedding is copied from the graph embedding,
            # the entity doesn't need to be in the vocab
            if entity_form != 'graph':
                word = Preprocessor.get_entity_form(entity, entity_form)
                vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turns in dialogue.token_turns:
            for turn in turns:
                for token in chain.from_iterable(turn):
                    if is_entity(token):
                        _add_entity(token)
                    else:
                        vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols)
    print('Vocabulary size:', vocab.size)
    return vocab
def log(self, sent_number):
    """
    Log translation to stdout.
    """
    user_utterance = ' '.join([str(x) if is_entity(x) else x for x in self.src_raw])
    output = u'RAW INPUT: {}\n'.format(user_utterance)
    best_pred = self.pred_sents[0]
    best_score = self.pred_scores[0]
    pred_sent = ' '.join([str(x) for x in best_pred])
    output += 'PRED OUTPUT: {}\n'.format(pred_sent)
    # output += "PRED SCORE: {:.4f}\n".format(best_score)
    if self.gold_sent is not None:
        tgt_sent = ' '.join([str(x) for x in self.gold_sent])
        output += u'GOLD: {}\n'.format(tgt_sent)
        # gold score is always 0 because that is the highest possible
        # output += "GOLD SCORE: {:.4f}\n".format(self.gold_score)
    if len(self.pred_sents) > 1:
        output += 'BEST HYP:\n'
        for score, sent in zip(self.pred_scores, self.pred_sents):
            output += "[{:.4f}] {}\n".format(score, sent)
        output += "\n"
    return output
def build_utterance_vocab(dialogues, special_symbols=[], entity_forms=[], except_words=[]):
    vocab = Vocabulary(offset=0, unk=True, except_words=except_words)

    def _add_entity(entity):
        for entity_form in entity_forms:
            word = get_entity_form(entity, entity_form)
            vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turn in dialogue.token_turns:
            for token in turn:
                if is_entity(token):
                    _add_entity(token)
                else:
                    vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols, special=True)
    vocab.finish(size_threshold=10000)
    print('Utterance vocab size:', vocab.size)
    return vocab
def get_policyHistogram(self):
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Average action probabilities over the policy history
    tmpData = np.mean(self.policy_history, axis=0)[0]
    x, w = [], []
    for i in range(len(tmpData)):
        tmp = self.vocab.ind_to_word[i]
        if not is_entity(tmp):
            continue
        name = tmp.canonical.value
        if abs(name) > 10.1:
            continue
        x.append(name)
        w.append(tmpData[i])
    w = w / np.sum(w)
    sns.distplot(x, bins=100, kde=False, hist_kws={'weights': w})
def _tokens_to_event(self, tokens, output_data, semi_event=False):
    if isinstance(tokens, tuple):
        tokens = list(tokens)
        if isinstance(tokens[1], float):
            tokens[1] = CanonicalEntity(type='price', value=tokens[1])
    if semi_event:
        # Convert from scale back to the real price
        if tokens[1] is not None:
            tokens[1] = self.builder.get_price_number(tokens[1], self.kb)
        return tokens
    if isinstance(tokens[0], int):
        tokens[0] = self.env.vocab.to_word(tokens[0])
    if len(tokens) > 1 and tokens[0] == markers.OFFER and is_entity(tokens[1]):
        try:
            price = self.builder.get_price_number(tokens[1], self.kb)
            return self.offer({'price': price}, metadata={"output_data": output_data})
        except ValueError:
            pass
    elif tokens[0] == markers.OFFER:
        assert False
    tokens = self.builder.entity_to_str(tokens, self.kb)
    if len(tokens) > 0:
        if tokens[0] == markers.ACCEPT:
            return self.accept(metadata={"output_data": output_data})
        elif tokens[0] == markers.REJECT:
            return self.reject(metadata={"output_data": output_data})
        elif tokens[0] == markers.QUIT:
            return self.quit(metadata={"output_data": output_data})
    # Drop trailing None tokens before joining
    while len(tokens) > 0 and tokens[-1] is None:
        tokens = tokens[:-1]
    s = self.attach_punct(' '.join(tokens))
    role = self.kb.facts['personal']['Role']
    category = self.kb.facts['item']['Category']
    real_uttr = self.uttr_gen(tokens, role, category)
    return self.message(s, metadata={"output_data": output_data, "real_uttr": real_uttr})
def label_liwc(self, liwc):
    for utterance in self.iter_utterances():
        if utterance.action == 'message':
            tokens = self._treebank_to_liwc_token(utterance.tokens)
            for token in ifilter(lambda x: not is_entity(x), tokens):
                cats = liwc.lookup(token)
                for cat in cats:
                    utterance.categories[cat][token] += 1
def map_prices(self, entity_tokens):
    # NOTE: entities are CanonicalEntities; change to Entity
    entity_tokens = Dialogue.original_price(self.kb, entity_tokens)
    tokens = [str(x.canonical.value) if is_entity(x) else x for x in entity_tokens]
    return tokens
def _process_target_tokens(self, tokens):
    '''
    TODO: for now evaluate against canonical entities.
    In the future, evaluate against actual utterances.
    '''
    targets = [token[1] if is_entity(token) else token for token in tokens]
    #targets = [x for x in targets if x not in (markers.PAD,)]
    return targets
def parse_message(self, event, dialogue_state):
    tokens = self.lexicon.link_entity(event.data)
    tokens = [x.lower() if not is_entity(x) else x for x in tokens]
    utterance = Utterance(raw_text=event.data, tokens=tokens)
    intent = self.classify_intent(utterance, dialogue_state)
    template = self.extract_template(tokens, dialogue_state)
    utterance.lf = LF(intent, titles=self.get_entities(tokens, 'title'))
    utterance.template = template
    return utterance
def parse_message(self, event, dialogue_state):
    tokens = self.lexicon.link_entity(event.data)
    tokens = [x.lower() if not is_entity(x) else x for x in tokens]
    utterance = Utterance(raw_text=event.data, tokens=tokens)
    intent = "placeholder_intent"
    template = self.extract_template(tokens, dialogue_state)
    utterance.lf = LF(intent, topic="placeholder")
    utterance.template = template
    return utterance
def eval(self, kb, utterance):
    '''
    utterance: a list of tokens and entities represented as a tuple
    (surface_form, (canonical_form, type))
    '''
    N = len(utterance)
    i = 0
    while i < N:
        token = utterance[i]
        if is_entity(token) and token[1][1] != 'item':
            self.inc_fact()
            if i + 1 < N and utterance[i + 1] == 'and':
                # number ent1 and ent2
                if i - 1 < 0 or i + 3 > N:
                    self.inc_undecided()
                    i += 1
                else:
                    start, end = i - 1, i + 3
                    if not is_entity(utterance[i + 2]):
                        self.inc_undecided()
                        i = end
                    else:
                        if end + 1 < N and utterance[end:end + 2] == ['in', 'those']:
                            self.inc_coref()
                            i = end + 2
                        else:
                            self.eval_joint(kb, utterance[start:end])
                            i = end
            elif i - 1 > 0:
                # number ent
                start, end = i - 1, i + 1
                if end + 1 < N and utterance[end:end + 2] == ['in', 'those']:
                    self.inc_coref()
                    i = end + 2
                else:
                    self.eval_single(kb, utterance[start:end])
                    i = end
            else:
                self.inc_undecided()
                i += 1
        else:
            i += 1
def parser_stats(self, parsed_dialogues, agent=None):
    stats = {}
    non_entity_vocab = set()
    ents = set()
    stats['intents'] = defaultdict(int)
    intent_utterances = defaultdict(list)
    for dialogue in parsed_dialogues:
        for utterance in dialogue:
            if agent and utterance.agent != agent:
                continue
            if utterance.tokens is not None:
                tokens = [x.canonical.type if is_entity(x) else x
                        for x in utterance.tokens]
                e = [x.surface for x in utterance.tokens if is_entity(x)]
                ents.update(e)
                non_entity_vocab.update(tokens)
            if utterance.lf and utterance.lf.intent != '<start>':
                stats['intents'][utterance.lf.intent] += 1
                if utterance.text is not None:
                    intent_utterances[utterance.lf.intent].append(tokenize(utterance.text))
    stats['non_entity_vocab_size'] = len(non_entity_vocab)
    stats['intent_corpus_perplexity'] = self.intent_sequence_perplexity(intent_utterances)
    self.print_stats(stats, 'parser stats')
    return stats
def var_to_sent(self, variables, vocab=None):
    if not vocab:
        vocab = self.vocab
    sent_ids = variables.data.cpu().numpy()
    pad_id = vocab.to_ind(markers.PAD)
    sent_words = [vocab.to_word(x) for x in sent_ids if x != pad_id]
    sent_strings = [str(x) if is_entity(x) else x for x in sent_words]
    readable_sent = ' '.join(sent_strings)
    return readable_sent
def process_turn(cls, turn):
    '''
    Process entities.
    '''
    if len(turn) == 1 and turn[0] == markers.EOS:
        # NOTE: don't use <> because tokens like that are ignored by the analyzer
        tokens = ['_start_']
    else:
        # Replace each price entity with the placeholder '_price_'
        tokens = ['_price_' if is_entity(x) else x for x in turn]
    return ' '.join(tokens)
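# Standalone sketch of the turn normalization above. The EOS marker and the
# entity test are stand-ins for the repo's markers.EOS / is_entity:
EOS = '</s>'

def normalize_turn(turn, is_entity=lambda x: isinstance(x, tuple)):
    if len(turn) == 1 and turn[0] == EOS:
        return '_start_'  # avoid '<...>' tokens, which the analyzer ignores
    return ' '.join('_price_' if is_entity(x) else x for x in turn)

print(normalize_turn(['i', 'can', 'do', ('60', 'price')]))  # -> 'i can do _price_'
print(normalize_turn([EOS]))                                # -> '_start_'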
def text_to_int(self, utterance, stage=None):
    '''
    Process entities in the utterance based on whether it is used for
    encoding, decoding, or as the ground truth.
    '''
    if stage is not None:
        use_entity_map = self.setting[stage]
        tokens = self.preprocessor.process_utterance(utterance, stage)
        if not use_entity_map:
            return [self.vocab.to_ind(token) for token in tokens]
    else:
        tokens = self.preprocessor.process_utterance(utterance)
    # Entities live in a separate id space on top of the word vocab
    offset = self.vocab.size
    return [self.vocab.to_ind(token) if not is_entity(token)
            else self.entity_map.to_ind(token) + offset
            for token in tokens]
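# Illustration of the shared id space (toy numbers, not the repo's values):
# word tokens map into [0, vocab.size); entities get entity_map ids shifted
# by offset = vocab.size, so the two ranges never collide.
#
#   vocab.size            -> 1000
#   vocab.to_ind('hello') -> 42                       (word id)
#   entity_map.to_ind(e)  -> 7, stored as 7 + 1000 = 1007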
def get_first_price(self, ex):
    agents = {1: None, 0: None}
    for e in ex.events:
        if e.action == 'message':
            for sent_tokens in e.tokens:
                for token in sent_tokens:
                    if agents[1] and agents[0]:
                        return agents
                    # Return at the first mention
                    if is_entity(token):
                        price = token.canonical.value
                        agents[e.agent] = (e.role, price)
                        return agents
    return agents
def extract_template(self, tokens, dialogue_state):
    template = []
    type_count = defaultdict(int)
    for token in tokens:
        if token in self.numbers or token in ('no', 'all'):
            template.append('{number}')
        elif is_entity(token):
            type_ = token.canonical.type
            template.append('{{{0}[{1}]}}'.format(type_, type_count[type_]))
            type_count[type_] += 1
        else:
            template.append(token)
    return template
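# Self-contained sketch of the same placeholder scheme (names are illustrative,
# not the repo's API): entities are stubbed as (surface, type) tuples, and
# repeated types get indexed slots like {title[0]}, {title[1]}.
from collections import defaultdict

def make_template(tokens, numbers=('one', 'two'), is_entity=lambda x: isinstance(x, tuple)):
    template, type_count = [], defaultdict(int)
    for token in tokens:
        if token in numbers or token in ('no', 'all'):
            template.append('{number}')
        elif is_entity(token):
            surface, type_ = token
            template.append('{{{0}[{1}]}}'.format(type_, type_count[type_]))
            type_count[type_] += 1
        else:
            template.append(token)
    return template

print(make_template(['i', 'liked', ('inception', 'title'), 'and', ('avatar', 'title')]))
# -> ['i', 'liked', '{title[0]}', 'and', '{title[1]}']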