def build_vocab(dialogues, special_symbols=[], entity_forms=[]): vocab = Vocabulary(offset=0, unk=True) def _add_entity(entity): for entity_form in entity_forms: # If copy entity embedding from the graph embedding, don't need entity in vocab if entity_form != 'graph': word = Preprocessor.get_entity_form(entity, entity_form) vocab.add_word(word) # Add words for dialogue in dialogues: assert dialogue.is_int is False for turns in dialogue.token_turns: for turn in turns: for token in chain.from_iterable(turn): if is_entity(token): _add_entity(token) else: vocab.add_word(token) # Add special symbols vocab.add_words(special_symbols) print 'Vocabulary size:', vocab.size return vocab
def build_schema_mappings(schema, num_items):
    '''
    Build the graph-node and graph-edge vocabularies from a schema.

    schema: provides .values ({type: [value, ...]}) and .get_attributes()
        ({attribute_name: value_type}).
    num_items: number of KB item nodes to reserve.
    Returns (entity_map, relation_map): entity_map covers (value, type)
    entity nodes plus item nodes; relation_map covers lower-cased attribute
    relations, the 'has' relation, and an inverse for each of them.
    '''
    entity_map = Vocabulary(unk=True)
    # Entity nodes: one (lower-cased value, type) pair per schema value.
    for type_, values in schema.values.iteritems():
        entity_map.add_words((v.lower(), type_) for v in values)
    # Item nodes come after all entity nodes.
    for item_idx in xrange(num_items):
        entity_map.add_word(item_to_entity(item_idx)[1])

    relation_map = Vocabulary(unk=False)
    # One relation per attribute name ({attribute_name: value_type}).
    attribute_types = schema.get_attributes()
    for attr_name in attribute_types.keys():
        relation_map.add_word(attr_name.lower())
    relation_map.add_word('has')
    # Close the relation set under inversion: every relation added so far
    # gets an inverse relation appended at the end.
    inverse_relations = [inv_rel(r) for r in relation_map.word_to_ind]
    relation_map.add_words(inverse_relations)
    return entity_map, relation_map
class Graph(object):
    '''
    Maintain a (dynamic) knowledge graph of the agent.

    Nodes are item / attribute / entity tuples from the agent's KB, plus
    entity nodes added on the fly from the dialogue (read_utterance).
    Paths are (node_id, relation_id, node_id) triples. Numpy arrays
    (node_ids, entity_ids, paths, feats) are the feed_dict inputs to the
    GraphEmbed model.
    '''
    # Shared metadata (entity_map, relation_map, attribute_types, feature
    # layout, PATH_PAD, ...); must be set before any Graph is constructed.
    metadata = None

    def __init__(self, kb):
        assert Graph.metadata is not None
        self.kb = kb
        self.reset()

    def reset(self):
        '''
        Clear all information from dialogue history and only keep KB information.
        This is required during training when we go through one dialogue
        multiple times.
        '''
        # Map each node in the graph to an integer
        self.nodes = Vocabulary(unk=False)
        # All paths in the KB; each path is a 3-tuple (node_id, edge_id, node_id)
        # NOTE: The first path is always a padding path
        self.paths = [Graph.metadata.PATH_PAD]
        # Read information from KB to fill in nodes and paths
        self.num_items = len(self.kb.items)
        self.load_kb(self.kb)
        # Input data to feed_dict
        self.node_ids = np.arange(self.nodes.size, dtype=np.int32)
        # For each local node id, its id in the global entity vocabulary.
        self.entity_ids = np.array([
            Graph.metadata.entity_map.to_ind(self.nodes.to_word(i))
            for i in xrange(self.nodes.size)
        ], dtype=np.int32)
        # load_kb already converts self.paths to an ndarray; this re-cast is
        # a harmless no-op kept for safety.
        self.paths = np.array(self.paths, dtype=np.int32)
        self.feats = self.get_features()
        self.node_paths = self.get_node_paths()
        # Entity/token sequence in the dialogue: one list of node ids per
        # utterance read so far.
        self.entities = []

    def get_node_paths(self):
        '''
        Return, for each node, the array of path ids whose source is that node.
        '''
        node_paths = []
        for node_id in self.node_ids:
            # Skip the first padding path
            paths = [
                path_id for path_id, path in enumerate(self.paths)
                if path_id != Graph.metadata.PAD_PATH_ID and path[0] == node_id
            ]
            node_paths.append(np.array(paths, dtype=np.int32))
        return node_paths

    def get_input_data(self):
        '''
        Return feed_dict data to the GraphEmbed model.
        '''
        # Features must be aligned with nodes one-to-one.
        assert self.node_ids.shape[0] == self.feats.shape[0]
        return (self.node_ids, self.entity_ids, self.paths, self.feats)

    def _add_path(self, node1, relation, node2):
        # Append both the forward path and its inverse, so every edge is
        # traversable in both directions.
        node1_id = self.nodes.to_ind(node1)
        node2_id = self.nodes.to_ind(node2)
        rel = Graph.metadata.relation_map.to_ind(relation)
        irel = Graph.metadata.relation_map.to_ind(inv_rel(relation))
        self.paths.append((node1_id, rel, node2_id))
        self.paths.append((node2_id, irel, node1_id))

    def load_kb(self, kb):
        '''
        Construct 3 types of nodes: item, entity, attribute
        and 2 types of paths: (item, has_attr, entity) and (attr, has, entity).
        '''
        attr_ents = defaultdict(set)  # Entities of each attribute
        for i, item in enumerate(kb.items):
            # Item nodes
            item_node = (item_to_str(i), 'item')
            #item_name = item_to_str(i)
            #item_node = (item_name, item_name)
            self.nodes.add_word(item_node)
            # Sort attributes by name so node/path order is deterministic.
            attrs = sorted(item.items(), key=lambda x: x[0])
            for attr_name, value in attrs:
                type_ = Graph.metadata.attribute_types[attr_name]
                attr_name = attr_name.lower()
                value = value.lower()
                # Attribute nodes
                attr_node = (attr_name, 'attr')
                #attr_node = (attr_name, attr_name)
                self.nodes.add_word(attr_node)
                # Entity nodes
                entity_node = (value, type_)
                self.nodes.add_word(entity_node)
                # Path: item has_attr entity
                self._add_path(item_node, attr_name, entity_node)
                attr_ents[attr_node].add(entity_node)
        # Path: attr has entity
        for attr_node, ent_set in attr_ents.iteritems():
            for entity_node in ent_set:
                self._add_path(attr_node, 'has', entity_node)
        self.paths = np.array(self.paths, dtype=np.int32)

    def read_utterance(self, tokens, stage=None):
        '''
        Record the entities mentioned in one utterance, adding graph nodes
        for entities not seen before.

        tokens: from batch['encoder/decoder_tokens']; entities are
        represented as (surface_form, (canonical_form, type)), i.e. output
        of entitylink. Only entity tokens contribute node ids; plain tokens
        are ignored here.
        '''
        entities = [x[1] for x in tokens if is_entity(x)]
        new_entities = set([x for x in entities if not self.nodes.has(x)])
        if len(new_entities) > 0:
            self.add_entity_nodes(new_entities)
        node_ids = [self.nodes.to_ind(x[1]) for x in tokens if is_entity(x)]
        self.entities.append(node_ids)

    def _update_nodes(self, entities):
        # Register new entity nodes and refresh the node-id range.
        self.nodes.add_words(entities)
        self.node_ids = np.arange(self.nodes.size, dtype=np.int32)

    def _update_feats(self, entities):
        # degree=0, node_type=entity type
        feats = [[0, self._node_type(x)] for x in entities]
        new_feat_vec = self.get_feat_vec(feats)
        self.feats = np.concatenate((self.feats, new_feat_vec), axis=0)

    def _update_entity_ids(self, entities):
        # Extend the local-node -> global-entity mapping for the new nodes.
        self.entity_ids = np.concatenate([
            self.entity_ids,
            [Graph.metadata.entity_map.to_ind(entity) for entity in entities]
        ], axis=0)

    def _update_node_paths(self, entities):
        '''
        New entities map to the padded path.
        '''
        for _ in entities:
            self.node_paths.append(np.array([Graph.metadata.PAD_PATH_ID]))

    def add_entity_nodes(self, entities):
        # Paths do not change, no need to update
        self._update_nodes(entities)
        self._update_entity_ids(entities)
        self._update_feats(entities)
        self._update_node_paths(entities)

    def get_entity_list(self):
        '''
        Return a list of unique entity node ids mentioned in the last n
        utterances (n = Graph.metadata.entity_hist_len). When the history
        length is unset (<= 0), fall back to the last non-empty of the two
        most recent utterances.
        '''
        if Graph.metadata.entity_hist_len > 0:
            last_n = min(Graph.metadata.entity_hist_len, len(self.entities))
            return list(
                set([
                    e for entities in self.entities[-1 * last_n:]
                    for e in entities
                ]))
        else:
            entities = self.entities
            if len(entities) == 0:
                return []
            if len(entities[-1]) == 0:
                if len(entities) < 2:
                    return []
                return list(set(self.entities[-2]))
            else:
                return list(set(self.entities[-1]))

    def _node_type(self, node):
        # Use fine category for item and attr nodes (their name), coarse
        # category (the type) for entity nodes.
        name, type_ = node
        return name if type_ == 'item' or type_ == 'attr' else type_
        #return type_

    def get_features(self):
        '''
        Compute raw [degree, node_type] features for every node and return
        them one-hot encoded via get_feat_vec.
        '''
        nodes = [self.nodes.to_word(i) for i in xrange(self.nodes.size)]
        # For entity node, -1 degree so that it excludes the edge incident
        # to the attr node
        feats = [[0, self._node_type(node)]
                 if node[1] == 'item' or node[1] == 'attr'
                 else [-1, self._node_type(node)]
                 for node in nodes]
        # Compute degree of each node (count of paths starting at it)
        for path in self.paths:
            n1, r, n2 = path
            feats[n1][0] += 1
        return self.get_feat_vec(feats)

    @classmethod
    def degree_feat_size(cls):
        # Number of relative-degree bins produced by _bin_degree.
        return 6

    def _bin_degree(self, degree):
        # NOTE: we consider degree only for attr and entity nodes (only count
        # edges connected to item nodes).
        assert degree <= self.num_items
        # Bin the degree as a fraction of the number of items:
        # 0, (0,0.25), [0.25,0.5), [0.5,0.75), [0.75,1), 1.
        p = degree / float(self.num_items)
        if p == 0:
            return 0
        if p < 0.25:
            return 1
        if p >= 0.25 and p < 0.5:
            return 2
        if p >= 0.5 and p < 0.75:
            return 3
        if p >= 0.75 and p < 1:
            return 4
        if p == 1:
            return 5

    def _get_index(self, feat_name, feat_value):
        # Flatten (feature name, value) into a column index of the feature
        # matrix, using the (offset, size) layout from metadata.
        offset, size = Graph.metadata.feat_inds[feat_name]
        assert feat_value < size
        return offset + feat_value

    def get_feat_vec(self, raw_feats):
        '''
        Input: a list of features [degree, node_type] for each node
        Output: one-hot encoded numpy feature matrix
        '''
        f = np.zeros([len(raw_feats), Graph.metadata.feat_size])
        for i, (degree, node_type) in enumerate(raw_feats):
            # Don't consider degree of item nodes (number of attrs, same for all items)
            if not node_type.startswith('item'):
                f[i][self._get_index('rel_degree', self._bin_degree(degree))] = 1
                f[i][self._get_index('degree', degree)] = 1
            f[i][self._get_index(
                'node_type', Graph.metadata.node_types.to_ind(node_type))] = 1
        return f