Code example #1
from itertools import chain

# Vocabulary, Preprocessor and is_entity are defined elsewhere in the package (not shown).
def build_vocab(dialogues, special_symbols=[], entity_forms=[]):
    vocab = Vocabulary(offset=0, unk=True)

    def _add_entity(entity):
        for entity_form in entity_forms:
            # If the entity embedding is copied from the graph embedding, the
            # entity does not need to be in the vocab
            if entity_form != 'graph':
                word = Preprocessor.get_entity_form(entity, entity_form)
                vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turns in dialogue.token_turns:
            for turn in turns:
                for token in chain.from_iterable(turn):
                    if is_entity(token):
                        _add_entity(token)
                    else:
                        vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols)
    print 'Vocabulary size:', vocab.size
    return vocab
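
For reference, the short sketch below shows one way build_vocab could be driven. ToyDialogue is a hypothetical stand-in for the real Dialogue class (exposing only the fields read above), and 'canonical' is assumed to be one of the forms accepted by Preprocessor.get_entity_form; the call pattern, not the exact data, is the point.

# Minimal usage sketch; ToyDialogue and the sample tokens are illustrative only.
class ToyDialogue(object):
    is_int = False  # build_vocab expects token (not integer) turns

    def __init__(self, token_turns):
        # token_turns nesting: turns -> turn -> utterances -> tokens
        self.token_turns = token_turns

hiking = ('hiking', ('hiking', 'hobby'))  # (surface_form, (canonical_form, type))
dialogues = [ToyDialogue([[[['i', 'like', hiking], ['do', 'you', '?']]]])]
vocab = build_vocab(dialogues, special_symbols=['<eos>'], entity_forms=['canonical'])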
Code example #2
def build_schema_mappings(schema, num_items):
    entity_map = Vocabulary(unk=True)
    for type_, values in schema.values.iteritems():
        entity_map.add_words(((value.lower(), type_) for value in values))
    # Add item nodes
    for i in xrange(num_items):
        entity_map.add_word(item_to_entity(i)[1])
    # Add attr nodes
    #for attr in schema.attributes:
    #    entity_map.add_word((attr.name.lower(), 'attr'))

    relation_map = Vocabulary(unk=False)
    attribute_types = schema.get_attributes()  # {attribute_name: value_type}
    relation_map.add_words((a.lower() for a in attribute_types.keys()))
    relation_map.add_word('has')
    # Inverse relation
    relation_map.add_words([inv_rel(r) for r in relation_map.word_to_ind])

    return entity_map, relation_map
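
As a rough illustration, build_schema_mappings only touches schema.values and schema.get_attributes(); the ToySchema below is a hypothetical stand-in with that shape, and item_to_entity / inv_rel are assumed to be provided by the surrounding package.

# Minimal usage sketch; ToySchema mimics only the parts of the real Schema used above.
class ToySchema(object):
    values = {'hobby': ['Hiking', 'Cooking'], 'company': ['Google', 'Apple']}

    def get_attributes(self):
        # {attribute_name: value_type}
        return {'Hobby': 'hobby', 'Company': 'company'}

entity_map, relation_map = build_schema_mappings(ToySchema(), num_items=2)
# entity_map: (value, type) pairs such as ('hiking', 'hobby'), plus one node per item
# relation_map: 'hobby', 'company', 'has' and their inverse relations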
Code example #3
import numpy as np
from collections import defaultdict

# Vocabulary, is_entity, inv_rel and item_to_str are defined elsewhere in the package (not shown).
class Graph(object):
    '''
    Maintain a (dynamic) knowledge graph of the agent.
    '''
    metadata = None

    def __init__(self, kb):
        assert Graph.metadata is not None
        self.kb = kb
        self.reset()

    def reset(self):
        '''
        Clear all information from dialogue history and only keep KB information.
        This is required during training when we go through one dialogue multiple times.
        '''
        # Map each node in the graph to an integer
        self.nodes = Vocabulary(unk=False)
        # All paths in the KB; each path is a 3-tuple (node_id, edge_id, node_id)
        # NOTE: The first path is always a padding path
        self.paths = [Graph.metadata.PATH_PAD]
        # Read information from the KB to fill in nodes and paths
        self.num_items = len(self.kb.items)
        self.load_kb(self.kb)

        # Input data to feed_dict
        self.node_ids = np.arange(self.nodes.size, dtype=np.int32)
        self.entity_ids = np.array(
            [Graph.metadata.entity_map.to_ind(self.nodes.to_word(i))
             for i in xrange(self.nodes.size)],
            dtype=np.int32)
        self.paths = np.array(self.paths, dtype=np.int32)
        self.feats = self.get_features()
        self.node_paths = self.get_node_paths()

        # Entity/token sequence in the dialogue
        self.entities = []

    def get_node_paths(self):
        node_paths = []
        for node_id in self.node_ids:
            # Skip the first padding path
            paths = [
                path_id for path_id, path in enumerate(self.paths)
                if path_id != Graph.metadata.PAD_PATH_ID and path[0] == node_id
            ]
            node_paths.append(np.array(paths, dtype=np.int32))
        return node_paths

    def get_input_data(self):
        '''
        Return feed_dict data to the GraphEmbed model.
        '''
        assert self.node_ids.shape[0] == self.feats.shape[0]
        return (self.node_ids, self.entity_ids, self.paths, self.feats)

    def _add_path(self, node1, relation, node2):
        node1_id = self.nodes.to_ind(node1)
        node2_id = self.nodes.to_ind(node2)
        rel = Graph.metadata.relation_map.to_ind(relation)
        irel = Graph.metadata.relation_map.to_ind(inv_rel(relation))
        self.paths.append((node1_id, rel, node2_id))
        self.paths.append((node2_id, irel, node1_id))

    def load_kb(self, kb):
        '''
        Construct 3 types of nodes: item, entity, attribute
        and 2 types of paths: (item, attr_name, entity) and (attr, has, entity)
        '''
        attr_ents = defaultdict(set)  # Entities of each attribute
        for i, item in enumerate(kb.items):
            # Item nodes
            item_node = (item_to_str(i), 'item')
            #item_name = item_to_str(i)
            #item_node = (item_name, item_name)
            self.nodes.add_word(item_node)
            attrs = sorted(item.items(), key=lambda x: x[0])
            for attr_name, value in attrs:
                type_ = Graph.metadata.attribute_types[attr_name]
                attr_name = attr_name.lower()
                value = value.lower()
                # Attribute nodes
                attr_node = (attr_name, 'attr')
                #attr_node = (attr_name, attr_name)
                self.nodes.add_word(attr_node)
                # Entity nodes
                entity_node = (value, type_)
                self.nodes.add_word(entity_node)
                # Path: item has_attr entity
                self._add_path(item_node, attr_name, entity_node)
                attr_ents[attr_node].add(entity_node)
        # Path: attr has entity
        for attr_node, ent_set in attr_ents.iteritems():
            for entity_node in ent_set:
                self._add_path(attr_node, 'has', entity_node)
        self.paths = np.array(self.paths, dtype=np.int32)

    def read_utterance(self, tokens, stage=None):
        '''
        Map entities to node ids and tokens to -1. Add new nodes if needed.
        tokens: from batch['encoder/decoder_tokens']; entities are represented
        as (surface_form, (canonical_form, type)), i.e. output of entitylink.
        '''
        entities = [x[1] for x in tokens if is_entity(x)]
        new_entities = set([x for x in entities if not self.nodes.has(x)])
        if len(new_entities) > 0:
            self.add_entity_nodes(new_entities)
        node_ids = [self.nodes.to_ind(x[1]) for x in tokens if is_entity(x)]
        self.entities.append(node_ids)

    def _update_nodes(self, entities):
        self.nodes.add_words(entities)
        self.node_ids = np.arange(self.nodes.size, dtype=np.int32)

    def _update_feats(self, entities):
        # degree=0, node_type=entity type
        feats = [[0, self._node_type(x)] for x in entities]
        new_feat_vec = self.get_feat_vec(feats)
        self.feats = np.concatenate((self.feats, new_feat_vec), axis=0)

    def _update_entity_ids(self, entities):
        self.entity_ids = np.concatenate(
            (self.entity_ids,
             [Graph.metadata.entity_map.to_ind(entity) for entity in entities]),
            axis=0)

    def _update_node_paths(self, entities):
        '''
        New entities map to the padded path.
        '''
        for _ in entities:
            self.node_paths.append(np.array([Graph.metadata.PAD_PATH_ID]))

    def add_entity_nodes(self, entities):
        # Paths do not change, no need to update
        self._update_nodes(entities)
        self._update_entity_ids(entities)
        self._update_feats(entities)
        self._update_node_paths(entities)

    def get_entity_list(self):
        '''
        Return a list of the unique entities mentioned in the last n utterances.
        '''
        if Graph.metadata.entity_hist_len > 0:
            last_n = min(Graph.metadata.entity_hist_len, len(self.entities))
            return list(
                set([
                    e for entities in self.entities[-1 * last_n:]
                    for e in entities
                ]))
        else:
            entities = self.entities
            if len(entities) == 0:
                return []
            if len(entities[-1]) == 0:
                if len(entities) < 2:
                    return []
                return list(set(self.entities[-2]))
            else:
                return list(set(self.entities[-1]))

    def _node_type(self, node):
        # Use the fine-grained category for item and attr nodes
        name, type_ = node
        return name if type_ == 'item' or type_ == 'attr' else type_
        #return type_

    def get_features(self):
        nodes = [self.nodes.to_word(i) for i in xrange(self.nodes.size)]
        # For entity nodes, start the degree at -1 so that the edge incident to
        # the attr node is excluded
        feats = [[0, self._node_type(node)]
                 if node[1] == 'item' or node[1] == 'attr'
                 else [-1, self._node_type(node)]
                 for node in nodes]
        # Compute degree of each node
        for path in self.paths:
            n1, r, n2 = path
            feats[n1][0] += 1
        return self.get_feat_vec(feats)

    @classmethod
    def degree_feat_size(cls):
        return 6

    def _bin_degree(self, degree):
        # NOTE: we consider degree only for attr and entity nodes (only count edges connected
        # to item nodes).
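        # Example, assuming num_items == 10: degree 0 -> bin 0, degree 2 -> bin 1,
        # degree 3 -> bin 2, degree 7 -> bin 3, degree 9 -> bin 4, degree 10 -> bin 5
        # (six bins, matching degree_feat_size).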
        assert degree <= self.num_items
        p = degree / float(self.num_items)
        if p == 0:
            return 0
        if p < 0.25:
            return 1
        if p >= 0.25 and p < 0.5:
            return 2
        if p >= 0.5 and p < 0.75:
            return 3
        if p >= 0.75 and p < 1:
            return 4
        if p == 1:
            return 5

    def _get_index(self, feat_name, feat_value):
        offset, size = Graph.metadata.feat_inds[feat_name]
        assert feat_value < size
        return offset + feat_value

    def get_feat_vec(self, raw_feats):
        '''
        Input: a list of features [degree, node_type] for each node
        Output: one-hot encoded numpy feature matrix
        '''
        f = np.zeros([len(raw_feats), Graph.metadata.feat_size])

        for i, (degree, node_type) in enumerate(raw_feats):
            # Don't consider degree of item nodes (number of attrs, same for all items)
            if not node_type.startswith('item'):
                f[i][self._get_index('rel_degree', self._bin_degree(degree))] = 1
                f[i][self._get_index('degree', degree)] = 1
            f[i][self._get_index('node_type',
                                 Graph.metadata.node_types.to_ind(node_type))] = 1

        return f
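
To tie the pieces together, the sketch below shows one plausible call pattern for Graph. Both metadata and kb are assumed to be built elsewhere (the metadata carrying, among other things, the entity_map and relation_map from build_schema_mappings plus the feature layout), so this is an illustration of the interface rather than the project's actual training loop.

# Minimal usage sketch; `metadata` and `kb` are hypothetical objects assumed to
# provide the fields read by the class above (entity_map, relation_map,
# attribute_types, PATH_PAD, PAD_PATH_ID, feat_inds, feat_size, node_types,
# entity_hist_len; kb.items is a list of {attribute_name: value} dicts).
Graph.metadata = metadata            # shared by all Graph instances
graph = Graph(kb)                    # builds nodes, paths and features from the KB

# Inputs for the graph embedding model's feed_dict.
node_ids, entity_ids, paths, feats = graph.get_input_data()

# Register a tokenized utterance; entity tokens are (surface, (canonical, type)).
graph.read_utterance(['i', 'like', ('hiking', ('hiking', 'hobby'))])
recent_nodes = graph.get_entity_list()  # node ids of recently mentioned entities

# Clear the dialogue history, e.g. when revisiting the same dialogue in training.
graph.reset()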