Exemple #1
0
 def make_birel_matrix(self, relation='children'):
     """Build a tensor of (node id, related node id) pairs for every graph.

     For each graph structure, every node and each node it is related to
     through ``relation`` contribute one ``[nid, rel_nid]`` pair.

     Args:
         relation: Name of the attribute on each graph structure that maps
             a node id to an iterable of related node ids.

     Returns:
         An int32 array of shape ``(len(self.graph_structs),
         self._max_nodes, self._max_bi_relations, 2)``. Slots without a pair
         stay zero. Nodes past the first ``self._max_nodes`` and relations
         past the first ``self._max_bi_relations`` of a node are dropped.
     """
     max_nodes = self._max_nodes
     max_relations = self._max_bi_relations
     birel = np.zeros(
         (len(self.graph_structs), max_nodes, max_relations, 2),
         dtype='int32')
     for i, gs in enumerate(self.graph_structs):
         for j, nid in enumerate(gs.graph.nodes):
             if j >= max_nodes:
                 # The matrix has no row for this node or any later one;
                 # stop instead of relying on IndexError for truncation.
                 break
             for k, rel_nid in enumerate(getattr(gs, relation)[nid]):
                 if k >= max_relations:
                     # Remaining relations of this node do not fit.
                     break
                 birel[i, j, k, :] = [nid, rel_nid]
     return birel
Exemple #2
0
 def make_vocabulary(self):
     """Populate and return ``self.word2ind`` from the tokens of all graphs.

     If ``self.word2ind`` already holds more than one entry it is returned
     untouched. Otherwise every node token is collected and split into
     "constant" tokens (nodes whose ``'type'`` label equals ``'constant'``)
     and "special" tokens (all other nodes). Each token is then looked up in
     ``self.word2ind``: special tokens first, then constants, each group in
     sorted order.

     Returns:
         ``self.word2ind``.

     Raises:
         AssertionError: If ``'<unk>'`` is among the collected tokens.
     """
     if self.word2ind is not None and len(self.word2ind) > 1:
         logging.info('word2ind already exists ({0} entries). Reusing it. Some entries are: {1}'.format(
             len(self.word2ind),
             list(self.word2ind.items())[:10]))
         return self.word2ind
     logging.info('word2ind does not exist. Creating it.')
     counter = Counter()
     constants = set()
     special = set()
     for gs in self.graph_structs:
         graph = gs.graph
         for nid in graph.nodes:
             token = get_node_token(graph, nid)
             counter[token] += 1
             if get_label(graph, nid, 'type') == 'constant':
                 constants.add(token)
             else:
                 special.add(token)
     logging.info('Most common 10 tokens: {0}'.format(counter.most_common(10)))
     special = sorted(special)
     logging.info('Got {0} special tokens: {1}'.format(len(special), special))
     constants = sorted(constants)
     logging.info('Got {0} constant tokens. Some of them are: {1}'.format(
         len(constants), constants[:10]))
     vocab = special + constants
     assert '<unk>' not in vocab
     for token in vocab:
         # The lookup result is intentionally discarded. Presumably
         # word2ind is a defaultdict-like mapping that assigns a fresh index
         # on first access -- TODO confirm against where it is created.
         self.word2ind[token]
     logging.info('word2ind created. Some entries are: {0}'.format(
         list(self.word2ind.items())[:10]))
     return self.word2ind
Exemple #3
0
 def make_treelet_matrix(self, relation='treelet_predicate'):
     """Encode every treelet of every node as three vocabulary indices.

     Args:
         relation: Name of the attribute on each graph structure that maps
             a node id to an iterable of ``(node id, node id)`` pairs.

     Returns:
         An int32 array of shape ``(len(self.graph_structs),
         self._max_nodes, self._max_treelets, 3)``. Entry ``[i, j, k]``
         holds the ``self.word2ind`` values for the token of the j-th node
         of graph i followed by the tokens of the two nodes in its k-th
         pair. Unused slots stay zero.
     """
     shape = (len(self.graph_structs), self._max_nodes, self._max_treelets, 3)
     matrix = np.zeros(shape, dtype='int32')
     for graph_pos, struct in enumerate(self.graph_structs):
         for node_pos, node in enumerate(struct.graph.nodes):
             head_token = get_node_token(struct.graph, node)
             pairs = getattr(struct, relation)[node]
             for slot, (first, second) in enumerate(pairs):
                 first_token = get_node_token(struct.graph, first)
                 second_token = get_node_token(struct.graph, second)
                 # Look the tokens up in head, first, second order.
                 head_ind = self.word2ind[head_token]
                 first_ind = self.word2ind[first_token]
                 second_ind = self.word2ind[second_token]
                 matrix[graph_pos, node_pos, slot, :] = [
                     head_ind, first_ind, second_ind]
     return matrix
Exemple #4
0
 def make_vocabulary(self):
     """Populate ``self.word2ind`` from all graph tokens and draw embeddings.

     Every node token is collected and split into "constant" tokens (nodes
     whose ``'type'`` label equals ``'constant'``) and "special" tokens (all
     other nodes). Each token is then looked up in ``self.word2ind``:
     special tokens first, then constants, each group in sorted order.
     Finally ``self.word2emb`` is set to a uniformly random array with one
     row per ``self.word2ind`` entry and two columns.

     Returns:
         ``self.word2ind``.

     Raises:
         AssertionError: If ``'<unk>'`` is among the collected tokens.
     """
     counter = Counter()
     constants = set()
     special = set()
     for gs in self.graph_structs:
         graph = gs.graph
         for nid in graph.nodes:
             token = get_node_token(graph, nid)
             counter[token] += 1
             if get_label(graph, nid, 'type') == 'constant':
                 constants.add(token)
             else:
                 special.add(token)
     logging.info('Most common 10 tokens: {0}'.format(
         counter.most_common(10)))
     special = sorted(special)
     logging.info('Got {0} special tokens: {1}'.format(
         len(special), special))
     constants = sorted(constants)
     logging.info('Got {0} constant tokens. Some of them are: {1}'.format(
         len(constants), constants[:10]))
     vocab = special + constants
     assert '<unk>' not in vocab
     for token in vocab:
         # The lookup result is intentionally discarded. Presumably
         # word2ind is a defaultdict-like mapping that assigns a fresh index
         # on first access -- TODO confirm against where it is created.
         self.word2ind[token]
     self.word2emb = np.random.uniform(size=(len(self.word2ind), 2))
     return self.word2ind
Exemple #5
0
 def make_node_inds(self):
     """Return the vocabulary index of every node, one row per graph.

     Returns:
         A float32 array of shape ``(len(self.graph_structs),
         self._max_nodes)``. For graph ``i`` and node id ``nid``, entry
         ``[i, nid]`` is ``self.word2ind`` of that node's token. The column
         is the node id itself, not the node's position in iteration order.
         Columns that are never written stay zero.
     """
     num_graphs = len(self.graph_structs)
     indices = np.zeros((num_graphs, self._max_nodes), dtype='float32')
     for row, struct in enumerate(self.graph_structs):
         for node in struct.graph.nodes:
             token = get_node_token(struct.graph, node)
             indices[row, node] = self.word2ind[token]
     return indices