Example #1
def train_iteration(opts, assoc_model, trainer, do_sym, log_file):
    """
    Setup where association scores are learned, relation by relation.
    based on `model.macro_loops()`
    :return: full-graph iteration scores
    """
    ep_loss = 0.0
    
    # iterate over relations:
    graphs_order = list(assoc_model.graphs.items())
    # TODO maybe even randomize edge order across relations
    if opts.rand_nodes:
        dy.np.random.shuffle(graphs_order)
    for rel, g in graphs_order:
        # report
        if opts.v > 0:
            timeprint('starting loop over {}'.format(rel))

        if opts.rule_override and rel in SYMMETRIC_RELATIONS and not do_sym:
            timeprint('RELATION OVERRIDDEN')
            continue
        
        # iterate over nodes (each as source + as target):
        node_order = list(range(N))
        if opts.rand_nodes:
            dy.np.random.shuffle(node_order)
        for node in tqdm(node_order):
            if opts.debug and node % 100 != 0:
                continue
            ep_loss += node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source=True)
            ep_loss += node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source=False)

    return ep_loss
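A minimal sketch of an epoch driver around train_iteration, for orientation only; the Adagrad choice and the do_sym schedule are assumptions, not the repository's actual entry point:

# hypothetical driver; assumes `opts`, `assoc_model` and `log_file` are already constructed
import dynet as dy

def run_epochs(opts, assoc_model, log_file, n_epochs=5):
    trainer = dy.AdagradTrainer(assoc_model.model)
    losses = []
    for ep in range(n_epochs):
        do_sym = (ep == n_epochs - 1)  # assumption: handle symmetric relations only on the last pass
        losses.append(train_iteration(opts, assoc_model, trainer, do_sym, log_file))
    return losses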
Example #2
    def __init__(self, graphs, embeddings, mode=TRANSLATIONAL_EMBED_MODE, dropout=0.0, model_path=None):
        """
        :param graphs: dictionary of <relation:CSR-format graph>s, node-aligned
        :param embeddings: list of numpy array embeddings, indices aligned to nodes
        :param mode: mode of calculating association score, options: {}
        """.format(MODES_STR)
        # input validation
        graph_sizes = list(set([g.shape[0] for g in list(graphs.values())]))
        assert len(graph_sizes) == 1
        assert len(embeddings) == graph_sizes[0], '{} != {}'.format(len(embeddings), graph_sizes[0])
        
        # raw members
        self.graphs = {canonicalize_name(k):g for k,g in list(graphs.items())}
        self.mode = mode
        
        # documentationy members
        self.relation_names = sorted(self.graphs.keys())
        if 'co_hypernym' in self.relation_names:
            self.relation_names.remove('co_hypernym')
        self.vocab_size = graph_sizes[0]
        self.R = len(self.relation_names)
        self.emb_dim = len(embeddings[0])
        self.dropout = dropout

        # model members
        self.model = dy.Model()
        # TODO consider using no_update param for embeddings
        self.embeddings = self.model.add_lookup_parameters((self.vocab_size, self.emb_dim))
        self.embeddings.init_from_array(embeddings)
        
        # init association parameter
        self.no_assoc = False # so can be overridden in inheritors
        
        # first determine the association parameter dimensions
        if self.mode == BILINEAR_MODE:              # full-rank bilinear matrix
            assoc_dim = (self.emb_dim, self.emb_dim)
        elif self.mode == DIAG_RANK1_MODE:          # diagonal bilinear matrix + rank 1 matrix
            # first row = diagonal
            # second row = 'source factor'
            # third row = 'target factor'
            assoc_dim = (3, self.emb_dim)
        elif self.mode == TRANSLATIONAL_EMBED_MODE: # additive relational vector
            assoc_dim = self.emb_dim
        elif self.mode == DISTMULT:                 # diagonal bilinear matrix
            assoc_dim = self.emb_dim
        else:
            raise ValueError('unsupported mode: {}. allowed are {}'\
                             .format(self.mode, ', '.join(MODES_STR)))
            
        # init actual parameter
        self.word_assoc_weights = {r:self.model.add_parameters(assoc_dim) for r in self.relation_names}
        if model_path is not None:
            self.model.populate(model_path + '.dyn')
        
        timeprint('finished initialization for association model.')
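A hedged instantiation sketch for this constructor; the relation names, sizes and random embeddings below are made up, and the class name follows the AssociationModel.__init__ call used by the ERGM subclass further down:

import numpy as np
from scipy.sparse import csr_matrix

N, D = 1000, 50                                # assumed vocabulary size and embedding dimension
graphs = {'hypernym': csr_matrix((N, N)),      # node-aligned CSR graphs, one per relation
          'meronym': csr_matrix((N, N))}
embeddings = [np.random.randn(D) for _ in range(N)]
model = AssociationModel(graphs, embeddings, mode=TRANSLATIONAL_EMBED_MODE, dropout=0.1)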
Example #3
 def init_ergm_features(self, graphs=None):
     """
     Computes ERGM features from scratch, populates cache members and self.feature_vals
     :param graphs: if not None, changes underlying member and inits from it.
     """
     if graphs is not None:
         self.graphs = graphs
     self.feature_vals = {}
     self.init_ergm_cache()
     self.update_features()
     timeprint('initialized features from cache')
Example #4
def train_iteration(opts, assoc_model, trainer, do_sym, log_file):
    """
    Setup where association scores are learned, relation by relation.
    based on `model.macro_loops()`
    :return: full-graph iteration scores
    """
    ep_loss = 0.0

    # iterate over relations:
    graphs_order = list(assoc_model.graphs.items())
    # TODO maybe even randomize edge order across relations
    if opts.rand_nodes:
        dy.np.random.shuffle(graphs_order)
    for rel, g in graphs_order:
        # report
        if opts.v > 0:
            timeprint('starting loop over {}'.format(rel))

        if opts.rule_override and rel in SYMMETRIC_RELATIONS and not do_sym:
            timeprint('RELATION OVERRIDDEN')
            continue

        # iterate over nodes (each as source + as target):
        node_order = list(range(N))
        if opts.rand_nodes:
            dy.np.random.shuffle(node_order)
        for node in tqdm(node_order):
            if opts.debug and node % 100 != 0:
                continue
            ep_loss += node_iteration(rel,
                                      g,
                                      node,
                                      opts,
                                      assoc_model,
                                      trainer,
                                      log_file,
                                      is_source=True)
            ep_loss += node_iteration(rel,
                                      g,
                                      node,
                                      opts,
                                      assoc_model,
                                      trainer,
                                      log_file,
                                      is_source=False)

    return ep_loss
Example #5
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.
    """
    
    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0: return 0.0
    
    if opts.debug:
        dy.renew_cg(immediate_compute = True, check_validity = True)
    else:
        dy.renew_cg()
    
    # compute association score as dynet expression (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'\
                         .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)


    # false targets likelihood - negative sampling (uniform)
    # collect negative samples
    if opts.nll:
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []
    neg_samples = [np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))]
    # remove source and true targets if applicable
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))
    for (i,ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'\
                         .format(node, side, ns, ns_assoc_score.scalar_value()))
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))


    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum([dy.pickneglogsoftmax(dy.concatenate(scrs), 0) for scrs in sample_scores])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)
    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'\
                         .format(node, sc_loss))
                         
    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'\
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss
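The two loss modes selected by opts.nll can be illustrated in isolation; a toy sketch with made-up scores, independent of the model:

import dynet as dy

dy.renew_cg()
true_score = dy.scalarInput(2.0)
neg_scores = [dy.scalarInput(1.5), dy.scalarInput(0.3)]

# margin mode: hinge on 1 - (true - negative), summed over negative samples
margin_loss = dy.esum([dy.rectify(dy.scalarInput(1.0) - (true_score - ns))
                       for ns in neg_scores])

# NLL mode: softmax over [true, negatives], with the true score at index 0
nll_loss = dy.pickneglogsoftmax(dy.concatenate([true_score] + neg_scores), 0)
print(margin_loss.scalar_value(), nll_loss.scalar_value())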
Example #6
    def remove_edge(self,
                    src_idx,
                    trg_idx,
                    rel,
                    update_feats=True,
                    permanent=True,
                    caches=None,
                    report_feat_diff=False):
        """
        Removes edge from graph, updates cache and feature values
        :param src_idx: index of source node from edge to remove
        :param trg_idx: index of target node from edge to remove
        :param rel: relation type
        :param update_feats: flag for not updating all cache and features, to be deferred
        :returns: if permanent=False, returns ergm score of removed-edge graph
        """
        if permanent:
            self.score_is_stale = True
            cached_feats = None
            cached_cache = None
        else:
            if caches is not None:
                cached_cache = copy.deepcopy(caches[0])
                cached_feats = caches[1]
            else:
                cached_cache = copy.deepcopy(self.cache)
                cached_feats = copy.deepcopy(self.feature_vals)
            update_feats = True  # no other mode possible

        # update cache members
        # decrement edge count for rel
        self.edge_counts[rel] -= 1

        # pair cache members
        for r, g in list(self.graphs.items()):
            if rel == 'hypernym' and r == 'co_hypernym':
                continue

            # decrement mutual edge count for pairs with trg-src edges
            if g[trg_idx, src_idx] == 1:
                self.mutual_edge_counts[find_key(self.mutual_edge_counts,
                                                 (rel, r))] -= 1

            # decrement two-paths for x-src-trg and src-trg-y
            self.two_path_counts[(r, rel)] -= self.in_degs[r][0, src_idx]
            self.two_path_counts[(rel, r)] -= self.out_degs[r][0, trg_idx]

            # triplet cache members
            for r2, g2 in list(self.graphs.items()):
                if rel == 'hypernym' and r2 == 'co_hypernym':
                    continue

                # decrement transitive closures from two-paths src-x-trg
                if self.out_degs[r][0, src_idx] > 0 and self.in_degs[r2][
                        0, trg_idx] > 0:
                    r_r2_betweens = (g[src_idx] * g2[:, trg_idx]).sum()
                    self.transitive_closure_counts[(r, r2,
                                                    rel)] -= r_r2_betweens
                # decrement directed triangle count
                if self.out_degs[r2][0, trg_idx] > 0 and self.in_degs[r][
                        0, src_idx] > 0:
                    r_r2_cycles = (g2[trg_idx] * g[:, src_idx]).sum()
                    rs_key = find_cyclic_key(self.directed_triangle_counts,
                                             (r, rel, r2))
                    self.directed_triangle_counts[rs_key] -= r_r2_cycles

        # decrement src's out_degree and trg's in_degree in rel and update all related caches
        self.out_degs[rel][0, src_idx] -= 1
        self.in_degs[rel][0, trg_idx] -= 1

        if update_feats:
            # recompute heavy cache updates from raw counts
            self.update_stars_cache_from_edge(rel,
                                              src_idx,
                                              trg_idx,
                                              added=False)

            # update features from caches
            self.update_features()

        if not permanent and report_feat_diff:
            timeprint('changed feature values:')
            diff_keys = [
                k for k in self.feature_vals
                if self.feature_vals[k] != cached_feats[k]
            ]
            if len(diff_keys) > 0:
                print('\n'.join(['{}: from {} to {}'\
                      .format(k, cached_feats[k], self.feature_vals[k]) for k in diff_keys]))

        if permanent:
            # remove actual edge
            self.graphs[rel][src_idx, trg_idx] = 0
        else:
            if rel == 'hypernym':
                self.zero_all_feats('co_hypernym')

            # prepare return value
            ret = self.ergm_score()

            # revert everything
            self.reread_cache(cached_cache)
            self.feature_vals = cached_feats

            # return prepared score
            return ret
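With permanent=False the call scores a counterfactual graph and then restores caches and features, so it can measure an edge's contribution to the ERGM score; a hedged usage sketch with hypothetical names (ergm is an initialized model, (s, t) an existing hypernym edge):

base_score = ergm.ergm_score().scalar_value()
score_without = ergm.remove_edge(s, t, 'hypernym', permanent=False).scalar_value()
edge_contribution = base_score - score_without  # positive if the edge raises the ERGM score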
Example #7
 parser.add_argument("--neg-samp", type=int, default=ASSOC_DEFAULT_NEGS, help="nodes for negative sample")
 parser.add_argument("--rand-nodes", action="store_false", help="randomize node order in training")
 parser.add_argument("--rule-override", action="store_false", help="rule-based override for symmetric relations")
 parser.add_argument("--eval-dev", action='store_true', help="evaluate on dev set (otherwise - test)")
 parser.add_argument("--dropout", type=float, default=0.0)
 parser.add_argument("--nll", action='store_true', help="use negative log likelihood loss")
 parser.add_argument("--no-log", action='store_true')
 parser.add_argument("--early-stopping", action='store_true', help="stop if model hasn't improved in 3 epochs")
 parser.add_argument("--v", type=int, default=1, help="verbosity")
 parser.add_argument("--debug", action='store_true')
 opts = parser.parse_args()
 
 start_time = datetime.now()
 
 # reporting
 timeprint('graphs file = {}'.format(opts.input))
 if opts.embeddings is not None:
     timeprint('embeddings file = {}'.format(opts.embeddings))
 else:
     timeprint('embeddings size = {}'.format(opts.emb_size))
 timeprint('association mode = {}'.format(opts.assoc_mode))
 timeprint('negative samples = {}'.format(opts.neg_samp))
 if opts.model is None:
     timeprint('model file = {}'.format(opts.model_out))
     if opts.nll:
         timeprint('using negative log likelihood loss')
     else:
         timeprint('using margin loss')
     if opts.no_log:
         timeprint('no log file. timestamp for test: {}_{}' \
                   .format(start_time.date(), start_time.time()))
Example #8
    def init_ergm_cache(self):
        """
        Computes ERGM features from scratch, populates cache members
        """

        # edges
        for r in self.relation_names:
            edges = edge_count(self.graphs[r])
            self.edge_counts[r] = edges

        timeprint('populated edge cache')

        # mutual edges
        for i, n1 in enumerate(self.relation_names):
            r1 = self.graphs[n1]
            for j in range(i+1): # unordered, including self
                n2 = self.relation_names[j]
                r2 = self.graphs[n2]
                mut_edges = mutual_edges(r1, r2)
                self.mutual_edge_counts[(n1, n2)] = mut_edges

        timeprint('populated mutual edge cache')

        # directed triangles - iterate over R^2 + choose(r,3)/3 ordered relation triplets
        eye = csr_eye(self.vocab_size)
        for i,j,k in cyclic_triads(self.R):
            n1, n2, n3 = self.relation_names[i], self.relation_names[j], self.relation_names[k]
            r1, r2, r3 = self.graphs[n1], self.graphs[n2], self.graphs[n3]
            dir_triangles = (r1 * r2 * r3).multiply(eye).sum()
            if i == j and j == k: # each triangle was counted thrice, except self loops
                self_loops = r1.diagonal().sum()
                dir_triangles += 2 * self_loops
                dir_triangles /= 3
            self.directed_triangle_counts[(n1, n2, n3)] = dir_triangles

        timeprint('extracted directed triangle features')

        # transitive closures - iterate over ordered relation triplets
        # (also populate self.two_path_counts)
        for n1, r1 in list(self.graphs.items()):
            for n2, r2 in list(self.graphs.items()):
                two_paths = r1 * r2
                two_path_count = max([two_paths.sum(), sys.float_info.epsilon])
                self.two_path_counts[(n1, n2)] = two_path_count
                for n3, r3 in list(self.graphs.items()):
                    closures = two_paths.multiply(r3).sum() # pointwise
                    self.transitive_closure_counts[(n1, n2, n3)] = closures

        timeprint('populated transitivity cache')

        # 1-star cache for updates + self-2,3-stars
        for n, g in list(self.graphs.items()):
            self.in_degs[n] = g.sum(0) # numpy matrix
            self.out_degs[n] = g.sum(1).transpose() # numpy matrix

            osc = one_rel_star_counts(self.in_degs[n], self.out_degs[n])
            
            self.in_one_star_counts[n] = osc['i1sc']
            self.out_one_star_counts[n] = osc['o1sc']
            self.in_two_star_counts[(n, n)] = osc['i2sc']
            self.out_two_star_counts[(n, n)] = osc['o2sc']
            self.in_three_star_counts[(n, n, n)] = osc['i3sc']
            self.out_three_star_counts[(n, n, n)] = osc['o3sc']

            self.in_one_p_star_counts[n] = osc['i1psc']
            self.out_one_p_star_counts[n] = osc['o1psc']
            self.in_two_p_star_counts[(n, n)] = osc['i2psc']
            self.out_two_p_star_counts[(n, n)] = osc['o2psc']
            self.in_three_p_star_counts[(n, n, n)] = osc['i3psc']
            self.out_three_p_star_counts[(n, n, n)] = osc['o3psc']

        timeprint('populated 1r-star cache')

        # 2-stars
        for n1, n2 in combinations(self.relation_names, 2):
            
            tsc = two_rel_star_counts(self.in_degs[n1], self.out_degs[n1],\
                                      self.in_degs[n2], self.out_degs[n2])
            
            self.in_two_star_counts[(n1, n2)] = tsc['i2sc']
            self.out_two_star_counts[(n1, n2)] = tsc['o2sc']
            self.in_three_star_counts[(n1, n1, n2)] = tsc['i3sc112']
            self.out_three_star_counts[(n1, n1, n2)] = tsc['o3sc112']
            self.in_three_star_counts[(n1, n2, n2)] = tsc['i3sc122']
            self.out_three_star_counts[(n1, n2, n2)] = tsc['o3sc122']
            self.in_two_p_star_counts[(n1, n2)] = tsc['i2psc']
            self.out_two_p_star_counts[(n1, n2)] = tsc['o2psc']
            self.in_three_p_star_counts[(n1, n1, n2)] = tsc['i3psc112']
            self.out_three_p_star_counts[(n1, n1, n2)] = tsc['o3psc112']
            self.in_three_p_star_counts[(n1, n2, n2)] = tsc['i3psc122']
            self.out_three_p_star_counts[(n1, n2, n2)] = tsc['o3psc122']

        timeprint('populated 2r-star cache')

        # 3-stars
        for n1, n2, n3 in combinations(self.relation_names, 3):
            ttsc = three_rel_star_counts(self.in_degs[n1], self.out_degs[n1],\
                                         self.in_degs[n2], self.out_degs[n2],\
                                         self.in_degs[n3], self.out_degs[n3])
            
            self.in_three_star_counts[(n1, n2, n3)] = ttsc['i3sc']
            self.out_three_star_counts[(n1, n2, n3)] = ttsc['o3sc']
            self.in_three_p_star_counts[(n1, n2, n3)] = ttsc['i3psc']
            self.out_three_p_star_counts[(n1, n2, n3)] = ttsc['o3psc']

        timeprint('populated 3r-star cache')
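A toy check of the mutual-edge count cached above, under the assumption that mutual_edges(r1, r2) counts pairs with an i->j edge in r1 and a j->i edge in r2 (the helper itself is not shown in this example):

import numpy as np
from scipy.sparse import csr_matrix

r1 = csr_matrix(np.array([[0, 1], [0, 0]]))   # edge 0 -> 1
r2 = csr_matrix(np.array([[0, 0], [1, 0]]))   # edge 1 -> 0
mutual = r1.multiply(r2.transpose()).sum()    # 1.0: the (0, 1) pair is mutual across r1/r2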
Example #9
    def __init__(self, graphs, embeddings, assoc_mode=BILINEAR_MODE, reg=0.0, dropout=0.0,
                 no_assoc=False, model_path=None, ergm_path=None,
                 path_only_init=False):
        """
        :param graphs: dictionary of {relation:CSR-format graph}s, node-aligned
        :param embeddings: list of numpy array embeddings, indices aligned to nodes
        :param model_path: optional path for files with pre-trained association model (read by super)
        :param ergm_path: optional path for files with pre-trained model
        :param path_only_init: model_path only used for initialization
        """
        # input validation
        AssociationModel.__init__(self, graphs, embeddings, assoc_mode, dropout, model_path=model_path)

        # raw members
        self.no_assoc = no_assoc
        self.regularize = reg

        # cache members
        self.cache = {}
        self.edge_counts = self.add_cache_dict('ec')                   # keys are single relations
        self.mutual_edge_counts = self.add_cache_dict('mec')           # keys are unordered relation pairs
        self.two_path_counts = self.add_cache_dict('tpc')              # keys are ordered relation pairs
        self.transitive_closure_counts = self.add_cache_dict('tcc')    # keys are ordered relation triplets
        self.directed_triangle_counts = self.add_cache_dict('dtc')     # keys are ordered relation triplets
        self.in_degs = self.add_cache_dict('ins')                      # keys are single relations, values are big lists
        self.out_degs = self.add_cache_dict('outs')                    # keys are single relations, values are big lists
        self.in_one_star_counts = self.add_cache_dict('i1sc')          # keys are single relations
        self.out_one_star_counts = self.add_cache_dict('o1sc')         # keys are single relations
        self.in_two_star_counts = self.add_cache_dict('i2sc')          # keys are unordered relation pairs
        self.out_two_star_counts = self.add_cache_dict('o2sc')         # keys are unordered relation pairs
        self.in_three_star_counts = self.add_cache_dict('i3sc')        # keys are unordered relation triplets
        self.out_three_star_counts = self.add_cache_dict('o3sc')       # keys are unordered relation triplets
        # 'at least k' stars - 'one/two/three plus'
        self.in_one_p_star_counts = self.add_cache_dict('i1psc')       # keys are single relations
        self.out_one_p_star_counts = self.add_cache_dict('o1psc')      # keys are single relations
        self.in_two_p_star_counts = self.add_cache_dict('i2psc')       # keys are unordered relation pairs
        self.out_two_p_star_counts = self.add_cache_dict('o2psc')      # keys are unordered relation pairs
        self.in_three_p_star_counts = self.add_cache_dict('i3psc')     # keys are unordered relation triplets
        self.out_three_p_star_counts = self.add_cache_dict('o3psc')    # keys are unordered relation triplets

        self.missing_node_indices = []          # updates during training (NOT SURE IF NEEDED)

        timeprint('computing ERGM features...')
        self.init_ergm_features()               # populates self.feature_vals
        timeprint('finished! computed {} features'.format(len(self.feature_vals)))
        timeprint('{} non-zero features'.format(np.count_nonzero(list(self.feature_vals.values()))))

        # documentationy again, for efficient updates
        encountered_features = list(self.feature_vals.keys()) # canonical ordering from now on
        
        if ergm_path is not None:
            ergm_model_path = ergm_path
        elif (model_path is not None) and (not path_only_init):
            ergm_model_path = model_path
        else:
            ergm_model_path = None
                
        if ergm_model_path is None:
            self.feature_set = encountered_features
        else:
            self.feature_set = pickle.load(open(ergm_model_path + '.feats', 'rb'))
            assert sorted(self.feature_set) == sorted(encountered_features)
        
        if ergm_model_path is None:
            self.ergm_weights = self.model.add_parameters(len(self.feature_set))
        
        if model_path is None and ergm_model_path is None:
            # 'model_path is not None' is initialized in super()
            # TODO support other association modes (affects downstream)
            if self.no_assoc:
                self.word_assoc_weights = {r:self.model.add_parameters((self.emb_dim, self.emb_dim), init=dy.ConstInitializer(0.0)) for r in self.relation_names}
            else:
                self.word_assoc_weights = {r:self.model.add_parameters((self.emb_dim, self.emb_dim)) for r in self.relation_names}
        elif ergm_model_path is not None:
            pc = dy.ParameterCollection()
            dy.load(ergm_model_path + '.dyn', pc)
            pc_list = pc.parameters_list()
            i = 0
            self.ergm_weights = pc_list[i]
            if not path_only_init:
                self.word_assoc_weights = {}
                rel_order = self.relation_names
                for r in rel_order:
                    i += 1
                    self.word_assoc_weights[r] = pc_list[i]
                i += 1
                assert i == len(pc_list),\
                       '{} relation params read but length is {}'.format(i, len(pc_list))
        
        self.dy_score = self.ergm_score()
        self.score = self.dy_score.scalar_value()

        self.score_is_stale = False

        timeprint('finished initialization. initial ERGM score = {}'.format(self.score))
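The ergm_weights parameter added here feeds a score that is, in essence, a dot product between the weight vector and the feature values taken in the canonical feature_set order; a hedged sketch of that combination (an assumption about ergm_score, not its verbatim body):

import dynet as dy

def ergm_score_sketch(ergm_weights, feature_set, feature_vals):
    w = dy.parameter(ergm_weights)  # promote the DyNet parameter to an expression
    f = dy.inputVector([float(feature_vals[k]) for k in feature_set])
    return dy.dot_product(w, f)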
Example #10
LOWER = True
ALL_LEMMAS = True

def lemmas(s):
    if ALL_LEMMAS:
        name = '_'.join(s.lemma_names())
    else:
        name = s.lemma_names()[0]
    if LOWER:
        name = name.lower()
    return name.split('_')
    
if __name__ == '__main__':
    if len(sys.argv) < 3:
        timeprint('usage: embed_from_words.py input_embs output_embs [WN prediction dataset]')
        exit(1)
        
    in_file = sys.argv[1]
    
    # create target dataset
    if len(sys.argv) > 3:
        # third param is WN dataset
        wn_vocab = load_prediction_dataset(sys.argv[3])[-1]
        synsets = [wn.synset(w) for w in wn_vocab]
    else:
        synsets = list(wn.all_synsets())
    timeprint('read {} synsets'.format(len(synsets)))
    
    target_words = set()
    timeprint('preparing target word dataset')
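A quick illustration of what lemmas() returns, assuming NLTK's WordNet corpus is installed:

from nltk.corpus import wordnet as wn

print(lemmas(wn.synset('dog.n.01')))
# with ALL_LEMMAS and LOWER both True: ['dog', 'domestic', 'dog', 'canis', 'familiaris']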
Example #11
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file,
                   is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.
    """

    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0: return 0.0

    if opts.debug:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    else:
        dy.renew_cg()

    # compute association score as dynet expression (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'\
                         .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)

    # false targets likelihood - negative sampling (uniform)
    # collect negative samples
    if opts.nll:
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []
    neg_samples = [
        np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))
    ]
    # remove source and true targets if applicable
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))
    for (i, ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'\
                         .format(node, side, ns, ns_assoc_score.scalar_value()))
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))

    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum([
                dy.pickneglogsoftmax(dy.concatenate(scrs), 0)
                for scrs in sample_scores
            ])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)
    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'\
                         .format(node, sc_loss))

    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'\
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss
Example #12
def macro_loops(opts,
                ep_idx,
                multi_graph,
                trainer,
                log_file,
                synsets,
                use_assoc=True):
    """
    Passing over graph node by node, relation by relation.
    Single update returned, based on importance sampling from entire graph.
    :param opts: parameter dictionary from calling model
    :param ep_idx: epoch index
    :param multi_graph: trained data structure
    :param trainer: dynet training module
    :param log_file: log file location
    :param synsets: synset name dictionary for reporting
    :param use_assoc: include association component in scores
    :return: node-iteration scores
    """
    iteration_scores = []
    iteration_scores.append(multi_graph.score)

    N = multi_graph.vocab_size
    timeprint('caching original graph features')

    # report
    if opts.v > 0:
        timeprint('starting epoch {}'.format(ep_idx))

    if not opts.rand_all:
        # iterate over relations
        graphs_order = list(multi_graph.graphs.keys())
        if opts.rand_nodes:
            dy.np.random.shuffle(graphs_order)
        for rel in graphs_order:
            # report
            if opts.v > 0:
                timeprint('starting loop over {}'.format(rel))

            if opts.skip_symmetrics and rel in SYMMETRIC_RELATIONS:
                timeprint('skipping symmetric relation {}'.format(rel))
                continue

            if rel == 'co_hypernym':
                timeprint('skipping auxiliary co_hypernym relation')
                continue

            # compute target-wide association cache (no backprop)
            if use_assoc:
                assoc_cache = multi_graph.source_ranker_cache(rel)
            else:
                assoc_cache = np.zeros(
                    (multi_graph.word_assoc_weights[rel].shape()[0],
                     multi_graph.embeddings.shape()[1]))
            timeprint('calculated association cache for {}'.format(rel))

            # iterate over nodes:
            node_order = list(range(N))
            if opts.rand_nodes:
                dy.np.random.shuffle(node_order)
            for src_i in tqdm(node_order):
                assoc_cache = macro_node_iteration(opts, multi_graph,
                                                   assoc_cache, trainer,
                                                   log_file, synsets, rel,
                                                   src_i, use_assoc)

            multi_graph.rescore()
            # total score = sum over all nodes
            iteration_scores.append(multi_graph.score)
    else:
        # iterate randomly over <rel, node>-s iid
        # rand_nodes implied
        all_rels = list(multi_graph.graphs.keys())
        if opts.skip_symmetrics:
            rels = [r for r in all_rels if r not in SYMMETRIC_RELATIONS]
        else:
            rels = all_rels

        if 'co_hypernym' in rels:
            rels.remove('co_hypernym')

        if use_assoc:
            assoc_caches = {
                rel: multi_graph.source_ranker_cache(rel)
                for rel in rels
            }
        else:
            assoc_caches = {
                rel: np.zeros((multi_graph.word_assoc_weights[rel].shape()[0],
                               multi_graph.embeddings.shape()[1]))
                for rel in rels
            }

        relnode_order = list(range(N * len(rels)))
        dy.np.random.shuffle(relnode_order)
        for idx in tqdm(relnode_order):
            rel = rels[idx % len(rels)]
            src_i = idx % N
            assoc_caches[rel] = macro_node_iteration(opts, multi_graph,
                                                     assoc_caches[rel],
                                                     trainer, log_file,
                                                     synsets, rel, src_i,
                                                     use_assoc)

        # only happens once in this setup
        multi_graph.rescore()
        # total score = sum over all nodes
        iteration_scores.append(multi_graph.score)

    return iteration_scores
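A minimal epoch driver around macro_loops, analogous to the train_iteration loop earlier; the Adagrad choice and epoch count are placeholders, not the repository's actual entry point:

# hypothetical driver; assumes `opts`, `multi_graph`, `log_file` and `synsets` are already built
import dynet as dy

def run_macro_epochs(opts, multi_graph, log_file, synsets, n_epochs=3):
    trainer = dy.AdagradTrainer(multi_graph.model)
    all_scores = []
    for ep in range(n_epochs):
        all_scores.append(macro_loops(opts, ep, multi_graph, trainer, log_file, synsets))
    return all_scores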
Example #13
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--nll",
                        action='store_true',
                        help="use negative log likelihood loss")
    parser.add_argument("--no-log", action='store_true')
    parser.add_argument("--early-stopping",
                        action='store_true',
                        help="stop if model hasn't improved in 3 epochs")
    parser.add_argument("--v", type=int, default=1, help="verbosity")
    parser.add_argument("--debug", action='store_true')
    opts = parser.parse_args()

    start_time = datetime.now()

    # reporting
    timeprint('graphs file = {}'.format(opts.input))
    if opts.embeddings is not None:
        timeprint('embeddings file = {}'.format(opts.embeddings))
    else:
        timeprint('embeddings size = {}'.format(opts.emb_size))
    timeprint('association mode = {}'.format(opts.assoc_mode))
    timeprint('negative samples = {}'.format(opts.neg_samp))
    if opts.model is None:
        timeprint('model file = {}'.format(opts.model_out))
        if opts.nll:
            timeprint('using negative log likelihood loss')
        else:
            timeprint('using margin loss')
        if opts.no_log:
            timeprint('no log file. timestamp for test: {}_{}' \
                      .format(start_time.date(), start_time.time()))
Example #14
    def __init__(self,
                 graphs,
                 embeddings,
                 assoc_mode=BILINEAR_MODE,
                 reg=0.0,
                 dropout=0.0,
                 no_assoc=False,
                 model_path=None,
                 ergm_path=None,
                 path_only_init=False):
        """
        :param graphs: dictionary of {relation:CSR-format graph}s, node-aligned
        :param embeddings: list of numpy array embeddings, indices aligned to nodes
        :param model_path: optional path for files with pre-trained association model (read by super)
        :param ergm_path: optional path for files with pre-trained model
        :param path_only_init: model_path only used for initialization
        """
        # input validation
        AssociationModel.__init__(self,
                                  graphs,
                                  embeddings,
                                  assoc_mode,
                                  dropout,
                                  model_path=model_path)

        # raw members
        self.no_assoc = no_assoc
        self.regularize = reg

        # cache members
        self.cache = {}
        self.edge_counts = self.add_cache_dict(
            'ec')  # keys are single relations
        self.mutual_edge_counts = self.add_cache_dict(
            'mec')  # keys are unordered relation pairs
        self.two_path_counts = self.add_cache_dict(
            'tpc')  # keys are ordered relation pairs
        self.transitive_closure_counts = self.add_cache_dict(
            'tcc')  # keys are ordered relation triplets
        self.directed_triangle_counts = self.add_cache_dict(
            'dtc')  # keys are ordered relation triplets
        self.in_degs = self.add_cache_dict(
            'ins')  # keys are single relations, values are big lists
        self.out_degs = self.add_cache_dict(
            'outs')  # keys are single relations, values are big lists
        self.in_one_star_counts = self.add_cache_dict(
            'i1sc')  # keys are single relations
        self.out_one_star_counts = self.add_cache_dict(
            'o1sc')  # keys are single relations
        self.in_two_star_counts = self.add_cache_dict(
            'i2sc')  # keys are unordered relation pairs
        self.out_two_star_counts = self.add_cache_dict(
            'o2sc')  # keys are unordered relation pairs
        self.in_three_star_counts = self.add_cache_dict(
            'i3sc')  # keys are unordered relation triplets
        self.out_three_star_counts = self.add_cache_dict(
            'o3sc')  # keys are unordered relation triplets
        # 'at least k' stars - 'one/two/three plus'
        self.in_one_p_star_counts = self.add_cache_dict(
            'i1psc')  # keys are single relations
        self.out_one_p_star_counts = self.add_cache_dict(
            'o1psc')  # keys are single relations
        self.in_two_p_star_counts = self.add_cache_dict(
            'i2psc')  # keys are unordered relation pairs
        self.out_two_p_star_counts = self.add_cache_dict(
            'o2psc')  # keys are unordered relation pairs
        self.in_three_p_star_counts = self.add_cache_dict(
            'i3psc')  # keys are unordered relation triplets
        self.out_three_p_star_counts = self.add_cache_dict(
            'o3psc')  # keys are unordered relation triplets

        self.missing_node_indices = [
        ]  # updates during training (NOT SURE IF NEEDED)

        timeprint('computing ERGM features...')
        self.init_ergm_features()  # populates self.feature_vals
        timeprint('finished! computed {} features'.format(
            len(self.feature_vals)))
        timeprint('{} non-zero features'.format(
            np.count_nonzero(list(self.feature_vals.values()))))

        # documentationy again, for efficient updates
        encountered_features = list(
            self.feature_vals.keys())  # canonical ordering from now on

        if ergm_path is not None:
            ergm_model_path = ergm_path
        elif (model_path is not None) and (not path_only_init):
            ergm_model_path = model_path
        else:
            ergm_model_path = None

        if ergm_model_path is None:
            self.feature_set = encountered_features
        else:
            self.feature_set = pickle.load(open(ergm_model_path + '.feats', 'rb'))
            assert sorted(self.feature_set) == sorted(encountered_features)

        if ergm_model_path is None:
            self.ergm_weights = self.model.add_parameters(len(
                self.feature_set))

        if model_path is None and ergm_model_path is None:
            # 'model_path is not None' is initialized in super()
            # TODO support other association modes (affects downstream)
            if self.no_assoc:
                self.word_assoc_weights = {
                    r: self.model.add_parameters((self.emb_dim, self.emb_dim),
                                                 init=dy.ConstInitializer(0.0))
                    for r in self.relation_names
                }
            else:
                self.word_assoc_weights = {
                    r: self.model.add_parameters((self.emb_dim, self.emb_dim))
                    for r in self.relation_names
                }
        elif ergm_model_path is not None:
            pc = dy.ParameterCollection()
            dy.load(ergm_model_path + '.dyn', pc)
            pc_list = pc.parameters_list()
            i = 0
            self.ergm_weights = pc_list[i]
            if not path_only_init:
                self.word_assoc_weights = {}
                rel_order = self.relation_names
                for r in rel_order:
                    i += 1
                    self.word_assoc_weights[r] = pc_list[i]
                i += 1
                assert i == len(pc_list),\
                       '{} relation params read but length is {}'.format(i, len(pc_list))

        self.dy_score = self.ergm_score()
        self.score = self.dy_score.scalar_value()

        self.score_is_stale = False

        timeprint('finished initialization. initial ERGM score = {}'.format(
            self.score))
Example #15
    def init_ergm_cache(self):
        """
        Computes ERGM features from scratch, populates cache members
        """

        # edges
        for r in self.relation_names:
            edges = edge_count(self.graphs[r])
            self.edge_counts[r] = edges

        timeprint('populated edge cache')

        # mutual edges
        for i, n1 in enumerate(self.relation_names):
            r1 = self.graphs[n1]
            for j in range(i + 1):  # unordered, including self
                n2 = self.relation_names[j]
                r2 = self.graphs[n2]
                mut_edges = mutual_edges(r1, r2)
                self.mutual_edge_counts[(n1, n2)] = mut_edges

        timeprint('populated mutual edge cache')

        # directed triangles - iterate over R^2 + choose(r,3)/3 ordered relation triplets
        eye = csr_eye(self.vocab_size)
        for i, j, k in cyclic_triads(self.R):
            n1, n2, n3 = self.relation_names[i], self.relation_names[
                j], self.relation_names[k]
            r1, r2, r3 = self.graphs[n1], self.graphs[n2], self.graphs[n3]
            dir_triangles = (r1 * r2 * r3).multiply(eye).sum()
            if i == j and j == k:  # each triangle was counted thrice, except self loops
                self_loops = r1.diagonal().sum()
                dir_triangles += 2 * self_loops
                dir_triangles /= 3
            self.directed_triangle_counts[(n1, n2, n3)] = dir_triangles

        timeprint('extracted directed triangle features')

        # transitive closures - iterate over ordered relation triplets
        # (also populate self.two_path_counts)
        for n1, r1 in list(self.graphs.items()):
            for n2, r2 in list(self.graphs.items()):
                two_paths = r1 * r2
                two_path_count = max([two_paths.sum(), sys.float_info.epsilon])
                self.two_path_counts[(n1, n2)] = two_path_count
                for n3, r3 in list(self.graphs.items()):
                    closures = two_paths.multiply(r3).sum()  # pointwise
                    self.transitive_closure_counts[(n1, n2, n3)] = closures

        timeprint('populated transitivity cache')

        # 1-star cache for updates + self-2,3-stars
        for n, g in list(self.graphs.items()):
            self.in_degs[n] = g.sum(0)  # numpy matrix
            self.out_degs[n] = g.sum(1).transpose()  # numpy matrix

            osc = one_rel_star_counts(self.in_degs[n], self.out_degs[n])

            self.in_one_star_counts[n] = osc['i1sc']
            self.out_one_star_counts[n] = osc['o1sc']
            self.in_two_star_counts[(n, n)] = osc['i2sc']
            self.out_two_star_counts[(n, n)] = osc['o2sc']
            self.in_three_star_counts[(n, n, n)] = osc['i3sc']
            self.out_three_star_counts[(n, n, n)] = osc['o3sc']

            self.in_one_p_star_counts[n] = osc['i1psc']
            self.out_one_p_star_counts[n] = osc['o1psc']
            self.in_two_p_star_counts[(n, n)] = osc['i2psc']
            self.out_two_p_star_counts[(n, n)] = osc['o2psc']
            self.in_three_p_star_counts[(n, n, n)] = osc['i3psc']
            self.out_three_p_star_counts[(n, n, n)] = osc['o3psc']

        timeprint('populated 1r-star cache')

        # 2-stars
        for n1, n2 in combinations(self.relation_names, 2):

            tsc = two_rel_star_counts(self.in_degs[n1], self.out_degs[n1],\
                                      self.in_degs[n2], self.out_degs[n2])

            self.in_two_star_counts[(n1, n2)] = tsc['i2sc']
            self.out_two_star_counts[(n1, n2)] = tsc['o2sc']
            self.in_three_star_counts[(n1, n1, n2)] = tsc['i3sc112']
            self.out_three_star_counts[(n1, n1, n2)] = tsc['o3sc112']
            self.in_three_star_counts[(n1, n2, n2)] = tsc['i3sc122']
            self.out_three_star_counts[(n1, n2, n2)] = tsc['o3sc122']
            self.in_two_p_star_counts[(n1, n2)] = tsc['i2psc']
            self.out_two_p_star_counts[(n1, n2)] = tsc['o2psc']
            self.in_three_p_star_counts[(n1, n1, n2)] = tsc['i3psc112']
            self.out_three_p_star_counts[(n1, n1, n2)] = tsc['o3psc112']
            self.in_three_p_star_counts[(n1, n2, n2)] = tsc['i3psc122']
            self.out_three_p_star_counts[(n1, n2, n2)] = tsc['o3psc122']

        timeprint('populated 2r-star cache')

        # 3-stars
        for n1, n2, n3 in combinations(self.relation_names, 3):
            ttsc = three_rel_star_counts(self.in_degs[n1], self.out_degs[n1],\
                                         self.in_degs[n2], self.out_degs[n2],\
                                         self.in_degs[n3], self.out_degs[n3])

            self.in_three_star_counts[(n1, n2, n3)] = ttsc['i3sc']
            self.out_three_star_counts[(n1, n2, n3)] = ttsc['o3sc']
            self.in_three_p_star_counts[(n1, n2, n3)] = ttsc['i3psc']
            self.out_three_p_star_counts[(n1, n2, n3)] = ttsc['o3psc']

        timeprint('populated 3r-star cache')
Example #16
    def remove_edge(self, src_idx, trg_idx, rel, update_feats=True, permanent=True,
                    caches=None, report_feat_diff=False):
        """
        Removes edge from graph, updates cache and feature values
        :param src_idx: index of source node from edge to remove
        :param trg_idx: index of target node from edge to remove
        :param rel: relation type
        :param update_feats: flag for not updating all cache and features, to be deferred
        :returns: if permanent=False, returns ergm score of removed-edge graph
        """
        if permanent:
            self.score_is_stale = True
            cached_feats = None
            cached_cache = None
        else:
            if caches is not None:
                cached_cache = copy.deepcopy(caches[0])
                cached_feats = caches[1]
            else:
                cached_cache = copy.deepcopy(self.cache)
                cached_feats = copy.deepcopy(self.feature_vals)
            update_feats = True  # no other mode possible

        # update cache members
        # decrement edge count for rel
        self.edge_counts[rel] -= 1

        # pair cache members
        for r,g in list(self.graphs.items()):
            if rel == 'hypernym' and r == 'co_hypernym':
                continue

            # decrement mutual edge count for pairs with trg-src edges
            if g[trg_idx, src_idx] == 1:
                self.mutual_edge_counts[find_key(self.mutual_edge_counts, (rel, r))] -= 1

            # decrement two-paths for x-src-trg and src-trg-y
            self.two_path_counts[(r, rel)] -= self.in_degs[r][0,src_idx]
            self.two_path_counts[(rel, r)] -= self.out_degs[r][0,trg_idx]
            
            # triplet cache members
            for r2, g2 in list(self.graphs.items()):
                if rel == 'hypernym' and r2 == 'co_hypernym':
                    continue
                    
                # decrement transitive closures from two-paths src-x-trg
                if self.out_degs[r][0,src_idx] > 0 and self.in_degs[r2][0,trg_idx] > 0:
                    r_r2_betweens = (g[src_idx] * g2[:,trg_idx]).sum()
                    self.transitive_closure_counts[(r, r2, rel)] -= r_r2_betweens
                # decrement directed triangle count
                if self.out_degs[r2][0,trg_idx] > 0 and self.in_degs[r][0,src_idx] > 0:
                    r_r2_cycles = (g2[trg_idx] * g[:,src_idx]).sum()
                    rs_key = find_cyclic_key(self.directed_triangle_counts, (r, rel, r2))
                    self.directed_triangle_counts[rs_key] -= r_r2_cycles

        # decrement src's out_degree and trg's in_degree in rel and update all related caches
        self.out_degs[rel][0,src_idx] -= 1
        self.in_degs[rel][0,trg_idx] -= 1

        if update_feats:
            # recompute heavy cache updates from raw counts
            self.update_stars_cache_from_edge(rel, src_idx, trg_idx, added=False)

            # update features from caches
            self.update_features()

        if not permanent and report_feat_diff:
            timeprint('changed feature values:')
            diff_keys = [k for k in self.feature_vals if self.feature_vals[k] != cached_feats[k]]
            if len(diff_keys) > 0:
                print('\n'.join(['{}: from {} to {}'\
                      .format(k, cached_feats[k], self.feature_vals[k]) for k in diff_keys]))
        
        if permanent:
            # remove actual edge
            self.graphs[rel][src_idx,trg_idx] = 0
        else:
            if rel == 'hypernym':
                self.zero_all_feats('co_hypernym')
            
            # prepare return value
            ret = self.ergm_score()

            # revert everything
            self.reread_cache(cached_cache)
            self.feature_vals = cached_feats

            # return prepared score
            return ret
Example #17
    def add_edge(self, src_idx, trg_idx, rel, permanent=False, caches=None, report_feat_diff=False):
        """
        Uses cache to update feature values and produce score
        :param src_idx: index of source node from edge to add
        :param trg_idx: index of target node from edge to add
        :param rel: relation type
        :param permanent: True if node assignment to remain as is (inference mode, or restitution)
        :param caches: optional - precomputed backup members (cache, features)
        :return: new ergm score
        """
        # back cache up
        if caches is not None:
            backup_cache = copy.deepcopy(caches[0])
            backup_feats = caches[1]
        elif not permanent:
            backup_cache = copy.deepcopy(self.cache)
            backup_feats = copy.deepcopy(self.feature_vals)
        else:
            backup_cache = None
            backup_feats = None

        # update cache members
        # increment edge count for r
        self.edge_counts[rel] += 1

        # pair cache members
        for r,g in list(self.graphs.items()):
            if rel == 'hypernym' and r == 'co_hypernym':
                continue

            # increment mutual edge count for pairs with trg-src edges
            if g[trg_idx, src_idx] == 1:
                self.mutual_edge_counts[find_key(self.mutual_edge_counts, (rel, r))] += 1

            # increment two-paths for x-src-trg and src-trg-y
            self.two_path_counts[(r, rel)] += self.in_degs[r][0,src_idx]
            self.two_path_counts[(rel, r)] += self.out_degs[r][0,trg_idx]

            # triplet cache members
            for r2, g2 in list(self.graphs.items()):
                if rel == 'hypernym' and r2 == 'co_hypernym':
                    continue
                    
                # increment transitive closures from two-paths src-x-trg
                if self.out_degs[r][0,src_idx] > 0 and self.in_degs[r2][0,trg_idx] > 0:
                    r_r2_betweens = (g[src_idx] * g2[:,trg_idx]).sum()
                    self.transitive_closure_counts[(r, r2, rel)] += r_r2_betweens
                # increment directed triangle count
                if self.out_degs[r2][0,trg_idx] > 0 and self.in_degs[r][0,src_idx] > 0:
                    r_r2_cycles = (g2[trg_idx] * g[:,src_idx]).sum()
                    rs_key = find_cyclic_key(self.directed_triangle_counts, (r, rel, r2))
                    self.directed_triangle_counts[rs_key] += r_r2_cycles

        # increment src's out_degree and trg's in_degree in rel and update all related caches
        self.out_degs[rel][0,src_idx] += 1
        self.in_degs[rel][0,trg_idx] += 1
        
        self.update_stars_cache_from_edge(rel, src_idx, trg_idx)
        
        # update features from caches
        self.update_features()
        if rel == 'hypernym':
            self.zero_all_feats('co_hypernym')
        
        if report_feat_diff:
            timeprint('changed feature values:')
            diff_keys = [k for k in self.feature_vals if self.feature_vals[k] != backup_feats[k]]
            print('\n'.join(['{}: from {} to {}'\
                  .format(k, backup_feats[k], self.feature_vals[k]) for k in diff_keys]))

        # compute score for loss
        ret = self.ergm_score()

        if permanent:
            # add actual edge
            self.graphs[rel][src_idx,trg_idx] = 1
            # update score
            self.dy_score = ret
            self.score = ret.scalar_value()
            self.score_is_stale = False
        else:
            self.reread_cache(backup_cache)
            self.feature_vals = backup_feats

        return ret
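Because permanent defaults to False, add_edge can score candidate edges without committing them; a hedged sketch of ranking candidates this way (ergm and candidates are hypothetical names):

# pick the candidate (src, trg) whose tentative insertion maximizes the ERGM score
scored = [(ergm.add_edge(s, t, 'hypernym').scalar_value(), s, t) for s, t in candidates]
best_score, s, t = max(scored)
ergm.add_edge(s, t, 'hypernym', permanent=True)  # commit the winning edge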
Example #18
    def add_edge(self,
                 src_idx,
                 trg_idx,
                 rel,
                 permanent=False,
                 caches=None,
                 report_feat_diff=False):
        """
        Uses cache to update feature values and produce score
        :param src_idx: index of source node from edge to add
        :param trg_idx: index of target node from edge to add
        :param rel: relation type
        :param permanent: True if node assignment to remain as is (inference mode, or restitution)
        :param caches: optional - precomputed backup members (cache, features)
        :return: new ergm score
        """
        # back cache up
        if caches is not None:
            backup_cache = copy.deepcopy(caches[0])
            backup_feats = caches[1]
        elif not permanent:
            backup_cache = copy.deepcopy(self.cache)
            backup_feats = copy.deepcopy(self.feature_vals)
        else:
            backup_cache = None
            backup_feats = None

        # update cache members
        # increment edge count for r
        self.edge_counts[rel] += 1

        # pair cache members
        for r, g in list(self.graphs.items()):
            if rel == 'hypernym' and r == 'co_hypernym':
                continue

            # increment mutual edge count for pairs with trg-src edges
            if g[trg_idx, src_idx] == 1:
                self.mutual_edge_counts[find_key(self.mutual_edge_counts,
                                                 (rel, r))] += 1

            # increment two-paths for x-src-trg and src-trg-y
            self.two_path_counts[(r, rel)] += self.in_degs[r][0, src_idx]
            self.two_path_counts[(rel, r)] += self.out_degs[r][0, trg_idx]

            # triplet cache members
            for r2, g2 in list(self.graphs.items()):
                if rel == 'hypernym' and r2 == 'co_hypernym':
                    continue

                # increment transitive closures from two-paths src-x-trg
                if self.out_degs[r][0, src_idx] > 0 and self.in_degs[r2][
                        0, trg_idx] > 0:
                    r_r2_betweens = (g[src_idx] * g2[:, trg_idx]).sum()
                    self.transitive_closure_counts[(r, r2,
                                                    rel)] += r_r2_betweens
                # increment directed triangle count
                if self.out_degs[r2][0, trg_idx] > 0 and self.in_degs[r][
                        0, src_idx] > 0:
                    r_r2_cycles = (g2[trg_idx] * g[:, src_idx]).sum()
                    rs_key = find_cyclic_key(self.directed_triangle_counts,
                                             (r, rel, r2))
                    self.directed_triangle_counts[rs_key] += r_r2_cycles

        # increment src's out_degree and trg's in_degree in rel and update all related caches
        self.out_degs[rel][0, src_idx] += 1
        self.in_degs[rel][0, trg_idx] += 1

        self.update_stars_cache_from_edge(rel, src_idx, trg_idx)

        # update features from caches
        self.update_features()
        if rel == 'hypernym':
            self.zero_all_feats('co_hypernym')

        if report_feat_diff:
            timeprint('changed feature values:')
            diff_keys = [
                k for k in self.feature_vals
                if self.feature_vals[k] != backup_feats[k]
            ]
            print('\n'.join(['{}: from {} to {}'\
                  .format(k, backup_feats[k], self.feature_vals[k]) for k in diff_keys]))

        # compute score for loss
        ret = self.ergm_score()

        if permanent:
            # add actual edge
            self.graphs[rel][src_idx, trg_idx] = 1
            # update score
            self.dy_score = ret
            self.score = ret.scalar_value()
            self.score_is_stale = False
        else:
            self.reread_cache(backup_cache)
            self.feature_vals = backup_feats

        return ret
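
A usage note on the method above (illustrative, not taken from the original source): calling add_edge with permanent=False scores a hypothetical edge and then rolls the cache and feature values back, while permanent=True commits the edge to the stored graph. A minimal sketch, assuming an instance named ergm:

# `ergm` is a stand-in name for an instance of the class above; indices and relation are toy values
tentative = ergm.add_edge(3, 7, 'hypernym', permanent=False)   # caches restored afterwards
print('score with hypothetical edge:', tentative.scalar_value())

committed = ergm.add_edge(3, 7, 'hypernym', permanent=True)    # writes graphs['hypernym'][3, 7] = 1
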
Example #20
0
    parser.add_argument("--regularize", type=float, default=0.1)
    parser.add_argument("--neg-samp", type=int, default=M3GM_DEFAULT_NEGS, help="number of negative samples")
    parser.add_argument("--no-assoc-bp", action='store_true', help="do not backprop into association model")
    parser.add_argument("--dropout", type=float, default=0.0,
                        help="dropout for association model only, set to 0.0 in no-assoc-bp mode")

    # testing
    # NB: argparse's type=bool treats any non-empty string as True; see the note after this example
    parser.add_argument("--rule-override", type=bool, default=True, help="rule-based override for symmetric relations")
    parser.add_argument("--rerank", type=int, default=100, help="number of top results to rerank")
    opts = parser.parse_args()

    # init
    start_time = datetime.now()

    # reporting
    timeprint('graphs file = {}'.format(opts.input))
    timeprint('embeddings file = {}'.format(opts.embeddings if opts.embeddings is not None \
                                                else 'of size {}'.format(opts.emb_size)))
    timeprint('association mode = {}'.format(opts.assoc_mode))
    timeprint('reranker output file = {}'.format(opts.rerank_out))
    if opts.model is None:
        timeprint('model output file = {}, only init = {}'.format(opts.model_out, opts.model_only_init))
        timeprint('epochs = {}'.format(opts.epochs))
        timeprint('Adagrad learning rate = {}'.format(opts.learning_rate))
        timeprint('neg-samp = {}'.format(opts.neg_samp))
        timeprint('rand-nodes = {}'.format(opts.rand_nodes))
        timeprint('dropout = {}'.format(opts.dropout))
        timeprint('regularizer lambda = {}'.format(opts.regularize))
    else:
        timeprint('model file = {}, ergm model file = {}'.format(opts.model, opts.ergm_model))
        if opts.ergm_model is not None:
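
A caveat on the --rule-override flag above: argparse's type=bool does not parse textual booleans, since bool('False') is True, so any non-empty string passed on the command line is read as True. If strict parsing were wanted, a converter along these lines could be substituted (a sketch, not part of the original script):

import argparse

def str2bool(v):
    # map common textual booleans to bool; reject anything else
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got {!r}'.format(v))

# hypothetical replacement for the flag above:
# parser.add_argument("--rule-override", type=str2bool, default=True,
#                     help="rule-based override for symmetric relations")
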
Example #21
0
    def __init__(self,
                 graphs,
                 embeddings,
                 mode=TRANSLATIONAL_EMBED_MODE,
                 dropout=0.0,
                 model_path=None):
        """
        :param graphs: dictionary of <relation:CSR-format graph>s, node-aligned
        :param embeddings: list of numpy array embeddings, indices aligned to nodes
        :param mode: mode of calculating association score, options: {}
        """.format(MODES_STR)
        # input validation
        graph_sizes = list(set([g.shape[0] for g in list(graphs.values())]))
        assert len(graph_sizes) == 1
        assert len(embeddings) == graph_sizes[0], '{} != {}'.format(
            len(embeddings), graph_sizes[0])

        # raw members
        self.graphs = {
            canonicalize_name(k): g
            for k, g in list(graphs.items())
        }
        self.mode = mode

        # documentation-y members
        self.relation_names = sorted(self.graphs.keys())
        if 'co_hypernym' in self.relation_names:
            self.relation_names.remove('co_hypernym')
        self.vocab_size = graph_sizes[0]
        self.R = len(self.relation_names)
        self.emb_dim = len(embeddings[0])
        self.dropout = dropout

        # model members
        self.model = dy.Model()
        # TODO consider using no_update param for embeddings
        self.embeddings = self.model.add_lookup_parameters(
            (self.vocab_size, self.emb_dim))
        self.embeddings.init_from_array(embeddings)

        # init association parameter
        self.no_assoc = False  # so can be overridden in inheritors

        # first determine the association parameter dimensions for the chosen mode
        if self.mode == BILINEAR_MODE:  # full-rank bilinear matrix
            assoc_dim = (self.emb_dim, self.emb_dim)
        elif self.mode == DIAG_RANK1_MODE:  # diagonal bilinear matrix + rank 1 matrix
            # first row = diagonal
            # second row = 'source factor'
            # third row = 'target factor'
            assoc_dim = (3, self.emb_dim)
        elif self.mode == TRANSLATIONAL_EMBED_MODE:  # additive relational vector
            assoc_dim = self.emb_dim
        elif self.mode == DISTMULT:  # diagonal bilinear matrix
            assoc_dim = self.emb_dim
        else:
            raise ValueError('unsupported mode: {}. allowed are {}'\
                             .format(self.mode, ', '.join(MODES_STR)))

        # init actual parameter
        self.word_assoc_weights = {
            r: self.model.add_parameters(assoc_dim)
            for r in self.relation_names
        }
        if model_path is not None:
            self.model.populate(model_path + '.dyn')

        timeprint('finished initialization for association model.')
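
For orientation, a construction sketch with toy inputs. AssocModel is a hypothetical name standing in for the class whose __init__ is shown above (its real name is not visible in this excerpt), and the mode constant is assumed to be importable from the same module:

import numpy as np
from scipy.sparse import csr_matrix

n_nodes, emb_dim = 6, 4
toy_graphs = {
    'hypernym': csr_matrix((n_nodes, n_nodes), dtype=np.int8),
    'member_meronym': csr_matrix((n_nodes, n_nodes), dtype=np.int8),
}
toy_embeddings = np.random.rand(n_nodes, emb_dim)   # row i = embedding of node i

model = AssocModel(toy_graphs, toy_embeddings,       # hypothetical class name
                   mode=TRANSLATIONAL_EMBED_MODE,    # one additive vector per relation
                   dropout=0.0)
print(model.relation_names, model.R, model.emb_dim)
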
Example #22
0
def eval(prev_graphs, graphs, ergm, opts, N, log_file, rerank_file):
    writing = log_file is not None

    caches = (copy.deepcopy(ergm.cache),
              copy.deepcopy(ergm.feature_vals))

    rel_all_ranks = {}  # for final results
    rel_pre_ranks = {}  # for improvement analysis
    rel_erg_ranks = {}  # for ergm-alone analysis
    all_pre_ranks = []
    all_all_ranks = []
    all_erg_ranks = []
    insts = Counter()
    total_misses = Counter()
    overrides = Counter()
    rerank_ups = Counter()
    rerank_downs = Counter()
    erg_ups = Counter()
    erg_downs = Counter()
    rerank_diff = Counter()
    erg_diff = Counter()

    change_idx = 1

    rels_order = list(graphs.items())
    for rel, te_gr in rels_order:
        if rel == 'co_hypernym':
            continue

        # set up
        if writing:
            timeprint('testing relation {}'.format(rel))
            log_file.write('relation: {}\n'.format(rel))
        # add incrementally, eval each edge, revert
        tr_gr = prev_graphs[rel]  # to filter known connections
        s_assoc_cache = ergm.source_ranker_cache(rel)
        t_assoc_cache = ergm.target_ranker_cache(rel)
        override_rel = opts.rule_override and rel in SYMMETRIC_RELATIONS
        all_ranks = []
        pre_ranks = []
        erg_ranks = []
        if override_rel and writing:
            log_file.write('RELATION OVERRIDE\n')
        node_order = list(range(N))  # DO NOT RANDOMIZE THIS - NEED TO PREDICT BOTH SIDES
        for node in tqdm(node_order):
            s_trues, s_unch_loc_ranks, s_loc_gold_ranks, s_gold_reranked, s_gold_ergs, s_pls, change_idx = \
                node_loop(change_idx, ergm, rel, node, s_assoc_cache,
                          caches, tr_gr, te_gr, override_rel, opts.rerank, True, log_file, rerank_file)
            t_trues, t_unch_loc_ranks, t_loc_gold_ranks, t_gold_reranked, t_gold_ergs, t_pls, change_idx = \
                node_loop(change_idx, ergm, rel, node, t_assoc_cache,
                          caches, tr_gr, te_gr, override_rel, opts.rerank, False, log_file, rerank_file)

            total_trues = s_trues + t_trues
            insts[rel] += (total_trues)
            if override_rel:
                overrides[rel] += total_trues

            ulr = s_unch_loc_ranks + t_unch_loc_ranks
            lgr = s_loc_gold_ranks + t_loc_gold_ranks
            grr = s_gold_reranked + t_gold_reranked
            ger = s_gold_ergs + t_gold_ergs
            total_misses[rel] += (len(ulr))

            pre_ranks.extend(lgr)
            if override_rel:
                erg_ranks.extend(lgr)
                all_ranks.extend(lgr)
            else:
                all_ranks.extend(ulr + grr)
                erg_ranks.extend(ulr + ger)

            for pl in s_pls + t_pls:
                if pl[3] < pl[2]:
                    rerank_ups[rel] += 1
                if pl[3] > pl[2]:
                    rerank_downs[rel] += 1
                if pl[4] < pl[2]:
                    erg_ups[rel] += 1
                if pl[4] > pl[2]:
                    erg_downs[rel] += 1
                rerank_diff[rel] += (pl[2] - pl[3])
                erg_diff[rel] += (pl[2] - pl[4])

        rel_all_ranks[rel] = all_ranks
        rel_pre_ranks[rel] = pre_ranks
        rel_erg_ranks[rel] = erg_ranks

        all_all_ranks.extend(all_ranks)
        all_pre_ranks.extend(pre_ranks)
        all_erg_ranks.extend(erg_ranks)

    if writing:
        log_file.write('\nper relation:\n')
        for rel in list(graphs.keys()):
            if insts[rel] > 0 and insts[rel] - total_misses[rel] > 0:
                log_file.write('\n{}:\n'.format(rel))
                log_file.write('{} instances, {} misses\n'.format(insts[rel], total_misses[rel]))
                log_file.write('reranks: {} up, {} down\n'.format(rerank_ups[rel], rerank_downs[rel]))
                log_file.write('ERGM only: {} up, {} down\n'.format(erg_ups[rel], erg_downs[rel]))
                log_file.write('rank diff: {}, ERGM only: {}\n'.format(rerank_diff[rel], erg_diff[rel]))
                log_file.write('metrics: pre-rank\trerank\tERGM only\n')
                log_file.write('average rank: {:.5f}\t{:.5f}\t{:.5f}\n'.format(np.average(rel_pre_ranks[rel]),
                                                                               np.average(rel_all_ranks[rel]),
                                                                               np.average(rel_erg_ranks[rel])))
                log_file.write('mrr: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mrr(rel_pre_ranks[rel]), mrr(rel_all_ranks[rel]),
                                                                      mrr(rel_erg_ranks[rel])))
                log_file.write(
                    'mq: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mq(rel_pre_ranks[rel], N), mq(rel_all_ranks[rel], N),
                                                          mq(rel_erg_ranks[rel], N)))
                log_file.write('h@100: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(rel_pre_ranks[rel], n=100),
                                                                        h_at_n(rel_all_ranks[rel], n=100),
                                                                        h_at_n(rel_erg_ranks[rel], n=100)))
                log_file.write(
                    'h@10: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(rel_pre_ranks[rel]), h_at_n(rel_all_ranks[rel]),
                                                            h_at_n(rel_erg_ranks[rel])))
                log_file.write('h@1: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(rel_pre_ranks[rel], n=1),
                                                                      h_at_n(rel_all_ranks[rel], n=1),
                                                                      h_at_n(rel_erg_ranks[rel], n=1)))

        log_file.write('\ntotals:\n')
        log_file.write('total number of instances: {}\n'.format(sum(insts.values())))
        log_file.write('total misses: {}\n'.format(sum(total_misses.values())))
        log_file.write('overrides: {}\n'.format(sum(overrides.values())))
        log_file.write(
            'rerank improvements: {}; regressions: {}\n'.format(sum(rerank_ups.values()), sum(rerank_downs.values())))
        log_file.write(
            'only ERGM improvements: {}; regressions: {}\n'.format(sum(erg_ups.values()), sum(erg_downs.values())))
        log_file.write(
            'total rank diffs: rerank {}, only ERGM {}\n'.format(sum(rerank_diff.values()), sum(erg_diff.values())))

        log_file.write('metrics: pre-rank\trerank\tERGM only\n')
        log_file.write(
            'average rank: {:.5f}\t{:.5f}\t{:.5f}\n'.format(np.average(all_pre_ranks), np.average(all_all_ranks),
                                                            np.average(all_erg_ranks)))
        log_file.write(
            'mrr: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mrr(all_pre_ranks), mrr(all_all_ranks), mrr(all_erg_ranks)))
        log_file.write(
            'mq: {:.4f}\t{:.4f}\t{:.4f}\n'.format(mq(all_pre_ranks, N), mq(all_all_ranks, N), mq(all_erg_ranks, N)))
        log_file.write(
            'h@100: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(all_pre_ranks, n=100), h_at_n(all_all_ranks, n=100),
                                                     h_at_n(all_erg_ranks, n=100)))
        log_file.write('h@10: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(all_pre_ranks), h_at_n(all_all_ranks),
                                                               h_at_n(all_erg_ranks)))
        log_file.write('h@1: {:.5f}\t{:.5f}\t{:.5f}\n'.format(h_at_n(all_pre_ranks, n=1), h_at_n(all_all_ranks, n=1),
                                                              h_at_n(all_erg_ranks, n=1)))

    print('number of instances:', sum(insts.values()))
    print('total misses:', sum(total_misses.values()))
    print('overrides:', sum(overrides.values()))
    print('average rank:', np.average(all_all_ranks))
    print('mrr: {:.4f}'.format(mrr(all_all_ranks)))
    print('mq:', mq(all_all_ranks, N))
    print('h@100: {:.5f}'.format(h_at_n(all_all_ranks, n=100)))
    print('h@10: {:.5f}'.format(h_at_n(all_all_ranks)))
    print('h@1: {:.5f}'.format(h_at_n(all_all_ranks, n=1)))

    return mrr(all_all_ranks), h_at_n(all_all_ranks, n=10), h_at_n(all_all_ranks, n=3), h_at_n(all_all_ranks, n=1)
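
The eval routine above relies on rank-based metric helpers (mrr, mq, h_at_n) whose definitions are not shown in this excerpt. The sketch below reflects one plausible reading, assuming 1-based ranks, a hits@10 default for h_at_n (matching the h@10 label in the log output), and a mean-quantile formula normalized by the candidate count N; the actual project code may differ:

import numpy as np

def mrr(ranks):
    # mean reciprocal rank over 1-based ranks
    return float(np.mean([1.0 / r for r in ranks]))

def h_at_n(ranks, n=10):
    # hits@n: fraction of gold items ranked within the top n
    return float(np.mean([1.0 if r <= n else 0.0 for r in ranks]))

def mq(ranks, N):
    # mean quantile (assumed form): average relative position of the gold item among N candidates
    return float(np.mean([(N - r) / (N - 1.0) for r in ranks]))
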
Example #23
0
def macro_node_iteration(opts, multi_graph, assoc_cache, trainer, log_file,
                         synsets, rel, src_i, use_assoc):
    """
    One node-relation iteration in a macro-level pass over the multigraph
    :param opts: run options (parsed command-line arguments) from the calling routine
    :param multi_graph: trained data structure
    :param assoc_cache: cache for association model
    :param trainer: dynet training module
    :param log_file: open log file handle for writing per-sample score records
    :param synsets: synset name dictionary for reporting
    :param rel: relation type for iteration
    :param src_i: source node ID for iteration
    :param use_assoc: use association score model
    :return: state of cache after iteration
    """

    g = multi_graph.graphs[rel]
    N = multi_graph.vocab_size

    # set up iteration
    if opts.debug:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    else:
        dy.renew_cg()

    # keep existing score for all deltas
    multi_graph.rescore()
    score_with_all = multi_graph.dy_score

    # report progress
    perform_verbosity_steps = opts.v > 1 or (opts.v > 0 and src_i > 0
                                             and src_i % 10 == 0)
    if perform_verbosity_steps:
        timeprint('iterating on node {}, {}, current score = {:.6f}'\
                  .format(src_i, synsets[src_i], score_with_all.scalar_value()))

    # true targets scoring

    true_targets = targets(g, src_i)

    if len(true_targets) == 0:
        # don't perform negative sampling without true targets
        return assoc_cache

    # compute log likelihood on targets
    # each used to be multiplied by multi_graph.a_scale
    target_assoc_scores = {
        t: multi_graph.word_assoc_score(src_i, t, rel)
        for t in true_targets
    }
    if opts.no_assoc_bp:
        # turn into values to detach from computation graph
        target_assoc_scores = {
            t: t_as.value()
            for t, t_as in list(target_assoc_scores.items())
        }
    target_scores = {
        t: score_with_all + t_as
        for t, t_as in list(target_assoc_scores.items())
    }

    # false targets scoring - importance sampling

    # compute softmax over all false targets based on bilinear scores
    if use_assoc:
        assoc_sc = multi_graph.score_from_source_cache(assoc_cache, src_i)
        neg_assocs = {
            j: s
            for j, s in enumerate(assoc_sc)
            if j not in true_targets and j != src_i
        }
    else:
        neg_assocs = {
            j: 1.0
            for j in range(N) if j not in true_targets and j != src_i
        }
    neg_probs = softmaxify(neg_assocs)

    # collect negative samples
    # TODO see if searchsorted can work here too (issue in dynet repo)
    neg_samples = {t: [dy.np.random.choice(range(len(neg_assocs)), p=neg_probs)\
                      for _ in range(opts.neg_samp)]\
                   for t in true_targets}  # TODO: sample without replacement?

    # for reporting
    if perform_verbosity_steps:
        neg_sample_idcs = []
        for negs in list(neg_samples.values()):
            neg_sample_idcs.extend([list(neg_assocs.keys())[j] for j in negs])

    # compute neg log likelihood on negative samples
    margins = []
    for t in true_targets:
        t_score = target_scores[t]
        negs = [list(neg_assocs.keys())[j] for j in neg_samples[t]]
        # each used to be multiplied by multi_graph.a_scale
        neg_assoc_scores = [
            multi_graph.word_assoc_score(src_i, j, rel) for j in negs
        ]
        if opts.no_assoc_bp:
            # turn into values to detach from computation graph
            neg_assoc_scores = [s.value() for s in neg_assoc_scores]
        # prepare graph for pass
        multi_graph.remove_edge(src_i, t, rel, permanent=True)
        t_cache = (copy.deepcopy(multi_graph.cache),
                   copy.deepcopy(multi_graph.feature_vals))
        for jas, j, origj in zip(neg_assoc_scores, negs, neg_samples[t]):
            q_norm = 1.0 / neg_probs[origj]
            g_score = multi_graph.add_edge(src_i,
                                           j,
                                           rel,
                                           caches=t_cache,
                                           report_feat_diff=opts.v > 1)
            margins.append(
                dy.rectify(g_score + jas + MARGIN - t_score) * q_norm)
            log_file.write('{}\t{}\t{}\t{}\t{:.2e}\t{:.2e}\t{:.2e}\n'\
                         .format(rel, src_i, t, j, t_score.scalar_value(),
                                 g_score.scalar_value(), jas if type(jas) == float else jas.value()))
        # revert graph for next margin iteration
        multi_graph.add_edge(src_i, t, rel, permanent=True)
    node_loss = dy.esum(margins)

    # backprop and recompute score
    if perform_verbosity_steps:
        timeprint('selected nodes {} with probabilities {}'\
                  .format(neg_sample_idcs,
                          ['{:.2e}'.format(neg_probs[j]) for negs in list(neg_samples.values()) for j in negs]))
        timeprint('overall {} loss = {:.6f}'\
                  .format('margin' if opts.margin_loss else 'neg log', node_loss.scalar_value()))

        # record state for later reporting
        pre_weights = multi_graph.ergm_weights.as_array()
        pre_assoc = multi_graph.word_assoc_weights[rel].as_array()

    # add regularization
    if multi_graph.regularize > 0.0:
        node_loss += multi_graph.regularize * dy.l2_norm(
            dy.parameter(multi_graph.ergm_weights))

    # perform actual learning
    node_loss.backward()
    trainer.update()

    if perform_verbosity_steps:
        post_weights = multi_graph.ergm_weights.as_array()
        post_assoc = multi_graph.word_assoc_weights[rel].as_array()
        w_diff = post_weights - pre_weights
        a_diff = post_assoc - pre_assoc
        timeprint('changed weights = {}'.format(len(w_diff.nonzero()[0])))
        timeprint('changed pre_assoc = {}, norm {}'\
                  .format(len(a_diff.nonzero()[0]), np.linalg.norm(a_diff)))

    # recompute assoc_cache columns for src_i and participating targets
    if use_assoc and not opts.no_assoc_bp:
        # TODO normalize embeddings?
        return multi_graph.source_ranker_cache(rel)
    return assoc_cache
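
Two helpers used above, targets and softmaxify, are defined elsewhere in the project; the sketch below only reconstructs the contract the calling code appears to rely on and is not the original implementation:

import numpy as np

def targets(g, src_i):
    # column indices of nonzero entries in row src_i of the CSR graph, i.e. nodes src_i points to
    return set(g[src_i].nonzero()[1])

def softmaxify(neg_assocs):
    # turn {candidate: association score} into a probability vector aligned with the
    # dict's iteration order, as expected by np.random.choice(range(len(neg_assocs)), p=...)
    scores = np.array(list(neg_assocs.values()), dtype=np.float64)
    scores -= scores.max()  # numerical stability
    exps = np.exp(scores)
    return exps / exps.sum()
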
Example #24
0

def lemmas(s):
    if ALL_LEMMAS:
        name = '_'.join(s.lemma_names())
    else:
        name = s.lemma_names()[0]
    if LOWER:
        name = name.lower()
    return name.split('_')
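
The join-then-split in lemmas is easy to misread: multiword lemma names come out of WordNet joined by underscores, so the final split('_') breaks them back into single words. A small illustration (outputs are typical for WordNet 3.0 and assume the ALL_LEMMAS and LOWER flags are both enabled):

from nltk.corpus import wordnet as wn

s = wn.synset('dog.n.01')
print(s.lemma_names())  # e.g. ['dog', 'domestic_dog', 'Canis_familiaris']
print(lemmas(s))        # e.g. ['dog', 'domestic', 'dog', 'canis', 'familiaris']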


if __name__ == '__main__':
    if len(sys.argv) < 3:
        timeprint(
            'usage: embed_from_words.py input_embs output_embs [WN prediction dataset]'
        )
        exit(1)

    in_file = sys.argv[1]

    # create target dataset
    if len(sys.argv) > 3:
        # third param is WN dataset
        wn_vocab = load_prediction_dataset(sys.argv[3])[-1]
        synsets = [wn.synset(w) for w in wn_vocab]
    else:
        synsets = list(wn.all_synsets())
    timeprint('read {} synsets'.format(len(synsets)))

    target_words = set()