Beispiel #1
0
def print_rule(rule, id2relation):
    """Log a rule as ``premise... => hypothesis conf`` with relation names."""
    premises, hypothesis, conf = rule

    def readable(atom):
        # atom is (head, tail, relation_id); display head, relation name, tail.
        head, tail, relation = atom
        return (head, id2relation[relation], tail)

    parts = [readable(premise) for premise in premises]
    parts.append(('=>',))
    parts.append(readable(hypothesis))
    parts.append((str(conf),))
    print_time_info('  '.join(' '.join(part) for part in parts))
Beispiel #2
0
 def save(self, directory):
     """Pickle this object into ``directory / 'cgc.pkl'``, creating the
     directory first when it does not exist yet."""
     if not directory.exists():
         directory.mkdir()
     target = directory / 'cgc.pkl'
     with open(target, 'wb') as stream:
         pickle.dump(self, stream)
     print_time_info('Successfully save cgc to %s.' % target)
Beispiel #3
0
    def evaluate(self):
        """Evaluate entity alignment on the test seeds and log the metrics.

        Builds the similarity matrix over all test entity pairs, penalizes
        pairs already aligned during bootstrapping, writes Hits@N / MR / MRR
        to TensorBoard, and tracks the best Hits@1 so far (``bad_result``
        counts consecutive non-improving evaluations).
        """
        self.net.eval()
        # Unzip (source, target) test seed pairs into two parallel id lists.
        sr_data, tg_data = list(zip(*self.cgc.test_entity_seeds))
        sr_data = torch.tensor(sr_data, dtype=torch.int64)
        tg_data = torch.tensor(tg_data, dtype=torch.int64)
        if self.is_cuda:
            sr_data = sr_data.cuda()
            tg_data = tg_data.cuda()
        sim = self.net.predict((sr_data, tg_data))
        # Push down the score of already-aligned pairs so they do not
        # dominate the ranking.  NOTE(review): assumes `sim` is a 2-D
        # (source, target) score matrix — confirm against net.predict.
        for x, y in self.aligned_entites:
            sim[x, y] -= 1.0
        top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl = get_hits(sim)
        # top_*[0] is Hits@1, top_*[1] is Hits@10 (get_hits default top_k).
        self.writer.add_scalars('data/Hits@N', {'Hits@1 sr': top_lr[0],
                                                'Hits@10 sr': top_lr[1],
                                                'Hits@1 tg': top_rl[0],
                                                'Hits@10 tg': top_rl[1]},
                                self.now_epoch)
        self.writer.add_scalars('data/Rank', {'MR sr': mr_lr,
                                              'MRR sr': mrr_lr,
                                              'MR tg': mr_rl,
                                              'MRR tg': mrr_rl},
                                self.now_epoch)

        # Track the best combined Hits@1; non-improving epochs bump bad_result.
        if top_lr[0] + top_rl[0] > self.best_hits_1[1] + self.best_hits_1[2]:
            self.best_hits_1 = (self.now_epoch, top_lr[0], top_rl[0])
            self.bad_result = 0
        else:
            self.bad_result += 1
        print_time_info('Current best Hits@1 at the %dth epoch: (%.2f, %.2f)' % (self.best_hits_1))
Beispiel #4
0
 def init(self, directory, load=False):
     """Prepare the CrossGraphCompletion data for this graph pair.

     When *load* is true, try to restore a cached instance from
     ``directory / 'running_temp'`` and fall back to building it from
     scratch; otherwise always build (and cache) a fresh one.
     """
     set_random_seed()
     directory = Path(directory)
     self.graph_pair = directory.name

     def build_cgc():
         # Build CrossGraphCompletion from scratch and cache it on disk.
         # (Previously duplicated in the except- and else-branches.)
         self.cgc = CrossGraphCompletion(directory, self.train_seeds_ratio,
                                         self.rule_transfer,
                                         self.graph_completion)
         self.cgc.init()
         self.cgc.save(directory / 'running_temp')

     if load:
         try:
             self.cgc = CrossGraphCompletion.restore(directory /
                                                     'running_temp')
         except FileNotFoundError:
             print_time_info(
                 'CrossGraphCompletion cache file not found, start from the beginning.'
             )
             build_cgc()
     else:
         build_cgc()
     self.cgc.check()
Beispiel #5
0
 def print_parameter(self, file=None):
     """Print the net class name and every scalar hyper-parameter.

     Only plain scalar settings (int/float/str/bool attributes) are shown;
     tensors, datasets and other objects are skipped on purpose.
     Writes to *file* when given, otherwise to stdout.
     """
     parameters = self.__dict__
     # Fixed typo in the emitted header: "setttings" -> "settings".
     print_time_info('Parameter settings:', dash_top=True, file=file)
     print('\tNet: ', type(self.net).__name__, file=file)
     for key, value in parameters.items():
         # type(...) in {...} (not isinstance) deliberately excludes
         # subclasses such as numpy scalars.
         if type(value) in {int, float, str, bool}:
             print('\t%s:' % key, value, file=file)
     print('---------------------------------------', file=file)
Beispiel #6
0
def _print_new_rules(bi_new_rules, id2relation_sr, id2relation_tg):
    """Log (up to) the first 20 newly mined rules for each language side."""
    for language, rules in bi_new_rules.items():
        print_time_info(language, dash_top=True)
        # Pick the relation-name table matching this side.
        id2relation = id2relation_sr if language == 'sr' else id2relation_tg
        for rule in rules[:20]:
            print_rule(rule, id2relation)
Beispiel #7
0
 def atom_parser(string):
     """Parse every ``?x  <rel>  ?y`` atom in *string* into (head, tail, relation)."""
     atoms = [(match.group(1), match.group(3), int(match.group(2)))
              for match in atom_regex.finditer(string)]
     if not atoms:
         print('-------------------------')
         print_time_info(string)
         raise ValueError('Parse atom failed.')
     return atoms
Beispiel #8
0
def _print_new_triple_confs(bi_new_triple_confs, id2entity_sr, id2entity_tg,
                            id2relation_sr, id2relation_tg):
    """Log ten randomly sampled completed triples with their confidence."""
    for language, triple_confs in bi_new_triple_confs.items():
        print_time_info(language, dash_top=True)
        # Select the id->name tables for this language side once.
        if language == 'sr':
            id2entity, id2relation = id2entity_sr, id2relation_sr
        else:
            id2entity, id2relation = id2entity_tg, id2relation_tg
        # random.choices samples with replacement, so k=10 works even for
        # small triple sets.
        for triple in random.choices(list(triple_confs.keys()), k=10):
            print_triple(triple, id2entity, id2relation, end='')
            print(' ', triple_confs[triple])
Beispiel #9
0
 def init_log(self, log_dir):
     """Create *log_dir*, attach a SummaryWriter and dump the parameters.

     Refuses to overwrite an existing directory.
     """
     log_dir = Path(log_dir)
     if log_dir.exists():
         raise FileExistsError('The directory already exists!')
     log_dir.mkdir()
     comment = log_dir.name
     self.writer = SummaryWriter(str(log_dir))
     with open(log_dir / 'parameters.txt', 'w') as handle:
         print_time_info(comment, file=handle)
         self.print_parameter(handle)
     print_time_info('Successfully initialized log in "%s" directory!' % log_dir)
Beispiel #10
0
 def init_log(self, log_dir):
     """(Re-)create *log_dir* — wiping any previous run — and set up logging."""
     log_dir = Path(log_dir)
     if log_dir.exists():
         # Stale results from an earlier run are removed wholesale.
         print('Warning: we will remove %s' % (str(log_dir)))
         shutil.rmtree(str(log_dir))
     log_dir.mkdir()
     self.writer = SummaryWriter(str(log_dir))
     comment = log_dir.name
     with open(log_dir / 'parameters.txt', 'w') as handle:
         print_time_info(comment, file=handle)
         self.print_parameter(handle)
     print_time_info('Successfully initialized log in "%s" directory!' %
                     log_dir)
Beispiel #11
0
def rule_parser(file_path):
    '''
    Accept the output of an AMIE+ .jar software and transform it into a list
    of rules.

    Each returned rule is ``(premises, hypothesis, conf)`` where premises is
    a list of ``(head_var, tail_var, relation_id)`` atoms with variables
    canonicalized to 'a', 'b', ..., hypothesis is a single such atom, and
    conf is the rule's (float) confidence taken from the 4th TSV column.

    Raises ValueError when a line yields no atoms or more than one
    hypothesis atom.
    '''
    atom_regex = re.compile(r'\?([a-z])  <([0-9]*?)>  \?([a-z])')

    def atom_parser(string):
        # Extract every "?x  <relation_id>  ?y" atom as (head, tail, relation).
        atoms = [(m.group(1), m.group(3), int(m.group(2)))
                 for m in atom_regex.finditer(string)]
        if not atoms:
            print('-------------------------')
            print_time_info(string)
            raise ValueError('Parse atom failed.')
        return atoms

    def premises_reformat(premises):
        # Canonicalize variable names to 'a', 'b', ... (sorted order) so
        # syntactically different but equivalent premises compare equal.
        variables = sorted({v for atom in premises for v in atom[:2]})
        mapping = {v: chr(ord('a') + i) for i, v in enumerate(variables)}
        for i, (head, tail, relation) in enumerate(premises):
            premises[i] = (mapping[head], mapping[tail], int(relation))
        return premises

    # AMIE+ rule lines start with '?'; other lines are headers/diagnostics.
    with open(file_path, 'r', encoding='utf8') as f:
        lines = [line.strip() for line in f if line[0] == '?']

    rules = []
    for line in lines:
        # TSV layout: column 0 is the rule text, column 3 its PCA confidence.
        fields = line.split('\t')
        rule, conf = fields[0], float(fields[3])
        premises, hypothesis = rule.split('=>')
        premises = premises_reformat(atom_parser(premises))
        hypothesis = atom_parser(hypothesis)
        if len(hypothesis) != 1:
            print('-------------------------')
            print_time_info(rule)
            raise ValueError('Parse rule failed.')
        rules.append((premises, hypothesis[0], conf))
    return rules
Beispiel #12
0
    def init_triple_coefficient(self):
        """Compute per-relation confidence and importance coefficients.

        Fills ``relation2conf_{sr,tg}`` (average PCA confidence of rules
        concluding in each relation) and ``relation2imp_{sr,tg}``
        (functionality-based importance), logging size and mean of each map.
        """

        def _report(name, mapping):
            # Log "<name> num: <size> average: <mean value>" for one map.
            # (This block was previously copy-pasted four times.)
            average = sum(mapping.values()) / len(mapping)
            print_time_info(name + ' num: ' + str(len(mapping)) +
                            ' average: ' + str(average),
                            dash_top=True)

        # get relation2conf
        self.relation2conf_sr = get_relation2conf(self.rules_sr)
        self.relation2conf_tg = get_relation2conf(self.rules_tg)
        _report('sr r2conf', self.relation2conf_sr)
        _report('tg r2conf', self.relation2conf_tg)

        # get relation2imp
        self.relation2imp_sr = get_relation2imp(self.triples_sr,
                                                len(self.id2relation_sr))
        self.relation2imp_tg = get_relation2imp(self.triples_tg,
                                                len(self.id2relation_tg))
        _report('sr r2imp', self.relation2imp_sr)
        _report('tg r2imp', self.relation2imp_tg)
Beispiel #13
0
def mine_rule_with_amie(path2triples, path2rules):
    '''
    Launch AMIE+ in the background to mine rules from *path2triples*,
    redirecting its output into *path2rules*.
    '''
    import subprocess
    import shlex
    from project_path import executable_dir
    minpca = 0.8      # minimum PCA confidence AMIE+ will report
    maxad = 3         # maximum number of atoms per rule
    num_process = 2   # AMIE+ worker threads
    jar_patch_path = executable_dir / 'amie_plus.jar'
    # shell=True is required for the '>' redirection and '&' backgrounding,
    # so quote every path to survive spaces and shell metacharacters.
    command = 'java -jar %s -maxad %d -minpca %f -nc %d %s > %s &' % (
        shlex.quote(str(jar_patch_path)), maxad, minpca, num_process,
        shlex.quote(str(path2triples)), shlex.quote(str(path2rules)))
    res = subprocess.call(command, shell=True)
    if res == 0:
        print_time_info('Mining started.')
    else:
        print_time_info('Something went wrong.')
Beispiel #14
0
def read_file(path, parse_func):
    """Read *path* and run *parse_func* over its lines.

    The file may start with a header line holding the expected record
    count; when present it is consumed and checked against the number of
    parsed records. Raises ValueError on a count mismatch.
    """
    num = -1  # -1 means "no count header present"
    with open(path, 'r', encoding='utf8') as f:
        line = f.readline().strip()
        if line.isdigit():
            num = int(line)
        else:
            # First line is data, not a header: rewind and read everything.
            f.seek(0)
        lines = f.readlines()

    lines = parse_func(lines)

    if len(lines) != num and num >= 0:
        # Fixed typo in the message ("has corruptted" -> "is corrupted")
        # and gave the raised ValueError a message instead of raising bare.
        message = 'File: %s is corrupted, data_num: %d/%d.' % (
            path, num, len(lines))
        print_time_info(message)
        raise ValueError(message)
    return lines
Beispiel #15
0
def _rule_based_graph_completion(triple_graph_sr, triple_graph_tg, rules_sr,
                                 rules_tg, triple2id_sr, triple2id_tg):
    '''
    triples = [(head, tail, relation)]
    return new [((head, tail, relation), conf)...]

    Runs rule inference independently on both graphs and returns, per side,
    a dict of new triples -> best confidence and a dict of new triples ->
    premise triple ids (the premises that produced the best confidence).
    '''
    print_time_info('Rule based graph completion started!')

    def __rule_based_graph_completion(triple_graph, rules):
        # For every rule, collect inferred triples that are not already in
        # the graph, keeping per triple the highest confidence seen (and
        # the premises that produced it).
        triples = triple_graph.triples
        new_triple_confs = {}
        new_triple_premises = {}
        for rule in rules:
            candidates = triple_graph.inference_by_rule(rule)
            for new_triple, conf, premises in candidates:
                if new_triple in triples:
                    continue  # already a known fact; nothing to add
                if (new_triple not in new_triple_confs
                        or new_triple_confs[new_triple] < conf):
                    new_triple_confs[new_triple] = conf
                    new_triple_premises[new_triple] = premises
        return new_triple_confs, new_triple_premises

    new_triple_confs_sr, new_triple_premises_sr = __rule_based_graph_completion(
        triple_graph_sr, rules_sr)
    new_triple_confs_tg, new_triple_premises_tg = __rule_based_graph_completion(
        triple_graph_tg, rules_tg)
    # Map premise triples to their integer ids for downstream consumption.
    new_triple_premises_sr = {
        triple: [triple2id_sr[premise] for premise in premises]
        for triple, premises in new_triple_premises_sr.items()
    }
    new_triple_premises_tg = {
        triple: [triple2id_tg[premise] for premise in premises]
        for triple, premises in new_triple_premises_tg.items()
    }
    print_time_info('Rule based graph completion finished!')
    return new_triple_confs_sr, new_triple_confs_tg, new_triple_premises_sr, new_triple_premises_tg
Beispiel #16
0
    def __init__(self, cgc, data_name, triples, relations, nega_sample_num):
        """Dataset over *triples* with negative relation sampling.

        Stores the (deduplicated) triples, the relation vocabulary and the
        number of negative samples to draw, then delegates the actual data
        construction to ``self.init()``.
        """
        # The assert guarantees the input contained no duplicate triples.
        self.triples = set(triples)
        assert len(self.triples) == len(triples)
        assert isinstance(cgc, CrossGraphCompletion)
        self.cgc = cgc
        self.data_name = data_name
        # Padding index for premise lists: one past the last valid triple id.
        self.premise_pad = len(self.triples)
        print_time_info('premise pad number: %d' % self.premise_pad)
        self.nega_sample_num = nega_sample_num
        self.relations = relations
        # Parallel lists filled by init(): heads, tails, positive relations,
        # negative relations, and premise id lists.
        self.h = []
        self.t = []
        self.pos_r = []
        self.neg_r = []
        self.premises = []

        # NOTE(review): sentinel consumed elsewhere — confirm its semantics
        # against the init()/checking code, which is not visible here.
        self.check_p = -100
        self.init()
Beispiel #17
0
    def bootstrap(self, new_entity_seeds, new_relation_seeds):
        """One bootstrapping round: transfer rules across graphs using the
        newly found seeds, then infer additional triples.

        Only rules and triples NOT already produced by the initial
        completion phase are kept; results land in the ``bp_*`` attributes.
        """
        self.bp_entity_seeds = new_entity_seeds
        self.bp_relation_seeds = new_relation_seeds

        print_time_info('BootStrap: new triple infer started!')
        # Transfer rules between the two graphs via old + new relation seeds.
        new_rules_sr, new_rules_tg = rule_transfer(
            self.rules_sr, self.rules_tg,
            self._relation_seeds + new_relation_seeds)
        # (premises, hypothesis) keys of the rules we already have, per side;
        # confidence is deliberately ignored when testing for duplicates.
        rules_sr = {(premises, hypothesis)
                    for premises, hypothesis, conf in self.rules_sr}
        rules_tg = {(premises, hypothesis)
                    for premises, hypothesis, conf in self.rules_tg}
        new_rules_sr = [(premises, hypothesis, conf)
                        for premises, hypothesis, conf in new_rules_sr
                        if (premises, hypothesis) not in rules_sr]
        new_rules_tg = [(premises, hypothesis, conf)
                        for premises, hypothesis, conf in new_rules_tg
                        if (premises, hypothesis) not in rules_tg]
        new_triple_confs_sr, new_triple_confs_tg, new_triple_premises_sr, new_triple_premises_tg = _rule_based_graph_completion(
            self.triple_graph_sr, self.triple_graph_tg, new_rules_sr,
            new_rules_tg, self.triple2id_sr, self.triple2id_tg)

        # Keep only triples not already inferred in the first completion pass.
        self.bp_new_triple_confs_sr = {
            triple: conf
            for triple, conf in new_triple_confs_sr.items()
            if triple not in self._new_triple_confs_sr
        }
        self.bp_new_triple_confs_tg = {
            triple: conf
            for triple, conf in new_triple_confs_tg.items()
            if triple not in self._new_triple_confs_tg
        }
        self.bp_new_triple_premises_sr = {
            triple: premises
            for triple, premises in new_triple_premises_sr.items()
            if triple not in self._new_triple_confs_sr
        }
        self.bp_new_triple_premises_tg = {
            triple: premises
            for triple, premises in new_triple_premises_tg.items()
            if triple not in self._new_triple_confs_tg
        }
        print_time_info('BootStrap: sr new triple %d; tg new triple %d!' %
                        (len(new_triple_confs_sr), len(new_triple_confs_tg)))
Beispiel #18
0
 def _print_result_log(self, bi_new_triples, method, data_name='triple'):
     """Log how many new items *method* produced for each language side."""
     separator = '------------------------------------------------------------'
     print(separator)
     print_time_info('language_pair: ' +
                     '_'.join(self.language_pair.values()))
     print_time_info('Method: ' + method)
     for key, language in self.language_pair.items():
         count = len(bi_new_triples[key])
         print_time_info(language + ' new %s numbers: ' % data_name +
                         str(count))
     print(separator + '\n')
Beispiel #19
0
def get_hits(sim, top_k=(1, 10, 50, 100)):
    """Report alignment quality (Hits@k, MR, MRR) in both directions.

    Accepts a torch tensor or a numpy similarity matrix; returns the raw
    metric values for further logging.
    """
    if isinstance(sim, np.ndarray):
        sim = torch.from_numpy(sim)
    top_lr, mr_lr, mrr_lr = topk(sim, top_k)
    top_rl, mr_rl, mrr_rl = topk(sim.t(), top_k)

    def report(header, tops, mr, mrr):
        # Print one direction's metrics block.
        print_time_info(header)
        print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr, mrr))
        for k, hits in zip(top_k, tops):
            print_time_info('Hits@%d: %.2f%%' % (k, hits))

    report('For each source:', top_lr, mr_lr, mrr_lr)
    print('')
    report('For each target:', top_rl, mr_rl, mrr_rl)
    return top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl
Beispiel #20
0
    def train(self):
        """Full training loop: joint alignment, TransE and rule losses.

        Builds every dataset once (under no_grad), then trains for
        ``self.num_epoch`` epochs; every ``self.update_cycle`` epochs it
        evaluates and re-draws negative samples.
        """
        cgc = self.cgc
        # Dataset construction involves no learning, so gradients are off.
        with torch.no_grad():
            triples_sr = TripleDataset(cgc.triples_sr, self.nega_n_r)
            triples_tg = TripleDataset(cgc.triples_tg, self.nega_n_r)
            triples_data_sr = triples_sr.get_all()
            triples_data_tg = triples_tg.get_all()
            rules_sr = RuleDataset(cgc, 'new_triple_premises_sr', cgc.triples_sr, list(cgc.id2relation_sr.keys()),
                                   self.nega_n_r)
            rules_tg = RuleDataset(cgc, 'new_triple_premises_tg', cgc.triples_tg, list(cgc.id2relation_tg.keys()),
                                   self.nega_n_r)
            rules_data_sr = rules_sr.get_all()
            rules_data_tg = rules_tg.get_all()
            ad = AliagnmentDataset(cgc, 'entity_seeds', self.nega_n_e, len(cgc.id2entity_sr), len(cgc.id2entity_tg),
                                   self.is_cuda)
            ad_data = ad.get_all()
            ad_rel = AliagnmentDataset(cgc, 'relation_seeds', self.nega_n_r, len(cgc.id2relation_sr),
                                       len(cgc.id2relation_tg), self.is_cuda)
            ad_rel_data = ad_rel.get_all()

        # Move the model and every tensor batch to the GPU when available.
        if self.is_cuda:
            self.net.cuda()
            ad_data = [data.cuda() for data in ad_data]
            ad_rel_data = [data.cuda() for data in ad_rel_data]
            triples_data_sr = [data.cuda() for data in triples_data_sr]
            triples_data_tg = [data.cuda() for data in triples_data_tg]
            rules_data_sr = [data.cuda() for data in rules_data_sr]
            rules_data_tg = [data.cuda() for data in rules_data_tg]

        optimizer = self.optimizer(self.net.parameters(), lr=self.lr, weight_decay=self.l2_penalty)
        # Margin-based losses: entity alignment, relation alignment,
        # TransE triple scoring and rule groundings.
        criterion_align = SpecialLossAlign(self.align_gamma, cuda=self.is_cuda)
        criterion_rel = SpecialLossAlign(self.rel_align_gamma, cuda=self.is_cuda)
        criterion_transe = SpecialLossRule(self.rule_gamma, cuda=self.is_cuda)
        criterion_rule = SpecialLossRule(self.rule_gamma, cuda=self.is_cuda)

        for epoch in range(self.num_epoch):
            self.net.train()
            optimizer.zero_grad()
            repre_sr, repre_tg, sr_rel_repre, tg_rel_repre, transe_tv, rule_tv = self.net(ad_data, ad_rel_data,
                                                                                          triples_data_sr,
                                                                                          triples_data_tg,
                                                                                          rules_data_sr, rules_data_tg)

            align_loss = criterion_align(repre_sr, repre_tg)
            rel_align_loss = criterion_rel(sr_rel_repre, tg_rel_repre)
            transe_loss = criterion_transe(transe_tv)
            # The rule loss is only included when rule inference is enabled.
            if self.rule_infer:
                rule_loss = criterion_rule(rule_tv)
                loss = sum([align_loss, transe_loss, rel_align_loss, rule_loss])
            else:
                rule_loss = 0.0
                loss = sum([align_loss, rel_align_loss, transe_loss])
            loss.backward()
            optimizer.step()
            print_time_info(
                'Epoch: %d; align loss = %.4f; relation align loss = %.4f; transe loss = %.4f; rule loss = %.4f.' % (
                    epoch + 1, float(align_loss), float(rel_align_loss), float(transe_loss), float(rule_loss)))
            self.writer.add_scalars('data/Loss',
                                    {'Align Loss': float(align_loss), 'TransE Loss': float(transe_loss),
                                     'Rule Loss': float(rule_loss), 'Relation Align Loss': float(rel_align_loss)},
                                    epoch)
            self.now_epoch += 1
            # Periodically evaluate and refresh the negative samples.
            if (epoch + 1) % self.update_cycle == 0:
                self.evaluate()
                ad_data, ad_rel_data, triples_data_sr, triples_data_tg, rules_data_sr, rules_data_tg = self.negative_sampling(
                    ad, ad_rel, triples_sr, triples_tg, rules_sr, rules_tg)
                if self.is_cuda:
                    torch.cuda.empty_cache()
                    ad_data = [data.cuda() for data in ad_data]
                    ad_rel_data = [data.cuda() for data in ad_rel_data]
                    triples_data_sr = [data.cuda() for data in triples_data_sr]
                    triples_data_tg = [data.cuda() for data in triples_data_tg]
                    rules_data_sr = [data.cuda() for data in rules_data_sr]
                    rules_data_tg = [data.cuda() for data in rules_data_tg]
Beispiel #21
0
def get_hits(sim, top_k=(1, 10, 50, 100)):
    """Report alignment quality (Hits@k, MR, MRR) in both directions.

    *sim* is a 2-D numpy similarity matrix; ranking is delegated to
    multiprocess_topk for rows (source->target) and for the transpose
    (target->source). Returns the raw metric values for further logging.

    Removed dead code: the unused local ``test_num`` and the unused nested
    helper ``top_get`` (superseded by multiprocess_topk).
    """
    top_lr, mr_lr, mrr_lr = multiprocess_topk(sim, top_k)
    top_rl, mr_rl, mrr_rl = multiprocess_topk(sim.T, top_k)

    print_time_info('For each source:')
    print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_lr, mrr_lr))
    for i in range(len(top_lr)):
        print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_lr[i]))
    print('')
    print_time_info('For each target:')
    print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_rl, mrr_rl))
    for i in range(len(top_rl)):
        print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_rl[i]))
    return top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl
Beispiel #22
0
    def __init__(self,
                 directory,
                 train_seeds_ratio,
                 rule_transfer=True,
                 graph_completion=True):
        '''
        we followed the experiment setting of JAPE
        the folder under the directory JAPE/0_x contains the entity alignment dataset for train and test.

        Initializes all containers for seeds, triples, rules and the
        per-relation coefficients; actual loading happens in init().
        '''
        # NOTE(review): assert is stripped under `python -O`, and the
        # message expression (print_time_info's None return) only runs on
        # failure — kept as-is to preserve the raised exception type.
        assert train_seeds_ratio in {0.1, 0.2, 0.3, 0.4, 0.5}, print_time_info(
            'Not a legal train seeds ratio: %f.' % train_seeds_ratio,
            dash_bot=True)
        self.directory = directory
        self.rule_transfer = rule_transfer
        self.train_seeds_ratio = train_seeds_ratio
        self.graph_completion = graph_completion
        # Directory name encodes the language pair, e.g. "zh_en".
        language_sr, language_tg = directory.name.split('_')
        self.language_pair = {'sr': language_sr, 'tg': language_tg}

        # Entity seed pairs: training, bootstrapped, and held-out test.
        self._entity_seeds = []
        self.bp_entity_seeds = []
        self.test_entity_seeds = []

        self._relation_seeds = []
        self.bp_relation_seeds = []
        # NOTE(review): attribute name is misspelled ("relaiton") but may be
        # referenced elsewhere — do not rename without checking callers.
        self.test_relaiton_seeds = [
        ]  ## randomly initialized, used only for bootstrap
        
        self.triples_sr = []
        self.triples_tg = []
        self.triple2id_sr = {}
        self.triple2id_tg = {}

        # Triples inferred during the initial completion phase.
        self._new_triple_confs_sr = {}
        self._new_triple_confs_tg = {}
        self._new_triple_premises_sr = {}
        self._new_triple_premises_tg = {}

        # Triples inferred during bootstrapping rounds only.
        self.bp_new_triple_confs_sr = {}
        self.bp_new_triple_confs_tg = {}
        self.bp_new_triple_premises_sr = {}
        self.bp_new_triple_premises_tg = {}

        self.rules_sr = []
        self.rules_tg = []
        self.rules_trans2_sr = []
        self.rules_trans2_tg = []
        self.id2entity_sr = {}
        self.id2entity_tg = {}
        self.id2relation_sr = {}
        self.id2relation_tg = {}

        # calculate the average PCA confidence of rules of which relation x is the tail
        # conf(r) = \frac{sum([pca\_conf | (premises, r, pca\_conf)\in rules])}{num((premises, r, pca\_conf)\in rules)}
        self.relation2conf_sr = {}
        self.relation2conf_tg = {}

        # calculate imp(r) = 1- min(\frac{num(head|(head, tail, r) \in triples)}{num(tail|(head, tail, r) \in triples)}, 1)
        self.relation2imp_sr = {}
        self.relation2imp_tg = {}

        self.triple_graph_sr = TripleGraph()
        self.triple_graph_tg = TripleGraph()
Beispiel #23
0
def print_triple(triple, id2entity, id2relation, end='\n'):
    """Log a triple as human-readable ``head relation tail``."""
    head, tail, relation = triple
    parts = [id2entity[head], id2relation[relation], id2entity[tail]]
    print_time_info(' '.join(parts), end=end)
Beispiel #24
0
 def restore(cls, directory):
     """Load a previously pickled cgc instance from ``directory / 'cgc.pkl'``."""
     load_path = directory / 'cgc.pkl'
     with open(load_path, 'rb') as handle:
         restored = pickle.load(handle)
     print_time_info('Successfully loaded cgc from %s.' % load_path)
     return restored
Beispiel #25
0
def _check(ori, new, num):
    if len(ori) != len(new):
        print_time_info('Check failed %d.' % num, dash_top=True)
        raise ValueError()