def print_rule(rule, id2relation):
    """Pretty-print a single rule, mapping relation ids to readable names.

    ``rule`` is ``(premises, hypothesis, conf)`` where each atom is a
    ``(head, tail, relation_id)`` tuple; printed as ``head relation tail``.
    """
    premises, hypothesis, conf = rule
    readable_premises = [
        (atom[0], id2relation[atom[2]], atom[1]) for atom in premises
    ]
    readable_hypothesis = [(hypothesis[0], id2relation[hypothesis[2]], hypothesis[1])]
    parts = readable_premises + [['=>']] + readable_hypothesis + [[str(conf)]]
    print_time_info(' '.join(' '.join(part) for part in parts))
def save(self, directory):
    """Pickle this object to ``<directory>/cgc.pkl``, creating the directory if needed."""
    save_path = directory / 'cgc.pkl'
    if not directory.exists():
        directory.mkdir()
    with open(save_path, 'wb') as out_file:
        pickle.dump(self, out_file)
    print_time_info('Successfully save cgc to %s.' % save_path)
def evaluate(self):
    """Score the current net on the test seeds and track the best Hits@1.

    Similarities of already-aligned pairs are penalised so they cannot be
    re-selected; results go to the tensorboard writer.
    """
    self.net.eval()
    sr_ids, tg_ids = list(zip(*self.cgc.test_entity_seeds))
    sr_data = torch.tensor(sr_ids, dtype=torch.int64)
    tg_data = torch.tensor(tg_ids, dtype=torch.int64)
    if self.is_cuda:
        sr_data, tg_data = sr_data.cuda(), tg_data.cuda()
    sim = self.net.predict((sr_data, tg_data))
    # Push previously aligned pairs down so they are effectively excluded.
    for x, y in self.aligned_entites:
        sim[x, y] -= 1.0
    top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl = get_hits(sim)
    self.writer.add_scalars(
        'data/Hits@N',
        {'Hits@1 sr': top_lr[0], 'Hits@10 sr': top_lr[1],
         'Hits@1 tg': top_rl[0], 'Hits@10 tg': top_rl[1]},
        self.now_epoch)
    self.writer.add_scalars(
        'data/Rank',
        {'MR sr': mr_lr, 'MRR sr': mrr_lr, 'MR tg': mr_rl, 'MRR tg': mrr_rl},
        self.now_epoch)
    # Best-so-far is judged on the sum of both directions' Hits@1.
    if top_lr[0] + top_rl[0] > self.best_hits_1[1] + self.best_hits_1[2]:
        self.best_hits_1 = (self.now_epoch, top_lr[0], top_rl[0])
        self.bad_result = 0
    else:
        self.bad_result += 1
    print_time_info('Current best Hits@1 at the %dth epoch: (%.2f, %.2f)' % (self.best_hits_1))
def init(self, directory, load=False):
    """Prepare the CrossGraphCompletion data, restoring a cache when possible.

    With ``load=True`` a pickled cache under ``running_temp`` is tried first;
    on a cache miss (or ``load=False``) a fresh cgc is built and cached.
    """
    set_random_seed()
    directory = Path(directory)
    self.graph_pair = directory.name
    cgc = None
    if load:
        try:
            cgc = CrossGraphCompletion.restore(directory / 'running_temp')
        except FileNotFoundError:
            print_time_info(
                'CrossGraphCompletion cache file not found, start from the beginning.'
            )
    if cgc is None:
        # Either load was not requested or the cache was missing.
        cgc = CrossGraphCompletion(directory, self.train_seeds_ratio,
                                   self.rule_transfer, self.graph_completion)
        cgc.init()
        cgc.save(directory / 'running_temp')
    self.cgc = cgc
    self.cgc.check()
def print_parameter(self, file=None):
    """Print the scalar hyper-parameter settings of this trainer.

    Only int/float/str/bool attributes are listed (nets, datasets and other
    complex objects are skipped); ``file`` is forwarded to ``print``.
    """
    parameters = self.__dict__
    # Fixed typo in the log message: "setttings" -> "settings".
    print_time_info('Parameter settings:', dash_top=True, file=file)
    print('\tNet: ', type(self.net).__name__, file=file)
    for key, value in parameters.items():
        # Exact type check (not isinstance) keeps the original filtering
        # semantics, e.g. numpy scalars stay excluded.
        if type(value) in {int, float, str, bool}:
            print('\t%s:' % key, value, file=file)
    print('---------------------------------------', file=file)
def _print_new_rules(bi_new_rules, id2relation_sr, id2relation_tg):
    """Log up to the first 20 newly obtained rules for each language."""
    for language, rules in bi_new_rules.items():
        print_time_info(language, dash_top=True)
        id2relation = id2relation_sr if language == 'sr' else id2relation_tg
        for rule in rules[:20]:
            print_rule(rule, id2relation)
def atom_parser(string):
    """Parse every ``?x <rel> ?y`` atom in *string* into (head, tail, relation) tuples.

    Raises ValueError when no atom can be found (after logging the input).
    """
    atoms = [(match.group(1), match.group(3), int(match.group(2)))
             for match in atom_regex.finditer(string)]
    if not atoms:
        print('-------------------------')
        print_time_info(string)
        raise ValueError('Parse atom failed.')
    return atoms
def _print_new_triple_confs(bi_new_triple_confs, id2entity_sr, id2entity_tg,
                            id2relation_sr, id2relation_tg):
    """Log 10 randomly sampled (with replacement) new triples per language with their confidence."""
    for language, triple_confs in bi_new_triple_confs.items():
        print_time_info(language, dash_top=True)
        if language == 'sr':
            id2entity, id2relation = id2entity_sr, id2relation_sr
        else:
            id2entity, id2relation = id2entity_tg, id2relation_tg
        for triple in random.choices(list(triple_confs.keys()), k=10):
            print_triple(triple, id2entity, id2relation, end='')
            print(' ', triple_confs[triple])
def init_log(self, log_dir):
    """Create a fresh log directory, a SummaryWriter, and dump the parameters.

    Raises FileExistsError if the directory already exists (no overwrite).
    """
    log_dir = Path(log_dir)
    if log_dir.exists():
        raise FileExistsError('The directory already exists!')
    log_dir.mkdir()
    comment = log_dir.name
    self.writer = SummaryWriter(str(log_dir))
    with open(log_dir / 'parameters.txt', 'w') as param_file:
        print_time_info(comment, file=param_file)
        self.print_parameter(param_file)
    print_time_info('Successfully initialized log in "%s" directory!' % log_dir)
def init_log(self, log_dir):
    """Create a log directory (removing any existing one), a SummaryWriter, and dump the parameters."""
    log_dir = Path(log_dir)
    if log_dir.exists():
        # Destructive variant: an existing directory is wiped, not reused.
        print('Warning: we will remove %s' % (str(log_dir)))
        shutil.rmtree(str(log_dir))
    log_dir.mkdir()
    comment = log_dir.name
    self.writer = SummaryWriter(str(log_dir))
    with open(log_dir / 'parameters.txt', 'w') as param_file:
        print_time_info(comment, file=param_file)
        self.print_parameter(param_file)
    print_time_info('Successfully initialized log in "%s" directory!' % log_dir)
def rule_parser(file_path):
    '''Parse the rule file produced by the AMIE+ .jar miner.

    Rule lines start with '?' and are tab separated; column 0 is the rule
    string ``premises => hypothesis`` and column 3 is its PCA confidence.

    Returns a list of ``(premises, hypothesis, conf)`` where each atom is a
    ``(head_var, tail_var, relation_id)`` tuple and premise variables are
    canonically renamed to 'a', 'b', ... in sorted order.

    Raises ValueError when a line contains no parsable atom or a rule has
    more than one hypothesis atom.
    '''
    atom_regex = re.compile(r'\?([a-z]) <([0-9]*?)> \?([a-z])')

    def atom_parser(string):
        # (head, tail, relation) for every atom found in the fragment.
        atoms = [(m.group(1), m.group(3), int(m.group(2)))
                 for m in atom_regex.finditer(string)]
        if not atoms:
            print('-------------------------')
            print_time_info(string)
            raise ValueError('Parse atom failed.')
        return atoms

    def premises_reformat(premises):
        # Canonicalise variable names: sorted originals map to 'a', 'b', ...
        variables = sorted({var for atom in premises for var in atom[:2]})
        mapping = {var: chr(ord('a') + i) for i, var in enumerate(variables)}
        for i, (head, tail, relation) in enumerate(premises):
            premises[i] = (mapping[head], mapping[tail], int(relation))
        return premises

    with open(file_path, 'r', encoding='utf8') as f:
        # startswith is safe on empty lines, unlike the old line[0] indexing.
        lines = [line.strip() for line in f if line.startswith('?')]
    # Replaced the immediately-invoked lambda with a plain loop.
    rule_confs = []
    for line in lines:
        columns = line.split('\t')
        rule_confs.append((columns[0], float(columns[3])))
    rules = []
    for rule, conf in rule_confs:
        premises, hypothesis = rule.split('=>')
        premises = premises_reformat(atom_parser(premises))
        hypothesis = atom_parser(hypothesis)
        if len(hypothesis) != 1:
            print('-------------------------')
            print_time_info(rule)
            raise ValueError('Parse rule failed.')
        rules.append((premises, hypothesis[0], conf))
    return rules
def init_triple_coefficient(self):
    """Compute per-relation confidence (from rules) and imp statistics for both graphs."""

    def _log_stats(tag, mapping):
        # Identical log format to the original inline calls.
        average = sum(mapping.values()) / len(mapping)
        print_time_info(tag + ' num: ' + str(len(mapping)) + ' average: ' + str(average),
                        dash_top=True)

    # get relation2conf
    self.relation2conf_sr = get_relation2conf(self.rules_sr)
    self.relation2conf_tg = get_relation2conf(self.rules_tg)
    _log_stats('sr r2conf', self.relation2conf_sr)
    _log_stats('tg r2conf', self.relation2conf_tg)
    # get relation2imp
    self.relation2imp_sr = get_relation2imp(self.triples_sr, len(self.id2relation_sr))
    self.relation2imp_tg = get_relation2imp(self.triples_tg, len(self.id2relation_tg))
    _log_stats('sr r2imp', self.relation2imp_sr)
    _log_stats('tg r2imp', self.relation2imp_tg)
def mine_rule_with_amie(path2triples, path2rules):
    '''Launch the AMIE+ miner as a backgrounded java process.

    Reads triples from *path2triples* and redirects the mined rules to
    *path2rules*; only reports whether the launch command succeeded.
    '''
    import subprocess
    from project_path import executable_dir
    minpca = 0.8
    maxad = 3
    num_process = 2
    jar_patch_path = executable_dir / 'amie_plus.jar'
    # NOTE(review): shell=True with interpolated paths is injection-prone if
    # these paths ever come from untrusted input; kept as-is because the
    # command relies on shell redirection ('>') and backgrounding ('&').
    command = 'java -jar %s -maxad %d -minpca %f -nc %d %s > %s &' % (
        jar_patch_path, maxad, minpca, num_process, path2triples, path2rules)
    if subprocess.call(command, shell=True) == 0:
        print_time_info('Mining started.')
    else:
        print_time_info('Something went wrong.')
def read_file(path, parse_func):
    """Read *path* and parse its lines with *parse_func*.

    If the first line is a bare integer it is treated as a record-count
    header: the remaining lines are parsed and validated against it.
    Otherwise the whole file is parsed.

    Raises ValueError (with a descriptive message, instead of the previous
    bare ValueError) when the parsed count disagrees with the header.
    """
    num = -1  # -1 means "no count header present".
    with open(path, 'r', encoding='utf8') as f:
        line = f.readline().strip()
        if line.isdigit():
            num = int(line)
        else:
            f.seek(0)  # no header: re-read the file from the start
        lines = f.readlines()
    lines = parse_func(lines)
    if num >= 0 and len(lines) != num:
        # Typo fixed ("corruptted" -> "corrupted") and message carried
        # into the exception so callers see why it failed.
        message = 'File: %s has corrupted, data_num: %d/%d.' % (path, num, len(lines))
        print_time_info(message)
        raise ValueError(message)
    return lines
def _rule_based_graph_completion(triple_graph_sr, triple_graph_tg, rules_sr,
                                 rules_tg, triple2id_sr, triple2id_tg):
    '''Infer new triples on both graphs by applying the given rules.

    triples = [(head, tail, relation)]
    Returns per-graph dicts: new triple -> best confidence, and
    new triple -> premise triple ids (mapped through triple2id_*).
    '''
    print_time_info('Rule based graph completion started!')

    def _complete(triple_graph, rules):
        # For every inferred triple not already present, keep the
        # highest-confidence inference and the premises that produced it.
        known = triple_graph.triples
        confs, premise_map = {}, {}
        for rule in rules:
            for new_triple, conf, premises in triple_graph.inference_by_rule(rule):
                if new_triple in known:
                    continue
                if new_triple not in confs or confs[new_triple] < conf:
                    confs[new_triple] = conf
                    premise_map[new_triple] = premises
        return confs, premise_map

    new_triple_confs_sr, new_triple_premises_sr = _complete(triple_graph_sr, rules_sr)
    new_triple_confs_tg, new_triple_premises_tg = _complete(triple_graph_tg, rules_tg)
    new_triple_premises_sr = {
        triple: [triple2id_sr[premise] for premise in premises]
        for triple, premises in new_triple_premises_sr.items()
    }
    new_triple_premises_tg = {
        triple: [triple2id_tg[premise] for premise in premises]
        for triple, premises in new_triple_premises_tg.items()
    }
    print_time_info('Rule based graph completion finished!')
    return (new_triple_confs_sr, new_triple_confs_tg,
            new_triple_premises_sr, new_triple_premises_tg)
def __init__(self, cgc, data_name, triples, relations, nega_sample_num):
    """Dataset of triples with negative relation sampling.

    *triples* must contain no duplicates; *data_name* selects which cgc
    attribute this dataset reads from.
    """
    self.triples = set(triples)
    # Input must be duplicate-free, otherwise the set collapses entries.
    assert len(self.triples) == len(triples)
    assert isinstance(cgc, CrossGraphCompletion)
    self.cgc = cgc
    self.data_name = data_name
    # One past the last triple index; presumably used as the padding id for
    # premise lists — TODO confirm against init().
    self.premise_pad = len(self.triples)
    print_time_info('premise pad number: %d' % self.premise_pad)
    self.nega_sample_num = nega_sample_num
    self.relations = relations
    self.h, self.t = [], []
    self.pos_r, self.neg_r = [], []
    self.premises = []
    self.check_p = -100
    self.init()
def bootstrap(self, new_entity_seeds, new_relation_seeds):
    """One bootstrap round: transfer rules with the enlarged relation seeds,
    then infer extra triples not produced by the initial completion round."""
    self.bp_entity_seeds = new_entity_seeds
    self.bp_relation_seeds = new_relation_seeds
    print_time_info('BootStrap: new triple infer started!')
    new_rules_sr, new_rules_tg = rule_transfer(
        self.rules_sr, self.rules_tg,
        self._relation_seeds + new_relation_seeds)

    def _novel_rules(candidates, existing_rules):
        # Drop transferred rules whose (premises, hypothesis) already exist.
        seen = {(p, h) for p, h, _ in existing_rules}
        return [(p, h, c) for p, h, c in candidates if (p, h) not in seen]

    new_rules_sr = _novel_rules(new_rules_sr, self.rules_sr)
    new_rules_tg = _novel_rules(new_rules_tg, self.rules_tg)
    (new_triple_confs_sr, new_triple_confs_tg,
     new_triple_premises_sr, new_triple_premises_tg) = _rule_based_graph_completion(
        self.triple_graph_sr, self.triple_graph_tg,
        new_rules_sr, new_rules_tg, self.triple2id_sr, self.triple2id_tg)
    # Keep only triples that the initial round did not already produce.
    self.bp_new_triple_confs_sr = {
        triple: conf for triple, conf in new_triple_confs_sr.items()
        if triple not in self._new_triple_confs_sr}
    self.bp_new_triple_confs_tg = {
        triple: conf for triple, conf in new_triple_confs_tg.items()
        if triple not in self._new_triple_confs_tg}
    self.bp_new_triple_premises_sr = {
        triple: premises for triple, premises in new_triple_premises_sr.items()
        if triple not in self._new_triple_confs_sr}
    self.bp_new_triple_premises_tg = {
        triple: premises for triple, premises in new_triple_premises_tg.items()
        if triple not in self._new_triple_confs_tg}
    print_time_info('BootStrap: sr new triple %d; tg new triple %d!' % (
        len(new_triple_confs_sr), len(new_triple_confs_tg)))
def _print_result_log(self, bi_new_triples, method, data_name='triple'):
    """Log how many new items *method* produced for each language."""
    separator = '------------------------------------------------------------'
    print(separator)
    print_time_info('language_pair: ' + '_'.join(self.language_pair.values()))
    print_time_info('Method: ' + method)
    for key, language in self.language_pair.items():
        print_time_info(language + ' new %s numbers: ' % data_name
                        + str(len(bi_new_triples[key])))
    print(separator + '\n')
def get_hits(sim, top_k=(1, 10, 50, 100)):
    """Compute Hits@k / MR / MRR in both directions from a similarity matrix.

    Accepts a numpy array or a torch tensor; logs the metrics and returns
    (top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl).
    """
    if isinstance(sim, np.ndarray):
        sim = torch.from_numpy(sim)

    def _report(header, tops, mr, mrr):
        print_time_info(header)
        print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr, mrr))
        for i in range(len(tops)):
            print_time_info('Hits@%d: %.2f%%' % (top_k[i], tops[i]))

    top_lr, mr_lr, mrr_lr = topk(sim, top_k)
    top_rl, mr_rl, mrr_rl = topk(sim.t(), top_k)
    _report('For each source:', top_lr, mr_lr, mrr_lr)
    print('')
    _report('For each target:', top_rl, mr_rl, mrr_rl)
    return top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl
def train(self):
    """Main optimisation loop: joint entity/relation alignment, TransE and rule losses.

    Negative samples are redrawn (and evaluation is run) every
    ``update_cycle`` epochs.
    """
    cgc = self.cgc
    # Build datasets and draw the initial negative samples without autograd.
    with torch.no_grad():
        triples_sr = TripleDataset(cgc.triples_sr, self.nega_n_r)
        triples_tg = TripleDataset(cgc.triples_tg, self.nega_n_r)
        triples_data_sr = triples_sr.get_all()
        triples_data_tg = triples_tg.get_all()
        rules_sr = RuleDataset(cgc, 'new_triple_premises_sr', cgc.triples_sr,
                               list(cgc.id2relation_sr.keys()), self.nega_n_r)
        rules_tg = RuleDataset(cgc, 'new_triple_premises_tg', cgc.triples_tg,
                               list(cgc.id2relation_tg.keys()), self.nega_n_r)
        rules_data_sr = rules_sr.get_all()
        rules_data_tg = rules_tg.get_all()
        ad = AliagnmentDataset(cgc, 'entity_seeds', self.nega_n_e,
                               len(cgc.id2entity_sr), len(cgc.id2entity_tg),
                               self.is_cuda)
        ad_data = ad.get_all()
        ad_rel = AliagnmentDataset(cgc, 'relation_seeds', self.nega_n_r,
                                   len(cgc.id2relation_sr), len(cgc.id2relation_tg),
                                   self.is_cuda)
        ad_rel_data = ad_rel.get_all()
    if self.is_cuda:
        self.net.cuda()
        ad_data = [data.cuda() for data in ad_data]
        ad_rel_data = [data.cuda() for data in ad_rel_data]
        triples_data_sr = [data.cuda() for data in triples_data_sr]
        triples_data_tg = [data.cuda() for data in triples_data_tg]
        rules_data_sr = [data.cuda() for data in rules_data_sr]
        rules_data_tg = [data.cuda() for data in rules_data_tg]
    optimizer = self.optimizer(self.net.parameters(), lr=self.lr,
                               weight_decay=self.l2_penalty)
    criterion_align = SpecialLossAlign(self.align_gamma, cuda=self.is_cuda)
    criterion_rel = SpecialLossAlign(self.rel_align_gamma, cuda=self.is_cuda)
    criterion_transe = SpecialLossRule(self.rule_gamma, cuda=self.is_cuda)
    criterion_rule = SpecialLossRule(self.rule_gamma, cuda=self.is_cuda)
    for epoch in range(self.num_epoch):
        self.net.train()
        optimizer.zero_grad()
        (repre_sr, repre_tg, sr_rel_repre, tg_rel_repre,
         transe_tv, rule_tv) = self.net(ad_data, ad_rel_data,
                                        triples_data_sr, triples_data_tg,
                                        rules_data_sr, rules_data_tg)
        align_loss = criterion_align(repre_sr, repre_tg)
        rel_align_loss = criterion_rel(sr_rel_repre, tg_rel_repre)
        transe_loss = criterion_transe(transe_tv)
        if self.rule_infer:
            rule_loss = criterion_rule(rule_tv)
            loss = align_loss + transe_loss + rel_align_loss + rule_loss
        else:
            rule_loss = 0.0
            loss = align_loss + rel_align_loss + transe_loss
        loss.backward()
        optimizer.step()
        print_time_info(
            'Epoch: %d; align loss = %.4f; relation align loss = %.4f; transe loss = %.4f; rule loss = %.4f.' % (
                epoch + 1, float(align_loss), float(rel_align_loss),
                float(transe_loss), float(rule_loss)))
        self.writer.add_scalars('data/Loss',
                                {'Align Loss': float(align_loss),
                                 'TransE Loss': float(transe_loss),
                                 'Rule Loss': float(rule_loss),
                                 'Relation Align Loss': float(rel_align_loss)},
                                epoch)
        self.now_epoch += 1
        if (epoch + 1) % self.update_cycle == 0:
            self.evaluate()
            # Redraw negative samples for the next cycle.
            (ad_data, ad_rel_data, triples_data_sr, triples_data_tg,
             rules_data_sr, rules_data_tg) = self.negative_sampling(
                ad, ad_rel, triples_sr, triples_tg, rules_sr, rules_tg)
            if self.is_cuda:
                torch.cuda.empty_cache()
                ad_data = [data.cuda() for data in ad_data]
                ad_rel_data = [data.cuda() for data in ad_rel_data]
                triples_data_sr = [data.cuda() for data in triples_data_sr]
                triples_data_tg = [data.cuda() for data in triples_data_tg]
                rules_data_sr = [data.cuda() for data in rules_data_sr]
                rules_data_tg = [data.cuda() for data in rules_data_tg]
def get_hits(sim, top_k=(1, 10, 50, 100)):
    """Compute Hits@k / MR / MRR in both directions from a numpy similarity matrix.

    The ranking is delegated to ``multiprocess_topk``; removed the unused
    local ``test_num`` and the dead single-process ``top_get`` helper that
    was never called.
    """
    top_lr, mr_lr, mrr_lr = multiprocess_topk(sim, top_k)
    top_rl, mr_rl, mrr_rl = multiprocess_topk(sim.T, top_k)
    print_time_info('For each source:')
    print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_lr, mrr_lr))
    for i in range(len(top_lr)):
        print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_lr[i]))
    print('')
    print_time_info('For each target:')
    print_time_info('MR: %.2f; MRR: %.2f%%.' % (mr_rl, mrr_rl))
    for i in range(len(top_rl)):
        print_time_info('Hits@%d: %.2f%%' % (top_k[i], top_rl[i]))
    return top_lr, top_rl, mr_lr, mr_rl, mrr_lr, mrr_rl
def __init__(self, directory, train_seeds_ratio, rule_transfer=True, graph_completion=True):
    '''
    We followed the experiment setting of JAPE: the folder under the
    directory JAPE/0_x contains the entity alignment dataset for train
    and test.  The directory name encodes the language pair as "sr_tg".
    '''
    assert train_seeds_ratio in {0.1, 0.2, 0.3, 0.4, 0.5}, print_time_info(
        'Not a legal train seeds ratio: %f.' % train_seeds_ratio, dash_bot=True)
    self.directory = directory
    self.rule_transfer = rule_transfer
    self.train_seeds_ratio = train_seeds_ratio
    self.graph_completion = graph_completion
    language_sr, language_tg = directory.name.split('_')
    self.language_pair = {'sr': language_sr, 'tg': language_tg}
    # Seed alignments: train (_), bootstrap (bp_) and test splits.
    self._entity_seeds = []
    self.bp_entity_seeds = []
    self.test_entity_seeds = []
    self._relation_seeds = []
    self.bp_relation_seeds = []
    # NOTE(review): attribute keeps the original (misspelled) name because
    # other modules may reference it.
    self.test_relaiton_seeds = []  # randomly initialized, used only for bootstrap
    # Raw triples and their id mappings for both graphs.
    self.triples_sr = []
    self.triples_tg = []
    self.triple2id_sr = {}
    self.triple2id_tg = {}
    # Rule-inferred triples from the initial completion round...
    self._new_triple_confs_sr = {}
    self._new_triple_confs_tg = {}
    self._new_triple_premises_sr = {}
    self._new_triple_premises_tg = {}
    # ...and from bootstrap rounds.
    self.bp_new_triple_confs_sr = {}
    self.bp_new_triple_confs_tg = {}
    self.bp_new_triple_premises_sr = {}
    self.bp_new_triple_premises_tg = {}
    self.rules_sr = []
    self.rules_tg = []
    self.rules_trans2_sr = []
    self.rules_trans2_tg = []
    self.id2entity_sr = {}
    self.id2entity_tg = {}
    self.id2relation_sr = {}
    self.id2relation_tg = {}
    # Average PCA confidence of rules of which relation x is the tail:
    # conf(r) = \frac{sum([pca\_conf | (premises, r, pca\_conf)\in rules])}{num((premises, r, pca\_conf)\in rules)}
    self.relation2conf_sr = {}
    self.relation2conf_tg = {}
    # imp(r) = 1- min(\frac{num(head|(head, tail, r) \in triples)}{num(tail|(head, tail, r) \in triples)}, 1)
    self.relation2imp_sr = {}
    self.relation2imp_tg = {}
    self.triple_graph_sr = TripleGraph()
    self.triple_graph_tg = TripleGraph()
def print_triple(triple, id2entity, id2relation, end='\n'):
    """Print a (head, tail, relation) triple as 'head relation tail' using the id lookup tables."""
    head, tail, relation = triple
    readable = (id2entity[head], id2relation[relation], id2entity[tail])
    print_time_info(' '.join(readable), end=end)
def restore(cls, directory):
    """Load a pickled cgc instance from ``<directory>/cgc.pkl``."""
    load_path = directory / 'cgc.pkl'
    # NOTE(review): pickle.load executes arbitrary code on malicious input;
    # acceptable for a locally produced cache only.
    with open(load_path, 'rb') as in_file:
        instance = pickle.load(in_file)
    print_time_info('Successfully loaded cgc from %s.' % load_path)
    return instance
def _check(ori, new, num): if len(ori) != len(new): print_time_info('Check failed %d.' % num, dash_top=True) raise ValueError()