def compute(dataset_name):
    """
    Group the test facts of a dataset by degree class.

    Each test triple is assigned to the first class in CLASSES for which
    _fact_belongs_to_degree_class accepts the degrees of its head and tail;
    a triple accepted by no class does not appear in the result.

    :param dataset_name: the name of the dataset to compute the mappings for
    :return: a defaultdict that associates each degree class to the list of
             (head, relation, tail) test facts assigned to it
    """
    print(
        "Computing the mappings <degree class -> list of test facts belonging to that degree class> for dataset %s ..."
        % dataset_name)

    facts_by_class = defaultdict(list)

    # only the third mapping returned by entity_degrees.read is used here
    _, _, degree_by_mid = entity_degrees.read(dataset_name)

    # marker telling "no class accepted this fact" apart from any real class
    unmatched = object()

    for (head, relation, tail) in Dataset(dataset_name).test_triples:
        head_degree = degree_by_mid[head]
        tail_degree = degree_by_mid[tail]

        # the generator is lazy, so classes after the first match are never tested
        chosen_class = next(
            (candidate for candidate in CLASSES
             if _fact_belongs_to_degree_class(head_degree, tail_degree, candidate)),
            unmatched)

        if chosen_class is not unmatched:
            facts_by_class[chosen_class].append((head, relation, tail))

    return facts_by_class
def _filtered_rank_from_field(raw_rank, entity_number, tie_policy):
    """
    Convert one rank field of a filtered-ranks line into a float rank.

    A field that does not start with "MISS_" is parsed directly as a float.
    A field of the form "MISS_<n>" is resolved according to the tie policy:
      - "max":     entity_number
      - "average": the mean of <n> and entity_number
      - "min":     <n>

    :param raw_rank: the rank field exactly as read from the file
    :param entity_number: the value used as upper bound for "MISS_" fields
    :param tie_policy: one of "min", "average", "max"
    :return: the rank as a float
    :raises ValueError: if the field is a "MISS_" field and tie_policy is not supported,
                        or if the numeric part cannot be parsed
    """
    prefix = "MISS_"
    if not raw_rank.startswith(prefix):
        return float(raw_rank)

    if tie_policy == "max":
        # the numeric part of the field is irrelevant under this policy
        return float(entity_number)

    missed_rank = float(raw_rank[len(prefix):])
    if tie_policy == "average":
        return (missed_rank + float(entity_number)) / 2
    if tie_policy == "min":
        return missed_rank

    # previously the rank key was silently left out of the entry in this case
    raise ValueError(
        "Unsupported tie policy %r for rank field %r" % (tie_policy, raw_rank))


def read_filtered_ranks_entries_for(model_name,
                                    dataset_name,
                                    entity_number=None,
                                    tie_policy="average"):
    """
    Read the filtered ranks obtained by a model on a dataset.

    The file located by filtered_ranks_path is expected to contain one fact per line,
    as five ";"-separated fields: head, relation, tail, filtered head rank, filtered tail rank.
    Each line is HTML-unescaped before being split. Blank lines are skipped.

    :param model_name: the name of the model the ranks belong to
    :param dataset_name: the name of the dataset the ranks were computed on
    :param entity_number: the value used to resolve "MISS_" rank fields; if None, it is
                          the size of the third mapping returned by entity_degrees.read
    :param tie_policy: one of "min", "average", "max"
    :return: a list of dicts with keys "head", "relation", "tail",
             "head_rank_filtered" and "tail_rank_filtered"
    :raises ValueError: if a line does not have exactly five fields, if a rank can not be
                        parsed, or if a "MISS_" rank is met with an unsupported tie policy
    """
    print("Reading filtered rank entries for " + model_name + " results on " +
          dataset_name + " with tie policy " + tie_policy + "...")
    filepath = filtered_ranks_path(model_name, dataset_name, tie_policy)

    if entity_number is None:
        _, _, entity_2_degree = entity_degrees.read(dataset_name)
        entity_number = len(entity_2_degree)

    entries = []
    with open(filepath) as input_data:
        # stream the file instead of loading every line in memory first
        for line in input_data:
            content = html.unescape(line).strip()
            if not content:
                # tolerate empty lines (e.g. a trailing newline at the end of the file)
                continue

            (head, relation, tail,
             rank_head_filtered, rank_tail_filtered) = content.split(";")

            entries.append({
                "head": head,
                "relation": relation,
                "tail": tail,
                "head_rank_filtered": _filtered_rank_from_field(
                    rank_head_filtered, entity_number, tie_policy),
                "tail_rank_filtered": _filtered_rank_from_field(
                    rank_tail_filtered, entity_number, tie_policy),
            })

    return entries
def get_dicts(dataset_name):
    """
    Count the training facts of a dataset grouped by three different keys.

    For every training triple, the degree of its head, the degree of its tail and the
    number of mentions of its relation are looked up, and the corresponding counter
    is increased by one.

    :param dataset_name: the name of the dataset to analyze
    :return: a tuple of three defaultdicts:
             head degree -> number of training facts,
             tail degree -> number of training facts,
             relation mentions -> number of training facts
    """
    _, _, degree_of = entity_degrees.read(dataset_name)
    mentions_of = relation_mentions.read(dataset_name)
    training_triples = Dataset(dataset_name).train_triples

    facts_per_head_degree = defaultdict(int)
    facts_per_tail_degree = defaultdict(int)
    facts_per_relation_mentions = defaultdict(int)

    for (head, relation, tail) in training_triples:
        # resolve all three keys first, so a failed lookup leaves the counters untouched
        counter_keys = (degree_of[head], degree_of[tail], mentions_of[relation])

        facts_per_head_degree[counter_keys[0]] += 1
        facts_per_tail_degree[counter_keys[1]] += 1
        facts_per_relation_mentions[counter_keys[2]] += 1

    return (facts_per_head_degree,
            facts_per_tail_degree,
            facts_per_relation_mentions)
from io_utils import *
from collections import defaultdict


def plot(x, y, title, xlabel, ylabel):
    """
    Show a scatter plot of y against x.

    Points are drawn in blue with marker size 1; the figure is displayed
    right away through plt.show().
    NOTE(review): `plt` is not imported in this block; presumably it is provided
    by the star import from io_utils - confirm.

    :param x: the values for the horizontal axis
    :param y: the values for the vertical axis
    :param title: the title of the plot
    :param xlabel: the label of the horizontal axis
    :param ylabel: the label of the vertical axis
    """
    plt.scatter(x, y, s=1, color='blue')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # logarithmic axes are currently disabled:
    #plt.xscale('log')
    #plt.yscale('log')
    plt.show()


# entity_degrees.read yields three values for FB15K; all three are kept at module level
entity_2_in_degree, entity_2_out_degree, entity_2_degree, = entity_degrees.read(
    FB15K)
relation_2_mentions = relation_mentions.read(FB15K)
# mapping used below to translate each relation into its coarse class
relation_2_coarse_class = relation_coarse_classes.read(FB15K, return_rel_2_class=True)

# for each entity, compute a dict "type of relation" -> number of times that the entity occurs as head for that type of relation
entity_2_head_counts = defaultdict(lambda: defaultdict(lambda: 0))
# for each entity, compute a dict "type of relation" -> number of times that the entity occurs as tail for that type of relation
entity_2_tail_counts = defaultdict(lambda: defaultdict(lambda: 0))

dataset = Dataset(FB15K)
# only the training triples contribute to the counts
for (head, relation, tail) in dataset.train_triples:
    entity_2_head_counts[head][relation_2_coarse_class[relation]] += 1
    entity_2_tail_counts[tail][relation_2_coarse_class[relation]] += 1
hits_3_perc = float(hits_3) * 100 / len(all_ranks) hits_5_perc = float(hits_5) * 100 / len(all_ranks) hits_10_perc = float(hits_10) * 100 / len(all_ranks) print("Mean Rank:\t\t\t\t\t%f" % mean_rank) print("Mean Reciprocal Rank: \t\t%f%%" % mean_reciprocal_rank) print("Hits@1:\t\t\t\t\t\t%f%%" % hits_1_perc) print("Hits@3:\t\t\t\t\t\t%f%%" % hits_3_perc) print("Hits@5:\t\t\t\t\t\t%f%%" % hits_5_perc) print("Hits@10:\t\t\t\t\t%s" % hits_10_perc) print( str(round(hits_1_perc, 2)) + " " + str(round(hits_10_perc, 2)) + " " + str(round(mean_reciprocal_rank, 3))) #a, b, c = str(hits_1_perc), str(hits_10_perc), str(mean_reciprocal_rank) #a, b, c = a.replace("0.", "."), b.replace("0.", "."), c.replace("0.", ".") #print(a + " " + b + " " + c) entity_2_in_degree, entity_2_out_degree, entity_2_degree = entity_degrees.read( datasets.FB15K) filtered_ranks_entries_avg = performances.read_filtered_ranks_entries_for( models.RSN, datasets.FB15K, "avg") filtered_ranks_entries_min = performances.read_filtered_ranks_entries_for( models.RSN, datasets.FB15K, "min") for cur_dataset_name in datasets.ALL_DATASET_NAMES: for cur_model_name in models.ALL_MODEL_NAMES: print_metrics_for(cur_model_name, cur_dataset_name) print()
relation_mentions_2_mean_rank = dict() for item in entity_degree_2_ranks.items(): if len(item[1]) > 100: entity_degree_2_mean_rank[item[0]] = np.average(item[1]) else: entity_degree_2_mean_rank[item[0]] = None for item in relation_mentions_2_ranks.items(): relation_mentions_2_mean_rank[item[0]] = np.average(item[1]) return entity_degree_2_mean_rank, relation_mentions_2_mean_rank _, _, entity2degree = entity_degrees.read(FB15K) relation2mentions = relation_mentions.read(FB15K) transE_entries = performances.read_filtered_ranks_entries_for(TRANSE, FB15K) rotatE_entries = performances.read_filtered_ranks_entries_for(ROTATE, FB15K) convE_entries = performances.read_filtered_ranks_entries_for(CONVE, FB15K) simplE_entries = performances.read_filtered_ranks_entries_for(SIMPLE, FB15K) anyburl_entries = performances.read_filtered_ranks_entries_for(ANYBURL, FB15K) transE_entity_degree_2_rank, transE_relation_mentions_2_rank = get_dicts_from_entries( transE_entries, entity2degree, relation2mentions) rotatE_entity_degree_2_rank, rotatE_relation_mentions_2_rank = get_dicts_from_entries( rotatE_entries, entity2degree, relation2mentions) convE_entity_degree_2_rank, convE_relation_mentions_2_rank = get_dicts_from_entries( convE_entries, entity2degree, relation2mentions) simplE_entity_degree_2_rank, simplE_relation_mentions_2_rank = get_dicts_from_entries(
entity_degree_2_hits[tail_degree] += 1.0 relation_mentions_2_hits[relation_mentions_number] += 1.0 for key in entity_degree_2_hits: entity_degree_2_hits[ key] = entity_degree_2_hits[key] / entity_degree_2_count[key] for key in relation_mentions_2_hits: relation_mentions_2_hits[key] = relation_mentions_2_hits[ key] / relation_mentions_2_count[key] return entity_degree_2_hits, relation_mentions_2_hits dataset_name = FB15K models_names = [ROTATE] _, _, entity_2_degree = entity_degrees.read(dataset_name) relation_2_mentions = relation_mentions.read(dataset_name) for model_name in models_names: model_entries = performances.read_filtered_ranks_entries_for( model_name, dataset_name) model_entity_degree_2_hits, model_relation_mentions_2_hits = get_dicts_from_entries( model_entries, entity_2_degree, relation_2_mentions) plot_dict( model_entity_degree_2_hits, model_name + " entity degree vs hits@1", "degree", "percentage of hits@1 on all predictions of entities with that degree") # plot_dict(rotatE_relation_mentions_2_hits, "RotatE relation mentions vs mean rank", "relation mentions", "percentage of hits@1 on all predictions of entities with that degree")
plt.grid(True) plt.show() def distribution_of(name_2_count): count_2_amount_of_names = defaultdict(lambda: 0) for (name, count) in name_2_count.items(): count_2_amount_of_names[count] += 1 return count_2_amount_of_names _, _, fb15k_entity_degrees = entity_degrees.read(FB15K) fb15k_relation_mentions = relation_mentions.read(FB15K) _, _, wn18_entity_degrees = entity_degrees.read(WN18) wn18_relation_mentions = relation_mentions.read(WN18) _, _, fb15k237_entity_degrees = entity_degrees.read(FB15K_237) fb15k237_relation_mentions = relation_mentions.read(FB15K_237) _, _, wn18rr_entity_degrees = entity_degrees.read(WN18RR) wn18rr_relation_mentions = relation_mentions.read(WN18RR) fb15k_entity_degrees_distribution = distribution_of(fb15k_entity_degrees) fb15k_relation_mentions_distribution = distribution_of(fb15k_relation_mentions) wn18_entity_degrees_distribution = distribution_of(wn18_entity_degrees)
def _filtered_rank_from_details(details, entity_number, tie_policy):
    """
    Derive a float rank from the details list of one prediction.

    If the last item of the list does not start with "MISS_", the rank is the length
    of the list. Otherwise the rank depends on the tie policy:
      - "max":     entity_number
      - "average": the mean of the list length and entity_number
      - "min":     the list length

    :param details: the non-empty list of detail items of the prediction
    :param entity_number: the value used as upper bound when the last item is a "MISS_" item
    :param tie_policy: one of "min", "average", "max"
    :return: the rank as a float
    :raises ValueError: if the last item is a "MISS_" item and tie_policy is not supported
    """
    details_length = float(len(details))
    if not details[-1].startswith("MISS_"):
        return details_length

    if tie_policy == "max":
        return float(entity_number)
    if tie_policy == "average":
        return (details_length + float(entity_number)) / 2
    if tie_policy == "min":
        return details_length

    # previously the rank key was silently left out of the entry in this case
    raise ValueError("Unsupported tie policy %r" % (tie_policy,))


def read_filtered_details_entries_for(model_name,
                                      dataset_name,
                                      entity_number=None,
                                      tie_policy="average"):
    """
    Read the filtered prediction details obtained by a model on a dataset.

    The file located by filtered_details_path is expected to contain, on each line,
    the ";"-separated fields head, relation, tail, prediction type, followed by the
    details. The first and last character of the details are dropped and the
    remainder is split on ";". Each line is HTML-unescaped before being parsed.
    Blank lines are skipped.

    Lines are grouped by (head, relation, tail); each fact must come with both a
    "predict head" and a "predict tail" line.

    :param model_name: the name of the model the details belong to
    :param dataset_name: the name of the dataset the details were computed on
    :param entity_number: the value used to resolve ranks of "MISS_" predictions; if None,
                          it is the size of the third mapping returned by entity_degrees.read
    :param tie_policy: one of "min", "average", "max"
    :return: a list of dicts, one per fact in order of first appearance, with keys
             "head", "relation", "tail", "head_details_filtered", "tail_details_filtered",
             "head_rank_filtered" and "tail_rank_filtered"
    :raises KeyError: if a fact lacks its "predict head" or its "predict tail" line
    :raises ValueError: if a line has fewer than five fields, or if a "MISS_" prediction
                        is met with an unsupported tie policy
    """
    print("Reading filtered details entries for " + model_name + " results on " +
          dataset_name + " with tie policy " + tie_policy + "...")
    filepath = filtered_details_path(model_name, dataset_name, tie_policy)

    if entity_number is None:
        _, _, entity_2_degree = entity_degrees.read(dataset_name)
        entity_number = len(entity_2_degree)

    # (head, relation, tail) -> {prediction type -> list of details}
    fact_2_head_tail_details = defaultdict(dict)

    with open(filepath) as input_data:
        # stream the file instead of loading every line in memory first
        for line in input_data:
            content = html.unescape(line).strip()
            if not content:
                # tolerate empty lines (e.g. a trailing newline at the end of the file)
                continue

            # at most 4 splits: any further ";" belongs to the details field
            (head, relation, tail, prediction_type, details) = content.split(";", 4)
            # drop the first and last character enclosing the details before splitting them
            details = details[1:-1].split(";")
            fact_2_head_tail_details[(head, relation, tail)][prediction_type] = details

    entries = []
    for (head, relation, tail), type_2_details in fact_2_head_tail_details.items():
        head_details = type_2_details["predict head"]
        tail_details = type_2_details["predict tail"]

        entries.append({
            "head": head,
            "relation": relation,
            "tail": tail,
            "head_details_filtered": head_details,
            "tail_details_filtered": tail_details,
            "head_rank_filtered": _filtered_rank_from_details(
                head_details, entity_number, tie_policy),
            "tail_rank_filtered": _filtered_rank_from_details(
                tail_details, entity_number, tie_policy),
        })

    return entries
import math
from collections import defaultdict

from dataset_analysis.degrees import entity_degrees
from dataset_analysis.degrees import degree_classes
from datasets import FB15K, Dataset

# Print, for each degree class, the percentage of FB15K test facts that belong to it.

dataset = Dataset(FB15K)
test_fact_2_degree_class = degree_classes.read(FB15K, return_fact_2_class=True)
_, _, mid_2_degree = entity_degrees.read(FB15K)

# degree class -> number of test facts in that class
degree_class_2_count = defaultdict(int)
all_count = len(test_fact_2_degree_class)

for fact in test_fact_2_degree_class:
    degree_class_2_count[test_fact_2_degree_class[fact]] += 1

for cls, facts_in_class in degree_class_2_count.items():
    # share of the test facts falling in this class, rounded to two decimals
    share = round(100 * float(facts_in_class) / float(all_count), 2)
    print(cls)
    print(f"{share}%")
    print()