def read(dataset_name, read_separator=";"):
    """
    Read the mappings <prediction -> number of correct answers> from the
    corresponding file of the dataset with the given name and return them
    as two dicts:
        - head_prediction_2_peers: maps each "head question" relation;tail_entity
          to the number of correct head answers
        - tail_prediction_2_peers: maps each "tail question" head_entity;relation
          to the number of correct tail answers

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :return: the two mappings, head-side first
    """
    print("Reading number of peers for training facts of dataset %s" % dataset_name)
    filepath = os.path.join(datasets.home_folder_for(dataset_name),
                            FOLDER,
                            TRAIN_FACTS_WITH_PEERS_FILENAME)

    head_prediction_2_peers = defaultdict(lambda: 0)
    tail_prediction_2_peers = defaultdict(lambda: 0)

    with open(filepath, "r") as input_file:
        for raw_line in input_file:
            # YAGO entity names may contain HTML escapes (e.g. &amp;)
            cleaned = html.unescape(raw_line).strip()
            head, relation, tail, head_peers, tail_peers = cleaned.split(read_separator)
            head_prediction_2_peers[relation + ";" + tail] = int(head_peers)
            tail_prediction_2_peers[head + ";" + relation] = int(tail_peers)

    return head_prediction_2_peers, tail_prediction_2_peers
def read(dataset_name, read_separator=";", return_fact_2_arity=False):
    """
    Read the <test fact -> arity> mappings for a dataset from the
    pre-computed file and return them
        - either as <arity -> list of test facts with that arity> (default)
        - or as <test fact -> arity> when return_fact_2_arity is True.

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_fact_2_arity: if True, return <fact -> arity>; otherwise
                                return <arity -> facts>
    :return: the computed mappings
    """
    filepath = os.path.join(datasets.home_folder_for(dataset_name),
                            FOLDER,
                            TEST_FACTS_WITH_ARITY_FILENAME)
    with open(filepath, "r") as input_file:
        lines = input_file.readlines()

    def _fields(line):
        # YAGO entity names may contain HTML escapes (e.g. &amp;)
        return html.unescape(line).strip().split(read_separator)

    if return_fact_2_arity:
        triple_2_arity = {}
        for line in lines:
            head, relation, tail, arity = _fields(line)
            triple_2_arity[read_separator.join([head, relation, tail])] = int(arity)
        return triple_2_arity

    arity_2_triples = defaultdict(list)
    for line in lines:
        head, relation, tail, arity = _fields(line)
        arity_2_triples[int(arity)].append(read_separator.join([head, relation, tail]))
    return arity_2_triples
def read(dataset_name, read_separator=";"):
    """
    Read the file containing, for every entity in the training set of a
    dataset, its in degree, out degree and overall degree, and return the
    three mappings.

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :return: the mappings, in the order <entity -> in degree>,
             <entity -> out degree>, <entity -> overall degree>
    """
    print("Reading the mappings <entity name -> degree> (for in, out and overall degree) in %s training set..." % dataset_name)
    filepath = os.path.join(datasets.home_folder_for(dataset_name), FOLDER, FILENAME)

    in_degrees = defaultdict(lambda: 0)
    out_degrees = defaultdict(lambda: 0)
    overall_degrees = defaultdict(lambda: 0)

    with open(filepath) as input_data:
        for raw_line in input_data:
            # YAGO entity names may contain HTML escapes (e.g. &amp;)
            mid, in_deg, out_deg, deg = html.unescape(raw_line).strip().split(read_separator)
            in_degrees[mid] = int(in_deg)
            out_degrees[mid] = int(out_deg)
            overall_degrees[mid] = int(deg)

    return in_degrees, out_degrees, overall_degrees
def read(dataset_name, read_separator=";"):
    """
    Read the file that contains the mappings <relationship name -> list of types>
    for the training set of a specific dataset.

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :return: the mapping <relationship name -> list of types>
    """
    print("Reading the mappings <relationship name -> list of types> in %s training set..." % dataset_name)
    filepath = os.path.join(datasets.home_folder_for(dataset_name), FOLDER, FILENAME)

    relation_2_types = {}
    with open(filepath) as input_data:
        for raw_line in input_data:
            # YAGO entity names may contain HTML escapes (e.g. &amp;)
            relation, types = html.unescape(raw_line).strip().split(read_separator)
            # the types field is itself a comma-separated list
            relation_2_types[relation] = types.split(",")

    return relation_2_types
def save(dataset):
    """
    Compute the mappings <relation fine class -> relations that belong to that class>
    for a dataset and save them in its home folder, one "relation;fine_class"
    row per line.

    :param dataset: the dataset object to compute and save the mappings for
    """
    fine_class_2_rels = compute(dataset)

    lines = []
    for fine_class in FINE_CLASSES:
        for rel in fine_class_2_rels[fine_class]:
            lines.append(";".join([rel, fine_class]) + "\n")

    # BUG FIX: home_folder_for expects the dataset *name* (as every sibling
    # module passes dataset.name / dataset_name); the original passed
    # dataset.home, which is already a folder path and built a wrong filepath.
    dataset_home = datasets.home_folder_for(dataset.name)
    output_filepath = os.path.join(dataset_home, FILENAME)

    print("Saving fine-grained relation classes for dataset %s into location %s" % (dataset.name, output_filepath))
    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)
def save(dataset_name, write_separator=";"):
    """
    Compute the mappings <test fact -> degree class> for all the test facts
    of a dataset and save them to a file, one
    "head<sep>relation<sep>tail<sep>degree_class" row per fact.

    :param dataset_name: the name of the dataset for which to compute the mappings
    :param write_separator: the separator to use when writing the file
    """
    degree_class_2_facts = compute(dataset_name)

    lines = []
    for degree_class in CLASSES:
        for head, relationship, tail in degree_class_2_facts[degree_class]:
            row = write_separator.join([head, relationship, tail, degree_class])
            lines.append(row + "\n")

    print("Saving the mappings <degree class -> list of test facts belonging to that degree class> for dataset %s ..." % dataset_name)
    output_filepath = os.path.join(datasets.home_folder_for(dataset_name), FILENAME)
    with open(output_filepath, "w") as output_file:
        output_file.writelines(lines)
def save(dataset, read_separator=";"):
    """
    Compute the arity of each test fact of a dataset and save it to a file,
    one "head<sep>relation<sep>tail<sep>arity" row per test fact.

    :param dataset: the dataset whose test facts should be annotated with arity
    :param read_separator: the separator to use in the output rows.
                           BUG FIX: this parameter was accepted but ignored
                           (";" was hard-coded); it is now honored, and the
                           default preserves the original behavior.
    """
    test_fact_2_arity = compute(dataset)
    print("Saving the arity for each test fact in " + dataset.name + "...")

    output_lines = []
    for test_fact in dataset.test_triples:
        # assumes compute() keys its result by separator-joined triples
        # with the same separator — TODO confirm for non-default separators
        key = read_separator.join(test_fact)
        output_lines.append(key + read_separator + str(test_fact_2_arity[key]) + "\n")

    filepath = os.path.join(datasets.home_folder_for(dataset.name),
                            FOLDER,
                            TEST_FACTS_WITH_ARITY_FILENAME)
    with open(filepath, "w") as outfile:
        outfile.writelines(output_lines)
def read(dataset_name, read_separator=";", return_fact_2_class=False):
    """
    Read the peer-class annotations of the test facts of a dataset and
    return them
        - either as <peer class -> test facts belonging to that class> (default)
        - or as <test fact -> peer class it belongs to>.

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_fact_2_class: if True, return <fact -> peer class>;
                                otherwise return <peer class -> facts>
    :return: the computed mappings
    """
    print("Reading peer classes for test facts of dataset %s..." % dataset_name)
    input_filepath = os.path.join(datasets.home_folder_for(dataset_name),
                                  FOLDER,
                                  TEST_FACTS_WITH_PEERS_FILENAME)

    with open(input_filepath, "r") as input_file:
        # YAGO entity names may contain HTML escapes (e.g. &amp;)
        rows = [html.unescape(line).strip().split(read_separator)
                for line in input_file]

    if return_fact_2_class:
        return {";".join([head, relation, tail]): peer_class
                for head, relation, tail, peer_class in rows}

    peer_class_2_facts = {peer_class: [] for peer_class in PEER_CLASSES}
    for head, relation, tail, peer_class in rows:
        peer_class_2_facts[peer_class].append([head, relation, tail])
    return peer_class_2_facts
def read(dataset_name, read_separator=";", return_fact_2_class=False):
    """
    Read the degree-class annotations of the test facts of a dataset and
    return them
        - either as <degree class -> test facts in that class> (default)
        - or as <test fact -> degree class it belongs to>.

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_fact_2_class: if True, return <fact -> degree class>;
                                otherwise return <degree class -> facts>
    :return: the computed mappings
    """
    print("Reading the mappings <degree class -> list of test facts belonging to that degree class> for dataset %s ..." % dataset_name)
    dataset_folder = datasets.home_folder_for(dataset_name)

    with open(os.path.join(dataset_folder, FILENAME), "r") as input_file:
        # YAGO entity names may contain HTML escapes (e.g. &amp;)
        rows = [html.unescape(line).strip().split(read_separator)
                for line in input_file]

    if return_fact_2_class:
        return {";".join([head, relation, tail]): degree_class
                for head, relation, tail, degree_class in rows}

    degree_class_2_facts = {degree_class: [] for degree_class in CLASSES}
    for head, relation, tail, degree_class in rows:
        degree_class_2_facts[degree_class].append([head, relation, tail])
    return degree_class_2_facts
def read(dataset_name, read_separator=";", return_fact_2_clique_size=False):
    """
    Read from the filesystem, for each test triple of a dataset, the size of
    the maximal clique containing that triple.

    :param dataset_name: the name of the dataset to read the mappings for
    :param read_separator: the separator to use when reading the file
    :param return_fact_2_clique_size: if True return <fact -> clique size>;
                                      otherwise return
                                      <clique size -> facts with that size>
    :return: the computed mappings
    """
    # BUG FIX: the original message said "number of siblings for training
    # facts" (copy-pasted from another module); this function actually reads
    # maximal clique sizes for TEST facts.
    print("Reading maximal clique sizes for test facts of dataset %s" % dataset_name)
    filepath = os.path.join(datasets.home_folder_for(dataset_name),
                            FOLDER,
                            TEST_FACTS_WITH_MAXIMAL_CLIQUE_SIZE_FILENAME)

    with open(filepath, "r") as input_file:
        lines = input_file.readlines()

    def _fields(line):
        # YAGO entity names may contain HTML escapes (e.g. &amp;)
        return html.unescape(line).strip().split(read_separator)

    if return_fact_2_clique_size:
        triple_2_clique_size = {}
        for line in lines:
            head, relation, tail, max_clique_size = _fields(line)
            key = read_separator.join([head, relation, tail])
            triple_2_clique_size[key] = int(max_clique_size)
        return triple_2_clique_size

    clique_size_2_triples = defaultdict(list)
    for line in lines:
        head, relation, tail, max_clique_size = _fields(line)
        clique_size_2_triples[int(max_clique_size)].append(
            read_separator.join([head, relation, tail]))
    return clique_size_2_triples
def save(dataset):
    """
    Compute the freebase-clean CVTs for each reified fact of a dataset and
    write them to a file, one "fact;[cvt1;cvt2;...]" row per reified fact.

    :param dataset: the dataset to compute and save the CVT mappings for
    """
    reified_fact_2_cvts = compute(dataset)
    print("Saving the freebase-clean CVTs for each fact in " + dataset.name + "...")

    output_lines = []
    for reified_fact, cvts in reified_fact_2_cvts.items():
        # serialize the CVT list between square brackets
        output_lines.append(reified_fact + ";[" + ";".join(cvts) + "]\n")

    filepath = os.path.join(datasets.home_folder_for(dataset.name),
                            FOLDER,
                            ALL_FACTS_WITH_CVTS_FILENAME)
    with open(filepath, "w") as outfile:
        outfile.writelines(output_lines)
def read(dataset_name, read_separator=";", return_rel_2_class=False):
    """
    Read the coarse-grained relation classes of a dataset and return them
        - either as <coarse class -> relations in that class> (default)
        - or as <relation -> coarse class it belongs to>.

    :param dataset_name: the name of the dataset for which to read the mappings
    :param read_separator: the separator to use when reading the csv file
    :param return_rel_2_class: if True, return <relation -> class>;
                               otherwise return <class -> relations>
    :return: the computed mappings
    """
    print("Reading coarse-grained relation classes for dataset %s" % dataset_name)
    dataset_home = datasets.home_folder_for(dataset_name)

    with open(os.path.join(dataset_home, FILENAME), "r") as input_file:
        rows = [line.strip().split(read_separator) for line in input_file]

    if return_rel_2_class:
        return {relation: coarse_class for relation, coarse_class in rows}

    coarse_class_2_rels = {coarse_class: [] for coarse_class in COARSE_CLASSES}
    for relation, coarse_class in rows:
        coarse_class_2_rels[coarse_class].append(relation)
    return coarse_class_2_rels
def read(dataset, return_cvt_2_facts=False):
    """
    Read the freebase-clean CVTs stored for each fact of a dataset and
    return them
        - either as <fact -> list of CVTs> (default)
        - or as <CVT -> list of facts it appears in>.

    :param dataset: the dataset to read the CVT mappings for
    :param return_cvt_2_facts: if True, return <CVT -> facts>;
                               otherwise return <fact -> CVTs>
    :return: the computed mappings
    """
    filepath = os.path.join(datasets.home_folder_for(dataset.name),
                            FOLDER,
                            ALL_FACTS_WITH_CVTS_FILENAME)
    with open(filepath, "r") as infile:
        lines = infile.readlines()

    def _parse(line):
        # rows have the shape "head;rel;tail;[cvt1;cvt2;...]";
        # split at most 3 times so the bracketed CVT list stays intact
        head, rel, tail, bracketed = line.strip().split(";", 3)
        return head, rel, tail, bracketed[1:-1].split(";")

    if return_cvt_2_facts:
        cvt_2_facts = defaultdict(list)
        for line in lines:
            head, rel, tail, cvts = _parse(line)
            for cvt in cvts:
                cvt_2_facts[cvt].append((head, rel, tail))
        return cvt_2_facts

    fact_2_cvts = {}
    for line in lines:
        head, rel, tail, cvts = _parse(line)
        fact_2_cvts[";".join([head, rel, tail])] = cvts
    return fact_2_cvts
def read(dataset_name, read_separator=";"):
    """
    Read the file that contains the mapping <relation name -> number of mentions>
    for the training set of a specific dataset.

    :param dataset_name: the name of the dataset to read the mappings for,
                         from its specific, pre-computed file
    :param read_separator: the separator to use when reading the file
    :return: the mapping <relation name -> number of mentions>
    """
    print("Reading the mapping <relation name -> number of mentions> in %s training set..." % dataset_name)
    filepath = os.path.join(datasets.home_folder_for(dataset_name), FOLDER, FILENAME)

    name_2_count = defaultdict(lambda: 0)
    with open(filepath) as input_data:
        for raw_line in input_data:
            # YAGO entity names may contain HTML escapes (e.g. &amp;)
            name, count = html.unescape(raw_line).strip().split(read_separator)
            name_2_count[name] = int(count)

    return name_2_count