def make_train_test_edge_data(data_dir):
    '''
    Split the graph's edges into train/validation/test 1-chain queries and
    pickle them into data_dir.

    The first 10% of the edges returned by graph.get_all_edges() are held
    out; 10% of those held-out queries become validation queries and the
    remainder become test queries. Each held-out edge gets 100 negative
    samples. The held-out edges are then removed from the graph and every
    remaining edge becomes a training query without negative samples.

    Writes val_edges.pkl, test_edges.pkl and train_edges.pkl under data_dir.

    Args:
        data_dir: directory passed to load_graph and used as output location.
    '''
    print("Loading graph...")
    # NOTE(review): the meaning of the literal 10 is defined by load_graph,
    # which is not visible here.
    graph, _, _ = load_graph(data_dir, 10)

    print("Getting all edges...")
    edges = graph.get_all_edges()
    split_point = int(0.1 * len(edges))
    val_test_edges = edges[:split_point]

    print("Getting negative samples...")
    val_test_edge_negsamples = [
        graph.get_negative_edge_samples(e, 100) for e in val_test_edges
    ]

    print("Making and storing test queries.")
    val_test_edge_queries = [
        Query(("1-chain", edge), neg_samples, None, 100)
        for edge, neg_samples in zip(val_test_edges, val_test_edge_negsamples)
    ]
    val_split_point = int(0.1 * len(val_test_edge_queries))
    val_queries = val_test_edge_queries[:val_split_point]
    test_queries = val_test_edge_queries[val_split_point:]

    # Binary pickle protocols require a binary-mode file ("wb"); the context
    # manager guarantees the data is flushed and the handle is closed.
    with open(data_dir + "/val_edges.pkl", "wb") as out_file:
        pickle.dump([q.serialize() for q in val_queries], out_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
    with open(data_dir + "/test_edges.pkl", "wb") as out_file:
        pickle.dump([q.serialize() for q in test_queries], out_file,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print("Removing test edges...")
    # Held-out edges must not leak into the training set.
    graph.remove_edges(val_test_edges)

    print("Making and storing train queries.")
    train_edges = graph.get_all_edges()
    train_queries = [Query(("1-chain", e), None, None) for e in train_edges]
    with open(data_dir + "/train_edges.pkl", "wb") as out_file:
        pickle.dump([q.serialize() for q in train_queries], out_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
def load_queries_by_type(data_file, keep_graph=True):
    '''
    Read a pickled query file and group the queries by query type.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.
        keep_graph: forwarded unchanged to Query.deserialize.

    Returns:
        defaultdict(list) mapping query.formula.query_type to the list of
        Query objects of that type, in file order.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = defaultdict(list)
    for raw_query in raw_info:
        query = Query.deserialize(raw_query, keep_graph=keep_graph)
        queries[query.formula.query_type].append(query)
    return queries
def load_queries_by_formula(data_file):
    '''
    Read a pickled query file and group the queries by type and formula.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.

    Returns:
        Nested defaultdict: query.formula.query_type -> query.formula ->
        list of Query objects, in file order.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = defaultdict(lambda: defaultdict(list))
    for raw_query in raw_info:
        query = Query.deserialize(raw_query)
        queries[query.formula.query_type][query.formula].append(query)
    return queries
def load_test_queries_by_formula(data_file, keep_graph=False):
    '''
    4. read query method
    Read a pickled test-query file and group the queries by negative-sample
    mode, query type and formula.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.
        keep_graph: forwarded unchanged to Query.deserialize.

    Returns:
        None when data_file does not exist; otherwise a dict
            key: "full_neg" (full negative sample) or
                 "one_neg" (only one negative sample)
            value: a dict()
                key: query type
                value: a dict()
                    key: formula template
                    value: list of Query objects
    '''
    if not path.exists(data_file):
        return None

    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = {
        "full_neg": defaultdict(lambda: defaultdict(list)),
        "one_neg": defaultdict(lambda: defaultdict(list)),
    }
    for raw_query in raw_info:
        # raw_query[1] is presumably the serialized negative samples
        # (confirm against Query.serialize): more than one entry means the
        # query carries the full negative set.
        neg_type = "full_neg" if len(raw_query[1]) > 1 else "one_neg"
        query = Query.deserialize(raw_query, keep_graph=keep_graph)
        queries[neg_type][query.formula.query_type][query.formula].append(
            query)
    return queries
def make_train_test_edge_data(data_dir, neg_sample_size):
    '''
    Build train/validation/test 1-chain queries from pre-split triples and
    pickle them into data_dir.

    1. Load the graph via load_graph (original note: graph-data.pkl, for the
       same format).
    2. Load training/valid/testing triples; per the original note each is a
       list of edges (head id, (domain type, predicate, range type), tail id).
    3. For every valid/test triple build one query with neg_sample_size
       negative samples and, appended after those, one query with a single
       negative sample.
    4. Build one training query per training triple, without negatives.

    Reads train_triples.pkl, valid_triples.pkl and test_triples.pkl from
    data_dir; writes val_edges.pkl, test_edges.pkl and train_edges.pkl there.

    Args:
        data_dir: directory holding the input pickles and receiving output.
        neg_sample_size: number of negative samples for the "full" queries.
    '''
    print("Loading graph...")
    # NOTE(review): the meaning of the literal 10 is defined by load_graph,
    # which is not visible here.
    graph, _, _ = load_graph(data_dir, 10)

    print("Load training/valid/testing triples...")
    train_triples = pickle_load(data_dir + "/train_triples.pkl")
    valid_triples = pickle_load(data_dir + "/valid_triples.pkl")
    test_triples = pickle_load(data_dir + "/test_triples.pkl")

    print("Getting full negative samples (for APR evaluation) and make queries...")
    valid_queries = [
        make_valid_test_edge_from_triple(graph, edge, neg_sample_size)
        for edge in valid_triples
    ]
    test_queries = [
        make_valid_test_edge_from_triple(graph, edge, neg_sample_size)
        for edge in test_triples
    ]

    print("Getting one negative samples (for AUC evaluation) and make queries...")
    valid_queries += [
        make_valid_test_edge_from_triple(graph, edge, 1)
        for edge in valid_triples
    ]
    test_queries += [
        make_valid_test_edge_from_triple(graph, edge, 1)
        for edge in test_triples
    ]

    print("Dumping valid/test 1-chain queries")
    # Binary pickle protocols require a binary-mode file ("wb"); the context
    # manager guarantees the data is flushed and the handle is closed.
    with open(data_dir + "/val_edges.pkl", "wb") as out_file:
        pickle.dump([q.serialize() for q in valid_queries], out_file,
                    protocol=pickle.HIGHEST_PROTOCOL)
    with open(data_dir + "/test_edges.pkl", "wb") as out_file:
        pickle.dump([q.serialize() for q in test_queries], out_file,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print("Dumping train 1-chain queries")
    train_queries = [
        Query(("1-chain", e), None, None, keep_graph=True)
        for e in train_triples
    ]
    with open(data_dir + "/train_edges.pkl", "wb") as out_file:
        pickle.dump([q.serialize() for q in train_queries], out_file,
                    protocol=pickle.HIGHEST_PROTOCOL)

    print("Finish making training/valid/testing 1-chain queries")
def load_queries(data_file, keep_graph=False):
    '''
    1. read query method
    Read a pickled query file as a flat list of Query objects.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.
        keep_graph: forwarded unchanged to Query.deserialize.

    Returns:
        List of Query objects in file order.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)
    return [
        Query.deserialize(info, keep_graph=keep_graph) for info in raw_info
    ]
def load_test_queries_by_formula(data_file):
    '''
    Read a pickled test-query file and group the queries by negative-sample
    mode, query type and formula.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.

    Returns:
        dict with keys "full_neg" and "one_neg"; each value is a nested
        defaultdict: query type -> formula -> list of Query objects.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = {
        "full_neg": defaultdict(lambda: defaultdict(list)),
        "one_neg": defaultdict(lambda: defaultdict(list)),
    }
    for raw_query in raw_info:
        # raw_query[1] is presumably the serialized negative samples
        # (confirm against Query.serialize): more than one entry means the
        # query carries the full negative set.
        neg_type = "full_neg" if len(raw_query[1]) > 1 else "one_neg"
        query = Query.deserialize(raw_query)
        queries[neg_type][query.formula.query_type][query.formula].append(query)
    return queries
def load_queries_by_type(data_file, keep_graph=True):
    '''
    3. read query method
    Read a pickled query file as a dict
        key: query type
        value: a list of Query objects

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.
        keep_graph: forwarded unchanged to Query.deserialize.

    Returns:
        defaultdict(list) keyed by query.formula.query_type.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = defaultdict(list)
    for raw_query in raw_info:
        query = Query.deserialize(raw_query, keep_graph=keep_graph)
        queries[query.formula.query_type].append(query)
    return queries
def load_test_queries_by_formula(data_file):
    '''
    Read a pickled test-query file and group the queries by negative-sample
    mode, query type and formula, always deserializing with keep_graph=True.

    Each raw (serialized) query is printed to stdout before it is
    deserialized.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.

    Returns:
        dict with keys "full_neg" and "one_neg"; each value is a nested
        defaultdict: query type -> formula -> list of Query objects.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = {
        "full_neg": defaultdict(lambda: defaultdict(list)),
        "one_neg": defaultdict(lambda: defaultdict(list)),
    }
    for raw_query in raw_info:
        # raw_query[1] is presumably the serialized negative samples
        # (confirm against Query.serialize): more than one entry means the
        # query carries the full negative set.
        neg_type = "full_neg" if len(raw_query[1]) > 1 else "one_neg"
        # TODO Gad: I have changed keep_graph to True to keep the nodes
        # Single-argument call form prints the same text on Python 2 and 3.
        print(raw_query)
        query = Query.deserialize(raw_query, keep_graph=True)
        queries[neg_type][query.formula.query_type][query.formula].append(
            query)
    return queries
def load_queries_by_formula(data_file, keep_graph=True):
    '''
    2. read query method
    Read a pickled query file and group the queries by type and formula.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.
        keep_graph: forwarded unchanged to Query.deserialize.

    Returns:
        None when data_file does not exist; otherwise a dict
            key: query type
            value: a dict()
                key: formula template
                value: list of Query objects
    '''
    if not path.exists(data_file):
        return None

    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)

    queries = defaultdict(lambda: defaultdict(list))
    for raw_query in raw_info:
        query = Query.deserialize(raw_query, keep_graph=keep_graph)
        queries[query.formula.query_type][query.formula].append(query)
    return queries
def load_queries(data_file, keep_graph=False):
    '''
    Read a pickled query file as a flat list of Query objects.

    Args:
        data_file: path of a pickle file holding an iterable of serialized
            queries, each accepted by Query.deserialize.
        keep_graph: forwarded unchanged to Query.deserialize.

    Returns:
        List of Query objects in file order.
    '''
    # Close the file deterministically instead of leaking the handle.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(data_file, "rb") as query_file:
        raw_info = pickle.load(query_file)
    return [
        Query.deserialize(info, keep_graph=keep_graph) for info in raw_info
    ]
def make_valid_test_edge_from_triple(graph, edge, neg_sample_size):
    '''
    Wrap a single edge as a "1-chain" evaluation Query with negatives.

    Args:
        graph: object providing get_negative_edge_samples(edge, size).
        edge: the triple to turn into a query.
        neg_sample_size: number of negative samples requested from the graph;
            also passed on to the Query constructor.

    Returns:
        A Query built with keep_graph=True.
    '''
    negatives = graph.get_negative_edge_samples(edge, neg_sample_size)
    formula_spec = ("1-chain", edge)
    return Query(
        formula_spec,
        negatives,
        None,
        neg_sample_size,
        keep_graph=True,
    )