import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# download_file, extract_file, and general_parser are assumed to be defined
# elsewhere in this module.


def parse_ogbl(files, has_rel, output_dir, num_partitions=1):
    """Convert an OGB link-prediction dataset to TSV splits and preprocess
    them with general_parser."""
    if has_rel:
        # Each split is a torch-serialized dict with "head", "relation", and
        # "tail" arrays; stack them into (num_edges, 3) triple lists.
        train_idx = torch.load(str(files[0]))
        valid_idx = torch.load(str(files[1]))
        test_idx = torch.load(str(files[2]))

        train_list = np.array([train_idx.get("head"),
                               train_idx.get("relation"),
                               train_idx.get("tail")]).T
        valid_list = np.array([valid_idx.get("head"),
                               valid_idx.get("relation"),
                               valid_idx.get("tail")]).T
        test_list = np.array([test_idx.get("head"),
                              test_idx.get("relation"),
                              test_idx.get("tail")]).T
    else:
        # Without relations, each split dict stores a (num_edges, 2) "edge" array.
        train_list = torch.load(files[0]).get("edge")
        valid_list = torch.load(files[1]).get("edge")
        test_list = torch.load(files[2]).get("edge")

    np.savetxt(str(Path(output_dir) / Path("train.txt")), train_list,
               fmt="%s", delimiter="\t", newline="\n")
    np.savetxt(str(Path(output_dir) / Path("valid.txt")), valid_list,
               fmt="%s", delimiter="\t", newline="\n")
    np.savetxt(str(Path(output_dir) / Path("test.txt")), test_list,
               fmt="%s", delimiter="\t", newline="\n")
    print("Conversion completed.")

    # "srd" = source, relation, destination columns; "sd" = source, destination.
    edge_format = ["srd"] if has_rel else ["sd"]
    stats, num_nodes, num_rels = general_parser(
        [str(Path(output_dir) / Path("train.txt")),
         str(Path(output_dir) / Path("valid.txt")),
         str(Path(output_dir) / Path("test.txt"))],
        edge_format, [output_dir], num_partitions=num_partitions)
    return stats, num_nodes, num_rels

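# A minimal usage sketch for parse_ogbl, assuming the three torch-serialized
# OGB split files are passed in train/valid/test order. This example is not
# part of the original module, and the paths below are hypothetical; they
# depend on where the OGB archive was extracted.
#
#   files = [Path("ogbl_wikikg/split/time/train.pt"),
#            Path("ogbl_wikikg/split/time/valid.pt"),
#            Path("ogbl_wikikg/split/time/test.pt")]
#   stats, num_nodes, num_rels = parse_ogbl(files, has_rel=True,
#                                           output_dir="output_dir/")
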
def parse_ogbn(files, output_dir, num_partitions=1):
    """Convert an OGB node-property dataset into edge splits: each edge is
    assigned to train/valid/test based on the split of its source node."""
    # Every file except the last lists the node ids of one split; the last
    # file is the edge list. All inputs are gzip-compressed CSVs.
    splits = []
    for file in files[0:-1]:
        nodes = pd.read_csv(file, compression='gzip', header=None)
        splits.append(nodes)

    edges = pd.read_csv(files[-1], compression='gzip', header=None)

    train_edges = edges.loc[np.in1d(edges[0], splits[0])]
    valid_edges = edges.loc[np.in1d(edges[0], splits[1])]
    test_edges = edges.loc[np.in1d(edges[0], splits[2])]

    train_edges.to_csv(str(Path(output_dir) / Path("train.txt")),
                       sep="\t", header=False, index=False)
    valid_edges.to_csv(str(Path(output_dir) / Path("valid.txt")),
                       sep="\t", header=False, index=False)
    test_edges.to_csv(str(Path(output_dir) / Path("test.txt")),
                      sep="\t", header=False, index=False)

    stats, num_nodes, num_rels = general_parser(
        [str(Path(output_dir) / Path("train.txt")),
         str(Path(output_dir) / Path("valid.txt")),
         str(Path(output_dir) / Path("test.txt"))],
        ["sd"], [output_dir], num_partitions=num_partitions)
    return stats, num_nodes, num_rels

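# A minimal usage sketch for parse_ogbn, assuming the first three files hold
# the node ids of each split and the last holds the full edge list. This
# example is not part of the original module, and the file names are
# hypothetical.
#
#   files = [Path("ogbn_arxiv/split/time/train.csv.gz"),
#            Path("ogbn_arxiv/split/time/valid.csv.gz"),
#            Path("ogbn_arxiv/split/time/test.csv.gz"),
#            Path("ogbn_arxiv/raw/edge.csv.gz")]
#   stats, num_nodes, num_rels = parse_ogbn(files, output_dir="output_dir/")
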
def kinships(output_dir, num_partitions=1, split=(.05, .05)):
    KINSHIPS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/kinship/kinship.data"
    download_path = download_file(KINSHIPS_URL, output_dir)

    # Each non-empty line is a Prolog-style fact such as
    # "aunt(Margaret, Arthur)"; capture the relation and the two entities.
    pattern = re.compile(r"^(?P<rel>[a-z]+)"
                         r"\((?P<n1>[A-Za-z]+).{2}(?P<n2>[A-Za-z]+)\)\n$")

    edges = []
    with open(download_path, "r") as f:
        for line in f:
            if line[0] == '\n':
                continue
            m = pattern.match(line)
            if m is None:
                continue
            edges.append([m.group("n1"), m.group("rel"), m.group("n2")])

    if (Path(output_dir) / Path("sample_edges.txt")).exists():
        (Path(output_dir) / Path("sample_edges.txt")).unlink()

    np.random.shuffle(edges)
    np.savetxt((Path(output_dir) / Path("sample_edges.txt")), edges,
               fmt="%s", delimiter="\t", newline="\n")

    return general_parser([str(Path(output_dir) / Path("sample_edges.txt"))],
                          ["srd"], [output_dir],
                          num_partitions=num_partitions, dataset_split=split)

def live_journal(output_dir, num_partitions=1, split=(.05, .05)):
    LIVE_JOURNAL_URL = "https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz"
    download_path = download_file(LIVE_JOURNAL_URL, output_dir)
    extract_file(download_path)

    return general_parser([str(Path(output_dir) / Path("soc-LiveJournal1.txt"))],
                          ["sd"], [output_dir],
                          num_partitions=num_partitions, dataset_split=split)

def drkg(output_dir, num_partitions=1, split=(.05, .05)):
    DRKG_URL = "https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz"
    download_path = download_file(DRKG_URL, output_dir)
    extract_file(download_path)

    return general_parser([str(Path(output_dir) / Path("drkg.tsv"))],
                          ["srd"], [output_dir],
                          num_partitions=num_partitions, dataset_split=split)

def twitter(output_dir, num_partitions=1, split=(.05, .05)):
    TWITTER_URL = "https://snap.stanford.edu/data/twitter-2010.txt.gz"
    download_path = download_file(TWITTER_URL, output_dir)
    extract_file(download_path)

    return general_parser([str(Path(output_dir) / Path("twitter-2010.txt"))],
                          ["srd"], [output_dir],
                          num_partitions=num_partitions, dataset_split=split,
                          num_line_skip=1)

def hetionet(output_dir, num_partitions=1, split=(.05, .05)):
    HETIONET_URL = "https://github.com/hetio/hetionet/raw/master/hetnet/tsv/hetionet-v1.0-edges.sif.gz"
    download_path = download_file(HETIONET_URL, output_dir)
    extract_file(download_path)

    return general_parser([str(Path(output_dir) / Path("hetionet-v1.0-edges.sif"))],
                          ["srd"], [output_dir],
                          num_partitions=num_partitions, dataset_split=split)

def fb15k_237(output_dir, num_partitions=1):
    FB15K_237_URL = "https://data.deepai.org/FB15K-237.2.zip"
    download_path = download_file(FB15K_237_URL, output_dir)
    extract_file(download_path)

    # Flatten the extracted "Release" directory into output_dir.
    for file in (output_dir / Path("Release")).iterdir():
        file.rename(output_dir / Path(file.name))
    (output_dir / Path("Release")).rmdir()

    return general_parser([str(Path(output_dir) / Path("train.txt")),
                           str(Path(output_dir) / Path("valid.txt")),
                           str(Path(output_dir) / Path("test.txt"))],
                          ["srd"], [output_dir], num_partitions=num_partitions)

def wn18(output_dir, num_partitions=1):
    WN18_URL = "https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:wordnet-mlj12.tar.gz"
    download_path = download_file(WN18_URL, output_dir)
    extract_file(download_path)

    # Flatten the extracted "wordnet-mlj12" directory into output_dir.
    for file in (output_dir / Path("wordnet-mlj12")).iterdir():
        file.rename(output_dir / Path(file.name))
    (output_dir / Path("wordnet-mlj12")).rmdir()

    return general_parser([str(Path(output_dir) / Path("wordnet-mlj12-train.txt")),
                           str(Path(output_dir) / Path("wordnet-mlj12-valid.txt")),
                           str(Path(output_dir) / Path("wordnet-mlj12-test.txt"))],
                          ["srd"], [output_dir], num_partitions=num_partitions)

def freebase86m(output_dir, num_partitions=1):
    FREEBASE86M_URL = "https://data.dgl.ai/dataset/Freebase.zip"
    download_path = download_file(FREEBASE86M_URL, output_dir)
    extract_file(download_path)

    # Flatten the extracted "Freebase" directory into output_dir.
    for file in (output_dir / Path("Freebase")).iterdir():
        file.rename(output_dir / Path(file.name))
    (output_dir / Path("Freebase")).rmdir()

    # Note the "sdr" column order: these files store source, destination,
    # relation rather than the source, relation, destination order used above.
    return general_parser([str(Path(output_dir) / Path("train.txt")),
                           str(Path(output_dir) / Path("valid.txt")),
                           str(Path(output_dir) / Path("test.txt"))],
                          ["sdr"], [output_dir], num_partitions=num_partitions)

def fb15k(output_dir, num_partitions=1):
    FB15K_URL = "https://dl.fbaipublicfiles.com/starspace/fb15k.tgz"
    download_path = download_file(FB15K_URL, output_dir)
    extract_file(download_path)

    # Flatten the extracted "FB15k" directory into output_dir.
    for file in (output_dir / Path("FB15k")).iterdir():
        file.rename(output_dir / Path(file.name))
    (output_dir / Path("FB15k")).rmdir()

    return general_parser(
        [str(Path(output_dir) / Path("freebase_mtr100_mte100-train.txt")),
         str(Path(output_dir) / Path("freebase_mtr100_mte100-valid.txt")),
         str(Path(output_dir) / Path("freebase_mtr100_mte100-test.txt"))],
        ["srd"], [output_dir], num_partitions=num_partitions)

def openbiolink_hq(output_dir, num_partitions=1):
    OPENBIOLINK_HQ_URL = "https://zenodo.org/record/3834052/files/HQ_DIR.zip?download=1"
    download_path = download_file(OPENBIOLINK_HQ_URL, output_dir)
    extract_file(download_path)

    return general_parser(
        [str(Path(output_dir) / Path("HQ_DIR/train_test_data/train_sample.csv")),
         str(Path(output_dir) / Path("HQ_DIR/train_test_data/val_sample.csv")),
         str(Path(output_dir) / Path("HQ_DIR/train_test_data/test_sample.csv"))],
        ["srd"], [output_dir], num_partitions=num_partitions, num_line_skip=0)

def codex_l(output_dir, num_partitions=1):
    CODEX_L_TRAIN_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/train.txt"
    CODEX_L_VALID_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/valid.txt"
    CODEX_L_TEST_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/test.txt"

    # The splits ship as plain text files, so no extraction step is needed.
    download_file(CODEX_L_TRAIN_URL, output_dir)
    download_file(CODEX_L_VALID_URL, output_dir)
    download_file(CODEX_L_TEST_URL, output_dir)

    return general_parser([str(Path(output_dir) / Path("train.txt")),
                           str(Path(output_dir) / Path("valid.txt")),
                           str(Path(output_dir) / Path("test.txt"))],
                          ["srd"], [output_dir], num_partitions=num_partitions)

def wn18rr(output_dir, num_partitions=1):
    WN18RR_URL = "https://data.dgl.ai/dataset/wn18rr.zip"
    download_path = download_file(WN18RR_URL, output_dir)
    extract_file(download_path)

    # Flatten the extracted "wn18rr" directory into output_dir.
    for file in (output_dir / Path("wn18rr")).iterdir():
        file.rename(output_dir / Path(file.name))
    (output_dir / Path("wn18rr")).rmdir()

    return general_parser([str(Path(output_dir) / Path("train.txt")),
                           str(Path(output_dir) / Path("valid.txt")),
                           str(Path(output_dir) / Path("test.txt"))],
                          ["srd"], [output_dir], num_partitions=num_partitions)

def openbiolink_lq(output_dir, num_partitions=1):
    OPENBIOLINK_LQ_URL = "https://samwald.info/res/OpenBioLink_2020_final/ALL_DIR.zip"
    download_path = download_file(OPENBIOLINK_LQ_URL, output_dir)
    extract_file(download_path)

    return general_parser(
        [str(Path(output_dir) / Path("ALL_DIR/train_test_data/train_sample.csv")),
         str(Path(output_dir) / Path("ALL_DIR/train_test_data/val_sample.csv")),
         str(Path(output_dir) / Path("ALL_DIR/train_test_data/test_sample.csv"))],
        ["srd"], [output_dir], num_partitions=num_partitions, num_line_skip=0)

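# A minimal usage example for the dataset helpers above (an assumption, not
# part of the original module): each helper downloads and extracts a dataset
# into output_dir and returns the (stats, num_nodes, num_rels) triple produced
# by general_parser. The output path below is hypothetical; note that helpers
# which flatten an extracted directory expect output_dir to be a Path.
#
#   out = Path("datasets/fb15k237")
#   out.mkdir(parents=True, exist_ok=True)
#   stats, num_nodes, num_rels = fb15k_237(out)
#   print(f"train/valid/test edges: {stats}, "
#         f"{num_nodes} nodes, {num_rels} relation types")
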
def test_basic(self):
    """
    Check the preprocessor executes on the test data without error
    """
    output_dir = "test/test_data/"
    train_file = "train_edges.txt"
    valid_file = "valid_edges.txt"
    test_file = "test_edges.txt"

    stats, num_nodes, num_rels = general_parser(
        [str(Path(output_dir) / train_file),
         str(Path(output_dir) / valid_file),
         str(Path(output_dir) / test_file)],
        ["srd"], [output_dir], num_partitions=1)

    # stats holds the train/valid/test edge counts.
    assert stats[0] == 1000
    assert stats[1] == 100
    assert stats[2] == 100
    assert num_nodes == 100
    assert num_rels == 10