def download(self, overwrite=False):
    """Fetch and lay out the MAG240M input files if any are missing.

    Sets the ``input_*`` path attributes, downloads the archive when at
    least one expected file is absent, and moves the extracted artifacts
    up into ``self.output_directory`` under canonical names.
    """
    self.input_cites_edge_list_file = self.output_directory / Path("cites_edge_index.npy")
    self.input_splits_file = self.output_directory / Path("split_dict.pt")
    self.input_node_feature_file = self.output_directory / Path("node_feat.npy")
    self.input_node_label_file = self.output_directory / Path("node_label.npy")

    expected_files = [
        self.input_cites_edge_list_file,
        self.input_splits_file,
        self.input_node_feature_file,
        self.input_node_label_file,
    ]
    # Only hit the network when something is missing.
    if any(not f.exists() for f in expected_files):
        archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
        extract_file(archive_path, remove_input=False)
        # Relocate the extracted files to the flat layout the loaders expect.
        extracted_root = self.output_directory / Path("mag240m_kddcup2021")
        (extracted_root / Path("processed/paper___cites___paper/edge_index.npy")).rename(
            self.input_cites_edge_list_file)
        (extracted_root / Path("split_dict.pt")).rename(self.input_splits_file)
        (extracted_root / Path("processed/paper/node_feat.npy")).rename(
            self.input_node_feature_file)
        (extracted_root / Path("processed/paper/node_label.npy")).rename(
            self.input_node_label_file)
def download(self, overwrite=False):
    """Fetch the WikiKG90Mv2 input files if any required split is missing.

    Sets the ``input_*`` path attributes, downloads and extracts the archive
    when a required edge file is absent, then flattens the extracted
    ``processed`` directory into ``self.output_directory``.
    """
    self.input_train_edges_file = self.output_directory / Path("train_hrt.npy")
    self.input_valid_edges_sr_file = self.output_directory / Path("val_hr.npy")
    self.input_valid_edges_d_file = self.output_directory / Path("val_t.npy")
    # Test splits are intentionally not handled here:
    # self.input_test_edges_file = self.output_directory / Path("test-dev_hr.npy")
    # self.input_test_edges_file = self.output_directory / Path("test-challenge_hr.npy")
    self.input_node_feature_file = self.output_directory / Path("entity_feat.npy")
    self.input_rel_feature_file = self.output_directory / Path("relation_feat.npy")

    # Presence of the train/valid edge files decides whether to download;
    # feature files are not checked (matches original behavior).
    required = [
        self.input_train_edges_file,
        self.input_valid_edges_sr_file,
        self.input_valid_edges_d_file,
    ]
    if any(not f.exists() for f in required):
        archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
        extract_file(archive_path, remove_input=True)
        # Move everything out of the nested "processed" directory.
        for entry in (self.output_directory / Path("wikikg90m-v2/processed/")).iterdir():
            entry.rename(self.output_directory / Path(entry.name))
def download(self, overwrite=False):
    """Fetch the FB15k train/valid/test triple files if any are missing.

    Downloads and extracts the archive when needed, flattens the extracted
    ``FB15k`` directory into ``self.output_directory``, then removes the
    now-empty directory.
    """
    self.input_train_edges_file = self.output_directory / Path("freebase_mtr100_mte100-train.txt")
    self.input_valid_edges_file = self.output_directory / Path("freebase_mtr100_mte100-valid.txt")
    self.input_test_edges_file = self.output_directory / Path("freebase_mtr100_mte100-test.txt")

    required = [
        self.input_train_edges_file,
        self.input_valid_edges_file,
        self.input_test_edges_file,
    ]
    if any(not f.exists() for f in required):
        archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
        extract_file(archive_path, remove_input=True)
        extracted_dir = self.output_directory / Path("FB15k")
        # Hoist the extracted files one level up, then drop the empty dir.
        for entry in extracted_dir.iterdir():
            entry.rename(self.output_directory / Path(entry.name))
        extracted_dir.rmdir()
def download(self, overwrite=False):
    """Fetch the twitter-2010 edge list unless it is already present."""
    self.input_edges = self.output_directory / Path("twitter-2010.txt")
    if self.input_edges.exists():
        return
    archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
    extract_file(archive_path, remove_input=True)
def download(self, overwrite=False):
    """Fetch the soc-LiveJournal1 edge list unless it is already present.

    After extraction the first four header lines of the file are removed
    via the project ``strip_header`` helper.
    """
    self.input_edges = self.output_directory / Path("soc-LiveJournal1.txt")
    if self.input_edges.exists():
        return
    archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
    extract_file(archive_path, remove_input=True)
    strip_header(self.input_edges, num_lines=4)
def download(self, overwrite=False):
    """Fetch and unpack the products node-classification files if missing.

    Sets the ``input_*`` path attributes; when any expected CSV is absent,
    downloads the archive, decompresses the raw gzipped CSVs, renames them
    into ``self.output_directory``, and flattens the sales_ranking split
    files the same way.
    """
    self.input_edge_list_file = self.output_directory / Path("edge.csv")
    self.input_node_feature_file = self.output_directory / Path("node-feat.csv")
    self.input_node_label_file = self.output_directory / Path("node-label.csv")
    self.input_train_nodes_file = self.output_directory / Path("train.csv")
    self.input_valid_nodes_file = self.output_directory / Path("valid.csv")
    self.input_test_nodes_file = self.output_directory / Path("test.csv")

    expected_files = [
        self.input_edge_list_file,
        self.input_node_feature_file,
        self.input_node_label_file,
        self.input_train_nodes_file,
        self.input_valid_nodes_file,
        self.input_test_nodes_file,
    ]
    if all(f.exists() for f in expected_files):
        return

    archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
    extract_file(archive_path, remove_input=False)

    raw_dir = self.output_directory / Path("products/raw")
    extract_file(raw_dir / Path("edge.csv.gz"))
    extract_file(raw_dir / Path("node-feat.csv.gz"))
    extract_file(raw_dir / Path("node-label.csv.gz"))
    (raw_dir / Path("edge.csv")).rename(self.input_edge_list_file)
    (raw_dir / Path("node-feat.csv")).rename(self.input_node_feature_file)
    (raw_dir / Path("node-label.csv")).rename(self.input_node_label_file)

    split_dir = self.output_directory / Path("products/split/sales_ranking")
    # First pass decompresses the split archives; second pass moves the
    # directory contents up into the output directory.
    for entry in split_dir.iterdir():
        extract_file(entry)
    for entry in split_dir.iterdir():
        entry.rename(self.output_directory / Path(entry.name))
def download(self, overwrite=False):
    """Fetch the arxiv edge list CSV unless it is already present.

    Downloads the archive, decompresses ``arxiv/raw/edge.csv.gz``, and
    renames the result to ``edge.csv`` in ``self.output_directory``.
    """
    self.input_train_edges_file = self.output_directory / Path("edge.csv")
    if self.input_train_edges_file.exists():
        return
    archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
    extract_file(archive_path, remove_input=False)
    raw_edge_gz = self.output_directory / Path("arxiv/raw/edge.csv.gz")
    extract_file(raw_edge_gz)
    (self.output_directory / Path("arxiv/raw/edge.csv")).rename(
        self.input_train_edges_file)
def download(self, overwrite=False):
    """Fetch the papers100M input files if any are missing.

    Note: the edge index and the node features live in the same archive
    member ``data.npz`` (keys ``edge_index`` and ``node_feat``), so both
    attributes point at the same file.
    """
    self.input_edge_list_file = self.output_directory / Path("data.npz")      # key: edge_index
    self.input_node_feature_file = self.output_directory / Path("data.npz")   # key: node_feat
    self.input_node_label_file = self.output_directory / Path("node-label.npz")
    self.input_train_nodes_file = self.output_directory / Path("train.csv")
    self.input_valid_nodes_file = self.output_directory / Path("valid.csv")
    self.input_test_nodes_file = self.output_directory / Path("test.csv")

    expected_files = [
        self.input_edge_list_file,
        self.input_node_feature_file,
        self.input_node_label_file,
        self.input_train_nodes_file,
        self.input_valid_nodes_file,
        self.input_test_nodes_file,
    ]
    if all(f.exists() for f in expected_files):
        return

    archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
    extract_file(archive_path, remove_input=False)

    raw_dir = self.output_directory / Path("papers100M-bin/raw")
    (raw_dir / Path("data.npz")).rename(self.input_node_feature_file)
    (raw_dir / Path("node-label.npz")).rename(self.input_node_label_file)

    split_dir = self.output_directory / Path("papers100M-bin/split/time")
    # First pass decompresses the split files; second pass moves the
    # results up into the output directory.
    for entry in split_dir.iterdir():
        extract_file(entry)
    for entry in split_dir.iterdir():
        entry.rename(self.output_directory / Path(entry.name))
def download(self, overwrite=False, remap_ids=True):
    """Fetch the ppassoc train/valid/test edge tensors if any are missing.

    ``remap_ids`` is accepted for interface compatibility but is not used
    by this method.
    """
    self.input_train_edges_file = self.output_directory / Path("train.pt")
    self.input_valid_edges_file = self.output_directory / Path("valid.pt")
    self.input_test_edges_file = self.output_directory / Path("test.pt")

    required = [
        self.input_train_edges_file,
        self.input_valid_edges_file,
        self.input_test_edges_file,
    ]
    if any(not f.exists() for f in required):
        archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
        extract_file(archive_path, remove_input=False)
        # Flatten the throughput split directory into the output directory.
        for entry in (self.output_directory / Path("ppassoc/split/throughput")).iterdir():
            entry.rename(self.output_directory / Path(entry.name))
def download(self, overwrite=False):
    """Download the Cora dataset and convert it to the expected CSV layout.

    Produces ``edge.csv``, ``node-feat.csv``, ``node-label.csv`` and a
    random 80/10/10 train/valid/test node split (``train.csv``,
    ``valid.csv``, ``test.csv``) in ``self.output_directory``. Skips all
    work when every expected file already exists.
    """
    # These are the files we want to exist by the end of the download.
    self.input_edge_list_file = self.output_directory / Path("edge.csv")
    self.input_node_feature_file = self.output_directory / Path("node-feat.csv")
    self.input_node_label_file = self.output_directory / Path("node-label.csv")
    self.input_train_nodes_file = self.output_directory / Path("train.csv")
    self.input_valid_nodes_file = self.output_directory / Path("valid.csv")
    self.input_test_nodes_file = self.output_directory / Path("test.csv")

    expected_files = [
        self.input_edge_list_file,
        self.input_node_feature_file,
        self.input_node_label_file,
        self.input_train_nodes_file,
        self.input_valid_nodes_file,
        self.input_test_nodes_file,
    ]
    # If all files already exist we don't need to do any processing.
    if all(f.exists() for f in expected_files):
        return

    archive_path = download_url(self.dataset_url, self.output_directory, overwrite)
    extract_file(archive_path, remove_input=False)

    # BUG FIX: the original body read/wrote through an undefined name
    # `dataset_dir`; all artifacts must live in self.output_directory,
    # which is where the existence checks above look for them.
    dataset_dir = self.output_directory

    # cora.content layout: <node id> <binary features...> <label>
    df = pd.read_csv(dataset_dir / Path("cora/cora.content"), sep="\t", header=None)
    feature_cols = df.columns[1:len(df.columns) - 1]

    # Random 80/10/10 split over node row indices.
    indices = np.array(range(len(df)))
    np.random.shuffle(indices)
    n_train = int(0.8 * len(df))
    n_valid = int(0.1 * len(df))
    train_indices = indices[:n_train]
    valid_indices = indices[n_train:n_train + n_valid]
    test_indices = indices[n_train + n_valid:]
    np.savetxt(dataset_dir / Path("train.csv"), train_indices, delimiter=",", fmt="%d")
    np.savetxt(dataset_dir / Path("valid.csv"), valid_indices, delimiter=",", fmt="%d")
    np.savetxt(dataset_dir / Path("test.csv"), test_indices, delimiter=",", fmt="%d")

    # Node features (all columns between the id and the label).
    features = df[feature_cols]
    features.to_csv(index=False, sep=",",
                    path_or_buf=dataset_dir / Path("node-feat.csv"),
                    header=False)

    # Node labels, mapped from class names to integers. switch_to_num is
    # a project helper defined elsewhere in this module.
    labels = df[df.columns[len(df.columns) - 1]]
    labels = labels.apply(switch_to_num)
    labels.to_csv(index=False, sep=",",
                  path_or_buf=dataset_dir / Path("node-label.csv"),
                  header=False)

    # Edges: remap the raw paper ids in cora.cites to dense row indices.
    node_ids = df[df.columns[0]]
    dict_reverse = node_ids.to_dict()
    nodes_dict = {v: k for k, v in dict_reverse.items()}
    df_edges = pd.read_csv(dataset_dir / Path("cora/cora.cites"), sep="\t", header=None)
    df_edges.replace({0: nodes_dict, 1: nodes_dict}, inplace=True)
    df_edges.to_csv(index=False, sep=",",
                    path_or_buf=dataset_dir / Path("edge.csv"),
                    header=False)