def get_graph_kernel_dataset(dataset_ID, feat_norm='zscore'):
    """
    Loads a graph kernel dataset and builds the node features of each graph.

    The node features are obtained by concatenating, in this order:

    - node degrees (sum of each adjacency row), normalized as specified by
    `feat_norm`;
    - clustering coefficients, always normalized with `'zscore'`;
    - node attributes, if available, normalized as specified by `feat_norm`;
    - node labels, if available, always normalized with `'ohe'`.

    :param dataset_ID: identifier of the dataset, passed to `read_graphs_txt`;
    :param feat_norm: normalization mode passed to `node_feat_norm` for the
    node attributes and the node degrees;
    :return:
        - the adjacency matrices, as returned by `nx_to_numpy`;
        - a numpy array with the concatenated node features of each graph;
        - a numpy array containing the one-hot encoded targets.
    """
    print('Loading data')
    nx_graphs, y = read_graphs_txt(dataset_ID)

    # Preprocessing: the encoder needs a 2-D column, hence the trailing axis
    y = np.array(y)[..., None]
    try:
        encoder = OneHotEncoder(sparse_output=False, categories='auto')
    except TypeError:
        # scikit-learn < 1.2 only knows the older `sparse` keyword
        encoder = OneHotEncoder(sparse=False, categories='auto')
    y = encoder.fit_transform(y)

    # Get node attributes
    try:
        A, X_attr, _ = nx_to_numpy(nx_graphs, nf_keys=['attributes'],
                                   auto_pad=False)
        X_attr = node_feat_norm(X_attr, feat_norm)
    except KeyError:
        print('Featureless nodes')
        # No node features are requested here, so X_attr is presumably None
        # and gets skipped below -- confirm against `nx_to_numpy`
        A, X_attr, _ = nx_to_numpy(nx_graphs, auto_pad=False)

    # Get clustering coefficients (always zscore norm)
    clustering_coefficients = [
        np.array(list(nx.clustering(g).values()))[..., None]
        for g in nx_graphs
    ]
    clustering_coefficients = node_feat_norm(clustering_coefficients, 'zscore')

    # Get node degrees
    # NOTE(review): np.array() over per-graph arrays of different sizes is
    # rejected by recent NumPy releases -- confirm whether the graphs can
    # have different numbers of nodes here
    node_degrees = np.array(
        [np.sum(adj, axis=-1, keepdims=True) for adj in A]
    )
    node_degrees = node_feat_norm(node_degrees, feat_norm)

    # Get node labels (always ohe norm)
    try:
        _, X_labs, _ = nx_to_numpy(nx_graphs, nf_keys=['label'],
                                   auto_pad=False)
        X_labs = node_feat_norm(X_labs, 'ohe')
    except KeyError:
        print('Label-less nodes')
        X_labs = None

    # Concatenate features
    Xs = [node_degrees, clustering_coefficients]
    if X_attr is not None:
        Xs.append(X_attr)
    if X_labs is not None:
        Xs.append(X_labs)
    # zip(*Xs) groups the feature blocks that belong to the same graph
    X = [np.concatenate(x_, axis=-1) for x_ in zip(*Xs)]
    X = np.array(X)

    return A, X, y
def generate_graph_matrices(graphs, auto_pad=False):
    """
    Convert a list of graphs to adjacency, node feature and edge feature
    matrices, re-encoding the features with `label_to_one_hot`.

    The node features are read from the `'atomic_num'` key and the edge
    features from the `'bond_type'` key; self loops are requested from
    `nx_to_numpy`.

    :param graphs: graphs to convert, passed to `nx_to_numpy`;
    :param auto_pad: forwarded to `nx_to_numpy` as its `auto_pad` argument;
    :return: the adjacency matrices, the list of encoded node features and
    the list of encoded edge features.
    """

    def _encode(matrices):
        # Collect every distinct value found across all graphs, then encode
        # each matrix against that shared set of values
        vocabulary = np.unique(
            [value for matrix in matrices for value in np.unique(matrix)]
        )
        return [label_to_one_hot(matrix, vocabulary) for matrix in matrices]

    adjacency, node_feats, edge_feats = nx_to_numpy(
        graphs,
        nf_keys=['atomic_num'],
        ef_keys=['bond_type'],
        auto_pad=auto_pad,
        self_loops=True,
    )
    node_feats = _encode(node_feats)
    edge_feats = _encode(edge_feats)
    return adjacency, node_feats, edge_feats
def generate_graph_matrices(graphs, auto_pad=False):
    """
    Generate the A, X, E matrices from a list of graphs.

    Node features are read from the `'atomic_num'` key and edge features from
    the `'bond_type'` key, with self loops requested from `nx_to_numpy`. Both
    are then re-encoded with `label_to_one_hot`, using the distinct values
    found across all graphs.

    :param graphs: list of networkx graphs
    :param auto_pad: bool. forwarded to `nx_to_numpy` as `auto_pad`
    :return A, X, E
    """
    adj_mats, node_mats, edge_mats = nx_to_numpy(
        graphs,
        nf_keys=['atomic_num'],
        ef_keys=['bond_type'],
        auto_pad=auto_pad,
        self_loops=True,
    )

    # Distinct node values over the whole dataset
    node_values = []
    for node_mat in node_mats:
        node_values.extend(np.unique(node_mat))
    node_vocab = np.unique(node_values)

    encoded_nodes = []
    for node_mat in node_mats:
        encoded_nodes.append(label_to_one_hot(node_mat, node_vocab))

    # Distinct edge values over the whole dataset
    edge_values = []
    for edge_mat in edge_mats:
        edge_values.extend(np.unique(edge_mat))
    edge_vocab = np.unique(edge_values)

    encoded_edges = []
    for edge_mat in edge_mats:
        encoded_edges.append(label_to_one_hot(edge_mat, edge_vocab))

    return adj_mats, encoded_nodes, encoded_edges
def load_data(nf_keys=None, ef_keys=None, auto_pad=True, self_loops=False,
              amount=None, return_type='numpy'):
    """
    Loads the QM9 chemical data set of small molecules.

    Nodes represent heavy atoms (hydrogens are discarded), edges represent
    chemical bonds.

    The node features represent the chemical properties of each atom, and are
    loaded according to the `nf_keys` argument.
    See `spektral.datasets.qm9.NODE_FEATURES` for possible node features, and
    see [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx)
    for the meaning of each property. Usually, it is sufficient to load the
    atomic number.

    The edge features represent the type and stereoscopy of each chemical bond
    between two atoms.
    See `spektral.datasets.qm9.EDGE_FEATURES` for possible edge features, and
    see [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx)
    for the meaning of each property. Usually, it is sufficient to load the
    type of bond.

    :param nf_keys: list or str, node features to return (see `qm9.NODE_FEATURES`
    for available features); if `None`, all of `NODE_FEATURES` are used;
    :param ef_keys: list or str, edge features to return (see `qm9.EDGE_FEATURES`
    for available features); if `None`, all of `EDGE_FEATURES` are used;
    :param auto_pad: if `return_type='numpy'`, zero pad graph matrices to have
    the same number of nodes;
    :param self_loops: if `return_type='numpy'`, add self loops to adjacency
    matrices;
    :param amount: the amount of molecules to return (in ascending order by
    number of atoms).
    :param return_type: `'numpy'`, `'networkx'`, or `'sdf'`, data format to return;
    :return:
    - if `return_type='numpy'`, the adjacency matrix, node features,
    edge features, and a Pandas dataframe containing labels;
    - if `return_type='networkx'`, a list of graphs in Networkx format,
    and a dataframe containing labels;
    - if `return_type='sdf'`, a list of molecules in the internal SDF format and
    a dataframe containing labels.
    :raises ValueError: if `return_type` is not one of `RETURN_TYPES`.
    """
    if return_type not in RETURN_TYPES:
        raise ValueError('Possible return_type: {}'.format(RETURN_TYPES))

    if not os.path.exists(DATA_PATH):
        _download_data()  # Try to download dataset

    print('Loading QM9 dataset.')
    sdf_file = os.path.join(DATA_PATH, 'qm9.sdf')
    data = load_sdf(sdf_file, amount=amount)  # Internal SDF format

    # Load labels
    labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv')
    labels = load_csv(labels_file)
    if amount is not None:
        labels = labels[:amount]

    # Strings are compared with `==`: `is` tests object identity, which only
    # happens to work for interned literals and fails for equal strings
    # built at runtime.
    if return_type == 'sdf':
        return data, labels

    # Convert to Networkx
    data = [sdf_to_nx(mol) for mol in data]
    if return_type == 'networkx':
        return data, labels

    if return_type == 'numpy':
        # A single key may be given as a plain string; None selects everything
        if nf_keys is None:
            nf_keys = NODE_FEATURES
        elif isinstance(nf_keys, str):
            nf_keys = [nf_keys]
        if ef_keys is None:
            ef_keys = EDGE_FEATURES
        elif isinstance(ef_keys, str):
            ef_keys = [ef_keys]

        adj, nf, ef = nx_to_numpy(data,
                                  auto_pad=auto_pad, self_loops=self_loops,
                                  nf_keys=nf_keys, ef_keys=ef_keys)
        return adj, nf, ef, labels

    # Should not get here
    raise RuntimeError()
def load_data(return_type='numpy', nf_keys=None, ef_keys=None, auto_pad=True,
              self_loops=False, amount=None):
    """
    Loads the QM9 molecules dataset.

    Molecules are converted to Networkx with `keep_hydrogen=True`.

    :param return_type: 'networkx', 'numpy', or 'sdf', data format to return;
    :param nf_keys: list or str, node features to return (see `qm9.NODE_FEATURES`
    for available features); if `None`, all of `NODE_FEATURES` are used;
    :param ef_keys: list or str, edge features to return (see `qm9.EDGE_FEATURES`
    for available features); if `None`, all of `EDGE_FEATURES` are used;
    :param auto_pad: if `return_type='numpy'`, zero pad graph matrices to have
    the same number of nodes;
    :param self_loops: if `return_type='numpy'`, add self loops to adjacency
    matrices;
    :param amount: the amount of molecules to return (in order).
    :return: if `return_type='numpy'`, the adjacency matrix, node features,
    edge features, and a Pandas dataframe containing labels;
    if `return_type='networkx'`, a list of graphs in Networkx format,
    and a dataframe containing labels;
    if `return_type='sdf'`, a list of molecules in the internal SDF format and
    a dataframe containing labels.
    :raises ValueError: if `return_type` is not one of `RETURN_TYPES`.
    """
    if return_type not in RETURN_TYPES:
        raise ValueError('Possible return_type: {}'.format(RETURN_TYPES))

    if not os.path.exists(DATA_PATH):
        dataset_downloader()  # Try to download dataset

    print('Loading QM9 dataset.')
    sdf_file = os.path.join(DATA_PATH, 'qm9.sdf')
    data = load_sdf(sdf_file, amount=amount)  # Internal SDF format

    # Load labels
    labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv')
    labels = load_csv(labels_file)
    if amount is not None:
        labels = labels[:amount]

    # Strings are compared with `==`: `is` tests object identity, which only
    # happens to work for interned literals and fails for equal strings
    # built at runtime.
    if return_type == 'sdf':
        return data, labels

    # Convert to Networkx
    data = [sdf_to_nx(mol, keep_hydrogen=True) for mol in data]
    if return_type == 'networkx':
        return data, labels

    if return_type == 'numpy':
        # A single key may be given as a plain string; None selects everything
        if nf_keys is None:
            nf_keys = NODE_FEATURES
        elif isinstance(nf_keys, str):
            nf_keys = [nf_keys]
        if ef_keys is None:
            ef_keys = EDGE_FEATURES
        elif isinstance(ef_keys, str):
            ef_keys = [ef_keys]

        adj, nf, ef = nx_to_numpy(data,
                                  auto_pad=auto_pad, self_loops=self_loops,
                                  nf_keys=nf_keys, ef_keys=ef_keys)
        return adj, nf, ef, labels

    # Should not get here
    raise RuntimeError()
def load_data(dataset_name, normalize_features=None, clean=False):
    """
    Loads one of the Benchmark Data Sets for Graph Kernels from TU Dortmund
    ([link](https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets)).

    The node features are computed by concatenating, in this order, the
    following features for each node:

    - node degrees, normalized with z-score;
    - clustering coefficient, normalized with z-score;
    - node attributes, if available, normalized as specified in
    `normalize_features`;
    - node labels, if available, one-hot encoded.

    :param dataset_name: name of the dataset to load (see
    `spektral.datasets.tud.AVAILABLE_DATASETS`).
    :param normalize_features: `None`, `'zscore'` or `'ohe'`, how to normalize
    the node features (only works for node attributes).
    :param clean: if True, return a version of the dataset with no isomorphic
    graphs.
    :return:
    - a list of adjacency matrices;
    - a list of node feature matrices;
    - a numpy array containing the one-hot encoded targets.
    :raises ValueError: if `dataset_name` is not in `AVAILABLE_DATASETS`.
    """
    if dataset_name not in AVAILABLE_DATASETS:
        raise ValueError('Available datasets: {}'.format(AVAILABLE_DATASETS))

    # The name is validated before the suffix of the clean variant is added
    if clean:
        dataset_name += '_clean'
    # NOTE(review): plain concatenation -- assumes DATA_PATH ends with a path
    # separator; confirm against its definition
    if not os.path.exists(DATA_PATH + dataset_name):
        _download_data(dataset_name)

    # Read data
    nx_graphs, y = _read_graphs(dataset_name)

    # Preprocessing: the encoder needs a 2-D column, hence the trailing axis
    y = np.array(y)[..., None]
    try:
        encoder = OneHotEncoder(sparse_output=False, categories='auto')
    except TypeError:
        # scikit-learn < 1.2 only knows the older `sparse` keyword
        encoder = OneHotEncoder(sparse=False, categories='auto')
    y = encoder.fit_transform(y)

    # Get node attributes
    try:
        A, X_attr, _ = nx_to_numpy(nx_graphs, nf_keys=['attributes'],
                                   auto_pad=False)
        X_attr = _normalize_node_features(X_attr, normalize_features)
    except KeyError:
        print('Featureless nodes')
        # No node features are requested here, so X_attr is presumably None
        # and gets skipped below -- confirm against `nx_to_numpy`
        A, X_attr, _ = nx_to_numpy(nx_graphs, auto_pad=False)

    # Get clustering coefficients (always zscore norm)
    clustering_coefficients = [
        np.array(list(nx.clustering(g).values()))[..., None]
        for g in nx_graphs
    ]
    clustering_coefficients = _normalize_node_features(
        clustering_coefficients, 'zscore'
    )

    # Get node degrees (always zscore norm)
    # NOTE(review): np.array() over per-graph arrays of different sizes is
    # rejected by recent NumPy releases -- confirm whether the graphs can
    # have different numbers of nodes here
    node_degrees = np.array(
        [np.sum(adj, axis=-1, keepdims=True) for adj in A]
    )
    node_degrees = _normalize_node_features(node_degrees, 'zscore')

    # Get node labels
    try:
        _, X_labs, _ = nx_to_numpy(nx_graphs, nf_keys=['label'],
                                   auto_pad=False)
        X_labs = _normalize_node_features(X_labs, 'ohe')
    except KeyError:
        print('Label-less nodes')
        X_labs = None

    # Concatenate features
    Xs = [node_degrees, clustering_coefficients]
    if X_attr is not None:
        Xs.append(X_attr)
    if X_labs is not None:
        Xs.append(X_labs)
    # zip(*Xs) groups the feature blocks that belong to the same graph
    X = [np.concatenate(x_, axis=-1) for x_ in zip(*Xs)]
    X = np.array(X)

    return A, X, y