def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./bbbp_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/bbbp.zip'
    data_path = get_download_dir() + '/bbbp.zip'
    dir_path = get_download_dir() + '/bbbp'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/BBBP.csv')

    super(BBBP, self).__init__(df=df,
                               smiles_to_graph=smiles_to_graph,
                               node_featurizer=node_featurizer,
                               edge_featurizer=edge_featurizer,
                               smiles_column='smiles',
                               cache_file_path=cache_file_path,
                               task_names=['p_np'],
                               load=load,
                               log_every=log_every,
                               init_mask=True,
                               n_jobs=n_jobs)

    self.load_full = False
    self.names = df['name'].tolist()
    self.names = [self.names[i] for i in self.valid_ids]
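# Usage sketch (illustrative, not part of the original source). Assuming the
# constructor above is dgllife.data.BBBP and DGL/DGL-LifeSci are installed,
# a single datapoint is a (SMILES, DGLGraph, label, mask) 4-tuple because
# init_mask=True.
from dgllife.data import BBBP
from dgllife.utils import CanonicalAtomFeaturizer

bbbp = BBBP(node_featurizer=CanonicalAtomFeaturizer())
smiles, g, label, mask = bbbp[0]  # mask flags labels that are actually measured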
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./muv_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/muv.zip'
    data_path = get_download_dir() + '/muv.zip'
    dir_path = get_download_dir() + '/muv'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/muv.csv')

    self.ids = df['mol_id'].tolist()
    self.load_full = False
    df = df.drop(columns=['mol_id'])

    super(MUV, self).__init__(df=df,
                              smiles_to_graph=smiles_to_graph,
                              node_featurizer=node_featurizer,
                              edge_featurizer=edge_featurizer,
                              smiles_column='smiles',
                              cache_file_path=cache_file_path,
                              load=load,
                              log_every=log_every,
                              init_mask=True,
                              n_jobs=n_jobs)

    self.ids = [self.ids[i] for i in self.valid_ids]
def __init__(self, subset, mol_to_graph=mol_to_bigraph,
             node_featurizer=default_node_featurizer,
             edge_featurizer=default_edge_featurizer,
             atom_pair_featurizer=default_atom_pair_featurizer,
             load=True):
    assert subset in ['train', 'val', 'test'], \
        'Expect subset to be "train" or "val" or "test", got {}'.format(subset)
    print('Preparing {} subset of USPTO'.format(subset))
    self._subset = subset
    if subset == 'val':
        subset = 'valid'

    self._url = 'dataset/uspto.zip'
    data_path = get_download_dir() + '/uspto.zip'
    extracted_data_path = get_download_dir() + '/uspto'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, extracted_data_path)

    super(USPTO, self).__init__(
        raw_file_path=extracted_data_path + '/{}.txt'.format(subset),
        mol_graph_path=extracted_data_path + '/{}_mol_graphs.bin'.format(subset),
        mol_to_graph=mol_to_graph,
        node_featurizer=node_featurizer,
        edge_featurizer=edge_featurizer,
        atom_pair_featurizer=atom_pair_featurizer,
        load=load)
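# Usage sketch (illustrative). The class name USPTO is taken from the super()
# call above; building the 'train' subset downloads and preprocesses the
# reaction data on first use and caches the molecular graphs afterwards.
uspto_train = USPTO('train')
print(len(uspto_train))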
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./sider_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/sider.zip'
    data_path = get_download_dir() + '/sider.zip'
    dir_path = get_download_dir() + '/sider'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/sider.csv')

    super(SIDER, self).__init__(df=df,
                                smiles_to_graph=smiles_to_graph,
                                node_featurizer=node_featurizer,
                                edge_featurizer=edge_featurizer,
                                smiles_column='smiles',
                                cache_file_path=cache_file_path,
                                load=load,
                                log_every=log_every,
                                init_mask=True,
                                n_jobs=n_jobs)
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='freesolv_dglgraph.bin'):
    self._url = 'dataset/FreeSolv.zip'
    data_path = get_download_dir() + '/FreeSolv.zip'
    dir_path = get_download_dir() + '/FreeSolv'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/SAMPL.csv')

    # Iupac names
    self.iupac_names = df['iupac'].tolist()
    # Calculated hydration free energy
    self.calc_energy = df['calc'].tolist()
    self.load_full = False

    super(FreeSolv, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['expt'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=False)
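# Usage sketch (illustrative). Assuming the constructor above is
# dgllife.data.FreeSolv: with init_mask=False, indexing returns a
# (SMILES, DGLGraph, label) 3-tuple, the label being the experimental
# hydration free energy ('expt').
from dgllife.data import FreeSolv
from dgllife.utils import CanonicalAtomFeaturizer

freesolv = FreeSolv(node_featurizer=CanonicalAtomFeaturizer())
smiles, g, label = freesolv[0]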
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='lipophilicity_dglgraph.bin'):
    self._url = 'dataset/lipophilicity.zip'
    data_path = get_download_dir() + '/lipophilicity.zip'
    dir_path = get_download_dir() + '/lipophilicity'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/Lipophilicity.csv')

    # ChEMBL ids
    self.chembl_ids = df['CMPD_CHEMBLID'].tolist()
    self.load_full = False

    super(Lipophilicity, self).__init__(df=df,
                                        smiles_to_graph=smiles_to_graph,
                                        node_featurizer=node_featurizer,
                                        edge_featurizer=edge_featurizer,
                                        smiles_column='smiles',
                                        cache_file_path=cache_file_path,
                                        task_names=['exp'],
                                        load=load,
                                        log_every=log_every,
                                        init_mask=False)
def __init__(self, mode='dev', transform=None): assert mode in ['dev', 'valid', 'test'], "mode should be dev/valid/test" self.mode = mode self.transform = transform self.file_dir = pathlib.Path(get_download_dir(), mode) self.zip_file_path = pathlib.Path(get_download_dir(), '%s.zip' % mode) download(_urls['Alchemy'] + "%s.zip" % mode, path=str(self.zip_file_path)) extract_archive(str(self.zip_file_path), str(self.file_dir)) self._load()
def load_acm_raw():
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    # Fetch ACM.mat before loading it; the original defined `url` but never
    # downloaded the file, so sio.loadmat would fail on a fresh machine.
    download(_get_dgl_url(url), path=data_path)
    data = sio.loadmat(data_path)

    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that
    p_vs_p = data['PvsP']  # paper-paper, unused below

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    p_num = p_vs_a.shape[0]
    a_num = p_vs_a.shape[1]
    l_num = p_vs_l.shape[1]

    # Write metapath2vec-style node and edge lists. Node ids are laid out as
    # papers first, then authors, then fields.
    os.makedirs("./mp2v_data", exist_ok=True)
    f_1 = open("./mp2v_data/acm_paper", "w")
    f_2 = open("./mp2v_data/acm_author", "w")
    f_3 = open("./mp2v_data/acm_field", "w")
    f_4 = open("./mp2v_data/acm_paper_author", "w")
    f_5 = open("./mp2v_data/acm_paper_field", "w")

    pa = p_vs_a.tocoo()
    pa_row_col = [pa.row, [i + p_num for i in pa.col]]
    for i in range(p_num):
        f_1.write(str(i) + ' i' + str(i) + '\n')
    for i in range(p_num, p_num + a_num):
        f_2.write(str(i) + ' a' + str(i) + '\n')
    for i in range(p_num + a_num, p_num + a_num + l_num):
        f_3.write(str(i) + ' f' + str(i) + '\n')
    for i in range(len(pa_row_col[0])):
        f_4.write(str(pa_row_col[0][i]) + ' ' + str(pa_row_col[1][i]) + '\n')

    pl = p_vs_l.tocoo()
    pl_row_col = [pl.row, [i + p_num + a_num for i in pl.col]]
    for i in range(len(pl_row_col[0])):
        f_5.write(str(pl_row_col[0][i]) + ' ' + str(pl_row_col[1][i]) + '\n')

    f_1.close()
    f_2.close()
    f_3.close()
    f_4.close()
    f_5.close()
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='AstraZeneca_chembl_solubility_graph.bin',
             log_of_values=True):
    self._url = 'dataset/AstraZeneca_ChEMBL_Solubility.csv'
    data_path = get_download_dir() + '/AstraZeneca_ChEMBL_Solubility.csv'
    download(_get_dgl_url(self._url), path=data_path)
    df = pd.read_csv(data_path)

    # ChEMBL ids
    self.chembl_ids = df['Molecule ChEMBL ID'].tolist()
    # Molecular weight
    self.mol_weight = df['Molecular Weight'].tolist()
    self.load_full = False

    super(AstraZenecaChEMBLSolubility, self).__init__(
        df=df,
        smiles_to_graph=smiles_to_graph,
        node_featurizer=node_featurizer,
        edge_featurizer=edge_featurizer,
        smiles_column='Smiles',
        cache_file_path=cache_file_path,
        task_names=['Solubility'],
        load=load,
        log_every=log_every,
        init_mask=False)

    if log_of_values:
        self.labels = self.labels.log()
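# Usage sketch (illustrative). Assuming the constructor above is
# dgllife.data.AstraZenecaChEMBLSolubility: with log_of_values=True the stored
# labels are the natural log of the solubility values, and each datapoint is a
# (SMILES, DGLGraph, label) 3-tuple since init_mask=False.
from dgllife.data import AstraZenecaChEMBLSolubility
from dgllife.utils import CanonicalAtomFeaturizer

dataset = AstraZenecaChEMBLSolubility(node_featurizer=CanonicalAtomFeaturizer())
smiles, g, log_solubility = dataset[0]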
def _download(self):
    download_dir = get_download_dir()
    zip_file_path = os.path.join(download_dir, "tu_{}.zip".format(self.name))
    download(self._url.format(self.name), path=zip_file_path)
    extract_dir = os.path.join(download_dir, "tu_{}".format(self.name))
    extract_archive(zip_file_path, extract_dir)
    return extract_dir
def eval_imdb_mg2v():
    m_num = 4183
    a_num = 5084
    d_num = 2004
    data_path = get_download_dir() + '/imdb_3_class.pkl'
    # download(_get_dgl_url(url), path=data_path)
    f = open(data_path, mode="rb")
    data = pickle.load(f)
    labels = np.array(data['labels'])
    features = np.zeros((m_num, 64))
    labels_arr = np.zeros((m_num, 3))

    ff = open("/home/xuyou/deepwalk/MetaGraph2Vec/RandomWalk2Vec/imdb_g2v", "r")
    for line in ff:
        nums = line.split()
        if len(nums) == 2 or nums[0][0] != 'p':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(0, m_num):
            continue
        features[id] = fea
        lab = labels[id]
        labels_arr[id][lab] = 1

    ids = list(range(m_num))
    np.random.shuffle(ids)
    my_KNN(features[ids[:2500]], labels_arr[ids[:2500]])
def eval_imdb():
    m_num = 4183
    a_num = 5084
    d_num = 2004
    data_path = get_download_dir() + '/imdb_3_class.pkl'
    # download(_get_dgl_url(url), path=data_path)
    f = open(data_path, mode="rb")
    data = pickle.load(f)
    labels = np.array(data['labels'])
    features = np.zeros((m_num, 64))
    labels_arr = np.zeros((m_num, 3))

    ff = open(
        "/home/xuyou/dgl/examples/pytorch/han/mp2v_data/acm_output_emb40.txt", "r")
    for line in ff:
        nums = line.split()
        if len(nums) == 2 or nums[0][0] != 'i':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(0, m_num):
            continue
        features[id] = fea
        lab = labels[id]
        labels_arr[id][lab] = 1

    ids = list(range(m_num))
    np.random.shuffle(ids)
    my_KNN(features[ids[:2500]], labels_arr[ids[:2500]])
def eval_dblp_mg2v():
    f = open("/home/xuyou/deepwalk/MetaGraph2Vec/RandomWalk2Vec/dblp_g2v", "r")
    data_path = get_download_dir() + '/LabDBLP.mat'
    a_num = 4057
    p_num = 14328
    c_num = 20
    t_num = 8898
    labels_arr = np.zeros((4057, 4))
    features = np.zeros((4057, 64))
    data = sio.loadmat(data_path)
    labels = data['Aut_lab'][:, 1] - 1

    for line in f:
        nums = line.split()
        if len(nums) == 2:
            continue
        if nums[0][0] != 'a':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(p_num, p_num + a_num):
            continue
        features[id - p_num] = fea
        lab = labels[id - p_num]
        labels_arr[id - p_num][lab] = 1

    my_KNN(features[:2500], labels_arr[:2500])
def eval_dblp():
    f = open(
        "/home/xuyou/dgl/examples/pytorch/han/mp2v_data/dblp_output_emb40.txt", "r")
    data_path = get_download_dir() + '/LabDBLP.mat'
    a_num = 4057
    p_num = 14328
    c_num = 20
    t_num = 8898
    labels_arr = np.zeros((4057, 4))
    features = np.zeros((4057, 64))
    data = sio.loadmat(data_path)
    labels = data['Aut_lab'][:, 1] - 1

    for line in f:
        nums = line.split()
        if len(nums) == 2:
            continue
        if nums[0][0] != 'a':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(p_num, p_num + a_num):
            continue
        features[id - p_num] = fea
        lab = labels[id - p_num]
        labels_arr[id - p_num][lab] = 1

    my_KNN(features[:2500], labels_arr[:2500])
def __init__(self, mode='dev', transform=None): assert mode in ['dev', 'valid', 'test'], "mode should be dev/valid/test" self.mode = mode self.transform = transform self.file_dir = pathlib.Path(get_download_dir(), mode) self.zip_file_path = pathlib.Path(get_download_dir(), '%s_v20190730.zip' % mode) # download(_urls['Alchemy'] + "%s_v20190730.zip" % mode, # path=str(self.zip_file_path)) # extract_archive(str(self.zip_file_path), str(self.file_dir)) sub_dirs = os.listdir(self.file_dir) if 'sdf' not in sub_dirs: assert len(sub_dirs) == 1 self.file_dir = os.path.join(self.file_dir, sub_dirs[0]) self._load()
def __init__(self, hidden_size, latent_size, depth, vocab_file=None):
    super(DGLJTNNVAE, self).__init__()

    if vocab_file is None:
        default_dir = get_download_dir()
        vocab_file = '{}/jtvae/{}.txt'.format(default_dir, 'vocab')
        zip_file_path = '{}/jtvae.zip'.format(default_dir)
        download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path)
        extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

    with open(vocab_file, 'r') as f:
        self.vocab = Vocab([x.strip("\r\n ") for x in f])

    self.hidden_size = hidden_size
    self.latent_size = latent_size
    self.depth = depth

    self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
    self.mpn = DGLMPN(hidden_size, depth)
    self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
    self.decoder = DGLJTNNDecoder(self.vocab, hidden_size, latent_size // 2, self.embedding)
    self.jtmpn = DGLJTMPN(hidden_size, depth)

    self.T_mean = nn.Linear(hidden_size, latent_size // 2)
    self.T_var = nn.Linear(hidden_size, latent_size // 2)
    self.G_mean = nn.Linear(hidden_size, latent_size // 2)
    self.G_var = nn.Linear(hidden_size, latent_size // 2)

    self.atom_featurizer_enc = get_atom_featurizer_enc()
    self.bond_featurizer_enc = get_bond_featurizer_enc()
    self.atom_featurizer_dec = get_atom_featurizer_dec()
    self.bond_featurizer_dec = get_bond_featurizer_dec()
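# Usage sketch (illustrative). hidden_size=450, latent_size=56 and depth=3 are
# the hyperparameters from the original JT-VAE paper, used here only as an
# example; with vocab_file=None the pre-built vocabulary is downloaded first.
model = DGLJTNNVAE(hidden_size=450, latent_size=56, depth=3)
print(model.vocab.size())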
def __init__(self, data, vocab, training=True):
    dir = get_download_dir()
    _url = _get_dgl_url('dataset/jtnn.zip')
    zip_file_path = '{}/jtnn.zip'.format(dir)
    download(_url, path=zip_file_path)
    extract_archive(zip_file_path, '{}/jtnn'.format(dir))

    print('Loading data...')
    if data in ['train', 'test']:
        # ZINC subset
        data_file = '{}/jtnn/{}.txt'.format(dir, data)
    else:
        # New dataset
        data_file = data
    with open(data_file) as f:
        self.data = [line.strip("\r\n ").split()[0] for line in f]
    self.vocab = vocab
    print('Loading finished')
    print('\t# samples:', len(self.data))

    self.training = training
    self.atom_featurizer_enc = get_atom_featurizer_enc()
    self.bond_featurizer_enc = get_bond_featurizer_enc()
    self.atom_featurizer_dec = get_atom_featurizer_dec()
    self.bond_featurizer_dec = get_bond_featurizer_dec()
def convert_reddit_data(dataset, out_folder, self_loop=False):
    """Convert the preprocessed Reddit dataset into .npz/.npy files
    (adjacency matrix, features, labels and train/val/test masks)."""
    self_loop_str = ""
    if self_loop:
        self_loop_str = "_self_loop"
    download_dir = get_download_dir()
    extract_dir = os.path.join(download_dir, "{}{}".format(dataset, self_loop_str))

    coo_adj = scipy.sparse.load_npz(
        os.path.join(extract_dir, "{}{}_graph.npz".format(dataset, self_loop_str)))
    reddit_data = np.load(
        os.path.join(extract_dir, "{}_data.npz".format(dataset)))
    features = reddit_data["feature"]
    labels = reddit_data["label"]
    node_types = reddit_data["node_types"]
    train_mask = (node_types == 1)
    val_mask = (node_types == 2)
    test_mask = (node_types == 3)

    scipy.sparse.save_npz(os.path.join(out_folder, 'adj.npz'), coo_adj)
    np.save(os.path.join(out_folder, 'feat.npy'), features)
    np.save(os.path.join(out_folder, 'labels.npy'), labels)
    np.save(os.path.join(out_folder, 'train.npy'), train_mask)
    np.save(os.path.join(out_folder, 'val.npy'), val_mask)
    np.save(os.path.join(out_folder, 'test.npy'), test_mask)
    print('Conversion finished')
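# Example call (illustrative). It assumes the Reddit archive has already been
# downloaded and extracted under the DGL download directory (e.g. by a prior
# dgl.data.RedditDataset run) and that the output folder exists.
import os

out_folder = './reddit_numpy'  # hypothetical output location
os.makedirs(out_folder, exist_ok=True)
convert_reddit_data('reddit', out_folder, self_loop=False)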
def __init__(self, name):
    self.name = name
    self.dir = get_download_dir()
    tgz_path = os.path.join(self.dir, '{}.tgz'.format(self.name))
    download(_downlaod_prefix + '{}.tgz'.format(self.name), tgz_path)
    self.dir = os.path.join(self.dir, self.name)
    extract_archive(tgz_path, self.dir)
def __init__(self, subset, load_binding_pocket=True, sanitize=False, calc_charges=False,
             remove_hs=False, use_conformation=True,
             construct_graph_and_featurize=ACNN_graph_construction_and_featurization,
             zero_padding=True, num_processes=64):
    self.task_names = ['-logKd/Ki']
    self.n_tasks = len(self.task_names)

    self._url = 'dataset/pdbbind_v2015.tar.gz'
    root_dir_path = get_download_dir()
    data_path = root_dir_path + '/pdbbind_v2015.tar.gz'
    extracted_data_path = root_dir_path + '/pdbbind_v2015'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, extracted_data_path)

    if subset == 'core':
        index_label_file = extracted_data_path + '/v2015/INDEX_core_data.2013'
    elif subset == 'refined':
        index_label_file = extracted_data_path + '/v2015/INDEX_refined_data.2015'
    else:
        raise ValueError(
            'Expect the subset_choice to be either '
            'core or refined, got {}'.format(subset))

    self._preprocess(extracted_data_path, index_label_file, load_binding_pocket,
                     sanitize, calc_charges, remove_hs, use_conformation,
                     construct_graph_and_featurize, zero_padding, num_processes)
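# Usage sketch (illustrative). Assuming the constructor above is
# dgllife.data.PDBBind, the small 'core' subset is the cheapest to build;
# graph construction and featurization of the protein-ligand complexes happen
# inside __init__.
from dgllife.data import PDBBind

core_set = PDBBind(subset='core', load_binding_pocket=True)
print(len(core_set))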
def __init__(self, data, vocab, training=True):
    self.dir = get_download_dir()
    self.zip_file_path = '{}/jtnn.zip'.format(self.dir)
    download(_url, path=self.zip_file_path)
    extract_archive(self.zip_file_path, '{}/jtnn'.format(self.dir))

    print('Loading data...')
    if data in ['train', 'test']:
        data_file = '{}/jtnn/{}.txt'.format(self.dir, data)
    else:
        data_file = data
    with open(data_file) as f:
        self.data = [line.strip("\r\n ").split()[0] for line in f]

    if vocab == 'zinc':
        self.vocab_file = '{}/jtnn/vocab.txt'.format(self.dir)
    elif vocab == 'guacamol':
        self.vocab_file = '{}/jtnn/vocab_guacamol.txt'.format(self.dir)
    else:
        self.vocab_file = vocab

    print('Loading finished.')
    print('\tNum samples:', len(self.data))
    print('\tVocab file:', self.vocab_file)
    self.training = training
    self.vocab = Vocab([x.strip("\r\n ") for x in open(self.vocab_file)])
def __init__(self, hidden_size, latent_size, depth, vocab=None, vocab_file=None):
    super(DGLJTNNVAE, self).__init__()

    if vocab is None:
        if vocab_file is None:
            vocab_file = '{}/jtnn/{}.txt'.format(get_download_dir(), 'vocab')
        self.vocab = Vocab([x.strip("\r\n ") for x in open(vocab_file)])
    else:
        self.vocab = vocab

    self.hidden_size = hidden_size
    self.latent_size = latent_size
    self.depth = depth

    self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
    self.mpn = DGLMPN(hidden_size, depth)
    self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
    self.decoder = DGLJTNNDecoder(
        self.vocab, hidden_size, latent_size // 2, self.embedding)
    self.jtmpn = DGLJTMPN(hidden_size, depth)

    self.T_mean = nn.Linear(hidden_size, latent_size // 2)
    self.T_var = nn.Linear(hidden_size, latent_size // 2)
    self.G_mean = nn.Linear(hidden_size, latent_size // 2)
    self.G_var = nn.Linear(hidden_size, latent_size // 2)

    self.n_nodes_total = 0
    self.n_passes = 0
    self.n_edges_total = 0
    self.n_tree_nodes_total = 0
def __init__(self, hidden_size, latent_size, depth, vocab=None, vocab_file=None):
    super(DGLJTNNVAE, self).__init__()

    if vocab is None:
        if vocab_file is None:
            default_dir = get_download_dir()
            vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
            zip_file_path = '{}/jtnn.zip'.format(default_dir)
            download(_get_dgl_url('dataset/jtnn.zip'), path=zip_file_path)
            extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))
        self.vocab = Vocab([x.strip("\r\n ") for x in open(vocab_file)])
    else:
        self.vocab = vocab

    self.hidden_size = hidden_size
    self.latent_size = latent_size
    self.depth = depth

    self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
    self.mpn = DGLMPN(hidden_size, depth)
    self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
    self.decoder = DGLJTNNDecoder(
        self.vocab, hidden_size, latent_size // 2, self.embedding)
    self.jtmpn = DGLJTMPN(hidden_size, depth)

    self.T_mean = nn.Linear(hidden_size, latent_size // 2)
    self.T_var = nn.Linear(hidden_size, latent_size // 2)
    self.G_mean = nn.Linear(hidden_size, latent_size // 2)
    self.G_var = nn.Linear(hidden_size, latent_size // 2)

    self.n_nodes_total = 0
    self.n_passes = 0
    self.n_edges_total = 0
    self.n_tree_nodes_total = 0
def __init__(self, file_path=None):
    if file_path is None:
        from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

        default_dir = get_download_dir()
        vocab_file = '{}/jtvae/vocab.txt'.format(default_dir)
        zip_file_path = '{}/jtvae.zip'.format(default_dir)
        download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path, overwrite=False)
        extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

        with open(vocab_file, 'r') as f:
            self.vocab = [x.strip("\r\n ") for x in f]
    else:
        # Prepare a vocabulary from scratch
        vocab = set()
        with open(file_path, 'r') as f:
            for line in f:
                smiles = line.split()[0]
                mol = MolTree(smiles)
                for i in mol.nodes_dict:
                    vocab.add(mol.nodes_dict[i]['smiles'])
        self.vocab = list(vocab)

    self.vmap = {x: i for i, x in enumerate(self.vocab)}
    self.slots = [get_slots(smiles) for smiles in self.vocab]
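# Usage sketch (illustrative). The class name JTVAEVocab is an assumption about
# how the constructor above is exposed (as in dgllife.utils). With no file_path
# it falls back to the pre-built ZINC vocabulary from the jtvae archive;
# otherwise the vocabulary is rebuilt from the SMILES in the given file.
vocab = JTVAEVocab()                    # downloads dataset/jtvae.zip on first use
# vocab = JTVAEVocab('my_smiles.txt')   # 'my_smiles.txt' is a hypothetical SMILES file
print(len(vocab.vocab))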
def __init__(self, mode='dev', mol_to_graph=mol_to_complete_graph,
             node_featurizer=alchemy_nodes,
             edge_featurizer=alchemy_edges,
             load=True):
    if mode == 'test':
        raise ValueError('The test mode is not supported before '
                         'the Alchemy contest finishes.')

    assert mode in ['dev', 'valid', 'test'], \
        'Expect mode to be dev, valid or test, got {}.'.format(mode)

    self.mode = mode
    # Construct DGLGraphs from raw data or use the preprocessed data
    self.load = load
    file_dir = osp.join(get_download_dir(), 'Alchemy_data')

    if load:
        file_name = "{}_processed_dgl".format(mode)
    else:
        file_name = "{}_single_sdf".format(mode)
    self.file_dir = pathlib.Path(file_dir, file_name)

    self._url = 'dataset/alchemy/'
    self.zip_file_path = pathlib.Path(file_dir, file_name + '.zip')
    download(_get_dgl_url(self._url + file_name + '.zip'), path=str(self.zip_file_path))
    if not os.path.exists(str(self.file_dir)):
        archive = zipfile.ZipFile(self.zip_file_path)
        archive.extractall(file_dir)
        archive.close()

    self._load(mol_to_graph, node_featurizer, edge_featurizer)
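# Usage sketch (illustrative). Assuming the constructor above is
# dgllife.data.TencentAlchemyDataset, the dev split with preprocessed graphs
# can be loaded as follows.
from dgllife.data import TencentAlchemyDataset

alchemy_dev = TencentAlchemyDataset(mode='dev', load=True)
print(len(alchemy_dev))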
def __init__(self, mode='dev', transform=None, from_raw=False):
    assert mode in ['dev', 'valid', 'test'], "mode should be dev/valid/test"
    self.mode = mode
    self.transform = transform

    # Construct the dgl graph from raw data or use the preprocessed data directly
    self.from_raw = from_raw
    file_dir = osp.join(get_download_dir(), './Alchemy_data')

    if not from_raw:
        file_name = "%s_processed" % (mode)
    else:
        file_name = "%s_single_sdf" % (mode)
    self.file_dir = pathlib.Path(file_dir, file_name)
    self.zip_file_path = pathlib.Path(file_dir, file_name + '.zip')

    download(_urls['Alchemy'] + file_name + '.zip',
             path=str(self.zip_file_path))
    if not os.path.exists(str(self.file_dir)):
        archive = zipfile.ZipFile(self.zip_file_path)
        archive.extractall(file_dir)
        archive.close()

    self._load()
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./tox21_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/tox21.csv.gz'
    data_path = get_download_dir() + '/tox21.csv.gz'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    df = pd.read_csv(data_path)
    self.id = df['mol_id']

    df = df.drop(columns=['mol_id'])

    self.load_full = False

    super(Tox21, self).__init__(df, smiles_to_graph, node_featurizer, edge_featurizer,
                                "smiles", cache_file_path,
                                load=load, log_every=log_every, n_jobs=n_jobs)

    self.id = [self.id[i] for i in self.valid_ids]
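# Usage sketch (illustrative). Assuming the constructor above is
# dgllife.data.Tox21, each datapoint is a (SMILES, DGLGraph, labels, mask)
# 4-tuple over the 12 toxicity tasks, with the mask flagging measured
# (non-missing) tasks.
from dgllife.data import Tox21
from dgllife.utils import CanonicalAtomFeaturizer

tox21 = Tox21(node_featurizer=CanonicalAtomFeaturizer())
smiles, g, labels, mask = tox21[0]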
def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        # transpose is a method on scipy sparse matrices; the original line was
        # missing the call parentheses.
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(
            np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
        train_mask, val_mask, test_mask
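# Example call (illustrative), unpacking the full return tuple of load_acm_raw.
hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
    train_mask, val_mask, test_mask = load_acm_raw(remove_self_loop=False)
print(hg)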
def test_jtvae():
    # Test DGLMolTree
    smiles = 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    tree = DGLMolTree(smiles)
    assert tree.treesize() == 17
    tree.assemble()
    assert tree._recover_node(0, tree.mol) == 'C[CH3:15]'
    tree.recover()

    # Test JTVAEDataset
    smiles = [
        'CCCCCCC1=NN2C(=N)/C(=C\c3cc(C)n(-c4ccc(C)cc4C)c3C)C(=O)N=C2S1',
        'COCC[C@@H](C)C(=O)N(C)Cc1ccc(O)cc1'
    ]
    with open('data.txt', 'w') as f:
        for smi in smiles:
            f.write(smi + '\n')

    default_dir = get_download_dir()
    vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
    zip_file_path = '{}/jtnn.zip'.format(default_dir)
    download(_get_dgl_url('dataset/jtnn.zip'), path=zip_file_path, overwrite=False)
    extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))
    with open(vocab_file, 'r') as f:
        vocab = Vocab([x.strip("\r\n ") for x in f])

    dataset = JTVAEDataset('data.txt', vocab)
    assert len(dataset) == 2
    assert set(dataset[0].keys()) == {
        'cand_graphs', 'mol_graph', 'mol_tree', 'stereo_cand_graphs',
        'stereo_cand_label', 'tree_mess_src_e', 'tree_mess_tgt_e',
        'tree_mess_tgt_n', 'wid'
    }
    dataset.training = False
    assert set(dataset[0].keys()) == {'mol_graph', 'mol_tree', 'wid'}
    dataset.training = True

    collate_fn = JTVAECollator(training=True)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {
            'cand_batch_idx', 'cand_graph_batch', 'mol_graph_batch', 'mol_trees',
            'stereo_cand_batch_idx', 'stereo_cand_graph_batch', 'stereo_cand_labels',
            'stereo_cand_lengths', 'tree_mess_src_e', 'tree_mess_tgt_e',
            'tree_mess_tgt_n'
        }

    dataset.training = False
    collate_fn = JTVAECollator(training=False)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {'mol_graph_batch', 'mol_trees'}

    remove_file('data.txt')
    remove_file(zip_file_path)
    remove_dir(default_dir + '/jtnn')
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='esol_dglgraph.bin'):
    self._url = 'dataset/ESOL.zip'
    data_path = get_download_dir() + '/ESOL.zip'
    dir_path = get_download_dir() + '/ESOL'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/delaney-processed.csv')

    # Compound names in PubChem
    self.compound_names = df['Compound ID'].tolist()
    # Estimated solubility
    self.estimated_solubility = df['ESOL predicted log solubility in mols per litre'].tolist()
    # Minimum atom degree
    self.min_degree = df['Minimum Degree'].tolist()
    # Molecular weight
    self.mol_weight = df['Molecular Weight'].tolist()
    # Number of H-Bond Donors
    self.num_h_bond_donors = df['Number of H-Bond Donors'].tolist()
    # Number of rings
    self.num_rings = df['Number of Rings'].tolist()
    # Number of rotatable bonds
    self.num_rotatable_bonds = df['Number of Rotatable Bonds'].tolist()
    # Polar Surface Area
    self.polar_surface_area = df['Polar Surface Area'].tolist()
    self.load_full = False

    super(ESOL, self).__init__(df=df,
                               smiles_to_graph=smiles_to_graph,
                               node_featurizer=node_featurizer,
                               edge_featurizer=edge_featurizer,
                               smiles_column='smiles',
                               cache_file_path=cache_file_path,
                               task_names=['measured log solubility in mols per litre'],
                               load=load,
                               log_every=log_every,
                               init_mask=False)
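# Usage sketch (illustrative). Assuming the constructor above is
# dgllife.data.ESOL: with init_mask=False, indexing returns a
# (SMILES, DGLGraph, label) 3-tuple, the label being the measured log solubility.
from dgllife.data import ESOL
from dgllife.utils import CanonicalAtomFeaturizer

esol = ESOL(node_featurizer=CanonicalAtomFeaturizer())
smiles, g, log_solubility = esol[0]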