Example #1
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./bbbp_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/bbbp.zip'
        data_path = get_download_dir() + '/bbbp.zip'
        dir_path = get_download_dir() + '/bbbp'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/BBBP.csv')

        super(BBBP, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['p_np'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=True,
                                   n_jobs=n_jobs)

        self.load_full = False
        self.names = df['name'].tolist()
        self.names = [self.names[i] for i in self.valid_ids]
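For context, a minimal usage sketch of the dataset defined above. It is only a sketch: it assumes the class is exposed as dgllife.data.BBBP and that CanonicalAtomFeaturizer and smiles_to_bigraph live in dgllife.utils (the package layout is not shown in the example itself).

# Usage sketch (import paths and featurizer choice are assumptions, not part of the example).
from dgllife.data import BBBP
from dgllife.utils import CanonicalAtomFeaturizer, smiles_to_bigraph

dataset = BBBP(smiles_to_graph=smiles_to_bigraph,
               node_featurizer=CanonicalAtomFeaturizer(),
               n_jobs=1)
# With init_mask=True, each item should be a (smiles, graph, label, mask) tuple.
smiles, g, label, mask = dataset[0]
print(smiles, g.number_of_nodes(), label, mask)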
Example #2
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./muv_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/muv.zip'
        data_path = get_download_dir() + '/muv.zip'
        dir_path = get_download_dir() + '/muv'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/muv.csv')

        self.ids = df['mol_id'].tolist()
        self.load_full = False

        df = df.drop(columns=['mol_id'])

        super(MUV, self).__init__(df=df,
                                  smiles_to_graph=smiles_to_graph,
                                  node_featurizer=node_featurizer,
                                  edge_featurizer=edge_featurizer,
                                  smiles_column='smiles',
                                  cache_file_path=cache_file_path,
                                  load=load,
                                  log_every=log_every,
                                  init_mask=True,
                                  n_jobs=n_jobs)

        self.ids = [self.ids[i] for i in self.valid_ids]
Example #3
    def __init__(self,
                 subset,
                 mol_to_graph=mol_to_bigraph,
                 node_featurizer=default_node_featurizer,
                 edge_featurizer=default_edge_featurizer,
                 atom_pair_featurizer=default_atom_pair_featurizer,
                 load=True):
        assert subset in ['train', 'val', 'test'], \
            'Expect subset to be "train" or "val" or "test", got {}'.format(subset)
        print('Preparing {} subset of USPTO'.format(subset))
        self._subset = subset
        if subset == 'val':
            subset = 'valid'

        self._url = 'dataset/uspto.zip'
        data_path = get_download_dir() + '/uspto.zip'
        extracted_data_path = get_download_dir() + '/uspto'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, extracted_data_path)

        super(USPTO, self).__init__(
            raw_file_path=extracted_data_path + '/{}.txt'.format(subset),
            mol_graph_path=extracted_data_path + '/{}_mol_graphs.bin'.format(subset),
            mol_to_graph=mol_to_graph,
            node_featurizer=node_featurizer,
            edge_featurizer=edge_featurizer,
            atom_pair_featurizer=atom_pair_featurizer,
            load=load)
Example #4
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./sider_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/sider.zip'
        data_path = get_download_dir() + '/sider.zip'
        dir_path = get_download_dir() + '/sider'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/sider.csv')

        super(SIDER, self).__init__(df=df,
                                    smiles_to_graph=smiles_to_graph,
                                    node_featurizer=node_featurizer,
                                    edge_featurizer=edge_featurizer,
                                    smiles_column='smiles',
                                    cache_file_path=cache_file_path,
                                    load=load,
                                    log_every=log_every,
                                    init_mask=True,
                                    n_jobs=n_jobs)
Example #5
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='freesolv_dglgraph.bin'):

        self._url = 'dataset/FreeSolv.zip'
        data_path = get_download_dir() + '/FreeSolv.zip'
        dir_path = get_download_dir() + '/FreeSolv'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/SAMPL.csv')

        # IUPAC names
        self.iupac_names = df['iupac'].tolist()
        # Calculated hydration free energy
        self.calc_energy = df['calc'].tolist()

        self.load_full = False

        super(FreeSolv, self).__init__(df=df,
                                       smiles_to_graph=smiles_to_graph,
                                       node_featurizer=node_featurizer,
                                       edge_featurizer=edge_featurizer,
                                       smiles_column='smiles',
                                       cache_file_path=cache_file_path,
                                       task_names=['expt'],
                                       load=load,
                                       log_every=log_every,
                                       init_mask=False)
Example #6
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='lipophilicity_dglgraph.bin'):

        self._url = 'dataset/lipophilicity.zip'
        data_path = get_download_dir() + '/lipophilicity.zip'
        dir_path = get_download_dir() + '/lipophilicity'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/Lipophilicity.csv')

        # ChEMBL ids
        self.chembl_ids = df['CMPD_CHEMBLID'].tolist()

        self.load_full = False

        super(Lipophilicity, self).__init__(df=df,
                                            smiles_to_graph=smiles_to_graph,
                                            node_featurizer=node_featurizer,
                                            edge_featurizer=edge_featurizer,
                                            smiles_column='smiles',
                                            cache_file_path=cache_file_path,
                                            task_names=['exp'],
                                            load=load,
                                            log_every=log_every,
                                            init_mask=False)
Example #7
    def __init__(self, mode='dev', transform=None):
        assert mode in ['dev', 'valid',
                        'test'], "mode should be dev/valid/test"
        self.mode = mode
        self.transform = transform
        self.file_dir = pathlib.Path(get_download_dir(), mode)
        self.zip_file_path = pathlib.Path(get_download_dir(), '%s.zip' % mode)
        download(_urls['Alchemy'] + "%s.zip" % mode,
                 path=str(self.zip_file_path))
        extract_archive(str(self.zip_file_path), str(self.file_dir))

        self._load()
Example #8
def load_acm_raw():
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)  # fetch ACM.mat before loading it

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that
    p_vs_p = data['PvsP']
    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]
    p_num = p_vs_a.shape[0]
    a_num = p_vs_a.shape[1]
    l_num = p_vs_l.shape[1]
    f_1 = open("./mp2v_data/acm_paper", "w")
    f_2 = open("./mp2v_data/acm_author", "w")
    f_3 = open("./mp2v_data/acm_field", "w")
    f_4 = open("./mp2v_data/acm_paper_author", "w")
    f_5 = open("./mp2v_data/acm_paper_field", "w")
    pa = p_vs_a.tocoo()
    pa_row_col = [pa.row, [i + p_num for i in pa.col]]
    # Node id layout for the metapath2vec input files:
    # papers [0, p_num), authors [p_num, p_num + a_num), fields after that.
    for i in range(p_num):
        line = str(i) + ' i' + str(i) + '\n'
        f_1.write(line)
    for i in range(p_num, p_num + a_num):
        line = str(i) + ' a' + str(i) + '\n'
        f_2.write(line)
    for i in range(p_num + a_num, p_num + a_num + l_num):
        line = str(i) + ' f' + str(i) + '\n'
        f_3.write(line)
    for i in range(len(pa_row_col[0])):
        line1 = str(pa_row_col[0][i]) + ' ' + str(pa_row_col[1][i]) + '\n'
        f_4.write(line1)

    pl = p_vs_l.tocoo()
    pl_row_col = [pl.row, [i + p_num + a_num for i in pl.col]]
    for i in range(len(pl_row_col[0])):
        line1 = str(pl_row_col[0][i]) + ' ' + str(pl_row_col[1][i]) + '\n'
        f_5.write(line1)
    f_1.close()
    f_2.close()
    f_3.close()
    f_4.close()
    f_5.close()
Example #9
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='AstraZeneca_chembl_solubility_graph.bin',
                 log_of_values=True):

        self._url = 'dataset/AstraZeneca_ChEMBL_Solubility.csv'
        data_path = get_download_dir() + '/AstraZeneca_ChEMBL_Solubility.csv'
        download(_get_dgl_url(self._url), path=data_path)
        df = pd.read_csv(data_path)

        # ChEMBL ids
        self.chembl_ids = df['Molecule ChEMBL ID'].tolist()
        # Molecular weight
        self.mol_weight = df['Molecular Weight'].tolist()

        self.load_full = False

        super(AstraZenecaChEMBLSolubility, self).__init__(
            df=df,
            smiles_to_graph=smiles_to_graph,
            node_featurizer=node_featurizer,
            edge_featurizer=edge_featurizer,
            smiles_column='Smiles',
            cache_file_path=cache_file_path,
            task_names=['Solubility'],
            load=load,
            log_every=log_every,
            init_mask=False)

        if log_of_values:
            self.labels = self.labels.log()
Example #10
 def _download(self):
     download_dir = get_download_dir()
     zip_file_path = os.path.join(download_dir, "tu_{}.zip".format(self.name))
     download(self._url.format(self.name), path=zip_file_path)
     extract_dir = os.path.join(download_dir, "tu_{}".format(self.name))
     extract_archive(zip_file_path, extract_dir)
     return extract_dir
Example #11
def eval_imdb_mg2v():
    m_num = 4183
    a_num = 5084
    d_num = 2004
    data_path = get_download_dir() + '/imdb_3_class.pkl'
    # download(_get_dgl_url(url), path=data_path)
    f = open(data_path, mode="rb")
    data = pickle.load(f)
    labels = np.array(data['labels'])
    features = np.zeros((m_num, 64))
    labels_arr = np.zeros((m_num, 3))
    ff = open("/home/xuyou/deepwalk/MetaGraph2Vec/RandomWalk2Vec/imdb_g2v",
              "r")
    for line in ff:
        nums = line.split()
        if len(nums) == 2 or nums[0][0] != 'p':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(0, m_num):
            continue
        features[id] = fea
        lab = labels[id]
        labels_arr[id][lab] = 1
    ids = list(range(m_num))
    np.random.shuffle(ids)
    my_KNN(features[ids[:2500]], labels_arr[ids[:2500]])
Example #12
def eval_imdb():
    m_num = 4183
    a_num = 5084
    d_num = 2004
    data_path = get_download_dir() + '/imdb_3_class.pkl'
    # download(_get_dgl_url(url), path=data_path)
    f = open(data_path, mode="rb")
    data = pickle.load(f)
    labels = np.array(data['labels'])
    features = np.zeros((m_num, 64))
    labels_arr = np.zeros((m_num, 3))
    ff = open(
        "/home/xuyou/dgl/examples/pytorch/han/mp2v_data/acm_output_emb40.txt",
        "r")
    for line in ff:
        nums = line.split()
        if len(nums) == 2 or nums[0][0] != 'i':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(0, m_num):
            continue
        features[id] = fea
        lab = labels[id]
        labels_arr[id][lab] = 1
    ids = list(range(m_num))
    np.random.shuffle(ids)
    my_KNN(features[ids[:2500]], labels_arr[ids[:2500]])
Example #13
def eval_dblp_mg2v():
    f = open("/home/xuyou/deepwalk/MetaGraph2Vec/RandomWalk2Vec/dblp_g2v", "r")
    data_path = get_download_dir() + '/LabDBLP.mat'
    a_num = 4057
    p_num = 14328
    c_num = 20
    t_num = 8898
    labels_arr = np.zeros((4057, 4))
    features = np.zeros((4057, 64))
    data = sio.loadmat(data_path)
    labels = data['Aut_lab'][:, 1] - 1
    for line in f:
        nums = line.split()
        if len(nums) == 2:
            continue
        if nums[0][0] != 'a':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(p_num, p_num + a_num):
            continue
        features[id - p_num] = fea
        lab = labels[id - p_num]
        labels_arr[id - p_num][lab] = 1
    my_KNN(features[:2500], labels_arr[:2500])
Example #14
def eval_dblp():
    f = open(
        "/home/xuyou/dgl/examples/pytorch/han/mp2v_data/dblp_output_emb40.txt",
        "r")
    data_path = get_download_dir() + '/LabDBLP.mat'
    a_num = 4057
    p_num = 14328
    c_num = 20
    t_num = 8898
    labels_arr = np.zeros((4057, 4))
    features = np.zeros((4057, 64))
    data = sio.loadmat(data_path)
    labels = data['Aut_lab'][:, 1] - 1
    for line in f:
        nums = line.split()
        if len(nums) == 2:
            continue
        if nums[0][0] != 'a':
            continue
        id = int(nums[0][1:])
        fea = [float(i) for i in nums[1:]]
        if id not in range(p_num, p_num + a_num):
            continue
        features[id - p_num] = fea
        lab = labels[id - p_num]
        labels_arr[id - p_num][lab] = 1
    my_KNN(features[:2500], labels_arr[:2500])
Example #15
 def __init__(self, mode='dev', transform=None):
     assert mode in ['dev', 'valid',
                     'test'], "mode should be dev/valid/test"
     self.mode = mode
     self.transform = transform
     self.file_dir = pathlib.Path(get_download_dir(), mode)
     self.zip_file_path = pathlib.Path(get_download_dir(),
                                       '%s_v20190730.zip' % mode)
     # download(_urls['Alchemy'] + "%s_v20190730.zip" % mode,
     #          path=str(self.zip_file_path))
     # extract_archive(str(self.zip_file_path), str(self.file_dir))
     sub_dirs = os.listdir(self.file_dir)
     if 'sdf' not in sub_dirs:
         assert len(sub_dirs) == 1
         self.file_dir = os.path.join(self.file_dir, sub_dirs[0])
     self._load()
Example #16
    def __init__(self, hidden_size, latent_size, depth, vocab_file=None):
        super(DGLJTNNVAE, self).__init__()

        if vocab_file is None:
            default_dir = get_download_dir()
            vocab_file = '{}/jtvae/{}.txt'.format(default_dir, 'vocab')
            zip_file_path = '{}/jtvae.zip'.format(default_dir)
            download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path)
            extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

        with open(vocab_file, 'r') as f:
            self.vocab = Vocab([x.strip("\r\n ") for x in f])

        self.hidden_size = hidden_size
        self.latent_size = latent_size
        self.depth = depth

        self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
        self.mpn = DGLMPN(hidden_size, depth)
        self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
        self.decoder = DGLJTNNDecoder(self.vocab, hidden_size,
                                      latent_size // 2, self.embedding)
        self.jtmpn = DGLJTMPN(hidden_size, depth)

        self.T_mean = nn.Linear(hidden_size, latent_size // 2)
        self.T_var = nn.Linear(hidden_size, latent_size // 2)
        self.G_mean = nn.Linear(hidden_size, latent_size // 2)
        self.G_var = nn.Linear(hidden_size, latent_size // 2)

        self.atom_featurizer_enc = get_atom_featurizer_enc()
        self.bond_featurizer_enc = get_bond_featurizer_enc()
        self.atom_featurizer_dec = get_atom_featurizer_dec()
        self.bond_featurizer_dec = get_bond_featurizer_dec()
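A brief construction sketch for the model above; the import path and the hyperparameter values are assumptions for illustration only. The vocabulary file is downloaded on first construction, as in the code above.

# Usage sketch (import path and hyperparameters are illustrative assumptions).
from dgllife.model import DGLJTNNVAE

model = DGLJTNNVAE(hidden_size=450, latent_size=56, depth=3)
print(model.vocab.size())  # vocabulary loaded from the downloaded vocab file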
Example #17
    def __init__(self, data, vocab, training=True):
        dir = get_download_dir()

        _url = _get_dgl_url('dataset/jtnn.zip')
        zip_file_path = '{}/jtnn.zip'.format(dir)
        download(_url, path=zip_file_path)
        extract_archive(zip_file_path, '{}/jtnn'.format(dir))

        print('Loading data...')
        if data in ['train', 'test']:
            # ZINC subset
            data_file = '{}/jtnn/{}.txt'.format(dir, data)
        else:
            # New dataset
            data_file = data
        with open(data_file) as f:
            self.data = [line.strip("\r\n ").split()[0] for line in f]
        self.vocab = vocab

        print('Loading finished')
        print('\t# samples:', len(self.data))
        self.training = training

        self.atom_featurizer_enc = get_atom_featurizer_enc()
        self.bond_featurizer_enc = get_bond_featurizer_enc()
        self.atom_featurizer_dec = get_atom_featurizer_dec()
        self.bond_featurizer_dec = get_bond_featurizer_dec()
Example #18
def convert_reddit_data(dataset, out_folder, self_loop=False):
    """
  Load DGL graph dataset
  """
    self_loop_str = ""
    if self_loop:
        self_loop_str = "_self_loop"
    download_dir = get_download_dir()
    extract_dir = os.path.join(download_dir,
                               "{}{}".format(dataset, self_loop_str))

    coo_adj = scipy.sparse.load_npz(
        os.path.join(extract_dir,
                     "{}{}_graph.npz".format(dataset, self_loop_str)))

    reddit_data = np.load(
        os.path.join(extract_dir, "{}_data.npz".format(dataset)))
    features = reddit_data["feature"]
    labels = reddit_data["label"]
    node_types = reddit_data["node_types"]
    train_mask = (node_types == 1)
    val_mask = (node_types == 2)
    test_mask = (node_types == 3)

    scipy.sparse.save_npz(os.path.join(out_folder, 'adj.npz'), coo_adj)
    np.save(os.path.join(out_folder, 'feat.npy'), features)
    np.save(os.path.join(out_folder, 'labels.npy'), labels)
    np.save(os.path.join(out_folder, 'train.npy'), train_mask)
    np.save(os.path.join(out_folder, 'val.npy'), val_mask)
    np.save(os.path.join(out_folder, 'test.npy'), test_mask)

    print('Conversion finished')
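A hedged call sketch for the converter above. It assumes the Reddit archive has already been downloaded and extracted under the DGL download directory (the function itself does not download anything); the output folder name is hypothetical.

# Usage sketch: convert pre-extracted Reddit files into plain .npz/.npy outputs.
import os

out_folder = './reddit_converted'  # hypothetical output location
os.makedirs(out_folder, exist_ok=True)
convert_reddit_data('reddit', out_folder, self_loop=False)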
Example #19
 def __init__(self, name):
     self.name = name
     self.dir = get_download_dir()
     tgz_path = os.path.join(self.dir, '{}.tgz'.format(self.name))
     download(_downlaod_prefix + '{}.tgz'.format(self.name), tgz_path)
     self.dir = os.path.join(self.dir, self.name)
     extract_archive(tgz_path, self.dir)
Example #20
    def __init__(self, subset, load_binding_pocket=True, sanitize=False, calc_charges=False,
                 remove_hs=False, use_conformation=True,
                 construct_graph_and_featurize=ACNN_graph_construction_and_featurization,
                 zero_padding=True, num_processes=64):
        self.task_names = ['-logKd/Ki']
        self.n_tasks = len(self.task_names)

        self._url = 'dataset/pdbbind_v2015.tar.gz'
        root_dir_path = get_download_dir()
        data_path = root_dir_path + '/pdbbind_v2015.tar.gz'
        extracted_data_path = root_dir_path + '/pdbbind_v2015'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, extracted_data_path)

        if subset == 'core':
            index_label_file = extracted_data_path + '/v2015/INDEX_core_data.2013'
        elif subset == 'refined':
            index_label_file = extracted_data_path + '/v2015/INDEX_refined_data.2015'
        else:
            raise ValueError(
                'Expect the subset_choice to be either '
                'core or refined, got {}'.format(subset))

        self._preprocess(extracted_data_path, index_label_file, load_binding_pocket,
                         sanitize, calc_charges, remove_hs, use_conformation,
                         construct_graph_and_featurize, zero_padding, num_processes)
Example #21
    def __init__(self, data, vocab, training=True):
        self.dir = get_download_dir()
        self.zip_file_path = '{}/jtnn.zip'.format(self.dir)
        download(_url, path=self.zip_file_path)
        extract_archive(self.zip_file_path, '{}/jtnn'.format(self.dir))
        print('Loading data...')
        if data in ['train', 'test']:
            data_file = '{}/jtnn/{}.txt'.format(self.dir, data)
        else:
            data_file = data
        with open(data_file) as f:
            self.data = [line.strip("\r\n ").split()[0] for line in f]

        if vocab == 'zinc':
            self.vocab_file = '{}/jtnn/vocab.txt'.format(self.dir)
        elif vocab == 'guacamol':
            self.vocab_file = '{}/jtnn/vocab_guacamol.txt'.format(self.dir)
        else:
            self.vocab_file = vocab

        print('Loading finished.')
        print('\tNum samples:', len(self.data))
        print('\tVocab file:', self.vocab_file)
        self.training = training
        self.vocab = Vocab([x.strip("\r\n ") for x in open(self.vocab_file)])
Example #22
    def __init__(self, hidden_size, latent_size, depth, vocab=None, vocab_file=None):
        super(DGLJTNNVAE, self).__init__()
        if vocab is None:
            if vocab_file is None:
                vocab_file = '{}/jtnn/{}.txt'.format(
                    get_download_dir(), 'vocab')

            self.vocab = Vocab([x.strip("\r\n ")
                                for x in open(vocab_file)])
        else:
            self.vocab = vocab

        self.hidden_size = hidden_size
        self.latent_size = latent_size
        self.depth = depth

        self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
        self.mpn = DGLMPN(hidden_size, depth)
        self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
        self.decoder = DGLJTNNDecoder(
            self.vocab, hidden_size, latent_size // 2, self.embedding)
        self.jtmpn = DGLJTMPN(hidden_size, depth)

        self.T_mean = nn.Linear(hidden_size, latent_size // 2)
        self.T_var = nn.Linear(hidden_size, latent_size // 2)
        self.G_mean = nn.Linear(hidden_size, latent_size // 2)
        self.G_var = nn.Linear(hidden_size, latent_size // 2)

        self.n_nodes_total = 0
        self.n_passes = 0
        self.n_edges_total = 0
        self.n_tree_nodes_total = 0
Example #23
    def __init__(self, hidden_size, latent_size, depth, vocab=None, vocab_file=None):
        super(DGLJTNNVAE, self).__init__()
        if vocab is None:
            if vocab_file is None:
                default_dir = get_download_dir()
                vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
                zip_file_path = '{}/jtnn.zip'.format(default_dir)
                download(_get_dgl_url('dataset/jtnn.zip'), path=zip_file_path)
                extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))

            self.vocab = Vocab([x.strip("\r\n ") for x in open(vocab_file)])
        else:
            self.vocab = vocab

        self.hidden_size = hidden_size
        self.latent_size = latent_size
        self.depth = depth

        self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
        self.mpn = DGLMPN(hidden_size, depth)
        self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
        self.decoder = DGLJTNNDecoder(
            self.vocab, hidden_size, latent_size // 2, self.embedding)
        self.jtmpn = DGLJTMPN(hidden_size, depth)

        self.T_mean = nn.Linear(hidden_size, latent_size // 2)
        self.T_var = nn.Linear(hidden_size, latent_size // 2)
        self.G_mean = nn.Linear(hidden_size, latent_size // 2)
        self.G_var = nn.Linear(hidden_size, latent_size // 2)

        self.n_nodes_total = 0
        self.n_passes = 0
        self.n_edges_total = 0
        self.n_tree_nodes_total = 0
Example #24
    def __init__(self, file_path=None):
        if file_path is None:
            from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

            default_dir = get_download_dir()
            vocab_file = '{}/jtvae/vocab.txt'.format(default_dir)
            zip_file_path = '{}/jtvae.zip'.format(default_dir)
            download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path, overwrite=False)
            extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

            with open(vocab_file, 'r') as f:
                self.vocab = [x.strip("\r\n ") for x in f]
        else:
            # Prepare a vocabulary from scratch
            vocab = set()
            with open(file_path, 'r') as f:
                for line in f:
                    smiles = line.split()[0]
                    mol = MolTree(smiles)
                    for i in mol.nodes_dict:
                        vocab.add(mol.nodes_dict[i]['smiles'])
            self.vocab = list(vocab)

        self.vmap = {x: i for i, x in enumerate(self.vocab)}
        self.slots = [get_slots(smiles) for smiles in self.vocab]
Example #25
    def __init__(self,
                 mode='dev',
                 mol_to_graph=mol_to_complete_graph,
                 node_featurizer=alchemy_nodes,
                 edge_featurizer=alchemy_edges,
                 load=True):
        if mode == 'test':
            raise ValueError('The test mode is not supported before '
                             'the Alchemy contest finishes.')

        assert mode in ['dev', 'valid', 'test'], \
            'Expect mode to be dev, valid or test, got {}.'.format(mode)

        self.mode = mode

        # Construct DGLGraphs from raw data or use the preprocessed data
        self.load = load
        file_dir = osp.join(get_download_dir(), 'Alchemy_data')

        if load:
            file_name = "{}_processed_dgl".format(mode)
        else:
            file_name = "{}_single_sdf".format(mode)
        self.file_dir = pathlib.Path(file_dir, file_name)

        self._url = 'dataset/alchemy/'
        self.zip_file_path = pathlib.Path(file_dir, file_name + '.zip')
        download(_get_dgl_url(self._url + file_name + '.zip'),
                 path=str(self.zip_file_path))
        if not os.path.exists(str(self.file_dir)):
            archive = zipfile.ZipFile(self.zip_file_path)
            archive.extractall(file_dir)
            archive.close()

        self._load(mol_to_graph, node_featurizer, edge_featurizer)
Example #26
    def __init__(self, mode='dev', transform=None, from_raw=False):
        assert mode in ['dev', 'valid',
                        'test'], "mode should be dev/valid/test"
        self.mode = mode
        self.transform = transform

        # Construct the dgl graph from raw data or use the preprocessed data directly
        self.from_raw = from_raw
        file_dir = osp.join(get_download_dir(), './Alchemy_data')

        if not from_raw:
            file_name = "%s_processed" % (mode)
        else:
            file_name = "%s_single_sdf" % (mode)
        self.file_dir = pathlib.Path(file_dir, file_name)

        self.zip_file_path = pathlib.Path(file_dir, file_name + '.zip')
        download(_urls['Alchemy'] + file_name + '.zip',
                 path=str(self.zip_file_path))
        if not os.path.exists(str(self.file_dir)):
            archive = zipfile.ZipFile(self.zip_file_path)
            archive.extractall(file_dir)
            archive.close()

        self._load()
Example #27
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./tox21_dglgraph.bin',
                 n_jobs=1):
        self._url = 'dataset/tox21.csv.gz'
        data_path = get_download_dir() + '/tox21.csv.gz'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        df = pd.read_csv(data_path)
        self.id = df['mol_id']

        df = df.drop(columns=['mol_id'])

        self.load_full = False

        super(Tox21, self).__init__(df,
                                    smiles_to_graph,
                                    node_featurizer,
                                    edge_featurizer,
                                    "smiles",
                                    cache_file_path,
                                    load=load,
                                    log_every=log_every,
                                    n_jobs=n_jobs)

        self.id = [self.id[i] for i in self.valid_ids]
Example #28
def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(
            np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
            train_mask, val_mask, test_mask
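A short sketch of calling the loader above and inspecting what it returns. It assumes the imports and helpers used in the example (dgl, torch, numpy, scipy.io, get_binary_mask, download, _get_dgl_url) are already in scope.

# Usage sketch: unpack the ten values returned by load_acm_raw.
hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
    train_mask, val_mask, test_mask = load_acm_raw(remove_self_loop=False)
print(hg)                           # heterograph over paper/author/field nodes
print(features.shape, num_classes)  # bag-of-words paper features, 3 classes
print(len(train_idx), len(val_idx), len(test_idx))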
Example #29
def test_jtvae():
    # Test DGLMolTree
    smiles = 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    tree = DGLMolTree(smiles)
    assert tree.treesize() == 17
    tree.assemble()
    assert tree._recover_node(0, tree.mol) == 'C[CH3:15]'
    tree.recover()

    # Test JTVAEDataset
    smiles = [
        'CCCCCCC1=NN2C(=N)/C(=C\c3cc(C)n(-c4ccc(C)cc4C)c3C)C(=O)N=C2S1',
        'COCC[C@@H](C)C(=O)N(C)Cc1ccc(O)cc1'
    ]
    with open('data.txt', 'w') as f:
        for smi in smiles:
            f.write(smi + '\n')

    default_dir = get_download_dir()
    vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
    zip_file_path = '{}/jtnn.zip'.format(default_dir)
    download(_get_dgl_url('dataset/jtnn.zip'),
             path=zip_file_path,
             overwrite=False)
    extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))

    with open(vocab_file, 'r') as f:
        vocab = Vocab([x.strip("\r\n ") for x in f])
    dataset = JTVAEDataset('data.txt', vocab)
    assert len(dataset) == 2
    assert set(dataset[0].keys()) == {
        'cand_graphs', 'mol_graph', 'mol_tree', 'stereo_cand_graphs',
        'stereo_cand_label', 'tree_mess_src_e', 'tree_mess_tgt_e',
        'tree_mess_tgt_n', 'wid'
    }
    dataset.training = False
    assert set(dataset[0].keys()) == {'mol_graph', 'mol_tree', 'wid'}

    dataset.training = True
    collate_fn = JTVAECollator(training=True)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {
            'cand_batch_idx', 'cand_graph_batch', 'mol_graph_batch',
            'mol_trees', 'stereo_cand_batch_idx', 'stereo_cand_graph_batch',
            'stereo_cand_labels', 'stereo_cand_lengths', 'tree_mess_src_e',
            'tree_mess_tgt_e', 'tree_mess_tgt_n'
        }

    dataset.training = False
    collate_fn = JTVAECollator(training=False)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {'mol_graph_batch', 'mol_trees'}

    remove_file('data.txt')
    remove_file(zip_file_path)
    remove_dir(default_dir + '/jtnn')
Example #30
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='esol_dglgraph.bin'):

        self._url = 'dataset/ESOL.zip'
        data_path = get_download_dir() + '/ESOL.zip'
        dir_path = get_download_dir() + '/ESOL'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/delaney-processed.csv')

        # Compound names in PubChem
        self.compound_names = df['Compound ID'].tolist()
        # Estimated solubility
        self.estimated_solubility = df['ESOL predicted log solubility in mols per litre'].tolist()
        # Minimum atom degree
        self.min_degree = df['Minimum Degree'].tolist()
        # Molecular weight
        self.mol_weight = df['Molecular Weight'].tolist()
        # Number of H-Bond Donors
        self.num_h_bond_donors = df['Number of H-Bond Donors'].tolist()
        # Number of rings
        self.num_rings = df['Number of Rings'].tolist()
        # Number of rotatable bonds
        self.num_rotatable_bonds = df['Number of Rotatable Bonds'].tolist()
        # Polar Surface Area
        self.polar_surface_area = df['Polar Surface Area'].tolist()

        self.load_full = False

        super(ESOL, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['measured log solubility in mols per litre'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=False)
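As with the other dataset examples, a minimal instantiation sketch, assuming the class is exposed as dgllife.data.ESOL (an assumption about the package layout rather than something shown above).

# Usage sketch (import paths are assumptions; ESOL is a single-task regression dataset).
from dgllife.data import ESOL
from dgllife.utils import CanonicalAtomFeaturizer

dataset = ESOL(node_featurizer=CanonicalAtomFeaturizer())
# With init_mask=False, each item should be a (smiles, graph, label) tuple.
smiles, g, label = dataset[0]
print(smiles, float(label))
print(dataset.compound_names[0])  # extra metadata kept on the dataset above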