Example #1
    def __init__(self, data, vocab, training=True):
        dir = get_download_dir()

        _url = _get_dgl_url('dataset/jtnn.zip')
        zip_file_path = '{}/jtnn.zip'.format(dir)
        download(_url, path=zip_file_path)
        extract_archive(zip_file_path, '{}/jtnn'.format(dir))

        print('Loading data...')
        if data in ['train', 'test']:
            # ZINC subset
            data_file = '{}/jtnn/{}.txt'.format(dir, data)
        else:
            # New dataset
            data_file = data
        with open(data_file) as f:
            self.data = [line.strip("\r\n ").split()[0] for line in f]
        self.vocab = vocab

        print('Loading finished')
        print('\t# samples:', len(self.data))
        self.training = training

        self.atom_featurizer_enc = get_atom_featurizer_enc()
        self.bond_featurizer_enc = get_bond_featurizer_enc()
        self.atom_featurizer_dec = get_atom_featurizer_dec()
        self.bond_featurizer_dec = get_bond_featurizer_dec()
Example #2
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./tox21_dglgraph.bin',
                 n_jobs=1):
        self._url = 'dataset/tox21.csv.gz'
        data_path = get_download_dir() + '/tox21.csv.gz'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        df = pd.read_csv(data_path)
        self.id = df['mol_id']

        df = df.drop(columns=['mol_id'])

        self.load_full = False

        super(Tox21, self).__init__(df,
                                    smiles_to_graph,
                                    node_featurizer,
                                    edge_featurizer,
                                    "smiles",
                                    cache_file_path,
                                    load=load,
                                    log_every=log_every,
                                    n_jobs=n_jobs)

        self.id = [self.id[i] for i in self.valid_ids]
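A minimal usage sketch for the Tox21 dataset above. This is a hedged illustration: it assumes dgllife is installed, picks CanonicalAtomFeaturizer as the node featurizer, and assumes the usual dgllife item layout of (smiles, graph, labels, mask).

from dgllife.utils import CanonicalAtomFeaturizer, smiles_to_bigraph

# Illustrative only: construct molecular graphs with canonical atom features.
dataset = Tox21(smiles_to_graph=smiles_to_bigraph,
                node_featurizer=CanonicalAtomFeaturizer())
smiles, g, labels, mask = dataset[0]
print(smiles, g, labels.shape, mask.shape)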
Example #3
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./sider_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/sider.zip'
        data_path = get_download_dir() + '/sider.zip'
        dir_path = get_download_dir() + '/sider'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/sider.csv')

        super(SIDER, self).__init__(df=df,
                                    smiles_to_graph=smiles_to_graph,
                                    node_featurizer=node_featurizer,
                                    edge_featurizer=edge_featurizer,
                                    smiles_column='smiles',
                                    cache_file_path=cache_file_path,
                                    load=load,
                                    log_every=log_every,
                                    init_mask=True,
                                    n_jobs=n_jobs)
Example #4
    def __init__(self, hidden_size, latent_size, depth, vocab_file=None):
        super(DGLJTNNVAE, self).__init__()

        if vocab_file is None:
            default_dir = get_download_dir()
            vocab_file = '{}/jtvae/{}.txt'.format(default_dir, 'vocab')
            zip_file_path = '{}/jtvae.zip'.format(default_dir)
            download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path)
            extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

        with open(vocab_file, 'r') as f:
            self.vocab = Vocab([x.strip("\r\n ") for x in f])

        self.hidden_size = hidden_size
        self.latent_size = latent_size
        self.depth = depth

        self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
        self.mpn = DGLMPN(hidden_size, depth)
        self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
        self.decoder = DGLJTNNDecoder(self.vocab, hidden_size,
                                      latent_size // 2, self.embedding)
        self.jtmpn = DGLJTMPN(hidden_size, depth)

        self.T_mean = nn.Linear(hidden_size, latent_size // 2)
        self.T_var = nn.Linear(hidden_size, latent_size // 2)
        self.G_mean = nn.Linear(hidden_size, latent_size // 2)
        self.G_var = nn.Linear(hidden_size, latent_size // 2)

        self.atom_featurizer_enc = get_atom_featurizer_enc()
        self.bond_featurizer_enc = get_bond_featurizer_enc()
        self.atom_featurizer_dec = get_atom_featurizer_dec()
        self.bond_featurizer_dec = get_bond_featurizer_dec()
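A brief instantiation sketch for the module above. The hyperparameter values are illustrative assumptions (roughly the common JT-VAE settings), not values prescribed by the example.

# Hypothetical instantiation; with vocab_file=None the default vocabulary is downloaded.
model = DGLJTNNVAE(hidden_size=450, latent_size=56, depth=3)
print('Vocabulary size:', model.vocab.size())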
Example #5
    def __init__(self, hidden_size, latent_size, depth, vocab=None, vocab_file=None):
        super(DGLJTNNVAE, self).__init__()
        if vocab is None:
            if vocab_file is None:
                default_dir = get_download_dir()
                vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
                zip_file_path = '{}/jtnn.zip'.format(default_dir)
                download(_get_dgl_url('dataset/jtnn.zip'), path=zip_file_path)
                extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))

            self.vocab = Vocab([x.strip("\r\n ") for x in open(vocab_file)])
        else:
            self.vocab = vocab

        self.hidden_size = hidden_size
        self.latent_size = latent_size
        self.depth = depth

        self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
        self.mpn = DGLMPN(hidden_size, depth)
        self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
        self.decoder = DGLJTNNDecoder(
            self.vocab, hidden_size, latent_size // 2, self.embedding)
        self.jtmpn = DGLJTMPN(hidden_size, depth)

        self.T_mean = nn.Linear(hidden_size, latent_size // 2)
        self.T_var = nn.Linear(hidden_size, latent_size // 2)
        self.G_mean = nn.Linear(hidden_size, latent_size // 2)
        self.G_var = nn.Linear(hidden_size, latent_size // 2)

        self.n_nodes_total = 0
        self.n_passes = 0
        self.n_edges_total = 0
        self.n_tree_nodes_total = 0
Example #6
    def __init__(self, file_path=None):
        if file_path is None:
            from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

            default_dir = get_download_dir()
            vocab_file = '{}/jtvae/vocab.txt'.format(default_dir)
            zip_file_path = '{}/jtvae.zip'.format(default_dir)
            download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path, overwrite=False)
            extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

            with open(vocab_file, 'r') as f:
                self.vocab = [x.strip("\r\n ") for x in f]
        else:
            # Prepare a vocabulary from scratch
            vocab = set()
            with open(file_path, 'r') as f:
                for line in f:
                    smiles = line.split()[0]
                    mol = MolTree(smiles)
                    for i in mol.nodes_dict:
                        vocab.add(mol.nodes_dict[i]['smiles'])
            self.vocab = list(vocab)

        self.vmap = {x: i for i, x in enumerate(self.vocab)}
        self.slots = [get_slots(smiles) for smiles in self.vocab]
Example #7
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./muv_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/muv.zip'
        data_path = get_download_dir() + '/muv.zip'
        dir_path = get_download_dir() + '/muv'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/muv.csv')

        self.ids = df['mol_id'].tolist()
        self.load_full = False

        df = df.drop(columns=['mol_id'])

        super(MUV, self).__init__(df=df,
                                  smiles_to_graph=smiles_to_graph,
                                  node_featurizer=node_featurizer,
                                  edge_featurizer=edge_featurizer,
                                  smiles_column='smiles',
                                  cache_file_path=cache_file_path,
                                  load=load,
                                  log_every=log_every,
                                  init_mask=True,
                                  n_jobs=n_jobs)

        self.ids = [self.ids[i] for i in self.valid_ids]
Example #8
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='lipophilicity_dglgraph.bin'):

        self._url = 'dataset/lipophilicity.zip'
        data_path = get_download_dir() + '/lipophilicity.zip'
        dir_path = get_download_dir() + '/lipophilicity'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/Lipophilicity.csv')

        # ChEMBL ids
        self.chembl_ids = df['CMPD_CHEMBLID'].tolist()

        self.load_full = False

        super(Lipophilicity, self).__init__(df=df,
                                            smiles_to_graph=smiles_to_graph,
                                            node_featurizer=node_featurizer,
                                            edge_featurizer=edge_featurizer,
                                            smiles_column='smiles',
                                            cache_file_path=cache_file_path,
                                            task_names=['exp'],
                                            load=load,
                                            log_every=log_every,
                                            init_mask=False)
Example #9
    def __init__(self, raw_dir=None, force_reload=False, verbose=True):
        url = _get_dgl_url('dataset/wn18.tgz')
        super(WN18Dataset, self).__init__('wn18',
                                          url=url,
                                          raw_dir=raw_dir,
                                          force_reload=force_reload,
                                          verbose=verbose)
Example #10
    def __init__(self, subset, load_binding_pocket=True, sanitize=False, calc_charges=False,
                 remove_hs=False, use_conformation=True,
                 construct_graph_and_featurize=ACNN_graph_construction_and_featurization,
                 zero_padding=True, num_processes=64):
        self.task_names = ['-logKd/Ki']
        self.n_tasks = len(self.task_names)

        self._url = 'dataset/pdbbind_v2015.tar.gz'
        root_dir_path = get_download_dir()
        data_path = root_dir_path + '/pdbbind_v2015.tar.gz'
        extracted_data_path = root_dir_path + '/pdbbind_v2015'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, extracted_data_path)

        if subset == 'core':
            index_label_file = extracted_data_path + '/v2015/INDEX_core_data.2013'
        elif subset == 'refined':
            index_label_file = extracted_data_path + '/v2015/INDEX_refined_data.2015'
        else:
            raise ValueError(
                'Expect the subset_choice to be either '
                'core or refined, got {}'.format(subset))

        self._preprocess(extracted_data_path, index_label_file, load_binding_pocket,
                         sanitize, calc_charges, remove_hs, use_conformation,
                         construct_graph_and_featurize, zero_padding, num_processes)
Example #11
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='AstraZeneca_chembl_solubility_graph.bin',
                 log_of_values=True):

        self._url = 'dataset/AstraZeneca_ChEMBL_Solubility.csv'
        data_path = get_download_dir() + '/AstraZeneca_ChEMBL_Solubility.csv'
        download(_get_dgl_url(self._url), path=data_path)
        df = pd.read_csv(data_path)

        # ChEMBL ids
        self.chembl_ids = df['Molecule ChEMBL ID'].tolist()
        # Molecular weight
        self.mol_weight = df['Molecular Weight'].tolist()

        self.load_full = False

        super(AstraZenecaChEMBLSolubility, self).__init__(
            df=df,
            smiles_to_graph=smiles_to_graph,
            node_featurizer=node_featurizer,
            edge_featurizer=edge_featurizer,
            smiles_column='Smiles',
            cache_file_path=cache_file_path,
            task_names=['Solubility'],
            load=load,
            log_every=log_every,
            init_mask=False)

        if log_of_values:
            self.labels = self.labels.log()
Example #12
def download_and_load_checkpoint(model_name, model, model_postfix,
                                 local_pretrained_path='pre_trained.pth', log=True):
    """Download pretrained model checkpoint

    The model will be loaded to CPU.

    Parameters
    ----------
    model_name : str
        Name of the model
    model : nn.Module
        Instantiated model instance
    model_postfix : str
        Postfix for pretrained model checkpoint
    local_pretrained_path : str
        Local name for the downloaded model checkpoint
    log : bool
        Whether to print progress for model loading

    Returns
    -------
    model : nn.Module
        Pretrained model
    """
    url_to_pretrained = _get_dgl_url(model_postfix)
    local_pretrained_path = '_'.join([model_name, local_pretrained_path])
    download(url_to_pretrained, path=local_pretrained_path, log=log)
    checkpoint = torch.load(local_pretrained_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    if log:
        print('Pretrained model loaded')

    return model
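A hedged usage sketch for the helper above. The toy module and the checkpoint postfix are placeholders for illustration; they are not actual hosted dgllife checkpoints.

import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self):
        super(TinyModel, self).__init__()
        self.fc = nn.Linear(4, 1)

model = TinyModel()
# Would download '<DGL data URL>/path/to/checkpoint.pth' as 'TinyModel_pre_trained.pth'
# and load its 'model_state_dict' into the model on CPU (placeholder postfix).
model = download_and_load_checkpoint('TinyModel', model, 'path/to/checkpoint.pth')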
Example #13
    def __init__(self,
                 subset,
                 mol_to_graph=mol_to_bigraph,
                 node_featurizer=default_node_featurizer,
                 edge_featurizer=default_edge_featurizer,
                 atom_pair_featurizer=default_atom_pair_featurizer,
                 load=True):
        assert subset in ['train', 'val', 'test'], \
            'Expect subset to be "train" or "val" or "test", got {}'.format(subset)
        print('Preparing {} subset of USPTO'.format(subset))
        self._subset = subset
        if subset == 'val':
            subset = 'valid'

        self._url = 'dataset/uspto.zip'
        data_path = get_download_dir() + '/uspto.zip'
        extracted_data_path = get_download_dir() + '/uspto'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, extracted_data_path)

        super(USPTO, self).__init__(
            raw_file_path=extracted_data_path + '/{}.txt'.format(subset),
            mol_graph_path=extracted_data_path + '/{}_mol_graphs.bin'.format(subset),
            mol_to_graph=mol_to_graph,
            node_featurizer=node_featurizer,
            edge_featurizer=edge_featurizer,
            atom_pair_featurizer=atom_pair_featurizer,
            load=load)
Example #14
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=True,
                 log_every=1000,
                 cache_file_path='freesolv_dglgraph.bin'):

        self._url = 'dataset/FreeSolv.zip'
        data_path = get_download_dir() + '/FreeSolv.zip'
        dir_path = get_download_dir() + '/FreeSolv'
        download(_get_dgl_url(self._url), path=data_path)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/SAMPL.csv')

        # Iupac names
        self.iupac_names = df['iupac'].tolist()
        # Calculated hydration free energy
        self.calc_energy = df['calc'].tolist()

        self.load_full = False

        super(FreeSolv, self).__init__(df=df,
                                       smiles_to_graph=smiles_to_graph,
                                       node_featurizer=node_featurizer,
                                       edge_featurizer=edge_featurizer,
                                       smiles_column='smiles',
                                       cache_file_path=cache_file_path,
                                       task_names=['expt'],
                                       load=load,
                                       log_every=log_every,
                                       init_mask=False)
Example #15
    def __init__(self,
                 mode='dev',
                 mol_to_graph=mol_to_complete_graph,
                 node_featurizer=alchemy_nodes,
                 edge_featurizer=alchemy_edges,
                 load=True):
        if mode == 'test':
            raise ValueError('The test mode is not supported before '
                             'the Alchemy contest finishes.')

        assert mode in ['dev', 'valid', 'test'], \
            'Expect mode to be dev, valid or test, got {}.'.format(mode)

        self.mode = mode

        # Construct DGLGraphs from raw data or use the preprocessed data
        self.load = load
        file_dir = osp.join(get_download_dir(), 'Alchemy_data')

        if load:
            file_name = "{}_processed_dgl".format(mode)
        else:
            file_name = "{}_single_sdf".format(mode)
        self.file_dir = pathlib.Path(file_dir, file_name)

        self._url = 'dataset/alchemy/'
        self.zip_file_path = pathlib.Path(file_dir, file_name + '.zip')
        download(_get_dgl_url(self._url + file_name + '.zip'),
                 path=str(self.zip_file_path))
        if not os.path.exists(str(self.file_dir)):
            archive = zipfile.ZipFile(self.zip_file_path)
            archive.extractall(file_dir)
            archive.close()

        self._load(mol_to_graph, node_featurizer, edge_featurizer)
Example #16
    def __init__(self,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./bbbp_dglgraph.bin',
                 n_jobs=1):

        self._url = 'dataset/bbbp.zip'
        data_path = get_download_dir() + '/bbbp.zip'
        dir_path = get_download_dir() + '/bbbp'
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/BBBP.csv')

        super(BBBP, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['p_np'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=True,
                                   n_jobs=n_jobs)

        self.load_full = False
        self.names = df['name'].tolist()
        self.names = [self.names[i] for i in self.valid_ids]
Example #17
def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(
            np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
            train_mask, val_mask, test_mask
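A short usage sketch for the loader above (assuming the modules it relies on, e.g. dgl, torch, numpy and scipy.io, are imported):

# Unpack the heterogeneous ACM graph and the node-classification splits.
hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
    train_mask, val_mask, test_mask = load_acm_raw(remove_self_loop=False)
print(hg)
print('#papers:', hg.number_of_nodes('paper'), '#classes:', num_classes)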
Example #18
def test_jtvae():
    # Test DGLMolTree
    smiles = 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    tree = DGLMolTree(smiles)
    assert tree.treesize() == 17
    tree.assemble()
    assert tree._recover_node(0, tree.mol) == 'C[CH3:15]'
    tree.recover()

    # Test JTVAEDataset
    smiles = [
        'CCCCCCC1=NN2C(=N)/C(=C\c3cc(C)n(-c4ccc(C)cc4C)c3C)C(=O)N=C2S1',
        'COCC[C@@H](C)C(=O)N(C)Cc1ccc(O)cc1'
    ]
    with open('data.txt', 'w') as f:
        for smi in smiles:
            f.write(smi + '\n')

    default_dir = get_download_dir()
    vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
    zip_file_path = '{}/jtnn.zip'.format(default_dir)
    download(_get_dgl_url('dataset/jtnn.zip'),
             path=zip_file_path,
             overwrite=False)
    extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))

    with open(vocab_file, 'r') as f:
        vocab = Vocab([x.strip("\r\n ") for x in f])
    dataset = JTVAEDataset('data.txt', vocab)
    assert len(dataset) == 2
    assert set(dataset[0].keys()) == {
        'cand_graphs', 'mol_graph', 'mol_tree', 'stereo_cand_graphs',
        'stereo_cand_label', 'tree_mess_src_e', 'tree_mess_tgt_e',
        'tree_mess_tgt_n', 'wid'
    }
    dataset.training = False
    assert set(dataset[0].keys()) == {'mol_graph', 'mol_tree', 'wid'}

    dataset.training = True
    collate_fn = JTVAECollator(training=True)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {
            'cand_batch_idx', 'cand_graph_batch', 'mol_graph_batch',
            'mol_trees', 'stereo_cand_batch_idx', 'stereo_cand_graph_batch',
            'stereo_cand_labels', 'stereo_cand_lengths', 'tree_mess_src_e',
            'tree_mess_tgt_e', 'tree_mess_tgt_n'
        }

    dataset.training = False
    collate_fn = JTVAECollator(training=False)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {'mol_graph_batch', 'mol_trees'}

    remove_file('data.txt')
    remove_file(zip_file_path)
    remove_dir(default_dir + '/jtnn')
Example #19
    def __init__(self, smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None, edge_featurizer=None, load=True, log_every=1000):
        self._url = 'dataset/pubchem_bioassay_aromaticity.csv'
        data_path = get_download_dir() + '/pubchem_bioassay_aromaticity.csv'
        download(_get_dgl_url(self._url), path=data_path)
        df = pd.read_csv(data_path)

        super(PubChemBioAssayAromaticity, self).__init__(
            df, smiles_to_graph, node_featurizer, edge_featurizer, "cano_smiles",
            "pubchem_aromaticity_dglgraph.bin", load=load, log_every=log_every)
Example #20
def _download_babi_data():
    download_dir = get_download_dir()
    zip_file_path = os.path.join(download_dir, 'babi_data.zip')

    data_url = _get_dgl_url('models/ggnn_babi_data.zip')
    download(data_url, path=zip_file_path)

    extract_dir = os.path.join(download_dir, 'babi_data')
    if not os.path.exists(extract_dir):
        extract_archive(zip_file_path, extract_dir)
Example #21
    def __init__(self,
                 data_name,
                 raw_dir=None,
                 force_reload=False,
                 verbose=False):
        _url = _get_dgl_url(f"dataset/{data_name}.zip")
        super(ExtDataset, self).__init__(name=data_name,
                                         url=_url,
                                         raw_dir=raw_dir,
                                         force_reload=force_reload,
                                         verbose=verbose)
Example #22
def load_acm_raw():
    from dgl.data.utils import download, get_download_dir, _get_dgl_url
    from scipy import io as sio
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    pa = dgl.bipartite(p_vs_a, 'paper', 'pa', 'author')
    pl = dgl.bipartite(p_vs_l, 'paper', 'pf', 'field')
    gs = [pa, pl]
    hg = dgl.hetero_from_relations(gs)

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(
            np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    hg.nodes["paper"].data["feat"] = features

    return hg, labels, num_classes, train_idx, val_idx, test_idx
Example #23
def readFragmentScores(name='fpscores'):
    import gzip
    global _fscores
    fname = '{}.pkl.gz'.format(name)
    download(_get_dgl_url(os.path.join('dataset', fname)), path=fname)
    _fscores = cPickle.load(gzip.open(fname))
    outDict = {}
    for i in _fscores:
        for j in range(1, len(i)):
            outDict[i[j]] = float(i[0])
    _fscores = outDict
Example #24
    def __init__(self,
                 name,
                 raw_dir=None,
                 random_seed=717,
                 train_size=0.7,
                 val_size=0.1):
        assert name in ['gos', 'pol'], "Only supports 'gos' or 'pol'."
        self.seed = random_seed
        self.train_size = train_size
        self.val_size = val_size
        url = _get_dgl_url(self.file_urls[name])
        super(GASDataset, self).__init__(name=name, url=url, raw_dir=raw_dir)
Example #25
    def __init__(self,
                 label_keys=None,
                 raw_dir=None,
                 force_reload=False,
                 verbose=True):

        self.label_keys = label_keys
        self._url = _get_dgl_url('dataset/qm9_ver2.zip')
        super(QM9Dataset_v2, self).__init__(name='qm9_v2',
                                            url=self._url,
                                            raw_dir=raw_dir,
                                            force_reload=force_reload,
                                            verbose=verbose)
Example #26
    def __init__(self, mode='train', vocab_file=None):
        self.mode = mode
        self.dir = get_download_dir()
        self.zip_file_path = '{}/sst.zip'.format(self.dir)
        self.pretrained_file = 'glove.840B.300d.txt' if mode == 'train' else ''
        self.pretrained_emb = None
        self.vocab_file = '{}/sst/vocab.txt'.format(self.dir) if vocab_file is None else vocab_file
        download(_get_dgl_url(_urls['sst']), path=self.zip_file_path)
        extract_archive(self.zip_file_path, '{}/sst'.format(self.dir))
        self.trees = []
        self.num_classes = 5
        print('Preprocessing...')
        self._load()
        print('Dataset creation finished. #Trees:', len(self.trees))
Example #27
def test_acnn():
    remove_dir('tmp1')
    remove_dir('tmp2')

    url = _get_dgl_url('dgllife/example_mols.tar.gz')
    local_path = 'tmp1/example_mols.tar.gz'
    download(url, path=local_path)
    extract_archive(local_path, 'tmp2')

    pocket_mol, pocket_coords = load_molecule(
        'tmp2/example_mols/example.pdb', remove_hs=True)
    ligand_mol, ligand_coords = load_molecule(
        'tmp2/example_mols/example.pdbqt', remove_hs=True)

    remove_dir('tmp1')
    remove_dir('tmp2')

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    g1 = ACNN_graph_construction_and_featurization(ligand_mol,
                                                   pocket_mol,
                                                   ligand_coords,
                                                   pocket_coords)

    model = ACNN()
    model.to(device)
    g1.to(device)
    assert model(g1).shape == torch.Size([1, 1])

    bg = dgl.batch_hetero([g1, g1])
    bg.to(device)
    assert model(bg).shape == torch.Size([2, 1])

    model = ACNN(hidden_sizes=[1, 2],
                 weight_init_stddevs=[1, 1],
                 dropouts=[0.1, 0.],
                 features_to_use=torch.tensor([6., 8.]),
                 radial=[[12.0], [0.0, 2.0], [4.0]])
    model.to(device)
    g1.to(device)
    assert model(g1).shape == torch.Size([1, 1])

    bg = dgl.batch_hetero([g1, g1])
    bg.to(device)
    assert model(bg).shape == torch.Size([2, 1])
Example #28
def download_data(dataset, fname):
    """Download dataset if built-in support exists

    Parameters
    ----------
    dataset : str
        Dataset name
    fname : str
        Name of dataset file
    """
    if dataset not in ['ChEMBL', 'ZINC']:
        # For dataset without built-in support, they should be locally processed.
        return

    data_path = fname
    download(_get_dgl_url(os.path.join('dataset', fname)), path=data_path)
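A minimal usage sketch; the file names below are placeholders rather than the actual file names shipped with the built-in datasets.

# Downloads only for datasets with built-in support ('ChEMBL' or 'ZINC');
# any other dataset name returns immediately without downloading.
download_data('ZINC', 'zinc_subset.txt')      # placeholder file name
download_data('my_dataset', 'my_data.txt')    # no-op, handled locally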
Example #29
    def __init__(self, data, vocab, training=True):
        self.dir = get_download_dir()
        self.zip_file_path = '{}/jtnn.zip'.format(self.dir)

        download(_get_dgl_url('dgllife/jtnn.zip'), path=self.zip_file_path)
        extract_archive(self.zip_file_path, '{}/jtnn'.format(self.dir))
        print('Loading data...')
        data_file = '{}/jtnn/{}.txt'.format(self.dir, data)
        with open(data_file) as f:
            self.data = [line.strip("\r\n ").split()[0] for line in f]
        self.vocab_file = '{}/jtnn/{}.txt'.format(self.dir, vocab)
        print('Loading finished.')
        print('\tNum samples:', len(self.data))
        print('\tVocab file:', self.vocab_file)
        self.training = training
        self.vocab = Vocab([x.strip("\r\n ") for x in open(self.vocab_file)])
Example #30
def load_acm(remove_self_loop):
    filename = 'ACM3025.pkl'
    url = 'dataset/' + filename
    data_path = get_download_dir() + '/' + filename
    if osp.exists(data_path):
        print(f'Using existing file {filename}', file=sys.stderr)
    else:
        download(_get_dgl_url(url), path=data_path)

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels, features = torch.from_numpy(data['label'].todense()).long(), \
                       torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]

    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.from_scipy(data['PAP'])
    subject_g = dgl.from_scipy(data['PLP'])
    gs = [author_g, subject_g]

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask