Example #1
import dgl
from mxnet import ndarray as nd


def build_graph(directory, random_seed, ctx):
    # load_data and sample are helpers defined elsewhere in this project.
    ID, IM = load_data(directory)
    samples = sample(directory, random_seed)

    print('Building graph ...')
    g = dgl.DGLGraph(multigraph=True)
    g.add_nodes(ID.shape[0] + IM.shape[0])
    # disease nodes get type 1; miRNA nodes keep the default 0
    node_type = nd.zeros(g.number_of_nodes(), dtype='float32', ctx=ctx)
    node_type[:ID.shape[0]] = 1
    g.ndata['type'] = node_type

    print('Adding disease features ...')
    d_data = nd.zeros(shape=(g.number_of_nodes(), ID.shape[1]),
                      dtype='float32',
                      ctx=ctx)
    d_data[:ID.shape[0], :] = nd.from_numpy(ID)
    g.ndata['d_features'] = d_data

    print('Adding miRNA features ...')
    m_data = nd.zeros(shape=(g.number_of_nodes(), IM.shape[1]),
                      dtype='float32',
                      ctx=ctx)
    m_data[ID.shape[0]:ID.shape[0] + IM.shape[0], :] = nd.from_numpy(IM)
    g.ndata['m_features'] = m_data

    print('Adding edges ...')
    disease_ids = list(range(1, ID.shape[0] + 1))
    mirna_ids = list(range(1, IM.shape[0] + 1))

    disease_ids_invmap = {id_: i for i, id_ in enumerate(disease_ids)}
    mirna_ids_invmap = {id_: i for i, id_ in enumerate(mirna_ids)}

    sample_disease_vertices = [
        disease_ids_invmap[id_] for id_ in samples[:, 1]
    ]
    sample_mirna_vertices = [
        mirna_ids_invmap[id_] + ID.shape[0] for id_ in samples[:, 0]
    ]

    g.add_edges(sample_disease_vertices,
                sample_mirna_vertices,
                data={
                    'inv':
                    nd.zeros(samples.shape[0], dtype='int32', ctx=ctx),
                    'rating':
                    nd.from_numpy(samples[:, 2].astype('float32')).copyto(ctx)
                })
    g.add_edges(sample_mirna_vertices,
                sample_disease_vertices,
                data={
                    'inv':
                    nd.zeros(samples.shape[0], dtype='int32', ctx=ctx),
                    'rating':
                    nd.from_numpy(samples[:, 2].astype('float32')).copyto(ctx)
                })
    g.readonly()
    print('Successfully built graph!')

    return g, disease_ids_invmap, mirna_ids_invmap
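
A minimal usage sketch for the function above (the data directory and seed are
placeholders, and load_data / sample are assumed importable from the same
project):

import mxnet as mx

ctx = mx.cpu()  # or mx.gpu(0) when a GPU is available
g, disease_ids_invmap, mirna_ids_invmap = build_graph(
    './data', random_seed=1234, ctx=ctx)
print(g.number_of_nodes(), g.number_of_edges())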
Example #2
    def generate_mask(self):
        from mxnet import ndarray as nd
        valid_tensor = nd.from_numpy(
            self.ratings['valid'].values.astype('int32'))
        test_tensor = nd.from_numpy(
            self.ratings['test'].values.astype('int32'))
        train_tensor = nd.from_numpy(
            self.ratings['train'].values.astype('int32'))
        edge_data = {
            'valid': valid_tensor,
            'test': test_tensor,
            'train': train_tensor,
        }

        # write the split masks onto both directions of every rating edge
        self.g.edges[self.rating_user_vertices,
                     self.rating_product_vertices].data.update(edge_data)
        self.g.edges[self.rating_product_vertices,
                     self.rating_user_vertices].data.update(edge_data)
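
For context, a sketch of how these masks are typically consumed downstream,
mirroring the pattern used in Example #4 (the old DGLGraph filter_edges /
edge_subgraph API); this would run inside the same class:

train_eid = self.g.filter_edges(
    lambda edges: edges.data['train']).astype('int64')
g_train = self.g.edge_subgraph(train_eid, preserve_nodes=True)
g_train.copy_from_parent()  # pull node and edge features into the subgraph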
Example #3
    def build_graph(self):
        from mxnet import ndarray as nd
        import dgl

        user_ids = list(self.users.index)
        product_ids = list(self.products.index)
        user_ids_invmap = {id_: i for i, id_ in enumerate(user_ids)}
        product_ids_invmap = {id_: i for i, id_ in enumerate(product_ids)}
        self.user_ids = user_ids
        self.product_ids = product_ids
        self.user_ids_invmap = user_ids_invmap
        self.product_ids_invmap = product_ids_invmap

        g = dgl.DGLGraph(multigraph=True)
        g.add_nodes(len(user_ids) + len(product_ids))

        # node type
        node_type = nd.zeros(g.number_of_nodes(), dtype='float32')
        node_type[:len(user_ids)] = 1
        g.ndata['type'] = node_type

        # user features
        print('Adding user features...')
        for user_column in self.users.columns:
            udata = nd.zeros(g.number_of_nodes(), dtype='int64')
            # 0 for padding
            udata[:len(user_ids)] = \
                    nd.from_numpy(self.users[user_column].cat.codes.values.astype('int64') + 1)
            g.ndata[user_column] = udata

        # product genre
        print('Adding product features...')
        product_genres = nd.from_numpy(
            self.products[self.genres].values.copy().astype('float32'))
        g.ndata['genre'] = nd.zeros((g.number_of_nodes(), len(self.genres)))
        g.ndata['genre'][len(user_ids):len(user_ids) +
                         len(product_ids)] = product_genres

        # product year
        if 'year' in self.products.columns:
            g.ndata['year'] = nd.zeros(g.number_of_nodes(), dtype='int64')
            # 0 for padding
            g.ndata['year'][len(user_ids):len(user_ids) + len(product_ids)] = \
                    nd.from_numpy(self.products['year'].cat.codes.values.astype('int64') + 1)
        # The block below is disabled: optional bag-of-words title features
        # (it would additionally require stanfordnlp, tqdm, re and string).
        '''
        # product title
        print('Parsing title...')
        nlp = stanfordnlp.Pipeline(use_gpu=False, processors='tokenize,lemma')
        vocab = set()
        title_words = []
        for t in tqdm.tqdm(self.products['title'].values):
            doc = nlp(t)
            words = set()
            for s in doc.sentences:
                words.update(w.lemma.lower() for w in s.words
                             if not re.fullmatch(r'['+string.punctuation+']+', w.lemma))
            vocab.update(words)
            title_words.append(words)
        vocab = list(vocab)
        vocab_invmap = {w: i for i, w in enumerate(vocab)}
        # bag-of-words
        g.ndata['title'] = nd.zeros((g.number_of_nodes(), len(vocab)))
        for i, tw in enumerate(tqdm.tqdm(title_words)):
            g.ndata['title'][len(user_ids) + i, [vocab_invmap[w] for w in tw]] = 1
        self.vocab = vocab
        self.vocab_invmap = vocab_invmap
        '''

        rating_user_vertices = [
            user_ids_invmap[id_] for id_ in self.ratings['user_id'].values
        ]
        rating_product_vertices = [
            product_ids_invmap[id_] + len(user_ids)
            for id_ in self.ratings['product_id'].values
        ]
        self.rating_user_vertices = rating_user_vertices
        self.rating_product_vertices = rating_product_vertices

        g.add_edges(rating_user_vertices,
                    rating_product_vertices,
                    data={
                        'inv':
                        nd.zeros(self.ratings.shape[0], dtype='int32'),
                        'rating':
                        nd.from_numpy(
                            self.ratings['rating'].values.astype('float32'))
                    })
        g.add_edges(rating_product_vertices,
                    rating_user_vertices,
                    data={
                        'inv':
                        nd.ones(self.ratings.shape[0], dtype='int32'),
                        'rating':
                        nd.from_numpy(
                            self.ratings['rating'].values.astype('float32'))
                    })
        self.g = g
        g.readonly()
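
Note the 'inv' edge flag above: forward (user-to-product) edges carry 0 and
reverse edges carry 1, which lets a model treat the two directions of the same
rating differently. A small sketch of inspecting it once the graph is built:

inv = self.g.edata['inv']
print('forward edges:', int((inv == 0).sum().asscalar()))
print('reverse edges:', int((inv == 1).sum().asscalar()))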
Example #4
import math
import random
import time

import dgl
import mxnet as mx
import numpy as np
import pandas as pd
from mxnet import gluon, nd
from mxnet.gluon import loss as gloss
from sklearn import metrics
from sklearn.model_selection import KFold

# GNNMDA, GraphEncoder, BilinearDecoder, build_graph, sample and load_data
# are defined elsewhere in this project (see Example #1 for build_graph).


def Train(directory, epochs, aggregator, embedding_size, layers, dropout,
          slope, lr, wd, random_seed, ctx):
    dgl.load_backend('mxnet')
    random.seed(random_seed)
    np.random.seed(random_seed)
    mx.random.seed(random_seed)

    g, disease_ids_invmap, mirna_ids_invmap = build_graph(
        directory, random_seed=random_seed, ctx=ctx)
    samples = sample(directory, random_seed=random_seed)
    ID, IM = load_data(directory)

    print('## vertices:', g.number_of_nodes())
    print('## edges:', g.number_of_edges())
    print('## disease nodes:', nd.sum(g.ndata['type'] == 1).asnumpy())
    print('## mirna nodes:', nd.sum(g.ndata['type'] == 0).asnumpy())

    samples_df = pd.DataFrame(samples, columns=['miRNA', 'disease', 'label'])
    sample_disease_vertices = [
        disease_ids_invmap[id_] for id_ in samples[:, 1]
    ]
    sample_mirna_vertices = [
        mirna_ids_invmap[id_] + ID.shape[0] for id_ in samples[:, 0]
    ]

    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
    train_index = []
    test_index = []
    for train_idx, test_idx in kf.split(samples[:, 2]):
        train_index.append(train_idx)
        test_index.append(test_idx)

    auc_result = []
    acc_result = []
    pre_result = []
    recall_result = []
    f1_result = []

    fprs = []
    tprs = []

    for i in range(len(train_index)):
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print('Training for Fold ', i + 1)

        samples_df['train'] = 0
        samples_df['test'] = 0

        # use .loc to avoid pandas chained-assignment warnings
        samples_df.loc[train_index[i], 'train'] = 1
        samples_df.loc[test_index[i], 'test'] = 1

        train_tensor = nd.from_numpy(
            samples_df['train'].values.astype('int32')).copyto(ctx)
        test_tensor = nd.from_numpy(
            samples_df['test'].values.astype('int32')).copyto(ctx)

        edge_data = {'train': train_tensor, 'test': test_tensor}

        g.edges[sample_disease_vertices,
                sample_mirna_vertices].data.update(edge_data)
        g.edges[sample_mirna_vertices,
                sample_disease_vertices].data.update(edge_data)

        train_eid = g.filter_edges(lambda edges: edges.data['train']).astype(
            'int64')
        g_train = g.edge_subgraph(train_eid, preserve_nodes=True)
        g_train.copy_from_parent()

        # get the training set
        rating_train = g_train.edata['rating']
        src_train, dst_train = g_train.all_edges()
        # get the testing edge set
        test_eid = g.filter_edges(lambda edges: edges.data['test']).astype(
            'int64')
        src_test, dst_test = g.find_edges(test_eid)
        rating_test = g.edges[test_eid].data['rating']
        src_train = src_train.copyto(ctx)
        src_test = src_test.copyto(ctx)
        dst_train = dst_train.copyto(ctx)
        dst_test = dst_test.copyto(ctx)
        print('## Training edges:', len(train_eid))
        print('## Testing edges:', len(test_eid))

        # Train the model
        model = GNNMDA(
            GraphEncoder(embedding_size=embedding_size,
                         n_layers=layers,
                         G=g_train,
                         aggregator=aggregator,
                         dropout=dropout,
                         slope=slope,
                         ctx=ctx),
            BilinearDecoder(feature_size=embedding_size))

        model.collect_params().initialize(
            init=mx.init.Xavier(magnitude=math.sqrt(2.0)), ctx=ctx)
        cross_entropy = gloss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
        trainer = gluon.Trainer(model.collect_params(), 'adam', {
            'learning_rate': lr,
            'wd': wd
        })

        for epoch in range(epochs):
            start = time.time()
            for _ in range(10):
                with mx.autograd.record():
                    score_train = model(g_train, src_train, dst_train)
                    loss_train = cross_entropy(score_train,
                                               rating_train).mean()
                    loss_train.backward()
                trainer.step(1)

            h_val = model.encoder(g)
            score_val = model.decoder(h_val[src_test], h_val[dst_test])
            loss_val = cross_entropy(score_val, rating_test).mean()

            train_auc = metrics.roc_auc_score(
                np.squeeze(rating_train.asnumpy()),
                np.squeeze(score_train.asnumpy()))
            val_auc = metrics.roc_auc_score(np.squeeze(rating_test.asnumpy()),
                                            np.squeeze(score_val.asnumpy()))

            results_val = [
                0 if j < 0.5 else 1 for j in np.squeeze(score_val.asnumpy())
            ]
            accuracy_val = metrics.accuracy_score(rating_test.asnumpy(),
                                                  results_val)
            precision_val = metrics.precision_score(rating_test.asnumpy(),
                                                    results_val)
            recall_val = metrics.recall_score(rating_test.asnumpy(),
                                              results_val)
            f1_val = metrics.f1_score(rating_test.asnumpy(), results_val)

            end = time.time()

            print('Epoch:', epoch + 1,
                  'Train Loss: %.4f' % loss_train.asscalar(),
                  'Val Loss: %.4f' % loss_val.asscalar(),
                  'Acc: %.4f' % accuracy_val, 'Pre: %.4f' % precision_val,
                  'Recall: %.4f' % recall_val, 'F1: %.4f' % f1_val,
                  'Train AUC: %.4f' % train_auc, 'Val AUC: %.4f' % val_auc,
                  'Time: %.2f' % (end - start))

        h_test = model.encoder(g)
        score_test = model.decoder(h_test[src_test], h_test[dst_test])
        # loss_test = cross_entropy(score_test, rating_test).mean()

        fpr, tpr, thresholds = metrics.roc_curve(
            np.squeeze(rating_test.asnumpy()),
            np.squeeze(score_test.asnumpy()))
        test_auc = metrics.auc(fpr, tpr)

        results_test = [
            0 if j < 0.5 else 1 for j in np.squeeze(score_test.asnumpy())
        ]
        accuracy_test = metrics.accuracy_score(rating_test.asnumpy(),
                                               results_test)
        precision_test = metrics.precision_score(rating_test.asnumpy(),
                                                 results_test)
        recall_test = metrics.recall_score(rating_test.asnumpy(), results_test)
        f1_test = metrics.f1_score(rating_test.asnumpy(), results_test)

        print('Fold:', i + 1, 'Test Acc: %.4f' % accuracy_test,
              'Test Pre: %.4f' % precision_test,
              'Test Recall: %.4f' % recall_test, 'Test F1: %.4f' % f1_test,
              'Test AUC: %.4f' % test_auc)

        auc_result.append(test_auc)
        acc_result.append(accuracy_test)
        pre_result.append(precision_test)
        recall_result.append(recall_test)
        f1_result.append(f1_test)

        fprs.append(fpr)
        tprs.append(tpr)

    print('## Training Finished!')
    print(
        '----------------------------------------------------------------------------------------------------------'
    )

    return auc_result, acc_result, pre_result, recall_result, f1_result, fprs, tprs
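
Since Train returns per-fold metrics, callers usually average them over the
five folds. A minimal sketch with illustrative hyperparameters (the values
below are placeholders, not the original settings):

import mxnet as mx
import numpy as np

auc, acc, pre, rec, f1, fprs, tprs = Train(
    './data', epochs=30, aggregator='GraphSAGE', embedding_size=64, layers=2,
    dropout=0.5, slope=0.2, lr=1e-3, wd=1e-4, random_seed=1234, ctx=mx.cpu())
print('AUC: %.4f +/- %.4f' % (np.mean(auc), np.std(auc)))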