def build_graph(directory, random_seed, ctx):
    # dgl.load_backend('mxnet')
    import dgl
    from mxnet import ndarray as nd

    ID, IM = load_data(directory)
    samples = sample(directory, random_seed)

    print('Building graph ...')
    g = dgl.DGLGraph(multigraph=True)
    g.add_nodes(ID.shape[0] + IM.shape[0])

    # node type: 1 for disease vertices, 0 for miRNA vertices
    node_type = nd.zeros(g.number_of_nodes(), dtype='float32', ctx=ctx)
    node_type[:ID.shape[0]] = 1
    g.ndata['type'] = node_type

    print('Adding disease features ...')
    d_data = nd.zeros(shape=(g.number_of_nodes(), ID.shape[1]),
                      dtype='float32', ctx=ctx)
    d_data[:ID.shape[0], :] = nd.from_numpy(ID)
    g.ndata['d_features'] = d_data

    print('Adding miRNA features ...')
    m_data = nd.zeros(shape=(g.number_of_nodes(), IM.shape[1]),
                      dtype='float32', ctx=ctx)
    m_data[ID.shape[0]:ID.shape[0] + IM.shape[0], :] = nd.from_numpy(IM)
    g.ndata['m_features'] = m_data

    print('Adding edges ...')
    # raw ids are 1-based; map them to 0-based vertex indices
    disease_ids = list(range(1, ID.shape[0] + 1))
    mirna_ids = list(range(1, IM.shape[0] + 1))
    disease_ids_invmap = {id_: i for i, id_ in enumerate(disease_ids)}
    mirna_ids_invmap = {id_: i for i, id_ in enumerate(mirna_ids)}

    sample_disease_vertices = [disease_ids_invmap[id_] for id_ in samples[:, 1]]
    # miRNA vertices are laid out after the disease vertices
    sample_mirna_vertices = [mirna_ids_invmap[id_] + ID.shape[0]
                             for id_ in samples[:, 0]]

    # add both edge directions, carrying the association label as 'rating'
    g.add_edges(sample_disease_vertices, sample_mirna_vertices,
                data={'inv': nd.zeros(samples.shape[0], dtype='int32', ctx=ctx),
                      'rating': nd.from_numpy(samples[:, 2].astype('float32')).copyto(ctx)})
    g.add_edges(sample_mirna_vertices, sample_disease_vertices,
                data={'inv': nd.zeros(samples.shape[0], dtype='int32', ctx=ctx),
                      'rating': nd.from_numpy(samples[:, 2].astype('float32')).copyto(ctx)})
    g.readonly()
    print('Successfully built graph!')
    return g, disease_ids_invmap, mirna_ids_invmap
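# Minimal usage sketch for build_graph (a hedged example: the directory
# path and seed are hypothetical, and load_data()/sample() are the helper
# functions this module already relies on):
#
#     import mxnet as mx
#     ctx = mx.cpu()
#     g, disease_invmap, mirna_invmap = build_graph('./data',
#                                                   random_seed=1234, ctx=ctx)
#     print('nodes:', g.number_of_nodes(), 'edges:', g.number_of_edges())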
def generate_mask(self):
    from mxnet import ndarray as nd
    valid_tensor = nd.from_numpy(self.ratings['valid'].values.astype('int32'))
    test_tensor = nd.from_numpy(self.ratings['test'].values.astype('int32'))
    train_tensor = nd.from_numpy(self.ratings['train'].values.astype('int32'))
    edge_data = {
        'valid': valid_tensor,
        'test': test_tensor,
        'train': train_tensor,
    }
    self.g.edges[self.rating_user_vertices,
                 self.rating_product_vertices].data.update(edge_data)
    self.g.edges[self.rating_product_vertices,
                 self.rating_user_vertices].data.update(edge_data)
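# Sketch of one way the 'train'/'valid'/'test' indicator columns consumed
# by generate_mask() could be produced on the ratings DataFrame. The split
# fractions, seed, and function name are assumptions for illustration, not
# part of the original class.
import numpy as np

def split_ratings(ratings, valid_frac=0.1, test_frac=0.1, seed=0):
    """Randomly assign each rating row to exactly one of train/valid/test."""
    draw = np.random.RandomState(seed).rand(len(ratings))
    ratings['valid'] = (draw < valid_frac).astype('int32')
    ratings['test'] = ((draw >= valid_frac) &
                       (draw < valid_frac + test_frac)).astype('int32')
    ratings['train'] = (draw >= valid_frac + test_frac).astype('int32')
    return ratings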
def build_graph(self):
    import mxnet as mx
    from mxnet import ndarray as nd
    from mxnet import gluon, autograd
    import dgl

    user_ids = list(self.users.index)
    product_ids = list(self.products.index)
    user_ids_invmap = {id_: i for i, id_ in enumerate(user_ids)}
    product_ids_invmap = {id_: i for i, id_ in enumerate(product_ids)}
    self.user_ids = user_ids
    self.product_ids = product_ids
    self.user_ids_invmap = user_ids_invmap
    self.product_ids_invmap = product_ids_invmap

    g = dgl.DGLGraph(multigraph=True)
    g.add_nodes(len(user_ids) + len(product_ids))

    # node type
    node_type = nd.zeros(g.number_of_nodes(), dtype='float32')
    node_type[:len(user_ids)] = 1
    g.ndata['type'] = node_type

    # user features
    print('Adding user features...')
    for user_column in self.users.columns:
        udata = nd.zeros(g.number_of_nodes(), dtype='int64')  # 0 for padding
        udata[:len(user_ids)] = \
            nd.from_numpy(self.users[user_column].cat.codes.values.astype('int64') + 1)
        g.ndata[user_column] = udata

    # product genre
    print('Adding product features...')
    product_genres = nd.from_numpy(
        self.products[self.genres].values.copy().astype('float32'))
    g.ndata['genre'] = nd.zeros((g.number_of_nodes(), len(self.genres)))
    g.ndata['genre'][len(user_ids):len(user_ids) + len(product_ids)] = product_genres

    # product year
    if 'year' in self.products.columns:
        g.ndata['year'] = nd.zeros(g.number_of_nodes(), dtype='int64')  # 0 for padding
        g.ndata['year'][len(user_ids):len(user_ids) + len(product_ids)] = \
            nd.from_numpy(self.products['year'].cat.codes.values.astype('int64') + 1)

    '''
    # product title
    print('Parsing title...')
    nlp = stanfordnlp.Pipeline(use_gpu=False, processors='tokenize,lemma')
    vocab = set()
    title_words = []
    for t in tqdm.tqdm(self.products['title'].values):
        doc = nlp(t)
        words = set()
        for s in doc.sentences:
            words.update(w.lemma.lower() for w in s.words
                         if not re.fullmatch(r'[' + string.punctuation + ']+', w.lemma))
        vocab.update(words)
        title_words.append(words)
    vocab = list(vocab)
    vocab_invmap = {w: i for i, w in enumerate(vocab)}
    # bag-of-words
    g.ndata['title'] = nd.zeros((g.number_of_nodes(), len(vocab)))
    for i, tw in enumerate(tqdm.tqdm(title_words)):
        g.ndata['title'][len(user_ids) + i, [vocab_invmap[w] for w in tw]] = 1
    self.vocab = vocab
    self.vocab_invmap = vocab_invmap
    '''

    rating_user_vertices = [user_ids_invmap[id_]
                            for id_ in self.ratings['user_id'].values]
    rating_product_vertices = [product_ids_invmap[id_] + len(user_ids)
                               for id_ in self.ratings['product_id'].values]
    self.rating_user_vertices = rating_user_vertices
    self.rating_product_vertices = rating_product_vertices

    g.add_edges(
        rating_user_vertices,
        rating_product_vertices,
        data={'inv': nd.zeros(self.ratings.shape[0], dtype='int32'),
              'rating': nd.from_numpy(self.ratings['rating'].values.astype('float32'))})
    g.add_edges(
        rating_product_vertices,
        rating_user_vertices,
        data={'inv': nd.ones(self.ratings.shape[0], dtype='int32'),
              'rating': nd.from_numpy(self.ratings['rating'].values.astype('float32'))})
    self.g = g
    g.readonly()
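# Usage sketch (hedged: `ml` names a hypothetical instance of the
# surrounding dataset class with users/products/ratings already loaded).
# Products are laid out after users in the vertex space, so a product's
# node features live at product_ids_invmap[pid] + len(user_ids):
#
#     ml.build_graph()
#     pid = ml.product_ids[0]
#     vid = ml.product_ids_invmap[pid] + len(ml.user_ids)
#     genre_vec = ml.g.ndata['genre'][vid]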
def Train(directory, epochs, aggregator, embedding_size, layers, dropout,
          slope, lr, wd, random_seed, ctx):
    import math
    import random
    import time
    import dgl
    import mxnet as mx
    import numpy as np
    import pandas as pd
    from mxnet import gluon, nd
    from mxnet.gluon import loss as gloss
    from sklearn import metrics
    from sklearn.model_selection import KFold

    dgl.load_backend('mxnet')
    random.seed(random_seed)
    np.random.seed(random_seed)
    mx.random.seed(random_seed)

    g, disease_ids_invmap, mirna_ids_invmap = build_graph(
        directory, random_seed=random_seed, ctx=ctx)
    samples = sample(directory, random_seed=random_seed)
    ID, IM = load_data(directory)

    print('## vertices:', g.number_of_nodes())
    print('## edges:', g.number_of_edges())
    print('## disease nodes:', nd.sum(g.ndata['type'] == 1).asnumpy())
    print('## mirna nodes:', nd.sum(g.ndata['type'] == 0).asnumpy())

    samples_df = pd.DataFrame(samples, columns=['miRNA', 'disease', 'label'])
    sample_disease_vertices = [disease_ids_invmap[id_] for id_ in samples[:, 1]]
    sample_mirna_vertices = [mirna_ids_invmap[id_] + ID.shape[0]
                             for id_ in samples[:, 0]]

    # 5-fold cross-validation over the association samples
    kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)
    train_index = []
    test_index = []
    for train_idx, test_idx in kf.split(samples[:, 2]):
        train_index.append(train_idx)
        test_index.append(test_idx)

    auc_result = []
    acc_result = []
    pre_result = []
    recall_result = []
    f1_result = []
    fprs = []
    tprs = []

    for i in range(len(train_index)):
        print('-' * 100)
        print('Training for Fold', i + 1)

        # mark each sample's edges as train or test for this fold
        # (.loc avoids chained assignment; the DataFrame has a default RangeIndex)
        samples_df['train'] = 0
        samples_df['test'] = 0
        samples_df.loc[train_index[i], 'train'] = 1
        samples_df.loc[test_index[i], 'test'] = 1

        train_tensor = nd.from_numpy(
            samples_df['train'].values.astype('int32')).copyto(ctx)
        test_tensor = nd.from_numpy(
            samples_df['test'].values.astype('int32')).copyto(ctx)

        edge_data = {'train': train_tensor, 'test': test_tensor}
        g.edges[sample_disease_vertices,
                sample_mirna_vertices].data.update(edge_data)
        g.edges[sample_mirna_vertices,
                sample_disease_vertices].data.update(edge_data)

        # build the training subgraph from the edges flagged 'train'
        train_eid = g.filter_edges(lambda edges: edges.data['train']).astype('int64')
        g_train = g.edge_subgraph(train_eid, preserve_nodes=True)
        g_train.copy_from_parent()

        # get the training set
        rating_train = g_train.edata['rating']
        src_train, dst_train = g_train.all_edges()

        # get the testing edge set
        test_eid = g.filter_edges(lambda edges: edges.data['test']).astype('int64')
        src_test, dst_test = g.find_edges(test_eid)
        rating_test = g.edges[test_eid].data['rating']

        src_train = src_train.copyto(ctx)
        src_test = src_test.copyto(ctx)
        dst_train = dst_train.copyto(ctx)
        dst_test = dst_test.copyto(ctx)
        print('## Training edges:', len(train_eid))
        print('## Testing edges:', len(test_eid))

        # Train the model
        model = GNNMDA(
            GraphEncoder(embedding_size=embedding_size, n_layers=layers,
                         G=g_train, aggregator=aggregator, dropout=dropout,
                         slope=slope, ctx=ctx),
            BilinearDecoder(feature_size=embedding_size))
        model.collect_params().initialize(
            init=mx.init.Xavier(magnitude=math.sqrt(2.0)), ctx=ctx)
        cross_entropy = gloss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
        trainer = gluon.Trainer(model.collect_params(), 'adam',
                                {'learning_rate': lr, 'wd': wd})

        for epoch in range(epochs):
            start = time.time()
            # 10 gradient steps per epoch
            for _ in range(10):
                with mx.autograd.record():
                    score_train = model(g_train, src_train, dst_train)
                    loss_train = cross_entropy(score_train, rating_train).mean()
                    loss_train.backward()
                trainer.step(1)

            # validate on the held-out edges of this fold
            h_val = model.encoder(g)
            score_val = model.decoder(h_val[src_test], h_val[dst_test])
            loss_val = cross_entropy(score_val, rating_test).mean()

            train_auc = metrics.roc_auc_score(
                np.squeeze(rating_train.asnumpy()),
                np.squeeze(score_train.asnumpy()))
            val_auc = metrics.roc_auc_score(
                np.squeeze(rating_test.asnumpy()),
                np.squeeze(score_val.asnumpy()))
            results_val = [0 if j < 0.5 else 1
                           for j in np.squeeze(score_val.asnumpy())]
            accuracy_val = metrics.accuracy_score(rating_test.asnumpy(), results_val)
            precision_val = metrics.precision_score(rating_test.asnumpy(), results_val)
            recall_val = metrics.recall_score(rating_test.asnumpy(), results_val)
            f1_val = metrics.f1_score(rating_test.asnumpy(), results_val)
            end = time.time()

            print('Epoch:', epoch + 1,
                  'Train Loss: %.4f' % loss_train.asscalar(),
                  'Val Loss: %.4f' % loss_val.asscalar(),
                  'Acc: %.4f' % accuracy_val,
                  'Pre: %.4f' % precision_val,
                  'Recall: %.4f' % recall_val,
                  'F1: %.4f' % f1_val,
                  'Train AUC: %.4f' % train_auc,
                  'Val AUC: %.4f' % val_auc,
                  'Time: %.2f' % (end - start))

        # final evaluation on the held-out fold
        h_test = model.encoder(g)
        score_test = model.decoder(h_test[src_test], h_test[dst_test])
        # loss_test = cross_entropy(score_test, rating_test).mean()
        fpr, tpr, thresholds = metrics.roc_curve(
            np.squeeze(rating_test.asnumpy()),
            np.squeeze(score_test.asnumpy()))
        test_auc = metrics.auc(fpr, tpr)

        results_test = [0 if j < 0.5 else 1
                        for j in np.squeeze(score_test.asnumpy())]
        accuracy_test = metrics.accuracy_score(rating_test.asnumpy(), results_test)
        precision_test = metrics.precision_score(rating_test.asnumpy(), results_test)
        recall_test = metrics.recall_score(rating_test.asnumpy(), results_test)
        f1_test = metrics.f1_score(rating_test.asnumpy(), results_test)

        print('Fold:', i + 1,
              'Test Acc: %.4f' % accuracy_test,
              'Test Pre: %.4f' % precision_test,
              'Test Recall: %.4f' % recall_test,
              'Test F1: %.4f' % f1_test,
              'Test AUC: %.4f' % test_auc)

        auc_result.append(test_auc)
        acc_result.append(accuracy_test)
        pre_result.append(precision_test)
        recall_result.append(recall_test)
        f1_result.append(f1_test)
        fprs.append(fpr)
        tprs.append(tpr)

    print('## Training Finished!')
    print('-' * 100)
    return auc_result, acc_result, pre_result, recall_result, f1_result, fprs, tprs
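# A minimal driver sketch for Train. The hyperparameter values, data
# directory, and aggregator name below are illustrative assumptions, not
# the authors' reported settings.
if __name__ == '__main__':
    import mxnet as mx
    import numpy as np

    auc, acc, pre, rec, f1, fprs, tprs = Train(
        directory='./data', epochs=30, aggregator='GraphSAGE',
        embedding_size=64, layers=2, dropout=0.5, slope=0.2,
        lr=0.001, wd=1e-3, random_seed=1234, ctx=mx.cpu())
    print('5-fold mean test AUC: %.4f +/- %.4f' % (np.mean(auc), np.std(auc)))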