import os
import time

import pandas as pd

# DataLoader, the models (GCN, EvolveGCN, CTGCN, MLPClassifier), the losses (UnsupervisedLoss,
# SupervisedLoss) and the trainers (UnsupervisedEmbedding, SupervisedEmbedding) are assumed to be
# imported from the surrounding project; their exact module paths are not shown in this section.


def evolvegcn_embedding(dataset, learning_type='unsupervise'):
    """Train EvolveGCN-H embeddings on the snapshots of `dataset` and export them."""
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../data/' + dataset + '/CTGCN'))
    origin_folder = os.path.join('..', '1.format')
    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    embedding_folder = os.path.join('..', '2.embedding/EvolveGCNH')
    node_file = os.path.join('..', 'nodes_set/nodes.csv')
    duration = 15
    max_time_num = len(os.listdir(origin_base_path))
    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    data_loader = DataLoader(nodes_set['node'].tolist(), max_time_num)
    t1 = time.time()
    print('start EvolveGCN embedding!')
    if learning_type == 'unsupervise':
        # unsupervised training: negative-sampling loss over random-walk co-occurrence pairs
        walk_pair_folder = 'evolvegcn_walk_pairs'
        node_freq_folder = 'evolvegcn_node_freq'
        walk_pair_base_path = os.path.abspath(os.path.join(base_path, walk_pair_folder))
        node_freq_base_path = os.path.abspath(os.path.join(base_path, node_freq_folder))
        for idx in range(0, max_time_num, duration):
            print('idx = ', idx)
            adj_list = data_loader.get_date_adj_list(origin_base_path, start_idx=idx, duration=duration)
            x_list, max_degree, _ = data_loader.get_degree_feature_list(origin_base_path, start_idx=idx, duration=duration)
            node_pair_list = data_loader.get_node_pair_list(walk_pair_base_path, start_idx=idx, duration=duration)
            neg_freq_list = data_loader.get_neg_freq_list(node_freq_base_path, start_idx=idx, duration=duration)
            evolvegcn_model = EvolveGCN(input_dim=max_degree, hidden_dim=128, output_dim=128,
                                        duration=duration, egcn_type='EGCNH')
            evolvegcn_loss = UnsupervisedLoss(neg_num=20, Q=20, node_pair_list=node_pair_list,
                                              neg_freq_list=neg_freq_list)
            evolvegcn = UnsupervisedEmbedding(base_path=base_path, origin_folder=origin_folder,
                                              embedding_folder=embedding_folder,
                                              node_list=nodes_set['node'].tolist(),
                                              model=evolvegcn_model, loss=evolvegcn_loss,
                                              max_time_num=max_time_num)
            evolvegcn.learn_embedding(adj_list, x_list, epoch=5, batch_size=4096 * 8, lr=0.001, start_idx=idx,
                                      weight_decay=5e-4, model_file='evolvegcnh', export=True)
            break  # only the first time window is processed
    elif learning_type == 'supervise':
        # supervised training: node-classification loss via an MLP classifier on top of the embeddings
        label_file = os.path.join('..', 'nodes_set/trans_label.csv')
        label_path = os.path.abspath(os.path.join(base_path, label_file))
        df_label = pd.read_csv(label_path, sep='\t')
        label_list = df_label['label'].values
        for idx in range(0, max_time_num, duration):
            print('idx = ', idx)
            adj_list = data_loader.get_date_adj_list(origin_base_path, start_idx=idx, duration=duration)
            x_list, max_degree, _ = data_loader.get_degree_feature_list(origin_base_path, start_idx=idx, duration=duration)
            evolvegcn_model = EvolveGCN(input_dim=max_degree, hidden_dim=128, output_dim=128,
                                        duration=duration, egcn_type='EGCNH')
            evolvegcn_loss = SupervisedLoss()
            evolvegcn_classifier = MLPClassifier(128, 64, label_list.max() + 1, layer_num=1,
                                                 duration=duration, bias=True, trans_version='L')
            evolvegcn = SupervisedEmbedding(base_path=base_path, origin_folder=origin_folder,
                                            embedding_folder=embedding_folder,
                                            node_list=nodes_set['node'].tolist(),
                                            model=evolvegcn_model, loss=evolvegcn_loss,
                                            classifier=evolvegcn_classifier, max_time_num=max_time_num)
            evolvegcn.learn_embedding(adj_list, x_list, label_list, epoch=50, batch_size=4096 * 8, lr=0.001,
                                      start_idx=idx, weight_decay=5e-4, model_file='evolvegcnh',
                                      classifier_file='evolvegcnh_cls', export=True)
    else:
        raise AttributeError('Unsupported learning type!')
    t2 = time.time()
    print('finish EvolveGCN embedding! cost time: ', t2 - t1, ' seconds!')
    return
def ctgcn_connective_embedding(dataset, learning_type='unsupervise'):
    """Train CTGCN-C (connective-proximity variant) embeddings on the k-core inputs of `dataset`."""
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../data/' + dataset + '/CTGCN'))
    origin_folder = os.path.join('..', '1.format')
    embedding_folder = os.path.join('..', '2.embedding/CTGCN_C')
    core_folder = 'ctgcn_cores'
    core_base_path = os.path.abspath(os.path.join(base_path, core_folder))
    node_file = os.path.join('..', 'nodes_set/nodes.csv')
    duration = 5
    max_time_num = len(os.listdir(core_base_path))
    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    node_num = nodes_set.shape[0]
    data_loader = DataLoader(nodes_set['node'].tolist(), max_time_num)
    print('max time num: ', max_time_num)
    t1 = time.time()
    print('start CTGCN_C embedding on ' + dataset)
    if learning_type == 'unsupervise':
        walk_pair_folder = 'ctgcn_walk_pairs'
        node_freq_folder = 'ctgcn_node_freq'
        walk_pair_base_path = os.path.abspath(os.path.join(base_path, walk_pair_folder))
        node_freq_base_path = os.path.abspath(os.path.join(base_path, node_freq_folder))
        for idx in range(0, max_time_num, duration):
            print('idx = ', idx)
            # the last window may be shorter than `duration`
            time_num = min(duration, max_time_num - idx)
            adj_list = data_loader.get_core_adj_list(core_base_path, start_idx=idx, duration=time_num)
            x_list = data_loader.get_feature_list(None, start_idx=idx, duration=time_num)
            node_pair_list = data_loader.get_node_pair_list(walk_pair_base_path, start_idx=idx, duration=time_num)
            neg_freq_list = data_loader.get_neg_freq_list(node_freq_base_path, start_idx=idx, duration=time_num)
            ctgcn_model = CTGCN(input_dim=node_num, hidden_dim=500, output_dim=128, trans_num=1, diffusion_num=2,
                                duration=time_num, bias=True, rnn_type='GRU', version='C', trans_version='L')
            ctgcn_loss = UnsupervisedLoss(neg_num=150, Q=10, node_pair_list=node_pair_list,
                                          neg_freq_list=neg_freq_list)
            ctgcn = UnsupervisedEmbedding(base_path=base_path, origin_folder=origin_folder,
                                          embedding_folder=embedding_folder,
                                          node_list=nodes_set['node'].tolist(),
                                          model=ctgcn_model, loss=ctgcn_loss, max_time_num=max_time_num)
            ctgcn.learn_embedding(adj_list, x_list, single_output=False, epoch=5, batch_size=4096 * 8, lr=0.001,
                                  start_idx=idx, weight_decay=5e-4, model_file='ctgcn_c',
                                  embedding_type='connection', export=True)
            break  # only the first time window is processed
    elif learning_type == 'supervise':
        label_file = os.path.join('..', 'nodes_set/trans_label.csv')
        label_path = os.path.abspath(os.path.join(base_path, label_file))
        df_label = pd.read_csv(label_path, sep='\t')
        label_list = df_label['label'].values
        for idx in range(0, max_time_num, duration):
            print('idx = ', idx)
            time_num = min(duration, max_time_num - idx)
            adj_list = data_loader.get_core_adj_list(core_base_path, start_idx=idx, duration=time_num)
            x_list = data_loader.get_feature_list(None, start_idx=idx, duration=time_num)
            ctgcn_model = CTGCN(input_dim=node_num, hidden_dim=500, output_dim=128, trans_num=1, diffusion_num=2,
                                duration=time_num, bias=True, rnn_type='GRU', version='C', trans_version='L')
            ctgcn_loss = SupervisedLoss()
            ctgcn_classifier = MLPClassifier(128, 64, label_list.max() + 1, layer_num=1,
                                             duration=time_num, bias=True, trans_version='L')
            ctgcn = SupervisedEmbedding(base_path=base_path, origin_folder=origin_folder,
                                        embedding_folder=embedding_folder,
                                        node_list=nodes_set['node'].tolist(),
                                        model=ctgcn_model, loss=ctgcn_loss,
                                        classifier=ctgcn_classifier, max_time_num=max_time_num)
            ctgcn.learn_embedding(adj_list, x_list, label_list, single_output=False, epoch=50, batch_size=4096 * 8,
                                  lr=0.001, start_idx=idx, weight_decay=5e-4, model_file='ctgcn_c',
                                  classifier_file='ctgcn_c_cls', embedding_type='connection', export=True)
    else:
        raise AttributeError('Unsupported learning type!')
    t2 = time.time()
    print('finish CTGCN_C embedding! cost time: ', t2 - t1, ' seconds!')
    return
def gcn_embedding(dataset, learning_type='unsupervise'):
    """Train a static GCN independently on each snapshot of `dataset` (duration=1) and export the embeddings."""
    base_path = os.path.abspath(os.path.join(os.getcwd(), '../data/' + dataset + '/CTGCN'))
    origin_folder = os.path.join('..', '1.format')
    origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    embedding_folder = os.path.join('..', '2.embedding/GCN')
    node_file = os.path.join('..', 'nodes_set/nodes.csv')
    duration = 1
    max_time_num = len(os.listdir(origin_base_path))
    node_path = os.path.abspath(os.path.join(base_path, node_file))
    nodes_set = pd.read_csv(node_path, names=['node'])
    node_num = nodes_set.shape[0]
    data_loader = DataLoader(nodes_set['node'].tolist(), max_time_num)
    t1 = time.time()
    print('start GCN embedding!')
    if learning_type == 'unsupervise':
        walk_pair_folder = 'gcn_walk_pairs'
        node_freq_folder = 'gcn_node_freq'
        walk_pair_base_path = os.path.abspath(os.path.join(base_path, walk_pair_folder))
        node_freq_base_path = os.path.abspath(os.path.join(base_path, node_freq_folder))
        # time_list = []
        for idx in range(max_time_num):
            print('idx = ', idx)
            adj_list = data_loader.get_date_adj_list(origin_base_path, start_idx=idx, duration=duration)
            x_list = data_loader.get_feature_list(None, start_idx=idx, duration=duration)
            node_pair_list = data_loader.get_node_pair_list(walk_pair_base_path, start_idx=idx, duration=duration)
            neg_freq_list = data_loader.get_neg_freq_list(node_freq_base_path, start_idx=idx, duration=duration)
            gcn_model = GCN(input_dim=node_num, hidden_dim=500, output_dim=128, dropout=0.5, bias=True)
            gcn_loss = UnsupervisedLoss(neg_num=20, Q=20, node_pair_list=node_pair_list,
                                        neg_freq_list=neg_freq_list)
            gcn = UnsupervisedEmbedding(base_path=base_path, origin_folder=origin_folder,
                                        embedding_folder=embedding_folder,
                                        node_list=nodes_set['node'].tolist(),
                                        model=gcn_model, loss=gcn_loss, max_time_num=max_time_num)
            # ta = time.time()
            gcn.learn_embedding(adj_list, x_list, epoch=5, batch_size=4096 * 8, lr=0.001, start_idx=idx,
                                weight_decay=5e-4, model_file='gcn', export=True)
            if idx == 1:
                break  # only the first two snapshots are processed
            # tb = time.time()
            # time_list.append(tb - ta)
        # df_output = pd.DataFrame({'time': time_list})
        # df_output.to_csv(dataset + '_gcn_time.csv', sep=',', index=False)
    elif learning_type == 'supervise':
        label_file = os.path.join('..', 'nodes_set/trans_label.csv')
        label_path = os.path.abspath(os.path.join(base_path, label_file))
        df_label = pd.read_csv(label_path, sep='\t')
        label_list = df_label['label'].values
        for idx in range(max_time_num):
            print('idx = ', idx)
            adj_list = data_loader.get_date_adj_list(origin_base_path, start_idx=idx, duration=duration)
            x_list = data_loader.get_feature_list(None, start_idx=idx, duration=duration)
            gcn_model = GCN(input_dim=node_num, hidden_dim=500, output_dim=128, dropout=0.5, bias=True)
            gcn_loss = SupervisedLoss()
            gcn_classifier = MLPClassifier(128, 64, label_list.max() + 1, layer_num=1,
                                           duration=duration, bias=True, trans_version='L')
            gcn = SupervisedEmbedding(base_path=base_path, origin_folder=origin_folder,
                                      embedding_folder=embedding_folder,
                                      node_list=nodes_set['node'].tolist(),
                                      model=gcn_model, loss=gcn_loss,
                                      classifier=gcn_classifier, max_time_num=max_time_num)
            gcn.learn_embedding(adj_list, x_list, label_list, epoch=50, batch_size=4096 * 8, lr=0.001,
                                start_idx=idx, weight_decay=5e-4, model_file='gcn',
                                classifier_file='gcn_cls', export=True)
    else:
        raise AttributeError('Unsupported learning type!')
    t2 = time.time()
    print('finish GCN embedding! cost time: ', t2 - t1, ' seconds!')
    return
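
# Minimal usage sketch (not part of the original script): it assumes the working directory sits
# next to a '../data/<dataset>/CTGCN' layout with the '1.format', 'nodes_set', walk-pair and
# node-frequency folders already generated. The dataset name 'example_dataset' is purely
# illustrative, not a folder shipped with the project.
if __name__ == '__main__':
    demo_dataset = 'example_dataset'  # hypothetical dataset folder name
    gcn_embedding(demo_dataset, learning_type='unsupervise')
    evolvegcn_embedding(demo_dataset, learning_type='unsupervise')
    ctgcn_connective_embedding(demo_dataset, learning_type='unsupervise')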