def test_():
    print("Computing feature maps...")
    Q, subgraphs, labels, shapes = compute_nystrom(use_node_labels, dim,
                                                   community_detection, kernels)
    M = np.zeros((shapes[0], shapes[1], len(kernels)))
    for idx, k in enumerate(kernels):
        M[:, :, idx] = Q[idx]
    Q = M

    # Binarize labels
    le = LabelEncoder()
    y = le.fit_transform(labels)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in subgraphs])
    x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32)
    for i in range(len(subgraphs)):
        communities = subgraphs[i].split()
        for j in range(len(communities)):
            x[i, j] = int(communities[j])

    # Split the samples into regular / generated / malicious blocks
    reg = x[0:2500]
    gen = x[2500:5000]
    mal = x[5000:]
    reg_label = y[:2500]
    gen_label = y[2500:5000]
    mal_label = y[5000:]

    train_reg = reg[0:1500]
    test_reg = reg[1500:]
    train_reg_y = reg_label[0:1500]
    test_reg_y = reg_label[1500:]

    train_mal = mal[0:1500]
    test_mal = mal[1500:]
    train_mal_y = mal_label[0:1500]
    test_mal_y = mal_label[1500:]

    train_gen = gen[0:1500]
    train_gen_y = gen_label[0:1500]

    # "fake" training set: regular + generated; "real": regular + malicious
    train_fake = np.concatenate((train_reg, train_gen), axis=0)
    y_train_fake = np.concatenate((train_reg_y, train_gen_y), axis=0)
    train_real = np.concatenate((train_reg, train_mal), axis=0)
    y_train_real = np.concatenate((train_reg_y, train_mal_y), axis=0)

    # Both models are evaluated on the same regular + malicious test set
    test = np.concatenate((test_reg, test_mal), axis=0)
    y_test = np.concatenate((test_reg_y, test_mal_y), axis=0)

    def train_test(Q, x_train, x_test, y_train, y_test, batch_size):
        train_loader, test_loader = create_train_test_loaders(
            Q, x_train, x_test, y_train, y_test, batch_size)
        cnn = CNN(input_size=num_filters,
                  hidden_size=hidden_size,
                  num_classes=np.unique(y).size,
                  dim=dim,
                  num_kernels=num_kernels,
                  max_document_length=max_document_length)
        if torch.cuda.is_available():
            cnn.cuda()
            criterion = nn.CrossEntropyLoss().cuda()
        else:
            criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

        # Train the model
        for epoch in range(num_epochs):
            for i, (graphs, labels) in enumerate(train_loader):
                graphs = Variable(graphs)
                labels = Variable(labels)
                if torch.cuda.is_available():
                    labels = labels.cuda()
                optimizer.zero_grad()
                outputs = cnn(graphs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Test the model
        cnn.eval()
        correct = 0
        total = 0
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        predict = []
        label = []
        output = []
        for graphs, labels in test_loader:
            graphs = Variable(graphs)
            if torch.cuda.is_available():
                labels = labels.cuda()
            outputs = cnn(graphs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()
            # Confusion-matrix counts via arithmetic on 0/1 tensors
            TP += (predicted + labels == 2).sum()
            FP += (predicted * 5 + labels == 5).sum()
            FN += (predicted + labels * 5 == 5).sum()
            TN += (predicted + labels == 0).sum()
            predict.append(predicted)
            label.append(labels)
            output.append(outputs.data)

        precision = 0 if TP + FP == 0 else TP / (TP + FP)
        recall = 0 if TP + FN == 0 else TP / (TP + FN)

        # Flatten per-batch labels and positive-class scores
        # (assumes a test-time batch size of 1)
        l = np.zeros((len(label)))
        for i in range(len(label)):
            l[i] = int(label[i])
        s = np.zeros((len(output)))
        for i in range(len(output)):
            s[i] = output[i][0][1]
        return TP, TN, FP, FN, precision, recall, l, s

    TP_fake, TN_fake, FP_fake, FN_fake, precision_fake, recall_fake, l_fake, s_fake = train_test(
        Q, train_fake, test, y_train_fake, y_test, batch_size)
    TP_real, TN_real, FP_real, FN_real, precision_real, recall_real, l_real, s_real = train_test(
        Q, train_real, test, y_train_real, y_test, batch_size)
    return (TP_fake, TN_fake, FP_fake, FN_fake, precision_fake, recall_fake,
            l_fake, s_fake, TP_real, TN_real, FP_real, FN_real, precision_real,
            recall_real, l_real, s_real)
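
# --- Sketch: how the confusion-matrix counts in test_() work ---------------
# The TP/FP/FN/TN lines count outcomes with arithmetic on 0/1 tensors rather
# than boolean masks. A minimal verification with toy tensors (the values
# below are illustrative, not taken from the real data):
def _confusion_count_demo():
    import torch
    predicted = torch.tensor([1, 0, 1, 0, 1])  # toy 0/1 predictions
    labels = torch.tensor([1, 0, 0, 1, 1])     # toy 0/1 ground truth
    tp = (predicted + labels == 2).sum()       # 1 + 1 == 2 only for true positives
    fp = (predicted * 5 + labels == 5).sum()   # 5*1 + 0 == 5 only for false positives
    fn = (predicted + labels * 5 == 5).sum()   # 0 + 5*1 == 5 only for false negatives
    tn = (predicted + labels == 0).sum()       # 0 + 0 == 0 only for true negatives
    print(tp.item(), fp.item(), fn.item(), tn.item())  # -> 2 1 1 1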
def main():
    global args
    args = parser.parse_args()

    # Check if CUDA is enabled
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    unlabeled_datasets = [
        "IMDB-BINARY", "IMDB-MULTI", "REDDIT-BINARY", "REDDIT-MULTI-5K",
        "COLLAB", "SYNTHETIC", "raw-gitgraph"
    ]
    if args.dataset in unlabeled_datasets:
        use_node_labels = False
        from graph_kernels import sp_kernel, wl_kernel
    else:
        use_node_labels = True
        from graph_kernels_labeled import sp_kernel, wl_kernel

    kernels = [wl_kernel]
    n_kernels = len(kernels)

    print('Computing graph maps')
    Q, subgraphs, labels, shapes = compute_nystrom(args.dataset,
                                                   use_node_labels, args.d,
                                                   args.community_detection,
                                                   kernels)
    M = np.zeros((shapes[0], shapes[1], n_kernels))
    for idx, k in enumerate(kernels):
        M[:, :, idx] = Q[idx]
    Q = M

    # Binarize labels
    le = LabelEncoder()
    y = le.fit_transform(labels)

    # Build vocabulary
    max_n_communities = max([len(x.split(" ")) for x in subgraphs])
    x = np.zeros((len(subgraphs), max_n_communities), dtype=np.int32)
    for i in range(len(subgraphs)):
        communities = subgraphs[i].split()
        for j in range(len(communities)):
            x[i, j] = int(communities[j])
    print(x[0, :])

    kf = StratifiedKFold(n_splits=10, shuffle=True)
    accs = []
    it = 0

    print('Starting cross-validation')
    for train_index, test_index in kf.split(x, y):
        it += 1
        best_acc1 = 0
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                          test_size=0.1)
        train_loader, val_loader, test_loader = create_train_val_test_loaders(
            Q, x_train, x_val, x_test, y_train, y_val, y_test, args.batch_size)

        print('\tCreate model')
        model = CNN(input_size=args.n_filters,
                    hidden_size=args.hidden_size,
                    n_classes=np.unique(y).size,
                    d=args.d,
                    n_kernels=n_kernels,
                    max_n_communities=max_n_communities)

        print('Optimizer')
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.CrossEntropyLoss()
        evaluation = lambda output, target: torch.sum(
            output.eq(target)) / target.size()[0]

        # Linearly decay the learning rate between the two schedule milestones
        lr = args.lr
        lr_step = (args.lr - args.lr * args.lr_decay) / (
            args.epochs * args.schedule[1] - args.epochs * args.schedule[0])

        if os.path.isdir(args.checkpoint_dir):
            shutil.rmtree(args.checkpoint_dir)
        os.makedirs(args.checkpoint_dir)

        print('Check cuda')
        if args.cuda:
            print('\t* Cuda')
            model = model.cuda()
            criterion = criterion.cuda()

        # Epoch for loop
        for epoch in range(0, args.epochs):
            if args.epochs * args.schedule[0] < epoch < args.epochs * args.schedule[1]:
                lr -= lr_step
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, evaluation)

            # evaluate on validation set
            acc1 = validate(val_loader, model, criterion, evaluation)
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best=is_best,
                directory=args.checkpoint_dir)

        # get the best checkpoint and test it with the test set
        best_model_file = os.path.join(args.checkpoint_dir, 'model_best.pth')
        if not os.path.isdir(args.checkpoint_dir):
            os.makedirs(args.checkpoint_dir)
        if os.path.isfile(best_model_file):
            print("=> loading best model '{}'".format(best_model_file))
            checkpoint = torch.load(best_model_file)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            model.load_state_dict(checkpoint['state_dict'])
            if args.cuda:
                model.cuda()
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded best model '{}' (epoch {})".format(
                best_model_file, checkpoint['epoch']))
        else:
            print("=> no best model found at '{}'".format(best_model_file))

        # For testing
        acc = validate(test_loader, model, criterion, evaluation)
        print("Accuracy at iteration " + str(it) + ": " + str(acc))
        accs.append(acc)

    print("Average accuracy: ", np.mean(accs))
    print("std: ", np.std(accs))
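
# --- Sketch: the linear learning-rate decay used in main() -----------------
# Between the two fractional milestones in args.schedule, main() subtracts a
# fixed lr_step each epoch so lr falls roughly from args.lr down to
# args.lr * args.lr_decay. A standalone sketch; epochs, schedule, lr, and
# lr_decay below are illustrative assumptions, not repo defaults:
def _lr_schedule_demo():
    epochs, schedule = 100, (0.25, 0.75)
    lr, lr_decay = 1e-3, 0.1  # decay from 1e-3 toward 1e-4
    lr_step = (lr - lr * lr_decay) / (
        epochs * schedule[1] - epochs * schedule[0])
    current_lr = lr
    for epoch in range(epochs):
        if epochs * schedule[0] < epoch < epochs * schedule[1]:
            current_lr -= lr_step
    # Ends near lr * lr_decay (one step short, since both bounds are exclusive)
    print(current_lr)  # ~1.18e-4 for these values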
# The dataset/label conditional is commented out here and the unlabeled
# branch is hard-coded:
# if data_file in unlabeled_data_files:
use_node_labels = False
from graph_kernels import sp_kernel, wl_kernel
# else:
#     use_node_labels = True
#     from graph_kernels_labeled import sp_kernel, wl_kernel

# Choose kernels
kernels = [wl_kernel]
num_kernels = len(kernels)

ds_name = sys.argv[1]
pct_data = float(sys.argv[2])
assert -.01 < pct_data < 1.01
seed = 42

print("Computing feature maps...")
Q, subgraphs, labels, shapes = compute_nystrom(ds_name, pct_data,
                                               use_node_labels, dim,
                                               community_detection, kernels,
                                               seed)
print("Finished feature maps")
M = np.zeros((shapes[0], shapes[1], len(kernels)))
for idx, k in enumerate(kernels):
    M[:, :, idx] = Q[idx]
Q = M

# Binarize labels
le = LabelEncoder()
y = le.fit_transform(labels)

print("Building vocabulary")
# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in subgraphs])
x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32)
for i in range(len(subgraphs)):
    communities = subgraphs[i].split()
    for j in range(len(communities)):
        x[i, j] = int(communities[j])
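
# --- Sketch: the "build vocabulary" step above ------------------------------
# Each subgraph is a space-separated string of community ids; the loop pads
# them into a fixed-width int32 matrix (shorter documents are zero-padded).
# Toy values, not real data:
def _vocabulary_demo():
    import numpy as np
    subgraphs = ["3 1 4", "1 5", "9 2 6 5"]
    max_document_length = max(len(s.split(" ")) for s in subgraphs)
    x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32)
    for i, s in enumerate(subgraphs):
        for j, c in enumerate(s.split()):
            x[i, j] = int(c)
    print(x)
    # [[3 1 4 0]
    #  [1 5 0 0]
    #  [9 2 6 5]]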
"IMDB-BINARY", "IMDB-MULTI", "REDDIT-BINARY", "REDDIT-MULTI-5K", "COLLAB", "SYNTHETIC" ] if data_file in unlabeled_data_files: use_node_labels = False from graph_kernels import sp_kernel, wl_kernel else: use_node_labels = True from graph_kernels_labeled import sp_kernel, wl_kernel # Choose kernels kernels = [wl_kernel] num_kernels = len(kernels) print("Computing feature maps...") Q, subgraphs, labels, shapes = compute_nystrom(data_file, use_node_labels, dim, community_detection, kernels) M = np.zeros((shapes[0], shapes[1], len(kernels))) for idx, k in enumerate(kernels): M[:, :, idx] = Q[idx] Q = M # Binarize labels le = LabelEncoder() y = le.fit_transform(labels) # Build vocabulary max_document_length = max([len(x.split(" ")) for x in subgraphs]) x = np.zeros((len(subgraphs), max_document_length), dtype=np.int32) for i in range(len(subgraphs)):