def prepare_data():
    """Load and preprocess data"""
    data = load_data(data_path)
    prep = Preprocessor(data)
    prep.preprocess_data()
    prep.shuffle_data()
    prep.encode_data()
    return prep.X, prep.Y
def _neigh():
    train_data, test_data = load_data('intermediate/User11.log', ratio=0.8)
    # train_neigh = gen_neigh_set(train_data)
    # save_neigh(train_neigh, 'intermediate/train_neigh.pickle')
    # test_neigh = gen_neigh_set(test_data)
    # save_neigh(test_neigh, 'intermediate/test_neigh.pickle')
    train_neigh = load_neigh('intermediate/train_neigh.pickle')
    test_neigh = load_neigh('intermediate/test_neigh.pickle')

    attack1_data, _ = load_data('intermediate/User11Attack1.log', ratio=1.0)
    attack1_neigh = gen_neigh_set(attack1_data)
    attack2_data, _ = load_data('intermediate/User11Attack2.log', ratio=1.0)
    attack2_neigh = gen_neigh_set(attack2_data)
    attack3_data, _ = load_data('intermediate/User11Attack3.log', ratio=1.0)
    attack3_neigh = gen_neigh_set(attack3_data)

    fig = go.Figure()
    y_true = []
    y_score = []

    # Benign test data scores against the training neighborhoods (label 1).
    _, scores = score(train_neigh, test_neigh)
    y_true += [1] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Test')

    # Each attack set scores against the training neighborhoods (label 0).
    _, scores = score(train_neigh, attack1_neigh)
    y_true += [0] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Attack1')

    _, scores = score(train_neigh, attack2_neigh)
    y_true += [0] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Attack2')

    _, scores = score(train_neigh, attack3_neigh)
    y_true += [0] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Attack3')

    plot.plot_roc(fig, y_true, y_score)
    fig.show()
def get_UCI_data_loader(args, b_size, num_workers):
    p = load_data(args.dataset)
    train_data = p.data
    test_data = p.test_data
    valid_data = p.valid_data

    train_set = PrepareUCIData(train_data)
    test_set = PrepareUCIData(test_data)
    valid_set = PrepareUCIData(valid_data)

    trainloader = torch.utils.data.DataLoader(train_set, batch_size=b_size,
                                              shuffle=True, num_workers=num_workers)
    testloader = torch.utils.data.DataLoader(test_set, batch_size=b_size,
                                             shuffle=True, num_workers=num_workers)
    validloader = torch.utils.data.DataLoader(valid_set, batch_size=b_size,
                                              shuffle=True, num_workers=num_workers)
    return trainloader, testloader, validloader
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

import config
from LSTM import LSTM
from utils import dataloader

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

DATA_DIR = config.DATA_DIR
train_X, train_Y, dev_X, dev_Y = dataloader.load_data(DATA_DIR)

batch_size = config.BATCH_SIZE
num_epochs = config.NUM_EPOCHS
initial_lr = config.LR
hidden_size = config.HIDDEN_SIZE
num_layers = config.NUM_LAYERS

# Define model
print("Build LSTM model ..")
model = LSTM(
    input_size=6,    # TODO : 6
    hidden_size=hidden_size,
    batch_size=batch_size,
    output_size=2,   # TODO : 2
    num_layers=num_layers
)
model.to(device)

loss_function = nn.NLLLoss()
val_acc = 0.0
        config_flat_mean_shift.yml
        config_flat_blurring_mean_shift.yml
        config_gaussian_mean_shift.yml
        config_gaussian_blurring_mean_shift.yml
        config_mod_shift.yml
        config_blurring_mod_shift.yml
        """)
args = parser.parse_args()

# load data config
with open(args.data, "r") as config_file:
    data_config = yaml.load(config_file, Loader=yaml.FullLoader)

# load data and gt
data = load_data(data_config)
gt = load_data(data_config, gt=True)

# load clustering config
with open(args.clustering, "r") as config_file:
    clustering_config = yaml.load(config_file, Loader=yaml.FullLoader)

# instantiate clustering
target_dir = os.path.join(data_config["root_path"], data_config["dataset"],
                          "results")
if data_config["dataset"] == "CREMI":
    target_dir = os.path.join(target_dir, data_config["set"])
clusterer = get_clusterer(clustering_config["method"], clustering_config,
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# torch.manual_seed(11785)

test_data_path = args.test_data_path
val_data_path = args.dev_data_path
val_label_path = args.dev_label_path

input_dim = 40
output_dim = 47

if args.mode == 'Test':
    print('Loading test data...')
    test_data = load_data(test_data_path)
    test_dataset = SpeechDataset(test_data)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_wrapper_test)
    test_kbar = pkbar.Kbar(int(len(test_data) / args.batch_size) + 1)
else:
    print('Loading validation data... ')
    val_data = load_data(val_data_path)
    val_label = load_label(val_label_path)
    val_dataset = SpeechDataset(val_data, val_label)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
for key, val in IO_session.items():
    print(key, '=', val)

file_type = IO_session.get('type', None)
if file_type:
    if file_type == 'csv':
        if is_train:  # training
            required_fields = [
                'train_file', 'text_column', 'label_column', 'batch_size'
            ]
            check_fields(required_fields, IO_session)
            text_column = IO_session['text_column']
            label_column = str2list(IO_session['label_column'])
            train_iter, test_iter, TEXT = load_data(file_type,
                                                    IO_session,
                                                    is_train=is_train)
            vocab = TEXT.vocab
            # save vocab
            output_dir = IO_session.get('output_dir', 'output')
            writer = SummaryWriter(output_dir)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            pickle.dump(
                vocab,
                open(os.path.join(output_dir, 'vocab.cache'), 'wb'))
        else:  # decoding
            required_fields = [
                'decode_file', 'text_column', 'vocab_file', 'batch_size'
def predict(target_log='intermediate/User11.log', load_epoch=1):
    target_path = target_log.split('.')[0]
    attack_log1 = target_path + 'Attack1.log'
    attack_log2 = target_path + 'Attack2.log'
    attack_log3 = target_path + 'Attack3.log'

    dictionary = FileIndexTransformer()
    dictionary.load('model/runs/dictionary.pkl')
    corpus_size = dictionary.size()

    print('Loading data...')
    _, _, test_pos_data, test_pos_time = load_data(target_log, ratio=0.8)
    test_neg_data1, test_neg_time1, _, _ = load_data(attack_log1, ratio=None)
    test_neg_data2, test_neg_time2, _, _ = load_data(attack_log2, ratio=None)
    test_neg_data3, test_neg_time3, _, _ = load_data(attack_log3, ratio=None)

    test_pos_data = dictionary.transform(test_pos_data)
    test_neg_data1 = dictionary.transform(test_neg_data1)
    test_neg_data2 = dictionary.transform(test_neg_data2)
    test_neg_data3 = dictionary.transform(test_neg_data3)

    test_data_gen = get_test_data(
        test_pos_data, test_pos_time,
        [test_neg_data1, test_neg_data2, test_neg_data3],
        [test_neg_time1, test_neg_time2, test_neg_time3])

    print('Loading model (epoch %d)...' % load_epoch)
    model = skip_gram(corpus_size, emb_dim)
    if load_epoch != 0:
        model.load_state_dict(
            torch.load('model/runs/path2vec_epoch%d.pt' % load_epoch))

    # 0 represents positive, 1 represents negative (attack).
    # A higher loss score means an attack (the model cannot fit history well).
    label_true = []
    label_pred = []
    mms = MinMaxScaler()
    for test_data, test_time, y in test_data_gen:
        avg_loss = 0.0
        step = 0
        for batch, label in get_batch(test_data, test_time, time_window,
                                      seq_window, batch_size, None):
            batch_neg = get_neg_data(test_data, 10, batch_size, batch)
            batch_input = torch.tensor(batch, dtype=torch.long)
            batch_label = torch.tensor(label, dtype=torch.long)
            batch_neg = torch.tensor(batch_neg, dtype=torch.long)
            loss = model(batch_input, batch_label, batch_neg)
            avg_loss += loss.item()
            step += 1
        avg_loss = avg_loss / step
        label_true.append(y)
        label_pred.append(avg_loss)
        print('Label: %d\t Loss: %f' % (y, avg_loss))

    # MinMaxScaler must be fitted and expects a 2-D input, so fit_transform
    # on a column vector and flatten back to a 1-D score list.
    label_pred = mms.fit_transform([[v] for v in label_pred]).ravel()
    roc_auc = roc_auc_score(label_true, label_pred)
    print('AUC score: %f\n' % roc_auc)

    fpr, tpr, thresholds = roc_curve(label_true, label_pred)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def _path2vec(target_log='intermediate/User11.log', load_epoch=0):
    print('Loading data...')
    train_data, train_time, _, _ = load_data(target_log, ratio=0.8)

    # Hold out the last 20% of the training split for validation.
    train_data_len = int(len(train_data) * 0.8)
    train_data, valid_data = (train_data[:train_data_len],
                              train_data[train_data_len:])
    train_time, valid_time = (train_time[:train_data_len],
                              train_time[train_data_len:])
    print('Length of train data: %d' % len(train_data))
    print('Length of valid data: %d' % len(valid_data))

    # file to idx
    dictionary = FileDictionary()
    dictionary.fit(train_data)
    dictionary.save('model/runs/dictionary.pkl')
    corpus_size = dictionary.size()
    print('Number of unique files: %d' % corpus_size)
    train_data = dictionary.transform(train_data)
    valid_data = dictionary.transform(valid_data)

    model = skip_gram(corpus_size, emb_dim)
    optim = SGD(model.parameters(), lr=learning_rate)
    if load_epoch != 0:
        model.load_state_dict(
            torch.load('model/runs/path2vec_epoch%d.pt' % load_epoch))

    writer = SummaryWriter('model/runs/path2vec')
    step = 0
    for epo in range(num_epochs):
        avg_loss = 0
        start_time = time.time()
        for batch, label in get_batch(train_data, train_time, time_window,
                                      seq_window, batch_size,
                                      print_step=log_step):
            batch_neg = get_neg_data(train_data, 10, batch_size, batch)
            batch_input = torch.tensor(batch, dtype=torch.long)
            batch_label = torch.tensor(label, dtype=torch.long)
            batch_neg = torch.tensor(batch_neg, dtype=torch.long)
            loss = model(batch_input, batch_label, batch_neg)
            optim.zero_grad()
            loss.backward()
            optim.step()

            step += 1
            avg_loss += loss.item()
            if step % log_step == 0:
                print('Average loss at step %d: %f' %
                      (step, avg_loss / log_step))
                writer.add_scalar('training loss', avg_loss / log_step, step)
                avg_loss = 0
            if step % valid_step == 0:
                valid_model(model, valid_data, valid_time, step, writer=writer)
        print('epoch %d time cost: %d s' % (epo, time.time() - start_time))
        start_time = time.time()

    torch.save(model.state_dict(),
               'model/runs/path2vec_epoch%d.pt' % (num_epochs + load_epoch))
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
torch.manual_seed(11785)

train_data_path = args.train_data_path
train_label_path = args.train_label_path
val_data_path = args.dev_data_path
val_label_path = args.dev_label_path

input_dim = 40
output_dim = 47

print('Loading training data... ')
train_data = load_data(train_data_path)
train_label = load_label(train_label_path)
train_dataset = SpeechDataset(train_data, train_label)
train_loader = DataLoader(train_dataset,
                          batch_size=args.batch_size,
                          shuffle=True,
                          collate_fn=collate_wrapper)
# import pdb; pdb.set_trace()
train_kbar = pkbar.Kbar(int(len(train_data) / args.batch_size) + 1)

print('Loading validation data... ')
val_data = load_data(val_data_path)
val_label = load_label(val_label_path)
val_dataset = SpeechDataset(val_data, val_label)
val_loader = DataLoader(val_dataset,
                        batch_size=args.batch_size,