Example #1
def prepare_data(data_path):
    """Load and preprocess data, returning feature matrix X and labels Y."""
    data = load_data(data_path)

    prep = Preprocessor(data)

    prep.preprocess_data()
    prep.shuffle_data()
    prep.encode_data()

    return prep.X, prep.Y
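A quick usage sketch for context: prepare_data (with the data_path parameter added above) returns the full feature matrix and labels, which a caller might then split. load_data and Preprocessor are assumed from the surrounding module; the path and split below are illustrative only.

# Hypothetical caller of prepare_data; path and split are illustrative.
from sklearn.model_selection import train_test_split

X, Y = prepare_data('data/dataset.csv')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)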
Example #2
def _neigh():
    train_data, test_data = load_data('intermediate/User11.log', ratio=0.8)
    # train_neigh = gen_neigh_set(train_data)
    # save_neigh(train_neigh, 'intermediate/train_neigh.pickle')
    # test_neigh = gen_neigh_set(test_data)
    # save_neigh(test_neigh, 'intermediate/test_neigh.pickle')
    train_neigh = load_neigh('intermediate/train_neigh.pickle')
    test_neigh = load_neigh('intermediate/test_neigh.pickle')

    attack1_data, _ = load_data('intermediate/User11Attack1.log', ratio=1.0)
    attack1_neigh = gen_neigh_set(attack1_data)
    attack2_data, _ = load_data('intermediate/User11Attack2.log', ratio=1.0)
    attack2_neigh = gen_neigh_set(attack2_data)
    attack3_data, _ = load_data('intermediate/User11Attack3.log', ratio=1.0)
    attack3_neigh = gen_neigh_set(attack3_data)

    fig = go.Figure()
    y_true = []
    y_score = []
    _, scores = score(train_neigh, test_neigh)
    y_true += [1] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Test')
    _, scores = score(train_neigh, attack1_neigh)
    y_true += [0] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Attack1')
    _, scores = score(train_neigh, attack2_neigh)
    y_true += [0] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Attack2')
    _, scores = score(train_neigh, attack3_neigh)
    y_true += [0] * len(scores)
    y_score += scores
    # plot.plot_score(fig, scores, name='Attack3')
    plot.plot_roc(fig, y_true, y_score)
    fig.show()
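plot.plot_roc is project-specific; as a sanity check, the same y_true/y_score pair can be fed to scikit-learn. This sketch assumes higher scores mean "more like the training user" (benign = 1); if the score function returns the opposite, negate the scores first.

# Hypothetical cross-check of the ROC computed above with scikit-learn.
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y_true, y_score)  # negate y_score if higher = more anomalous
print('AUC: %.4f' % auc)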
Example #3
def get_UCI_data_loader(args, b_size, num_workers):
    p = load_data(args.dataset)

    train_data = p.data
    test_data = p.test_data
    valid_data = p.valid_data
    train_set = PrepareUCIData(train_data)
    test_set = PrepareUCIData(test_data)
    valid_set = PrepareUCIData(valid_data)

    trainloader = torch.utils.data.DataLoader(train_set,
                                              batch_size=b_size,
                                              shuffle=True,
                                              num_workers=num_workers)
    # Evaluation loaders need no shuffling; metrics are order-invariant.
    testloader = torch.utils.data.DataLoader(test_set,
                                             batch_size=b_size,
                                             shuffle=False,
                                             num_workers=num_workers)
    validloader = torch.utils.data.DataLoader(valid_set,
                                              batch_size=b_size,
                                              shuffle=False,
                                              num_workers=num_workers)

    return trainloader, testloader, validloader
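A minimal sketch of consuming the returned loaders, assuming PrepareUCIData yields (inputs, targets) pairs; args, the batch size, and worker count below are illustrative.

# Hypothetical consumption of the loaders built above.
trainloader, testloader, validloader = get_UCI_data_loader(args,
                                                           b_size=64,
                                                           num_workers=2)
for inputs, targets in trainloader:
    print(inputs.shape, targets.shape)  # inspect one training batch
    break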
Example #4
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import config
from LSTM import LSTM
from utils import dataloader


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DATA_DIR = config.DATA_DIR
train_X, train_Y, dev_X, dev_Y = dataloader.load_data(DATA_DIR)

batch_size = config.BATCH_SIZE
num_epochs = config.NUM_EPOCHS
initial_lr = config.LR
hidden_size = config.HIDDEN_SIZE
num_layers = config.NUM_LAYERS

# Define model
print("Building LSTM model...")
model = LSTM(
    input_size=6,   # TODO: confirm the input feature count
    hidden_size=hidden_size,
    batch_size=batch_size,
    output_size=2,  # TODO: confirm the number of output classes
    num_layers=num_layers
)
model.to(device)
loss_function = nn.NLLLoss()
val_acc = 0.0
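The example stops before the optimizer and training loop. Below is a minimal sketch of how it could continue, assuming the LSTM's forward pass returns log-probabilities (which NLLLoss requires) and that train_X/train_Y are tensors; the Adam optimizer and manual batching are illustrative, not from the original.

# Hypothetical continuation: optimizer plus a basic training loop.
optimizer = optim.Adam(model.parameters(), lr=initial_lr)

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for i in range(0, len(train_X), batch_size):
        batch_X = train_X[i:i + batch_size].to(device)
        batch_Y = train_Y[i:i + batch_size].to(device)

        optimizer.zero_grad()
        log_probs = model(batch_X)           # assumed shape: (batch, 2)
        loss = loss_function(log_probs, batch_Y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print('Epoch %d, loss %.4f' % (epoch, epoch_loss))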
Example #5
                            config_flat_mean_shift.yml
                            config_flat_blurring_mean_shift.yml
                            config_gaussian_mean_shift.yml
                            config_gaussian_blurring_mean_shift.yml
                            config_mod_shift.yml
                            config_blurring_mod_shift.yml
                            """
                         )

args = parser.parse_args()
# load data config
with open(args.data, "r") as config_file:
    data_config = yaml.load(config_file, Loader=yaml.FullLoader)

# load data and ground truth
data = load_data(data_config)
gt = load_data(data_config, gt=True)

# load clustering config
with open(args.clustering, "r") as config_file:
    clustering_config = yaml.load(config_file, Loader=yaml.FullLoader)



# instantiate clustering
target_dir = os.path.join(data_config["root_path"], data_config["dataset"], "results")
if data_config["dataset"] == "CREMI":
    target_dir = os.path.join(target_dir, data_config["set"])

clusterer = get_clusterer(clustering_config["method"],
                          clustering_config,
Example #6
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    # torch.manual_seed(11785)

    test_data_path = args.test_data_path
    val_data_path = args.dev_data_path
    val_label_path = args.dev_label_path
    input_dim = 40
    output_dim = 47

    if args.mode == 'Test':
        print('Loading test data...')
        test_data = load_data(test_data_path)
        test_dataset = SpeechDataset(test_data)
        test_loader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 collate_fn=collate_wrapper_test)
        test_kbar = pkbar.Kbar(int(len(test_data) / args.batch_size) + 1)

    else:
        print('Loading validation data... ')
        val_data = load_data(val_data_path)
        val_label = load_label(val_label_path)
        val_dataset = SpeechDataset(val_data, val_label)
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
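collate_wrapper_test is referenced but not shown; with 40-dimensional speech frames (input_dim = 40) it would typically pad variable-length utterances into a batch. A hypothetical sketch, with all shapes assumed:

# Hypothetical stand-in for collate_wrapper_test: pads variable-length
# utterances and keeps their true lengths.
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_wrapper_test(batch):
    # batch: list of float tensors, each of shape [T_i, 40]
    lengths = torch.tensor([x.shape[0] for x in batch])
    padded = pad_sequence(batch, batch_first=True)  # [B, T_max, 40]
    return padded, lengths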
Example #7
for key, val in IO_session.items():
    print(key, '=', val)
file_type = IO_session.get('type', None)
if file_type:
    if file_type == 'csv':
        if is_train:
            # training
            required_fields = [
                'train_file', 'text_column', 'label_column',
                'batch_size'
            ]
            check_fields(required_fields, IO_session)
            text_column = IO_session['text_column']
            label_column = str2list(IO_session['label_column'])
            train_iter, test_iter, TEXT = load_data(file_type,
                                                    IO_session,
                                                    is_train=is_train)
            vocab = TEXT.vocab
            # save vocab before training starts
            output_dir = IO_session.get('output_dir', 'output')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            writer = SummaryWriter(output_dir)
            with open(os.path.join(output_dir, 'vocab.cache'), 'wb') as f:
                pickle.dump(vocab, f)
        else:
            # decoding
            required_fields = [
                'decode_file', 'text_column', 'vocab_file',
                'batch_size'
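For reference, a hypothetical IO_session for the CSV training branch above; only the keys come from required_fields, every value is illustrative.

IO_session = {
    'type': 'csv',
    'train_file': 'data/train.csv',
    'text_column': 'text',
    'label_column': 'label',
    'batch_size': 32,
    'output_dir': 'output',
}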
Example #8
def predict(target_log='intermediate/User11.log', load_epoch=1):
    # splitext is safer than split('.') for paths containing extra dots
    target_path = os.path.splitext(target_log)[0]
    attack_log1 = target_path + 'Attack1.log'
    attack_log2 = target_path + 'Attack2.log'
    attack_log3 = target_path + 'Attack3.log'

    dictionary = FileIndexTransformer()
    dictionary.load('model/runs/dictionary.pkl')
    corpus_size = dictionary.size()

    print('Loading data...')
    _, _, test_pos_data, test_pos_time = load_data(target_log, ratio=0.8)
    test_neg_data1, test_neg_time1, _, _ = load_data(attack_log1, ratio=None)
    test_neg_data2, test_neg_time2, _, _ = load_data(attack_log2, ratio=None)
    test_neg_data3, test_neg_time3, _, _ = load_data(attack_log3, ratio=None)
    test_pos_data = dictionary.transform(test_pos_data)
    test_neg_data1 = dictionary.transform(test_neg_data1)
    test_neg_data2 = dictionary.transform(test_neg_data2)
    test_neg_data3 = dictionary.transform(test_neg_data3)
    test_data_gen = get_test_data(
        test_pos_data, test_pos_time,
        [test_neg_data1, test_neg_data2, test_neg_data3],
        [test_neg_time1, test_neg_time2, test_neg_time3])

    print('Loading model (epoch %d)...' % load_epoch)
    model = skip_gram(corpus_size, emb_dim)
    if load_epoch != 0:
        model.load_state_dict(
            torch.load('model/runs/path2vec_epoch%d.pt' % load_epoch))

    # 0 represents positive, 1 represents negative (attack);
    # a higher loss score indicates an attack (the model cannot fit the history well)
    label_true = []
    label_pred = []
    mms = MinMaxScaler()
    for test_data, test_time, y in test_data_gen:
        avg_loss = 0.0
        step = 0
        for batch, label in get_batch(test_data, test_time, time_window,
                                      seq_window, batch_size, None):
            batch_neg = get_neg_data(test_data, 10, batch_size, batch)

            batch_input = torch.tensor(batch, dtype=torch.long)
            batch_label = torch.tensor(label, dtype=torch.long)
            batch_neg = torch.tensor(batch_neg, dtype=torch.long)

            loss = model(batch_input, batch_label, batch_neg)
            avg_loss += loss.item()
            step += 1
        avg_loss = avg_loss / step
        label_true.append(y)
        label_pred.append(avg_loss)
        print('Label: %d\t Loss: %f' % (y, avg_loss))
    # MinMaxScaler must be fitted and expects a 2-D array; flatten afterwards
    label_pred = mms.fit_transform([[p] for p in label_pred]).ravel()
    roc_auc = roc_auc_score(label_true, label_pred)
    print('AUC score: %f\n' % roc_auc)
    fpr, tpr, thresholds = roc_curve(label_true, label_pred)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
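One detail worth noting: MinMaxScaler must be fitted before it can transform, and it expects a 2-D array, hence the fit_transform over single-element rows above. A standalone illustration:

# MinMaxScaler needs fitting and 2-D input; wrap scores as column rows.
from sklearn.preprocessing import MinMaxScaler

scores = [0.7, 1.3, 5.2]
scaled = MinMaxScaler().fit_transform([[s] for s in scores]).ravel()
print(scaled)  # [0.         0.13333333 1.        ]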
Example #9
def _path2vec(target_log='intermediate/User11.log', load_epoch=0):
    print('Loading data...')
    train_data, train_time, _, _ = load_data(target_log, ratio=0.8)
    train_data_len = int(len(train_data) * 0.8)
    train_data, valid_data = (train_data[:train_data_len],
                              train_data[train_data_len:])
    train_time, valid_time = (train_time[:train_data_len],
                              train_time[train_data_len:])
    print('Length of train data: %d' % len(train_data))
    print('Length of valid data: %d' % len(valid_data))

    # file to idx
    dictionary = FileDictionary()
    dictionary.fit(train_data)
    dictionary.save('model/runs/dictionary.pkl')
    corpus_size = dictionary.size()
    print('Number of unique files: %d' % corpus_size)
    train_data = dictionary.transform(train_data)
    valid_data = dictionary.transform(valid_data)

    model = skip_gram(corpus_size, emb_dim)
    optim = SGD(model.parameters(), lr=learning_rate)
    if load_epoch != 0:
        model.load_state_dict(
            torch.load('model/runs/path2vec_epoch%d.pt' % load_epoch))

    writer = SummaryWriter('model/runs/path2vec')

    step = 0
    for epo in range(num_epochs):
        avg_loss = 0
        start_time = time.time()
        for batch, label in get_batch(train_data,
                                      train_time,
                                      time_window,
                                      seq_window,
                                      batch_size,
                                      print_step=log_step):
            batch_neg = get_neg_data(train_data, 10, batch_size, batch)

            batch_input = torch.tensor(batch, dtype=torch.long)
            batch_label = torch.tensor(label, dtype=torch.long)
            batch_neg = torch.tensor(batch_neg, dtype=torch.long)

            loss = model(batch_input, batch_label, batch_neg)
            optim.zero_grad()
            loss.backward()
            optim.step()

            step += 1
            avg_loss += loss.item()
            if step % log_step == 0:
                print('Average loss at step %d: %f' %
                      (step, avg_loss / log_step))
                writer.add_scalar('training loss', avg_loss / log_step, step)
                avg_loss = 0
            if step % valid_step == 0:
                valid_model(model, valid_data, valid_time, step, writer=writer)
        print('epoch %d time cost: %d s' % (epo, time.time() - start_time))
        start_time = time.time()

    torch.save(model.state_dict(),
               'model/runs/path2vec_epoch%d.pt' % (num_epochs + load_epoch))
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    torch.manual_seed(11785)

    train_data_path = args.train_data_path
    train_label_path = args.train_label_path
    val_data_path = args.dev_data_path
    val_label_path = args.dev_label_path
    input_dim = 40
    output_dim = 47

    print('Loading training data... ')
    train_data = load_data(train_data_path)
    train_label = load_label(train_label_path)
    train_dataset = SpeechDataset(train_data, train_label)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_wrapper)
    # import pdb; pdb.set_trace()
    train_kbar = pkbar.Kbar(int(len(train_data) / args.batch_size) + 1)

    print('Loading validation data... ')
    val_data = load_data(val_data_path)
    val_label = load_label(val_label_path)
    val_dataset = SpeechDataset(val_data, val_label)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,