Example #1
def test_loading():
    loader = Data_Loader('./transfer_data.npy', 0.7)
    train_X, train_Y, test_X, test_Y = loader.get_data()
    assert train_X.shape == (int(1000000 * 0.7), 81)
    assert train_Y.shape == (int(1000000 * 0.7), 81)
    assert test_X.shape == (1000000 - int(1000000 * 0.7), 81)
    assert test_Y.shape == (1000000 - int(1000000 * 0.7), 81)
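Data_Loader itself is not shown on this page; here is a minimal sketch of the interface that Example #1 (and Example #6 below) assume. The (N, 162) file layout, with X and Y stored side by side, is a guess inferred from the asserted shapes.

import numpy as np

class Data_Loader:
    def __init__(self, file_path, train_ratio):
        self.data = np.load(file_path)                  # assumed layout: (N, 162) = X | Y
        self.split = int(self.data.shape[0] * train_ratio)

    def get_data(self):
        X, Y = self.data[:, :81], self.data[:, 81:]     # 81-dim inputs and targets
        s = self.split
        return X[:s], Y[:s], X[s:], Y[s:]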
Example #2
def get_data(file_path, num_words=None, maxlen=150, filter_json=None):
    data_loader = Data_Loader()
    paras, labels = data_loader.get_para_label(file_path, filter_json)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(paras)
    print('tokenizer fitted successfully!')
    sequences = tokenizer.texts_to_sequences(paras)
    print('sequences converted successfully!')
    padded_sequences = pad_sequences(sequences, maxlen=maxlen)
    print('sequences padded successfully!')
    return np.array(padded_sequences), np.array(labels), tokenizer.word_index, tokenizer.word_counts
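A hedged usage sketch of get_data; the corpus path and vocabulary size below are illustrative, not from the source:

X, y, word_index, word_counts = get_data('./corpus.json', num_words=20000)  # hypothetical path/vocab
print(X.shape, len(word_index))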
Example #3
    def test_route_leak(self):
        '''
        description: apply the model trained on the weak dataset directly to predict on the well-known dataset
        :return: csv file
        '''
        global SA_LSTM_flag
        loader = Data_Loader(Event_name='route_leak', Event_num=2)
        self.INPUT_SIZE = loader.INPUT_SIZE
        x, y0 = loader.loadroute_leak()
        INPUT_SIZE = x.shape[1]
        true_pred = pd.DataFrame()
        Event_list = [
            'prefix_hijack', 'route_leak', 'breakout', 'edge', 'defcon'
        ]
        print(INPUT_SIZE)
        import pickle
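        # Run the route_leak data through each event's own scaler and
        # pretrained model, collecting one prediction column per event.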
        for Event_name in Event_list:
            scaler = pickle.load(
                open('../params/' + Event_name + '_scaler.pkl', 'rb'))
            x0 = scaler.transform(x.values)
            test_x, test_y = loader.to_timestep(x=x0,
                                                y=y0.values,
                                                event_len=1440)

            test_x = torch.tensor(test_x, dtype=torch.float32)
            test_y = torch.tensor(np.array(test_y))
            Path = '../params/best_lstm_params_' + Event_name + '.pkl'
            if SA_LSTM_flag:
                model = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                                INPUT_SIZE=self.INPUT_SIZE,
                                Hidden_SIZE=128,
                                LSTM_layer_NUM=1)
                model.load_state_dict(torch.load(Path))
                test_output, attn = model(test_x)
            else:
                model = LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                             INPUT_SIZE=self.INPUT_SIZE,
                             Hidden_SIZE=128,
                             LSTM_layer_NUM=1)
                model.load_state_dict(torch.load(Path))
                test_output = model(test_x)

            pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()
            from sklearn.metrics import classification_report
            print(
                classification_report(y_true=test_y,
                                      y_pred=pred_y,
                                      target_names=['Normal', 'Abnormal']))
            true_pred['pred_' + Event_name] = pred_y
        true_pred['true'] = test_y
        true_pred.to_csv('../result_doc/pred_true_route_leak_train.csv')
Example #4
def main(args):
	# print("num_class:", args.num_class)
	# data_loader     = DataLoader(args)
	data_loader     = Data_Loader(args)
	args.embeddings = data_loader.embeddings
	args.doc_size   = data_loader.doc_size

	model           = Model(args)
	generator       = Generator()
	discrimitor     = Discrimitor(args)

	train(model, data_loader, generator, discrimitor, args)
Example #5
    def test(self, data_path, event_name,
             window):  # API for testing on other datasets
        x, y0 = Data_Loader.load(data_path, window)
        INPUT_SIZE = x.shape[1]
        true_pred = pd.DataFrame()
        Event_list = [
            'prefix_hijack', 'route_leak', 'breakout', 'edge', 'defcon'
        ]
        print(INPUT_SIZE)
        import pickle
        for Event_name in Event_list:
            scaler = pickle.load(
                open('../params/' + Event_name + '_scaler.pkl', 'rb'))
            x0 = scaler.transform(x.values)
            test_x, test_y = Data_Loader.to_timestep(x0, y0.values, window)

            test_x = torch.tensor(test_x, dtype=torch.float32)
            test_y = torch.tensor(np.array(test_y))
            Path = '../params/best_lstm_params_' + Event_name + '.pkl'
            model = LSTM()
            model.load_state_dict(torch.load(Path))
            test_output = model(test_x)
            output = test_output.detach().numpy()
            output_event_conf = output[:, 1]
            # print(len(output_event_conf))
            # print(output_event_conf[output_event_conf>0.8])
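            # Flag an anomaly only when the anomalous-class confidence exceeds
            # 0.9 and the argmax prediction agrees (element-wise AND below).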
            condition_1 = np.array(output_event_conf > 0.9, dtype=int)
            condition_2 = torch.max(test_output, 1)[1].cpu().data.numpy()
            pred_y = condition_1 & condition_2
            from sklearn.metrics import classification_report
            target_l = ['normal']
            target_l.append(event_name)
            print(
                classification_report(y_true=test_y,
                                      y_pred=pred_y,
                                      target_names=target_l))
            true_pred['pred_' + Event_name] = pred_y
        true_pred['true'] = test_y
        true_pred.to_csv('../result_doc/pred_true_' + event_name + '.csv')
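Examples #3 and #5 both call Data_Loader.to_timestep, which is not shown on this page; below is a minimal sketch assuming it slices the scaled feature matrix into overlapping windows for the LSTM. The label alignment to the last row of each window is an assumption.

import numpy as np

def to_timestep(x, y, window):
    # Assumed behavior: overlapping sliding windows of `window` rows, giving
    # one sample of shape (window, n_features) per position, with the label
    # taken from the window's last row.
    xs, ys = [], []
    for i in range(len(x) - window + 1):
        xs.append(x[i:i + window])
        ys.append(y[i + window - 1])
    return np.array(xs), np.array(ys)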
Example #6
from Data_Loader import Data_Loader
from Model import Neural_Model

if __name__ == '__main__':
    # loading data
    loader = Data_Loader('./transfer_data.npy', 0.7)
    train_X, train_Y, test_X, test_Y = loader.get_data()
    # build model
    config = {
        'input dim': train_X.shape[1],
        'dense layer 1 dim': 128,
        'dense layer 2 dim': 128,
        'output dim': train_Y.shape[1],
        'epochs': 5,
        'loss': 'mean_absolute_error'
    }
    model = Neural_Model(config)
    model.build_model()
    # training
    model.train_model(train_X, train_Y)
    # evaluate
    model.test_model(test_X, test_Y)
Example #7
n_train = int(N_sample * (1 - test_size) * (1 - val_size))

epochs = int(n_iter / (n_train / batch_size))

f = open("Trainning_INFO_Regression_58k_QQBAL.txt", "w+")

f.write('INFO: Epochs:{} -- Batch size:{} \n'.format(epochs, batch_size))

start = time.time()

X, y = Load_Files('truth_DR12Q.fits',
                  'data_dr12.fits',
                  N_sample, ['QSO', 'QSO_BAL'],
                  classification=False)
train_loader, test_loader, val_loader = Data_Loader(X, y, N_sample, batch_size,
                                                    test_size, val_size)
"""
for i in range(100):
    x=np.linspace(300,1000,443)
    print('Redshift:{}'.format(y[i]))
    plt.plot(x,X[i,:])
    plt.xlabel('Wavelength')
    plt.ylabel('Renormalized Flux [Arb. units]')
    plt.show()
    
"""

class Net_R(nn.Module):
Example #8
    def baseline(self,
                 Event_num,
                 read_from_file=True,
                 include_MANRS_data=True,
                 baseline_Feature=False):  # baseline methods: SVM, RF

        Event_name = self.Event_list[Event_num - 1]
        target_list = ['normal', Event_name]
        loader = Data_Loader(Event_name=Event_name,
                             Event_num=Event_num,
                             TIME_STEP=1,
                             WINDOW_SIZE=self.WINDOW_SIZE)

        train_x, train_y, test_x, test_y, eval_x, eval_y = loader.loadDataSet(
            read_from_file=read_from_file,
            include_MANRS_data=include_MANRS_data,
            baseline=baseline_Feature)
        self.INPUT_SIZE = loader.INPUT_SIZE
        train_x = train_x.reshape([-1, self.INPUT_SIZE * self.WINDOW_SIZE])
        test_x = test_x.reshape([-1, self.INPUT_SIZE * self.WINDOW_SIZE])
        from sklearn import tree
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.svm import SVC
        from sklearn.model_selection import GridSearchCV
        tuned_param_SVC = [{
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000]
        }, {
            'kernel': ['linear'],
            'C': [1, 10, 100, 1000]
        }]
        scores = ['precision', 'f1']
        tuned_param_RT = [{
            'n_estimators': range(20, 500, 30),
        }]
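        # Grid-search SVM and Random Forest once per scoring metric
        # ('precision', then 'f1'), reporting CV results for each fit.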
        for score in scores:
            svc = GridSearchCV(SVC(),
                               tuned_param_SVC,
                               scoring='%s_macro' % score,
                               n_jobs=8)
            rf = GridSearchCV(RandomForestClassifier(bootstrap=True,
                                                     oob_score=True),
                              tuned_param_RT,
                              scoring='%s_macro' % score,
                              n_jobs=8)
            # tree=tree.DecisionTree()
            models = [svc, rf]
            for clf in models:
                clf.fit(train_x, train_y)
                pred_y = clf.predict(test_x)
                message = 'Tuning hyper-parameters for %s\n\n' % score
                message += "Best parameters set found on development set:\n\n"
                message += str(clf.best_params_)
                message += '\n\n'
                message += "Grid scores on development set:\n\n"

                print("Best parameters set found on development set:")
                print()
                print(clf.best_params_)
                print()
                print("Grid scores on development set:")
                print()
                means = clf.cv_results_['mean_test_score']
                stds = clf.cv_results_['std_test_score']
                for mean, std, params in zip(means, stds,
                                             clf.cv_results_['params']):
                    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
                    message += "%0.3f (+/-%0.03f) for %r \n" % (mean, std * 2,
                                                                params)
                message += "\nDetailed classification report:\n\nThe model is trained on the full development set.\nThe scores are computed on the full evaluation set.\n\n"
                print()

                print("Detailed classification report:")
                print()
                print("The model is trained on the full development set.")
                print("The scores are computed on the full evaluation set.")
                print()
                from sklearn.metrics import classification_report
                message += str(
                    classification_report(y_true=test_y,
                                          y_pred=pred_y,
                                          target_names=target_list))
                print(
                    classification_report(y_true=test_y,
                                          y_pred=pred_y,
                                          target_names=target_list))
                baseline_txt_path = '../result_doc/baseline.txt'
                with open(baseline_txt_path, 'a') as f:
                    f.write(message)
Example #9
    def transfer_fine_tune(self,
                           Event_name,
                           Scheme='A',
                           save_epoch=0,
                           labelsmoothing=False,
                           confidence=False,
                           base_lr=1e-6,
                           out_lr=1e-4):  # the implementation of transfer learning
        loader = Data_Loader(Event_name='route_leak', Event_num=2)
        train_x, train_y0, test_x, test_y = loader.loadroute_leak_train_test(
            scheme=Scheme)
        INPUT_SIZE = train_x.shape[1]
        true_pred = pd.DataFrame()
        #Event_name = 'route_leak'
        self.INPUT_SIZE = loader.INPUT_SIZE
        print(INPUT_SIZE)
        import pickle

        scaler = pickle.load(
            open('../params/' + Event_name + '_scaler.pkl', 'rb'))
        train_x = scaler.transform(train_x.values)
        train_x, train_y = loader.to_timestep(x=train_x,
                                              y=train_y0.values,
                                              event_len=1440)
        train_x, eval_x, train_y, eval_y = train_test_split(train_x,
                                                            train_y,
                                                            test_size=0.2,
                                                            random_state=42)

        train_x = torch.tensor(train_x, dtype=torch.float32)
        train_y = torch.tensor(train_y, dtype=torch.long)
        datasets = Data.TensorDataset(train_x, train_y)
        train_loader = Data.DataLoader(dataset=datasets,
                                       batch_size=self.BATCH_SIZE,
                                       shuffle=True,
                                       num_workers=2)

        test_x = scaler.transform(test_x.values)
        test_x, test_y = loader.to_timestep(x=test_x,
                                            y=test_y.values,
                                            event_len=1440)

        test_x = torch.tensor(test_x, dtype=torch.float32)
        test_y = torch.tensor(np.array(test_y))

        eval_x = torch.tensor(eval_x, dtype=torch.float32)
        eval_y = torch.tensor(np.array(eval_y))
        eval_x = eval_x.cuda()
        eval_y = eval_y.numpy()
        test_x = test_x.cuda()
        test_y = test_y.numpy()

        Path = '../params/best_lstm_params_' + Event_name + '.pkl'

        model = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                        INPUT_SIZE=self.INPUT_SIZE,
                        Hidden_SIZE=128,
                        LSTM_layer_NUM=1)
        model.load_state_dict(torch.load(Path))

        model = model.cuda()
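        # Discriminative fine-tuning: the pretrained LSTM body and the new
        # output layer get separate learning rates via two parameter groups.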
        ignored_params = list(map(id, model.out.parameters()))
        base_params = filter(lambda p: id(p) not in ignored_params,
                             model.parameters())
        optimizer = torch.optim.Adam([{
            'params': base_params,
            'lr': base_lr
        }, {
            'params': model.out.parameters(),
            'lr': out_lr
        }])

        if labelsmoothing:
            print("label smoothing")
            loss_func = LabelSmoothingLoss(
                classes=2, smoothing=0.5
            )  # label-smoothing variant of CrossEntropyLoss, to reduce overfitting
        else:
            loss_func = nn.CrossEntropyLoss()

        from sklearn.metrics import f1_score
        best_f1_score = 0.0

        for epoch in range(self.EPOCH):
            for step, (x, y) in tqdm(enumerate(train_loader)):
                x = x.cuda()
                y = y.cuda()
                output, attn_weights = model(x)
                #print(y)
                loss = loss_func(output, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if step % 1400 == 0:

                    eval_output, attn_weights = model(eval_x)
                    pred_y = torch.max(eval_output, 1)[1].cpu().data.numpy()
                    accuracy = float(np.sum(pred_y == eval_y)) / float(
                        eval_y.size)
                    print('Epoch: ', epoch,
                          '| train loss: %.4f' % loss.cpu().data.numpy(),
                          '| eval accuracy: %.2f' % accuracy)
                    from sklearn.metrics import classification_report

                    temp_str = classification_report(
                        y_true=eval_y,
                        y_pred=pred_y,
                        target_names=['normal', 'abnormal'])
                    a = classification_report(
                        y_true=eval_y,
                        y_pred=pred_y,
                        target_names=['normal', 'abnormal'],
                        output_dict=True)
                    temp_f1 = a['abnormal']['f1-score']
                    #if (best_f1_score < temp_f1):
                    if epoch == save_epoch:
                        print(temp_str + '\n' + str(temp_f1))
                        with open(
                                '../result_doc/retrain' + Event_name + '.txt',
                                'a') as f:
                            message = 'epoch:' + str(
                                epoch) + ' f1_score:' + str(temp_f1) + '\n'
                            f.write(message)
                        best_f1_score = temp_f1
                        torch.save(
                            model.state_dict(),
                            '../params/retrain' + Event_name + Scheme + '.pkl')

        path = '../params/retrain' + Event_name + Scheme + '.pkl'
        model.load_state_dict(torch.load(path))
        test_output, attn_weights = model(test_x)

        if confidence:
            output = test_output.cpu().data.numpy()
            pred_y = output[:, 1]  # anomaly confidence
        else:
            pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()
        #test_report = classification_report(y_true=test_y, y_pred=pred_y,
        #                                    target_names=['Normal', 'Abnormal'])
        #print(test_report)
        self.true_pred['pred_' + Event_name] = pred_y
        self.true_pred['true'] = test_y
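Example #9 relies on a custom LabelSmoothingLoss that is not shown on this page. Below is a minimal sketch of the standard formulation such a class usually implements (an assumption, not necessarily the author's code): the target class keeps probability 1 - smoothing and the remaining mass is spread evenly over the other classes.

import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing  # probability kept on the true class
        self.smoothing = smoothing
        self.classes = classes
        self.dim = dim

    def forward(self, pred, target):
        # pred: raw logits (N, classes); build the smoothed target distribution
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.full_like(pred, self.smoothing / (self.classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))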
Example #10
    def train(self,
              Event_num,
              read_from_file=True,
              include_MANRS_data=True,
              WINDOW_SIZE=30,
              HIDDEN_SIZE=128,
              save_epoch=10):  # the implementation of model training
        '''
        :param Event_num: the index of the anomaly type
        :param read_from_file: whether a preprocessed data file already exists to read from
        :param include_MANRS_data: whether to include the legitimate (MANRS) data
        :param WINDOW_SIZE: the sliding-window size
        :param HIDDEN_SIZE: the hidden size of the LSTM model
        :return: saves the model under '../params/best_lstm_params_' + Event_name + '.pkl'
        '''

        self.WINDOW_SIZE = WINDOW_SIZE
        self.Hidden_SIZE = HIDDEN_SIZE

        global SA_LSTM_flag
        Event_name = self.Event_list[Event_num - 1]
        Path = '../params/best_lstm_params_' + Event_name + '.pkl'
        target_list = ['normal', Event_name]
        loader = Data_Loader(Event_name=Event_name,
                             Event_num=Event_num,
                             TIME_STEP=1,
                             WINDOW_SIZE=self.WINDOW_SIZE)

        train_x, train_y, test_x, test_y, eval_x, eval_y = loader.loadDataSet(
            read_from_file=read_from_file,
            include_MANRS_data=include_MANRS_data)
        self.INPUT_SIZE = loader.INPUT_SIZE
        datasets = Data.TensorDataset(train_x, train_y)
        train_loader = Data.DataLoader(dataset=datasets,
                                       batch_size=self.BATCH_SIZE,
                                       shuffle=True,
                                       num_workers=2)

        eval_x = eval_x.cuda()
        eval_y = eval_y.numpy()
        test_x = test_x.cuda()
        test_y = test_y.numpy()
        if SA_LSTM_flag:
            lstm = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                           INPUT_SIZE=self.INPUT_SIZE,
                           Hidden_SIZE=self.Hidden_SIZE,
                           LSTM_layer_NUM=self.LSTM_layer_NUM)
        else:
            lstm = LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                        INPUT_SIZE=self.INPUT_SIZE,
                        Hidden_SIZE=self.Hidden_SIZE,
                        LSTM_layer_NUM=self.LSTM_layer_NUM)

        lstm = lstm.cuda()
        optimizer = torch.optim.Adam(lstm.parameters(), lr=self.LR)

        loss_func = nn.CrossEntropyLoss()
        h_state = None

        from sklearn.metrics import f1_score
        best_f1_score = 0.0
        best_epoch = 0

        #train_length = len(train_loader)

        for epoch in range(self.EPOCH):
            for step, (x, y) in tqdm(enumerate(train_loader)):
                x = x.cuda()
                y = y.cuda()
                if SA_LSTM_flag:
                    output, attn_weights = lstm(x)
                else:
                    output = lstm(x)

                loss = loss_func(output, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if step % 10000 == 0:
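                    # Periodic evaluation on the held-out test set; the
                    # checkpoint is saved when the chosen save epoch is reached.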
                    if SA_LSTM_flag:
                        eval_output, attn_weights = lstm(test_x)
                    else:
                        eval_output = lstm(test_x)
                    pred_y = torch.max(eval_output, 1)[1].cpu().data.numpy()

                    #print(pred_y)
                    #print(eval_y)
                    accuracy = float(np.sum(pred_y == test_y)) / float(
                        test_y.size)
                    print('Epoch: ', epoch,
                          '| train loss: %.4f' % loss.cpu().data.numpy(),
                          '| test accuracy: %.2f' % accuracy)
                    from sklearn.metrics import classification_report

                    temp_str = classification_report(y_true=test_y,
                                                     y_pred=pred_y,
                                                     target_names=target_list)
                    temp_f1 = f1_score(y_pred=pred_y,
                                       y_true=test_y,
                                       average='macro')
                    print('temp_f1', temp_f1)
                    # temp_sum=temp_f1+temp_route_f1
                    #if (best_f1_score < temp_f1):
                    if (epoch == save_epoch):
                        print(temp_str + '\n' + str(temp_f1))
                        with open(
                                '../result_doc/test_best_f1' + Event_name +
                                '.txt', 'a') as f:
                            message = 'epoch:' + str(
                                epoch) + ' f1_score:' + str(temp_f1) + '\n'
                            f.write(message)
                        best_f1_score = temp_f1
                        best_epoch = epoch
                        torch.save(lstm.state_dict(), Path)

        lstm.load_state_dict(torch.load(Path))
        if SA_LSTM_flag:
            test_output, attn_weights = lstm(test_x)
        else:
            test_output = lstm(test_x)
        pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()

        from sklearn.metrics import classification_report

        test_report = classification_report(y_true=test_y,
                                            y_pred=pred_y,
                                            target_names=target_list)
        test_parameter_path = '../result_doc/test_parameter' + '_' + Event_name + '.txt'
        with open(test_parameter_path, 'a') as f:
            message = "TimeStep:" + str(
                self.TIME_STEP
            ) + '\tWINDOW_SIZE:' + str(
                self.WINDOW_SIZE
            ) + "\tLSTM_NUM: " + str(self.LSTM_NUM) + '\tLayer num: ' + str(
                self.LSTM_layer_NUM
            ) + '\tLR:' + str(self.LR) + '\tBatch_size: ' + str(
                self.BATCH_SIZE) + '\tHidden_size: ' + str(
                    self.Hidden_SIZE
                ) + '\tNormalizer:MinMaxScaler' + '\t epoch:' + str(
                    best_epoch) + '\tf1_score:' + str(
                        best_f1_score) + '\n' + 'include_MANRS_data:' + str(
                            include_MANRS_data
                        ) + '\t time_bins:60s' + '\n' + test_report + '\n\n'
            print(message)

            f.write(message)
        self.models.append(lstm)
        #attn_weights_df=pd.DataFrame(best_attn_weights)
        #print(attn_weights_df)
        #attn_weights_df.to_csv('../result_doc/atten_weights' + '_' + Event_name + '.csv')
        torch.save(lstm, '../params/lstm' + Event_name + '.pkl')
Example #11
n_train = int(N_sample * (1 - test_size) * (1 - val_size))
epochs = int(n_iter / (n_train / batch_size))

fi = open("Trainning_INFO_80k.txt", "w+")

fi.write('INFO: Epochs:{} -- Batch size:{} \n'.format(epochs, batch_size))

start = time.time()

X, y = Load_Files('truth_DR12Q.fits',
                  'data_dr12.fits',
                  N_sample,
                  None,
                  classification=True)
train_loader, test_loader, val_loader, train_s, test_s, val_s = Data_Loader(
    X, y, N_sample, batch_size, test_size, val_size, classification=True)

# CNN for classification

learning_rate = 0.1


class Net_C(nn.Module):
    def __init__(self):
        super(Net_C, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, 15, stride=2)
        self.conv2 = nn.Conv1d(64, 128, 15, stride=2)
        self.conv3 = nn.Conv1d(128, 256, 15, stride=2)
        self.conv4 = nn.Conv1d(256, 256, 15, stride=2)
        self.pool = nn.MaxPool1d(2, 1)
        self.fc1 = nn.Linear(3328, 16)