Example #1
    def baseline(self,
                 Event_num,
                 read_from_file=True,
                 include_MANRS_data=True,
                 baseline_Feature=False):  # baseline methods: SVM and Random Forest

        Event_name = self.Event_list[Event_num - 1]
        target_list = ['normal', Event_name]
        loader = Data_Loader(Event_name=Event_name,
                             Event_num=Event_num,
                             TIME_STEP=1,
                             WINDOW_SIZE=self.WINDOW_SIZE)

        train_x, train_y, test_x, test_y, eval_x, eval_y = loader.loadDataSet(
            read_from_file=read_from_file,
            include_MANRS_data=include_MANRS_data,
            baseline=baseline_Feature)
        self.INPUT_SIZE = loader.INPUT_SIZE
        # flatten each (WINDOW_SIZE x INPUT_SIZE) window into one feature vector for sklearn
        train_x = train_x.reshape([-1, self.INPUT_SIZE * self.WINDOW_SIZE])
        test_x = test_x.reshape([-1, self.INPUT_SIZE * self.WINDOW_SIZE])
        from sklearn import tree
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.svm import SVC
        from sklearn.model_selection import GridSearchCV
        from sklearn.metrics import classification_report
        tuned_param_SVC = [{
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100, 1000]
        }, {
            'kernel': ['linear'],
            'C': [1, 10, 100, 1000]
        }]
        scores = ['precision', 'f1']
        tuned_param_RF = [{
            'n_estimators': range(20, 500, 30),
        }]
        for score in scores:
            svc = GridSearchCV(SVC(),
                               tuned_param_SVC,
                               scoring='%s_macro' % score,
                               n_jobs=8)
            rf = GridSearchCV(RandomForestClassifier(bootstrap=True,
                                                     oob_score=True),
                              tuned_param_RF,
                              scoring='%s_macro' % score,
                              n_jobs=8)
            # dtree = tree.DecisionTreeClassifier()  # optional third baseline
            models = [svc, rf]
            for clf in models:
                clf.fit(train_x, train_y)
                pred_y = clf.predict(test_x)
                print('Tuning hyper-parameters for %s' % score)
                message = 'Tuning hyper-parameters for %s\n\n' % score
                message += "Best parameters set found on development set:\n\n"
                message += str(clf.best_params_)
                message += '\n\n'
                message += "Grid scores on development set:\n\n"

                print("Best parameters set found on development set:")
                print()
                print(clf.best_params_)
                print()
                print("Grid scores on development set:")
                print()
                means = clf.cv_results_['mean_test_score']
                stds = clf.cv_results_['std_test_score']
                for mean, std, params in zip(means, stds,
                                             clf.cv_results_['params']):
                    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
                    message += "%0.3f (+/-%0.03f) for %r \n" % (mean, std * 2,
                                                                params)
                message += "\nDetailed classification report:\n\nThe model is trained on the full development set.\nThe scores are computed on the full evaluation set.\n\n"
                print()

                print("Detailed classification report:")
                print()
                print("The model is trained on the full development set.")
                print("The scores are computed on the full evaluation set.")
                print()
                report = classification_report(y_true=test_y,
                                               y_pred=pred_y,
                                               target_names=target_list)
                message += report
                print(report)
                baseline_txt_path = '../result_doc/baseline.txt'
                with open(baseline_txt_path, 'a') as f:
                    f.write(message)
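
A minimal, hypothetical usage sketch for the method above. The owning class (called Detection here) and its constructor are assumptions; only baseline()'s own signature and the output path come from the example:

    # Hypothetical driver -- the class name and constructor are assumptions.
    if __name__ == '__main__':
        detector = Detection()  # assumed to define Event_list and WINDOW_SIZE
        detector.baseline(Event_num=1,  # first event in Event_list
                          read_from_file=True)
        # Grid-search results are appended to ../result_doc/baseline.txt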
Example #2
    def train(self,
              Event_num,
              read_from_file=True,
              include_MANRS_data=True,
              WINDOW_SIZE=30,
              HIDDEN_SIZE=128,
              save_epoch=10):  # trains and saves the LSTM model
        '''
        :param Event_num: index of the anomaly event type
        :param read_from_file: whether the preprocessed dataset file already exists and should be read
        :param include_MANRS_data: whether to include the legitimate (MANRS) data
        :param WINDOW_SIZE: the sliding-window size
        :param HIDDEN_SIZE: the hidden size of the LSTM model
        :param save_epoch: the epoch at which the model parameters are saved
        :return: saves the model under the path '../params/best_lstm_params_' + Event_name + '.pkl'
        '''

        self.WINDOW_SIZE = WINDOW_SIZE
        self.Hidden_SIZE = HIDDEN_SIZE

        global SA_LSTM_flag
        Event_name = self.Event_list[Event_num - 1]
        Path = '../params/best_lstm_params_' + Event_name + '.pkl'
        target_list = ['normal', Event_name]
        loader = Data_Loader(Event_name=Event_name,
                             Event_num=Event_num,
                             TIME_STEP=1,
                             WINDOW_SIZE=self.WINDOW_SIZE)

        train_x, train_y, test_x, test_y, eval_x, eval_y = loader.loadDataSet(
            read_from_file=read_from_file,
            include_MANRS_data=include_MANRS_data)
        self.INPUT_SIZE = loader.INPUT_SIZE
        datasets = Data.TensorDataset(train_x, train_y)
        train_loader = Data.DataLoader(dataset=datasets,
                                       batch_size=self.BATCH_SIZE,
                                       shuffle=True,
                                       num_workers=2)

        # the evaluation split is prepared here but unused below; monitoring uses the test split
        eval_x = eval_x.cuda()
        eval_y = eval_y.numpy()
        test_x = test_x.cuda()
        test_y = test_y.numpy()
        if SA_LSTM_flag:
            lstm = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                           INPUT_SIZE=self.INPUT_SIZE,
                           Hidden_SIZE=self.Hidden_SIZE,
                           LSTM_layer_NUM=self.LSTM_layer_NUM)
        else:
            lstm = LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                        INPUT_SIZE=self.INPUT_SIZE,
                        Hidden_SIZE=self.Hidden_SIZE,
                        LSTM_layer_NUM=self.LSTM_layer_NUM)

        lstm = lstm.cuda()
        optimizer = torch.optim.Adam(lstm.parameters(), lr=self.LR)

        loss_func = nn.CrossEntropyLoss()
        h_state = None  # unused: the LSTM modules manage their own hidden state

        from sklearn.metrics import f1_score, classification_report
        best_f1_score = 0.0
        best_epoch = 0

        #train_length = len(train_loader)

        for epoch in range(self.EPOCH):
            for step, (x, y) in tqdm(enumerate(train_loader)):
                x = x.cuda()
                y = y.cuda()
                if SA_LSTM_flag:
                    output, attn_weights = lstm(x)
                else:
                    output = lstm(x)

                loss = loss_func(output, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if step % 10000 == 0:
                    with torch.no_grad():  # monitoring only; no gradients needed
                        if SA_LSTM_flag:
                            test_output, attn_weights = lstm(test_x)
                        else:
                            test_output = lstm(test_x)
                    pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()

                    #print(pred_y)
                    #print(eval_y)
                    accuracy = float(np.sum(pred_y == test_y)) / float(
                        test_y.size)
                    print('Epoch: ', epoch,
                          '| train loss: %.4f' % loss.cpu().data.numpy(),
                          '| test accuracy: %.2f' % accuracy)

                    temp_str = classification_report(y_true=test_y,
                                                     y_pred=pred_y,
                                                     target_names=target_list)
                    temp_f1 = f1_score(y_pred=pred_y,
                                       y_true=test_y,
                                       average='macro')
                    print('temp_f1', temp_f1)
                    # save at the designated epoch; the commented condition below
                    # would instead save whenever the macro F1 improves
                    # if best_f1_score < temp_f1:
                    if epoch == save_epoch:
                        print(temp_str + '\n' + str(temp_f1))
                        with open(
                                '../result_doc/test_best_f1' + Event_name +
                                '.txt', 'a') as f:
                            message = 'epoch:' + str(
                                epoch) + ' f1_score:' + str(temp_f1) + '\n'
                            f.write(message)
                        best_f1_score = temp_f1
                        best_epoch = epoch
                        torch.save(lstm.state_dict(), Path)

        lstm.load_state_dict(torch.load(Path))
        with torch.no_grad():  # final test-set evaluation
            if SA_LSTM_flag:
                test_output, attn_weights = lstm(test_x)
            else:
                test_output = lstm(test_x)
        pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()


        test_report = classification_report(y_true=test_y,
                                            y_pred=pred_y,
                                            target_names=target_list)
        test_parameter_path = '../result_doc/test_parameter' + '_' + Event_name + '.txt'
        with open(test_parameter_path, 'a') as f:
            message = "TimeStep:" + str(
                self.TIME_STEP
            ) + '\tWINDOW_SIZE:' + str(
                self.WINDOW_SIZE
            ) + "\tLSTM_NUM: " + str(self.LSTM_NUM) + '\tLayer num: ' + str(
                self.LSTM_layer_NUM
            ) + '\tLR:' + str(self.LR) + '\tBatch_size: ' + str(
                self.BATCH_SIZE) + '\tHidden_size: ' + str(
                    self.Hidden_SIZE
                ) + '\tNormalizer:MinMaxScaler' + '\t epoch:' + str(
                    best_epoch) + '\tf1_score:' + str(
                        best_f1_score) + '\n' + 'include_MANRS_data:' + str(
                            include_MANRS_data
                        ) + '\t time_bins:60s' + '\n' + test_report + '\n\n'
            print(message)

            f.write(message)
        self.models.append(lstm)
        #attn_weights_df=pd.DataFrame(best_attn_weights)
        #print(attn_weights_df)
        #attn_weights_df.to_csv('../result_doc/atten_weights' + '_' + Event_name + '.csv')
        torch.save(lstm, '../params/lstm' + Event_name + '.pkl')
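
A minimal, hypothetical usage sketch for the training method. The owning class (called Detection here) and its constructor are assumptions; the argument names and the save paths come from the example:

    # Hypothetical driver -- the class name and constructor are assumptions.
    if __name__ == '__main__':
        detector = Detection()  # assumed to define Event_list, BATCH_SIZE, LR, EPOCH, etc.
        detector.train(Event_num=1,
                       read_from_file=True,
                       include_MANRS_data=True,
                       WINDOW_SIZE=30,
                       HIDDEN_SIZE=128,
                       save_epoch=10)
        # the state_dict goes to ../params/best_lstm_params_<Event_name>.pkl,
        # the full model to ../params/lstm<Event_name>.pkl

The LSTM and SA_LSTM modules themselves are not shown in the example. Below is a minimal sketch of an LSTM classifier compatible with the call sites above, assuming batch-first windows and a two-class output; this is an illustration, not the authors' definition:

    import torch.nn as nn

    class LSTM(nn.Module):
        def __init__(self, WINDOW_SIZE, INPUT_SIZE, Hidden_SIZE, LSTM_layer_NUM):
            super().__init__()
            self.lstm = nn.LSTM(input_size=INPUT_SIZE,
                                hidden_size=Hidden_SIZE,
                                num_layers=LSTM_layer_NUM,
                                batch_first=True)
            self.fc = nn.Linear(Hidden_SIZE, 2)  # 'normal' vs. the anomaly class

        def forward(self, x):              # x: (batch, WINDOW_SIZE, INPUT_SIZE)
            out, _ = self.lstm(x)          # out: (batch, WINDOW_SIZE, Hidden_SIZE)
            return self.fc(out[:, -1, :])  # logits from the last time step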