def test_loading():
    loader = Data_Loader('./transfer_data.npy', 0.7)
    train_X, train_Y, test_X, test_Y = loader.get_data()
    assert train_X.shape == (int(1000000 * 0.7), 81)
    assert train_Y.shape == (int(1000000 * 0.7), 81)
    assert test_X.shape == (1000000 - int(1000000 * 0.7), 81)
    assert test_Y.shape == (1000000 - int(1000000 * 0.7), 81)
def get_data(file_path, num_words=None, maxlen=150, filter_json=None):
    data_loader = Data_Loader()
    paras, labels = data_loader.get_para_label(file_path, filter_json)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(paras)
    print('tokenizer fitted successfully!')

    sequences = tokenizer.texts_to_sequences(paras)
    print('sequences converted successfully!')

    padded_sequences = pad_sequences(sequences, maxlen=maxlen)
    print('sequences padded successfully!')

    return (np.array(padded_sequences), np.array(labels),
            tokenizer.word_index, tokenizer.word_counts)
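# Hedged usage sketch for get_data above. The path './paras.json' and the
# num_words value are placeholders, not taken from the original code.
if __name__ == '__main__':
    X, y, word_index, word_counts = get_data('./paras.json',
                                             num_words=20000,
                                             maxlen=150)
    # X: padded integer sequences of shape (num_paragraphs, 150)
    # y: labels returned by Data_Loader.get_para_label
    # word_index / word_counts: statistics from the fitted Tokenizer
    print(X.shape, len(word_index))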
def test_route_leak(self):
    '''
    description: directly use the already-trained model (from the weak dataset)
                 to predict on the well-known dataset
    :return: csv file
    '''
    global SA_LSTM_flag
    loader = Data_Loader(Event_name='route_leak', Event_num=2)
    self.INPUT_SIZE = loader.INPUT_SIZE
    x, y0 = loader.loadroute_leak()
    INPUT_SIZE = x.shape[1]
    true_pred = pd.DataFrame()
    Event_list = [
        'prefix_hijack', 'route_leak', 'breakout', 'edge', 'defcon'
    ]
    print(INPUT_SIZE)
    import pickle
    from sklearn.metrics import classification_report
    for Event_name in Event_list:
        # restore the scaler fitted on this event's training data
        scaler = pickle.load(
            open('../params/' + Event_name + '_scaler.pkl', 'rb'))
        x0 = scaler.transform(x.values)
        test_x, test_y = loader.to_timestep(x=x0, y=y0.values, event_len=1440)
        test_x = torch.tensor(test_x, dtype=torch.float32)
        test_y = torch.tensor(np.array(test_y))
        Path = '../params/best_lstm_params_' + Event_name + '.pkl'
        if SA_LSTM_flag:
            model = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                            INPUT_SIZE=self.INPUT_SIZE,
                            Hidden_SIZE=128,
                            LSTM_layer_NUM=1)
            model.load_state_dict(torch.load(Path))
            test_output, attn = model(test_x)
        else:
            model = LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                         INPUT_SIZE=self.INPUT_SIZE,
                         Hidden_SIZE=128,
                         LSTM_layer_NUM=1)
            model.load_state_dict(torch.load(Path))
            test_output = model(test_x)
        pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()
        print(
            classification_report(y_true=test_y,
                                  y_pred=pred_y,
                                  target_names=['Normal', 'Abnormal']))
        true_pred['pred_' + Event_name] = pred_y
        true_pred['true'] = test_y
    true_pred.to_csv('../result_doc/pred_true_route_leak_train.csv')
def main(args):
    # print("num_class:", args.num_class)
    # data_loader = DataLoader(args)
    data_loader = Data_Loader(args)
    args.embeddings = data_loader.embeddings
    args.doc_size = data_loader.doc_size

    model = Model(args)
    generator = Generator()
    discrimitor = Discrimitor(args)

    train(model, data_loader, generator, discrimitor, args)
def test(self, data_path, event_name, window):
    # API for testing on the other datasets
    x, y0 = Data_Loader.load(data_path, window)
    INPUT_SIZE = x.shape[1]
    true_pred = pd.DataFrame()
    Event_list = [
        'prefix_hijack', 'route_leak', 'breakout', 'edge', 'defcon'
    ]
    print(INPUT_SIZE)
    import pickle
    from sklearn.metrics import classification_report
    for Event_name in Event_list:
        scaler = pickle.load(
            open('../params/' + Event_name + '_scaler.pkl', 'rb'))
        x0 = scaler.transform(x.values)
        test_x, test_y = Data_Loader.to_timestep(x0, y0.values, window)
        test_x = torch.tensor(test_x, dtype=torch.float32)
        test_y = torch.tensor(np.array(test_y))
        Path = '../params/best_lstm_params_' + Event_name + '.pkl'
        model = LSTM()
        model.load_state_dict(torch.load(Path))
        test_output = model(test_x)
        output = test_output.detach().numpy()
        output_event_conf = output[:, 1]
        # print(len(output_event_conf))
        # print(output_event_conf[output_event_conf > 0.8])
        # flag a window as anomalous only if the anomaly score exceeds 0.9
        # AND the argmax prediction is the anomaly class
        condition_1 = np.array(output_event_conf > 0.9, dtype=int)
        condition_2 = torch.max(test_output, 1)[1].cpu().data.numpy()
        pred_y = condition_1 & condition_2
        target_l = ['normal', event_name]
        print(
            classification_report(y_true=test_y,
                                  y_pred=pred_y,
                                  target_names=target_l))
        true_pred['pred_' + Event_name] = pred_y
        true_pred['true'] = test_y
    true_pred.to_csv('../result_doc/pred_true_' + event_name + '.csv')
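# Hedged illustration of the confidence-gated prediction rule used in test()
# above: a window counts as anomalous only when the anomaly score exceeds 0.9
# AND the argmax class is the anomaly class. The toy tensor below is made up
# purely for demonstration and is not part of the original code.
import numpy as np
import torch

toy_output = torch.tensor([[0.05, 0.95],   # confident anomaly -> 1
                           [0.40, 0.60],   # argmax anomaly, low confidence -> 0
                           [0.92, 0.08]])  # normal -> 0
conf_gate = np.array(toy_output.numpy()[:, 1] > 0.9, dtype=int)
argmax_pred = torch.max(toy_output, 1)[1].numpy()
pred = conf_gate & argmax_pred
print(pred)  # [1 0 0]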
from Data_Loader import Data_Loader
from Model import Neural_Model

if __name__ == '__main__':
    # loading data
    loader = Data_Loader('./transfer_data.npy', 0.7)
    train_X, train_Y, test_X, test_Y = loader.get_data()

    # build model
    config = {
        'input dim': train_X.shape[1],
        'dense layer 1 dim': 128,
        'dense layer 2 dim': 128,
        'output dim': train_Y.shape[1],
        'epochs': 5,
        'loss': 'mean_absolute_error'
    }
    model = Neural_Model(config)
    model.build_model()

    # training
    model.train_model(train_X, train_Y)

    # evaluate
    model.test_model(test_X, test_Y)
n_train = int(N_sample * (1 - test_size) * (1 - val_size))
epochs = int(n_iter / (n_train / batch_size))

f = open("Trainning_INFO_Regression_58k_QQBAL.txt", "w+")
f.write('INFO: Epochs:{} -- Batch size:{} \n'.format(epochs, batch_size))

start = time.time()
X, y = Load_Files('truth_DR12Q.fits', 'data_dr12.fits', N_sample,
                  ['QSO', 'QSO_BAL'], classification=False)
train_loader, test_loader, val_loader = Data_Loader(X, y, N_sample,
                                                    batch_size, test_size,
                                                    val_size)
"""
for i in range(100):
    x = np.linspace(300, 1000, 443)
    print('Redshift:{}'.format(y[i]))
    plt.plot(x, X[i, :])
    plt.xlabel('Wavelength')
    plt.ylabel('Renormalized Flux [Arb units]')
    plt.show()
"""

# In[13]:


class Net_R(nn.Module):
def baseline(self,
             Event_num,
             read_from_file=True,
             include_MANRS_data=True,
             baseline_Feature=False):
    # baseline methods: SVM and RF
    Event_name = self.Event_list[Event_num - 1]
    target_list = ['normal', Event_name]
    loader = Data_Loader(Event_name=Event_name,
                         Event_num=Event_num,
                         TIME_STEP=1,
                         WINDOW_SIZE=self.WINDOW_SIZE)
    train_x, train_y, test_x, test_y, eval_x, eval_y = loader.loadDataSet(
        read_from_file=read_from_file,
        include_MANRS_data=include_MANRS_data,
        baseline=baseline_Feature)
    self.INPUT_SIZE = loader.INPUT_SIZE
    # flatten each sliding window into a single feature vector
    train_x = train_x.reshape([-1, self.INPUT_SIZE * self.WINDOW_SIZE])
    test_x = test_x.reshape([-1, self.INPUT_SIZE * self.WINDOW_SIZE])

    from sklearn import tree
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import classification_report

    tuned_param_SVC = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }, {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    }]
    scores = ['precision', 'f1']
    tuned_param_RT = [{
        'n_estimators': range(20, 500, 30),
    }]
    for score in scores:
        svc = GridSearchCV(SVC(),
                           tuned_param_SVC,
                           scoring='%s_macro' % score,
                           n_jobs=8)
        rf = GridSearchCV(RandomForestClassifier(bootstrap=True,
                                                 oob_score=True),
                          tuned_param_RT,
                          scoring='%s_macro' % score,
                          n_jobs=8)
        # tree=tree.DecisionTree()
        models = [svc, rf]
        for clf in models:
            clf.fit(train_x, train_y)
            pred_y = clf.predict(test_x)
            message = 'Tuning hyper-parameters for %s\n\n' % score
            message += "Best parameters set found on development set:\n\n"
            message += str(clf.best_params_)
            message += '\n\n'
            message += "Grid scores on development set:\n\n"
            print("Best parameters set found on development set:")
            print()
            print(clf.best_params_)
            print()
            print("Grid scores on development set:")
            print()
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
                message += "%0.3f (+/-%0.03f) for %r \n" % (mean, std * 2,
                                                            params)
            message += ("\nDetailed classification report:\n\n"
                        "The model is trained on the full development set.\n"
                        "The scores are computed on the full evaluation set.\n\n")
            print()
            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            message += str(
                classification_report(y_true=test_y,
                                      y_pred=pred_y,
                                      target_names=target_list))
            print(
                classification_report(y_true=test_y,
                                      y_pred=pred_y,
                                      target_names=target_list))
            baseline_txt_path = '../result_doc/baseline.txt'
            with open(baseline_txt_path, 'a') as f:
                f.write(message)
def transfer_fine_tune(self,
                       Event_name,
                       Scheme='A',
                       save_epoch=0,
                       labelsmoothing=False,
                       confidence=False,
                       base_lr=1e-6,
                       out_lr=1e-4):
    # the implementation of transfer learning
    loader = Data_Loader(Event_name='route_leak', Event_num=2)
    train_x, train_y0, test_x, test_y = loader.loadroute_leak_train_test(
        scheme=Scheme)
    INPUT_SIZE = train_x.shape[1]
    true_pred = pd.DataFrame()
    # Event_name = 'route_leak'
    self.INPUT_SIZE = loader.INPUT_SIZE
    print(INPUT_SIZE)
    import pickle
    scaler = pickle.load(
        open('../params/' + Event_name + '_scaler.pkl', 'rb'))
    train_x = scaler.transform(train_x.values)
    train_x, train_y = loader.to_timestep(x=train_x,
                                          y=train_y0.values,
                                          event_len=1440)
    train_x, eval_x, train_y, eval_y = train_test_split(train_x,
                                                        train_y,
                                                        test_size=0.2,
                                                        random_state=42)
    train_x = torch.tensor(train_x, dtype=torch.float32)
    train_y = torch.tensor(train_y, dtype=torch.long)
    datasets = Data.TensorDataset(train_x, train_y)
    train_loader = Data.DataLoader(dataset=datasets,
                                   batch_size=self.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=2)
    test_x = scaler.transform(test_x.values)
    test_x, test_y = loader.to_timestep(x=test_x,
                                        y=test_y.values,
                                        event_len=1440)
    test_x = torch.tensor(test_x, dtype=torch.float32)
    test_y = torch.tensor(np.array(test_y))
    eval_x = torch.tensor(eval_x, dtype=torch.float32)
    eval_y = torch.tensor(np.array(eval_y))
    eval_x = eval_x.cuda()
    eval_y = eval_y.numpy()
    test_x = test_x.cuda()
    test_y = test_y.numpy()

    Path = '../params/best_lstm_params_' + Event_name + '.pkl'
    model = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                    INPUT_SIZE=self.INPUT_SIZE,
                    Hidden_SIZE=128,
                    LSTM_layer_NUM=1)
    model.load_state_dict(torch.load(Path))
    model = model.cuda()

    # fine-tune with two learning rates: the pretrained LSTM body gets base_lr,
    # the freshly adapted output layer gets the larger out_lr
    ignored_params = list(map(id, model.out.parameters()))
    base_params = filter(lambda p: id(p) not in ignored_params,
                         model.parameters())
    optimizer = torch.optim.Adam([{
        'params': base_params,
        'lr': base_lr
    }, {
        'params': model.out.parameters(),
        'lr': out_lr
    }])
    if labelsmoothing:
        print("label smoothing")
        # special variant of CrossEntropyLoss, used to prevent overfitting
        loss_func = LabelSmoothingLoss(classes=2, smoothing=0.5)
    else:
        loss_func = nn.CrossEntropyLoss()
    from sklearn.metrics import f1_score, classification_report
    best_f1_score = 0.0
    for epoch in range(self.EPOCH):
        for step, (x, y) in tqdm(enumerate(train_loader)):
            x = x.cuda()
            y = y.cuda()
            output, attn_weights = model(x)
            # print(y)
            loss = loss_func(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 1400 == 0:
                eval_output, attn_weights = model(eval_x)
                pred_y = torch.max(eval_output, 1)[1].cpu().data.numpy()
                accuracy = float(np.sum(pred_y == eval_y)) / float(
                    eval_y.size)
                print('Epoch: ', epoch,
                      '| train loss: %.4f' % loss.cpu().data.numpy(),
                      '| eval accuracy: %.2f' % accuracy)
                temp_str = classification_report(
                    y_true=eval_y,
                    y_pred=pred_y,
                    target_names=['normal', 'abnormal'])
                a = classification_report(
                    y_true=eval_y,
                    y_pred=pred_y,
                    target_names=['normal', 'abnormal'],
                    output_dict=True)
                temp_f1 = a['abnormal']['f1-score']
                # temp_f1 = f1_score(y_pred=pred_y, y_true=eval_y, average='macro')
                # print('temp_f1', temp_f1)
                # temp_sum = temp_f1 + temp_route_f1
                # if (best_f1_score < temp_f1):
                if epoch == save_epoch:
                    print(temp_str + '\n' + str(temp_f1))
                    with open(
                            '../result_doc/retrain' + Event_name + '.txt',
                            'a') as f:
                        message = 'epoch:' + str(epoch) + ' f1_score:' + str(
                            temp_f1) + '\n'
                        f.write(message)
                    best_f1_score = temp_f1
                    torch.save(
                        model.state_dict(),
                        '../params/retrain' + Event_name + Scheme + '.pkl')

    path = '../params/retrain' + Event_name + Scheme + '.pkl'
    model.load_state_dict(torch.load(path))
    test_output, attn_weights = model(test_x)
    if confidence:
        output = test_output.cpu().data.numpy()
        pred_y = output[:, 1]  # anomaly confidence
    else:
        pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()
    # test_report = classification_report(y_true=test_y, y_pred=pred_y,
    #                                     target_names=['Normal', 'Abnormal'])
    # print(test_report)
    self.true_pred['pred_' + Event_name] = pred_y
    self.true_pred['true'] = test_y
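# transfer_fine_tune above relies on a LabelSmoothingLoss class that is not
# shown in this section. The sketch below is an assumption about what such a
# loss might look like (a standard label-smoothing cross entropy in PyTorch),
# not the original class.
import torch
import torch.nn as nn


class LabelSmoothingLoss(nn.Module):
    """Cross entropy against a smoothed target distribution.

    The true class receives probability (1 - smoothing); the remaining mass
    is spread uniformly over the other classes.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.classes = classes
        self.dim = dim

    def forward(self, pred, target):
        # pred: raw logits of shape (batch, classes); target: class indices
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.classes - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))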
def train(self,
          Event_num,
          read_from_file=True,
          include_MANRS_data=True,
          WINDOW_SIZE=30,
          HIDDEN_SIZE=128,
          save_epoch=10):
    # the implementation of model training
    '''
    :param Event_num: the index of the anomaly type
    :param read_from_file: the dataset file already exists and can be read directly
    :param include_MANRS_data: include the legitimate (MANRS) data
    :param WINDOW_SIZE: the sliding window size
    :param HIDDEN_SIZE: the hidden size of the LSTM model
    :return: saves the model under the path '../params/best_lstm_params_' + Event_name + '2.pkl'
    '''
    self.WINDOW_SIZE = WINDOW_SIZE
    self.Hidden_SIZE = HIDDEN_SIZE
    global SA_LSTM_flag
    Event_name = self.Event_list[Event_num - 1]
    Path = '../params/best_lstm_params_' + Event_name + '.pkl'
    target_list = ['normal', Event_name]
    loader = Data_Loader(Event_name=Event_name,
                         Event_num=Event_num,
                         TIME_STEP=1,
                         WINDOW_SIZE=self.WINDOW_SIZE)
    train_x, train_y, test_x, test_y, eval_x, eval_y = loader.loadDataSet(
        read_from_file=read_from_file,
        include_MANRS_data=include_MANRS_data)
    self.INPUT_SIZE = loader.INPUT_SIZE
    datasets = Data.TensorDataset(train_x, train_y)
    train_loader = Data.DataLoader(dataset=datasets,
                                   batch_size=self.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=2)
    eval_x = eval_x.cuda()
    eval_y = eval_y.numpy()
    test_x = test_x.cuda()
    test_y = test_y.numpy()
    if SA_LSTM_flag:
        lstm = SA_LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                       INPUT_SIZE=self.INPUT_SIZE,
                       Hidden_SIZE=self.Hidden_SIZE,
                       LSTM_layer_NUM=self.LSTM_layer_NUM)
    else:
        lstm = LSTM(WINDOW_SIZE=self.WINDOW_SIZE,
                    INPUT_SIZE=self.INPUT_SIZE,
                    Hidden_SIZE=self.Hidden_SIZE,
                    LSTM_layer_NUM=self.LSTM_layer_NUM)
    lstm = lstm.cuda()
    optimizer = torch.optim.Adam(lstm.parameters(), lr=self.LR)
    loss_func = nn.CrossEntropyLoss()
    h_state = None
    from sklearn.metrics import f1_score, classification_report
    best_f1_score = 0.0
    best_epoch = 0
    # train_length = len(train_loader)
    for epoch in range(self.EPOCH):
        for step, (x, y) in tqdm(enumerate(train_loader)):
            x = x.cuda()
            y = y.cuda()
            if SA_LSTM_flag:
                output, attn_weights = lstm(x)
            else:
                output = lstm(x)
            loss = loss_func(output, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 10000 == 0:
                if SA_LSTM_flag:
                    eval_output, attn_weights = lstm(test_x)
                else:
                    eval_output = lstm(test_x)
                pred_y = torch.max(eval_output, 1)[1].cpu().data.numpy()
                # print(pred_y)
                # print(eval_y)
                accuracy = float(np.sum(pred_y == test_y)) / float(
                    test_y.size)
                print('Epoch: ', epoch,
                      '| train loss: %.4f' % loss.cpu().data.numpy(),
                      '| test accuracy: %.2f' % accuracy)
                temp_str = classification_report(y_true=test_y,
                                                 y_pred=pred_y,
                                                 target_names=target_list)
                temp_f1 = f1_score(y_pred=pred_y,
                                   y_true=test_y,
                                   average='macro')
                print('temp_f1', temp_f1)
                # temp_sum = temp_f1 + temp_route_f1
                # if (best_f1_score < temp_f1):
                if epoch == save_epoch:
                    print(temp_str + '\n' + str(temp_f1))
                    with open(
                            '../result_doc/test_best_f1' + Event_name +
                            '.txt', 'a') as f:
                        message = 'epoch:' + str(epoch) + ' f1_score:' + str(
                            temp_f1) + '\n'
                        f.write(message)
                    best_f1_score = temp_f1
                    best_epoch = epoch
                    torch.save(lstm.state_dict(), Path)

    lstm.load_state_dict(torch.load(Path))
    if SA_LSTM_flag:
        test_output, attn_weights = lstm(test_x)
    else:
        test_output = lstm(test_x)
    pred_y = torch.max(test_output, 1)[1].cpu().data.numpy()
    test_report = classification_report(y_true=test_y,
                                        y_pred=pred_y,
                                        target_names=target_list)
    test_parameter_path = '../result_doc/test_parameter' + '_' + Event_name + '.txt'
    with open(test_parameter_path, 'a') as f:
        message = ("TimeStep:" + str(self.TIME_STEP) +
                   '\tWINDOW_SIZE:' + str(self.WINDOW_SIZE) +
                   "\tLSTM_NUM: " + str(self.LSTM_NUM) +
                   '\tLayer num: ' + str(self.LSTM_layer_NUM) +
                   '\tLR:' + str(self.LR) +
                   '\tBatch_size: ' + str(self.BATCH_SIZE) +
                   '\tHidden_size: ' + str(self.Hidden_SIZE) +
                   '\tNormalizer:MinMaxScaler' +
                   '\t epoch:' + str(best_epoch) +
                   '\tf1_score:' + str(best_f1_score) + '\n' +
                   'include_MANRS_data:' + str(include_MANRS_data) +
                   '\t time_bins:60s' + '\n' + test_report + '\n\n')
        print(message)
        f.write(message)
    self.models.append(lstm)
    # attn_weights_df = pd.DataFrame(best_attn_weights)
    # print(attn_weights_df)
    # attn_weights_df.to_csv('../result_doc/atten_weights' + '_' + Event_name + '.csv')
    torch.save(lstm, '../params/lstm' + Event_name + '.pkl')
n_train = int(N_sample * (1 - test_size) * (1 - val_size))
epochs = int(n_iter / (n_train / batch_size))

fi = open("Trainning_INFO_80k.txt", "w+")
fi.write('INFO: Epochs:{} -- Batch size:{} \n'.format(epochs, batch_size))

start = time.time()
X, y = Load_Files('truth_DR12Q.fits', 'data_dr12.fits', N_sample, None,
                  classification=True)
train_loader, test_loader, val_loader, train_s, test_s, val_s = Data_Loader(
    X, y, N_sample, batch_size, test_size, val_size, classification=True)

# CNN for classification
learning_rate = 0.1


class Net_C(nn.Module):
    def __init__(self):
        super(Net_C, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, 15, stride=2)
        self.conv2 = nn.Conv1d(64, 128, 15, stride=2)
        self.conv3 = nn.Conv1d(128, 256, 15, stride=2)
        self.conv4 = nn.Conv1d(256, 256, 15, stride=2)
        self.pool = nn.MaxPool1d(2, 1)
        self.fc1 = nn.Linear(3328, 16)