def get_magrove_generator(text_processing_func, data_dir: str, label_dict: dict):
    """Return a generator that yields (slice_text, label) pairs from the mangrove data files."""
    # Superseded local extraction paths:
    # owasp_train_file = r'D:\Store\document\all_my_work\CZY\bishe\mangrove_old\lstm\data\extraction\owasp-slice-train-2.txt'
    # owasp_test_file = r'D:\Store\document\all_my_work\CZY\bishe\mangrove_old\lstm\data\extraction\owasp-slice-test-2.txt'
    owasp_train_file = settings.relative_path_from_root('data/mangrove/t-train.txt')
    owasp_test_file = settings.relative_path_from_root('data/mangrove/t-test.txt')
    train_x, train_y = parseDataFile(owasp_train_file)
    test_x, test_y = parseDataFile(owasp_test_file)

    def gen():
        # Yield all training pairs first, then all test pairs.
        for idx in range(len(train_x)):
            yield train_x[idx], train_y[idx]
        for idx in range(len(test_x)):
            yield test_x[idx], test_y[idx]

    return gen
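# Usage sketch (hypothetical; assumes parseDataFile and the t-train/t-test files exist):
#
#     gen = get_magrove_generator(text_processing_func=None, data_dir='', label_dict={})
#     for slice_text, label in gen():
#         print(label, slice_text[:60])
#
# A single pass over the generator covers the training file first and then the test file.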
def server(model_npz=None, host='127.0.0.1', port=8888, debug=False):
    global MODEL
    if model_npz:
        MODEL = lstm.load_model(settings.relative_path_from_root(model_npz))
    else:
        logging.warning("No model specified, cannot predict")
    app.run(host=host, port=port, debug=debug)
def label():
    """
    Receive a single labeled record and persist it to disk.
    :return: JSON acknowledgement
    """
    label_json = request.get_json()
    data_dir = settings.relative_path_from_root('data/label/' + label_json['project'])
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    with open(data_dir + "/label-" + label_json["flowHash"] + ".json", 'w') as f:
        json.dump(label_json, f)
    return jsonify({"msg": "true"}), 200
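# Request sketch (hypothetical payload; the route decorator is defined elsewhere):
#
#     POST <label route> with a JSON body such as
#     {"project": "benchmark1.2", "flowHash": "ab12...", ...}
#
# Only "project" and "flowHash" are read by this handler; the whole JSON document is
# written verbatim to data/label/<project>/label-<flowHash>.json.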
def train(self, slice_dir: str, label_dir: str, dim: int = 128, epochs: int = 20, timeout: float = 5):
    tokenizer = Tokenizer()
    current_time = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
    model_file = settings.relative_path_from_root(
        'model/theano-lstm-{}.npz'.format(current_time))
    train_lstm(data_dir=slice_dir,
               label_dir=label_dir,
               tokenizer=tokenizer,
               dim_proj=dim,
               max_epochs=epochs,
               batch_size=8,
               saveto=model_file,
               time_out=timeout * 60.0)
def predict():
    """
    Predict the label of a single slice, and also save the slice to disk.
    :return: JSON prediction result
    """
    if not MODEL:
        logging.warning("No model specified, cannot predict")
        return jsonify({"msg": "No model specified, cannot predict"}), 500
    slice_json = request.get_json()
    data_dir = settings.relative_path_from_root('data/slice/' + slice_json['project'])
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    with open(data_dir + '/slice-' + slice_json["flowHash"] + ".json", 'w') as f:
        json.dump(slice_json, f)
    isTP = lstm.predict(MODEL, slice_json["slice"])
    logging.info("Predict {0} as {1}".format(slice_json["flowHash"], isTP))
    return jsonify({"msg": str(isTP)}), 200
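# Request sketch (hypothetical payload; the route decorator is defined elsewhere):
#
#     POST <predict route> with a JSON body such as
#     {"project": "benchmark1.2", "flowHash": "ab12...", "slice": "token1 token2 ..."}
#
# The slice is persisted to data/slice/<project>/slice-<flowHash>.json before
# lstm.predict(MODEL, ...) runs, and the prediction is returned as {"msg": "<bool>"}.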
:copyright: (c) 2019 by Anemone Xu.
:license: Apache 2.0, see LICENSE for more details.
"""
import _theano.dataloader as dataloader
from _theano.tokenizer import *

import settings


def transform(data_dir, label_dir):
    tokenizer = Tokenizer()
    train, valid, test = dataloader.load_data(data_dir, label_dir, tokenizer=tokenizer, valid_portion=0)
    with open(settings.relative_path_from_root('data/mangrove/t-train.txt'), 'w') as f:
        for idx in range(len(train[0])):
            label = 'truepositive' if train[1][idx] == 0 else 'falsepositive'
            _slice = tokenizer.decode(train[0][idx])
            f.write('{} :: {}\n'.format(_slice, label))
    with open(settings.relative_path_from_root('data/mangrove/t-test.txt'), 'w') as f:
        for idx in range(len(test[0])):
            label = 'truepositive' if test[1][idx] == 0 else 'falsepositive'
            _slice = tokenizer.decode(test[0][idx])
            f.write('{} :: {}\n'.format(_slice, label))
    with open(settings.relative_path_from_root('data/mangrove/dict.txt'), 'w') as f:
        for token, _int in tokenizer.get_token_dict().items():
            f.write('{} {}\n'.format(token, _int))


if __name__ == '__main__':
    transform(settings.relative_path_from_root('data/slice/benchmark1.2'),
              settings.relative_path_from_root('data/label/benchmark1.2'))
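# Output format produced by transform() (illustrative examples, not real data):
#
#     data/mangrove/t-train.txt  ->  "<decoded slice tokens> :: truepositive"
#     data/mangrove/t-test.txt   ->  "<decoded slice tokens> :: falsepositive"
#     data/mangrove/dict.txt     ->  "<token> <integer id>" per line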
    n_train = int(numpy.round(n_samples * (1. - test_portion)))
    test_set_x = [all_set_x[s] for s in sidx[n_train:]]
    test_set_y = [all_set_y[s] for s in sidx[n_train:]]
    train_set_x = [all_set_x[s] for s in sidx[:n_train]]
    train_set_y = [all_set_y[s] for s in sidx[:n_train]]

    # Split the train set into a validation set.
    # TODO: the data should be re-shuffled for every training run.
    n_samples = len(train_set_x)
    sidx = numpy.random.permutation(n_samples)
    n_train = int(numpy.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    real_train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    real_train_set_y = [train_set_y[s] for s in sidx[:n_train]]

    # TODO: sort_by_len does not seem useful; does it affect the experimental results?
    train = (real_train_set_x, real_train_set_y)
    valid = (valid_set_x, valid_set_y)
    test = (test_set_x, test_set_y)

    return train, valid, test


if __name__ == '__main__':
    load_data(settings.relative_path_from_root('data/slice/benchmark'),
              settings.relative_path_from_root('data/label/benchmark'))
def train_lstm(
        data_dir: str,
        label_dir: str,
        tokenizer: Tokenizer,
        dim_proj=16,  # word embedding dimension and LSTM number of hidden units.
        patience=10,  # Number of epochs to wait before early stop if no progress.
        max_epochs=5000,  # The maximum number of epochs to run.
        disp_freq=10,  # Display the training progress to stdout every N updates.
        decay_c=0.,  # Weight decay for the classifier applied to the U weights.
        lrate=0.001,  # Learning rate for sgd (not used for adadelta and rmsprop).
        optimizer=adam,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (probably needs momentum and a decaying learning rate).
        saveto='lstm_model.npz',  # The best model will be saved there.
        validFreq=70,  # Compute the validation error after this number of updates.
        maxlen=None,  # Sequences longer than this get ignored.
        batch_size=16,  # The batch size during training.
        valid_batch_size=16,  # The batch size used for the validation/test set.
        # Parameters for extra options
        noise_std=0.,
        use_dropout=True,  # if False slightly faster, but worse test error; this frequently needs a bigger model.
        reload_model=False,  # Path to a saved model we want to start from.
        test_size=-1,  # If >0, we keep only this number of test examples.
        time_out=1000,  # Timeout in minutes.
):
    # Model options
    model_options = locals().copy()
    logging.info("model options: {}".format(model_options))

    logging.info('Loading data')
    train, valid, test = dataloader.load_data(data_dir,
                                              label_dir,
                                              tokenizer=tokenizer,
                                              valid_portion=0,
                                              test_portion=0.2)
    model_options['n_words'] = len(tokenizer.get_token_dict()) + 10

    if test_size > 0:
        # The test set is sorted by size, but we want to keep randomly sized
        # examples, so we must select a random subset of the examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    if not train:
        raise Exception("Dataset must not be empty")

    ydim = numpy.max(train[1]) + 1
    model_options['ydim'] = ydim

    logging.info('Building model')
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options, False)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=list(tparams.values()))
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    logging.info("%d training and %d test datapoints" % (len(train[0]), len(test[0])))

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    last_best_time = 0.
    total_batches = None
    try:
        eidx = 0
        # One training run: one epoch per loop iteration.
        while eidx < max_epochs and time_out > (time.time() - start_time) / 60.0:
            eidx = eidx + 1
            n_samples = 0
            logging.info('Epoch {}'.format(eidx))

            # Get new shuffled index for the training set.
            # Unlike py2 (mangrove), a py3 iterator does not restart from 0 once exhausted,
            # so we duplicate one copy (via tee) for the evaluation step further below.
            kf, train_batch = tee(
                get_minibatches_idx(len(train[0]), batch_size, shuffle=True), 2)
            kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
            test_batch = get_minibatches_idx(len(test[0]), valid_batch_size)

            id_of_batches = 0
            for _, train_index in kf:
                id_of_batches += 1
                if id_of_batches % disp_freq == 0:
                    logging.info("Calculating {}/{} batch".format(
                        id_of_batches, total_batches))
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    logging.error('bad cost detected: {}'.format(cost))
                    return 1., 1., 1.

            if total_batches is None:
                total_batches = id_of_batches

            use_noise.set_value(0.)
            train_err, _ = pred_error(f_pred, prepare_data, train, train_batch)
            test_err, details = pred_error(f_pred, prepare_data, test, test_batch)
            valid_err = test_err if len(test[0]) > 0 else train_err
            history_errs.append([valid_err, test_err])

            if best_p is None or (valid_err <= numpy.array(history_errs)[:, 0].min()):
                best_p = unzip(tparams)
                bad_counter = 0
                last_best_time = (time.time() - start_time) / 60.0

            now_time = time.time()
            total_time = (now_time - start_time) / 60.0
            # print("{train_file}\t{dim}\t{batch_size}\t{depth}\t{train_acc}\t{test_acc}\t{tp}\t{tn}\t{fp}\t{fn}")
            logging.info(
                "Train Acc: {train_acc}%, Test Acc: {test_acc}%, Recall & Precision: {details}"
                .format(train_acc=(1 - train_err) * 100,
                        test_acc=(1 - test_err) * 100,
                        details=details))
            # print('%s\t%d\t%d\t%d\t%.2f\t%.2f\t%s\t%d\t%.2f\t%.2f\t%.2f' % (
            #     data_dir, dim_proj, batch_size, 0, , ,
            #     details, eidx, total_time / eidx, last_best_time, total_time))

            if (len(history_errs) > patience and
                    valid_err >= numpy.array(history_errs)[:-patience, 0].min()):
                bad_counter += 1
                if bad_counter > patience:
                    logging.warning('Early Stop!')
                    estop = True
                    break

            if estop:
                break

    except KeyboardInterrupt:
        logging.error("Training interrupted")

    end_time = time.time()
    total_time = (end_time - start_time) / 60.0

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err, _ = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    test_size = len(test[0])
    valid_err, test_err = 0, 0
    if test_size > 0:
        kf_test = get_minibatches_idx(test_size, test_size)
        test_err, details = pred_error(f_pred, prepare_data, test, kf_test)
        valid_err = test_err

    if saveto:
        with open(settings.relative_path_from_root(saveto + '.tokenizer'), 'wb') as f:
            pickle.dump(tokenizer, f)
        with open(saveto + '.args', 'wb') as f:
            pickle.dump(model_options, f)
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)

    print('%s\t%d\t%d\t%d\t%.2f\t%.2f\t%s\t%d\t%.2f\t%.2f\t%.2f' %
          (data_dir, dim_proj, batch_size, 0, (1 - train_err) * 100,
           (1 - test_err) * 100, details, eidx, total_time / eidx,
           last_best_time, total_time))
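# train_lstm() leaves three artifacts next to `saveto` (default lstm_model.npz):
# the npz with the best parameters and error history, a pickled tokenizer at
# `saveto + '.tokenizer'` (resolved relative to the project root), and the pickled
# model options at `saveto + '.args'`. A saved npz can later be reloaded for
# serving, e.g. via lstm.load_model() as done in server().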
if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(filename)s : %(funcName)s : %(message)s',
        level=logging.INFO)
    numpy.set_printoptions(threshold=10000000, precision=2, suppress=True)
    data_dir = sys.argv[1]
    label_dir = sys.argv[2]
    dim = int(sys.argv[3])
    max_epochs = int(sys.argv[4])
    time_out_h = int(sys.argv[5])
    vocab = {}
    model_file = settings.relative_path_from_root('model/theano-lstm.npz')
    if len(sys.argv) > 6 and sys.argv[6] == 'test':
        modelFile = sys.argv[7]
        # TODO
        # test_lstm(dim_proj=dim, n_words=130, dataFile=train_file, reload_model=modelFile, vocab=vocab)
    else:
        tokenizer = Tokenizer()
        train_lstm(data_dir=data_dir,
                   label_dir=label_dir,
                   tokenizer=tokenizer,
                   dim_proj=dim,
                   max_epochs=max_epochs,
                   batch_size=8,
                   saveto=model_file,
                   time_out=time_out_h * 60.0)
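# Invocation sketch (the script name and example values are assumptions):
#
#     python lstm.py data/slice/benchmark1.2 data/label/benchmark1.2 128 20 5
#
# i.e. <data_dir> <label_dir> <dim_proj> <max_epochs> <timeout in hours>; appending
# "test <model.npz>" would select the test branch, which is still a TODO.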