def test_predict():
    """Predictions for a known-benign and a known-DGA domain match labels.

    Bug fix: the original asserted ``actual_output.equals(actual_output)``,
    which is always true and never exercised the expected labels.
    """
    dd = DGADetector()
    test_domains = cudf.Series(["nvidia.com", "dfsdfsdf"])
    dd.load_model(model_filepath)
    actual_output = dd.predict(test_domains)
    # 1 = benign, 0 = DGA — presumably; verify against the training labels.
    expected_output = cudf.Series([1, 0])
    assert actual_output.equals(expected_output)
def main():
    """Load labeled domains, split them, and train/evaluate a DGADetector."""
    # CLI parameters from the module-level ``args`` mapping.
    num_epochs = int(args["epoch"])
    training_path = args["training_data"]
    batch_sz = int(args["batch_size"])
    out_dir = args["output_dir"]

    # Read the labeled domain list onto the GPU.
    input_df = cudf.read_csv(
        training_path, names=["domain", "type"], dtype=["str", "int32"]
    )
    domain_train, domain_test, type_train, type_test = train_test_split(
        input_df, "type", train_size=0.7
    )

    # Wrap each split in a batched dataset.
    test_dataset = DetectorDataset(create_df(domain_test, type_test), batch_sz)
    train_dataset = DetectorDataset(create_df(domain_train, type_train), batch_sz)

    detector = DGADetector(lr=LR)
    detector.init_model(
        n_layers=N_LAYERS,
        char_vocab=CHAR_VOCAB,
        hidden_size=HIDDEN_SIZE,
        n_domain_type=N_DOMAIN_TYPE,
    )
    model_filepath = train_and_eval(
        detector, train_dataset, test_dataset, num_epochs, out_dir
    )
def test_load_model():
    """A loaded model is DataParallel-wrapped on multi-GPU hosts."""
    detector = DGADetector()
    detector.load_model(model_filepath)
    # On >1 GPU the detector wraps the classifier in nn.DataParallel.
    expected_cls = (
        nn.DataParallel if torch.cuda.device_count() > 1 else RNNClassifier
    )
    assert isinstance(detector.model, expected_cls)
def test_predict():
    """Predictions on two known domains match the expected labels (GPU only)."""
    if not torch.cuda.is_available():
        return
    detector = DGADetector()
    detector.load_model(MODEL_FILENAME)
    domains = cudf.Series(["nvidia.com", "dfsdfsdf"])
    predictions = detector.predict(domains)
    assert predictions.equals(cudf.Series([1, 0]))
def worker_init():
    """Attach a loaded DGADetector to the current dask worker's data dict."""
    # Import inside the function so it runs in the worker process.
    from clx.analytics.dga_detector import DGADetector

    worker = dask.distributed.get_worker()
    detector = DGADetector()
    print(
        "Initializing Dask worker: "
        + str(worker)
        + " with dga model. Model File: "
        + str(args.model)
    )
    detector.load_model(args.model)
    worker.data["dga_detector"] = detector
    print("Successfully initialized dask worker " + str(worker))
def worker_init(self):
    """Initialize the current dask worker with a loaded DGA detector."""
    # Import inside the method so it runs in the worker process.
    from clx.analytics.dga_detector import DGADetector

    worker = dask.distributed.get_worker()
    detector = DGADetector()
    print(
        "Initializing Dask worker: "
        + str(worker)
        + " with dga model. Model File: "
        + str(self.args.model)
    )
    detector.load_model(self.args.model)
    # Extend this dict to ship more shared objects to the worker.
    obj_dict = {"dga_detector": detector}
    worker = utils.init_dask_workers(worker, self.config, obj_dict)
def main():
    """Train a DGADetector from a labeled CSV and save the fitted model."""
    # CLI parameters from the module-level ``args`` mapping.
    num_epochs = int(args["epochs"])
    training_path = args["training_data"]
    batch_sz = int(args["batch_size"])
    out_dir = args["output_dir"]

    # Load input data into GPU memory; keep only the two needed columns.
    input_df = cudf.read_csv(training_path)
    train_data = input_df["domain"]
    labels = input_df["type"]
    del input_df

    detector = DGADetector(lr=LR)
    detector.init_model(
        n_layers=N_LAYERS,
        char_vocab=CHAR_VOCAB,
        hidden_size=HIDDEN_SIZE,
        n_domain_type=N_DOMAIN_TYPE,
    )
    detector.train_model(
        train_data, labels, batch_size=batch_sz, epochs=num_epochs, train_size=0.7
    )

    if not os.path.exists(out_dir):
        print("Creating directory '{}'".format(out_dir))
        os.makedirs(out_dir)

    # Timestamped filename so repeated runs don't overwrite each other.
    stamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
    model_filepath = os.path.join(out_dir, "rnn_classifier_{}.bin".format(stamp))
    print("Saving trained model to location '{}'".format(model_filepath))
    detector.save_model(model_filepath)
def main():
    """Train and evaluate a DGADetector on separate train/test CSV files.

    Reads paths and hyper-parameters from the module-level ``args`` dict,
    builds GPU-backed datasets, and delegates the loop to ``train``.

    Fix: use lazy %-style arguments with ``log.info`` instead of eagerly
    formatting the message with ``%`` — formatting is then deferred until
    (and unless) the record is actually emitted.
    """
    epoch = int(args["epoch"])
    train_file_path = args["train_file_path"]
    test_file_path = args["test_file_path"]
    batch_size = int(args["batch_size"])
    output_dir = args["output_dir"]
    log.info("train_file_path : %s", train_file_path)
    log.info("test_file_path : %s", test_file_path)
    col_names = ["domain", "type"]
    dtypes = ["str", "int32"]
    df = cudf.read_csv(train_file_path, names=col_names, dtype=dtypes)
    test_df = cudf.read_csv(test_file_path, names=col_names, dtype=dtypes)
    dd = DGADetector()
    dd.init_model()
    dataset = DetectorDataset(df, batch_size)
    test_dataset = DetectorDataset(test_df, batch_size)
    # Release the raw frames once wrapped, to reduce GPU memory pressure.
    del df
    del test_df
    train(dd, dataset, test_dataset, epoch, output_dir)
def test_load_model(tmpdir):
    """Round-trip save/load; the restored model's type depends on GPU count."""
    if not torch.cuda.is_available():
        return
    model_path = str(tmpdir.join("clx_dga.mdl"))
    # Persist the module-level detector's model to a temp location.
    dd.save_model(model_path)
    assert path.exists(model_path)
    # Restore it into a fresh detector instance.
    restored = DGADetector()
    restored.init_model()
    restored.load_model(model_path)
    if torch.cuda.device_count() > 1:
        # Multi-GPU: classifier lives under the DataParallel wrapper.
        assert isinstance(restored.model.module, RNNClassifier)
    else:
        assert isinstance(restored.model, RNNClassifier)
def test_train_model():
    """Training over the partitioned fixture frames returns a numeric loss."""
    detector = DGADetector()
    detector.init_model()
    loss = detector.train_model(test_partitioned_dfs, test_dataset_len)
    assert isinstance(loss, (int, float))
def test_evaluate_model():
    """Evaluation over the partitioned fixture frames returns a number."""
    detector = DGADetector()
    detector.init_model()
    score = detector.evaluate_model(test_partitioned_dfs, test_dataset_len)
    assert isinstance(score, (int, float))
def test_evaluate_model():
    """Evaluating a freshly-initialized model yields a numeric accuracy."""
    if not torch.cuda.is_available():
        return
    detector = DGADetector()
    detector.init_model()
    score = detector.evaluate_model(dataset)
    assert isinstance(score, (int, float))
def test_train_model():
    """Training a freshly-initialized model yields a numeric total loss."""
    if not torch.cuda.is_available():
        return
    detector = DGADetector()
    detector.init_model()
    loss = detector.train_model(dataset)
    assert isinstance(loss, (int, float))
def test_load_model():
    """Loading a saved model produces an RNNClassifier (GPU only)."""
    if not torch.cuda.is_available():
        return
    detector = DGADetector()
    detector.load_model(MODEL_FILENAME)
    assert isinstance(detector.model, RNNClassifier)
# Shared fixture: a tiny labeled domain set and a detector trained on it.
test_dataset_len = 4
test_batchsize = 2
test_df = cudf.DataFrame(
    {
        "domain": [
            "studytour.com.tw",
            "cnn.com",
            "bakercityherald.com",
            "bankmobile.com",
        ],
        "type": [1, 1, 0, 1],
    }
)
dataset = DetectorDataset(test_df, test_batchsize)
dd = DGADetector()
dd.init_model()


def test_train_model():
    """One training pass over the fixture dataset yields a numeric loss."""
    if not torch.cuda.is_available():
        return
    loss = dd.train_model(dataset)
    assert isinstance(loss, (int, float))


def test_evaluate_model():
    """Evaluating on the fixture dataset yields a numeric accuracy."""
    if not torch.cuda.is_available():
        return
    score = dd.evaluate_model(dataset)
    assert isinstance(score, (int, float))