Ejemplo n.º 1
0
def test_predict():
    """Check that a loaded detector predicts the expected labels.

    Loads the model stored at ``model_filepath``, runs ``predict`` on two
    sample domains and compares the result with the expected series.
    """
    dd = DGADetector()
    test_domains = cudf.Series(["nvidia.com", "dfsdfsdf"])
    dd.load_model(model_filepath)
    actual_output = dd.predict(test_domains)
    expected_output = cudf.Series([1, 0])
    # Compare with the expected labels: comparing the prediction with itself
    # is always true and would verify nothing.
    assert actual_output.equals(expected_output)
Ejemplo n.º 2
0
def main():
    """Split the training CSV, build datasets, then train and evaluate.

    Reads ``epoch``, ``training_data``, ``batch_size`` and ``output_dir``
    from ``args``.
    """
    n_epochs = int(args["epoch"])
    csv_path = args["training_data"]
    rows_per_batch = int(args["batch_size"])
    out_dir = args["output_dir"]

    # Column names and dtypes are supplied explicitly to the CSV reader.
    input_df = cudf.read_csv(
        csv_path,
        names=["domain", "type"],
        dtype=["str", "int32"],
    )
    # "type" is passed as the second argument of train_test_split
    # (presumably the label column -- confirm against its definition).
    split = train_test_split(input_df, "type", train_size=0.7)
    domain_train, domain_test, type_train, type_test = split

    test_df = create_df(domain_test, type_test)
    train_df = create_df(domain_train, type_train)

    train_dataset = DetectorDataset(train_df, rows_per_batch)
    test_dataset = DetectorDataset(test_df, rows_per_batch)

    detector = DGADetector(lr=LR)
    detector.init_model(
        n_layers=N_LAYERS,
        char_vocab=CHAR_VOCAB,
        hidden_size=HIDDEN_SIZE,
        n_domain_type=N_DOMAIN_TYPE,
    )
    # The value returned by train_and_eval is kept as the model file path.
    model_filepath = train_and_eval(
        detector, train_dataset, test_dataset, n_epochs, out_dir
    )
Ejemplo n.º 3
0
def test_load_model():
    """Verify the type of ``model`` after ``load_model``.

    With more than one CUDA device reported by torch the model must be an
    ``nn.DataParallel``; otherwise it must be an ``RNNClassifier``.
    """
    detector = DGADetector()
    detector.load_model(model_filepath)
    if torch.cuda.device_count() > 1:
        expected_type = nn.DataParallel
    else:
        expected_type = RNNClassifier
    assert isinstance(detector.model, expected_type)
Ejemplo n.º 4
0
def test_predict():
    """Check predictions of the model loaded from ``MODEL_FILENAME``.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    detector = DGADetector()
    sample_domains = cudf.Series(["nvidia.com", "dfsdfsdf"])
    detector.load_model(MODEL_FILENAME)
    predicted = detector.predict(sample_domains)
    wanted = cudf.Series([1, 0])
    assert predicted.equals(wanted)
Ejemplo n.º 5
0
def worker_init():
    """Load the DGA model and attach it to the current dask worker.

    The detector is stored in ``worker.data`` under ``"dga_detector"``.
    The model file is taken from ``args.model``.
    """
    # Runs on each dask worker; the detector class is imported locally.
    from clx.analytics.dga_detector import DGADetector

    current_worker = dask.distributed.get_worker()
    detector = DGADetector()

    start_message = "".join(
        [
            "Initializing Dask worker: ",
            str(current_worker),
            " with dga model. Model File: ",
            str(args.model),
        ]
    )
    print(start_message)

    detector.load_model(args.model)
    current_worker.data["dga_detector"] = detector

    done_message = "Successfully initialized dask worker " + str(current_worker)
    print(done_message)
Ejemplo n.º 6
0
    def worker_init(self):
        """Load the DGA model and register it on the current dask worker.

        The model file is taken from ``self.args.model``; the detector is
        handed to ``utils.init_dask_workers`` together with ``self.config``.
        """
        # Runs on each dask worker; the detector class is imported locally.
        from clx.analytics.dga_detector import DGADetector

        current_worker = dask.distributed.get_worker()
        detector = DGADetector()

        start_message = "".join(
            [
                "Initializing Dask worker: ",
                str(current_worker),
                " with dga model. Model File: ",
                str(self.args.model),
            ]
        )
        print(start_message)

        detector.load_model(self.args.model)

        # Further objects for the distributed dask worker can be added to
        # this dict.
        shared_objects = {"dga_detector": detector}
        current_worker = utils.init_dask_workers(
            current_worker, self.config, shared_objects
        )
Ejemplo n.º 7
0
def main():
    """Train a DGA detector from a CSV file and save it with a timestamp.

    Reads ``epochs``, ``training_data``, ``batch_size`` and ``output_dir``
    from ``args``. The model is written to
    ``<output_dir>/rnn_classifier_<timestamp>.bin``; the output directory is
    created first when it does not exist.
    """
    n_epochs = int(args["epochs"])
    csv_path = args["training_data"]
    rows_per_batch = int(args["batch_size"])
    target_dir = args["output_dir"]

    # load input data to gpu memory
    input_df = cudf.read_csv(csv_path)
    domains = input_df["domain"]
    domain_types = input_df["type"]
    # Drop the DataFrame reference once both columns have been extracted.
    del input_df

    detector = DGADetector(lr=LR)
    detector.init_model(
        n_layers=N_LAYERS,
        char_vocab=CHAR_VOCAB,
        hidden_size=HIDDEN_SIZE,
        n_domain_type=N_DOMAIN_TYPE,
    )
    detector.train_model(
        domains,
        domain_types,
        batch_size=rows_per_batch,
        epochs=n_epochs,
        train_size=0.7,
    )

    if not os.path.exists(target_dir):
        print("Creating directory '{}'".format(target_dir))
        os.makedirs(target_dir)

    # The file name carries the time at which the model is saved.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
    model_filepath = os.path.join(
        target_dir, "rnn_classifier_{}.bin".format(timestamp)
    )
    print("Saving trained model to location '{}'".format(model_filepath))
    detector.save_model(model_filepath)
Ejemplo n.º 8
0
def main():
    """Train a DGA detector from separate training and test CSV files.

    Reads ``epoch``, ``train_file_path``, ``test_file_path``, ``batch_size``
    and ``output_dir`` from ``args``, logs both file paths, builds one
    ``DetectorDataset`` per file and passes them to ``train``.
    """
    n_epochs = int(args["epoch"])
    train_file_path = args["train_file_path"]
    test_file_path = args["test_file_path"]
    rows_per_batch = int(args["batch_size"])
    out_dir = args["output_dir"]

    log.info("train_file_path : %s" % (train_file_path))
    log.info("test_file_path : %s" % (test_file_path))

    # Both files are read with the same explicit column names and dtypes.
    read_options = {
        "names": ["domain", "type"],
        "dtype": ["str", "int32"],
    }
    train_df = cudf.read_csv(train_file_path, **read_options)
    test_df = cudf.read_csv(test_file_path, **read_options)

    detector = DGADetector()
    detector.init_model()

    train_dataset = DetectorDataset(train_df, rows_per_batch)
    test_dataset = DetectorDataset(test_df, rows_per_batch)
    # The local DataFrame references are dropped before training starts.
    del train_df, test_df

    train(detector, train_dataset, test_dataset, n_epochs, out_dir)
Ejemplo n.º 9
0
def test_load_model(tmpdir):
    """Round-trip a model through ``save_model`` and ``load_model``.

    Saves the shared ``dd`` detector to ``clx_dga.mdl`` inside ``tmpdir``,
    loads it into a fresh detector and checks the loaded model type.
    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    model_path = str(tmpdir.join("clx_dga.mdl"))

    # save model
    dd.save_model(model_path)
    assert path.exists(model_path)

    # load model
    reloaded = DGADetector()
    reloaded.init_model()
    reloaded.load_model(model_path)

    # With several CUDA devices the classifier is checked on ``.module``.
    if torch.cuda.device_count() > 1:
        assert isinstance(reloaded.model.module, RNNClassifier)
    else:
        assert isinstance(reloaded.model, RNNClassifier)
Ejemplo n.º 10
0
def test_train_model():
    """Check that ``train_model`` returns a numeric total loss."""
    detector = DGADetector()
    detector.init_model()
    loss = detector.train_model(test_partitioned_dfs, test_dataset_len)
    assert isinstance(loss, (int, float))
Ejemplo n.º 11
0
def test_evaluate_model():
    """Check that ``evaluate_model`` returns a numeric accuracy."""
    detector = DGADetector()
    detector.init_model()
    score = detector.evaluate_model(test_partitioned_dfs, test_dataset_len)
    assert isinstance(score, (int, float))
Ejemplo n.º 12
0
def test_evaluate_model():
    """Check that ``evaluate_model`` returns a numeric accuracy.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    detector = DGADetector()
    detector.init_model()
    score = detector.evaluate_model(dataset)
    assert isinstance(score, (int, float))
Ejemplo n.º 13
0
def test_train_model():
    """Check that ``train_model`` returns a numeric total loss.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    detector = DGADetector()
    detector.init_model()
    loss = detector.train_model(dataset)
    assert isinstance(loss, (int, float))
Ejemplo n.º 14
0
def test_load_model():
    """Check that loading ``MODEL_FILENAME`` yields an ``RNNClassifier``.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    detector = DGADetector()
    detector.load_model(MODEL_FILENAME)
    assert isinstance(detector.model, RNNClassifier)
Ejemplo n.º 15
0
# Shared fixtures for the tests below: a four-row labelled sample, the
# dataset built from it, and one initialised detector.
test_dataset_len = 4
test_batchsize = 2

# (domain, type) pairs, in the row order of the sample frame.
_sample_rows = [
    ("studytour.com.tw", 1),
    ("cnn.com", 1),
    ("bakercityherald.com", 0),
    ("bankmobile.com", 1),
]
test_df = cudf.DataFrame(
    {
        "domain": [domain for domain, _ in _sample_rows],
        "type": [label for _, label in _sample_rows],
    }
)
dataset = DetectorDataset(test_df, test_batchsize)

# One detector instance is reused by the test functions of this module.
dd = DGADetector()
dd.init_model()


def test_train_model():
    """Train the shared detector on ``dataset`` and check the loss type.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    # train model
    loss = dd.train_model(dataset)
    assert isinstance(loss, (int, float))


def test_evaluate_model():
    """Evaluate the shared detector on ``dataset`` and check the score type.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    # evaluate model
    score = dd.evaluate_model(dataset)
    assert isinstance(score, (int, float))