def train_autoencoder(x_train: Dict, x_test: Dict, y_train: np.ndarray,
                      y_test: np.ndarray) -> Dict:
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    y_train = get_labels_from_csv(y_train, x_train.keys())
    y_test = get_labels_from_csv(y_test, x_test.keys())
    x_train = fe.fit_transform(x_train)
    x_test = fe.transform(x_test)

    model = AutoEncoder()
    n_experiments = 100
    params = {
        'epochs': np.random.choice(np.arange(1, 10),
                                   size=n_experiments).tolist(),
        'learning_rate': np.random.choice(10**np.linspace(-4, -0.1),
                                          size=n_experiments).tolist(),
        'batch_size': np.random.choice([2**i for i in range(3, 8)],
                                       size=n_experiments).tolist(),
        'input_dim': [48] * n_experiments,
        'layers': generate_layer_settings(n_experiments),
        'dropout': np.random.uniform(0, 0.5, size=n_experiments).tolist()
    }
    # train the autoencoder on normal blocks only (y_train == 0)
    evaluated_hyperparams = random_search(
        (x_train[y_train == 0], x_test, None, y_test), model, params)
    return evaluated_hyperparams
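
generate_layer_settings is not shown on this page. A minimal sketch of what it might return, assuming each experiment gets a randomly sized list of hidden-layer widths (the output shape is inferred from the call above, not confirmed by the source):

def generate_layer_settings(n_experiments: int) -> list:
    # Hypothetical sketch: one list of hidden-layer widths per experiment,
    # e.g. [32, 16, 32]; the real implementation may differ.
    settings = []
    for _ in range(n_experiments):
        n_layers = np.random.randint(1, 4)              # 1-3 hidden layers
        widths = np.random.choice([8, 16, 32, 48], size=n_layers)
        settings.append(widths.tolist())
    return settings
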
def evaluate_lof(x_train: Dict, x_test: Dict, y_test: np.ndarray) -> Dict:
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    y_test = get_labels_from_csv(y_test, x_test.keys())
    fe.fit_transform(x_train)  # fit only; the transformed training set is unused
    x_test = fe.transform(x_test)

    training_stats = load_experiment(
        '../../models/lof_baseline/experiments.json')
    score = evaluate_unsupervised(x_test, y_test,
                                  training_stats['experiments'])
    return score
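
load_experiment is an external helper. Assuming the experiments file is plain JSON (the '.json' suffix in the path suggests so), a minimal sketch:

import json

def load_experiment(file_path: str) -> Dict:
    # Assumed behavior: deserialize previously persisted experiment results.
    with open(file_path) as f:
        return json.load(f)
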
Example #3
def train_iso_forest(x_train: Dict, x_test: Dict, y_train: pd.DataFrame,
                     y_test: pd.DataFrame) -> Dict:
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    y_test = get_labels_from_csv(y_test, x_test.keys())
    x_train = fe.fit_transform(x_train)
    x_test = fe.transform(x_test)

    clf = IsolationForest(bootstrap=True,
                          n_jobs=os.cpu_count(),
                          random_state=SEED)

    experiments = load_experiment(
        '../../models/IF-hyperparameters-Drain3-HDFS1.json')
    evaluated_hyperparams = random_search_unsupervised(
        (x_train, x_test, None, y_test), clf, experiments)
    return evaluated_hyperparams
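
random_search_unsupervised is also defined elsewhere. A sketch of the likely pattern, assuming each entry in experiments['experiments'] holds one sampled setting under a 'hyperparameters' key (both key names are guesses) and that anomalies are labeled 1 in y_test:

from typing import Tuple

from sklearn.base import clone
from sklearn.metrics import f1_score

def random_search_unsupervised(data: Tuple, clf, experiments: Dict) -> Dict:
    x_train, x_test, _, y_test = data
    results = []
    for setting in experiments['experiments']:
        model = clone(clf).set_params(**setting['hyperparameters'])
        if x_train is not None:
            model.fit(x_train)
            y_pred = model.predict(x_test)       # e.g. IsolationForest
        else:
            y_pred = model.fit_predict(x_test)   # e.g. LocalOutlierFactor
        # scikit-learn marks outliers as -1; map to the 0/1 ground truth
        y_pred = (y_pred == -1).astype(np.int8)
        results.append({**setting, 'f1_score': f1_score(y_test, y_pred)})
    return {'experiments': results}
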
Example #4
def train_lof(x_train: Dict, x_test: Dict, y_train: pd.DataFrame,
              y_test: pd.DataFrame) -> Dict:
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    y_test = get_labels_from_csv(y_test, x_test.keys())
    fe.fit_transform(x_train)  # fit only; the transformed training set is unused
    x_test = fe.transform(x_test)

    clf = LocalOutlierFactor(n_jobs=os.cpu_count())

    experiments = load_experiment(
        '../../models/LOF-hyperparameters-Drain3-HDFS1.json')
    evaluated_hyperparams = random_search_unsupervised(
        (None, x_test, None, y_test), clf, experiments)
    return evaluated_hyperparams
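
get_labels_from_csv maps the label table onto the ordered block keys. A minimal sketch, assuming the HDFS1-style anomaly label file with 'BlockId' and 'Label' columns (the column names are an assumption):

def get_labels_from_csv(labels: pd.DataFrame, keys) -> np.ndarray:
    # Hypothetical: emit a 0/1 vector in the same order as the feature
    # dictionary's keys, with 1 marking an anomalous block.
    label_by_block = labels.set_index('BlockId')['Label']
    return np.array([1 if label_by_block[k] == 'Anomaly' else 0
                     for k in keys])
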
Example #5
def train_autoencoder(x_train: Dict, x_test: Dict, y_train: pd.DataFrame,
                      y_test: pd.DataFrame) -> Dict:
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    y_train = get_labels_from_csv(y_train, x_train.keys())
    y_test = get_labels_from_csv(y_test, x_test.keys())
    x_train = fe.fit_transform(x_train)
    x_test = fe.transform(x_test)

    model = AutoEncoder()

    experiments = load_experiment(
        '../../models/AE-hyperparameters-Drain3-HDFS1.json')
    # train on normal blocks only (y_train == 0)
    evaluated_hyperparams = random_search(
        (x_train[y_train == 0], x_test, None, y_test), model, experiments)
    return evaluated_hyperparams
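
The x_train[y_train == 0] slice works because fit_transform returns a numeric matrix and get_labels_from_csv returns a 0/1 vector in the same key order, so a boolean mask keeps only the normal blocks that the autoencoder trains on:

x = np.array([[0.1, 0.2], [0.9, 0.8], [0.2, 0.1]])
y = np.array([0, 1, 0])    # 1 marks the anomalous block
x_normal = x[y == 0]       # shape (2, 2); the anomalous row is dropped
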
def train_lof(x_train: Dict, x_test: Dict, y_train: np.ndarray,
              y_test: np.ndarray) -> Dict:
    """
    Novelty detection represents the detection of anomalous data based on a training set consisting of only
    the normal data.
    """
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    fe.fit_transform(x_train)  # fit only; the transformed training set is unused
    x_test = fe.transform(x_test)

    clf = LocalOutlierFactor(n_jobs=os.cpu_count())
    params = {
        'n_neighbors': np.linspace(50, 650, num=10, dtype=np.int32).tolist(),
        'metric': ['cosine', 'euclidean', 'manhattan', 'chebyshev',
                   'minkowski']
    }
    evaluated_hyperparams = grid_search((None, x_test, None, y_test), clf,
                                        params)
    return evaluated_hyperparams
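
grid_search is external as well. A sketch of an exhaustive counterpart to the random searches above, under the same assumptions about the tuple convention and the -1/1 output of scikit-learn's outlier detectors:

from itertools import product
from typing import Tuple

from sklearn.base import clone
from sklearn.metrics import f1_score

def grid_search(data: Tuple, clf, params: Dict) -> Dict:
    x_train, x_test, _, y_test = data
    results = []
    for values in product(*params.values()):
        setting = dict(zip(params.keys(), values))
        model = clone(clf).set_params(**setting)
        if x_train is not None:
            y_pred = model.fit(x_train).predict(x_test)
        else:
            y_pred = model.fit_predict(x_test)
        y_pred = (y_pred == -1).astype(np.int8)  # map -1 outliers to label 1
        results.append({**setting, 'f1_score': f1_score(y_test, y_pred)})
    return {'experiments': results}
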
def train_iso_forest(x_train: Dict, x_test: Dict, y_train: np.ndarray,
                     y_test: np.ndarray) -> Dict:
    fe = FeatureExtractor(method='tf-idf', preprocessing='mean')
    x_train = fe.fit_transform(x_train)
    x_test = fe.transform(x_test)

    clf = IsolationForest(bootstrap=True,
                          n_jobs=os.cpu_count(),
                          random_state=SEED)
    params = {
        'n_estimators': np.linspace(10, 750, num=8, dtype=np.int32).tolist(),
        'max_samples': np.linspace(0.01, 1, num=7, dtype=np.float32).tolist(),
        'max_features': np.linspace(1, x_train.shape[1], num=10,
                                    dtype=np.int32).tolist()
    }
    evaluated_hyperparams = grid_search((x_train, x_test, None, y_test), clf,
                                        params)
    return evaluated_hyperparams
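
Finally, these snippets assume shared module-level imports and a SEED constant that the page does not show. A header that would make them importable, with the project-local helpers left as assumptions:

import os
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

SEED = 42  # the source does not show the actual value; any fixed seed works

# Project-local helpers referenced above but not defined on this page:
# FeatureExtractor, AutoEncoder, get_labels_from_csv, load_experiment,
# random_search, random_search_unsupervised, grid_search,
# evaluate_unsupervised, generate_layer_settings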