Example #1
    def test_kdd_cup():
        def detectors():
            return [DAGMM(num_epochs=10, sequence_length=1)]

        evaluator = Evaluator(
            [KDDCup(21),
             KDDCup(22),
             KDDCup(23),
             KDDCup(24),
             KDDCup(25)], detectors)
        df_evaluation = pd.DataFrame(columns=[
            'dataset', 'algorithm', 'accuracy', 'precision', 'recall',
            'F1-score', 'F0.1-score'
        ])

        evaluator.evaluate()
        df = evaluator.benchmarks()
        df_evaluation = pd.concat([df_evaluation, df])  # DataFrame.append was removed in pandas 2.x

        print(df_evaluation.to_string())
        assert (df_evaluation
                == 0).sum().sum() == 0  # No zeroes in the DataFrame
        assert df_evaluation['F1-score'].std() > 0  # Not always the same value
        # Values reported in the paper -1% each
        assert df_evaluation['precision'].mean() >= 0.91
        assert df_evaluation['recall'].mean() >= 0.93
        assert df_evaluation['F1-score'].mean() >= 0.92
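The test above only checks mean precision, recall, and F1 across the five KDDCup runs. For a per-algorithm view of the same benchmark frame, a plain pandas summary is enough; this is a hedged sketch that assumes nothing beyond `benchmarks()` returning a DataFrame with the columns listed in the test.

import pandas as pd

def summarize_benchmarks(df_evaluation: pd.DataFrame) -> pd.DataFrame:
    # Toy helper (not project code): mean metric per algorithm across datasets/runs.
    metrics = ['accuracy', 'precision', 'recall', 'F1-score', 'F0.1-score']
    return (df_evaluation
            .groupby('algorithm', as_index=False)[metrics]
            .mean()
            .sort_values('F1-score', ascending=False))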
Example #2
from src.helper.data_structures import izip
from src.evaluation.print_methods import print_evaluator

if __name__ == "__main__":
    benchmark_name = parameters["benchmark"]
    benchmark_subset = SUBSETS[parameters["set"]]
    benchmark = Benchmark(benchmark_name, subset=benchmark_subset)

    sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
    if parameters["file"] == "corrupt.txt":
        predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
    else:
        predicted_sequences = benchmark.get_predicted_sequences(
            parameters["file"])

    evaluator = Evaluator()

    for s_i, (correct, corrupt), predicted in izip(sequence_pairs,
                                                   predicted_sequences):
        if s_i == parameters["sequences"]:
            break
        evaluator.evaluate(benchmark,
                           s_i,
                           correct,
                           corrupt,
                           predicted,
                           evaluate_ed=False)
        evaluator.print_sequence()

    print_evaluator(evaluator)
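`izip` is imported from `src.helper.data_structures` but its implementation is not shown in these examples; from its use here and in Example #7 it behaves like an index-yielding zip. A hypothetical stand-in, purely to make the loop above self-explanatory:

def izip(*iterables):
    # Hypothetical stand-in for src.helper.data_structures.izip: yields the
    # running index followed by one element per input iterable, i.e. an
    # unpacked enumerate(zip(...)).
    for i, items in enumerate(zip(*iterables)):
        yield (i, *items)

# matches the loop above:
# for s_i, (correct, corrupt), predicted in izip(sequence_pairs, predicted_sequences):
#     ...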
Example #3
def run_experiment_evaluation(detectors,
                              seeds,
                              runs,
                              output_dir,
                              anomaly_type,
                              steps=5,
                              outlier_type='extreme_1',
                              store_results=True):
    # get synthetic anomaly datasets from agots
    datasets = list(
        get_datasets_for_multiple_runs(anomaly_type, seeds, steps,
                                       outlier_type))
    results = pd.DataFrame()
    evaluator = None
    for index, seed in enumerate(seeds):
        evaluator = Evaluator(datasets[index],
                              detectors,
                              output_dir,
                              seed=seed)
        evaluator.evaluate()
        result = evaluator.benchmarks()
        evaluator.plot_roc_curves(store=store_results)
        evaluator.plot_threshold_comparison(store=store_results)
        evaluator.plot_scores(store=store_results)
        evaluator.set_benchmark_results(result)
        evaluator.export_results(f'experiment-run-{index}-{seed}')
        results = pd.concat([results, result], ignore_index=True)

    if not store_results:
        return

    # set average results from multiple pipeline runs for evaluation
    avg_results = results.groupby(['dataset', 'algorithm'],
                                  as_index=False).mean()
    evaluator.set_benchmark_results(avg_results)
    evaluator.export_results(f'experiment-{anomaly_type}')

    # Plots which need the whole data (not averaged)
    evaluator.create_boxplots(runs=runs,
                              data=results,
                              detectorwise=True,
                              store=store_results)
    evaluator.create_boxplots(runs=runs,
                              data=results,
                              detectorwise=False,
                              store=store_results)
    evaluator.gen_merged_tables(results,
                                f'for_{anomaly_type}',
                                store=store_results)

    # Plots using 'self.benchmark_results' -> using the averaged results
    evaluator.create_bar_charts(runs=runs,
                                detectorwise=True,
                                store=store_results)
    evaluator.create_bar_charts(runs=runs,
                                detectorwise=False,
                                store=store_results)
    evaluator.plot_auroc(
        title=f'Area under the curve for differing {anomaly_type} anomalies',
        store=store_results)

    # Plots using 'self.results' (need the score) -> only from the last run
    evaluator.plot_threshold_comparison(store=store_results)
    evaluator.plot_scores(store=store_results)
    evaluator.plot_roc_curves(store=store_results)

    return evaluator
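The `groupby(['dataset', 'algorithm']).mean()` step is what turns the per-seed results into the averaged benchmark table that gets exported and plotted. A toy, self-contained illustration of that aggregation (values and the second algorithm name are made up):

import pandas as pd

results = pd.DataFrame({
    'dataset':   ['syn', 'syn', 'syn', 'syn'],
    'algorithm': ['DAGMM', 'DAGMM', 'OtherDetector', 'OtherDetector'],
    'F1-score':  [0.90, 0.94, 0.80, 0.84],
})
avg_results = results.groupby(['dataset', 'algorithm'], as_index=False).mean()
# one row per (dataset, algorithm): F1-score 0.92 and 0.82 respectively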
Example #4
def run_different_window_sizes_evaluator(detectors, seeds, runs):
    results = pd.DataFrame()
    for seed in seeds:
        datasets = [
            SyntheticDataGenerator.long_term_dependencies_width(seed),
            SyntheticDataGenerator.long_term_dependencies_height(seed),
            SyntheticDataGenerator.long_term_dependencies_missing(seed)
        ]
        evaluator = Evaluator(datasets, detectors, seed=seed)
        evaluator.evaluate()
        evaluator.plot_scores()
        result = evaluator.benchmarks()
        results = pd.concat([results, result], ignore_index=True)
    evaluator.set_benchmark_results(results)
    evaluator.export_results('run_different_windows')
    evaluator.create_boxplots(runs=runs, data=results, detectorwise=False)
    evaluator.create_boxplots(runs=runs, data=results, detectorwise=True)
    return evaluator
Example #5
if __name__ == "__main__":
    if len(sys.argv) == 1 or "-h" in sys.argv or "-help" in sys.argv or "help" in sys.argv:
        print_help()
        exit(0)

    benchmark, subset, file_name = get_arguments()

    benchmark = Benchmark(benchmark, subset)
    correct_sequences = benchmark.get_sequences(BenchmarkFiles.CORRECT)
    corrupt_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
    if file_name == "corrupt.txt":
        predicted_sequences = corrupt_sequences
    else:
        predicted_sequences = benchmark.get_predicted_sequences(file_name)
    original_sequences = correct_sequences

    evaluator = Evaluator()
    for seq_id, (original, correct, corrupt, predicted) in \
            enumerate(zip(original_sequences, correct_sequences, corrupt_sequences, predicted_sequences)):

        if benchmark.name == "acl" and original.startswith("#"):
            print(original)
            continue

        correct_processed, corrupt_processed, predicted_processed = \
            tolerant_preprocess_sequences(original, correct, corrupt, predicted)

        evaluator.evaluate(None,
                           None,
                           original_sequence=correct_processed,
                           corrupt_sequence=corrupt_processed,
                           predicted_sequence=predicted_processed,
                           evaluate_ed=False)

Example #6
            benchmark = Benchmark(benchmark_name, benchmark_subset)
            sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)

            if file_name == "corrupt.txt":
                predicted_sequences = benchmark.get_sequences(BenchmarkFiles.CORRUPT)
                mean_runtime = 0
            else:
                try:
                    predicted_sequences = benchmark.get_predicted_sequences(file_name)[:len(sequence_pairs)]
                    mean_runtime = benchmark.get_mean_runtime(file_name)
                except FileNotFoundError:
                    predicted_sequences = []
                    mean_runtime = 0

            if len(predicted_sequences) == len(sequence_pairs):
                evaluator = Evaluator()

                for i, (correct, corrupt) in enumerate(sequence_pairs):
                    predicted = predicted_sequences[i]
                    evaluator.evaluate(file_name=None,
                                       line=None,
                                       original_sequence=correct,
                                       corrupt_sequence=corrupt,
                                       predicted_sequence=predicted,
                                       evaluate_ed=False)
                f1 = evaluator.f1()
                acc = evaluator.sequence_accuracy()
                print("f1  = %2.2f" % (f1 * 100))
                print("acc = %2.2f" % (acc * 100))
                print("t   = %.2f" % mean_runtime)
            else:
Example #7
    for benchmark in benchmarks:
        original_sequences = {
            Subset.TUNING: read_sequences(paths.WIKI_TUNING_SENTENCES),
            Subset.DEVELOPMENT: Wikipedia.development_sequences(),
            Subset.TEST: Wikipedia.test_sequences()
        }[subset]

        sequence_pairs = benchmark.get_sequence_pairs(BenchmarkFiles.CORRUPT)
        if predictions_file_name == "corrupt.txt":
            predicted_sequences = [corrupt for _, corrupt in sequence_pairs]
        else:
            predicted_sequences = benchmark.get_predicted_sequences(
                predictions_file_name)

        evaluator = Evaluator()

        for s_i, original, (correct, corrupt), predicted in izip(
                original_sequences, sequence_pairs, predicted_sequences):
            if s_i == n_sequences:
                break

            correct_processed, corrupt_processed, predicted_processed = \
                tolerant_preprocess_sequences(original, correct, corrupt, predicted)

            evaluator.evaluate(predictions_file_name,
                               s_i,
                               original_sequence=correct_processed,
                               corrupt_sequence=corrupt_processed,
                               predicted_sequence=predicted_processed,
                               evaluate_ed=False)
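Example #7 stops at the evaluation loop. In the same codebase, Example #6 reports aggregate metrics afterwards; the corresponding summary step would look like this (method names taken from Example #6, shown here only as a sketch):

# reporting step, mirroring Example #6
f1 = evaluator.f1()
acc = evaluator.sequence_accuracy()
print("f1  = %.2f" % (f1 * 100))
print("acc = %.2f" % (acc * 100))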
Example #8
def main(params):

    # initialize the multi-GPU / multi-node training
    init_distributed_mode(params)

    # initialize the experiment
    logger = initialize_exp(params)

    # initialize SLURM signal handler for time limit / pre-emption
    init_signal_handler()

    # load data
    data = load_data(params)

    # build model
    model = build_model(params, data['dico'])

    # build trainer, reload potential checkpoints / build evaluator
    trainer = Trainer(model, data, params)
    evaluator = Evaluator(trainer, data, params)

    # evaluation
    if params.eval_only:
        scores = evaluator.run_all_evals(trainer)
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        logger.info("__log__:%s" % json.dumps(scores))
        exit()

    # set sampling probabilities for training
    set_sampling_probs(data, params)

    # language model training
    for _ in range(params.max_epoch):

        logger.info("============ Starting epoch %i ... ============" %
                    trainer.epoch)

        trainer.n_sentences = 0

        while trainer.n_sentences < trainer.epoch_size:
            # MLM steps
            trainer.mlm_step(params.lambda_mlm)

            trainer.iter()

        logger.info("============ End of epoch %i ============" %
                    trainer.epoch)

        # evaluate perplexity
        scores = evaluator.run_all_evals(trainer)

        # print / JSON log
        for k, v in scores.items():
            logger.info("%s -> %.6f" % (k, v))
        if params.is_master:
            logger.info("__log__:%s" % json.dumps(scores))

        # end of epoch
        trainer.save_best_model(scores)
        trainer.save_periodic()
        trainer.end_epoch(scores)
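The loop in Example #8 writes one machine-readable line per evaluation via `logger.info("__log__:%s" % json.dumps(scores))`. A small, hedged sketch (not part of XLM itself) for pulling those score dictionaries back out of a log file:

import json

def parse_logged_scores(log_path):
    # Yield one score dict per "__log__:" line written by the training loop above.
    marker = "__log__:"
    with open(log_path) as log_file:
        for line in log_file:
            idx = line.find(marker)
            if idx != -1:
                yield json.loads(line[idx + len(marker):])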