Example #1
def analyze(user_id, mix_foreign):
    # Retrieve status updates
    user_statuses = get_status_updates("twitter",
                                       user_id=user_id,
                                       tweet_limit=TWEET_LIMIT)
    ext_statuses = get_status_updates(app.config['data_source'],
                                      dataset_path=app.config['dataset_path'])
    ext_training_statuses, ext_testing_statuses = split_by_author(
        ext_statuses, [user_id])

    # Add some tweets from another user for testing purposes
    if mix_foreign:
        # random_insert_seq returns a pair (see Example #3); only the mixed
        # sequence is needed here, so the second element is discarded
        mixed_statuses = random_insert_seq(user_statuses[START_BATCH_SIZE:],
                                           ext_testing_statuses)[0]
        user_statuses = user_statuses[:START_BATCH_SIZE] + mixed_statuses

    # Analyze tweets
    if len(ext_training_statuses) > len(user_statuses):
        ext_training_statuses = sample(ext_training_statuses,
                                       len(user_statuses))
    analyzer = StatusUpdateAnalyzer(user_statuses, ext_training_statuses,
                                    app.config['classifier'], SCALE_FEATURES)
    analyzer.analyze()

    return analyzer
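
Since analyze() returns the fitted StatusUpdateAnalyzer, a caller can inspect the statuses it flagged. A minimal usage sketch, assuming the project's app.config, constants and helper functions shown above are already in place; the handle "some_user" is a made-up placeholder:

# Hypothetical invocation of analyze() from Example #1
analyzer = analyze(user_id="some_user", mix_foreign=True)

# suspicious_statuses holds the status updates flagged as anomalous
# (the same attribute Example #3 evaluates against)
for status in analyzer.suspicious_statuses:
    print(status)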
Example #2
def tune_cli(argv):
    # Create argument parser
    parser = argparse.ArgumentParser(
        description="Determines the best-suited hyper-parameter combinations "
                    "for a given classifier and data set.")
    parser.add_argument(
        "--data-source", "-s",
        help="The data source that should be used for classifier analysis. "
             "Possible values are 'fth', 'mp' and 'twitter'.")
    parser.add_argument(
        "--dataset-path", "-p",
        help="The path of the dataset that should be used for classifier "
             "analysis.")
    parser.add_argument(
        "--classifier", "-c",
        help="The classifier to be analyzed. Possible values are "
             "'decision_tree' and 'perceptron'.")
    args = parser.parse_args(argv)

    # Get status updates
    print("Retrieving status updates...")
    status_updates = get_status_updates(args.data_source,
                                        dataset_path=args.dataset_path)

    status_updates = sorted(status_updates, key=lambda x: x.author)
    # itertools.groupby only groups consecutive items, so the status updates
    # must be sorted by author first (done above)
    grouped_status_updates = [
        list(group)
        for _, group in itertools.groupby(status_updates, lambda x: x.author)
    ]
    n = 500  # number of status updates to use per author

    ClassifierOptimizer(args.classifier, grouped_status_updates[0][:n],
                        grouped_status_updates[1][:n]).execute()
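
Because tune_cli() receives the argument vector explicitly rather than reading sys.argv itself, it can be wired up as a standard entry point. A sketch, assuming the flags shown above; the script name and flag values are illustrative only:

import sys

if __name__ == "__main__":
    # e.g. python tune.py --data-source twitter \
    #          --dataset-path data/tweets.csv --classifier decision_tree
    tune_cli(sys.argv[1:])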
Example #3
def evaluate_cli(argv):
    # Create argument parser
    parser = argparse.ArgumentParser(
        description="Evaluates the anomaly detection approach.")
    parser.add_argument(
        "--data-source", "-s",
        help="The data source that should be used for cross-validation. "
             "Possible values are 'fth', 'mp' and 'twitter'.")
    parser.add_argument(
        "--dataset-path", "-p",
        help="The path of the dataset that should be used for cross-validation.")
    parser.add_argument(
        "--classifier", "-c",
        help="The classifier to be trained. Possible values are "
             "'decision_tree', 'one_class_svm', 'isolation_forest' and "
             "'perceptron'.")
    parser.add_argument(
        "--evaluation-rounds",
        type=int,
        default=10,
        help="Number of times the evaluation is run; reduces the variation "
             "caused by sampling.")
    parser.add_argument("--no-scaling",
                        dest='scale_features',
                        action='store_false',
                        help="Disables feature scaling.")
    parser.add_argument(
        "--output-path",
        "-o",
        default="evaluation.xlsx",
        help="The path of the file the results should be written to.")
    args = parser.parse_args(argv)

    # Get status updates
    print("Retrieving status updates...")
    status_updates = get_status_updates(args.data_source,
                                        dataset_path=args.dataset_path)

    status_updates = sorted(status_updates, key=lambda x: x.author)
    # as in Example #2, sorting by author is required before groupby
    grouped_status_updates = [
        list(group)
        for _, group in itertools.groupby(status_updates, lambda x: x.author)
    ]
    n_user = 500  # number of status updates to keep per user
    evaluation_data = []
    for r in range(args.evaluation_rounds):
        round_data = {}
        for i in range(len(grouped_status_updates)):
            user = grouped_status_updates[i][0].author
            user_status_updates = grouped_status_updates[i][:n_user]
            ext_status_updates = list(
                itertools.chain(*[
                    x for j, x in enumerate(grouped_status_updates) if j != i
                ]))
            print("Round %s/%s: Analyzing @%s (%s/%s)" %
                  (r + 1, args.evaluation_rounds, user, i + 1,
                   len(grouped_status_updates)))

            # Adapt number of likes and shares of half of external status updates
            for j in np.random.choice(len(ext_status_updates),
                                      int(math.ceil(len(ext_status_updates) / 2)),
                                      replace=False):
                random_status_update = random.choice(grouped_status_updates[i])
                ext_status_updates[j]._number_of_likes = \
                    random_status_update.number_of_likes
                ext_status_updates[j]._number_of_shares = \
                    random_status_update.number_of_shares

            # Construct test & training sets
            ext_training_status_updates, ext_testing_status_updates = split_by_author(
                ext_status_updates, [user])
            if len(ext_training_status_updates) > len(user_status_updates):
                ext_training_status_updates = sample(
                    ext_training_status_updates, len(user_status_updates))

            # Add some status updates from other users
            safe_user_status_updates = user_status_updates[:START_BATCH_SIZE]
            mixed_user_status_updates, ext_testing_status_updates = random_insert_seq(
                user_status_updates[START_BATCH_SIZE:],
                ext_testing_status_updates)

            # Run classifier
            analyzer = StatusUpdateAnalyzer(
                safe_user_status_updates + mixed_user_status_updates,
                ext_training_status_updates, args.classifier,
                args.scale_features)
            analyzer.analyze()

            # Evaluation metrics
            metrics = calculate_metrics(user_status_updates[START_BATCH_SIZE:],
                                        ext_testing_status_updates,
                                        analyzer.suspicious_statuses)
            round_data[user] = metrics

            tp, tn, fp, fn, prec, rec, fm, acc = metrics
            print("TP: %i, TN: %i, FP: %i, FN: %i" % (tp, tn, fp, fn))
            print("Prec: %.2f, Rec: %.2f, F: %.2f, Acc: %.2f" %
                  (prec, rec, fm, acc))
            print()

        evaluation_data.append(round_data)

    write_evaluation_results(evaluation_data, args.output_path)
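
evaluate_cli() follows the same entry-point pattern as tune_cli(). A hedged invocation sketch; the script name, dataset path and flag values are illustrative, not prescribed by the project:

import sys

if __name__ == "__main__":
    # e.g. python evaluate.py --data-source fth --dataset-path data/fth.csv \
    #          --classifier one_class_svm --evaluation-rounds 5 --no-scaling \
    #          -o results.xlsx
    evaluate_cli(sys.argv[1:])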