def analyze(user_id, mix_foreign):
    """Run the anomaly analysis for a single Twitter user.

    Fetches the user's own tweets plus the configured external data set,
    optionally blends foreign tweets into the user's timeline (so the
    detector has known anomalies to find), and runs the configured
    classifier over the result.

    Args:
        user_id: Identifier of the Twitter user to analyze.
        mix_foreign: When truthy, insert external test tweets at random
            positions after the initial training batch.

    Returns:
        The ``StatusUpdateAnalyzer`` instance after ``analyze()`` ran.
    """
    # Retrieve the user's own timeline and the external corpus.
    user_statuses = get_status_updates("twitter",
                                       user_id=user_id,
                                       tweet_limit=TWEET_LIMIT)
    ext_statuses = get_status_updates(app.config['data_source'],
                                      dataset_path=app.config['dataset_path'])
    ext_training_statuses, ext_testing_statuses = split_by_author(
        ext_statuses, [user_id])

    # Optionally seed the tail of the timeline with foreign tweets; the
    # first START_BATCH_SIZE tweets stay untouched as a clean batch.
    if mix_foreign:
        timeline_tail = user_statuses[START_BATCH_SIZE:]
        blended_tail = random_insert_seq(timeline_tail,
                                         ext_testing_statuses)[0]
        user_statuses = user_statuses[:START_BATCH_SIZE] + blended_tail

    # Keep the external training set no larger than the user's own set so
    # the classes stay balanced.
    if len(ext_training_statuses) > len(user_statuses):
        ext_training_statuses = sample(ext_training_statuses,
                                       len(user_statuses))

    analyzer = StatusUpdateAnalyzer(user_statuses, ext_training_statuses,
                                    app.config['classifier'], SCALE_FEATURES)
    analyzer.analyze()
    return analyzer
def tune_cli(argv):
    """Command-line entry point for classifier hyper-parameter tuning.

    Parses *argv*, retrieves the configured data set, groups the status
    updates by author and runs a ``ClassifierOptimizer`` on the first two
    authors' updates (capped at 500 each).

    Args:
        argv: List of command-line arguments (excluding the program name).

    Raises:
        ValueError: If the data set contains fewer than two distinct
            authors, since the optimizer needs one "own" and one
            "foreign" user.
    """
    # Create argument parser
    parser = argparse.ArgumentParser(
        description=
        "This application determines the best suited hyper-parameter combinations for a certain classifier based on a given data set."
    )
    parser.add_argument(
        "--data-source",
        "-s",
        help=
        "The data source that should be used for classifier analysis. Possible values are 'fth', 'mp' and 'twitter'."
    )
    parser.add_argument(
        "--dataset-path",
        "-p",
        help=
        "The path of the dataset that should be used for classifier analysis.")
    parser.add_argument(
        "--classifier",
        "-c",
        help=
        "The classifier to be analyzed. Possible values are 'decision_tree' and 'perceptron'."
    )
    args = parser.parse_args(argv)

    # Get status updates
    print("Retrieving status updates...")
    status_updates = get_status_updates(args.data_source,
                                        dataset_path=args.dataset_path)

    # groupby() only merges consecutive equal keys, so sort by author first.
    status_updates = sorted(status_updates, key=lambda x: x.author)
    grouped_status_updates = [
        list(g)
        for k, g in itertools.groupby(status_updates, lambda x: x.author)
    ]

    # Fail with a clear message instead of an IndexError below when the
    # data set does not contain at least two authors.
    if len(grouped_status_updates) < 2:
        raise ValueError("The data set must contain status updates of at "
                         "least two distinct authors.")

    # Cap both groups to keep the optimizer's search tractable.
    n = 500
    ClassifierOptimizer(args.classifier, grouped_status_updates[0][:n],
                        grouped_status_updates[1][:n]).execute()
def evaluate_cli(argv):
    """Command-line entry point for evaluating the anomaly detection approach.

    Parses *argv*, loads the configured data set, and for each evaluation
    round runs the analyzer once per author (treating that author's updates
    as "own" and everyone else's as "external"), collecting per-user
    metrics and finally writing them to an Excel file.
    """
    # Create argument parser
    parser = argparse.ArgumentParser(
        description="This application evaluates the anomaly detection approach."
    )
    parser.add_argument(
        "--data-source",
        "-s",
        help=
        "The data source that should be used for cross-validation. Possible values are 'fth', 'mp' and 'twitter'."
    )
    parser.add_argument(
        "--dataset-path",
        "-p",
        help="The path of the dataset that should be used for cross-validation."
    )
    parser.add_argument(
        "--classifier",
        "-c",
        help=
        "The classifier to be trained. Possible values are 'decision_tree', 'one_class_svm', 'isolation_forest' and 'perceptron'."
    )
    parser.add_argument(
        "--evaluation-rounds",
        type=int,
        default=10,
        help=
        "Number of rounds the evaluation is executed. Reduces the variation caused by sampling."
    )
    parser.add_argument("--no-scaling",
                        dest='scale_features',
                        action='store_false',
                        help="Disables feature scaling.")
    parser.add_argument(
        "--output-path",
        "-o",
        default="evaluation.xlsx",
        help="The path of the file the results should be written to.")
    args = parser.parse_args(argv)

    # Get status updates
    print("Retrieving status updates...")
    status_updates = get_status_updates(args.data_source,
                                        dataset_path=args.dataset_path)
    # groupby() only merges consecutive equal keys, so sort by author first.
    status_updates = sorted(status_updates, key=lambda x: x.author)
    grouped_status_updates = [
        list(g) for k, g in itertools.groupby(status_updates, lambda x: x.author)
    ]

    # Cap on the number of status updates taken from the evaluated user.
    n_user = 500
    evaluation_data = []
    for r in range(args.evaluation_rounds):
        round_data = {}
        # Leave-one-author-out: each author in turn plays the "own" user.
        for i in range(len(grouped_status_updates)):
            user = grouped_status_updates[i][0].author
            user_status_updates = grouped_status_updates[i][:n_user]
            # NOTE(review): chain() does not copy — these are the same
            # objects held in grouped_status_updates, so the mutation
            # below persists across users and rounds. Confirm intended.
            ext_status_updates = list(
                itertools.chain(*[
                    x for j, x in enumerate(grouped_status_updates) if j != i
                ]))
            print("Round %s/%s: Analyzing @%s (%s/%s)" %
                  (r + 1, args.evaluation_rounds, user, i + 1,
                   len(grouped_status_updates)))

            # Adapt number of likes and shares of half of external status updates
            # (sampled without replacement) so those features alone cannot
            # distinguish foreign updates from the user's own.
            for j in np.random.choice(
                    len(ext_status_updates),
                    int(math.ceil(len(ext_status_updates) / 2)),
                    replace=False):
                random_status_update = random.choice(grouped_status_updates[i])
                ext_status_updates[
                    j]._number_of_likes = random_status_update.number_of_likes
                ext_status_updates[
                    j]._number_of_shares = random_status_update.number_of_shares

            # Construct test & training sets; keep the training set no
            # larger than the user's own set.
            ext_training_status_updates, ext_testing_status_updates = split_by_author(
                ext_status_updates, [user])
            if len(ext_training_status_updates) > len(user_status_updates):
                ext_training_status_updates = sample(
                    ext_training_status_updates, len(user_status_updates))

            # Add some status updates from other users; the first
            # START_BATCH_SIZE updates stay untouched as a clean batch.
            safe_user_status_updates = user_status_updates[:START_BATCH_SIZE]
            mixed_user_status_updates, ext_testing_status_updates = random_insert_seq(
                user_status_updates[START_BATCH_SIZE:],
                ext_testing_status_updates)

            # Run classifier
            analyzer = StatusUpdateAnalyzer(
                safe_user_status_updates + mixed_user_status_updates,
                ext_training_status_updates, args.classifier,
                args.scale_features)
            analyzer.analyze()

            # Evaluation metrics
            metrics = calculate_metrics(user_status_updates[START_BATCH_SIZE:],
                                        ext_testing_status_updates,
                                        analyzer.suspicious_statuses)
            round_data[user] = metrics

            tp, tn, fp, fn, prec, rec, fm, acc = metrics
            print("TP: %i, TN: %i, FP: %i, FN: %i" % (tp, tn, fp, fn))
            print("Prec: %.2f, Rec: %.2f, F: %.2f, Acc: %.2f" %
                  (prec, rec, fm, acc))
            print()
        evaluation_data.append(round_data)

    write_evaluation_results(evaluation_data, args.output_path)