# Example 1
def heuristic_1a(split_mode, experiment_matrix_kfold, mode, classifiers, model, users):
	"""Run Heuristic 1a doppelgaenger detection over k-fold splits and persist results.

	For every fold, both the training and test partitions are re-split with
	``split_mode`` before detection. The per-fold results are concatenated,
	summarized into true/false positive/negative counts, printed, and pickled
	to ``misc/experiment_results``.

	:param split_mode: account-splitting strategy ("i".."iv") passed to trainer
	:param experiment_matrix_kfold: iterable of (train, test) fold pairs
	:param mode: pairwise-probability combination mode (e.g. "average")
	:param classifiers: pre-trained classifier objects from trainer.get_classifiers
	:param model: model name, used only for reporting / the output file name
	:param users: user count, used only for reporting / the output file name
	"""
	results = []
	for emsk in experiment_matrix_kfold:
		# .copy() so split_user_accounts cannot mutate the shared fold matrices
		train = trainer.split_user_accounts(emsk[0].copy(), split_mode)
		test = trainer.split_user_accounts(emsk[1].copy(), split_mode)
		r = trainer.dopplegeanger_detection([train, test], mode, classifiers)
		results.append(r)
	results = np.concatenate(results, axis=0)
	tfpn = trainer.get_number_true_false_positive_negative(results)
	print("Total numbers true/false positives/negatives for Heuristic 1a, " + str(users) + " Users, " + str(split_mode) + " Testing Split, " + str(model) + ": ")
	print(tfpn)
	# NOTE: datetime.now() in the name contains spaces/colons — fine on POSIX,
	# but would be an invalid filename on Windows.
	out_path = ("misc/experiment_results/experiment_heuristic_1a_-_" + str(model) + "_-_users_" + str(
		users) + "_-_split_mode_" + split_mode + "_-_" + str(datetime.now()) + ".pkl")
	# Context manager guarantees the handle is closed even if pickle.dump raises
	# (the original open/close pair leaked the file descriptor on error).
	with open(out_path, "wb") as f:
		pickle.dump([results, tfpn], f)
# Example 2
def classification_comments(pc, users2, comments, model, split_mode_2, mode):
	"""Evaluate doppelgaenger detection for a fixed comment count (Task 3b).

	Builds an experiment matrix limited to ``comments`` comments per user,
	trains classifiers on the original plus split accounts, then runs
	3-fold cross-validated detection and prints the aggregate
	true/false positive/negative counts.
	"""
	experiment_matrix = trainer.get_matrix_experiment_one(pc, users2, comments=comments, text_length=250)
	split_accounts = trainer.split_user_accounts(experiment_matrix.copy())
	# Train on original + split accounts combined.
	training_matrix = np.append(experiment_matrix, split_accounts, axis=0)
	classifiers = trainer.get_classifiers(training_matrix, model)
	folds = trainer.k_fold_cross_validation(experiment_matrix, 3)

	per_fold_results = []
	for fold in folds:
		# Copies prevent split_user_accounts from mutating the shared folds.
		fold_train = trainer.split_user_accounts(fold[0].copy(), split_mode_2)
		fold_test = trainer.split_user_accounts(fold[1].copy(), split_mode_2)
		detection = trainer.dopplegeanger_detection([fold_train, fold_test], mode, classifiers)
		per_fold_results.append(detection)
	all_results = np.concatenate(per_fold_results, axis=0)
	tfpn = trainer.get_number_true_false_positive_negative(all_results)
	print("Total numbers true/false positives/negatives: ")
	print(tfpn)
# Example 3
def classification_users(pc, users, model, split_mode_2, mode):
	"""Evaluate doppelgaenger detection for a fixed user count (Task 3a).

	Builds an experiment matrix for ``users`` users, trains classifiers on
	the original plus split accounts, runs 3-fold cross-validated detection
	and prints the aggregate true/false positive/negative counts.
	"""
	print("\n== Executing Performance Measurements for Task 3a: Machine Learning Model: " + str(model) + "; " + str(users) + " Users ==\n")

	experiment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=250)
	split_accounts = trainer.split_user_accounts(experiment_matrix.copy())
	# Combined training set: original accounts plus their split halves.
	training_matrix = np.append(experiment_matrix, split_accounts, axis=0)
	classifiers = trainer.get_classifiers(training_matrix, model)
	folds = trainer.k_fold_cross_validation(experiment_matrix, 3)

	per_fold_results = []
	for fold in folds:
		# Copies prevent split_user_accounts from mutating the shared folds.
		fold_train = trainer.split_user_accounts(fold[0].copy(), split_mode_2)
		fold_test = trainer.split_user_accounts(fold[1].copy(), split_mode_2)
		per_fold_results.append(trainer.dopplegeanger_detection([fold_train, fold_test], mode, classifiers))
	all_results = np.concatenate(per_fold_results, axis=0)
	tfpn = trainer.get_number_true_false_positive_negative(all_results)
	print("Total numbers true/false positives/negatives: ")
	print(tfpn)
# Example 4
def main(spider="guardianSpider", log=False, size=0):
	"""Command-line entry point for the GuardianBot crawler / analysis tool.

	Dispatches on the module-level ``args`` namespace (parsed elsewhere in
	the file): database cleanup, crawling, statistics reporting, per-user
	comment listing, feature extraction, doppelgaenger experiments and
	plotting. Opens three connections to the same SQLite database and
	closes them on exit.

	NOTE(review): the ``spider`` parameter is never used, and ``log`` /
	``size`` are immediately overwritten below, so all three keyword
	arguments are effectively dead — the CLI flags in ``args`` are the
	real source of truth.
	"""
	#Database declaration and connection
	# NOTE(review): these assignments clobber the log/size parameters (see above).
	size = None
	log = None
	mode = None
	database = r'database/dopplegaenger.db'
	# Three separate connections to the same database file.
	conn_article = create_connection(database)
	conn_comments = create_connection(database)
	conn_user = create_connection(database)
	# Configure the messaging logger to write to its own file.
	messaging_logger.setLevel(LOG_LEVEL)
	messaging_logger_file_handler = FileHandler(MESSAGING_LOG_FILE)
	messaging_logger_file_handler.setLevel(LOG_LEVEL)
	messaging_logger_file_handler.setFormatter(Formatter(LOG_FORMAT))
	messaging_logger.addHandler(messaging_logger_file_handler)
	#Debug Log
	configure_logging(install_root_handler = False)
	# logging.basicConfig(filename='logs/webapp.log', level=logging.DEBUG, format=f"%(asctime)s %(levelname)s %(name)s %(threadName)s : %(message)s")
	# Load scrapy settings from the local 'settings' module.
	settings = Settings()
	os.environ['SCRAPY_SETTINGS_MODULE'] = 'settings'
	settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
	settings.setmodule(settings_module_path, priority='default')

	# Pull optional values from the parsed CLI arguments.
	if args.size:
		size = args.size
	if args.log:
		log = args.log
	if args.mode:
		mode = args.mode
	if args.clean:
		# Drop all tables and truncate the log files (if present).
		drop_all(conn_article)
		if os.path.isfile('logs/webapp.log'):
			open('logs/webapp.log', 'w').close()
		else:
			logging.log(logging.WARNING, "WARNING! Webapp log doesn't exist.")
		if os.path.isfile('logs/report.log'):
			open('logs/report.log', 'w').close()
		else:
			logging.log(logging.WARNING, "WARNING! Report log doesn't exist.")
	if args.version:
		print("GuardianBot version 1.0")
	if args.run:
		# Crawl articles and comments concurrently, then stop the reactor
		# once both spiders have finished.
		if conn_article is not None and conn_comments is not None:
			create_article_table(conn_article)
			create_user_table(conn_article)
			create_comment_table(conn_article)
			create_stats_table(conn_article)
			runner = CrawlerRunner(settings)
			runner.crawl(guardianSpider,connection=conn_article)
			runner.crawl(commentSpider,connection=conn_comments)
			d = runner.join()
			d.addBoth(lambda _: reactor.stop())
			reactor.run()  # the script will block here until the crawling is finished
		else:
			logging.log(logging.ERROR, "Fatal Error! Database Tables Not Created. Exiting!")
	elif args.info:
		# Report row counts for articles, comments and users.
		# datetime object containing current date and time
		now = datetime.now()
		# dd/mm/YY H:M:S
		dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
		messaging_logger.info("================ Executed at " + dt_string + " ================")
		try:
			cur = sql_count_articles(conn_article)
			number_articles = cur.fetchall()[0][0]
			print("Articles: " + str(number_articles))
			messaging_logger.info("Articles: " + str(number_articles))
		except sql.Error as error:
			logging.log(logging.ERROR, "Fatal Error! Articles Table Not Accessible. Exiting!")
		try:
			cur = sql_count_comments(conn_comments)
			number_comments = cur.fetchall()[0][0]
			print("Comments: " + str(number_comments))
			messaging_logger.info("Comments: " + str(number_comments))
		except sql.Error as error:
			logging.log(logging.ERROR, "Fatal Error! Comments Table Not Accessible. Exiting!")
		try:
			cur = sql_count_users(conn_user)
			number_users = cur.fetchall()[0][0]
			print("Users: " + str(number_users))
			messaging_logger.info("Users: " + str(number_users))
		except sql.Error as error:
			logging.log(logging.ERROR, "Fatal Error! Users Table Not Accessible. Exiting!")
		# NOTE(review): the next line is corrupted — a redaction tool replaced
		# part of the expression with "******"; the original computed and
		# printed the average comments per user. Restore before running.
		print("Average Comments per User: "******"Average Comments per User: "******"\n")
	elif size:
		# List up to `size` comments for every user in the database.
		try:
			#Returns a dictionary cursor instead of tuple
			conn_comments.row_factory = sql.Row
			cursor = sql_select_all_users(conn_comments)
			rows_user = cursor.fetchall();
			for user in rows_user:
				# NOTE(review): corrupted by redaction ("******") — originally
				# printed the username followed by a separator line.
				print("Next User: "******"--------------------------------------------------")
				logging.log(logging.INFO, 'Next User: %s', user['username'])
				try:
					#Returns a dictionary cursor instead of tuple
					conn_comments.row_factory = sql.Row
					cur = sql_select_comments_from_user(conn_comments,user['username'],args.size)
					rows = cur.fetchall();
					for row in rows:
						print(" Article Title: ", row['article_title'], "\n" , "Article URL: ", row['article_url'], "\n\n" " User Comment: ", row['comment_text'] , "\n")
				except sql.Error as error:
					logging.log(logging.ERROR, "Fatal Error! Comment Table Not Accessible. Exiting!")
		except sql.Error as error:
			logging.log(logging.ERROR, "Fatal Error! Users Table Not Accessible. Exiting!")
	elif args.user:
		# List comments for a single user: args.user = [username, count].
		try:
			#Returns a dictionary cursor instead of tuple
			conn_comments.row_factory = sql.Row
			# NOTE(review): corrupted by redaction ("******") — originally
			# printed the username followed by a separator line.
			print("User: "******"--------------------------------------------------")
			try:
				#Returns a dictionary cursor instead of tuple
				conn_comments.row_factory = sql.Row
				cur = sql_select_comments_from_user(conn_comments,args.user[0],int(args.user[1]))
				rows = cur.fetchall();
				for row in rows:
					print(" Article Title: ", row['article_title'], "\n" , "Article URL: ", row['article_url'], "\n\n" " User Comment: ", row['comment_text'] , "\n")
			except sql.Error as error:
				logging.log(logging.ERROR, "Fatal Error! Comment Table Not Accessible. Exiting!")
		except sql.Error as error:
			logging.log(logging.ERROR, "Fatal Error! Users Table Not Accessible. Exiting!")
	#TODO Use ForEach logic to execute multiple modes in succession.
	if mode:
		mode_execute(mode)
	if args.features:
		# Load comments, compute (or load) the feature matrix, run PCA and
		# optionally an interactive doppelgaenger analysis.
		logging.log(logging.INFO, "Now computing statistics")
		cur_comments_and_id = db.sql_return_comments_users_hundred(conn_article)
		datad = cur_comments_and_id.fetchall()
		# Column layout assumed: 0=comment id, 1=text, 3=user id, 5=article id.
		comment_id_bulk = [d[0] for d in datad]
		comment_article_id_bulk = [d[5] for d in datad]
		comment_user_id_bulk = [d[3] for d in datad]
		comment_text_bulk = [d[1] for d in datad]

		# comment first four lines of this block if feature matrix should be loaded from file
		# comment last two lines of this block if feature matrix should be computed new
		#statistics = fmatrix.feature_matrix(comment_text_bulk[:100],comment_user_id_bulk[:100],comment_id_bulk[:100],comment_article_id_bulk[:100])
		#f = open("data.pkl", "wb")
		#pickle.dump(statistics, f)
		#f.close()
		# NOTE(review): file handle is never closed; prefer a with-statement.
		f = open("data.pkl", "rb")
		statistics = pickle.load(f)
		pc = pca.execute_pca(statistics)
		pc = trainer.get_matrix_experiment_one(pc, users=4, text_length=500)

		pc = trainer.split_user_accounts(pc)
		pcs =  trainer.k_fold_cross_validation(pc, 3)

		# Interactive yes/no prompt (empty input counts as yes).
		yes = set(['yes','y', 'ye', ''])
		no = set(['no','n'])
		choice = input('Would you like to execute the dopplegaenger analysis as well?: ').lower()
		if choice in yes:
			#EndlessLoop until value submitted
			mode = input('Which mode would you like to use; average, multiplication, squaredaverage: ').lower()
			#Return list of authors with possible dopplegaenger identities
			modelist = set(['average', 'multiplication', 'squaredaverage'])
			results= []
			if mode in modelist:
				for p in pcs:
					r = trainer.dopplegeanger_detection(p, mode)
					#r = trainer.dopplegaenger_detection_euclid(pc, 1)
					results.append(r)
					for row in r:
						print(row)
			else:
				logging.log(logging.INFO, "Please select either: average, multiplication, squaredaverage")
		elif choice in no:
			pass
		else:
			logging.log(logging.INFO, "Please respond with 'yes' or 'no'")
	if args.plot:
		print("This mode plots the experiment results from task sheet 5. There are plots available for 5, 10, 20, 50 and 100 Users.")
		users = int(input('Enter user number: '))
		plot.show_all_plots(users)

	if args.experiments:
		# Task-sheet experiments: heuristics 1a-1c run as parallel processes,
		# then Task 3a/3b performance measurements run sequentially.
		# NOTE(review): handle is never closed; prefer a with-statement.
		f = open("data_large.pkl", "rb")
		statistics = pickle.load(f)
		pc = pca.execute_pca(statistics)

		mode = input('Which mode would you like to use to compute the pairwise probability; average, multiplication, squaredaverage: ').lower()
		split_modes_list = ["i", "ii", "iii", "iv"]
		split_modes_list_bc = ["ii", "iii", "iv"]
		models_list = ["svc", "randomforest", "knearestneighbors"]
		users_list = [50, 100]

		jobs = []

		## Task 1-2
		for model in models_list:
			for users in users_list:
				experiment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=750)
				experiment_matrix_split = trainer.split_user_accounts(experiment_matrix.copy())
				experiment_matrix_combined_training = np.append(experiment_matrix, experiment_matrix_split, axis=0)
				classifiers = trainer.get_classifiers(experiment_matrix_combined_training, model)
				experiment_matrix_kfold = trainer.k_fold_cross_validation(experiment_matrix, 3)

				#### Heuristic 1 a) Experiment 1-4
				# One process per split mode; processes are started but only
				# collected in `jobs` (never joined here).
				for split_mode in split_modes_list:
					p = multiprocessing.Process(target=heuristic_1a, args=(split_mode,experiment_matrix_kfold,mode,classifiers,model,users,))
					jobs.append(p)
					p.start()

				#### Heuristic 1 b) Experiment 1-4
				for split_mode in split_modes_list:
					p = multiprocessing.Process(target=heuristic_1b, args=(
					split_mode, experiment_matrix_kfold, mode, classifiers, model, users,split_modes_list_bc,))
					jobs.append(p)
					p.start()

				#### Heuristic 1 c) Experiment 1-4
				for split_mode in split_modes_list:
					p = multiprocessing.Process(target=heuristic_1c, args=(
					split_mode, experiment_matrix_kfold, mode, classifiers, model, users,split_modes_list_bc,))
					jobs.append(p)
					p.start()


		## Task 3
		users_list_2 = [25,50,75,100]
		comments_list_2 = [10, 15, 20, 25]
		split_mode_2 = "iv"
		users2 = 50

		#### Task 3a
		###### Performance Feature Extraction
		for users in users_list_2:
			print(
				"\n== Executing Performance Measurements for Task 3a Feature Extraction: " + str(users) + " Users ==\n")
			feature_extraction_users(conn_article,users)

		###### Performance Doppelgaenger Detection
		for model in models_list:
			for users in users_list_2:
					print(
						"\n== Executing Performance Measurements for Task 3a Doppelgaenger Detection: Machine Learning Model: " + str(
							model) + "; " + str(users) + " Users ==\n")
					classification_users(pc,users,model,split_mode_2,mode)

		#### Task 3b
		###### Performance Feature Extraction
		for comments in comments_list_2:
			print(
				"\n== Executing Performance Measurements for Task 3b Feature Extraction: " + str(comments) + " Comments ==\n")
			feature_extraction_comments(conn_article,comments)

		###### Performance Doppelgaenger Detection
		for model in models_list:
			for comments in comments_list_2:
					print("\n== Executing Performance Measurements for Task 3b Doppelgaenger Detection: Machine Learning Model: " + str(model) + "; " + str(users2) + " Users; " + str(comments) + " comments ==\n")
					classification_comments(pc,users2,comments,model,split_mode_2,mode)




	#### OLD EXPERIMENTS FROM PRIOR TASK SHEET BELOW

	# ## Task 2 a) Experiment 1
		# print("\n===== Executing Task 2 a) Experiment 1 =====")
		# experiment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=250)
		# experiment_matrix_split = trainer.split_user_accounts(experiment_matrix.copy())
		# experiment_matrix_combined_training = np.append(experiment_matrix, experiment_matrix_split, axis=0)
		# classifiers = trainer.get_classifiers(experiment_matrix_combined_training, model)
		# experiment_matrix_split_kfold = trainer.k_fold_cross_validation(experiment_matrix_split, 3)
		# results = []
		# for emsk in experiment_matrix_split_kfold:
		# 	r = trainer.dopplegeanger_detection(emsk, mode, classifiers)
		# 	results.append(r)
		# results = np.concatenate(results, axis=0)
		# tfpn = trainer.get_number_true_false_positive_negative(results)
		# print("Total numbers true/false positives/negatives: ")
		# print(tfpn)
		# cm = [[tfpn["true_positive"],tfpn["false_positive"]],[tfpn["false_negative"],tfpn["true_negative"]]]
		# trainer.plot_heatmap(cm,"Task 2a ex1")
		# f = open("2a_experiment_1.pkl", "wb")
		# pickle.dump([results, tfpn], f)
		# f.close()
		#
		# ## Task 2 a) Experiment 2
		# print("\n===== Executing Task 2 a) Experiment 2 =====")
		# experiment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=500)
		# experiment_matrix_split = trainer.split_user_accounts(experiment_matrix.copy())
		# experiment_matrix_combined_training = np.append(experiment_matrix, experiment_matrix_split, axis=0)
		# classifiers = trainer.get_classifiers(experiment_matrix_combined_training, model)
		# experiment_matrix_split_kfold = trainer.k_fold_cross_validation(experiment_matrix_split, 3)
		# results = []
		# for emsk in experiment_matrix_split_kfold:
		# 	r = trainer.dopplegeanger_detection(emsk, mode, classifiers)
		# 	results.append(r)
		# results = np.concatenate(results, axis=0)
		# tfpn = trainer.get_number_true_false_positive_negative(results)
		# print("Total numbers true/false positives/negatives: ")
		# print(tfpn)
		# cm = [[tfpn["true_positive"],tfpn["false_positive"]],[tfpn["false_negative"],tfpn["true_negative"]]]
		# trainer.plot_heatmap(cm,"Task 2 ex 2")
		# f = open("2a_experiment_2.pkl", "wb")
		# pickle.dump([results, tfpn], f)
		# f.close()
		#
		# ## Task 2 a) Experiment 3
		# print("\n===== Executing Task 2 a) Experiment 3 =====")
		# experiment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=750)
		# experiment_matrix_split = trainer.split_user_accounts(experiment_matrix.copy())
		# experiment_matrix_combined_training = np.append(experiment_matrix, experiment_matrix_split, axis=0)
		# classifiers = trainer.get_classifiers(experiment_matrix_combined_training, model)
		# experiment_matrix_split_kfold = trainer.k_fold_cross_validation(experiment_matrix_split, 3)
		# results = []
		# for emsk in experiment_matrix_split_kfold:
		# 	r = trainer.dopplegeanger_detection(emsk, mode, classifiers)
		# 	results.append(r)
		# results = np.concatenate(results, axis=0)
		# tfpn = trainer.get_number_true_false_positive_negative(results)
		# print("Total numbers true/false positives/negatives: ")
		# print(tfpn)
		# cm = [[tfpn["true_positive"],tfpn["false_positive"]],[tfpn["false_negative"],tfpn["true_negative"]]]
		# trainer.plot_heatmap(cm,"Task 2a ex 3")
		# f = open("2a_experiment_3.pkl", "wb")
		# pickle.dump([results, tfpn], f)
		# f.close()
		#
		# ## Task 2 b) Experiment 1-3
		# experiment_matrices = trainer.get_matrix_experiment_two(pc)
		# i = 1
		# for exm in experiment_matrices:
		# 	print("\n===== Executing Task 2 b) Experiment " + str(i) + " =====")
		# 	exm_split = trainer.split_user_accounts(exm.copy(), split_mode)
		#
		# 	experiment_matrix_combined_training = np.append(exm, exm_split, axis=0)
		# 	classifiers = trainer.get_classifiers(experiment_matrix_combined_training, model)
		# 	exm_split_kfold = trainer.k_fold_cross_validation(exm_split, 3)
		# 	results = []
		# 	for emsk in exm_split_kfold:
		# 		r = trainer.dopplegeanger_detection(emsk, mode, classifiers)
		# 		results.append(r)
		# 	results = np.concatenate(results, axis=0)
		# 	tfpn = trainer.get_number_true_false_positive_negative(results)
		# 	print("Total numbers true/false positives/negatives: ")
		# 	print(tfpn)
		# 	cm = [[tfpn["true_positive"],tfpn["false_positive"]],[tfpn["false_negative"],tfpn["true_negative"]]]
		# 	title = "Task 2b ex " + str(i)
		# 	trainer.plot_heatmap(cm,title)
		# 	f = open("2b_experiment_" + str(i) + ".pkl", "wb")
		# 	pickle.dump([results, tfpn], f)
		# 	f.close()
		# 	i += 1
		#
		#
		# ## Task 3 a)
		# print("\n===== Executing Task 3 a) =====")
		# threshold_euclid = input('Select threshold for Euclid: ')
		# expirment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=750)
		# expirment_matrix_split = trainer.split_user_accounts(expirment_matrix, split_mode)
		# r = trainer.dopplegaenger_detection_euclid(expirment_matrix_split, threshold=float(threshold_euclid))
		# tfpn = trainer.get_number_true_false_positive_negative(r)
		# print("Total numbers true/false positives/negatives: ")
		# print(tfpn)
		# print(len(r))
		# cm = [[tfpn["true_positive"],tfpn["false_positive"]],[tfpn["false_negative"],tfpn["true_negative"]]]
		# trainer.plot_heatmap(cm,"Task 3a")
		# f = open("3a_experiment.pkl", "wb")
		# pickle.dump([r, tfpn], f)
		# f.close()
		#
		# ## Task 3 b)
		# print("\n===== Executing Task 3 b)====")
		# expirment_matrix = trainer.get_matrix_experiment_one(pc, users, text_length=750)
		# expirment_matrix_split = trainer.split_user_accounts(expirment_matrix, split_mode)
		# expirment_matrix_split_kfold = trainer.k_fold_cross_validation(expirment_matrix_split, 3)
		# results = []
		# for emsk in expirment_matrix_split_kfold:
		# 	print("==== Executing Three Fold Cross Valication")
		# 	threshold = trainer.get_optimal_distance_euclid(emsk[0])
		# 	r = trainer.dopplegaenger_detection_euclid(emsk[1], threshold)
		# 	results.append(r)
		# results = np.concatenate(results, axis=0)
		# tfpn = trainer.get_number_true_false_positive_negative(results)
		# print("Total numbers true/false positives/negatives: ")
		# print(tfpn)
		# cm = [[tfpn["true_positive"],tfpn["false_positive"]],[tfpn["false_negative"],tfpn["true_negative"]]]
		# trainer.plot_heatmap(cm,"Task 3b")
		# f = open("3b_experiment.pkl", "wb")
		# pickle.dump([results, tfpn], f)
		# f.close()
		# #Close writing to PDF after calling plot_heatmap
		# trainer.closepdf()
	# Release all three database connections before returning.
	close_db_connection(conn_article)
	close_db_connection(conn_comments)
	close_db_connection(conn_user)