save_location = '../../experiment_data/experiment_12' k_list = [25, 50, 75, 100, 150, 200, 250, 300] kernel_list = ["custom", "linear", "poly_2", "poly_3", "rbf"] # results -> k -> kernal results = [] limit_1 = 400 #training set limit_2 = 400 #validation set for k in k_list: print "Running experiment for k=" + str(k) + "..." ab = ABClassifier() ab.download_cursors(limit_unlabeled=5000, limit_labeled=5000) #ab.download_tweet_cursors(limit_unlabeled = 800, limit_labeled = 1000) ab.run_lsa(k=k) #pos_cursor_training = ab.labeled_collection.find({"bullying_label":"1"},timeout=False).limit(400) #neg_cursor_training = ab.labeled_collection.find({"bullying_label":"0"},timeout=False).limit(400) pos_cursor_training = ab.labeled_collection.find({ "bully": True }, timeout=False).limit(400) neg_cursor_training = ab.labeled_collection.find({ "bully": False }, timeout=False).limit(400)
"""Dump pairwise cosine-similarity values for labeled/unlabeled context vectors.

Downloads 1000 labeled and 1000 unlabeled documents, runs LSA (k=100),
computes context vectors, then writes the pairwise-similarity values of each
group to a CSV file in the working directory.
"""
import sys
sys.path.append('..')

from ABClassifier.ABClassifier import ABClassifier
import numpy as np

ab = ABClassifier()
ab.download_cursors(limit_unlabeled=1000, limit_labeled=1000)
ab.run_lsa(k=100)
ab.compute_context_vectors()

# pairwise_similarity presumably returns a dict keyed by pair id — TODO confirm.
pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list)
neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list)
unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list)
print("done getting pws")

# One loop replaces the original triplicated save code, and drops the
# redundant np.array -> np.asarray double conversion (asarray of an
# ndarray is a no-op).  list(...) keeps np.array correct even if .values()
# returns a view object rather than a list.
for out_file, pws in [('pos_labeled.csv', pos_labeled_pws),
                      ('neg_labeled.csv', neg_labeled_pws),
                      ('unlabeled.csv', unlabeled_pws)]:
    np.savetxt(out_file, np.array(list(pws.values())), delimiter=",")
save_location = '../../experiment_data/experiment_12' k_list = [25, 50, 75, 100, 150, 200, 250, 300] kernel_list = ["custom", "linear", "poly_2","poly_3", "rbf"] # results -> k -> kernal results = [] limit_1 = 400 #training set limit_2 = 400 #validation set for k in k_list: print "Running experiment for k=" + str(k) + "..." ab = ABClassifier() ab.download_cursors(limit_unlabeled = 5000, limit_labeled = 5000) #ab.download_tweet_cursors(limit_unlabeled = 800, limit_labeled = 1000) ab.run_lsa(k=k) #pos_cursor_training = ab.labeled_collection.find({"bullying_label":"1"},timeout=False).limit(400) #neg_cursor_training = ab.labeled_collection.find({"bullying_label":"0"},timeout=False).limit(400) pos_cursor_training = ab.labeled_collection.find({"bully":True},timeout=False).limit(400) neg_cursor_training = ab.labeled_collection.find({"bully":False},timeout=False).limit(400) training = [] tlabels = [] pos_validation = [] pos_vlabels = []
results_obj['neg_list_size'] = 0 results_obj['true_pos'] = 0 results_obj['true_neg'] = 0 results_obj['true_pos_rate'] = 0 results_obj['true_neg_rate'] = 0 results_obj['accuracy'] = 0 results_obj['num_iterations'] = 0 results[k] = results_obj for i in xrange(0,20): for k in k_list: print "Running experiment for k = " + str(k) ab = ABClassifier() ab.download_cursors(limit_unlabeled = limit_1, limit_labeled = limit_1) ab.run_lsa(k=k) print "Starting classification..." unlabeled_cursor = ab.db.tweets.find({"bullying_label" : {'$exists' :True}}, timeout = False) pos_cursor = ab.labeled_collection.find({"bully":True},timeout=False).limit(unlabeled_cursor.count()) neg_cursor = ab.labeled_collection.find({"bully":False},timeout=False).limit(unlabeled_cursor.count()) unl = [] pos = [] neg = [] for u,p,n in zip(unlabeled_cursor, pos_cursor, neg_cursor):
results = {} for t in thresholds: results_obj = {} results_obj['t'] = t results_obj['pos_list_size'] = 0 results_obj['neg_list_size'] = 0 results_obj['true_pos'] = 0 results_obj['true_neg'] = 0 results_obj['true_pos_rate'] = 0 results_obj['true_neg_rate'] = 0 results_obj['accuracy'] = 0 results_obj['num_iterations'] = 0 results[int(t*100)] = results_obj ab = ABClassifier() ab.download_tweet_cursors(limit_unlabeled = 5000, limit_labeled = 5000) ab.run_lsa(k=k) context_tweets = [ "Literally go f**k yourself, because you're honestly pathetic.", "f**k you f****n w***e go f**k yourself stupid bitch", "but this bad I want to kick her ass cuz she thinks she's a hard chola like sit ur fat ass down lol", "course he did he's a whipped bitch that will say anything to make u happy, unlike ur mum who called u fat", "Fat pig. You're disgusting.", "From some illiterate online keyboard warrior? Go back to sucking your butt buddy's fat junk.", "God Says; Evil Don't Know The Way. You are gay with AIDS & your sin cost you your anointing! This is why you use DUST!" "f**k you stupid f****t f*g" ] tweet_cvs = []
results_obj['k'] = k results_obj['pos_list_size'] = 0 results_obj['neg_list_size'] = 0 results_obj['true_pos'] = 0 results_obj['true_neg'] = 0 results_obj['true_pos_rate'] = 0 results_obj['true_neg_rate'] = 0 results_obj['accuracy'] = 0 results_obj['num_iterations'] = 0 results[k] = results_obj for i in xrange(0, 20): for k in k_list: print "Running experiment for k = " + str(k) ab = ABClassifier() ab.download_cursors(limit_unlabeled=limit_1, limit_labeled=limit_1) ab.run_lsa(k=k) print "Starting classification..." unlabeled_cursor = ab.db.tweets.find( {"bullying_label": { '$exists': True }}, timeout=False) pos_cursor = ab.labeled_collection.find({ "bully": True }, timeout=False).limit(unlabeled_cursor.count()) neg_cursor = ab.labeled_collection.find({ "bully": False },
thresholds = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85] results = {} for t in thresholds: results_obj = {} results_obj['t'] = t results_obj['pos_list_size'] = 0 results_obj['neg_list_size'] = 0 results_obj['true_pos'] = 0 results_obj['true_neg'] = 0 results_obj['true_pos_rate'] = 0 results_obj['true_neg_rate'] = 0 results_obj['accuracy'] = 0 results_obj['num_iterations'] = 0 results[int(t * 100)] = results_obj ab = ABClassifier() ab.download_cursors(limit_unlabeled=limit_1, limit_labeled=limit_1) ab.run_lsa(k=k) context_tweets = [ "Literally go f**k yourself, because you're honestly pathetic.", "f**k you f****n w***e go f**k yourself stupid bitch", "but this bad I want to kick her ass cuz she thinks she's a hard chola like sit ur fat ass down lol", "course he did he's a whipped bitch that will say anything to make u happy, unlike ur mum who called u fat", "Fat pig. You're disgusting.", "From some illiterate online keyboard warrior? Go back to sucking your butt buddy's fat junk.", "God Says; Evil Don't Know The Way. You are gay with AIDS & your sin cost you your anointing! This is why you use DUST!" "f**k you stupid f****t f*g" ] tweet_cvs = [] for c in context_tweets:
-Positive Examples, Negative Examples, Unlabeled -Vary the number of input tweets to the Co-Occurrence Matrix """ import sys sys.path.append('../..') from ABClassifier.ABClassifier import ABClassifier import numpy as np from sklearn.metrics.pairwise import cosine_similarity save_location = '../../experiment_data/experiment_2' k_list = [5, 10, 25, 50, 100, 150, 250, 500] for k in k_list: ab = ABClassifier() ab.download_cursors(limit_unlabeled=1000, limit_labeled=1000) ab.run_lsa(k=k) ab.compute_context_vectors(save_location=save_location) print "Performing pairwise similarity measures..." pos_labeled_pws = cosine_similarity(ab.pos_labeled_cv_list).flatten() neg_labeled_pws = cosine_similarity(ab.neg_labeled_cv_list).flatten() unlabeled_pws = cosine_similarity(ab.unlabeled_cv_list).flatten() print "Done." print "Saving..." np.savetxt(save_location + '/pw_pos_' + str(k) + '.csv',
"""Dump pairwise cosine-similarity values for labeled/unlabeled context vectors.

Same pipeline as the 1000-document variant but with 10000 labeled and 10000
unlabeled documents: run LSA (k=100), compute context vectors, and write the
pairwise-similarity values of each group to a CSV in the working directory.
"""
import sys
sys.path.append('..')

from ABClassifier.ABClassifier import ABClassifier
import numpy as np

ab = ABClassifier()
ab.download_cursors(limit_unlabeled=10000, limit_labeled=10000)
ab.run_lsa(k=100)
ab.compute_context_vectors()

# pairwise_similarity presumably returns a dict keyed by pair id — TODO confirm.
pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list)
neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list)
unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list)
print("done getting pws")

# One loop replaces the original triplicated save code, and drops the
# redundant np.array -> np.asarray double conversion (asarray of an
# ndarray is a no-op).  list(...) keeps np.array correct even if .values()
# returns a view object rather than a list.
for out_file, pws in [('pos_labeled.csv', pos_labeled_pws),
                      ('neg_labeled.csv', neg_labeled_pws),
                      ('unlabeled.csv', unlabeled_pws)]:
    np.savetxt(out_file, np.array(list(pws.values())), delimiter=",")
-Positive Examples, Negative Examples, Unlabeled -Vary the number of input tweets to the Co-Occurrence Matrix -Uses ONLY twitter data for training/validation """ import sys sys.path.append('../..') from ABClassifier.ABClassifier import ABClassifier import numpy as np import os save_location = '../../experiment_data/experiment_9' ab = ABClassifier() ab.download_tweet_cursors(limit_unlabeled=2500, limit_labeled=2500) ab.run_lsa(k=150) ab.compute_context_vectors(save_location=save_location) print "Performing pairwise similarity measures..." pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list) neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list) unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list) print "Done." print "Saving..." x = np.array(pos_labeled_pws.values())
""" import sys sys.path.append('../..') from ABClassifier.ABClassifier import ABClassifier import numpy as np from sklearn.metrics.pairwise import cosine_similarity save_location = '../../experiment_data/experiment_2' k_list = [5, 10, 25, 50, 100, 150, 250, 500] for k in k_list: ab = ABClassifier() ab.download_cursors(limit_unlabeled = 1000, limit_labeled = 1000) ab.run_lsa(k=k) ab.compute_context_vectors(save_location = save_location) print "Performing pairwise similarity measures..." pos_labeled_pws = cosine_similarity(ab.pos_labeled_cv_list).flatten() neg_labeled_pws = cosine_similarity(ab.neg_labeled_cv_list).flatten() unlabeled_pws = cosine_similarity(ab.unlabeled_cv_list).flatten() print "Done." print "Saving..." np.savetxt(save_location + '/pw_pos_' + str(k) + '.csv', pos_labeled_pws, delimiter=",")
thresholds = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85] results = {} for t in thresholds: results_obj = {} results_obj['t'] = t results_obj['pos_list_size'] = 0 results_obj['neg_list_size'] = 0 results_obj['true_pos'] = 0 results_obj['true_neg'] = 0 results_obj['true_pos_rate'] = 0 results_obj['true_neg_rate'] = 0 results_obj['accuracy'] = 0 results_obj['num_iterations'] = 0 results[int(t*100)] = results_obj ab = ABClassifier() ab.download_tweet_cursors(limit_unlabeled = 2500, limit_labeled = 2500) ab.run_lsa(k=k) for i in xrange(0,1): for t in thresholds: print "Running experiment for t = " + str(t) print "Starting classification..." unlabeled_cursor = ab.db.tweets.find({"bullying_label" : {'$exists' :True}}, timeout = False).limit(limit_1) pos_cursor = ab.labeled_collection.find({"bullying_label":"1"},timeout=False).skip(limit_1).limit(limit_2) neg_cursor = ab.labeled_collection.find({"bullying_label":"0"},timeout=False).skip(limit_1).limit(limit_2)
thresholds = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85] results = {} for t in thresholds: results_obj = {} results_obj['t'] = t results_obj['pos_list_size'] = 0 results_obj['neg_list_size'] = 0 results_obj['true_pos'] = 0 results_obj['true_neg'] = 0 results_obj['true_pos_rate'] = 0 results_obj['true_neg_rate'] = 0 results_obj['accuracy'] = 0 results_obj['num_iterations'] = 0 results[int(t * 100)] = results_obj ab = ABClassifier() ab.download_tweet_cursors(limit_unlabeled=2500, limit_labeled=2500) ab.run_lsa(k=k) for i in xrange(0, 1): for t in thresholds: print "Running experiment for t = " + str(t) print "Starting classification..." unlabeled_cursor = ab.db.tweets.find( { "bullying_label": { '$exists': True } }, timeout=False).limit(limit_1)
-Positive Examples, Negative Examples, Unlabeled -Vary the number of input tweets to the Co-Occurrence Matrix -Uses ONLY twitter data for training/validation """ import sys sys.path.append('../..') from ABClassifier.ABClassifier import ABClassifier import numpy as np import os save_location = '../../experiment_data/experiment_9' ab = ABClassifier() ab.download_tweet_cursors(limit_unlabeled = 2500, limit_labeled = 2500) ab.run_lsa(k=150) ab.compute_context_vectors(save_location = save_location) print "Performing pairwise similarity measures..." pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list) neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list) unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list) print "Done." print "Saving..."