Example #1
save_location = '../../experiment_data/experiment_12'

k_list = [25, 50, 75, 100, 150, 200, 250, 300]

kernel_list = ["custom", "linear", "poly_2", "poly_3", "rbf"]

# results -> k -> kernel
results = []

limit_1 = 400  #training set
limit_2 = 400  #validation set

for k in k_list:
    print "Running experiment for k=" + str(k) + "..."
    ab = ABClassifier()
    ab.download_cursors(limit_unlabeled=5000, limit_labeled=5000)
    #ab.download_tweet_cursors(limit_unlabeled = 800, limit_labeled = 1000)
    ab.run_lsa(k=k)

    #pos_cursor_training = ab.labeled_collection.find({"bullying_label":"1"},timeout=False).limit(400)
    #neg_cursor_training = ab.labeled_collection.find({"bullying_label":"0"},timeout=False).limit(400)

    pos_cursor_training = ab.labeled_collection.find({"bully": True}, timeout=False).limit(400)
    neg_cursor_training = ab.labeled_collection.find({"bully": False}, timeout=False).limit(400)
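
The excerpt stops before the cursors are used. A minimal, purely illustrative continuation that folds the two cursors into a training list with binary labels (the `training`/`tlabels` names are borrowed from Example #3 below; nothing here is the original code):

    # Hypothetical continuation: collect raw documents and assign binary
    # labels (1 = bullying, 0 = not bullying). Names mirror Example #3.
    training = []
    tlabels = []
    for doc in pos_cursor_training:
        training.append(doc)
        tlabels.append(1)
    for doc in neg_cursor_training:
        training.append(doc)
        tlabels.append(0)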
Example #2
import sys
sys.path.append('..')
from ABClassifier.ABClassifier import ABClassifier
import numpy as np


ab = ABClassifier()

ab.download_cursors(limit_unlabeled = 1000, limit_labeled = 1000)
ab.run_lsa(k=100)
ab.compute_context_vectors()

pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list)
neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list)
unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list)

print "done getting pws"

x = np.array(pos_labeled_pws.values())
a = np.asarray(x)
np.savetxt('pos_labeled.csv', a, delimiter=",")

y = np.array(neg_labeled_pws.values())
b = np.asarray(y)
np.savetxt('neg_labeled.csv', b, delimiter=",")

z = np.array(unlabeled_pws.values())
c = np.asarray(z)
np.savetxt('unlabeled.csv', c, delimiter=",")
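Examples #8 and #11 further down call scikit-learn's cosine_similarity on these same *_cv_list attributes, which suggests what pairwise_similarity computes. A minimal sketch of that route, assuming each *_cv_list is a list of equal-length context vectors (only the attribute names come from the snippets; the helper below is illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def pairwise_cosine(cv_list):
    # Cosine similarity between every pair of context vectors, flattened
    # to a 1-D array so it can be written straight to a CSV.
    return cosine_similarity(np.asarray(cv_list)).flatten()

# Roughly equivalent to the dict-based route above, minus the .values() call:
np.savetxt('pos_labeled.csv', pairwise_cosine(ab.pos_labeled_cv_list), delimiter=",")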

Example #3
save_location = '../../experiment_data/experiment_12'

k_list = [25, 50, 75, 100, 150, 200, 250, 300]

kernel_list = ["custom", "linear", "poly_2","poly_3", "rbf"]

# results -> k -> kernel
results = []

limit_1 = 400 #training set
limit_2 = 400 #validation set


for k in k_list:
	print "Running experiment for k=" + str(k) + "..."
	ab = ABClassifier()
	ab.download_cursors(limit_unlabeled = 5000, limit_labeled = 5000)
	#ab.download_tweet_cursors(limit_unlabeled = 800, limit_labeled = 1000)
	ab.run_lsa(k=k)

	#pos_cursor_training = ab.labeled_collection.find({"bullying_label":"1"},timeout=False).limit(400)
	#neg_cursor_training = ab.labeled_collection.find({"bullying_label":"0"},timeout=False).limit(400)

	pos_cursor_training = ab.labeled_collection.find({"bully":True},timeout=False).limit(400)
	neg_cursor_training = ab.labeled_collection.find({"bully":False},timeout=False).limit(400)

	training = []
	tlabels = []

	pos_validation = []
	pos_vlabels = []
Example #4
	results_obj['neg_list_size'] = 0
	results_obj['true_pos'] = 0
	results_obj['true_neg'] = 0
	results_obj['true_pos_rate'] = 0
	results_obj['true_neg_rate'] = 0
	results_obj['accuracy'] = 0
	results_obj['num_iterations'] = 0
	results[k] = results_obj



for i in xrange(0,20):
	for k in k_list:

		print "Running experiment for k = " + str(k)
		ab = ABClassifier()
		ab.download_cursors(limit_unlabeled = limit_1, limit_labeled = limit_1)
		ab.run_lsa(k=k)

		print "Starting classification..."

		unlabeled_cursor   = ab.db.tweets.find({"bullying_label" : {'$exists' :True}}, timeout = False)
		pos_cursor = ab.labeled_collection.find({"bully":True},timeout=False).limit(unlabeled_cursor.count())
		neg_cursor = ab.labeled_collection.find({"bully":False},timeout=False).limit(unlabeled_cursor.count())

		unl = []
		pos = []
		neg = []

		for u,p,n in zip(unlabeled_cursor, pos_cursor, neg_cursor):
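			# The listing truncates the loop body here. A plausible, purely
			# illustrative continuation (not the original code) just collects
			# the zipped documents into the three lists declared above:
			unl.append(u)
			pos.append(p)
			neg.append(n)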
			
Example #5
results = {}
for t in thresholds:
	results_obj = {}
	results_obj['t'] = t
	results_obj['pos_list_size'] = 0
	results_obj['neg_list_size'] = 0
	results_obj['true_pos'] = 0
	results_obj['true_neg'] = 0
	results_obj['true_pos_rate'] = 0
	results_obj['true_neg_rate'] = 0
	results_obj['accuracy'] = 0
	results_obj['num_iterations'] = 0
	results[int(t*100)] = results_obj


ab = ABClassifier()
ab.download_tweet_cursors(limit_unlabeled = 5000, limit_labeled = 5000)
ab.run_lsa(k=k)
context_tweets = [
"Literally go f**k yourself, because you're honestly pathetic.", 
"f**k you f****n w***e go f**k yourself stupid bitch",
"but this bad I want to kick her ass cuz she thinks she's a hard chola like sit ur fat ass down lol",
"course he did he's a whipped bitch that will say anything to make u happy, unlike ur mum who called u fat",
"Fat pig. You're disgusting.",
"From some illiterate online keyboard warrior? Go back to sucking your butt buddy's fat junk.",
"God Says; Evil Don't Know The Way. You are gay with AIDS & your sin cost you your anointing! This is why you use DUST!"
"f**k you stupid f****t f*g"
]


tweet_cvs = []
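
The script is cut off before the per-threshold counters above are updated and reduced to rates. A hedged sketch of that final bookkeeping step, using only the keys already initialised in results_obj (the helper name is mine, and the actual update logic is not shown in the excerpt):

def finalize(results_obj):
    # Illustrative only: turn the raw counters into rates, guarding
    # against empty lists. Keys are the ones initialised above.
    if results_obj['pos_list_size'] > 0:
        results_obj['true_pos_rate'] = float(results_obj['true_pos']) / results_obj['pos_list_size']
    if results_obj['neg_list_size'] > 0:
        results_obj['true_neg_rate'] = float(results_obj['true_neg']) / results_obj['neg_list_size']
    total = results_obj['pos_list_size'] + results_obj['neg_list_size']
    if total > 0:
        results_obj['accuracy'] = float(results_obj['true_pos'] + results_obj['true_neg']) / total
    return results_obj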
Example #6
    results_obj['k'] = k
    results_obj['pos_list_size'] = 0
    results_obj['neg_list_size'] = 0
    results_obj['true_pos'] = 0
    results_obj['true_neg'] = 0
    results_obj['true_pos_rate'] = 0
    results_obj['true_neg_rate'] = 0
    results_obj['accuracy'] = 0
    results_obj['num_iterations'] = 0
    results[k] = results_obj

for i in xrange(0, 20):
    for k in k_list:

        print "Running experiment for k = " + str(k)
        ab = ABClassifier()
        ab.download_cursors(limit_unlabeled=limit_1, limit_labeled=limit_1)
        ab.run_lsa(k=k)

        print "Starting classification..."

        unlabeled_cursor = ab.db.tweets.find({"bullying_label": {'$exists': True}}, timeout=False)
        pos_cursor = ab.labeled_collection.find({"bully": True}, timeout=False).limit(unlabeled_cursor.count())
        neg_cursor = ab.labeled_collection.find({"bully": False}, timeout=False).limit(unlabeled_cursor.count())
Example #7
thresholds = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85]
results = {}
for t in thresholds:
    results_obj = {}
    results_obj['t'] = t
    results_obj['pos_list_size'] = 0
    results_obj['neg_list_size'] = 0
    results_obj['true_pos'] = 0
    results_obj['true_neg'] = 0
    results_obj['true_pos_rate'] = 0
    results_obj['true_neg_rate'] = 0
    results_obj['accuracy'] = 0
    results_obj['num_iterations'] = 0
    results[int(t * 100)] = results_obj

ab = ABClassifier()
ab.download_cursors(limit_unlabeled=limit_1, limit_labeled=limit_1)
ab.run_lsa(k=k)
context_tweets = [
    "Literally go f**k yourself, because you're honestly pathetic.",
    "f**k you f****n w***e go f**k yourself stupid bitch",
    "but this bad I want to kick her ass cuz she thinks she's a hard chola like sit ur fat ass down lol",
    "course he did he's a whipped bitch that will say anything to make u happy, unlike ur mum who called u fat",
    "Fat pig. You're disgusting.",
    "From some illiterate online keyboard warrior? Go back to sucking your butt buddy's fat junk.",
    "God Says; Evil Don't Know The Way. You are gay with AIDS & your sin cost you your anointing! This is why you use DUST!"
    "f**k you stupid f****t f*g"
]

tweet_cvs = []
for c in context_tweets:
Example #8
    -Positive Examples, Negative Examples, Unlabeled
    -Vary the number of input tweets to the Co-Occurrence Matrix
"""

import sys
sys.path.append('../..')
from ABClassifier.ABClassifier import ABClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

save_location = '../../experiment_data/experiment_2'

k_list = [5, 10, 25, 50, 100, 150, 250, 500]

for k in k_list:
    ab = ABClassifier()
    ab.download_cursors(limit_unlabeled=1000, limit_labeled=1000)
    ab.run_lsa(k=k)
    ab.compute_context_vectors(save_location=save_location)

    print "Performing pairwise similarity measures..."

    pos_labeled_pws = cosine_similarity(ab.pos_labeled_cv_list).flatten()
    neg_labeled_pws = cosine_similarity(ab.neg_labeled_cv_list).flatten()
    unlabeled_pws = cosine_similarity(ab.unlabeled_cv_list).flatten()

    print "Done."

    print "Saving..."

    np.savetxt(save_location + '/pw_pos_' + str(k) + '.csv', pos_labeled_pws, delimiter=",")
Example #9
import sys
sys.path.append('..')
from ABClassifier.ABClassifier import ABClassifier
import numpy as np

ab = ABClassifier()

ab.download_cursors(limit_unlabeled=10000, limit_labeled=10000)
ab.run_lsa(k=100)
ab.compute_context_vectors()

pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list)
neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list)
unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list)

print "done getting pws"

x = np.array(pos_labeled_pws.values())
a = np.asarray(x)
np.savetxt('pos_labeled.csv', a, delimiter=",")

y = np.array(neg_labeled_pws.values())
b = np.asarray(y)
np.savetxt('neg_labeled.csv', b, delimiter=",")

z = np.array(unlabeled_pws.values())
c = np.asarray(z)
np.savetxt('unlabeled.csv', c, delimiter=",")
Example #10
    -Positive Examples, Negative Examples, Unlabeled
    -Vary the number of input tweets to the Co-Occurrence Matrix
    -Uses ONLY twitter data for training/validation

"""

import sys

sys.path.append('../..')
from ABClassifier.ABClassifier import ABClassifier
import numpy as np
import os

save_location = '../../experiment_data/experiment_9'

ab = ABClassifier()
ab.download_tweet_cursors(limit_unlabeled=2500, limit_labeled=2500)
ab.run_lsa(k=150)
ab.compute_context_vectors(save_location=save_location)

print "Performing pairwise similarity measures..."

pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list)
neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list)
unlabeled_pws = ab.pairwise_similarity(ab.unlabeled_cv_list)

print "Done."

print "Saving..."

x = np.array(pos_labeled_pws.values())
Example #11
"""


import sys
sys.path.append('../..')
from ABClassifier.ABClassifier import ABClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


save_location = '../../experiment_data/experiment_2'

k_list = [5, 10, 25, 50, 100, 150, 250, 500]

for k in k_list:
	ab = ABClassifier()
	ab.download_cursors(limit_unlabeled = 1000, limit_labeled = 1000)
	ab.run_lsa(k=k)
	ab.compute_context_vectors(save_location = save_location)

	print "Performing pairwise similarity measures..."

	pos_labeled_pws = cosine_similarity(ab.pos_labeled_cv_list).flatten()
	neg_labeled_pws = cosine_similarity(ab.neg_labeled_cv_list).flatten()
	unlabeled_pws   = cosine_similarity(ab.unlabeled_cv_list).flatten()

	print "Done."

	print "Saving..."

	np.savetxt(save_location + '/pw_pos_' + str(k) + '.csv', pos_labeled_pws, delimiter=",")
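
Once the per-k CSV files exist, the three similarity distributions can be compared offline; this is only a downstream suggestion, not part of the experiment script (file names follow the pattern written above, and np.savetxt stores the flattened similarities one value per line):

import numpy as np

for k in k_list:
    pw = np.loadtxt(save_location + '/pw_pos_' + str(k) + '.csv', delimiter=",")
    # Mean and spread of the pairwise cosine similarities for this k.
    print "k=" + str(k) + ": mean=" + str(pw.mean()) + " std=" + str(pw.std())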
Example #12
thresholds = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85]
results = {}
for t in thresholds:
	results_obj = {}
	results_obj['t'] = t
	results_obj['pos_list_size'] = 0
	results_obj['neg_list_size'] = 0
	results_obj['true_pos'] = 0
	results_obj['true_neg'] = 0
	results_obj['true_pos_rate'] = 0
	results_obj['true_neg_rate'] = 0
	results_obj['accuracy'] = 0
	results_obj['num_iterations'] = 0
	results[int(t*100)] = results_obj

ab = ABClassifier()
ab.download_tweet_cursors(limit_unlabeled = 2500, limit_labeled = 2500)
ab.run_lsa(k=k)


for i in xrange(0,1):
	for t in thresholds:

		print "Running experiment for t = " + str(t)
		print "Starting classification..."

		unlabeled_cursor   = ab.db.tweets.find({"bullying_label" : {'$exists' :True}}, timeout = False).limit(limit_1)
		pos_cursor = ab.labeled_collection.find({"bullying_label":"1"},timeout=False).skip(limit_1).limit(limit_2)
		neg_cursor = ab.labeled_collection.find({"bullying_label":"0"},timeout=False).skip(limit_1).limit(limit_2)

Example #13
thresholds = [0.5, 0.6, 0.7, 0.75, 0.8, 0.85]
results = {}
for t in thresholds:
    results_obj = {}
    results_obj['t'] = t
    results_obj['pos_list_size'] = 0
    results_obj['neg_list_size'] = 0
    results_obj['true_pos'] = 0
    results_obj['true_neg'] = 0
    results_obj['true_pos_rate'] = 0
    results_obj['true_neg_rate'] = 0
    results_obj['accuracy'] = 0
    results_obj['num_iterations'] = 0
    results[int(t * 100)] = results_obj

ab = ABClassifier()
ab.download_tweet_cursors(limit_unlabeled=2500, limit_labeled=2500)
ab.run_lsa(k=k)

for i in xrange(0, 1):
    for t in thresholds:

        print "Running experiment for t = " + str(t)
        print "Starting classification..."

        unlabeled_cursor = ab.db.tweets.find({"bullying_label": {'$exists': True}}, timeout=False).limit(limit_1)
Example #14
    -Positive Examples, Negative Examples, Unlabeled
    -Vary the number of input tweets to the Co-Occurrence Matrix
    -Uses ONLY twitter data for training/validation

"""


import sys
sys.path.append('../..')
from ABClassifier.ABClassifier import ABClassifier
import numpy as np
import os

save_location = '../../experiment_data/experiment_9'

ab = ABClassifier()
ab.download_tweet_cursors(limit_unlabeled = 2500, limit_labeled = 2500)
ab.run_lsa(k=150)
ab.compute_context_vectors(save_location = save_location)

print "Performing pairwise similarity measures..."

pos_labeled_pws = ab.pairwise_similarity(ab.pos_labeled_cv_list)
neg_labeled_pws = ab.pairwise_similarity(ab.neg_labeled_cv_list)
unlabeled_pws   = ab.pairwise_similarity(ab.unlabeled_cv_list)

print "Done."


print "Saving..."