Example 1
def classify_with_existing_model(filepath, examples):
    #Python 2 hack: make UTF-8 the default string encoding
    reload(sys)
    sys.setdefaultencoding("utf-8")

    #read in settings: label on line 1, six space-separated feature flags on line 2
    f = open(filepath + 'settings.txt', 'r')
    label = f.readline().strip()
    feats = f.readline().strip().split(' ')
    f.close()
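    #a hypothetical settings.txt (values are placeholders), matching the
    #feats[0..5] unpacking below:
    #    loss
    #    True True False False True 200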

    #get label sets
    aggress, loss = get_label_sets()

    #set params, features based on input variables
    if label == 'loss': sought_label = loss
    elif label == 'aggress': sought_label = aggress
    elif label == 'loss_aggress': sought_label = loss + aggress
    else: sought_label = label
    print sought_label
    class_weights = 'balanced'
    top_k = feats[5]

    _unigrams = feats[0]
    _bigrams = feats[1]

    _postags = feats[2]
    #note: settings values are read in as strings, so any non-empty value
    #(including '0' or 'False') counts as truthy here
    if _postags:
        pos_tagger = train_tagger()
    else:
        pos_tagger = None

    _description = feats[3]

    _emotion = feats[4]

    description = []
    tweets = []
    index = 0

    X = get_feats(examples, _unigrams=_unigrams, _bigrams=_bigrams, _postags=_postags, pos_tagger=pos_tagger, _emotion=_emotion, description=description)

    #vectorize features
    #caution: fitting a fresh DictVectorizer builds a new feature space, which
    #may not line up with the one the saved selector/model was trained on
    v = DictVectorizer()
    X_test = v.fit_transform(X).todense().tolist()

    #score on existing SVM
    fs = joblib.load(filepath+'selection.pkl')
    clf = joblib.load(filepath+'model.pkl')
    X_test = fs.transform(X_test)
    predictions = clf.predict(X_test)
    #use log prob predict instead to get likelihood instead of binary classification?
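    #a minimal sketch, assuming the saved estimator exposes the standard
    #scikit-learn scoring APIs (not guaranteed by anything in this snippet):
    #    scores = clf.decision_function(X_test)  #signed distances to the margin
    #    probs = clf.predict_proba(X_test)       #only if fit with probability=True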

    return predictions
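
A hypothetical invocation, assuming a model directory that holds settings.txt, selection.pkl, and model.pkl as read above; the path and tweets are placeholders:

examples = ['tweet text to score', 'another tweet']
predictions = classify_with_existing_model('models/loss_v1/', examples)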
Example 2
def classify(train_file,
             test_file,
             model,
             label,
             feats,
             pos_tagger=None,
             C=False,
             svm_loss=False,
             n=False,
             verbose=True):
    reload(sys)
    sys.setdefaultencoding("utf-8")

    print 'in classify()'

    #start timer
    start_time = time.time()

    #get label sets
    aggress, loss = get_label_sets()

    #set params, features based on input variables
    if label == 'loss': sought_label = loss
    elif label == 'aggress': sought_label = aggress
    elif label == 'loss_aggress': sought_label = loss + aggress
    else: sought_label = label
    print sought_label
    class_weights = 'balanced'
    top_k = feats[5]

    _unigrams = feats[0]
    _bigrams = feats[1]

    _postags = feats[2]

    _description = feats[3]

    _emotion = feats[4]

    description = []
    tweets = []
    index = 0

    #load in training tweets
    tweets = read_in_data(train_file)  #[:2000]
    len_training = len(tweets)
    #load in test tweets
    tweets = tweets + read_in_data(test_file)

    # get features
    if verbose:
        print 'Obtaining features...'

    X_o = [i[0] for i in tweets]

    # TESTING PURPOSES ONLY
    #    X_o = X_o [:100]
    #    len_training = 80

    X_train = X_o[:len_training]
    X_test = X_o[len_training:]

    #    print X_train

    X_train = get_feats(X_train,
                        _unigrams=_unigrams,
                        _bigrams=_bigrams,
                        _postags=_postags,
                        pos_tagger=pos_tagger,
                        _emotion=_emotion,
                        description=description)
    #    wfeats = get_wfeats(X_o)

    #vectorize features
    if verbose:
        print 'Vectorizing...'

    v = DictVectorizer()

    X_train = v.fit_transform(X_train)  #.toarray()

    print X_train.shape
    print X_train.getnnz()
    #    exit(0)

    #    print v.feature_names_

    X_test = get_feats(X_test,
                       _unigrams=_unigrams,
                       _bigrams=_bigrams,
                       _postags=_postags,
                       pos_tagger=pos_tagger,
                       _emotion=_emotion,
                       description=description,
                       feature_names=v.feature_names_)

    X_test = v.transform(X_test)  #.toarray()

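    #three-class target encoding used below: 0 = loss, 2 = aggress, 1 = neither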
    y = []
    for t in tweets:
        # -------TERRA'S ORIGINAL CODE
        #        if len([1 for s in sought_label if s in t[1]])>0: y.append(1)
        #        else: y.append(0)
        if len([1 for s in loss if s in t[1]]) > 0: y.append(0)
        elif len([1 for s in aggress if s in t[1]]) > 0: y.append(2)
        else: y.append(1)
    y_train = y[:len_training]
    y_test = y[len_training:]
    #    print len(y_test)
    #    print len(y_train)

    # split
    if verbose:
        print 'Splitting...'

#    X_train = v.fit_transform(X_train).toarray()
#    X_test = v.transform(X_test).toarray()

    #persist the fitted vectorizer (binary mode so pickle round-trips cleanly)
    with open('terra_dictvectorizer.pkl', 'wb') as pf:
        pickle.dump(v, pf)

    #    X_new = []
    #
    #    # add word embedding features
    #    if verbose:
    #        print 'Adding word embeddings...'
    #
    #    for i in range(len(X_vec)):
    #        x = X_vec[i]
    ##        print x
    #        x = numpy.concatenate((x, wfeats[0][i]))
    ##        x = numpy.concatenate((x, wfeats[1][i]))
    #        X_new.append(x)
    #
    #    #X_vec = numpy.array(X_new)

    # Terra's original code
    ##    X_train = X_vec[:len_training] #X_new
    #    X_test = X_vec[len_training:] #X_new

    #train, score on SVM
    predictions, threat_p, threat_r, nthreat_p, nthreat_r, sthreat_p, sthreat_r = modeling(
        X_train,
        X_test,
        y_train,
        y_test,
        top_k,
        v,
        model=model,
        C=C,
        svm_loss=svm_loss,
        n=n)

    #    for i in range(len(predictions)):
    #        print tweets[i] + ':\t' + str(predictions[i]) + '; ' + str(y_test[i])

    #output
    end_time = time.time()
    print 'time elapsed: ' + str(end_time - start_time) + ' seconds.'

    return (threat_p, threat_r, fscore(threat_p, threat_r),
            nthreat_p, nthreat_r, fscore(nthreat_p, nthreat_r),
            sthreat_p, sthreat_r, fscore(sthreat_p, sthreat_r),
            predictions)
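
The return tuple interleaves precision, recall, and F1 per class. fscore() is defined elsewhere in this repo; a minimal sketch, assuming it is the usual harmonic mean of precision and recall:

def fscore(p, r):
    #F1 = 2pr / (p + r), guarding the degenerate all-zero case
    if p + r == 0:
        return 0.0
    return 2 * p * r / float(p + r)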
Example 3
    def importance(self, tweets, num_indicators):
        no_fly = ['�e_',
                  '“USER_HANDLE:']  #things we don't want to use as indicators

        aggress, loss = get_label_sets()
        aggress_tf, loss_tf, other_tf = {}, {}, {}
        aggress_df, loss_df, other_df = {}, {}, {}
        vocab = set()

        #get vocab
        for t in tweets:
            vocab.update(tweet_preprocessing(t[0]))

        #initialize tf, df scores
        for v in vocab:
            aggress_tf[v], loss_tf[v], other_tf[v] = 0.0, 0.0, 0.0
            aggress_df[v], loss_df[v], other_df[v] = 0.0, 0.0, 0.0

        #get term, doc counts for vocab on each label (aggress is checked
        #first, so it takes priority over loss)
        for tweet in tweets:
            found = False
            for l in aggress:
                if l in tweet[1].lower() and not found:
                    found = True
                    aggress_tf = self.add_term_frequencies(
                        tweet[0], aggress_tf)
                    aggress_df = self.add_doc_frequencies(tweet[0], aggress_df)
            for l in loss:
                if l in tweet[1].lower() and not found:
                    found = True
                    loss_tf = self.add_term_frequencies(tweet[0], loss_tf)
                    loss_df = self.add_doc_frequencies(tweet[0], loss_df)
            if not found:
                other_tf = self.add_term_frequencies(tweet[0], other_tf)
                other_df = self.add_doc_frequencies(tweet[0], other_df)

        #normalization counts (default zero totals to 1 so the divisions
        #below cannot raise ZeroDivisionError)
        aggress_total_tf = sum([aggress_tf[v] for v in vocab])
        if aggress_total_tf == 0: aggress_total_tf = 1
        loss_total_tf = sum([loss_tf[v] for v in vocab])
        if loss_total_tf == 0: loss_total_tf = 1
        other_total_tf = sum([other_tf[v] for v in vocab])
        if other_total_tf == 0: other_total_tf = 1

        aggress_total_df = sum([aggress_df[v] for v in vocab])
        if aggress_total_df == 0: aggress_total_df = 1
        loss_total_df = sum([loss_df[v] for v in vocab])
        if loss_total_df == 0: loss_total_df = 1
        other_total_df = sum([other_df[v] for v in vocab])
        if other_total_df == 0: other_total_df = 1

        #calculate aggress, loss, other importance scores:
        #score = (tf_pos/total_tf_pos) * (df_pos/total_df_pos) - (tf_neg/total_tf_neg) * (df_neg/total_df_neg)

        aggress_scores, loss_scores, other_scores = {}, {}, {}
        for v in vocab:
            #aggress importance score
            aggress_scores[v] = (aggress_tf[v] / aggress_total_tf) * (
                aggress_df[v] / aggress_total_df)
            aggress_scores[v] -= ((loss_tf[v] + other_tf[v]) /
                                  (loss_total_tf + other_total_tf)) * (
                                      (loss_df[v] + other_df[v]) /
                                      (loss_total_df + other_total_df))

            #loss importance score
            loss_scores[v] = (loss_tf[v] / loss_total_tf) * (loss_df[v] /
                                                             loss_total_df)
            loss_scores[v] -= ((aggress_tf[v] + other_tf[v]) /
                               (aggress_total_tf + other_total_tf)) * (
                                   (aggress_df[v] + other_df[v]) /
                                   (aggress_total_df + other_total_df))

            #other importance score
            other_scores[v] = (other_tf[v] / other_total_tf) * (other_df[v] /
                                                                other_total_df)
            other_scores[v] -= ((loss_tf[v] + aggress_tf[v]) /
                                (loss_total_tf + aggress_total_tf)) * (
                                    (loss_df[v] + aggress_df[v]) /
                                    (loss_total_df + aggress_total_df))

        #choose indicators based on importance score
        sorted_aggress = [
            k[0] for k in sorted(aggress_scores.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True) if k[0] not in no_fly
        ]
        sorted_loss = [
            k[0] for k in sorted(
                loss_scores.items(), key=operator.itemgetter(1), reverse=True)
            if k[0] not in no_fly
        ]
        sorted_other = [
            k[0] for k in sorted(
                other_scores.items(), key=operator.itemgetter(1), reverse=True)
            if k[0] not in no_fly
        ]

        sorted_aggress, sorted_loss, sorted_other = self.no_conficting_indicators(
            num_indicators, sorted_aggress, sorted_loss, sorted_other)

        for s in sorted_aggress[:num_indicators]:
            print 'aggress ' + str(s)  #TESTING
        for s in sorted_loss[:num_indicators]:
            print 'loss ' + str(s)  #TESTING
        for s in sorted_other[:num_indicators]:
            print 'other ' + str(s)  #TESTING

        return (sorted_aggress[:num_indicators],
                sorted_loss[:num_indicators],
                sorted_other[:num_indicators])
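
A toy computation of the importance score with made-up counts: suppose a term has aggress_tf = 4 of a 100-count total and aggress_df = 2 of a 10-count total, against combined loss+other counts of tf = 1/200 and df = 1/20:

pos = (4 / 100.0) * (2 / 10.0)   #0.008
neg = (1 / 200.0) * (1 / 20.0)   #0.00025
print pos - neg                  #0.00775: strongly aggress-leaning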
Example 4
    def indicators_with_tfidf(self, tweets, num_indicators):
        no_fly = ['�e_',
                  '“USER_HANDLE:']  #things we don't want to use as indicators

        aggress, loss = get_label_sets()
        aggress_counts, loss_counts, other_counts = {}, {}, {}

        #get counts for each label (aggress is checked first, so it takes
        #priority over loss)
        for tweet in tweets:
            found = False
            for l in aggress:
                if l in tweet[1].lower() and not found:
                    found = True
                    aggress_counts = self.add_term_frequencies(
                        tweet[0], aggress_counts)
            for l in loss:
                if l in tweet[1].lower() and not found:
                    found = True
                    loss_counts = self.add_term_frequencies(
                        tweet[0], loss_counts)
            if not found:
                other_counts = self.add_term_frequencies(
                    tweet[0], other_counts)

        #get tf-idf scores, treating each of the 3 label pools as one "document":
        #idf = log(3 / doc_count), so a term found in all three pools scores 0
        aggress_tfidf, loss_tfidf, other_tfidf = {}, {}, {}
        for key in aggress_counts.keys():
            doc_count = 1
            if key in loss_counts.keys(): doc_count += 1
            if key in other_counts.keys(): doc_count += 1
            aggress_tfidf[key] = aggress_counts[key] * (math.log(
                3 / float(doc_count)))

        for key in loss_counts.keys():
            doc_count = 1
            if key in aggress_counts.keys(): doc_count += 1
            if key in other_counts.keys(): doc_count += 1
            loss_tfidf[key] = loss_counts[key] * (math.log(
                3 / float(doc_count)))

        for key in other_counts.keys():
            doc_count = 1
            if key in aggress_counts.keys(): doc_count += 1
            if key in loss_counts.keys(): doc_count += 1
            other_tfidf[key] = other_counts[key] * (math.log(
                3 / float(doc_count)))

        #choose indicators based on tf-idf
        sorted_aggress = [
            k[0] for k in sorted(aggress_tfidf.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True) if k[0] not in no_fly
        ]
        sorted_loss = [
            k[0] for k in sorted(
                loss_tfidf.items(), key=operator.itemgetter(1), reverse=True)
            if k[0] not in no_fly
        ]
        sorted_other = [
            k[0] for k in sorted(
                other_tfidf.items(), key=operator.itemgetter(1), reverse=True)
            if k[0] not in no_fly
        ]

        #		if _only_emojis:
        #			try:
        #			# UCS-4
        #				patt = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
        #			except re.error:
        #			# UCS-2
        #				patt = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
        #			sorted_aggress = [s for s in sorted_aggress if patt.match(s.decode('utf-8'))]
        #			sorted_loss = [s for s in sorted_loss if patt.match(s.decode('utf-8'))]
        #			sorted_other = [s for s in sorted_other if patt.match(s.decode('utf-8'))]

        sorted_aggress, sorted_loss, sorted_other = self.no_conficting_indicators(
            num_indicators, sorted_aggress, sorted_loss, sorted_other)

        for s in sorted_aggress[:num_indicators]:
            print 'aggress ' + str(s)  #TESTING
        for s in sorted_loss[:num_indicators]:
            print 'loss ' + str(s)  #TESTING
        for s in sorted_other[:num_indicators]:
            print 'other ' + str(s)  #TESTING

        return (sorted_aggress[:num_indicators],
                sorted_loss[:num_indicators],
                sorted_other[:num_indicators])
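
With only three "documents", idf = log(3 / doc_count) for doc_count in {1, 2, 3}, so pool-specific terms keep full weight and ubiquitous ones vanish:

import math
print math.log(3 / 1.0)   #~1.099: term unique to one pool
print math.log(3 / 2.0)   #~0.405: term shared by two pools
print math.log(3 / 3.0)   #0.0: term in all three pools scores nothing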
Example 5
def classify(train_file, test_file, model, label, feats, pos_tagger=None, C=False, svm_loss=False, n=False):
    reload(sys)
    sys.setdefaultencoding("utf-8")

    #start timer
    start_time = time.time()

    #get label sets
    aggress, loss = get_label_sets()

    #set params, features based on input variables
    if label == 'loss': sought_label = loss
    elif label == 'aggress': sought_label = aggress
    elif label == 'loss_aggress': sought_label = loss + aggress
    else: sought_label = label
    print sought_label
    class_weights = 'balanced'
    top_k = feats[5]

    _unigrams = feats[0]
    _bigrams = feats[1]

    _postags = feats[2]

    _description = feats[3]

    _emotion = feats[4]

    description = []
    tweets = []
    index = 0

    #load in training tweets
    tweets = read_in_data(train_file)
    len_training = len(tweets)
    #load in test tweets
    tweets = tweets + read_in_data(test_file)

    # get features
    X = [i[0] for i in tweets]
    X = get_feats(X, _unigrams=_unigrams, _bigrams=_bigrams, _postags=_postags, pos_tagger=pos_tagger, _emotion=_emotion, description=description)

    y = []
    for t in tweets:
        if len([1 for s in sought_label if s in t[1]])>0: y.append(1)
        else: y.append(0)
    y_train = y[:len_training]
    y_test = y[len_training:]
    print len(y_test)
    print len(y_train)

    #vectorize features
    v = DictVectorizer()
    X_vec = v.fit_transform(X)
    X_train = X_vec[:len_training]
    X_test = X_vec[len_training:]


    #train, score on SVM
    predictions, threat_p, threat_r, nthreat_p, nthreat_r = modeling(X_train, X_test, y_train, y_test, top_k, v, model=model, C=C, svm_loss=svm_loss, n=n)

    #output
    end_time = time.time()
    print 'time elapsed: '+str(end_time-start_time)+' seconds.'

    return (threat_p, threat_r, fscore(threat_p, threat_r),
            nthreat_p, nthreat_r, fscore(nthreat_p, nthreat_r),
            predictions)
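
A hypothetical call, assuming the feats flags follow the unpacking order above (unigrams, bigrams, POS tags, description, emotion, top_k) and that modeling() accepts 'svm' as a model name; the paths are placeholders:

results = classify('data/_train/train.csv', 'data/_test/test.csv',
                   'svm', 'aggress', [True, True, False, False, True, 500])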
Example 6
import sys
sys.path.append('code/tools')
from tools import get_label_sets, get_other_labels, tweet_preprocessing
import csv
import re

data_file = 'data/_train/train_full.csv'
out_file = 'data/_train/train_full_no_rip.csv'
f = open(data_file, 'rU')
reader = csv.DictReader(f)

aggress, loss = get_label_sets()

w = open(out_file, 'wb')
writer = csv.DictWriter(w,
                        ['AUTHOR', 'CONTENT', 'LABEL', 'DATE', 'URL', 'DESC'])

writer.writeheader()
for row in reader:
    label = row['LABEL'].lower()
    d = {
        'AUTHOR': row['AUTHOR'],
        'CONTENT': row['CONTENT'],
        'LABEL': label,
        'DATE': row['DATE'],
        'URL': row['URL'],
        'DESC': row['DESC']
    }

    tokens = tweet_preprocessing(row['CONTENT'].lower())
    good_tokens = []