def classify_with_existing_model(filepath, examples):
    reload(sys)
    sys.setdefaultencoding("utf-8")

    #read in settings
    f = open(filepath + 'settings.txt', 'r')
    label = f.readline().strip()
    feats = f.readline().strip().split(' ')
    f.close()

    #get label sets
    aggress, loss = get_label_sets()

    #set params, features based on input variables
    if label == 'loss': sought_label = loss
    elif label == 'aggress': sought_label = aggress
    elif label == 'loss_aggress': sought_label = loss + aggress
    else: sought_label = label
    print sought_label

    class_weights = 'balanced'
    #NOTE: feature flags are read from settings.txt as strings
    top_k = feats[5]
    _unigrams = feats[0]
    _bigrams = feats[1]
    _postags = feats[2]
    if _postags: pos_tagger = train_tagger()
    else: pos_tagger = None
    _description = feats[3]
    _emotion = feats[4]

    description = []
    tweets = []
    index = 0

    X = get_feats(examples, _unigrams=_unigrams, _bigrams=_bigrams, _postags=_postags,
                  pos_tagger=pos_tagger, _emotion=_emotion, description=description)

    #vectorize features
    v = DictVectorizer()
    X_test = v.fit_transform(X).todense().tolist()

    #score on existing SVM
    fs = joblib.load(filepath + 'selection.pkl')
    clf = joblib.load(filepath + 'model.pkl')
    X_test = fs.transform(X_test)
    predictions = clf.predict(X_test)
    #use log prob predict instead to get likelihood instead of binary classification?

    return predictions

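
# Hedged usage sketch, not part of the original module: it assumes the
# surrounding file provides the names used above (get_feats, train_tagger,
# get_label_sets, DictVectorizer, joblib) and that an earlier training run
# wrote settings.txt, selection.pkl, and model.pkl into the model directory.
# The directory name and example tweets below are invented.
if __name__ == '__main__':
    example_tweets = ['free my brother', 'on the block tonight']
    preds = classify_with_existing_model('saved_models/loss_aggress/', example_tweets)
    print preds  # one predicted class per input example
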
def classify(train_file, test_file, model, label, feats, pos_tagger=None, C=False,
             svm_loss=False, n=False, verbose=True):
    reload(sys)
    sys.setdefaultencoding("utf-8")
    print 'in classify()'

    #start timer
    start_time = time.time()

    #get label sets
    aggress, loss = get_label_sets()

    #set params, features based on input variables
    if label == 'loss': sought_label = loss
    elif label == 'aggress': sought_label = aggress
    elif label == 'loss_aggress': sought_label = loss + aggress
    else: sought_label = label
    print sought_label

    class_weights = 'balanced'
    top_k = feats[5]
    _unigrams = feats[0]
    _bigrams = feats[1]
    _postags = feats[2]
    _description = feats[3]
    _emotion = feats[4]

    description = []
    tweets = []
    index = 0

    #load in training tweets
    tweets = read_in_data(train_file)  #[:2000]
    len_training = len(tweets)

    #load in test tweets
    tweets = tweets + read_in_data(test_file)

    #get features
    if verbose: print 'Obtaining features...'
    X_o = [i[0] for i in tweets]
    # TESTING PURPOSES ONLY
    # X_o = X_o[:100]
    # len_training = 80
    X_train = X_o[:len_training]
    X_test = X_o[len_training:]
    X_train = get_feats(X_train, _unigrams=_unigrams, _bigrams=_bigrams, _postags=_postags,
                        pos_tagger=pos_tagger, _emotion=_emotion, description=description)
    # wfeats = get_wfeats(X_o)

    #vectorize features
    if verbose: print 'Vectorizing...'
    v = DictVectorizer()
    X_train = v.fit_transform(X_train)  #.toarray()
    print X_train.shape
    print X_train.getnnz()

    X_test = get_feats(X_test, _unigrams=_unigrams, _bigrams=_bigrams, _postags=_postags,
                       pos_tagger=pos_tagger, _emotion=_emotion, description=description,
                       feature_names=v.feature_names_)
    X_test = v.transform(X_test)  #.toarray()

    #label tweets: 0 = loss, 2 = aggress, 1 = other
    y = []
    for t in tweets:
        # -------TERRA'S ORIGINAL CODE (binary threat/non-threat labels)
        # if len([1 for s in sought_label if s in t[1]]) > 0: y.append(1)
        # else: y.append(0)
        if len([1 for s in loss if s in t[1]]) > 0: y.append(0)
        elif len([1 for s in aggress if s in t[1]]) > 0: y.append(2)
        else: y.append(1)
    y_train = y[:len_training]
    y_test = y[len_training:]

    #split
    if verbose: print 'Splitting...'
    pickle.dump(v, open('terra_dictvectorizer.pkl', 'w'))
    # An earlier version concatenated word-embedding features (wfeats) onto each
    # vectorized row here before splitting X_vec back into train and test sets.

    #train, score on SVM
    predictions, threat_p, threat_r, nthreat_p, nthreat_r, sthreat_p, sthreat_r = modeling(
        X_train, X_test, y_train, y_test, top_k, v, model=model, C=C, svm_loss=svm_loss, n=n)

    #output
    end_time = time.time()
    print 'time elapsed: ' + str(end_time - start_time) + ' seconds.'

    return threat_p, threat_r, fscore(threat_p, threat_r), \
        nthreat_p, nthreat_r, fscore(nthreat_p, nthreat_r), \
        sthreat_p, sthreat_r, fscore(sthreat_p, sthreat_r), predictions

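
# Hedged usage sketch, not in the original file: the paths, the model argument,
# and the feature flags below are illustrative only. Per the indexing in
# classify(), feats is [unigrams, bigrams, postags, description, emotion, top_k];
# the call returns precision, recall, and F-score triples for the three classes
# scored in modeling(), followed by the raw predictions.
if __name__ == '__main__':
    feats = [True, True, False, False, True, 700]  # illustrative flags and top_k
    (threat_p, threat_r, threat_f,
     nthreat_p, nthreat_r, nthreat_f,
     sthreat_p, sthreat_r, sthreat_f,
     predictions) = classify('data/_train/train_full.csv',
                             'data/_test/test_full.csv',  # hypothetical test split
                             'svm', 'loss_aggress', feats)
    print threat_f, nthreat_f, sthreat_f
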
def importance(self, tweets, num_indicators):
    no_fly = ['�e_', '“USER_HANDLE:']  #things we don't want to use as indicators
    aggress, loss = get_label_sets()
    aggress_tf, loss_tf, other_tf = {}, {}, {}
    aggress_df, loss_df, other_df = {}, {}, {}
    vocab = set()

    #get vocab
    for t in tweets:
        vocab.update(tweet_preprocessing(t[0]))

    #initialize tf, df scores
    for v in vocab:
        aggress_tf[v], loss_tf[v], other_tf[v] = 0.0, 0.0, 0.0
        aggress_df[v], loss_df[v], other_df[v] = 0.0, 0.0, 0.0

    #get term, doc counts for vocab on each label
    for tweet in tweets:
        found = False
        for l in aggress:
            if l in tweet[1].lower() and found == False:
                found = True
                aggress_tf = self.add_term_frequencies(tweet[0], aggress_tf)
                aggress_df = self.add_doc_frequencies(tweet[0], aggress_df)
        for l in loss:
            if l in tweet[1].lower() and found == False:
                found = True
                loss_tf = self.add_term_frequencies(tweet[0], loss_tf)
                loss_df = self.add_doc_frequencies(tweet[0], loss_df)
        if found == False:
            other_tf = self.add_term_frequencies(tweet[0], other_tf)
            other_df = self.add_doc_frequencies(tweet[0], other_df)

    #normalization counts
    aggress_total_tf = sum([aggress_tf[v] for v in vocab])
    loss_total_tf = sum([loss_tf[v] for v in vocab])
    other_total_tf = sum([other_tf[v] for v in vocab])
    aggress_total_df = sum([aggress_df[v] for v in vocab])
    if aggress_total_df == 0: aggress_total_df = 1
    loss_total_df = sum([loss_df[v] for v in vocab])
    if loss_total_df == 0: loss_total_df = 1
    other_total_df = sum([other_df[v] for v in vocab])
    if other_total_df == 0: other_total_df = 1

    #calculate aggress, loss, other importance scores:
    #(normalized tf_pos * normalized df_pos) - (normalized tf_neg * normalized df_neg)
    aggress_scores, loss_scores, other_scores = {}, {}, {}
    for v in vocab:
        #aggress importance score
        aggress_scores[v] = (aggress_tf[v] / aggress_total_tf) * (aggress_df[v] / aggress_total_df)
        aggress_scores[v] -= ((loss_tf[v] + other_tf[v]) / (loss_total_tf + other_total_tf)) * \
            ((loss_df[v] + other_df[v]) / (loss_total_df + other_total_df))
        #loss importance score
        loss_scores[v] = (loss_tf[v] / loss_total_tf) * (loss_df[v] / loss_total_df)
        loss_scores[v] -= ((aggress_tf[v] + other_tf[v]) / (aggress_total_tf + other_total_tf)) * \
            ((aggress_df[v] + other_df[v]) / (aggress_total_df + other_total_df))
        #other importance score
        other_scores[v] = (other_tf[v] / other_total_tf) * (other_df[v] / other_total_df)
        other_scores[v] -= ((loss_tf[v] + aggress_tf[v]) / (loss_total_tf + aggress_total_tf)) * \
            ((loss_df[v] + aggress_df[v]) / (loss_total_df + aggress_total_df))

    #choose indicators based on importance score
    sorted_aggress = [k[0] for k in sorted(aggress_scores.items(), key=operator.itemgetter(1),
                                           reverse=True) if k[0] not in no_fly]
    sorted_loss = [k[0] for k in sorted(loss_scores.items(), key=operator.itemgetter(1),
                                        reverse=True) if k[0] not in no_fly]
    sorted_other = [k[0] for k in sorted(other_scores.items(), key=operator.itemgetter(1),
                                         reverse=True) if k[0] not in no_fly]

    sorted_aggress, sorted_loss, sorted_other = self.no_conficting_indicators(
        num_indicators, sorted_aggress, sorted_loss, sorted_other)

    for s in sorted_aggress[:num_indicators]:
        print 'aggress ' + str(s)  #TESTING
    for s in sorted_loss[:num_indicators]:
        print 'loss ' + str(s)  #TESTING
    for s in sorted_other[:num_indicators]:
        print 'other ' + str(s)  #TESTING

    return sorted_aggress[:num_indicators], sorted_loss[:num_indicators], sorted_other[:num_indicators]

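
# Worked example (invented counts, not from the repo) of the aggress importance
# score computed above: if a term has aggress_tf = 4 of aggress_total_tf = 20 and
# aggress_df = 3 of aggress_total_df = 10, while the pooled loss+other groups give
# it tf 1 of 40 and df 1 of 20, its score is
#   (4/20.0) * (3/10.0) - (1/40.0) * (1/20.0) = 0.06 - 0.00125 = 0.05875,
# so terms concentrated in aggress tweets float to the top of sorted_aggress.
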
def indicators_with_tfidf(self, tweets, num_indicators):
    no_fly = ['�e_', '“USER_HANDLE:']  #things we don't want to use as indicators
    aggress, loss = get_label_sets()
    aggress_counts, loss_counts, other_counts = {}, {}, {}

    #get counts for each label
    for tweet in tweets:
        found = False
        for l in aggress:
            if l in tweet[1].lower() and found == False:
                found = True
                aggress_counts = self.add_term_frequencies(tweet[0], aggress_counts)
        for l in loss:
            if l in tweet[1].lower() and found == False:
                found = True
                loss_counts = self.add_term_frequencies(tweet[0], loss_counts)
        if found == False:
            other_counts = self.add_term_frequencies(tweet[0], other_counts)

    #get tf-idf scores for each of the 3 groups
    aggress_tfidf, loss_tfidf, other_tfidf = {}, {}, {}
    for key in aggress_counts.keys():
        doc_count = 1
        if key in loss_counts.keys(): doc_count += 1
        if key in other_counts.keys(): doc_count += 1
        aggress_tfidf[key] = aggress_counts[key] * (math.log(3 / float(doc_count)))
    for key in loss_counts.keys():
        doc_count = 1
        if key in aggress_counts.keys(): doc_count += 1
        if key in other_counts.keys(): doc_count += 1
        loss_tfidf[key] = loss_counts[key] * (math.log(3 / float(doc_count)))
    for key in other_counts.keys():
        doc_count = 1
        if key in aggress_counts.keys(): doc_count += 1
        if key in loss_counts.keys(): doc_count += 1
        other_tfidf[key] = other_counts[key] * (math.log(3 / float(doc_count)))

    #choose indicators based on tf-idf
    sorted_aggress = [k[0] for k in sorted(aggress_tfidf.items(), key=operator.itemgetter(1),
                                           reverse=True) if k[0] not in no_fly]
    sorted_loss = [k[0] for k in sorted(loss_tfidf.items(), key=operator.itemgetter(1),
                                        reverse=True) if k[0] not in no_fly]
    sorted_other = [k[0] for k in sorted(other_tfidf.items(), key=operator.itemgetter(1),
                                         reverse=True) if k[0] not in no_fly]

    # if _only_emojis:
    #     try:
    #         # UCS-4
    #         patt = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    #     except re.error:
    #         # UCS-2
    #         patt = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])')
    #     sorted_aggress = [s for s in sorted_aggress if patt.match(s.decode('utf-8'))]
    #     sorted_loss = [s for s in sorted_loss if patt.match(s.decode('utf-8'))]
    #     sorted_other = [s for s in sorted_other if patt.match(s.decode('utf-8'))]

    sorted_aggress, sorted_loss, sorted_other = self.no_conficting_indicators(
        num_indicators, sorted_aggress, sorted_loss, sorted_other)

    for s in sorted_aggress[:num_indicators]:
        print 'aggress ' + str(s)  #TESTING
    for s in sorted_loss[:num_indicators]:
        print 'loss ' + str(s)  #TESTING
    for s in sorted_other[:num_indicators]:
        print 'other ' + str(s)  #TESTING

    return sorted_aggress[:num_indicators], sorted_loss[:num_indicators], sorted_other[:num_indicators]

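
# Worked example (invented counts) of the idf factor above: the "corpus" is just
# three documents -- the pooled aggress, loss, and other tweets -- so a term seen
# in only one group is weighted count * log(3/1) ~= count * 1.10, a term seen in
# two groups count * log(3/2) ~= count * 0.41, and a term seen in all three groups
# count * log(3/3) = 0, which pushes it to the bottom of the ranking.
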
def classify(train_file, test_file, model, label, feats, pos_tagger=None, C=False,
             svm_loss=False, n=False):
    reload(sys)
    sys.setdefaultencoding("utf-8")

    #start timer
    start_time = time.time()

    #get label sets
    aggress, loss = get_label_sets()

    #set params, features based on input variables
    if label == 'loss': sought_label = loss
    elif label == 'aggress': sought_label = aggress
    elif label == 'loss_aggress': sought_label = loss + aggress
    else: sought_label = label
    print sought_label

    class_weights = 'balanced'
    top_k = feats[5]
    _unigrams = feats[0]
    _bigrams = feats[1]
    _postags = feats[2]
    _description = feats[3]
    _emotion = feats[4]

    description = []
    tweets = []
    index = 0

    #load in training tweets
    tweets = read_in_data(train_file)
    len_training = len(tweets)

    #load in test tweets
    tweets = tweets + read_in_data(test_file)

    #get features
    X = [i[0] for i in tweets]
    X = get_feats(X, _unigrams=_unigrams, _bigrams=_bigrams, _postags=_postags,
                  pos_tagger=pos_tagger, _emotion=_emotion, description=description)

    y = []
    for t in tweets:
        if len([1 for s in sought_label if s in t[1]]) > 0: y.append(1)
        else: y.append(0)
    y_train = y[:len_training]
    y_test = y[len_training:]
    print len(y_test)
    print len(y_train)

    #vectorize features
    v = DictVectorizer()
    X_vec = v.fit_transform(X)
    X_train = X_vec[:len_training]
    X_test = X_vec[len_training:]

    #train, score on SVM
    predictions, threat_p, threat_r, nthreat_p, nthreat_r = modeling(
        X_train, X_test, y_train, y_test, top_k, v, model=model, C=C, svm_loss=svm_loss, n=n)

    #output
    end_time = time.time()
    print 'time elapsed: ' + str(end_time - start_time) + ' seconds.'

    return threat_p, threat_r, fscore(threat_p, threat_r), \
        nthreat_p, nthreat_r, fscore(nthreat_p, nthreat_r), predictions

import sys
sys.path.append('code/tools')
from tools import get_label_sets, get_other_labels, tweet_preprocessing
import csv
import re

data_file = 'data/_train/train_full.csv'
out_file = 'data/_train/train_full_no_rip.csv'

f = open(data_file, 'rU')
reader = csv.DictReader(f)
aggress, loss = get_label_sets()

w = open(out_file, 'wb')
writer = csv.DictWriter(w, ['AUTHOR', 'CONTENT', 'LABEL', 'DATE', 'URL', 'DESC'])
writer.writeheader()

for row in reader:
    label = row['LABEL'].lower()
    d = {
        'AUTHOR': row['AUTHOR'],
        'CONTENT': row['CONTENT'],
        'LABEL': label,
        'DATE': row['DATE'],
        'URL': row['URL'],
        'DESC': row['DESC']
    }
    tokens = tweet_preprocessing(row['CONTENT'].lower())
    good_tokens = []