def main_wordcloudsFollowing():
    ## WORDCLOUD FOR EVERY AGE RANGE
    db_access = MongoDBUtils()
    ageRanges = db_access.getAgeRanges()
    stopwords = getSpanishStopwords()

    for ar in ageRanges:
        # Decode data
        df_subscription = pd.read_csv(DATASET_PATH + "/subscriptionLists_" + ar + ".csv",
                                      sep=",", dtype=str)
        text = ' '.join(df_subscription['subscriptionLists'])

        # Strip stopwords; the surrounding spaces keep whole-word matches only.
        for stop in stopwords:
            stop = ' ' + stop.encode('utf-8') + ' '
            text = text.replace(stop, ' ')

        wordcloud = WordCloud(width=1600, height=800).generate(text.decode("utf-8", "ignore"))
        print "Drawing wordcloud for", ar, "..."

        # Open a plot of the generated image.
        plt.figure(figsize=(20, 10), facecolor='k')
        plt.title('wordcloud subscription lists:' + ar)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig('wordcloud_subscriptions' + ar + ".png", facecolor='k',
                    bbox_inches='tight')
def getLatestProfilePic(self, screen_name, image):
    print screen_name, "- getting latest profile pic"
    try:
        user = self.lookup_user(screen_name=screen_name)
        profilePic = user[0]["profile_image_url_https"]
        db_access = MongoDBUtils()
        db_access.updateProfilePicture(screen_name, profilePic)
        # Twitter serves avatars at several sizes; ask for the 400x400 variant.
        return profilePic.replace("normal", "400x400")
    except Exception:
        # Fall back to the picture we already had.
        return image
def convertToCategory(ageRanges, typeOp):
    # Map age-range indices back to their category labels.
    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ages = db_access.getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']
    result = []
    for ar in ageRanges:
        result.append(ages[ar].encode("utf-8"))
    return result
def main_wordcloudsTweets():
    ## WORDCLOUD FOR EVERY AGE RANGE
    db_access = MongoDBUtils()
    ageRanges = db_access.getAgeRanges()
    stopwords = getCustomStopwords()
    # Frequent words that add no signal to the clouds.
    stopwords.append(u'jajaja')
    stopwords.append(u'gracia')
    stopwords.append(u'asi')
    stopwords.append(u'via')
    stopwords.append(u'dia')
    stopwords.append(u'tambien')

    for ar in ageRanges:
        print ar
        # Decode data
        df_tweets = pd.read_csv(DATASET_PATH + "/tweets_" + ar + ".csv", sep=",")
        text = ''
        for tw in df_tweets['tweets']:
            # Drop punctuation and normalize Spanish accented characters.
            tw = tw.translate(None, string.punctuation)
            tw = tw.replace('¿', ' ')
            tw = tw.replace('¡', ' ')
            tw = tw.replace('á', 'a')
            tw = tw.replace('é', 'e')
            tw = tw.replace('í', 'i')
            tw = tw.replace('ó', 'o')
            tw = tw.replace('ú', 'u')
            # Remove all stopwords from the tweet.
            text += removeStopWords(tw, stopwords)
        text = removeStopWords(text, stopwords)

        wordcloud = WordCloud(width=1600, height=800).generate(text.decode("utf-8"))
        print "Drawing wordcloud for", ar, "..."

        # Open a plot of the generated image.
        plt.figure(figsize=(20, 10), facecolor='k')
        plt.title('wordcloud ages:' + ar)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig('wordcloud_' + ar + ".png", facecolor='k', bbox_inches='tight')
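# removeStopWords is called above but not defined in this file. A minimal
# sketch of what it is assumed to do (drop whole tokens that appear in the
# stopword list); the project's real helper may differ:
def removeStopWords_sketch(text, stopwords):
    # Stopwords arrive as unicode; tweet text is utf-8 bytes in Python 2.
    stops = set(s.encode('utf-8') for s in stopwords)
    kept = [w for w in text.split() if w not in stops]
    return ' '.join(kept) + ' '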
def convertToInt(ageRanges, typeOp):
    # Map age-range labels to their integer indices.
    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ages = db_access.getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']
    result = []
    for ar in ageRanges:
        result.append(ages.index(ar))
    return result
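# Example round trip between the two converters above, assuming the
# non-'normal' branch with its three fixed ranges:
#   convertToInt(['18-24', '10-17'], 'other')   # -> [1, 0]
#   convertToCategory([1, 0], 'other')          # -> ['18-24', '10-17']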
def updateOtherNetworks(self):
    db_access = MongoDBUtils()
    cont = 0
    for user in db_access.get_users("users"):
        cont = cont + 1
        print cont
        try:
            userToSave = self.populateOtherNetworks(user)
            db_access.save_other_network(userToSave['screen_name'], 'facebook',
                                         userToSave['facebook'])
        except Exception as e:
            print "Error updating networks for user:", user.get('screen_name'), "-", e
def analyzeProfilePicture():
    db_access = MongoDBUtils()
    users = db_access.get_usersWithNoProfilePicAge()
    cont = 0
    for user in users:
        cont = cont + 1
        print "-----------------------"
        profilePic = user["profile_image_url_https"].replace("normal", "400x400")
        age = getAgeGenderFromProfilePicture(user['screen_name'], profilePic)
        print user['screen_name'], ' - ', age
        # Throttle periodically to stay under the Face API request quota.
        if cont % 18 == 0:
            print "Waiting..."
            time.sleep(60)
def updateTweets(self):
    db_access = MongoDBUtils()
    cont = 0
    for user in db_access.get_users("users"):
        cont = cont + 1
        if len(user['tweets']) == 0:
            print cont
            oldTweets = len(user['tweets'])
            try:
                userToSave = self.getUserTweets(user)
            except Exception as e:
                print "User with restricted profile"
                userToSave = self.populate_mentions_hashtags_urls(user)

            if oldTweets != len(userToSave['tweets']):
                try:
                    db_access.save_user_tweets(user["screen_name"], userToSave['tweets'])
                except pymongo.errors.DocumentTooLarge:
                    # Mongo caps documents at 16MB: shrink the tweet list
                    # until the document fits.
                    while True:
                        print "********* Document too large, dropping 50 tweets..."
                        try:
                            userToSave['tweets'] = userToSave['tweets'][:-50]
                            db_access.save_user_tweets(user["screen_name"],
                                                       userToSave['tweets'])
                            break
                        except pymongo.errors.DocumentTooLarge:
                            pass
def getAgeGenderFromProfilePicture(screen_name, image):
    db_access = MongoDBUtils()

    # KEY: https://azure.microsoft.com/en-us/try/cognitive-services/?apiSlug=face-api&country=Uruguay&allowContact=true
    # TEST ONLINE: https://westcentralus.dev.cognitive.microsoft.com/docs/services/563879b61984550e40cbbe8d/operations/563879b61984550f30395236/console
    # TUTORIAL: https://docs.microsoft.com/en-us/azure/cognitive-services/face/tutorials/faceapiinpythontutorial
    KEY = '80420a0d0de14f4d9fa2f1c6027afc38'  # Replace with a valid subscription key (keeping the quotes in place).
    CF.Key.set(KEY)

    BASE_URL = 'https://westcentralus.api.cognitive.microsoft.com/face/v1.0/'
    CF.BaseUrl.set(BASE_URL)

    img_url = image
    resultAge = -1
    resultGender = -1
    try:
        # Ask the Face API only for the age and gender attributes.
        faces = CF.face.detect(img_url, False, False, 'age,gender')
        if len(faces) > 0:
            resultAge = int(round(faces[0]['faceAttributes']['age'], 0))
            resultGender = faces[0]['faceAttributes']['gender']
    except Exception as ex:
        print "User:", screen_name, "- Error while calculating age from profile pic:", image
        print ex

    if resultAge == -1:
        # No face found: refresh the profile picture and retry once.
        streamer = TwitterStreamer(Twython)
        newProfilePic = streamer.getLatestProfilePic(screen_name, image)
        isNew = image != newProfilePic
        print "profile pic updated:", isNew
        if isNew:
            resultAge = getAgeGenderFromProfilePicture(screen_name, newProfilePic)

    print resultAge
    db_access.set_profilePic_age_gender_user(screen_name, resultAge, resultGender)
    return resultAge
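# A hypothetical standalone call (the screen name and avatar URL below are
# invented; a real call spends one Face API request):
#   age = getAgeGenderFromProfilePicture('some_user',
#             'https://pbs.twimg.com/profile_images/123/photo_400x400.jpg')
# A return value of -1 means no face was detected in either the stored
# picture or a freshly fetched one.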
def process_twitter_data(worker_id, queue, module_name, source):
    """
    This is the worker thread function. It processes items in the queue one
    after another. These daemon threads go into an infinite loop, and only
    exit when the main thread ends.
    """
    logger = logging.getLogger(LOGGING_ROOT_NAME + '.processor.' + str(worker_id))
    logger.debug("Worker" + str(worker_id) + " looking for data...")
    db_access = MongoDBUtils()
    while True:
        data = queue.get()
        if 'text' in data:
            # Save the tweet.
            db_access.save_tweet(data, source)
            print "Tweet saved..."
            logger.debug('TWEET | id: ' + str(data['id']) + ': ' + data['text'].encode('utf-8'))
        elif 'delete' in data:
            logger.debug('DELETION NOTICE | ' + str(data).encode('utf-8'))
        elif 'warning' in data:
            logger.debug('STALL WARNING | ' + str(data).encode('utf-8'))
        elif 'limit' in data:
            logger.debug('LIMIT NOTICE | ' + str(data).encode('utf-8'))
        elif 'disconnect' in data:
            logger.debug('DISCONNECTION MESSAGE | ' + str(data).encode('utf-8'))
        elif 'status_withheld' in data:
            logger.debug('STATUS WITHHELD | ' + str(data).encode('utf-8'))
        elif 'user_withheld' in data:
            logger.debug('USER WITHHELD | ' + str(data).encode('utf-8'))
        else:
            logger.debug('PRETTY ODD | Data: ' + str(data))
        queue.task_done()
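# A minimal sketch of how process_twitter_data is presumably wired up; the
# worker count and the module/source strings are illustrative only:
def start_workers_sketch(num_workers=4):
    queue = Queue()
    for worker_id in range(num_workers):
        t = Thread(target=process_twitter_data,
                   args=(worker_id, queue, 'streamer', 'twitter'))
        t.setDaemon(True)  # daemon workers die with the main thread
        t.start()
    return queue  # the streamer pushes raw Twitter messages onto this queue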
def run(self):
    db_access = MongoDBUtils()
    users = db_access.get_usersWithNoSubscriptionLists()
    contador = 1
    for user in users:
        try:
            if contador < 15:
                contador = contador + 1
                print '----------------------------'
                print user["screen_name"]
                lists = self.get_list_subscriptions(screen_name=user['screen_name'],
                                                    count=1000)
                print len(lists["lists"])
                save_listSubscriptions(user["screen_name"], lists["lists"])
            else:
                # Rate limit reached: wait out a full 15-minute window.
                print "waiting"
                time.sleep(900)
                contador = 0
        except Exception as e:
            print 'Error subscription lists user:', user["screen_name"]
            print e
            # Flag the user so we do not retry them forever.
            save_listSubscriptions(user["screen_name"], -1)
#!/usr/bin/python
# -*- coding: utf8 -*-

import os, sys
sys.path.append(os.path.abspath(os.pardir))

from configs.settings import *
from data_access.mongo_utils import MongoDBUtils

from requests.exceptions import ChunkedEncodingError
from twython import Twython
from threading import Thread
from Queue import Queue
import streamer_logging
import traceback
import logging
import time
import pymongo
from pymongo import MongoClient
import imp

from extractUsers import TwitterStreamer
from bio import etiquetarUsuarios
from howold_extractor.scrapingHowold import analyzeProfilePicture
from extractListsSubscriptions import TwitterStreamerSubscriptions

## Move the users labeled in the previous step to the "users" collection.
#print "Running extractUsers.py"

db_access = MongoDBUtils()
db_access.populate_mentions_hashtags_urls()
def main_tweetNgrams(typeOp, balanced):
    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_train.csv",
                                 sep=",", dtype=str)[['screen_name', 'tweets', 'age']]
        test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv",
                                sep=",", dtype=str)[['screen_name', 'tweets', 'age']]
    else:
        #train_data = pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_train.csv", sep=",", dtype=str)[['screen_name','tweets','age']]
        #test_data = pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_test.csv", sep=",", dtype=str)[['screen_name','tweets','age']]

        # EXPERIMENT 4: datasets labeled through the Face API.
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_faceAPI_tweets_train.csv",
                                 sep=",", dtype=str)[['screen_name', 'tweets', 'age']]
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_faceAPI_tweets_test.csv",
                                sep=",", dtype=str)[['screen_name', 'tweets', 'age']]

    # Show the number of observations for the test and training dataframes.
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)
    print 'Number of observations in the whole dataset:', len(df_complete)

    # Spanish stopwords: scikit-learn only ships an English list.
    stopwords = getCustomStopwords()

    # fit_transform() fits the model and learns the vocabulary, then
    # transforms the training tweets into TF-IDF feature vectors
    # (unigrams through trigrams, capped at 5000 features).
    transformer_tfidf = TfidfVectorizer(smooth_idf=False, lowercase=False,
                                        stop_words=stopwords, max_features=5000,
                                        ngram_range=(1, 3))
    tfidf = transformer_tfidf.fit_transform(train_data.tweets)
    ##To see occurrences of a specific word:
    #print count_vect.vocabulary_.get(u'amigos')
    train_data_features = tfidf.toarray()

    # Take a look at the words in the vocabulary.
    vocab = transformer_tfidf.get_feature_names()

    # Sum up the counts of each vocabulary word.
    dist = np.sum(train_data_features, axis=0)
    # For each, print the vocabulary word and the number of times it
    # appears in the training set:
    #for tag, count in zip(vocab, dist):
    #    print count, tag

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################
    import ml_utils as ml_utils
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"])          #RESULT: {'kernel': 'rbf', 'C': 10, 'gamma': 0.1}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"]) #RESULT: {'n_estimators': 160, 'max_depth': 20, 'min_samples_leaf': 3}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"])          #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 50, 'loss': 'log'}

    ########################################
    #******* MODEL TRAINING        *********
    ########################################
    # Train the models on the training data.
    print "Training the Classifiers..."
    # Initialize the classifiers with the tuned hyperparameters.
    bayes = MultinomialNB()
    forest = RandomForestClassifier(n_estimators=160, max_depth=20, min_samples_leaf=3)
    svm = SVC(kernel='rbf', C=10, gamma=0.1)
    sgd = SGDClassifier(loss='log', penalty='l2', random_state=42, alpha=0.0001, n_iter=60)
    regr = LinearRegression()

    # Fit to the training set, using the TF-IDF features and the age range
    # as the response variable. Only the linear regression is active in
    # this experiment; the classifiers are kept for reference.
    #forest = forest.fit(train_data_features, train_data["age"])
    #bayes = bayes.fit(train_data_features, train_data["age"])
    #svm = svm.fit(train_data_features, train_data["age"])
    #sgd = sgd.fit(train_data_features, train_data["age"])
    regr = regr.fit(train_data_features, train_data["age"])

    # Transform the test set with the vocabulary learned on train, and
    # convert to a numpy array.
    test_data_features = transformer_tfidf.transform(test_data.tweets)
    test_data_features = test_data_features.toarray()

    # Predict ages on the test set.
    #resultForest = forest.predict(test_data_features)
    #resultBayes = bayes.predict(test_data_features)
    #resultSVM = svm.predict(test_data_features)
    #resultSGD = sgd.predict(test_data_features)
    resultLR = regr.predict(test_data_features)

    outdir = time.strftime("%d-%m-%Y")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)
    outdir = outdir + "/" + typeOp

    # Only the linear-regression prediction is defined here; referencing
    # the commented-out classifier results would raise a NameError.
    output = pd.DataFrame(data={"id": test_data["screen_name"],
                                "realAge": test_data["age"],
                                "ageLinearRegression": resultLR})

    # Use pandas to write the comma-separated output file.
    outname = 'tweets_ngrams_results.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################
    print "Evaluating the model --> Calculating metrics ..."
    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()
    target_names = ageRanges

    data = df_complete[['screen_name', 'tweets']]
    data = transformer_tfidf.fit_transform(data.tweets)
    y_complete = df_complete['age']
    name_prefix = 'tweetNgrams_' + typeOp + '_' + balanced

    # The per-classifier metrics below are disabled in this experiment;
    # only the linear regression above is exercised.
    #--------------
    ## BAYES
    #--------------
    #print "Metrics for Naive Bayes:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes, ageRanges, name_prefix, 'NaiveBayes', outdir)
    #print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names)
    #scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score))
    #accuracyNB = round(scores.mean(), 2)
    #print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ## RANDOM FOREST
    #--------------
    #print "Metrics for Random Forest:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest, ageRanges, name_prefix, 'RandomForest', outdir)
    #print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names)
    #scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score))
    #accuracyRF = round(scores.mean(), 2)
    #print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ## SVM
    #--------------
    #print "Metrics for SVM:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM, ageRanges, name_prefix, 'SVM', outdir)
    #print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names)
    #scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score))
    #accuracySVM = round(scores.mean(), 2)
    #print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ## SGD
    #--------------
    #print "Metrics for SGD:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD, ageRanges, name_prefix, 'SGD', outdir)
    #print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names)
    #scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score))
    #accuracySGD = round(scores.mean(), 2)
    #print "10-Fold Accuracy: ", accuracySGD

    #--------------
    ## OUTPUT
    #--------------
    result = "ACCURACY--> N.Bayes:", 0, "|RForest:", 0, "|SVM:", 0, "|SGD:", 0
    #print result
    return result
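# A toy run of the same vectorizer settings on three invented documents, to
# make the TF-IDF step above concrete:
def tfidf_toy_example():
    docs = ['hola mundo', 'hola amigos', 'mundo feliz']
    v = TfidfVectorizer(ngram_range=(1, 3), lowercase=False)
    X = v.fit_transform(docs)
    print X.shape                # (3, 7): one column per learned uni/bi-gram
    print v.get_feature_names()  # the n-gram vocabulary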
def etiquetarUsuarios():
    # Label users whose age can be parsed from their bio.
    db_access = MongoDBUtils()
    print "Labeling users with the age found in their bio..."
    db_access.getBioWithAge("users")
def main_customFields(typeOp, balanced):
    # Profile metadata columns used as features, plus the age label.
    cols = ['screen_name', 'friends_count', 'tweets_count', 'linkedin',
            'snapchat', 'instagram', 'facebook', 'followers_count',
            'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
            'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age']

    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_train.csv",
                                 sep=",", dtype=str)[cols]
        test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv",
                                sep=",", dtype=str)[cols]
    else:
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_tweets_train.csv",
                                 sep=",", dtype=str)[cols]
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_tweets_test.csv",
                                sep=",", dtype=str)[cols]

    # Show the number of observations for the test and training dataframes.
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)
    print 'Number of observations in the whole dataset:', len(df_complete)

    # Everything between screen_name and age is a feature.
    features = train_data.columns[1:(len(train_data.columns) - 1)]
    train_data_features = train_data[features]
    test_data_features = test_data[features]

    import ml_utils as ml_utils
    # Convert age ranges into integers.
    y = ml_utils.convertToInt(train_data['age'], typeOp)

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"])          #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"]) #RESULT: {'n_estimators': 140, 'max_depth': 20, 'min_samples_leaf': 2}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"])          #RESULT: {'penalty': 'l2', 'alpha': 0.001, 'n_iter': 50, 'loss': 'log'}

    ########################################
    #******* MODEL TRAINING        *********
    ########################################
    print "Training the classifiers ..."
    # Initialize the classifiers with the tuned hyperparameters.
    forest = RandomForestClassifier(n_estimators=140, max_depth=20, min_samples_leaf=2)
    bayes = MultinomialNB()
    svm = SVC(kernel='rbf', C=8, gamma=0.01)
    sgd = SGDClassifier(loss='log', penalty='l2', random_state=42, alpha=0.001, n_iter=50)

    # Train each classifier on the custom-field features and the integer ages.
    forest.fit(train_data_features, y)
    bayes.fit(train_data_features, y)
    svm = svm.fit(train_data_features, y)
    sgd = sgd.fit(train_data_features, y)

    # Apply the classifiers to the test data and map the predicted integer
    # labels back to readable age-range names.
    resultForest = ml_utils.convertToCategory(forest.predict(test_data_features), typeOp)
    resultBayes = ml_utils.convertToCategory(bayes.predict(test_data_features), typeOp)
    resultSVM = ml_utils.convertToCategory(svm.predict(test_data_features), typeOp)
    resultSGD = ml_utils.convertToCategory(sgd.predict(test_data_features), typeOp)

    # View the predicted probabilities of the first 10 observations.
    forest.predict_proba(test_data_features)[0:10]

    outdir = time.strftime("%d-%m-%Y")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)
    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(data={"id": test_data["screen_name"],
                                "realAge": test_data["age"],
                                "ageRandomForest": resultForest,
                                "ageNaiveBayes": resultBayes,
                                "ageSVM": resultSVM,
                                "ageSGD": resultSGD})

    # Use pandas to write the comma-separated output file.
    outname = 'tweets_customFields_results.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    # View a list of the features and their importance scores.
    headers = ["name", "score"]
    print "Importance of Features: "
    values = sorted(zip(train_data_features, forest.feature_importances_),
                    key=lambda x: x[1] * -1)
    print tabulate(values, headers, tablefmt="plain")

    #############################################
    # EVALUATE THE MODEL
    #############################################
    print "Evaluating the model --> Calculating metrics ..."
    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()
    target_names = ageRanges

    data = df_complete[['friends_count', 'tweets_count', 'linkedin', 'snapchat',
                        'instagram', 'facebook', 'followers_count',
                        'favourites_count', 'qtyMentions', 'qtyHashtags',
                        'qtyUrls', 'qtyEmojis', 'qtyUppercase',
                        'profile_pic_gender']]
    y_complete = ml_utils.convertToInt(df_complete['age'], typeOp)
    name_prefix = 'customFields_' + typeOp + '_' + balanced

    #--------------
    ## BAYES
    #--------------
    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes', outdir)
    print classification_report(test_data['age'].tolist(), resultBayes,
                                target_names=target_names)
    scores = cross_val_score(bayes, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ## RANDOM FOREST
    #--------------
    print "Metrics for Random Forest:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest,
                                   ageRanges, name_prefix, 'RandomForest', outdir)
    print classification_report(test_data['age'].tolist(), resultForest,
                                target_names=target_names)
    scores = cross_val_score(forest, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyRF = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ## SVM
    #--------------
    print "Metrics for SVM:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM,
                                   ageRanges, name_prefix, 'SVM', outdir)
    print classification_report(test_data['age'].tolist(), resultSVM,
                                target_names=target_names)
    scores = cross_val_score(svm, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySVM = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ## SGD
    #--------------
    print "Metrics for SGD:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD,
                                   ageRanges, name_prefix, 'SGD', outdir)
    print classification_report(test_data['age'].tolist(), resultSGD,
                                target_names=target_names)
    scores = cross_val_score(sgd, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySGD = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySGD

    #--------------
    ## OUTPUT
    #--------------
    result = ("ACCURACY--> N.Bayes:", accuracyNB, "|RForest:", accuracyRF,
              "|SVM:", accuracySVM, "|SGD:", accuracySGD)
    print result
    return result
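# ml_utils.createConfusionMatrix lives in another module. A rough sketch of
# what it is assumed to compute, built directly on sklearn (the real helper
# receives name_prefix and outdir, so it presumably writes a figure too):
def confusionMatrix_sketch(y_true, y_pred, labels):
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    print cm  # rows = real age range, columns = predicted age range
    return cm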
def main_tweetNgramsAndCustomFields(typeOp, balanced):
    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_train.csv", sep=",", dtype=str)
        test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv", sep=",", dtype=str)
    else:
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_tweets_train.csv", sep=",", dtype=str)
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_tweets_test.csv", sep=",", dtype=str)

    # Show the number of observations for the test and training dataframes.
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)
    print 'Number of observations in the whole dataset:', len(df_complete)

    stopwords = getCustomStopwords()

    # fit_transform() fits the model and learns the vocabulary, then
    # transforms the training tweets into TF-IDF feature vectors.
    transformer_tfidf = TfidfVectorizer(smooth_idf=False, lowercase=False,
                                        stop_words=stopwords, max_features=5000,
                                        ngram_range=(1, 3))
    tfidf = transformer_tfidf.fit_transform(train_data.tweets)
    '''
    headers = ["name", "score"]
    idf = transformer_tfidf.idf_
    print "Most frequent TFIDF terms in dataset: "
    valuesTfIdf = sorted(zip(idf, transformer_tfidf.get_feature_names()), key=lambda x: x[0])
    print(tabulate(valuesTfIdf, headers, tablefmt="plain"))
    '''
    train_data_feat = tfidf.toarray()

    # Append the custom profile fields to the n-gram features.
    train_data_features = np.c_[train_data_feat,
                                train_data['friends_count'], train_data['tweets_count'],
                                train_data['linkedin'], train_data['snapchat'],
                                train_data['instagram'], train_data['facebook'],
                                train_data['followers_count'], train_data['favourites_count'],
                                train_data['qtyMentions'], train_data['qtyHashtags'],
                                train_data['qtyUrls'], train_data['qtyEmojis'],
                                train_data['qtyUppercase'], train_data['profile_pic_gender']]

    # Sum up the counts of each vocabulary word.
    dist = np.sum(train_data_features, axis=0)

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################
    import ml_utils as ml_utils
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"])          #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"]) #RESULT: {'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 3}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"])          #RESULT: {'penalty': 'l2', 'alpha': 0.001, 'n_iter': 40, 'loss': 'log'}

    ########################################
    #******* MODEL TRAINING        *********
    ########################################
    print "Training the models..."
    # Initialize the classifiers with the tuned hyperparameters.
    bayes = MultinomialNB()
    forest = RandomForestClassifier(n_estimators=120, max_depth=30, min_samples_leaf=3)
    svm = SVC(kernel='rbf', C=8, gamma=0.01)
    sgd = SGDClassifier(loss='log', penalty='l2', alpha=0.001, n_iter=40)

    # Fit each model on the combined n-gram + custom-field features, with
    # the age range as the response variable.
    forest = forest.fit(train_data_features, train_data["age"])
    bayes = bayes.fit(train_data_features, train_data["age"])
    svm = svm.fit(train_data_features, train_data["age"])
    sgd = sgd.fit(train_data_features, train_data["age"])

    # Transform the test tweets with the vocabulary learned on train and
    # append the same custom fields.
    test_data_feat = transformer_tfidf.transform(test_data.tweets)
    test_data_feat = test_data_feat.toarray()
    test_data_features = np.c_[test_data_feat,
                               test_data['friends_count'], test_data['tweets_count'],
                               test_data['linkedin'], test_data['snapchat'],
                               test_data['instagram'], test_data['facebook'],
                               test_data['followers_count'], test_data['favourites_count'],
                               test_data['qtyMentions'], test_data['qtyHashtags'],
                               test_data['qtyUrls'], test_data['qtyEmojis'],
                               test_data['qtyUppercase'], test_data['profile_pic_gender']]

    # Predict age ranges on the test set.
    resultForest = forest.predict(test_data_features)
    resultBayes = bayes.predict(test_data_features)
    resultSVM = svm.predict(test_data_features)
    resultSGD = sgd.predict(test_data_features)

    outdir = time.strftime("%d-%m-%Y")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)
    outdir = outdir + "/" + typeOp

    # Copy the results to a pandas dataframe with an "id" column and one
    # column per predicted age range.
    output = pd.DataFrame(data={"id": test_data["screen_name"],
                                "realAge": test_data["age"],
                                "ageRandomForest": resultForest,
                                "ageNaiveBayes": resultBayes})

    # Use pandas to write the comma-separated output file.
    outname = 'Bigram_model_ForestAndBayes.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################
    import ml_utils as ml_utils
    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()
    target_names = ageRanges

    data_aux = transformer_tfidf.fit_transform(df_complete.tweets)
    data_aux = data_aux.toarray()
    data = np.c_[data_aux,
                 df_complete['friends_count'], df_complete['tweets_count'],
                 df_complete['linkedin'], df_complete['snapchat'],
                 df_complete['instagram'], df_complete['facebook'],
                 df_complete['followers_count'], df_complete['favourites_count'],
                 df_complete['qtyMentions'], df_complete['qtyHashtags'],
                 df_complete['qtyUrls'], df_complete['qtyEmojis'],
                 df_complete['qtyUppercase'], df_complete['profile_pic_gender']]
    y_complete = df_complete['age']
    name_prefix = 'tweetNgramsAndCustomFields_' + typeOp + '_' + balanced
    print data.shape

    #--------------
    ## BAYES
    #--------------
    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes', outdir)
    print classification_report(test_data['age'].tolist(), resultBayes,
                                target_names=target_names)
    scores = cross_val_score(bayes, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB
Accuracy: ", accuracyNB #-------------- ##RANDOM FOREST #-------------- print "Metrics for Random Forest:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultForest,ageRanges,name_prefix,'RandomForest',outdir) print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names) scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) accuracyRF = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracyRF #-------------- ##SVM #-------------- print "Metrics for SVM:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSVM,ageRanges,name_prefix,'SVM',outdir) print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names) scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) accuracySVM = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracySVM #-------------- ##SGD #-------------- print "Metrics for SGD:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSGD,ageRanges,name_prefix,'SGD',outdir) print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names) scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) accuracySGD = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracySGD #-------------- ##OUTPUT #-------------- result= "ACCURACY--> N.Bayes:",accuracyNB,"|RForest:", accuracyRF,"|SVM:", accuracySVM,"|SGD:", accuracySGD print result return result
import os, sys
import os.path

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(os.pardir))

from configs.settings import *
from data_access.mongo_utils import MongoDBUtils

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, datasets
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from stop_words import get_stop_words
import re
import imp
from nltk.corpus import stopwords

db_access = MongoDBUtils()
db_access.export_tweetsLabeled()
def run(self):
    db_access = MongoDBUtils()
    output = self.lookup_user(screen_name='michael_sorano')
    print output
def main_subscriptionNgrams(typeOp, balanced):
    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH + "/subscriptionLists_balanced_train.csv", sep=",", dtype=str)
        test_data = pd.read_csv(DATASET_PATH + "/subscriptionLists_balanced_test.csv", sep=",", dtype=str)
    else:
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_subscriptionLists_train.csv", sep=",", dtype=str)
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_subscriptionLists_test.csv", sep=",", dtype=str)

    # Show the number of observations for the test and training dataframes.
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)
    print 'Number of observations in the whole dataset:', len(df_complete)

    stopwords = getSpanishStopwords()

    # fit_transform() fits the model and learns the vocabulary, then
    # transforms the training subscription lists into count vectors.
    # token_pattern r'\b\w+\b' also keeps single-character tokens.
    count_vect = CountVectorizer(stop_words=stopwords, max_features=5000,
                                 ngram_range=(1, 3), token_pattern=r'\b\w+\b')
    X_train_counts = count_vect.fit_transform(train_data.subscriptionLists)
    ##To see occurrences of a specific word:
    #print count_vect.vocabulary_.get(u'amigos')
    train_data_features = X_train_counts.toarray()
    #print train_data_features.shape  # (observations, up to 5000 most frequent n-grams)

    # Take a look at the words in the vocabulary.
    vocab = count_vect.get_feature_names()

    # Sum up the counts of each vocabulary word.
    dist = np.sum(train_data_features, axis=0)
    # For each, print the vocabulary word and the number of times it
    # appears in the training set:
    #for tag, count in zip(vocab, dist):
    #    print count, tag

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################
    import ml_utils as ml_utils
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"])          #RESULT: {'kernel': 'rbf', 'C': 10, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"]) #RESULT: {'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 1}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"])          #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 40, 'loss': 'log'}

    # Apply the models and train them on the training data.
    print "Training the models..."
    # Initialize the classifiers with the tuned hyperparameters.
    bayes = MultinomialNB()
    forest = RandomForestClassifier(n_estimators=120, max_depth=30, min_samples_leaf=1)
    svm = SVC(kernel='rbf', C=10, gamma=0.01)
    sgd = SGDClassifier(penalty='elasticnet', alpha=0.0001, n_iter=40, loss='log')

    # Fit each model to the training set, using the bag of words as
    # features and the age range as the response variable.
    forest = forest.fit(train_data_features, train_data["age"])
    bayes = bayes.fit(train_data_features, train_data["age"])
    svm = svm.fit(train_data_features, train_data["age"])
    sgd = sgd.fit(train_data_features, train_data["age"])

    # Get a bag of words for the test set, and convert to a numpy array.
    test_data_features = count_vect.transform(test_data.subscriptionLists)
    test_data_features = test_data_features.toarray()

    # Predict age ranges on the test set.
    resultForest = forest.predict(test_data_features)
    resultBayes = bayes.predict(test_data_features)
    resultSVM = svm.predict(test_data_features)
    resultSGD = sgd.predict(test_data_features)

    outdir = time.strftime("%d-%m-%Y")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)
    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(data={"id": test_data["screen_name"],
                                "realAge": test_data["age"],
                                "ageRandomForest": resultForest,
                                "ageNaiveBayes": resultBayes})

    # Use pandas to write the comma-separated output file.
    outname = 'subscriptionLists_Bag_of_Words_ForestAndBayes.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################
    import ml_utils as ml_utils
    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()
    target_names = ageRanges

    data = df_complete[['screen_name', 'subscriptionLists']]
    data = count_vect.fit_transform(data.subscriptionLists)
    y_complete = df_complete['age']
    name_prefix = 'subscriptionNgrams_' + typeOp + '_' + balanced

    #--------------
    ## BAYES
    #--------------
    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes', outdir)
    print classification_report(test_data['age'].tolist(), resultBayes,
                                target_names=target_names)
    scores = cross_val_score(bayes, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ## RANDOM FOREST
    #--------------
    print "Metrics for Random Forest:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest,
                                   ageRanges, name_prefix, 'RandomForest', outdir)
    print classification_report(test_data['age'].tolist(), resultForest,
                                target_names=target_names)
    scores = cross_val_score(forest, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyRF = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ## SVM
    #--------------
    print "Metrics for SVM:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM,
                                   ageRanges, name_prefix, 'SVM', outdir)
    print classification_report(test_data['age'].tolist(), resultSVM,
                                target_names=target_names)
    scores = cross_val_score(svm, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySVM = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ## SGD
    #--------------
    print "Metrics for SGD:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD,
                                   ageRanges, name_prefix, 'SGD', outdir)
    print classification_report(test_data['age'].tolist(), resultSGD,
                                target_names=target_names)
    scores = cross_val_score(sgd, data, y_complete,
                             cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySGD = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySGD

    #--------------
    ## OUTPUT
    #--------------
    result = ("ACCURACY--> Bayes:", accuracyNB, "|RForest:", accuracyRF,
              "|SVM:", accuracySVM, "|SGD:", accuracySGD)
    print result
    return result
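# The token_pattern r'\b\w+\b' above keeps single-character tokens, which
# scikit-learn's default pattern (two or more word characters) would drop.
# A small illustration on an invented list name:
def tokenPattern_toy_example():
    docs = ['lista de futbol 5']
    print CountVectorizer().fit(docs).get_feature_names()
    # ['de', 'futbol', 'lista']  -- the '5' is lost
    print CountVectorizer(token_pattern=r'\b\w+\b').fit(docs).get_feature_names()
    # ['5', 'de', 'futbol', 'lista']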
def save_listSubscriptions(screen_name, lists):
    db_access = MongoDBUtils()
    db_access.save_listSubscriptions(screen_name, lists)
def save_user(self, data):
    db_access = MongoDBUtils()
    db_access.save_user(data)
#!/usr/bin/python
# -*- coding: utf8 -*-

import os, sys
sys.path.append(os.path.abspath(os.pardir))

from configs.settings import *
from data_access.mongo_utils import MongoDBUtils
from nlp_features.customStopwords import generateCustomStopWordsForallAgeRanges

db_access = MongoDBUtils()

'''
db_access.export_tweetsText_toCSV('normal','')
db_access.export_tweetsText_toCSV('normal','faceAPI')
db_access.export_tweetsText_toCSV('pedophilia','')
db_access.export_tweetsText_toCSV('pedophilia','faceAPI')

db_access.export_subscriptionLists_toCSV('normal','')
db_access.export_subscriptionLists_toCSV('normal','faceAPI')
db_access.export_subscriptionLists_toCSV('pedophilia','')
db_access.export_subscriptionLists_toCSV('pedophilia','faceAPI')
'''

db_access.export_tweetsText_toCSV_balanced()
db_access.export_subscriptionLists_toCSV_balanced()

'''
ages = db_access.getAgeRanges()
for age in ages:
    db_access.export_tweetsTextFromAgeRange(age)
'''
def set_age(screen_name, age):
    db_access = MongoDBUtils()
    db_access.set_age_user(screen_name, age)
def run(self):
    db_access = MongoDBUtils()
    usersUnlabeled = db_access.get_unlabeled_users_with_age()
    cont = 0
    screen_names = ''

    def saveLookedUpUsers(output):
        # Fetch tweets and extra fields for each looked-up user, then persist.
        print "USERS SAVED IN DB:"
        for user in output:
            print user['screen_name']
            try:
                try:
                    userToSave = self.getUserTweets(user)
                except Exception:
                    print "User with restricted profile"
                    userToSave = user
                userToSave = self.populateOtherNetworks(userToSave)
                userToSave = self.populate_mentions_hashtags_urls(userToSave)
                userToSave["age"] = db_access.getEdad(userToSave['screen_name'],
                                                      "unlabeled_users")
                userToSave["exactAge"] = db_access.getExactAge(userToSave['screen_name'])
                userToSave['screen_name'] = user['screen_name'].lower()
                print userToSave['screen_name']
                try:
                    self.save_user(userToSave)
                    self.markUnlabeledAsLabeled(userToSave)
                except pymongo.errors.DocumentTooLarge:
                    # Mongo caps documents at 16MB: drop tweets until it fits.
                    while True:
                        print "********* Document too large, dropping 50 tweets..."
                        try:
                            userToSave['tweets'] = userToSave['tweets'][:-50]
                            self.save_user(userToSave)
                            self.markUnlabeledAsLabeled(userToSave)
                            break
                        except pymongo.errors.DocumentTooLarge:
                            pass
            except Exception as e:
                print "Error while trying to save user:", user["screen_name"]
                print e

    for user_unlab in usersUnlabeled:
        if not db_access.userExistsInDb(user_unlab['screen_name'].lower(), 'users'):
            cont = cont + 1
            print user_unlab['screen_name']
            if screen_names == '':
                screen_names = user_unlab['screen_name']
            else:
                screen_names = screen_names + ',' + user_unlab['screen_name']

            if cont == 99:
                # lookup_user: 100 requests every 15 min.
                output = self.lookup_user(screen_name=screen_names)
                saveLookedUpUsers(output)
                print "waiting"
                time.sleep(900)
                cont = 0
                screen_names = ""

    # Look up whatever is left in the last, smaller batch.
    if len(screen_names) != 0:
        output = self.lookup_user(screen_name=screen_names)
        saveLookedUpUsers(output)
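# The 99-name batching in run() exists because Twitter's users/lookup
# endpoint caps each request at 100 screen names. A generic sketch of the
# same idea (the helper name is ours, not part of the project):
def chunk_screen_names(names, size=100):
    # Yield comma-separated batches ready for lookup_user.
    for i in range(0, len(names), size):
        yield ','.join(names[i:i + size])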
def markUnlabeledAsLabeled(self, userUnlabeled):
    db_access = MongoDBUtils()
    db_access.markUnlabeledAsLabeled(userUnlabeled['screen_name'])