Ejemplo n.º 1
0
def main_wordcloudsFollowing():
    ##WORDCLOUD FOR EVERY AGE RANGE
    db_access = MongoDBUtils()
    ageRanges = db_access.getAgeRanges()
    stopwords = getSpanishStopwords()

    for ar in ageRanges:
        #Decode data
        df_subscription = pd.read_csv(DATASET_PATH + "/subscriptionLists_" +
                                      ar + ".csv",
                                      sep=",",
                                      dtype=str)

        text = ' '.join(df_subscription['subscriptionLists'])

        for stop in stopwords:
            stop = ' ' + stop.encode('utf-8') + ' '
            text = text.replace(stop, ' ').encode('utf-8', 'ignore')

        wordcloud = WordCloud(width=1600,
                              height=800).generate(text.decode("utf-8"))
        print "Dibujando wordcloud para ", ar, " ..."
        # Open a plot of the generated image.
        plt.figure(figsize=(20, 10), facecolor='k')
        plt.title('wordcloud subscription lists:' + ar)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig('wordcloud_subscriptions' + ar + ".png",
                    facecolor='k',
                    bbox_inches='tight')
Ejemplo n.º 2
0
 def getLatestProfilePic(self, screen_name, image):
     print screen_name, "- getting latest profile pic"
     try:
         user = self.lookup_user(screen_name=screen_name)
         profilePic = user[0]["profile_image_url_https"]
         db_access = MongoDBUtils()
         db_access.updateProfilePicture(screen_name, profilePic)
         return profilePic.replace("normal", "400x400")
     except:
         return image
Ejemplo n.º 3
0
def convertToCategory(ageRanges, typeOp):
    """Map integer class indexes back to their age-range labels.

    ``ageRanges`` is an iterable of indexes into the list of age ranges.
    When ``typeOp`` is ``'normal'`` that list comes from
    ``MongoDBUtils.getAgeRanges``; for any other value the fixed three
    ranges ``'10-17'``, ``'18-24'`` and ``'25-xx'`` are used.

    Returns a list with the UTF-8 encoded label for every index, in order.
    """
    if typeOp == 'normal':
        # The database is only contacted when its ranges are really needed.
        ages = MongoDBUtils().getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']

    return [ages[ar].encode("utf-8") for ar in ageRanges]
Ejemplo n.º 4
0
def main_wordcloudsTweets():
    ##WORDCLOUD FOR EVERY AGE RANGE
    db_access = MongoDBUtils()
    ageRanges = db_access.getAgeRanges()
    #ageRanges=['50-64']
    stopwords = getCustomStopwords()
    stopwords.append(u'jajaja')
    stopwords.append(u'gracia')
    stopwords.append(u'asi')
    stopwords.append(u'via')
    stopwords.append(u'dia')
    stopwords.append(u'tambien')
    stopsAux = []
    for stop in stopwords:
        stopsAux.append(stop.encode('utf-8'))

    for ar in ageRanges:
        print ar
        #Decode data
        df_tweets = pd.read_csv(DATASET_PATH + "/tweets_" + ar + ".csv",
                                sep=",")

        text = ''
        for tw in df_tweets['tweets']:
            tw = tw.translate(None, string.punctuation)
            tw = tw.replace('¿', ' ')
            tw = tw.replace('¡', ' ')
            tw = tw.replace('á', 'a')
            tw = tw.replace('é', 'e')
            tw = tw.replace('í', 'i')
            tw = tw.replace('ó', 'o')
            tw = tw.replace('ú', 'u')
            # Replace all stop words from the tweet
            text += removeStopWords(tw, stopwords)

        text = removeStopWords(text, stopwords)

        wordcloud = WordCloud(width=1600,
                              height=800).generate(text.decode("utf-8"))
        print "Dibujando wordcloud para ", ar, " ..."

        # Open a plot of the generated image.
        plt.figure(figsize=(20, 10), facecolor='k')
        plt.title('wordcloud ages:' + ar)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig('wordcloud_' + ar + ".png",
                    facecolor='k',
                    bbox_inches='tight')
Ejemplo n.º 5
0
def convertToInt(ageRanges, typeOp):
    """Map age-range labels to their integer class indexes.

    When ``typeOp`` is ``'normal'`` the reference list of ranges comes from
    ``MongoDBUtils.getAgeRanges``; for any other value the fixed three
    ranges ``'10-17'``, ``'18-24'`` and ``'25-xx'`` are used.

    Returns a list with the position of every label of ``ageRanges`` inside
    the reference list, in order. Raises ``ValueError`` (from ``list.index``)
    for a label that is not in the reference list.
    """
    if typeOp == 'normal':
        # The database is only contacted when its ranges are really needed.
        ages = MongoDBUtils().getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']

    return [ages.index(ar) for ar in ageRanges]
Ejemplo n.º 6
0
    def updateOtherNetworks(self):
        db_access = MongoDBUtils()
        cont = 0
        for user in db_access.get_users("users"):
            cont = cont + 1
            if (cont > 0):
                print cont

                try:
                    userToSave = self.populateOtherNetworks(user)
                    db_access.save_other_network(userToSave['screen_name'],
                                                 'facebook',
                                                 userToSave['facebook'])
                except Exception as e:
                    print "error"
Ejemplo n.º 7
0
def analyzeProfilePicture():
    db_access = MongoDBUtils()
    users = db_access.get_usersWithNoProfilePicAge()
    cont = 0
    for user in users:
        cont = cont + 1
        print "-----------------------"

        profilePic = user["profile_image_url_https"].replace(
            "normal", "400x400")
        age = getAgeGenderFromProfilePicture(user['screen_name'], profilePic)

        print user['screen_name'], ' - ', age

        if cont % 18 == 0:
            print "Esperando..."
            time.sleep(60)
Ejemplo n.º 8
0
    def updateTweets(self):
        db_access = MongoDBUtils()
        cont = 0
        for user in db_access.get_users("users"):
            cont = cont + 1
            if (len(user['tweets']) == 0):
                print cont
                oldTweets = len(user['tweets'])
                try:
                    userToSave = self.getUserTweets(user)
                except Exception as e:
                    print "Usuario con perfil restringido"
                    userToSave = populate_mentions_hashtags_urls(user)
                #print "ANTES:",oldTweets , "- AHORA: ", len(userToSave['tweets'])

                if oldTweets != len(userToSave['tweets']):
                    try:
                        db_access.save_user_tweets(user["screen_name"],
                                                   userToSave['tweets'])
                    except pymongo.errors.DocumentTooLarge as e:
                        while True:
                            print "********* Doc muy grande, eliminando 50 tweets..."
                            try:
                                userToSave['tweets'] = userToSave[
                                    'tweets'][:len(userToSave['tweets']) - 50]
                                db_access.save_user_tweets(
                                    user["screen_name"], userToSave['tweets'])
                                break
                            except pymongo.errors.DocumentTooLarge as e:
                                pass
Ejemplo n.º 9
0
def getAgeGenderFromProfilePicture(screen_name, image):
    db_access = MongoDBUtils()
    users = db_access.get_users('users')
    KEY = '80420a0d0de14f4d9fa2f1c6027afc38'  # Replace with a valid subscription key (keeping the quotes in place).
    CF.Key.set(KEY)
    #KEY: https://azure.microsoft.com/en-us/try/cognitive-services/?apiSlug=face-api&country=Uruguay&allowContact=true
    #TEST ONLINE: https://westcentralus.dev.cognitive.microsoft.com/docs/services/563879b61984550e40cbbe8d/operations/563879b61984550f30395236/console
    #TUTORIAL: https://docs.microsoft.com/en-us/azure/cognitive-services/face/tutorials/faceapiinpythontutorial

    BASE_URL = 'https://westcentralus.api.cognitive.microsoft.com/face/v1.0/'
    CF.BaseUrl.set(BASE_URL)

    # You can use this example JPG or replace the URL below with your own URL to a JPEG image.
    img_url = image
    resultAge = -1
    resultGender = -1

    try:
        faces = CF.face.detect(img_url, False, False, 'age,gender')
        #print faces
        #print "age: ", faces[0]['faceAttributes']['age']
        if len(faces) > 0:
            resultAge = int(round(faces[0]['faceAttributes']['age'], 0))
            resultGender = faces[0]['faceAttributes']['gender']
    except Exception as ex:
        print "User: "******" - Error while calculating age from profile pic: ", image
        print ex

    if resultAge == -1:
        streamer = TwitterStreamer(Twython)
        newProfilePic = streamer.getLatestProfilePic(screen_name, image)
        isNew = image != newProfilePic
        print "profile pic updated: ", isNew

        if isNew:
            resultAge = getAgeGenderFromProfilePicture(screen_name,
                                                       newProfilePic)
            print resultAge

    db_access.set_profilePic_age_gender_user(screen_name, resultAge,
                                             resultGender)
    return resultAge
Ejemplo n.º 10
0
def process_twitter_data(worker_id, queue, module_name, source):
    """
    This is the worker thread function.
    It processes items in the queue one after
    another.  These daemon threads go into an
    infinite loop, and only exit when
    the main thread ends.
    """

    logger = logging.getLogger(LOGGING_ROOT_NAME + '.processor.' +
                               str(worker_id))
    logger.debug("Worker" + str(worker_id) + " looking for data...")

    db_access = MongoDBUtils()

    while True:
        data = queue.get()
        if 'text' in data:
            # Guarda el Tweet
            db_access.save_tweet(data, source)
            #print data
            print "Tweet guardado..."
            logger.debug('TWEET | id: ' + str(data['id']) + ': ' +
                         data['text'].encode('utf-8'))
        elif 'delete' in data:
            logger.debug('DELETION NOTICE | ' + str(data).encode('utf-8'))
        elif 'warning' in data:
            logger.debug('STALL WARNING | ' + str(data).encode('utf-8'))
        elif 'limit' in data:
            logger.debug('LIMIT NOTICE | ' + str(data).encode('utf-8'))
        elif 'disconnect' in data:
            logger.debug('DISCONNECTION MESSAGE | ' +
                         str(data).encode('utf-8'))
        elif 'status_withheld' in data:
            logger.debug('STATUS WITHHELD | ' + str(data).encode('utf-8'))
        elif 'user_withheld' in data:
            logger.debug('USER WITHHELD | ' + str(data).encode('utf-8'))
        else:
            logger.debug('PRETTY ODD | Data: ' + str(data))

        queue.task_done()
Ejemplo n.º 11
0
    def run(self):
        db_access = MongoDBUtils()
        users = db_access.get_usersWithNoSubscriptionLists();
        contador=1
        for user in users:
            try:
                if contador < 15:
                    contador = contador + 1
                    print '----------------------------'
                    print user["screen_name"]
                    lists=self.get_list_subscriptions(screen_name=user['screen_name'],count=1000)                                        
                    print len(lists["lists"])
                    save_listSubscriptions(user["screen_name"], lists["lists"])
                    contador=contador+1
                else:
                    print "esperando"
                    time.sleep(900)
                    contador=0

            except Exception as e : 
                print 'Error subscription lists user: ',user["screen_name"]
                print e
                save_listSubscriptions(user["screen_name"], -1)
Ejemplo n.º 12
0
#!/usr/bin/python
# -*- coding: utf8 -*-
import os, sys

sys.path.append(os.path.abspath(os.pardir))

from configs.settings import *
from data_access.mongo_utils import MongoDBUtils
from requests.exceptions import ChunkedEncodingError
from twython import Twython
from threading import Thread
from Queue import Queue
import streamer_logging
import traceback
import logging
import sys
import time
import pymongo
from pymongo import MongoClient
import imp
from extractUsers import TwitterStreamer
from bio import etiquetarUsuarios
from howold_extractor.scrapingHowold import analyzeProfilePicture
from extractListsSubscriptions import TwitterStreamerSubscriptions

## Move the users labeled in the previous step into the "users" collection.
# NOTE(review): translated from the original comment; the call below is
# populate_mentions_hashtags_urls(), so confirm the comment still describes it.
#print "Ejecutando extractUsers.py"
db_access = MongoDBUtils()
db_access.populate_mentions_hashtags_urls()
Ejemplo n.º 13
0
def main_tweetNgrams(typeOp, balanced):

    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_train.csv",
                                 sep=",",
                                 dtype=str)[['screen_name', 'tweets', 'age']]
        test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv",
                                sep=",",
                                dtype=str)[['screen_name', 'tweets', 'age']]
    else:
        #train_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_train.csv", sep=",",dtype=str)[['screen_name','tweets','age']]
        #test_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_test.csv", sep=",",dtype=str)[['screen_name','tweets','age']]

        #EXPERIMENT 4
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp +
                                 "_faceAPI_tweets_train.csv",
                                 sep=",",
                                 dtype=str)[['screen_name', 'tweets', 'age']]
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp +
                                "_faceAPI_tweets_test.csv",
                                sep=",",
                                dtype=str)[['screen_name', 'tweets', 'age']]

    # Show the number of observations for the test and training dataframes
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)

    print 'Number of observations in the whole dataset:', len(df_complete)

    ##STOPWORDS EN SPANISH, SCIKIT TRAE SOLO EN INGLES
    stopwords = getCustomStopwords()

    #count_vect = CountVectorizer(stop_words=stopwords, max_features=5000 ) #Para hacer bag of words
    #X_train_counts = count_vect.fit_transform(train_data.tweets)
    # fit_transform() fits the model and learns the vocabulary; second, it transforms our training data
    # into feature vectors.

    transformer_tfidf = TfidfVectorizer(smooth_idf=False,
                                        lowercase=False,
                                        stop_words=stopwords,
                                        max_features=5000,
                                        ngram_range=(1, 3))
    tfidf = transformer_tfidf.fit_transform(train_data.tweets)

    ##To see occurrences of a specific word:
    #print count_vect.vocabulary_.get(u'amigos')

    train_data_features = tfidf.toarray()
    #print len(train_data) #186 users en train

    # Take a look at the words in the vocabulary
    vocab = transformer_tfidf.get_feature_names()
    #print vocab

    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)

    # For each, print the vocabulary word and the number of times it
    # appears in the training set
    #for tag, count in zip(vocab, dist):
    #	print count, tag

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################

    import ml_utils as ml_utils
    #PARAMETERS TUNING
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 10, 'gamma': 0.1}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT: {'n_estimators': 160, 'max_depth': 20, 'min_samples_leaf': 3}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 50, 'loss': 'log'}

    ########################################
    #******* MODEL TRAINING        *********
    ########################################

    # ********* ENTRENO LOS MODELOS CON LA DATA EN TRAIN*********#

    print "Training the Classifiers..."

    # Initialize Multinomial Naive Bayes
    bayes = MultinomialNB()

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=160,
                                    max_depth=20,
                                    min_samples_leaf=3)
    # Fit the forest to the training set, using the bag of)

    svm = SVC(kernel='rbf', C=10, gamma=0.1)

    sgd = SGDClassifier(loss='log',
                        penalty='l2',
                        random_state=42,
                        alpha=0.0001,
                        n_iter=60)

    regr = LinearRegression()
    # Fit the forest to the training set, using the bag of words as
    # features and the age range as the response variable

    #forest = forest.fit( train_data_features, train_data["age"] )

    #bayes = bayes.fit( train_data_features, train_data["age"] )

    #svm = svm.fit(train_data_features, train_data["age"] )

    #sgd= sgd.fit(train_data_features, train_data["age"] )

    regr = regr.fit(train_data_features, train_data["age"])

    # Read the test data

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = transformer_tfidf.transform(test_data.tweets)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make age range predictions
    #resultForest = forest.predict(test_data_features)

    #resultBayes = bayes.predict(test_data_features)
    #print "resultbayes: ", resultBayes

    #resultSVM= svm.predict(test_data_features)

    #resultSGD= sgd.predict(test_data_features)

    resultLR = regr.predict(test_data_features)

    outdir = time.strftime("%d-%m-%Y")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)

    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes,
            "ageSVM": resultSVM,
            "ageSGD": resultSGD
        })

    # Use pandas to write the comma-separated output file
    outname = 'tweets_ngrams_results.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################

    print "Evaluating the model --> Calculating metrics ..."

    db_access = MongoDBUtils()

    ageRanges = []
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()

    target_names = ageRanges

    data = df_complete[['screen_name', 'tweets']]
    data = transformer_tfidf.fit_transform(data.tweets)
    y_complete = df_complete['age']

    name_prefix = 'tweetNgrams_' + typeOp + '_' + balanced
    #--------------
    ##BAYES
    #--------------
    #print "Metrics for Naive Bayes:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultBayes,ageRanges,name_prefix,'NaiveBayes',outdir)
    #print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names)

    #scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
    #accuracyNB = round(scores.mean(),2)
    #print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ##RANDOM FOREST
    #--------------
    #print "Metrics for Random Forest:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultForest,ageRanges,name_prefix,'RandomForest',outdir)
    #print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names)

    #scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
    #accuracyRF = round(scores.mean(),2)
    #print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ##SVM
    #--------------
    #print "Metrics for SVM:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSVM,ageRanges,name_prefix,'SVM',outdir)
    #print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names)

    #scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
    #accuracySVM = round(scores.mean(),2)
    #print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ##SGD
    #--------------
    #print "Metrics for SGD:"
    #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSGD,ageRanges,name_prefix,'SGD',outdir)
    #print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names)

    #scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
    #accuracySGD = round(scores.mean(),2)
    #print "10-Fold Accuracy: ", accuracySGD

    #--------------
    ##OUTPUT
    #--------------
    result = "ACCURACY--> N.Bayes:", 0, "|RForest:", 0, "|SVM:", 0, "|SGD:", 0
    #print result
    return result
Ejemplo n.º 14
0
def etiquetarUsuarios():
    db_access = MongoDBUtils()

    print "Etiquetando usuarios con la edad en la bio......"
    db_access.getBioWithAge("users")
Ejemplo n.º 15
0
def main_customFields(typeOp, balanced):

    if balanced == 'balanced':
        train_data = pd.read_csv(
            DATASET_PATH + "/tweets_balanced_train.csv", sep=",", dtype=str)[[
                'screen_name', 'friends_count', 'tweets_count', 'linkedin',
                'snapchat', 'instagram', 'facebook', 'followers_count',
                'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
                'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age'
            ]]
        test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv",
                                sep=",",
                                dtype=str)[[
                                    'screen_name', 'friends_count',
                                    'tweets_count', 'linkedin', 'snapchat',
                                    'instagram', 'facebook', 'followers_count',
                                    'favourites_count', 'qtyMentions',
                                    'qtyHashtags', 'qtyUrls', 'qtyEmojis',
                                    'qtyUppercase', 'profile_pic_gender', 'age'
                                ]]
    else:
        train_data = pd.read_csv(
            DATASET_PATH + "/" + typeOp + "_tweets_train.csv",
            sep=",",
            dtype=str)[[
                'screen_name', 'friends_count', 'tweets_count', 'linkedin',
                'snapchat', 'instagram', 'facebook', 'followers_count',
                'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
                'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age'
            ]]
        test_data = pd.read_csv(
            DATASET_PATH + "/" + typeOp + "_tweets_test.csv",
            sep=",",
            dtype=str)[[
                'screen_name', 'friends_count', 'tweets_count', 'linkedin',
                'snapchat', 'instagram', 'facebook', 'followers_count',
                'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
                'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age'
            ]]

    # Show the number of observations for the test and training dataframes
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)

    print 'Number of observations in the whole dataset:', len(df_complete)

    features = train_data.columns[1:(len(train_data.columns) - 1)]
    train_data_features = train_data[features]
    test_data_features = test_data[features]

    import ml_utils as ml_utils

    # convert age ranges into integers
    y = ml_utils.convertToInt(train_data['age'], typeOp)

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################

    import ml_utils as ml_utils
    #PARAMETERS TUNING
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT:{'n_estimators': 140, 'max_depth': 20, 'min_samples_leaf': 2}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT:{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 50, 'loss': 'log'}

    ########################################
    #******* MODEL TRAINING        *********
    ########################################
    print "Training the classifiers ..."

    forest = RandomForestClassifier(n_estimators=140,
                                    max_depth=20,
                                    min_samples_leaf=2)

    bayes = MultinomialNB()

    svm = SVC(kernel='rbf', C=8, gamma=0.01)

    sgd = SGDClassifier(loss='log',
                        penalty='l2',
                        random_state=42,
                        alpha=0.001,
                        n_iter=50)

    # Train the Classifier to take the training features and learn how they relate to the age
    forest.fit(train_data_features, y)

    bayes.fit(train_data_features, y)

    svm = svm.fit(train_data_features, y)

    sgd = sgd.fit(train_data_features, y)

    # Apply the Classifier we trained to the test data
    # Create actual english names for the ages for each predicted age range
    resultForest = ml_utils.convertToCategory(
        forest.predict(test_data_features), typeOp)

    resultBayes = ml_utils.convertToCategory(bayes.predict(test_data_features),
                                             typeOp)

    resultSVM = ml_utils.convertToCategory(svm.predict(test_data_features),
                                           typeOp)

    resultSGD = ml_utils.convertToCategory(sgd.predict(test_data_features),
                                           typeOp)

    # View the predicted probabilities of the first 10 observations
    forest.predict_proba(test_data_features)[0:10]

    outdir = time.strftime("%d-%m-%Y")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)

    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes,
            "ageSVM": resultSVM,
            "ageSGD": resultSGD
        })

    # Use pandas to write the comma-separated output file
    outname = 'tweets_customFields_results.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    # View a list of the features and their importance scores
    headers = ["name", "score"]
    print "Importance of Features: "  #, sorted(list(zip(train_data[features], forest.feature_importances_)), key=lambda x: x[1])

    values = sorted(zip(train_data_features, forest.feature_importances_),
                    key=lambda x: x[1] * -1)
    print tabulate(values, headers, tablefmt="plain")

    #############################################
    # EVALUATE THE MODEL
    #############################################
    print "Evaluating the model --> Calculating metrics ..."

    db_access = MongoDBUtils()

    ageRanges = []
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()

    target_names = ageRanges

    data = df_complete[[
        'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram',
        'facebook', 'followers_count', 'favourites_count', 'qtyMentions',
        'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase',
        'profile_pic_gender'
    ]]
    y_complete = ml_utils.convertToInt(df_complete['age'], typeOp)

    #--------------
    ##BAYES
    #--------------
    name_prefix = 'customFields_' + typeOp + '_' + balanced

    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultBayes,
                                target_names=target_names)

    scores = cross_val_score(bayes,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ##RANDOM FOREST
    #--------------
    print "Metrics for Random Forest:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest,
                                   ageRanges, name_prefix, 'RandomForest',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultForest,
                                target_names=target_names)

    scores = cross_val_score(forest,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyRF = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ##SVM
    #--------------
    print "Metrics for SVM:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM,
                                   ageRanges, name_prefix, 'SVM', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSVM,
                                target_names=target_names)

    scores = cross_val_score(svm,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySVM = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ##SGD
    #--------------
    print "Metrics for SGD:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD,
                                   ageRanges, name_prefix, 'SGD', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSGD,
                                target_names=target_names)

    scores = cross_val_score(sgd,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySGD = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySGD

    #--------------
    ##OUTPUT
    #--------------
    result = "ACCURACY--> N.Bayes:", accuracyNB, "|RForest:", accuracyRF, "|SVM:", accuracySVM, "|SGD:", accuracySGD
    print result
    return result  # Copy the results to a pandas dataframe
    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes
        })
Ejemplo n.º 16
0
def main_tweetNgramsAndCustomFields(typeOp,balanced):
	
	if balanced == 'balanced':
		train_data=pd.read_csv(DATASET_PATH+"/tweets_balanced_train.csv", sep=",",dtype=str)
		test_data=pd.read_csv(DATASET_PATH+"/tweets_balanced_test.csv", sep=",",dtype=str)
	else:
		train_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_train.csv", sep=",",dtype=str)
		test_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_test.csv", sep=",",dtype=str)

	# Show the number of observations for the test and training dataframes
	print 'Number of observations in the training data:', len(train_data)
	print 'Number of observations in the test data:',len(test_data)

	frames = [train_data, test_data]
	df_complete= pd.concat(frames)

	print 'Number of observations in the whole dataset:',len(df_complete)
	
	stopwords = getCustomStopwords()

	transformer_tfidf = TfidfVectorizer(smooth_idf=False,lowercase=False,stop_words=stopwords,max_features=5000, ngram_range=(1,3))
	tfidf = transformer_tfidf.fit_transform(train_data.tweets)
	'''
	headers = ["name", "score"]
	idf = transformer_tfidf.idf_
	print "Most frequent TFIDF terms in dataset: "
	valuesTfIdf = sorted(zip(idf,transformer_tfidf.get_feature_names()), key=lambda x: x[0])
	print(tabulate(valuesTfIdf, headers, tablefmt="plain"))
	'''
	# fit_transform() fits the model and learns the vocabulary; second, it transforms our training data
	# into feature vectors. 

	##To see occurrences of a specific word:
	#print count_vect.vocabulary_.get(u'amigos')

	train_data_feat = tfidf.toarray()
	#print len(train_data) #186 users en train
	
	train_data_features = np.c_[train_data_feat, train_data['friends_count'],train_data['tweets_count'], train_data['linkedin'],train_data['snapchat'], train_data['instagram'],train_data['facebook'],train_data['followers_count'],train_data['favourites_count'],train_data['qtyMentions'],train_data['qtyHashtags'],train_data['qtyUrls'], train_data['qtyEmojis'], train_data['qtyUppercase'],train_data['profile_pic_gender']]
	
	# Sum up the counts of each vocabulary word
	dist = np.sum(train_data_features, axis=0)
	
	# Sum up the counts of each vocabulary word
	dist = np.sum(train_data_features, axis=0)

	# For each, print the vocabulary word and the number of times it 
	# appears in the training set
	#for tag, count in zip(vocab, dist):
	#	print count, tag

	########################################
	#******* HYPERPARAMETER TUNING *********
	########################################
	
	import ml_utils as ml_utils
	#PARAMETERS TUNING
	#print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01}
	#print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT:{'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 3}
	#print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT:{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 40, 'loss': 'log'}

	########################################
	#******* MODEL TRAINING        *********
	########################################

	print "Training the models..."

	# Initialize Multinomial Naive Bayes
	bayes = MultinomialNB()

	# Initialize a Random Forest classifier with 100 trees
	forest = RandomForestClassifier(n_estimators = 120, max_depth= 30, min_samples_leaf= 3) 
	# Fit the forest to the training set, using the bag of)

	svm = SVC(kernel='rbf', C= 8, gamma =  0.01)

	sgd = SGDClassifier(loss='log', penalty='l2', alpha=0.001,n_iter=40)

	# Fit the forest to the training set, using the bag of words as 
	# features and the age range as the response variable

	forest = forest.fit( train_data_features, train_data["age"] ) 

	bayes = bayes.fit( train_data_features, train_data["age"] ) 

	svm = svm.fit(train_data_features, train_data["age"] ) 

	sgd= sgd.fit(train_data_features, train_data["age"] ) 

	# Read the test data

	# Get a bag of words for the test set, and convert to a numpy array
	test_data_feat = transformer_tfidf.transform(test_data.tweets)
	test_data_feat = test_data_feat.toarray()

	test_data_features = np.c_[test_data_feat, test_data['friends_count'],test_data['tweets_count'], test_data['linkedin'],test_data['snapchat'], test_data['instagram'],test_data['facebook'],test_data['followers_count'],test_data['favourites_count'],test_data['qtyMentions'],test_data['qtyHashtags'],test_data['qtyUrls'], test_data['qtyEmojis'], test_data['qtyUppercase'],test_data['profile_pic_gender']]

	# Use the random forest to make age range predictions
	resultForest = forest.predict(test_data_features)

	resultBayes = bayes.predict(test_data_features)

	resultSVM= svm.predict(test_data_features)

	resultSGD= sgd.predict(test_data_features)

	outdir =time.strftime("%d-%m-%Y")
	
	if not os.path.exists(outdir):
   		os.mkdir(outdir)

   	if not os.path.exists(outdir +"/"+typeOp):
   		os.mkdir(outdir +"/"+typeOp)

   	outdir=outdir +"/"+typeOp

	# Copy the results to a pandas dataframe with an "id" column and
	# a "age" column

	output = pd.DataFrame( data={"id":test_data["screen_name"], "realAge":test_data["age"], "ageRandomForest":resultForest,"ageNaiveBayes":resultBayes})
	#print output

	# Use pandas to write the comma-separated output file
	outname = 'Bigram_model_ForestAndBayes.csv'
	fullname = os.path.join(outdir, outname)    
	output.to_csv(fullname,index=False)

	###################################
	#******* MODEL EVALUATION *********
	###################################
	
	import ml_utils as ml_utils
	db_access = MongoDBUtils()

	ageRanges=[]
	if typeOp=='normal':
   		ageRanges=db_access.getAgeRanges()
   	else:
		ageRanges=db_access.get3AgeRanges()

	target_names=ageRanges


	data_aux = transformer_tfidf.fit_transform(df_complete.tweets)
	data_aux = data_aux.toarray()
	data = np.c_[data_aux, df_complete['friends_count'],df_complete['tweets_count'], df_complete['linkedin'],df_complete['snapchat'], df_complete['instagram'],df_complete['facebook'],df_complete['followers_count'],df_complete['favourites_count'],df_complete['qtyMentions'],df_complete['qtyHashtags'],df_complete['qtyUrls'], df_complete['qtyEmojis'], df_complete['qtyUppercase'],df_complete['profile_pic_gender']]
	
	y_complete = df_complete['age']

	name_prefix='tweetNgramsAndCustomFields_'+typeOp+'_'+balanced
	print data.shape
	#--------------
	##BAYES
	#--------------
	print "Metrics for Naive Bayes:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultBayes,ageRanges,name_prefix,'NaiveBayes',outdir)
	print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names)

	scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracyNB = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracyNB

	#--------------
	##RANDOM FOREST
	#--------------
	print "Metrics for Random Forest:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultForest,ageRanges,name_prefix,'RandomForest',outdir)
	print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names)
	
	scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracyRF = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracyRF 

	#--------------
	##SVM
	#--------------
	print "Metrics for SVM:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSVM,ageRanges,name_prefix,'SVM',outdir)
	print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names)
	
	scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracySVM = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracySVM

	#--------------
	##SGD
	#--------------
	print "Metrics for SGD:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSGD,ageRanges,name_prefix,'SGD',outdir)
	print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names)

	scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracySGD = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracySGD 
	#--------------
	##OUTPUT
	#--------------
	result= "ACCURACY--> N.Bayes:",accuracyNB,"|RForest:", accuracyRF,"|SVM:", accuracySVM,"|SGD:", accuracySGD
	print result
	return result	
Ejemplo n.º 17
0
import os, sys
import os.path
import numpy as np
import pandas as pd
sys.path.append(os.path.abspath(os.pardir))
from sklearn.model_selection import train_test_split
from configs.settings import *
from data_access.mongo_utils import MongoDBUtils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from stop_words import get_stop_words
import re
import imp
from nltk.corpus import stopwords

# Module-level script code: runs as soon as this file is executed or imported.
# It creates a MongoDBUtils instance and calls its export_tweetsLabeled() method.
db_access = MongoDBUtils()
db_access.export_tweetsLabeled()
Ejemplo n.º 18
0
 def run(self):
     db_access = MongoDBUtils()
     output = self.lookup_user(screen_name='michael_sorano')
     print output
Ejemplo n.º 19
0
def main_subscriptionNgrams(typeOp, balanced):

    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH +
                                 "/subscriptionLists_balanced_train.csv",
                                 sep=",",
                                 dtype=str)
        test_data = pd.read_csv(DATASET_PATH +
                                "/subscriptionLists_balanced_test.csv",
                                sep=",",
                                dtype=str)
    else:
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp +
                                 "_subscriptionLists_train.csv",
                                 sep=",",
                                 dtype=str)
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp +
                                "_subscriptionLists_test.csv",
                                sep=",",
                                dtype=str)

    # Show the number of observations for the test and training dataframes
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)

    print 'Number of observations in the whole dataset:', len(df_complete)

    stopwords = getSpanishStopwords()

    count_vect = CountVectorizer(stop_words=stopwords,
                                 max_features=5000,
                                 ngram_range=(1, 3),
                                 token_pattern=r'\b\w+\b')
    X_train_counts = count_vect.fit_transform(train_data.subscriptionLists)
    # fit_transform() fits the model and learns the vocabulary; second, it transforms our training data
    # into feature vectors.

    ##To see occurrences of a specific word:
    #print count_vect.vocabulary_.get(u'amigos')

    train_data_features = X_train_counts.toarray()
    #print len(train_data) #186 users en train

    #print train_data_features.shape
    #(186, 500) --> It has 212 rows and 500 features (500 most frequent words).

    # Take a look at the words in the vocabulary
    vocab = count_vect.get_feature_names()
    #print vocab

    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)

    # For each, print the vocabulary word and the number of times it
    # appears in the training set
    #for tag, count in zip(vocab, dist):
    #	print count, tag

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################

    import ml_utils as ml_utils
    #PARAMETERS TUNING
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT:{'kernel': 'rbf', 'C': 10, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT: {'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 1}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 40, 'loss': 'log'}

    # ********* APLICO MODELOS Y LOS ENTRENO CON LA DATA EN TRAIN*********#

    print "Training the models..."

    # Initialize Multinomial Naive Bayes
    bayes = MultinomialNB()

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=120,
                                    max_depth=30,
                                    min_samples_leaf=1)
    # Fit the forest to the training set, using the bag of)

    svm = SVC(kernel='rbf', C=10, gamma=0.01)

    sgd = SGDClassifier(penalty='elasticnet',
                        alpha=0.0001,
                        n_iter=40,
                        loss='log')

    # Fit the forest to the training set, using the bag of words as
    # features and the age range as the response variable

    forest = forest.fit(train_data_features, train_data["age"])

    bayes = bayes.fit(train_data_features, train_data["age"])

    svm = svm.fit(train_data_features, train_data["age"])

    sgd = sgd.fit(train_data_features, train_data["age"])
    # Read the test data

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = count_vect.transform(test_data.subscriptionLists)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make age range predictions
    resultForest = forest.predict(test_data_features)

    resultBayes = bayes.predict(test_data_features)

    resultSVM = svm.predict(test_data_features)

    resultSGD = sgd.predict(test_data_features)

    outdir = time.strftime("%d-%m-%Y")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)

    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes
        })
    #print output

    # Use pandas to write the comma-separated output file
    outname = 'subscriptionLists_Bag_of_Words_ForestAndBayes.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################
    import ml_utils as ml_utils
    db_access = MongoDBUtils()

    ageRanges = []
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()

    target_names = ageRanges

    data = df_complete[['screen_name', 'subscriptionLists']]
    data = count_vect.fit_transform(data.subscriptionLists)
    y_complete = df_complete['age']

    name_prefix = 'subscriptionNgrams_' + typeOp + '_' + balanced

    #--------------
    ##BAYES
    #--------------
    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultBayes,
                                target_names=target_names)

    scores = cross_val_score(bayes,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ##RANDOM FOREST
    #--------------
    print "Metrics for Random Forest:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest,
                                   ageRanges, name_prefix, 'RandomForest',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultForest,
                                target_names=target_names)

    scores = cross_val_score(forest,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyRF = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ##SVM
    #--------------
    print "Metrics for SVM:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM,
                                   ageRanges, name_prefix, 'SVM', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSVM,
                                target_names=target_names)

    scores = cross_val_score(svm,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySVM = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ##SGD
    #--------------
    print "Metrics for SGD:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD,
                                   ageRanges, name_prefix, 'SGD', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSGD,
                                target_names=target_names)

    scores = cross_val_score(sgd,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySGD = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySGD
    #--------------
    ##OUTPUT
    #--------------

    result = "ACCURACY--> Bayes:", accuracyNB, "|RForest:", accuracyRF, "|SVM:", accuracySVM, "|SGD:", accuracySGD
    print result
    return result
Ejemplo n.º 20
0
def save_listSubscriptions(screen_name,lists):
    """Forward ``screen_name`` and ``lists`` to ``MongoDBUtils.save_listSubscriptions``.

    A new ``MongoDBUtils`` instance is created on every call; the function
    itself returns nothing.
    """
    MongoDBUtils().save_listSubscriptions(screen_name, lists)
Ejemplo n.º 21
0
 def save_user(self, data):
     """Hand ``data`` over to ``MongoDBUtils.save_user`` on a fresh instance.

     Exceptions raised by the Mongo helper are not caught here.
     """
     MongoDBUtils().save_user(data)
Ejemplo n.º 22
0
#!/usr/bin/python
# -*- coding: utf8 -*-
import os, sys
sys.path.append(os.path.abspath(os.pardir))

from configs.settings import *
from data_access.mongo_utils import MongoDBUtils
from nlp_features.customStopwords import generateCustomStopWordsForallAgeRanges

db_access = MongoDBUtils()

# Disabled exports. They used to be parked inside bare triple-quoted strings,
# the second of which was never closed (a SyntaxError); they are plain
# comments now. Uncomment the calls that are needed.
#
# db_access.export_tweetsText_toCSV('normal','')
# db_access.export_tweetsText_toCSV('normal','faceAPI')
#
# db_access.export_tweetsText_toCSV('pedophilia','')
# db_access.export_tweetsText_toCSV('pedophilia','faceAPI')
#
# db_access.export_subscriptionLists_toCSV('normal','')
# db_access.export_subscriptionLists_toCSV('normal','faceAPI')
#
# db_access.export_subscriptionLists_toCSV('pedophilia','')
# db_access.export_subscriptionLists_toCSV('pedophilia','faceAPI')

# The two *_balanced exports are the only calls that actually run.
db_access.export_tweetsText_toCSV_balanced()
db_access.export_subscriptionLists_toCSV_balanced()

# Disabled: one export per age range.
#
# ages=db_access.getAgeRanges()
#
# for age in ages:
#     db_access.export_tweetsTextFromAgeRange(age)
Ejemplo n.º 23
0
def set_age(screen_name,age):
    """Forward ``screen_name`` and ``age`` to ``MongoDBUtils.set_age_user``.

    A new ``MongoDBUtils`` instance is created on every call; the function
    itself returns nothing.
    """
    MongoDBUtils().set_age_user(screen_name, age)
Ejemplo n.º 24
0
    def run(self):
        db_access = MongoDBUtils()
        usersUnlabeled = db_access.get_unlabeled_users_with_age()
        cont = 0
        screen_names = ''

        for user_unlab in usersUnlabeled:
            age = user_unlab['ageRange']

            if not db_access.userExistsInDb(user_unlab['screen_name'].lower(),
                                            'users'):
                cont = cont + 1
                print user_unlab['screen_name']

                if screen_names == '':
                    screen_names = user_unlab['screen_name']
                else:
                    screen_names = screen_names + ',' + user_unlab[
                        'screen_name']

                if cont == 99:  #lookup_user: 100 requests every 15 min
                    print 1
                    output = self.lookup_user(screen_name=screen_names)

                    print "USUARIOS GUARDADOS EN BD:"
                    for user in output:
                        print user['screen_name']
                        try:
                            try:
                                userToSave = self.getUserTweets(user)
                            except Exception as e:
                                print "Usuario con perfil restringido"
                                userToSave = user

                            userToSave = self.populateOtherNetworks(userToSave)
                            userToSave = self.populate_mentions_hashtags_urls(
                                userToSave)

                            userToSave["age"] = db_access.getEdad(
                                userToSave['screen_name'], "unlabeled_users")
                            userToSave["exactAge"] = db_access.getExactAge(
                                userToSave['screen_name'])

                            print userToSave['screen_name']

                            try:
                                self.save_user(userToSave)
                                self.markUnlabeledAsLabeled(userToSave)
                            except pymongo.errors.DocumentTooLarge as e:
                                while True:
                                    print "********* Doc muy grande, eliminando 50 tweets..."
                                    try:
                                        userToSave['tweets'] = userToSave[
                                            'tweets'][:len(userToSave['tweets']
                                                           ) - 50]
                                        self.save_user(userToSave)
                                        self.markUnlabeledAsLabeled(userToSave)
                                        break
                                    except pymongo.errors.DocumentTooLarge as e:
                                        pass

                        except Exception as e:
                            print "Error al intentar guardar usuario: ", user[
                                "screen_name"]
                            print(e)
                            pass

                    screenName = ''
                    print "esperando"
                    time.sleep(900)
                    cont = 0
                    screen_names = ""

        if len(screen_names) != 0:
            output = self.lookup_user(screen_name=screen_names)
            print "USUARIOS GUARDADOS EN BD:"
            for user in output:
                print user['screen_name']
                try:
                    try:
                        userToSave = self.getUserTweets(user)
                    except Exception as e:
                        print "Usuario con perfil restringido"
                        userToSave = user

                    userToSave = self.populateOtherNetworks(userToSave)
                    userToSave = self.populate_mentions_hashtags_urls(
                        userToSave)
                    userToSave["age"] = db_access.getEdad(
                        userToSave['screen_name'], "unlabeled_users")
                    userToSave["exactAge"] = db_access.getExactAge(
                        userToSave['screen_name'])

                    print userToSave['screen_name']
                    userToSave['screen_name'] = user['screen_name'].lower()

                    try:
                        self.save_user(userToSave)
                        self.markUnlabeledAsLabeled(userToSave)
                    except pymongo.errors.DocumentTooLarge as e:
                        while True:
                            print "********* Doc muy grande, eliminando 50 tweets..."
                            try:
                                userToSave['tweets'] = userToSave[
                                    'tweets'][:len(userToSave['tweets']) - 50]
                                userToSave = self.populate_mentions_hashtags_urls(
                                    userToSave)
                                self.save_user(userToSave)
                                self.markUnlabeledAsLabeled(userToSave)
                                break
                            except pymongo.errors.DocumentTooLarge as e:
                                pass

                except Exception as e:
                    print "Error al intentar guardar usuario: ", user[
                        "screen_name"]
                    print(e)
                    pass
Ejemplo n.º 25
0
 def markUnlabeledAsLabeled(self, userUnlabeled):
     """Pass ``userUnlabeled['screen_name']`` to ``MongoDBUtils.markUnlabeledAsLabeled``.

     A new ``MongoDBUtils`` instance is created on every call.
     """
     MongoDBUtils().markUnlabeledAsLabeled(userUnlabeled['screen_name'])