Exemple #1
0
def main_wordcloudsFollowing():
    ##WORDCLOUD FOR EVERY AGE RANGE
    db_access = MongoDBUtils()
    ageRanges = db_access.getAgeRanges()
    stopwords = getSpanishStopwords()

    for ar in ageRanges:
        #Decode data
        df_subscription = pd.read_csv(DATASET_PATH + "/subscriptionLists_" +
                                      ar + ".csv",
                                      sep=",",
                                      dtype=str)

        text = ' '.join(df_subscription['subscriptionLists'])

        for stop in stopwords:
            stop = ' ' + stop.encode('utf-8') + ' '
            text = text.replace(stop, ' ').encode('utf-8', 'ignore')

        wordcloud = WordCloud(width=1600,
                              height=800).generate(text.decode("utf-8"))
        print "Dibujando wordcloud para ", ar, " ..."
        # Open a plot of the generated image.
        plt.figure(figsize=(20, 10), facecolor='k')
        plt.title('wordcloud subscription lists:' + ar)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig('wordcloud_subscriptions' + ar + ".png",
                    facecolor='k',
                    bbox_inches='tight')
Exemple #2
0
def convertToCategory(ageRanges, typeOp):
    """Map integer class positions back to their age-range labels.

    ageRanges -- iterable of positions into the label list (for example the
                 output of a classifier's predict()).
    typeOp    -- 'normal' takes the labels from MongoDBUtils.getAgeRanges();
                 any other value uses the fixed three-range list below.

    Returns a list holding the UTF-8 encoded label for every position, in
    input order.
    """
    if typeOp == 'normal':
        # The database helper is only needed for this branch, so it is not
        # created when the hard-coded ranges are used.
        ages = MongoDBUtils().getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']

    return [ages[ar].encode("utf-8") for ar in ageRanges]
Exemple #3
0
def convertToInt(ageRanges, typeOp):
    """Map age-range labels to their integer position in the label list.

    ageRanges -- iterable of age-range labels.
    typeOp    -- 'normal' takes the label list from
                 MongoDBUtils.getAgeRanges(); any other value uses the fixed
                 three-range list below.

    Returns a list with the position of every label, in input order.
    With the fixed list, an unknown label raises ValueError (list.index).
    """
    if typeOp == 'normal':
        # The database helper is only needed for this branch, so it is not
        # created when the hard-coded ranges are used.
        ages = MongoDBUtils().getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']

    return [ages.index(ar) for ar in ageRanges]
Exemple #4
0
def main_wordcloudsTweets():
    ##WORDCLOUD FOR EVERY AGE RANGE
    db_access = MongoDBUtils()
    ageRanges = db_access.getAgeRanges()
    #ageRanges=['50-64']
    stopwords = getCustomStopwords()
    stopwords.append(u'jajaja')
    stopwords.append(u'gracia')
    stopwords.append(u'asi')
    stopwords.append(u'via')
    stopwords.append(u'dia')
    stopwords.append(u'tambien')
    stopsAux = []
    for stop in stopwords:
        stopsAux.append(stop.encode('utf-8'))

    for ar in ageRanges:
        print ar
        #Decode data
        df_tweets = pd.read_csv(DATASET_PATH + "/tweets_" + ar + ".csv",
                                sep=",")

        text = ''
        for tw in df_tweets['tweets']:
            tw = tw.translate(None, string.punctuation)
            tw = tw.replace('¿', ' ')
            tw = tw.replace('¡', ' ')
            tw = tw.replace('á', 'a')
            tw = tw.replace('é', 'e')
            tw = tw.replace('í', 'i')
            tw = tw.replace('ó', 'o')
            tw = tw.replace('ú', 'u')
            # Replace all stop words from the tweet
            text += removeStopWords(tw, stopwords)

        text = removeStopWords(text, stopwords)

        wordcloud = WordCloud(width=1600,
                              height=800).generate(text.decode("utf-8"))
        print "Dibujando wordcloud para ", ar, " ..."

        # Open a plot of the generated image.
        plt.figure(figsize=(20, 10), facecolor='k')
        plt.title('wordcloud ages:' + ar)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig('wordcloud_' + ar + ".png",
                    facecolor='k',
                    bbox_inches='tight')
Exemple #5
0
def main_customFields(typeOp, balanced):

    if balanced == 'balanced':
        train_data = pd.read_csv(
            DATASET_PATH + "/tweets_balanced_train.csv", sep=",", dtype=str)[[
                'screen_name', 'friends_count', 'tweets_count', 'linkedin',
                'snapchat', 'instagram', 'facebook', 'followers_count',
                'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
                'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age'
            ]]
        test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv",
                                sep=",",
                                dtype=str)[[
                                    'screen_name', 'friends_count',
                                    'tweets_count', 'linkedin', 'snapchat',
                                    'instagram', 'facebook', 'followers_count',
                                    'favourites_count', 'qtyMentions',
                                    'qtyHashtags', 'qtyUrls', 'qtyEmojis',
                                    'qtyUppercase', 'profile_pic_gender', 'age'
                                ]]
    else:
        train_data = pd.read_csv(
            DATASET_PATH + "/" + typeOp + "_tweets_train.csv",
            sep=",",
            dtype=str)[[
                'screen_name', 'friends_count', 'tweets_count', 'linkedin',
                'snapchat', 'instagram', 'facebook', 'followers_count',
                'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
                'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age'
            ]]
        test_data = pd.read_csv(
            DATASET_PATH + "/" + typeOp + "_tweets_test.csv",
            sep=",",
            dtype=str)[[
                'screen_name', 'friends_count', 'tweets_count', 'linkedin',
                'snapchat', 'instagram', 'facebook', 'followers_count',
                'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls',
                'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age'
            ]]

    # Show the number of observations for the test and training dataframes
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)

    print 'Number of observations in the whole dataset:', len(df_complete)

    features = train_data.columns[1:(len(train_data.columns) - 1)]
    train_data_features = train_data[features]
    test_data_features = test_data[features]

    import ml_utils as ml_utils

    # convert age ranges into integers
    y = ml_utils.convertToInt(train_data['age'], typeOp)

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################

    import ml_utils as ml_utils
    #PARAMETERS TUNING
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT:{'n_estimators': 140, 'max_depth': 20, 'min_samples_leaf': 2}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT:{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 50, 'loss': 'log'}

    ########################################
    #******* MODEL TRAINING        *********
    ########################################
    print "Training the classifiers ..."

    forest = RandomForestClassifier(n_estimators=140,
                                    max_depth=20,
                                    min_samples_leaf=2)

    bayes = MultinomialNB()

    svm = SVC(kernel='rbf', C=8, gamma=0.01)

    sgd = SGDClassifier(loss='log',
                        penalty='l2',
                        random_state=42,
                        alpha=0.001,
                        n_iter=50)

    # Train the Classifier to take the training features and learn how they relate to the age
    forest.fit(train_data_features, y)

    bayes.fit(train_data_features, y)

    svm = svm.fit(train_data_features, y)

    sgd = sgd.fit(train_data_features, y)

    # Apply the Classifier we trained to the test data
    # Create actual english names for the ages for each predicted age range
    resultForest = ml_utils.convertToCategory(
        forest.predict(test_data_features), typeOp)

    resultBayes = ml_utils.convertToCategory(bayes.predict(test_data_features),
                                             typeOp)

    resultSVM = ml_utils.convertToCategory(svm.predict(test_data_features),
                                           typeOp)

    resultSGD = ml_utils.convertToCategory(sgd.predict(test_data_features),
                                           typeOp)

    # View the predicted probabilities of the first 10 observations
    forest.predict_proba(test_data_features)[0:10]

    outdir = time.strftime("%d-%m-%Y")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)

    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes,
            "ageSVM": resultSVM,
            "ageSGD": resultSGD
        })

    # Use pandas to write the comma-separated output file
    outname = 'tweets_customFields_results.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    # View a list of the features and their importance scores
    headers = ["name", "score"]
    print "Importance of Features: "  #, sorted(list(zip(train_data[features], forest.feature_importances_)), key=lambda x: x[1])

    values = sorted(zip(train_data_features, forest.feature_importances_),
                    key=lambda x: x[1] * -1)
    print tabulate(values, headers, tablefmt="plain")

    #############################################
    # EVALUATE THE MODEL
    #############################################
    print "Evaluating the model --> Calculating metrics ..."

    db_access = MongoDBUtils()

    ageRanges = []
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()

    target_names = ageRanges

    data = df_complete[[
        'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram',
        'facebook', 'followers_count', 'favourites_count', 'qtyMentions',
        'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase',
        'profile_pic_gender'
    ]]
    y_complete = ml_utils.convertToInt(df_complete['age'], typeOp)

    #--------------
    ##BAYES
    #--------------
    name_prefix = 'customFields_' + typeOp + '_' + balanced

    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultBayes,
                                target_names=target_names)

    scores = cross_val_score(bayes,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ##RANDOM FOREST
    #--------------
    print "Metrics for Random Forest:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest,
                                   ageRanges, name_prefix, 'RandomForest',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultForest,
                                target_names=target_names)

    scores = cross_val_score(forest,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyRF = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ##SVM
    #--------------
    print "Metrics for SVM:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM,
                                   ageRanges, name_prefix, 'SVM', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSVM,
                                target_names=target_names)

    scores = cross_val_score(svm,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySVM = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ##SGD
    #--------------
    print "Metrics for SGD:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD,
                                   ageRanges, name_prefix, 'SGD', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSGD,
                                target_names=target_names)

    scores = cross_val_score(sgd,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySGD = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySGD

    #--------------
    ##OUTPUT
    #--------------
    result = "ACCURACY--> N.Bayes:", accuracyNB, "|RForest:", accuracyRF, "|SVM:", accuracySVM, "|SGD:", accuracySGD
    print result
    return result  # Copy the results to a pandas dataframe
    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes
        })
Exemple #6
0
def main_tweetNgramsAndCustomFields(typeOp,balanced):
	
	if balanced == 'balanced':
		train_data=pd.read_csv(DATASET_PATH+"/tweets_balanced_train.csv", sep=",",dtype=str)
		test_data=pd.read_csv(DATASET_PATH+"/tweets_balanced_test.csv", sep=",",dtype=str)
	else:
		train_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_train.csv", sep=",",dtype=str)
		test_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_test.csv", sep=",",dtype=str)

	# Show the number of observations for the test and training dataframes
	print 'Number of observations in the training data:', len(train_data)
	print 'Number of observations in the test data:',len(test_data)

	frames = [train_data, test_data]
	df_complete= pd.concat(frames)

	print 'Number of observations in the whole dataset:',len(df_complete)
	
	stopwords = getCustomStopwords()

	transformer_tfidf = TfidfVectorizer(smooth_idf=False,lowercase=False,stop_words=stopwords,max_features=5000, ngram_range=(1,3))
	tfidf = transformer_tfidf.fit_transform(train_data.tweets)
	'''
	headers = ["name", "score"]
	idf = transformer_tfidf.idf_
	print "Most frequent TFIDF terms in dataset: "
	valuesTfIdf = sorted(zip(idf,transformer_tfidf.get_feature_names()), key=lambda x: x[0])
	print(tabulate(valuesTfIdf, headers, tablefmt="plain"))
	'''
	# fit_transform() fits the model and learns the vocabulary; second, it transforms our training data
	# into feature vectors. 

	##To see occurrences of a specific word:
	#print count_vect.vocabulary_.get(u'amigos')

	train_data_feat = tfidf.toarray()
	#print len(train_data) #186 users en train
	
	train_data_features = np.c_[train_data_feat, train_data['friends_count'],train_data['tweets_count'], train_data['linkedin'],train_data['snapchat'], train_data['instagram'],train_data['facebook'],train_data['followers_count'],train_data['favourites_count'],train_data['qtyMentions'],train_data['qtyHashtags'],train_data['qtyUrls'], train_data['qtyEmojis'], train_data['qtyUppercase'],train_data['profile_pic_gender']]
	
	# Sum up the counts of each vocabulary word
	dist = np.sum(train_data_features, axis=0)
	
	# Sum up the counts of each vocabulary word
	dist = np.sum(train_data_features, axis=0)

	# For each, print the vocabulary word and the number of times it 
	# appears in the training set
	#for tag, count in zip(vocab, dist):
	#	print count, tag

	########################################
	#******* HYPERPARAMETER TUNING *********
	########################################
	
	import ml_utils as ml_utils
	#PARAMETERS TUNING
	#print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01}
	#print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT:{'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 3}
	#print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT:{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 40, 'loss': 'log'}

	########################################
	#******* MODEL TRAINING        *********
	########################################

	print "Training the models..."

	# Initialize Multinomial Naive Bayes
	bayes = MultinomialNB()

	# Initialize a Random Forest classifier with 100 trees
	forest = RandomForestClassifier(n_estimators = 120, max_depth= 30, min_samples_leaf= 3) 
	# Fit the forest to the training set, using the bag of)

	svm = SVC(kernel='rbf', C= 8, gamma =  0.01)

	sgd = SGDClassifier(loss='log', penalty='l2', alpha=0.001,n_iter=40)

	# Fit the forest to the training set, using the bag of words as 
	# features and the age range as the response variable

	forest = forest.fit( train_data_features, train_data["age"] ) 

	bayes = bayes.fit( train_data_features, train_data["age"] ) 

	svm = svm.fit(train_data_features, train_data["age"] ) 

	sgd= sgd.fit(train_data_features, train_data["age"] ) 

	# Read the test data

	# Get a bag of words for the test set, and convert to a numpy array
	test_data_feat = transformer_tfidf.transform(test_data.tweets)
	test_data_feat = test_data_feat.toarray()

	test_data_features = np.c_[test_data_feat, test_data['friends_count'],test_data['tweets_count'], test_data['linkedin'],test_data['snapchat'], test_data['instagram'],test_data['facebook'],test_data['followers_count'],test_data['favourites_count'],test_data['qtyMentions'],test_data['qtyHashtags'],test_data['qtyUrls'], test_data['qtyEmojis'], test_data['qtyUppercase'],test_data['profile_pic_gender']]

	# Use the random forest to make age range predictions
	resultForest = forest.predict(test_data_features)

	resultBayes = bayes.predict(test_data_features)

	resultSVM= svm.predict(test_data_features)

	resultSGD= sgd.predict(test_data_features)

	outdir =time.strftime("%d-%m-%Y")
	
	if not os.path.exists(outdir):
   		os.mkdir(outdir)

   	if not os.path.exists(outdir +"/"+typeOp):
   		os.mkdir(outdir +"/"+typeOp)

   	outdir=outdir +"/"+typeOp

	# Copy the results to a pandas dataframe with an "id" column and
	# a "age" column

	output = pd.DataFrame( data={"id":test_data["screen_name"], "realAge":test_data["age"], "ageRandomForest":resultForest,"ageNaiveBayes":resultBayes})
	#print output

	# Use pandas to write the comma-separated output file
	outname = 'Bigram_model_ForestAndBayes.csv'
	fullname = os.path.join(outdir, outname)    
	output.to_csv(fullname,index=False)

	###################################
	#******* MODEL EVALUATION *********
	###################################
	
	import ml_utils as ml_utils
	db_access = MongoDBUtils()

	ageRanges=[]
	if typeOp=='normal':
   		ageRanges=db_access.getAgeRanges()
   	else:
		ageRanges=db_access.get3AgeRanges()

	target_names=ageRanges


	data_aux = transformer_tfidf.fit_transform(df_complete.tweets)
	data_aux = data_aux.toarray()
	data = np.c_[data_aux, df_complete['friends_count'],df_complete['tweets_count'], df_complete['linkedin'],df_complete['snapchat'], df_complete['instagram'],df_complete['facebook'],df_complete['followers_count'],df_complete['favourites_count'],df_complete['qtyMentions'],df_complete['qtyHashtags'],df_complete['qtyUrls'], df_complete['qtyEmojis'], df_complete['qtyUppercase'],df_complete['profile_pic_gender']]
	
	y_complete = df_complete['age']

	name_prefix='tweetNgramsAndCustomFields_'+typeOp+'_'+balanced
	print data.shape
	#--------------
	##BAYES
	#--------------
	print "Metrics for Naive Bayes:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultBayes,ageRanges,name_prefix,'NaiveBayes',outdir)
	print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names)

	scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracyNB = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracyNB

	#--------------
	##RANDOM FOREST
	#--------------
	print "Metrics for Random Forest:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultForest,ageRanges,name_prefix,'RandomForest',outdir)
	print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names)
	
	scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracyRF = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracyRF 

	#--------------
	##SVM
	#--------------
	print "Metrics for SVM:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSVM,ageRanges,name_prefix,'SVM',outdir)
	print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names)
	
	scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracySVM = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracySVM

	#--------------
	##SGD
	#--------------
	print "Metrics for SGD:"
	ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSGD,ageRanges,name_prefix,'SGD',outdir)
	print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names)

	scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score))
	accuracySGD = round(scores.mean(),2)	
	print "10-Fold Accuracy: ", accuracySGD 
	#--------------
	##OUTPUT
	#--------------
	result= "ACCURACY--> N.Bayes:",accuracyNB,"|RForest:", accuracyRF,"|SVM:", accuracySVM,"|SGD:", accuracySGD
	print result
	return result	
Exemple #7
0
def main_subscriptionNgrams(typeOp, balanced):

    if balanced == 'balanced':
        train_data = pd.read_csv(DATASET_PATH +
                                 "/subscriptionLists_balanced_train.csv",
                                 sep=",",
                                 dtype=str)
        test_data = pd.read_csv(DATASET_PATH +
                                "/subscriptionLists_balanced_test.csv",
                                sep=",",
                                dtype=str)
    else:
        train_data = pd.read_csv(DATASET_PATH + "/" + typeOp +
                                 "_subscriptionLists_train.csv",
                                 sep=",",
                                 dtype=str)
        test_data = pd.read_csv(DATASET_PATH + "/" + typeOp +
                                "_subscriptionLists_test.csv",
                                sep=",",
                                dtype=str)

    # Show the number of observations for the test and training dataframes
    print 'Number of observations in the training data:', len(train_data)
    print 'Number of observations in the test data:', len(test_data)

    frames = [train_data, test_data]
    df_complete = pd.concat(frames)

    print 'Number of observations in the whole dataset:', len(df_complete)

    stopwords = getSpanishStopwords()

    count_vect = CountVectorizer(stop_words=stopwords,
                                 max_features=5000,
                                 ngram_range=(1, 3),
                                 token_pattern=r'\b\w+\b')
    X_train_counts = count_vect.fit_transform(train_data.subscriptionLists)
    # fit_transform() fits the model and learns the vocabulary; second, it transforms our training data
    # into feature vectors.

    ##To see occurrences of a specific word:
    #print count_vect.vocabulary_.get(u'amigos')

    train_data_features = X_train_counts.toarray()
    #print len(train_data) #186 users en train

    #print train_data_features.shape
    #(186, 500) --> It has 212 rows and 500 features (500 most frequent words).

    # Take a look at the words in the vocabulary
    vocab = count_vect.get_feature_names()
    #print vocab

    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)

    # For each, print the vocabulary word and the number of times it
    # appears in the training set
    #for tag, count in zip(vocab, dist):
    #	print count, tag

    ########################################
    #******* HYPERPARAMETER TUNING *********
    ########################################

    import ml_utils as ml_utils
    #PARAMETERS TUNING
    #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT:{'kernel': 'rbf', 'C': 10, 'gamma': 0.01}
    #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT: {'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 1}
    #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 40, 'loss': 'log'}

    # ********* APLICO MODELOS Y LOS ENTRENO CON LA DATA EN TRAIN*********#

    print "Training the models..."

    # Initialize Multinomial Naive Bayes
    bayes = MultinomialNB()

    # Initialize a Random Forest classifier with 100 trees
    forest = RandomForestClassifier(n_estimators=120,
                                    max_depth=30,
                                    min_samples_leaf=1)
    # Fit the forest to the training set, using the bag of)

    svm = SVC(kernel='rbf', C=10, gamma=0.01)

    sgd = SGDClassifier(penalty='elasticnet',
                        alpha=0.0001,
                        n_iter=40,
                        loss='log')

    # Fit the forest to the training set, using the bag of words as
    # features and the age range as the response variable

    forest = forest.fit(train_data_features, train_data["age"])

    bayes = bayes.fit(train_data_features, train_data["age"])

    svm = svm.fit(train_data_features, train_data["age"])

    sgd = sgd.fit(train_data_features, train_data["age"])
    # Read the test data

    # Get a bag of words for the test set, and convert to a numpy array
    test_data_features = count_vect.transform(test_data.subscriptionLists)
    test_data_features = test_data_features.toarray()

    # Use the random forest to make age range predictions
    resultForest = forest.predict(test_data_features)

    resultBayes = bayes.predict(test_data_features)

    resultSVM = svm.predict(test_data_features)

    resultSGD = sgd.predict(test_data_features)

    outdir = time.strftime("%d-%m-%Y")

    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not os.path.exists(outdir + "/" + typeOp):
        os.mkdir(outdir + "/" + typeOp)

    outdir = outdir + "/" + typeOp

    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": resultForest,
            "ageNaiveBayes": resultBayes
        })
    #print output

    # Use pandas to write the comma-separated output file
    outname = 'subscriptionLists_Bag_of_Words_ForestAndBayes.csv'
    fullname = os.path.join(outdir, outname)
    output.to_csv(fullname, index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################
    import ml_utils as ml_utils
    db_access = MongoDBUtils()

    ageRanges = []
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()

    target_names = ageRanges

    data = df_complete[['screen_name', 'subscriptionLists']]
    data = count_vect.fit_transform(data.subscriptionLists)
    y_complete = df_complete['age']

    name_prefix = 'subscriptionNgrams_' + typeOp + '_' + balanced

    #--------------
    ##BAYES
    #--------------
    print "Metrics for Naive Bayes:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes,
                                   ageRanges, name_prefix, 'NaiveBayes',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultBayes,
                                target_names=target_names)

    scores = cross_val_score(bayes,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyNB = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyNB

    #--------------
    ##RANDOM FOREST
    #--------------
    print "Metrics for Random Forest:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest,
                                   ageRanges, name_prefix, 'RandomForest',
                                   outdir)
    print classification_report(test_data['age'].tolist(),
                                resultForest,
                                target_names=target_names)

    scores = cross_val_score(forest,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracyRF = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracyRF

    #--------------
    ##SVM
    #--------------
    print "Metrics for SVM:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM,
                                   ageRanges, name_prefix, 'SVM', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSVM,
                                target_names=target_names)

    scores = cross_val_score(svm,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySVM = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySVM

    #--------------
    ##SGD
    #--------------
    print "Metrics for SGD:"
    ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD,
                                   ageRanges, name_prefix, 'SGD', outdir)
    print classification_report(test_data['age'].tolist(),
                                resultSGD,
                                target_names=target_names)

    scores = cross_val_score(sgd,
                             data,
                             y_complete,
                             cv=StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=5),
                             scoring=make_scorer(accuracy_score))
    accuracySGD = round(scores.mean(), 2)
    print "10-Fold Accuracy: ", accuracySGD
    #--------------
    ##OUTPUT
    #--------------

    result = "ACCURACY--> Bayes:", accuracyNB, "|RForest:", accuracyRF, "|SVM:", accuracySVM, "|SGD:", accuracySGD
    print result
    return result
Exemple #8
0
def main_tweetNgrams(typeOp, balanced):
    """Train and evaluate age-range classifiers on TF-IDF n-grams of tweets.

    Loads a train/test split of tweets, vectorizes the text as TF-IDF
    weighted 1- to 3-grams, fits four classifiers (Multinomial Naive Bayes,
    Random Forest, SVM, SGD), writes their test-set predictions to
    ``<dd-mm-YYYY>/<typeOp>/tweets_ngrams_results.csv`` and reports a
    confusion matrix, a classification report and a stratified 10-fold
    accuracy for each of them.

    Args:
        typeOp: Dataset variant. Used as the file-name prefix of the
            non-balanced datasets and as the output sub-directory name;
            'normal' selects ``getAgeRanges()``, anything else selects
            ``get3AgeRanges()`` for the class labels.
        balanced: 'balanced' loads ``tweets_balanced_{train,test}.csv``;
            any other value loads ``<typeOp>_faceAPI_tweets_{train,test}.csv``.

    Returns:
        A tuple alternating labels and rounded 10-fold accuracies:
        ("ACCURACY--> N.Bayes:", nb, "|RForest:", rf, "|SVM:", svm,
        "|SGD:", sgd).
    """
    # Local import, as in the original code (the module is only needed here).
    import ml_utils

    columns = ['screen_name', 'tweets', 'age']
    if balanced == 'balanced':
        train_path = DATASET_PATH + "/tweets_balanced_train.csv"
        test_path = DATASET_PATH + "/tweets_balanced_test.csv"
    else:
        # EXPERIMENT 4: datasets whose file names carry the "_faceAPI" tag.
        train_path = DATASET_PATH + "/" + typeOp + "_faceAPI_tweets_train.csv"
        test_path = DATASET_PATH + "/" + typeOp + "_faceAPI_tweets_test.csv"

    # Everything is read as str, so 'age' holds class labels, not numbers.
    train_data = pd.read_csv(train_path, sep=",", dtype=str)[columns]
    test_data = pd.read_csv(test_path, sep=",", dtype=str)[columns]

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('Number of observations in the training data: %d' % len(train_data))
    print('Number of observations in the test data: %d' % len(test_data))

    df_complete = pd.concat([train_data, test_data])
    print('Number of observations in the whole dataset: %d' % len(df_complete))

    # Custom stopword list: scikit-learn only ships an English one.
    stopwords = getCustomStopwords()

    transformer_tfidf = TfidfVectorizer(smooth_idf=False,
                                        lowercase=False,
                                        stop_words=stopwords,
                                        max_features=5000,
                                        ngram_range=(1, 3))
    # fit_transform() learns the vocabulary from the training tweets and
    # turns them into feature vectors; the test tweets are only transformed.
    train_data_features = transformer_tfidf.fit_transform(
        train_data.tweets).toarray()
    test_data_features = transformer_tfidf.transform(
        test_data.tweets).toarray()

    ########################################
    #******* MODEL TRAINING        *********
    ########################################
    # Results previously reported by the ml_utils.*_param_selection helpers
    # on the training features:
    #   SVM:          {'kernel': 'rbf', 'C': 10, 'gamma': 0.1}
    #   RandomForest: {'n_estimators': 160, 'max_depth': 20,
    #                  'min_samples_leaf': 3}
    #   SGD:          {'penalty': 'elasticnet', 'alpha': 0.0001,
    #                  'n_iter': 50, 'loss': 'log'}
    # NOTE(review): the SGD settings below differ from that tuning result
    # (l2 penalty, 60 iterations) -- kept as found; confirm which is wanted.
    # Each entry: (report heading, tag used in output file names, estimator).
    classifiers = [
        ("Naive Bayes", "NaiveBayes", MultinomialNB()),
        ("Random Forest", "RandomForest",
         RandomForestClassifier(n_estimators=160,
                                max_depth=20,
                                min_samples_leaf=3)),
        ("SVM", "SVM", SVC(kernel='rbf', C=10, gamma=0.1)),
        ("SGD", "SGD",
         SGDClassifier(loss='log',
                       penalty='l2',
                       random_state=42,
                       alpha=0.0001,
                       n_iter=60)),
    ]

    print("Training the Classifiers...")

    # Fit every classifier on the training split and predict the test split.
    predictions = {}
    for _, tag, classifier in classifiers:
        classifier.fit(train_data_features, train_data["age"])
        predictions[tag] = classifier.predict(test_data_features)

    # Results go to <dd-mm-YYYY>/<typeOp>/, created on demand.
    outdir = time.strftime("%d-%m-%Y") + "/" + typeOp
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    output = pd.DataFrame(
        data={
            "id": test_data["screen_name"],
            "realAge": test_data["age"],
            "ageRandomForest": predictions["RandomForest"],
            "ageNaiveBayes": predictions["NaiveBayes"],
            "ageSVM": predictions["SVM"],
            "ageSGD": predictions["SGD"]
        })

    # Use pandas to write the comma-separated output file
    output.to_csv(os.path.join(outdir, 'tweets_ngrams_results.csv'),
                  index=False)

    ###################################
    #******* MODEL EVALUATION *********
    ###################################

    print("Evaluating the model --> Calculating metrics ...")

    db_access = MongoDBUtils()
    if typeOp == 'normal':
        ageRanges = db_access.getAgeRanges()
    else:
        ageRanges = db_access.get3AgeRanges()
    target_names = ageRanges

    # Cross-validation runs over train + test together, so the vectorizer
    # is refitted on the whole dataset.
    # NOTE(review): the vocabulary/idf are learned before the folds are
    # split, so every fold has seen the held-out text's statistics.
    data = transformer_tfidf.fit_transform(df_complete.tweets)
    y_complete = df_complete['age']

    name_prefix = 'tweetNgrams_' + typeOp + '_' + balanced

    accuracies = {}
    for heading, tag, classifier in classifiers:
        print("Metrics for %s:" % heading)
        ml_utils.createConfusionMatrix(test_data['age'].tolist(),
                                       predictions[tag], ageRanges,
                                       name_prefix, tag, outdir)
        print(classification_report(test_data['age'].tolist(),
                                    predictions[tag],
                                    target_names=target_names))

        # Same fold definition for every model so accuracies are comparable.
        scores = cross_val_score(classifier,
                                 data,
                                 y_complete,
                                 cv=StratifiedKFold(n_splits=10,
                                                    shuffle=True,
                                                    random_state=5),
                                 scoring=make_scorer(accuracy_score))
        accuracies[tag] = round(scores.mean(), 2)
        print("10-Fold Accuracy:  %s" % accuracies[tag])

    #--------------
    ##OUTPUT
    #--------------
    result = ("ACCURACY--> N.Bayes:", accuracies["NaiveBayes"],
              "|RForest:", accuracies["RandomForest"],
              "|SVM:", accuracies["SVM"],
              "|SGD:", accuracies["SGD"])
    print(result)
    return result