def main_wordcloudsFollowing(): ##WORDCLOUD FOR EVERY AGE RANGE db_access = MongoDBUtils() ageRanges = db_access.getAgeRanges() stopwords = getSpanishStopwords() for ar in ageRanges: #Decode data df_subscription = pd.read_csv(DATASET_PATH + "/subscriptionLists_" + ar + ".csv", sep=",", dtype=str) text = ' '.join(df_subscription['subscriptionLists']) for stop in stopwords: stop = ' ' + stop.encode('utf-8') + ' ' text = text.replace(stop, ' ').encode('utf-8', 'ignore') wordcloud = WordCloud(width=1600, height=800).generate(text.decode("utf-8")) print "Dibujando wordcloud para ", ar, " ..." # Open a plot of the generated image. plt.figure(figsize=(20, 10), facecolor='k') plt.title('wordcloud subscription lists:' + ar) plt.imshow(wordcloud) plt.axis("off") plt.tight_layout(pad=0) plt.savefig('wordcloud_subscriptions' + ar + ".png", facecolor='k', bbox_inches='tight')
def convertToCategory(ageRanges, typeOp):
    """Translate class indices into UTF-8 encoded age-range labels.

    Args:
        ageRanges: iterable of integer positions (e.g. classifier
            predictions) into the list of age-range labels.
        typeOp: 'normal' looks the labels up through
            ``MongoDBUtils().getAgeRanges()``; any other value uses the fixed
            three-range split ``['10-17', '18-24', '25-xx']``.

    Returns:
        list with one encoded label per input index, in input order.
    """
    if typeOp == 'normal':
        # The database is only needed for the 'normal' label set, so the
        # connection is not opened for the static three-range split.
        ages = MongoDBUtils().getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']
    return [ages[ar].encode("utf-8") for ar in ageRanges]
def convertToInt(ageRanges, typeOp):
    """Translate age-range labels into their integer class index.

    Args:
        ageRanges: iterable of age-range labels (for example a pandas column).
        typeOp: 'normal' takes the label order from
            ``MongoDBUtils().getAgeRanges()``; any other value uses the fixed
            three-range split ``['10-17', '18-24', '25-xx']``.

    Returns:
        list with the position of every label inside the label list, in
        input order.

    Raises:
        ValueError: if a label is not part of the three-range list (raised by
            ``list.index``).
    """
    if typeOp == 'normal':
        # The database is only needed for the 'normal' label set, so the
        # connection is not opened for the static three-range split.
        ages = MongoDBUtils().getAgeRanges()
    else:
        ages = ['10-17', '18-24', '25-xx']
    return [ages.index(ar) for ar in ageRanges]
def main_wordcloudsTweets(): ##WORDCLOUD FOR EVERY AGE RANGE db_access = MongoDBUtils() ageRanges = db_access.getAgeRanges() #ageRanges=['50-64'] stopwords = getCustomStopwords() stopwords.append(u'jajaja') stopwords.append(u'gracia') stopwords.append(u'asi') stopwords.append(u'via') stopwords.append(u'dia') stopwords.append(u'tambien') stopsAux = [] for stop in stopwords: stopsAux.append(stop.encode('utf-8')) for ar in ageRanges: print ar #Decode data df_tweets = pd.read_csv(DATASET_PATH + "/tweets_" + ar + ".csv", sep=",") text = '' for tw in df_tweets['tweets']: tw = tw.translate(None, string.punctuation) tw = tw.replace('¿', ' ') tw = tw.replace('¡', ' ') tw = tw.replace('á', 'a') tw = tw.replace('é', 'e') tw = tw.replace('í', 'i') tw = tw.replace('ó', 'o') tw = tw.replace('ú', 'u') # Replace all stop words from the tweet text += removeStopWords(tw, stopwords) text = removeStopWords(text, stopwords) wordcloud = WordCloud(width=1600, height=800).generate(text.decode("utf-8")) print "Dibujando wordcloud para ", ar, " ..." # Open a plot of the generated image. plt.figure(figsize=(20, 10), facecolor='k') plt.title('wordcloud ages:' + ar) plt.imshow(wordcloud) plt.axis("off") plt.tight_layout(pad=0) plt.savefig('wordcloud_' + ar + ".png", facecolor='k', bbox_inches='tight')
def main_customFields(typeOp, balanced): if balanced == 'balanced': train_data = pd.read_csv( DATASET_PATH + "/tweets_balanced_train.csv", sep=",", dtype=str)[[ 'screen_name', 'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram', 'facebook', 'followers_count', 'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age' ]] test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv", sep=",", dtype=str)[[ 'screen_name', 'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram', 'facebook', 'followers_count', 'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age' ]] else: train_data = pd.read_csv( DATASET_PATH + "/" + typeOp + "_tweets_train.csv", sep=",", dtype=str)[[ 'screen_name', 'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram', 'facebook', 'followers_count', 'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age' ]] test_data = pd.read_csv( DATASET_PATH + "/" + typeOp + "_tweets_test.csv", sep=",", dtype=str)[[ 'screen_name', 'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram', 'facebook', 'followers_count', 'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase', 'profile_pic_gender', 'age' ]] # Show the number of observations for the test and training dataframes print 'Number of observations in the training data:', len(train_data) print 'Number of observations in the test data:', len(test_data) frames = [train_data, test_data] df_complete = pd.concat(frames) print 'Number of observations in the whole dataset:', len(df_complete) features = train_data.columns[1:(len(train_data.columns) - 1)] train_data_features = train_data[features] test_data_features = test_data[features] import ml_utils as ml_utils # convert age ranges into integers y = ml_utils.convertToInt(train_data['age'], 
typeOp) ######################################## #******* HYPERPARAMETER TUNING ********* ######################################## import ml_utils as ml_utils #PARAMETERS TUNING #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01} #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT:{'n_estimators': 140, 'max_depth': 20, 'min_samples_leaf': 2} #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT:{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 50, 'loss': 'log'} ######################################## #******* MODEL TRAINING ********* ######################################## print "Training the classifiers ..." forest = RandomForestClassifier(n_estimators=140, max_depth=20, min_samples_leaf=2) bayes = MultinomialNB() svm = SVC(kernel='rbf', C=8, gamma=0.01) sgd = SGDClassifier(loss='log', penalty='l2', random_state=42, alpha=0.001, n_iter=50) # Train the Classifier to take the training features and learn how they relate to the age forest.fit(train_data_features, y) bayes.fit(train_data_features, y) svm = svm.fit(train_data_features, y) sgd = sgd.fit(train_data_features, y) # Apply the Classifier we trained to the test data # Create actual english names for the ages for each predicted age range resultForest = ml_utils.convertToCategory( forest.predict(test_data_features), typeOp) resultBayes = ml_utils.convertToCategory(bayes.predict(test_data_features), typeOp) resultSVM = ml_utils.convertToCategory(svm.predict(test_data_features), typeOp) resultSGD = ml_utils.convertToCategory(sgd.predict(test_data_features), typeOp) # View the predicted probabilities of the first 10 observations forest.predict_proba(test_data_features)[0:10] outdir = time.strftime("%d-%m-%Y") if not os.path.exists(outdir): os.mkdir(outdir) if not os.path.exists(outdir + "/" + typeOp): os.mkdir(outdir + "/" + typeOp) outdir = outdir + "/" + typeOp output = 
pd.DataFrame( data={ "id": test_data["screen_name"], "realAge": test_data["age"], "ageRandomForest": resultForest, "ageNaiveBayes": resultBayes, "ageSVM": resultSVM, "ageSGD": resultSGD }) # Use pandas to write the comma-separated output file outname = 'tweets_customFields_results.csv' fullname = os.path.join(outdir, outname) output.to_csv(fullname, index=False) # View a list of the features and their importance scores headers = ["name", "score"] print "Importance of Features: " #, sorted(list(zip(train_data[features], forest.feature_importances_)), key=lambda x: x[1]) values = sorted(zip(train_data_features, forest.feature_importances_), key=lambda x: x[1] * -1) print tabulate(values, headers, tablefmt="plain") ############################################# # EVALUATE THE MODEL ############################################# print "Evaluating the model --> Calculating metrics ..." db_access = MongoDBUtils() ageRanges = [] if typeOp == 'normal': ageRanges = db_access.getAgeRanges() else: ageRanges = db_access.get3AgeRanges() target_names = ageRanges data = df_complete[[ 'friends_count', 'tweets_count', 'linkedin', 'snapchat', 'instagram', 'facebook', 'followers_count', 'favourites_count', 'qtyMentions', 'qtyHashtags', 'qtyUrls', 'qtyEmojis', 'qtyUppercase', 'profile_pic_gender' ]] y_complete = ml_utils.convertToInt(df_complete['age'], typeOp) #-------------- ##BAYES #-------------- name_prefix = 'customFields_' + typeOp + '_' + balanced print "Metrics for Naive Bayes:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes, ageRanges, name_prefix, 'NaiveBayes', outdir) print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names) scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracyNB = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracyNB #-------------- ##RANDOM FOREST #-------------- print "Metrics for 
Random Forest:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest, ageRanges, name_prefix, 'RandomForest', outdir) print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names) scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracyRF = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracyRF #-------------- ##SVM #-------------- print "Metrics for SVM:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM, ageRanges, name_prefix, 'SVM', outdir) print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names) scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracySVM = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracySVM #-------------- ##SGD #-------------- print "Metrics for SGD:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD, ageRanges, name_prefix, 'SGD', outdir) print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names) scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracySGD = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracySGD #-------------- ##OUTPUT #-------------- result = "ACCURACY--> N.Bayes:", accuracyNB, "|RForest:", accuracyRF, "|SVM:", accuracySVM, "|SGD:", accuracySGD print result return result # Copy the results to a pandas dataframe output = pd.DataFrame( data={ "id": test_data["screen_name"], "realAge": test_data["age"], "ageRandomForest": resultForest, "ageNaiveBayes": resultBayes })
def main_tweetNgramsAndCustomFields(typeOp,balanced): if balanced == 'balanced': train_data=pd.read_csv(DATASET_PATH+"/tweets_balanced_train.csv", sep=",",dtype=str) test_data=pd.read_csv(DATASET_PATH+"/tweets_balanced_test.csv", sep=",",dtype=str) else: train_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_train.csv", sep=",",dtype=str) test_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_test.csv", sep=",",dtype=str) # Show the number of observations for the test and training dataframes print 'Number of observations in the training data:', len(train_data) print 'Number of observations in the test data:',len(test_data) frames = [train_data, test_data] df_complete= pd.concat(frames) print 'Number of observations in the whole dataset:',len(df_complete) stopwords = getCustomStopwords() transformer_tfidf = TfidfVectorizer(smooth_idf=False,lowercase=False,stop_words=stopwords,max_features=5000, ngram_range=(1,3)) tfidf = transformer_tfidf.fit_transform(train_data.tweets) ''' headers = ["name", "score"] idf = transformer_tfidf.idf_ print "Most frequent TFIDF terms in dataset: " valuesTfIdf = sorted(zip(idf,transformer_tfidf.get_feature_names()), key=lambda x: x[0]) print(tabulate(valuesTfIdf, headers, tablefmt="plain")) ''' # fit_transform() fits the model and learns the vocabulary; second, it transforms our training data # into feature vectors. 
##To see occurrences of a specific word: #print count_vect.vocabulary_.get(u'amigos') train_data_feat = tfidf.toarray() #print len(train_data) #186 users en train train_data_features = np.c_[train_data_feat, train_data['friends_count'],train_data['tweets_count'], train_data['linkedin'],train_data['snapchat'], train_data['instagram'],train_data['facebook'],train_data['followers_count'],train_data['favourites_count'],train_data['qtyMentions'],train_data['qtyHashtags'],train_data['qtyUrls'], train_data['qtyEmojis'], train_data['qtyUppercase'],train_data['profile_pic_gender']] # Sum up the counts of each vocabulary word dist = np.sum(train_data_features, axis=0) # Sum up the counts of each vocabulary word dist = np.sum(train_data_features, axis=0) # For each, print the vocabulary word and the number of times it # appears in the training set #for tag, count in zip(vocab, dist): # print count, tag ######################################## #******* HYPERPARAMETER TUNING ********* ######################################## import ml_utils as ml_utils #PARAMETERS TUNING #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 8, 'gamma': 0.01} #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT:{'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 3} #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT:{'penalty': 'l2', 'alpha': 0.001, 'n_iter': 40, 'loss': 'log'} ######################################## #******* MODEL TRAINING ********* ######################################## print "Training the models..." 
# Initialize Multinomial Naive Bayes bayes = MultinomialNB() # Initialize a Random Forest classifier with 100 trees forest = RandomForestClassifier(n_estimators = 120, max_depth= 30, min_samples_leaf= 3) # Fit the forest to the training set, using the bag of) svm = SVC(kernel='rbf', C= 8, gamma = 0.01) sgd = SGDClassifier(loss='log', penalty='l2', alpha=0.001,n_iter=40) # Fit the forest to the training set, using the bag of words as # features and the age range as the response variable forest = forest.fit( train_data_features, train_data["age"] ) bayes = bayes.fit( train_data_features, train_data["age"] ) svm = svm.fit(train_data_features, train_data["age"] ) sgd= sgd.fit(train_data_features, train_data["age"] ) # Read the test data # Get a bag of words for the test set, and convert to a numpy array test_data_feat = transformer_tfidf.transform(test_data.tweets) test_data_feat = test_data_feat.toarray() test_data_features = np.c_[test_data_feat, test_data['friends_count'],test_data['tweets_count'], test_data['linkedin'],test_data['snapchat'], test_data['instagram'],test_data['facebook'],test_data['followers_count'],test_data['favourites_count'],test_data['qtyMentions'],test_data['qtyHashtags'],test_data['qtyUrls'], test_data['qtyEmojis'], test_data['qtyUppercase'],test_data['profile_pic_gender']] # Use the random forest to make age range predictions resultForest = forest.predict(test_data_features) resultBayes = bayes.predict(test_data_features) resultSVM= svm.predict(test_data_features) resultSGD= sgd.predict(test_data_features) outdir =time.strftime("%d-%m-%Y") if not os.path.exists(outdir): os.mkdir(outdir) if not os.path.exists(outdir +"/"+typeOp): os.mkdir(outdir +"/"+typeOp) outdir=outdir +"/"+typeOp # Copy the results to a pandas dataframe with an "id" column and # a "age" column output = pd.DataFrame( data={"id":test_data["screen_name"], "realAge":test_data["age"], "ageRandomForest":resultForest,"ageNaiveBayes":resultBayes}) #print output # Use pandas to 
write the comma-separated output file outname = 'Bigram_model_ForestAndBayes.csv' fullname = os.path.join(outdir, outname) output.to_csv(fullname,index=False) ################################### #******* MODEL EVALUATION ********* ################################### import ml_utils as ml_utils db_access = MongoDBUtils() ageRanges=[] if typeOp=='normal': ageRanges=db_access.getAgeRanges() else: ageRanges=db_access.get3AgeRanges() target_names=ageRanges data_aux = transformer_tfidf.fit_transform(df_complete.tweets) data_aux = data_aux.toarray() data = np.c_[data_aux, df_complete['friends_count'],df_complete['tweets_count'], df_complete['linkedin'],df_complete['snapchat'], df_complete['instagram'],df_complete['facebook'],df_complete['followers_count'],df_complete['favourites_count'],df_complete['qtyMentions'],df_complete['qtyHashtags'],df_complete['qtyUrls'], df_complete['qtyEmojis'], df_complete['qtyUppercase'],df_complete['profile_pic_gender']] y_complete = df_complete['age'] name_prefix='tweetNgramsAndCustomFields_'+typeOp+'_'+balanced print data.shape #-------------- ##BAYES #-------------- print "Metrics for Naive Bayes:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultBayes,ageRanges,name_prefix,'NaiveBayes',outdir) print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names) scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) accuracyNB = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracyNB #-------------- ##RANDOM FOREST #-------------- print "Metrics for Random Forest:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultForest,ageRanges,name_prefix,'RandomForest',outdir) print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names) scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 
5),scoring=make_scorer(accuracy_score)) accuracyRF = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracyRF #-------------- ##SVM #-------------- print "Metrics for SVM:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSVM,ageRanges,name_prefix,'SVM',outdir) print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names) scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) accuracySVM = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracySVM #-------------- ##SGD #-------------- print "Metrics for SGD:" ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSGD,ageRanges,name_prefix,'SGD',outdir) print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names) scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) accuracySGD = round(scores.mean(),2) print "10-Fold Accuracy: ", accuracySGD #-------------- ##OUTPUT #-------------- result= "ACCURACY--> N.Bayes:",accuracyNB,"|RForest:", accuracyRF,"|SVM:", accuracySVM,"|SGD:", accuracySGD print result return result
def main_subscriptionNgrams(typeOp, balanced): if balanced == 'balanced': train_data = pd.read_csv(DATASET_PATH + "/subscriptionLists_balanced_train.csv", sep=",", dtype=str) test_data = pd.read_csv(DATASET_PATH + "/subscriptionLists_balanced_test.csv", sep=",", dtype=str) else: train_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_subscriptionLists_train.csv", sep=",", dtype=str) test_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_subscriptionLists_test.csv", sep=",", dtype=str) # Show the number of observations for the test and training dataframes print 'Number of observations in the training data:', len(train_data) print 'Number of observations in the test data:', len(test_data) frames = [train_data, test_data] df_complete = pd.concat(frames) print 'Number of observations in the whole dataset:', len(df_complete) stopwords = getSpanishStopwords() count_vect = CountVectorizer(stop_words=stopwords, max_features=5000, ngram_range=(1, 3), token_pattern=r'\b\w+\b') X_train_counts = count_vect.fit_transform(train_data.subscriptionLists) # fit_transform() fits the model and learns the vocabulary; second, it transforms our training data # into feature vectors. ##To see occurrences of a specific word: #print count_vect.vocabulary_.get(u'amigos') train_data_features = X_train_counts.toarray() #print len(train_data) #186 users en train #print train_data_features.shape #(186, 500) --> It has 212 rows and 500 features (500 most frequent words). 
# Take a look at the words in the vocabulary vocab = count_vect.get_feature_names() #print vocab # Sum up the counts of each vocabulary word dist = np.sum(train_data_features, axis=0) # For each, print the vocabulary word and the number of times it # appears in the training set #for tag, count in zip(vocab, dist): # print count, tag ######################################## #******* HYPERPARAMETER TUNING ********* ######################################## import ml_utils as ml_utils #PARAMETERS TUNING #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT:{'kernel': 'rbf', 'C': 10, 'gamma': 0.01} #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT: {'n_estimators': 120, 'max_depth': 30, 'min_samples_leaf': 1} #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 40, 'loss': 'log'} # ********* APLICO MODELOS Y LOS ENTRENO CON LA DATA EN TRAIN*********# print "Training the models..." 
# Initialize Multinomial Naive Bayes bayes = MultinomialNB() # Initialize a Random Forest classifier with 100 trees forest = RandomForestClassifier(n_estimators=120, max_depth=30, min_samples_leaf=1) # Fit the forest to the training set, using the bag of) svm = SVC(kernel='rbf', C=10, gamma=0.01) sgd = SGDClassifier(penalty='elasticnet', alpha=0.0001, n_iter=40, loss='log') # Fit the forest to the training set, using the bag of words as # features and the age range as the response variable forest = forest.fit(train_data_features, train_data["age"]) bayes = bayes.fit(train_data_features, train_data["age"]) svm = svm.fit(train_data_features, train_data["age"]) sgd = sgd.fit(train_data_features, train_data["age"]) # Read the test data # Get a bag of words for the test set, and convert to a numpy array test_data_features = count_vect.transform(test_data.subscriptionLists) test_data_features = test_data_features.toarray() # Use the random forest to make age range predictions resultForest = forest.predict(test_data_features) resultBayes = bayes.predict(test_data_features) resultSVM = svm.predict(test_data_features) resultSGD = sgd.predict(test_data_features) outdir = time.strftime("%d-%m-%Y") if not os.path.exists(outdir): os.mkdir(outdir) if not os.path.exists(outdir + "/" + typeOp): os.mkdir(outdir + "/" + typeOp) outdir = outdir + "/" + typeOp output = pd.DataFrame( data={ "id": test_data["screen_name"], "realAge": test_data["age"], "ageRandomForest": resultForest, "ageNaiveBayes": resultBayes }) #print output # Use pandas to write the comma-separated output file outname = 'subscriptionLists_Bag_of_Words_ForestAndBayes.csv' fullname = os.path.join(outdir, outname) output.to_csv(fullname, index=False) ################################### #******* MODEL EVALUATION ********* ################################### import ml_utils as ml_utils db_access = MongoDBUtils() ageRanges = [] if typeOp == 'normal': ageRanges = db_access.getAgeRanges() else: ageRanges = 
db_access.get3AgeRanges() target_names = ageRanges data = df_complete[['screen_name', 'subscriptionLists']] data = count_vect.fit_transform(data.subscriptionLists) y_complete = df_complete['age'] name_prefix = 'subscriptionNgrams_' + typeOp + '_' + balanced #-------------- ##BAYES #-------------- print "Metrics for Naive Bayes:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultBayes, ageRanges, name_prefix, 'NaiveBayes', outdir) print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names) scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracyNB = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracyNB #-------------- ##RANDOM FOREST #-------------- print "Metrics for Random Forest:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultForest, ageRanges, name_prefix, 'RandomForest', outdir) print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names) scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracyRF = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracyRF #-------------- ##SVM #-------------- print "Metrics for SVM:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSVM, ageRanges, name_prefix, 'SVM', outdir) print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names) scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracySVM = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracySVM #-------------- ##SGD #-------------- print "Metrics for SGD:" ml_utils.createConfusionMatrix(test_data['age'].tolist(), resultSGD, ageRanges, name_prefix, 'SGD', outdir) print 
classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names) scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=5), scoring=make_scorer(accuracy_score)) accuracySGD = round(scores.mean(), 2) print "10-Fold Accuracy: ", accuracySGD #-------------- ##OUTPUT #-------------- result = "ACCURACY--> Bayes:", accuracyNB, "|RForest:", accuracyRF, "|SVM:", accuracySVM, "|SGD:", accuracySGD print result return result
def main_tweetNgrams(typeOp, balanced): if balanced == 'balanced': train_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_train.csv", sep=",", dtype=str)[['screen_name', 'tweets', 'age']] test_data = pd.read_csv(DATASET_PATH + "/tweets_balanced_test.csv", sep=",", dtype=str)[['screen_name', 'tweets', 'age']] else: #train_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_train.csv", sep=",",dtype=str)[['screen_name','tweets','age']] #test_data=pd.read_csv(DATASET_PATH+"/"+typeOp+"_tweets_test.csv", sep=",",dtype=str)[['screen_name','tweets','age']] #EXPERIMENT 4 train_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_faceAPI_tweets_train.csv", sep=",", dtype=str)[['screen_name', 'tweets', 'age']] test_data = pd.read_csv(DATASET_PATH + "/" + typeOp + "_faceAPI_tweets_test.csv", sep=",", dtype=str)[['screen_name', 'tweets', 'age']] # Show the number of observations for the test and training dataframes print 'Number of observations in the training data:', len(train_data) print 'Number of observations in the test data:', len(test_data) frames = [train_data, test_data] df_complete = pd.concat(frames) print 'Number of observations in the whole dataset:', len(df_complete) ##STOPWORDS EN SPANISH, SCIKIT TRAE SOLO EN INGLES stopwords = getCustomStopwords() #count_vect = CountVectorizer(stop_words=stopwords, max_features=5000 ) #Para hacer bag of words #X_train_counts = count_vect.fit_transform(train_data.tweets) # fit_transform() fits the model and learns the vocabulary; second, it transforms our training data # into feature vectors. 
transformer_tfidf = TfidfVectorizer(smooth_idf=False, lowercase=False, stop_words=stopwords, max_features=5000, ngram_range=(1, 3)) tfidf = transformer_tfidf.fit_transform(train_data.tweets) ##To see occurrences of a specific word: #print count_vect.vocabulary_.get(u'amigos') train_data_features = tfidf.toarray() #print len(train_data) #186 users en train # Take a look at the words in the vocabulary vocab = transformer_tfidf.get_feature_names() #print vocab # Sum up the counts of each vocabulary word dist = np.sum(train_data_features, axis=0) # For each, print the vocabulary word and the number of times it # appears in the training set #for tag, count in zip(vocab, dist): # print count, tag ######################################## #******* HYPERPARAMETER TUNING ********* ######################################## import ml_utils as ml_utils #PARAMETERS TUNING #print ml_utils.SVM_param_selection(train_data_features, train_data["age"]) #RESULT: {'kernel': 'rbf', 'C': 10, 'gamma': 0.1} #print ml_utils.RandomForest_param_selection(train_data_features, train_data["age"])#RESULT: {'n_estimators': 160, 'max_depth': 20, 'min_samples_leaf': 3} #print ml_utils.SGD_param_selection(train_data_features, train_data["age"]) #RESULT: {'penalty': 'elasticnet', 'alpha': 0.0001, 'n_iter': 50, 'loss': 'log'} ######################################## #******* MODEL TRAINING ********* ######################################## # ********* ENTRENO LOS MODELOS CON LA DATA EN TRAIN*********# print "Training the Classifiers..." 
# Initialize Multinomial Naive Bayes bayes = MultinomialNB() # Initialize a Random Forest classifier with 100 trees forest = RandomForestClassifier(n_estimators=160, max_depth=20, min_samples_leaf=3) # Fit the forest to the training set, using the bag of) svm = SVC(kernel='rbf', C=10, gamma=0.1) sgd = SGDClassifier(loss='log', penalty='l2', random_state=42, alpha=0.0001, n_iter=60) regr = LinearRegression() # Fit the forest to the training set, using the bag of words as # features and the age range as the response variable #forest = forest.fit( train_data_features, train_data["age"] ) #bayes = bayes.fit( train_data_features, train_data["age"] ) #svm = svm.fit(train_data_features, train_data["age"] ) #sgd= sgd.fit(train_data_features, train_data["age"] ) regr = regr.fit(train_data_features, train_data["age"]) # Read the test data # Get a bag of words for the test set, and convert to a numpy array test_data_features = transformer_tfidf.transform(test_data.tweets) test_data_features = test_data_features.toarray() # Use the random forest to make age range predictions #resultForest = forest.predict(test_data_features) #resultBayes = bayes.predict(test_data_features) #print "resultbayes: ", resultBayes #resultSVM= svm.predict(test_data_features) #resultSGD= sgd.predict(test_data_features) resultLR = regr.predict(test_data_features) outdir = time.strftime("%d-%m-%Y") if not os.path.exists(outdir): os.mkdir(outdir) if not os.path.exists(outdir + "/" + typeOp): os.mkdir(outdir + "/" + typeOp) outdir = outdir + "/" + typeOp output = pd.DataFrame( data={ "id": test_data["screen_name"], "realAge": test_data["age"], "ageRandomForest": resultForest, "ageNaiveBayes": resultBayes, "ageSVM": resultSVM, "ageSGD": resultSGD }) # Use pandas to write the comma-separated output file outname = 'tweets_ngrams_results.csv' fullname = os.path.join(outdir, outname) output.to_csv(fullname, index=False) ################################### #******* MODEL EVALUATION ********* 
################################### print "Evaluating the model --> Calculating metrics ..." db_access = MongoDBUtils() ageRanges = [] if typeOp == 'normal': ageRanges = db_access.getAgeRanges() else: ageRanges = db_access.get3AgeRanges() target_names = ageRanges data = df_complete[['screen_name', 'tweets']] data = transformer_tfidf.fit_transform(data.tweets) y_complete = df_complete['age'] name_prefix = 'tweetNgrams_' + typeOp + '_' + balanced #-------------- ##BAYES #-------------- #print "Metrics for Naive Bayes:" #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultBayes,ageRanges,name_prefix,'NaiveBayes',outdir) #print classification_report(test_data['age'].tolist(), resultBayes, target_names=target_names) #scores = cross_val_score(bayes, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) #accuracyNB = round(scores.mean(),2) #print "10-Fold Accuracy: ", accuracyNB #-------------- ##RANDOM FOREST #-------------- #print "Metrics for Random Forest:" #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultForest,ageRanges,name_prefix,'RandomForest',outdir) #print classification_report(test_data['age'].tolist(), resultForest, target_names=target_names) #scores = cross_val_score(forest, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) #accuracyRF = round(scores.mean(),2) #print "10-Fold Accuracy: ", accuracyRF #-------------- ##SVM #-------------- #print "Metrics for SVM:" #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSVM,ageRanges,name_prefix,'SVM',outdir) #print classification_report(test_data['age'].tolist(), resultSVM, target_names=target_names) #scores = cross_val_score(svm, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) #accuracySVM = round(scores.mean(),2) #print "10-Fold Accuracy: ", accuracySVM #-------------- 
##SGD #-------------- #print "Metrics for SGD:" #ml_utils.createConfusionMatrix(test_data['age'].tolist(),resultSGD,ageRanges,name_prefix,'SGD',outdir) #print classification_report(test_data['age'].tolist(), resultSGD, target_names=target_names) #scores = cross_val_score(sgd, data, y_complete, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state = 5),scoring=make_scorer(accuracy_score)) #accuracySGD = round(scores.mean(),2) #print "10-Fold Accuracy: ", accuracySGD #-------------- ##OUTPUT #-------------- result = "ACCURACY--> N.Bayes:", 0, "|RForest:", 0, "|SVM:", 0, "|SGD:", 0 #print result return result