def run():
    """Scrape every ad URL and store its posts when the page is not yet stored.

    For each URL returned by ``adDb.selectAdData()`` the module-level
    ``token`` is refreshed, the URL is normalised with ``modifyUrl`` and the
    page id is looked up in the Facebook data table.  Pages already present
    are skipped.  Otherwise the page metrics and posts are fetched and one
    row per post is written through ``StoreInFacebookData``.
    """
    # Declared once for the whole function; the token is rebound below.
    global token

    facebookDb = FacebookDataDatabase()
    indexStart = 0  # raise this to resume a run part-way through the list

    for raw_url in adDb.selectAdData()[indexStart:]:
        token = getToken()
        url = modifyUrl(raw_url)
        post_number = get_page_id(url)

        if facebookDb.isPageInDb(post_number):
            continue

        page = Page()
        page.metrics = getPageMetrics(url)
        page.posts = getPostData(url)
        if not page.posts:
            continue

        for post in page.posts:
            # A fresh token is requested before every post is filled in.
            token = getToken()
            set_post_data(post)
            StoreInFacebookData(
                post.id,
                post.image_url,
                post.message,
                post.share_count,
                post.comment_count,
                page.metrics.fan_count,
                page.metrics.rating_count,
                page.metrics.talking_about_count,
                page.metrics.star_rating,
                -1,
            )
            print("Stored!", post.id)
class MessageGetter:
    """Stream rows of the Facebook data table as ``{column name: value}`` dicts."""

    # One database handle, created when the class statement runs and shared
    # by every static method below.
    facebookDb = FacebookDataDatabase()

    @staticmethod
    def get_columns(remove_columns=True):
        """Return the table's column names as a list.

        Args:
            remove_columns: when true, "imageId" and "imageUrl" are removed
                from the result.

        Raises:
            ValueError: if ``remove_columns`` is true and either of those two
                names is not among the columns.
        """
        # Each entry returned by getColumnNames() carries the name at index 1.
        columns = [info[1] for info in MessageGetter.facebookDb.getColumnNames()]
        if remove_columns:
            columns.remove("imageId")
            columns.remove("imageUrl")
        return columns

    @staticmethod
    def __dict_factory(row, columns):
        """Map every column name to the value at the same position in ``row``."""
        return {col: row[idx] for idx, col in enumerate(columns)}

    @staticmethod
    def __get_post():
        """Yield one dict per stored post, printing progress every 1000 rows."""
        # Query the table once; the row count comes from the same result.
        posts = MessageGetter.facebookDb.selectFacebookData()
        total = len(posts)
        if not total:
            return

        # The column list does not change between rows, so fetch it once.
        columns = MessageGetter.get_columns(False)
        for i, post in enumerate(posts):
            if i % 1000 == 0:
                # The printed value is a fraction in [0, 1), not a percentage.
                print("gather data percent: ", i / total)
            yield MessageGetter.__dict_factory(post, columns)

    @staticmethod
    def get_post_generator():
        """Return a generator over all stored posts as dicts."""
        return MessageGetter.__get_post()
class Global:
    """Class-level settings for one model run.

    Every attribute is evaluated once, when this class statement executes,
    so the PlotLearning and FacebookDataDatabase objects are created at that
    moment. How the values are consumed is not visible in this snippet.
    """

    batch_size = 256
    epochs = 100
    group_size = 100_000
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("nlp_nn_sentiment_count")
    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    # No regularizer is configured.
    regularizer_function = None
class Static:
    """Class-level settings for one model run.

    All attributes are evaluated when the class statement executes: the
    database handle and PlotLearning object are created then, and the image
    directory is listed at that moment.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getShareCount  # Set this to change the model type
    # Number of entries currently in the image directory.
    limit = len(os.listdir("../Image_CNN/images"))
    # Same value as limit.
    group_size = limit
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("combined_keras_model")
    batch_size = 1
    epochs = 20
class Static:
    """Class-level settings for one model run (comment-count metric).

    All attributes are evaluated when the class statement executes, so the
    database handle and PlotLearning object are created at that moment.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getCommentCount  # Set this to change the model type
    # Fixed cap; the directory-size expression is left commented out.
    limit = 80_000  # len(os.listdir("../Image_CNN/images"))
    # Same value as limit.
    group_size = limit
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("combined_keras_model_comment_count")
    batch_size = 512
    epochs = 100
class Static:
    """Class-level settings for one model run (share-count metric).

    All attributes are evaluated when the class statement executes, so the
    database handle and PlotLearning object are created at that moment.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getShareCount  # Set this to change the model type
    group_size = 6000
    limit = 7000
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("combined_keras_model")
    batch_size = 1
    epochs = 20
class Global:
    """Class-level settings for one model run (comment-count metric).

    Every attribute is evaluated once, when this class statement executes,
    so the PlotLearning and FacebookDataDatabase objects are created then.
    """

    batch_size = 256
    epochs = 100
    group_size = 100_000
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("nlp_nn_comment_count")
    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    # Bound method used to read the target metric.
    metric_getter = facebookDb.getCommentCount
    # NOTE(review): presumably the database column matching metric_getter -- confirm.
    metric_name = "commentCount"
    # No regularizer is configured.
    regularizer_function = None
def test_to_vector():
    """Print the first 100 vectors ``to_vector`` produces for stored messages.

    Post ids are derived from the file names in ``../Image_CNN/images``; ids
    for which ``getRow`` returns nothing are skipped.  This is a manual smoke
    check: it prints results and asserts nothing.
    """
    import os
    from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase

    # Dropping the last four characters strips a dot plus three-letter
    # extension -- assumes every file is named "<post id>.<ext>"; TODO confirm.
    ids = [file_name[:-4] for file_name in os.listdir("../Image_CNN/images")]

    facebookDb = FacebookDataDatabase()

    # getRow returns a sequence of rows; keep the first row of every
    # non-empty result.
    rows = []
    for post_id in ids:
        result = facebookDb.getRow(post_id)
        if result:
            rows.append(result[0])

    # NOTE(review): index 10 is taken to be the message text -- confirm
    # against the table schema.
    messages = [row[10] for row in rows]

    word_vectors = to_vector(messages)
    for word_vector in word_vectors[:100]:
        print(word_vector)
class Global:
    """Class-level settings for one model run (sentiment metric).

    Every attribute is evaluated once, when this class statement executes,
    so the PlotLearning and FacebookDataDatabase objects are created then.
    """

    batch_size = 1
    epochs = 5
    group_size = 6000
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("nlp_nn")
    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    # Bound method used to read the target metric.
    metric_getter = facebookDb.getSentiment
    # No metric name is set for this configuration.
    metric_name = None
    # No regularizer is configured.
    regularizer_function = None
class Static:
    """Class-level settings (sentiment metric).

    The database handle is created when the class statement executes.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getSentiment  # Set this to change the model type
    group_size = 40000
    limit = 60000
class Static:
    """Class-level settings (comment-count metric).

    The database handle is created when the class statement executes.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getCommentCount  # Set this to change the model type
    limit = 30000
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import matplotlib.pyplot as plt

# Histogram of the stored "postPositivity" values, ignoring every value that
# is not greater than -1.
facebookDb = FacebookDataDatabase()

sentiment_values = [
    row[0]
    for row in facebookDb.selectColumnData("postPositivity")
    if row[0] > -1
]

plt.hist(sentiment_values, bins=100)  # arguments are passed to np.histogram
plt.title("Histogram of Post Sentiment Positivity")
plt.xlabel("Post Sentiment")
plt.ylabel("Bin Count")

output_path = (
    "/Users/ccrowe/Documents/Thesis/facebook_api/Notebooks/postSentimentHistogram.png"
)
plt.savefig(output_path)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

# Count how many stored posts belong to each page and plot the distribution
# of those per-page counts.
d = defaultdict(int)
facebookDb = FacebookDataDatabase()

ids = [row[0] for row in facebookDb.get_post_ids()]
for post_id in ids:
    # The text before the first underscore of a post id is used as the page key.
    page_id = post_id.split("_")[0]
    d[page_id] += 1

plt.figure(figsize=(10, 4))
plt.subplots_adjust(wspace=0.3)
plt.figure(1)
# Pass a concrete list: a dict view is not a sequence, and not every
# matplotlib/numpy combination converts it to a 1-D array.
plt.hist(list(d.values()), bins=30, color='g')
plt.ylabel('Bin Count')
plt.xlabel('Number of Posts Scraped')
plt.plot()
plt.subplots_adjust(hspace=.5)
plt.savefig('Posts_Per_Page_Histogram.png', bbox_inches='tight', dpi=300)
class Static:
    """Class-level settings (share-count metric).

    The database handle is created when the class statement executes.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getShareCount  # Set this to change the model type
    # Fixed cap; the directory-size expression is left commented out.
    limit = 100000  # len(os.listdir("../Image_CNN/images"))
    # Same value as limit.
    group_size = limit
class Static:
    """Class-level settings (share-count metric).

    The database handle is created when the class statement executes.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getShareCount  # Set this to change the model type
    group_size = 150000
    limit = 200000
if (sentiment["neu"] == 1) and sentiment["neg"] == 0 and sentiment["pos"] == 0 and (compound == 0): return 0 else: return compound messages = commentDb.getMessages(postId) if len(messages) == 0: print("No comments for: {0}".format(postId)) return scores = [] for message in messages: sentiment = get_sentiment_scores(message[0]) if sentiment != -1: scores.append(sentiment) if not scores: return 0 return statistics.mean(scores) facebookDb = FacebookDataDatabase() post_ids = list(map(lambda x: x[0], facebookDb.getImageIdWithPositiveCommentCounts())) comment_db_post_ids = list(map(lambda x: x[0], commentDb.getPostIds())) post_ids_with_new_comments = set(post_ids).union(set(comment_db_post_ids)) for data in list(post_ids_with_new_comments): postId = data mean_sentiment_score = SentimentAnalyzer.GetPostSentiment(postId) print(mean_sentiment_score) if mean_sentiment_score: facebookDb.insertSentimentData(mean_sentiment_score, -1, postId)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import numpy as np
from matplotlib import pyplot as plt

# Plot the stored "postPositivity" values and print their spread.
facebookDb = FacebookDataDatabase()
sentimentsTuples = facebookDb.selectColumnData("postPositivity")
sentiments = [row[0] for row in sentimentsTuples]

# fixed bin size
bins = np.arange(0, 100, 1)

lower_bound = min(sentiments)
plt.xlim([lower_bound, 100])
plt.hist(sentiments, bins=bins, alpha=0.5)
plt.savefig(
    '/Users/ccrowe/Documents/Thesis/facebook_api/Notebooks/DataStats/sentimentHist.png'
)

for statistic in (np.std, np.var):
    print(statistic(sentiments))
import os
import sys

sys.path.append(os.pardir)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase

# Report image files whose post id has no share count or no comment count in
# the database, and collect their paths as deletion candidates.
IMAGE_DIR = "./images"

facebookDb = FacebookDataDatabase()
files = os.listdir(IMAGE_DIR)
# The post id is the file name with ".png" / ".jpg" removed.
ids = [name.replace(".png", "").replace(".jpg", "") for name in files]

delete_paths = []


def _flag_missing(metric_getter):
    """Print every id for which ``metric_getter`` returns None and queue its file."""
    for file_name, post_id in zip(files, ids):
        if metric_getter(post_id) is not None:
            continue
        print(post_id)
        # Use the real file name (with extension) so the path exists on
        # disk, and queue each file only once even if both metrics are missing.
        path = os.path.join(IMAGE_DIR, file_name)
        if path not in delete_paths:
            delete_paths.append(path)


_flag_missing(facebookDb.getShareCount)
print()
_flag_missing(facebookDb.getCommentCount)

# Deletion is deliberately left disabled:
#should_delete = input("Should we delete these files?")
#if should_delete == 'y' or should_delete == 'yes':
#    for path in delete_paths:
#        os.remove(path)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
from Notebooks.LinkDatabases.PostComments import PostDataDatabase
import numpy as np

# Print the variance of the comment-count, share-count and (x100) sentiment
# columns of the Facebook data table.
facebookDb = FacebookDataDatabase()
commentDb = PostDataDatabase()

commentCounts = facebookDb.selectColumnData("commentCount")
comment_variance = np.var(commentCounts)
print("Comment Count Variance: {0}".format(comment_variance))

shareCounts = facebookDb.selectColumnData("shareCount")
share_variance = np.var(shareCounts)
print("Share Count Variance: {0}".format(share_variance))

# Sentiment values are scaled by 100 before the variance is taken.
sentiments = [row[0] * 100 for row in facebookDb.selectColumnData("postPositivity")]
print(sentiments[:20])
print("Sentiment Variance: {0}".format(np.var(sentiments)))
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import pandas as pd

# Dump the result of selectPageMetrics() to page_metrics.csv.
facebookDb = FacebookDataDatabase()

# fanCount INT,numberOfRatings INT, talkingAboutCount INT, pageRating REAL
column_names = [
    "fanCount",
    "numberOfRatings",
    "talkingAboutCount",
    "shareCount",
    "commentCount",
    "sentiment",
]

data = facebookDb.selectPageMetrics()
df = pd.DataFrame(data, columns=column_names)
df.to_csv("page_metrics.csv")
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import numpy as np
from matplotlib import pyplot as plt

# Plot the stored "shareCount" values and print their spread.
facebookDb = FacebookDataDatabase()
shareCountsTuples = facebookDb.selectColumnData("shareCount")
shareCounts = [row[0] for row in shareCountsTuples]

# fixed bin size
bins = np.arange(0, 100, 1)

lower_bound = min(shareCounts)
plt.xlim([lower_bound, 100])
plt.hist(shareCounts, bins=bins, alpha=0.5)
plt.savefig(
    '/Users/ccrowe/Documents/Thesis/facebook_api/Notebooks/DataStats/shareCountHist.png'
)

for statistic in (np.std, np.var):
    print(statistic(shareCounts))
for file in files: try: file_loaded = image.load_img(file, target_size=(200, 200)) final.append(file_loaded) except: print("Skipping file: {0}".format(file)) files.remove(file) assert len(final) > 1 return final, files def to_array(images): return np.array(list(map(lambda x: img_to_array(x) / 255, images))) facebookDb = FacebookDataDatabase() metricGetter = facebookDb.getShareCount all_files = os.listdir("./images")[:image_count] files = list(filter(lambda x: ".jpg" in x or ".png" in x, filter(lambda x: not ".DS_Store" in x, all_files))) imagePaths = list(map(lambda x: os.path.join("./images", x), files)) imagePaths = list(filter(lambda x: getSize(x) > 1, imagePaths)) imagePaths = list(filter(lambda x: os.path.exists(x), imagePaths)) images, imagePaths = load_images(imagePaths) # weed out images that fail to load, there are only a few image_arrays = to_array(images) shareCounts = to_share_count(get_ids(imagePaths), metricGetter) labels = np.array(shareCounts) (trainX, testX, trainY, testY) = train_test_split(image_arrays, labels, test_size=0.25, random_state=42) model_name = '{0}.h5'.format(metricGetter.__name__)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import matplotlib.pyplot as plt

# Histogram of the first 5000 stored "shareCount" values.
facebookDb = FacebookDataDatabase()

all_share_counts = [row[0] for row in facebookDb.selectColumnData("shareCount")]
share_counts = all_share_counts[:5000]

plt.hist(share_counts, bins=5000)  # arguments are passed to np.histogram
plt.xlim(0, 200)
plt.title("Histogram of Share Counts")
plt.xlabel("Share Count")
plt.ylabel("Count")

output_path = (
    "/Users/ccrowe/Documents/Thesis/facebook_api/Notebooks/shareCountHistogram.png"
)
plt.savefig(output_path)
class Static:
    """Class-level settings (comment-count metric).

    All attributes are evaluated when the class statement executes, so the
    database handle and PlotLearning object are created at that moment.
    """

    # Shared database handle.
    facebookDb = FacebookDataDatabase()
    metric_getter = facebookDb.getCommentCount  # Set this to change the model type
    group_size = 10000
    limit = 10000
    # PlotLearning is defined elsewhere; it is given the label below.
    plot_losses = PlotLearning("combined_keras_model")
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import pandas as pd
import numpy as np

# Write the stored comment counts to comment_counts.csv, replacing every
# value that is not greater than zero with 0.
fbDatabase = FacebookDataDatabase()

counts = [
    row[0] if row[0] > 0 else 0
    for row in fbDatabase.selectColumnData("commentCount")
]

# Echo the first 100 values as a quick sanity check.
for value in counts[:100]:
    print(value)

df = pd.DataFrame(counts, columns=["commentCount"])
df.to_csv("comment_counts.csv")
import pandas as pd
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
from Notebooks.LinkDatabases.PostComments import PostDataDatabase

# Recompute the number of stored comments per image id and write each count
# into the Facebook data table.
facebookDb = FacebookDataDatabase()
commentDb = PostDataDatabase()

comment_data = commentDb.selectPostData()
df = pd.DataFrame.from_records(comment_data,
                               columns=["imageId", "commentId", "text"])

# One grouping pass instead of filtering the whole frame once per id.
# sort=False keeps ids in order of first appearance.  Rows with a missing
# imageId form no group, so no count is written for a null id (the
# per-id filter used previously would have written 0 for it).
comments_per_image = df.groupby("imageId", sort=False).size()

for imageId, count in comments_per_image.items():
    # Hand the database a plain Python int rather than a numpy integer.
    facebookDb.insertCommentCountData(int(count), imageId)
import os
import sys

# os has to be imported before os.pardir is read here.
sys.path.append(os.pardir)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
from imageio import imwrite
import cv2
import urllib.request
from PIL import Image
import re
import numpy as np
from numpy import array
from scipy.ndimage import filters
import ntpath

facebookDb = FacebookDataDatabase()

# Raw string: "\d" is not a valid escape in a normal string literal and
# triggers a warning on current Python versions.  The pattern text itself is
# unchanged.
image_regex = r"([A-Za-z_\d]+.jpg|[A-Za-z_\d]+.png)"


def getExistingImages():
    """Return the names of all entries in the ./images directory."""
    return list(os.listdir("./images"))


def denoise_image(full_path):
    # NOTE(review): only the beginning of this function is visible in this
    # snippet; the visible part is reproduced unchanged.
    im = Image.open(full_path)
    # A 1x1 image is removed from disk.
    if im.size == (1, 1,):
        os.remove(full_path)
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import matplotlib.pyplot as plt

# Histogram of every stored "commentCount" value, zoomed to 0..150.
facebookDb = FacebookDataDatabase()

commentCounts = [row[0] for row in facebookDb.selectColumnData("commentCount")]

plt.hist(commentCounts, bins=2000)  # arguments are passed to np.histogram
plt.xlim(0, 150)
plt.title("Histogram of Comment Counts")
plt.xlabel("Comment Count")
plt.ylabel("Count in Bin")

output_path = (
    "/Users/ccrowe/Documents/Thesis/facebook_api/Notebooks/commentCountHistogram.png"
)
plt.savefig(output_path)
import os
import sys
from datetime import datetime

# Extend the import path two directory levels up so the Notebooks package
# imports below can be resolved.
sys.path.append("../../")
from Notebooks.SearchFbData.GetKeyData import get_key_data
from Notebooks.Token.GenerateToken import getToken
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
from Notebooks.LinkDatabases.PostComments import PostDataDatabase
import requests
import ast
import unicodedata

# Module-level database handles, created at import time.
facebookDb = FacebookDataDatabase()
commentDb = PostDataDatabase()

# A global statement at module scope has no effect; token is a module-level
# name either way.
global token
token = getToken()


# Get/Set post data. These are person posts on the Ad in question
class Comment:
    """Plain record for one comment: id, message text and like count.

    NOTE(review): the class may continue beyond what is visible in this
    snippet.
    """

    def __init__(self):
        # All fields start unset.
        self.id = None
        self.message = None
        self.like_count = None

    def setId(self, id):
        """Store ``id`` on the instance."""
        self.id = id
from Notebooks.LinkDatabases.FacebookData import FacebookDataDatabase
import numpy as np
from matplotlib import pyplot as plt

# Plot the natural log of the stored comment counts and print the spread.
facebookDb = FacebookDataDatabase()
commentCountsTuples = facebookDb.selectColumnData("commentCount")
commentCounts = [row[0] for row in commentCountsTuples]

# Values that are not greater than zero are kept as they are; only positive
# counts are log-transformed.
commentCountsLog = [
    np.log(count) if count > 0 else count
    for count in commentCounts
]

# fixed bin size
bins = np.arange(0, 100, 1)

lower_bound = min(commentCountsLog)
plt.xlim([lower_bound, 100])
plt.hist(commentCountsLog, bins=bins, alpha=0.5)
plt.savefig(
    '/Users/ccrowe/Documents/Thesis/facebook_api/Notebooks/DataStats/commentCountHist.png'
)

for statistic in (np.std, np.var):
    print(statistic(commentCountsLog))