import os
from argparse import ArgumentParser

# scrapeFacebookPageFeedComments is assumed to be defined or imported elsewhere
# in this project; here it takes (group_id, access_token, status_file, out_file).


def main():
    parser = ArgumentParser()
    parser.add_argument('--group_id', default="866250103534243")
    parser.add_argument('--auth_file', default='../../data/facebook_auth.csv')
    parser.add_argument('--out_dir', default='../../data/facebook-maria')
    parser.add_argument('--start_date', default='2017-09-20')
    parser.add_argument('--end_date', default='2017-10-20')
    args = parser.parse_args()
    group_id = args.group_id
    auth_file_name = args.auth_file
    out_dir = args.out_dir
    start_date = args.start_date
    end_date = args.end_date

    # load auth data (one key,value pair per line)
    auth = dict([l.strip().split(',') for l in open(auth_file_name)])
    app_id = auth['app_id']
    app_secret = auth['app_secret']
    access_token = '%s|%s' % (app_id, app_secret)

    # mine the group feed: posts go to the status file, comments to the output file
    out_file_name = os.path.join(
        out_dir, '%s_%s_%s_facebook_comments.tsv' % (group_id, start_date, end_date))
    status_file_name = os.path.join(
        out_dir, '%s_%s_%s_facebook_posts.tsv' % (group_id, start_date, end_date))
    scrapeFacebookPageFeedComments(group_id, access_token, status_file_name, out_file_name)
    print('finished scraping, comments written to %s' % (out_file_name))


if __name__ == '__main__':
    main()
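# Hypothetical example of the auth file read above (facebook_auth.csv),
# assuming one key,value pair per line as the dict(...) expression expects:
#
#   app_id,123456789012345
#   app_secret,0123456789abcdef0123456789abcdef
#
# Example invocation (script name and values are placeholders):
#   python scrape_group_comments.py --group_id 866250103534243 \
#       --auth_file ../../data/facebook_auth.csv --out_dir ../../data/facebook-maria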
import csv
import io

from flask import Response, jsonify, request, stream_with_context
from werkzeug.datastructures import Headers

# request_once, get_page_name, get_post_id, config, and the streaming version of
# scrapeFacebookPageFeedComments are assumed to be defined elsewhere in this app.


def index_post():
    error = None
    url = request.form['text']
    if request_once(url) is None:
        message = "Please make sure you entered a valid url"
        return jsonify({"error": message})
    if get_page_name(url) != config.page_name:
        message = "Please enter a post url for the {0} page".format(config.page_name)
        return jsonify({"error": message})
    post_id = get_post_id(url)
    if post_id is None:
        message = "Please make sure you entered a valid Facebook url"
        return jsonify({"error": message})
    status_id = "{0}_{1}".format(config.page_id, post_id)

    si = io.StringIO()
    cw = csv.writer(si)

    # add a filename so browsers save the download as a CSV
    headers = Headers()
    headers.set('Content-Disposition', 'attachment', filename='fb_comments.csv')

    # stream the response as the data is generated
    return Response(
        stream_with_context(scrapeFacebookPageFeedComments(
            si, cw, config.page_id, config.access_token, status_id)),
        mimetype='application/download',
        headers=headers)
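# The scrapeFacebookPageFeedComments used above is assumed to be a generator
# that writes each row to the shared csv.writer/StringIO pair and yields the
# buffered text, which is what stream_with_context needs to send the CSV
# incrementally. A minimal sketch of that pattern (hypothetical helper, not
# the project's actual implementation):
def _stream_csv_rows(si, cw, rows):
    for row in rows:
        cw.writerow(row)
        yield si.getvalue()   # hand the freshly written row to the response
        si.seek(0)
        si.truncate(0)        # reset the buffer so rows are not re-sent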
# Module-level imports assumed for this snippet (Flask view, Python 3):
import csv
import datetime
import json
import time
from urllib.request import Request, urlopen

from flask import request


def public_comment():
    app_id = request.form['app_id']
    app_secret = request.form['app_secret']
    file_id = request.form['page_id']
    access_token = app_id + "|" + app_secret

    def request_until_succeed(url):
        # Retry the Graph API request until it returns HTTP 200
        req = Request(url)
        success = False
        while not success:
            try:
                response = urlopen(req)
                if response.getcode() == 200:
                    success = True
            except Exception as e:
                print(e)
                time.sleep(5)
                print("Error for URL {}: {}".format(url, datetime.datetime.now()))
                print("Retrying.")
        return response.read()

    # Needed to write tricky unicode correctly to csv
    def unicode_decode(text):
        try:
            return text.encode('utf-8').decode()
        except UnicodeDecodeError:
            return text.encode('utf-8')

    def getFacebookCommentFeedUrl(base_url):
        # Construct the URL string
        fields = "&fields=id,message" + \
                 ",created_time,comments,from,attachment"
        return base_url + fields

    def getReactionsForComments(base_url):
        reaction_types = ['like', 'love', 'wow', 'haha', 'sad', 'angry']
        reactions_dict = {}  # dict of {comment_id: tuple of reaction counts}
        for reaction_type in reaction_types:
            fields = "&fields=reactions.type({}).limit(0).summary(total_count)".format(
                reaction_type.upper())
            url = base_url + fields
            data = json.loads(request_until_succeed(url))['data']
            data_processed = set()  # set() removes rare duplicates in statuses
            for status in data:
                id = status['id']
                count = status['reactions']['summary']['total_count']
                data_processed.add((id, count))
            for id, count in data_processed:
                if id in reactions_dict:
                    reactions_dict[id] = reactions_dict[id] + (count, )
                else:
                    reactions_dict[id] = (count, )
        return reactions_dict

    def processFacebookComment(comment, status_id, parent_id=''):
        # The comment is now a Python dictionary, so for top-level items
        # we can simply call the key. Some items may not always exist,
        # so check for existence first.
        comment_id = comment['id']
        comment_message = '' if 'message' not in comment or comment['message'] \
            == '' else unicode_decode(comment['message'])
        # comment_author = unicode_decode(comment['from']['name'])
        num_reactions = 0 if 'reactions' not in comment else \
            comment['reactions']['summary']['total_count']

        if 'attachment' in comment:
            attachment_type = comment['attachment']['type']
            attachment_type = 'gif' if attachment_type == 'animated_image_share' \
                else attachment_type
            attach_tag = "[[{}]]".format(attachment_type.upper())
            comment_message = attach_tag if comment_message == '' else \
                comment_message + " " + attach_tag

        # Time needs special care since a) it's in UTC and
        # b) it's not easy to use in statistical programs.
        comment_published = datetime.datetime.strptime(
            comment['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
        comment_published = comment_published + datetime.timedelta(hours=-5)  # EST
        comment_published = comment_published.strftime(
            '%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs

        # Return a tuple of all processed data
        return (comment_id, status_id, parent_id, comment_message,
                comment_published, num_reactions)

    def scrapeFacebookPageFeedComments(page_id, access_token):
        with open('{}_facebook_comments.csv'.format(file_id), 'w',
                  encoding="utf-8") as file:
            w = csv.writer(file)
            w.writerow([
                "comment_id", "status_id", "parent_id", "comment_message",
                "comment_published", "num_reactions", "num_likes", "num_loves",
                "num_wows", "num_hahas", "num_sads", "num_angrys", "num_special"
            ])

            num_processed = 0
            scrape_starttime = datetime.datetime.now()
            after = ''
            base = "https://graph.facebook.com/v2.9"
            parameters = "/?limit={}&access_token={}".format(100, access_token)
            print("Scraping {} Comments From Posts: {}\n".format(
                file_id, scrape_starttime))

            with open('{}_facebook_statuses.csv'.format(file_id), 'r',
                      encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)

                # Uncomment below line to scrape comments for a specific status_id
                # reader = [dict(status_id='5550296508_10154352768246509')]

                for status in reader:
                    has_next_page = True
                    while has_next_page:
                        node = "/{}/comments".format(status['status_id'])
                        after = '' if after == '' else "&after={}".format(after)
                        base_url = base + node + parameters + after
                        url = getFacebookCommentFeedUrl(base_url)
                        # print(url)
                        comments = json.loads(request_until_succeed(url))
                        reactions = getReactionsForComments(base_url)

                        for comment in comments['data']:
                            comment_data = processFacebookComment(
                                comment, status['status_id'])
                            reactions_data = reactions[comment_data[0]]

                            # calculate thankful/pride through algebra:
                            # total reactions minus the six fetched types
                            num_special = comment_data[5] - sum(reactions_data)
                            w.writerow(comment_data + reactions_data +
                                       (num_special, ))

                            if 'comments' in comment:
                                has_next_subpage = True
                                sub_after = ''
                                while has_next_subpage:
                                    sub_node = "/{}/comments".format(comment['id'])
                                    sub_after = '' if sub_after == '' else \
                                        "&after={}".format(sub_after)
                                    sub_base_url = base + sub_node + parameters + sub_after
                                    sub_url = getFacebookCommentFeedUrl(sub_base_url)
                                    sub_comments = json.loads(
                                        request_until_succeed(sub_url))
                                    sub_reactions = getReactionsForComments(
                                        sub_base_url)

                                    for sub_comment in sub_comments['data']:
                                        sub_comment_data = processFacebookComment(
                                            sub_comment, status['status_id'],
                                            comment['id'])
                                        sub_reactions_data = sub_reactions[
                                            sub_comment_data[0]]
                                        num_sub_special = sub_comment_data[5] - \
                                            sum(sub_reactions_data)
                                        w.writerow(sub_comment_data +
                                                   sub_reactions_data +
                                                   (num_sub_special, ))

                                        num_processed += 1
                                        if num_processed % 100 == 0:
                                            print("{} Comments Processed: {}".format(
                                                num_processed,
                                                datetime.datetime.now()))

                                    if 'paging' in sub_comments:
                                        if 'next' in sub_comments['paging']:
                                            sub_after = sub_comments['paging'][
                                                'cursors']['after']
                                        else:
                                            has_next_subpage = False
                                    else:
                                        has_next_subpage = False

                            # output progress occasionally to make sure code is
                            # not stalling
                            num_processed += 1
                            if num_processed % 100 == 0:
                                print("{} Comments Processed: {}".format(
                                    num_processed, datetime.datetime.now()))

                        if 'paging' in comments:
                            if 'next' in comments['paging']:
                                after = comments['paging']['cursors']['after']
                            else:
                                has_next_page = False
                        else:
                            has_next_page = False

            print("\nDone!\n{} Comments Processed in {}".format(
                num_processed, datetime.datetime.now() - scrape_starttime))

    scrapeFacebookPageFeedComments(file_id, access_token)
    return file_id
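# Hypothetical wiring for the view above (assumption: public_comment is meant to
# be a Flask endpoint that reads app_id/app_secret/page_id from a POSTed form;
# the route name and module layout are placeholders, not the project's actual setup):
#
#   from flask import Flask
#   app = Flask(__name__)
#   app.add_url_rule('/public_comment', 'public_comment', public_comment,
#                    methods=['POST'])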