Example #1
0
def main():
  """Scrape comments from a Facebook group feed and write TSV output files.

  Reads app credentials from a two-column CSV auth file (key,value rows
  containing at least 'app_id' and 'app_secret'), builds an app access
  token of the form 'app_id|app_secret', and delegates the scraping to
  scrapeFacebookPageFeedComments (defined elsewhere in this module).
  """
  parser = ArgumentParser()
  parser.add_argument('--group_id', default="866250103534243")
  parser.add_argument('--auth_file', default='../../data/facebook_auth.csv')
  parser.add_argument('--out_dir', default='../../data/facebook-maria')
  parser.add_argument('--start_date', default='2017-09-20')
  parser.add_argument('--end_date', default='2017-10-20')
  args = parser.parse_args()
  group_id = args.group_id
  auth_file_name = args.auth_file
  out_dir = args.out_dir
  start_date = args.start_date
  end_date = args.end_date

  # Load auth data. The original left the file handle open for the life of
  # the process; `with` closes it deterministically.
  with open(auth_file_name) as auth_file:
    auth = dict(l.strip().split(',') for l in auth_file)
  app_id = auth['app_id']
  app_secret = auth['app_secret']
  # An "app access token" is the two credentials joined by a pipe.
  access_token = '%s|%s'%(app_id, app_secret)

  # Mine: one TSV for comments, one for the posts themselves.
  out_file_name = os.path.join(out_dir, '%s_%s_%s_facebook_comments.tsv'%(group_id, start_date, end_date))
  status_file_name = os.path.join(out_dir, '%s_%s_%s_facebook_posts.tsv'%(group_id, start_date, end_date))
  scrapeFacebookPageFeedComments(group_id, access_token, status_file_name, out_file_name)
  print('finished scraping, writing comments to %s'%(out_file_name))
Example #2
0
def index_post():
    """Validate a Facebook post URL from the request form and stream its
    comments back as a downloadable CSV.

    Returns a JSON error payload when the URL is unreachable, belongs to a
    page other than the configured one, or contains no extractable post id;
    otherwise streams the scraper's output as a CSV attachment.
    """
    url = request.form['text']

    # Reject URLs we cannot fetch at all. (`is None`, not `== None`.)
    if request_once(url) is None:
        message="Please make sure you entered a valid url"
        return jsonify({"error": message})

    # Only accept posts belonging to the configured page.
    if get_page_name(url) != config.page_name:
        message="Please enter a post url for the {0} page".format(config.page_name)
        return jsonify({"error": message})

    post_id = get_post_id(url)

    # BUG FIX: the original tested `status_id == None` *after* formatting,
    # but "{0}_{1}".format(...) always returns a string, so that check was
    # dead code. Validate the extracted post id itself instead.
    if post_id is None:
        message="Please make sure you entered a valid Facebook url"
        return jsonify({"error": message})

    status_id = "{0}_{1}".format(config.page_id, post_id)

    si = io.StringIO()
    cw = csv.writer(si)

    # Advertise the payload as a file download.
    headers = Headers()
    headers.set('Content-Disposition', 'attachment', filename='fb_comments.csv')

    # Stream the response as the data is generated.
    return Response(
        stream_with_context(scrapeFacebookPageFeedComments(
            si,
            cw,
            config.page_id,
            config.access_token,
            status_id)),
        mimetype='application/download', headers=headers
    )
def public_comment():
    """Scrape every comment (and nested reply) for the statuses of a
    Facebook page into '<page_id>_facebook_comments.csv'.

    Reads app_id / app_secret / page_id from the request form, builds an
    app access token ('app_id|app_secret'), and walks the Graph API
    comment feed for each status id listed in
    '<page_id>_facebook_statuses.csv'. Returns the page id when done.
    """
    app_id = request.form['app_id']
    app_secret = request.form['app_secret']
    file_id = request.form['page_id']
    access_token = app_id + "|" + app_secret

    def request_until_succeed(url):
        """GET `url`, retrying every 5 seconds until an HTTP 200 response;
        returns the raw response body (bytes)."""
        req = Request(url)
        success = False
        while success is False:
            try:
                response = urlopen(req)
                if response.getcode() == 200:
                    success = True
            except Exception as e:
                print(e)
                time.sleep(5)

                print("Error for URL {}: {}".format(url,
                                                    datetime.datetime.now()))
                print("Retrying.")

        return response.read()

    # Needed to write tricky unicode correctly to csv

    def unicode_decode(text):
        """Round-trip `text` through UTF-8; on decode failure, fall back to
        returning the raw UTF-8 bytes (presumably to survive odd platform
        default encodings — TODO confirm that downstream csv handles bytes)."""
        try:
            return text.encode('utf-8').decode()
        except UnicodeDecodeError:
            return text.encode('utf-8')

    def getFacebookCommentFeedUrl(base_url):
        """Append the comment fields we request to a Graph API base URL."""
        fields = "&fields=id,message" + \
                 ",created_time,comments,from,attachment"
        url = base_url + fields

        return url

    def getReactionsForComments(base_url):
        """Return {comment_id: tuple of reaction counts}, one tuple slot per
        reaction type, in the fixed order like, love, wow, haha, sad, angry
        (one Graph API call per type)."""
        reaction_types = ['like', 'love', 'wow', 'haha', 'sad', 'angry']
        reactions_dict = {}  # dict of {status_id: tuple<6>}

        for reaction_type in reaction_types:
            fields = "&fields=reactions.type({}).limit(0).summary(total_count)".format(
                reaction_type.upper())

            url = base_url + fields

            data = json.loads(request_until_succeed(url))['data']

            data_processed = set()  # set() removes rare duplicates in statuses
            for status in data:
                # renamed from `id`, which shadowed the builtin
                comment_id = status['id']
                count = status['reactions']['summary']['total_count']
                data_processed.add((comment_id, count))

            for comment_id, count in data_processed:
                if comment_id in reactions_dict:
                    reactions_dict[comment_id] = reactions_dict[comment_id] + (count, )
                else:
                    reactions_dict[comment_id] = (count, )

        return reactions_dict

    def processFacebookComment(comment, status_id, parent_id=''):
        """Flatten one Graph API comment dict into the tuple
        (comment_id, status_id, parent_id, message, published, num_reactions).

        Some keys may not always exist, so existence is checked first.
        """
        comment_id = comment['id']
        # BUG FIX: the original used `is ''` (identity comparison against a
        # string literal) here and below — unreliable and a SyntaxWarning
        # since Python 3.8. Replaced with equality comparison.
        comment_message = '' if 'message' not in comment or comment['message'] \
                                == '' else unicode_decode(comment['message'])
        num_reactions = 0 if 'reactions' not in comment else \
            comment['reactions']['summary']['total_count']

        if 'attachment' in comment:
            attachment_type = comment['attachment']['type']
            attachment_type = 'gif' if attachment_type == 'animated_image_share' \
                else attachment_type
            attach_tag = "[[{}]]".format(attachment_type.upper())
            comment_message = attach_tag if comment_message == '' else \
                comment_message + " " + attach_tag

        # Time needs special care since a) it's in UTC and
        # b) it's not easy to use in statistical programs.

        comment_published = datetime.datetime.strptime(
            comment['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
        comment_published = comment_published + datetime.timedelta(
            hours=-5)  # EST
        comment_published = comment_published.strftime(
            '%Y-%m-%d %H:%M:%S')  # best time format for spreadsheet programs

        # Return a tuple of all processed data

        return (comment_id, status_id, parent_id, comment_message,
                comment_published, num_reactions)

    def scrapeFacebookPageFeedComments(page_id, access_token):
        """Walk the paginated comment feed (and each comment's reply feed)
        for every status in '<file_id>_facebook_statuses.csv' and write one
        CSV row per comment/reply, including per-type reaction counts."""
        with open('{}_facebook_comments.csv'.format(file_id),
                  'w',
                  encoding="utf-8") as file:
            w = csv.writer(file)
            w.writerow([
                "comment_id", "status_id", "parent_id", "comment_message",
                "comment_published", "num_reactions", "num_likes", "num_loves",
                "num_wows", "num_hahas", "num_sads", "num_angrys",
                "num_special"
            ])

            num_processed = 0
            scrape_starttime = datetime.datetime.now()
            after = ''
            base = "https://graph.facebook.com/v2.9"
            parameters = "/?limit={}&access_token={}".format(100, access_token)

            print("Scraping {} Comments From Posts: {}\n".format(
                file_id, scrape_starttime))

            with open('{}_facebook_statuses.csv'.format(file_id),
                      'r',
                      encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)

                # Uncomment below line to scrape comments for a specific status_id
                # reader = [dict(status_id='5550296508_10154352768246509')]

                for status in reader:
                    has_next_page = True

                    while has_next_page:

                        node = "/{}/comments".format(status['status_id'])
                        # `is ''` fixed to `== ''` (identity vs equality)
                        after = '' if after == '' else "&after={}".format(
                            after)
                        base_url = base + node + parameters + after

                        url = getFacebookCommentFeedUrl(base_url)
                        comments = json.loads(request_until_succeed(url))
                        reactions = getReactionsForComments(base_url)

                        for comment in comments['data']:
                            comment_data = processFacebookComment(
                                comment, status['status_id'])
                            reactions_data = reactions[comment_data[0]]

                            # calculate thankful/pride through algebra:
                            # total reactions minus the six known types
                            num_special = comment_data[5] - sum(reactions_data)
                            w.writerow(comment_data + reactions_data +
                                       (num_special, ))

                            if 'comments' in comment:
                                # This comment has replies; page through them.
                                has_next_subpage = True
                                sub_after = ''

                                while has_next_subpage:
                                    sub_node = "/{}/comments".format(
                                        comment['id'])
                                    # `is ''` fixed to `== ''`
                                    sub_after = '' if sub_after == '' else "&after={}".format(
                                        sub_after)
                                    sub_base_url = base + sub_node + parameters + sub_after

                                    sub_url = getFacebookCommentFeedUrl(
                                        sub_base_url)
                                    sub_comments = json.loads(
                                        request_until_succeed(sub_url))
                                    sub_reactions = getReactionsForComments(
                                        sub_base_url)

                                    for sub_comment in sub_comments['data']:
                                        sub_comment_data = processFacebookComment(
                                            sub_comment, status['status_id'],
                                            comment['id'])
                                        sub_reactions_data = sub_reactions[
                                            sub_comment_data[0]]

                                        num_sub_special = sub_comment_data[
                                            5] - sum(sub_reactions_data)

                                        w.writerow(sub_comment_data +
                                                   sub_reactions_data +
                                                   (num_sub_special, ))

                                        num_processed += 1
                                        if num_processed % 100 == 0:
                                            print("{} Comments Processed: {}".
                                                  format(
                                                      num_processed,
                                                      datetime.datetime.now()))

                                    if 'paging' in sub_comments:
                                        if 'next' in sub_comments['paging']:
                                            sub_after = sub_comments['paging'][
                                                'cursors']['after']
                                        else:
                                            has_next_subpage = False
                                    else:
                                        has_next_subpage = False

                            # output progress occasionally to make sure code is not
                            # stalling
                            num_processed += 1
                            if num_processed % 100 == 0:
                                print("{} Comments Processed: {}".format(
                                    num_processed, datetime.datetime.now()))

                        if 'paging' in comments:
                            if 'next' in comments['paging']:
                                after = comments['paging']['cursors']['after']
                            else:
                                has_next_page = False
                        else:
                            has_next_page = False

            print("\nDone!\n{} Comments Processed in {}".format(
                num_processed,
                datetime.datetime.now() - scrape_starttime))

    scrapeFacebookPageFeedComments(file_id, access_token)
    return file_id