Code Example #1
File: scraping_tools.py    Project: MarkVdBergh/fb_test
    def check_rate_limit(self, store=True, pause=True):
        # ToDo: Ugly processing !!!
        # Check the response header of an api call to know if rate is limited
        base = 'https://graph.facebook.com/v2.8'
        pageid = '270994524621'
        access = '&access_token={}'.format(self.access_token)
        node = '/{}/posts'.format(pageid)
        fields = '?fields=id,created_time'

        url = base + node + fields + access
        r = requests.get(url)
        timestamp = utc_now()
        rate_limit = r.headers.get('x-app-usage')
        while rate_limit:
            doc = {
                'type': 'fb_rate_limit',
                'status': r.status_code,
                'timestamp': timestamp,
                'rate_limit': rate_limit
            }
            if store: save_log(doc)
            print 'Rate is limited! Pausing for 60 sec'
            print doc
            time.sleep(60)
            r = requests.get(url)
            timestamp = utc_now()
            rate_limit = r.headers.get('x-app-usage')
        return
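The check_rate_limit method above pauses whenever an x-app-usage header is present, which the ToDo flags as ugly: Facebook also returns that header (a JSON string of usage percentages) on ordinary, non-throttled responses. Below is a minimal sketch of a stricter check; the helper name and the 90% threshold are illustrative assumptions, not project code.

import json
import time

import requests


def get_when_not_throttled(url, threshold=90, pause_sec=60):
    """Fetch `url`, pausing while the reported app usage is at or above `threshold` percent."""
    while True:
        r = requests.get(url)
        header = r.headers.get('x-app-usage')
        if not header:
            return r  # No usage header: nothing to throttle on.
        usage = json.loads(header)
        worst = max(usage.get('call_count', 0),
                    usage.get('total_time', 0),
                    usage.get('total_cputime', 0))
        if worst < threshold:
            return r
        print('App usage at {}%, sleeping {} sec'.format(worst, pause_sec))
        time.sleep(pause_sec)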
Code Example #2
File: scraping_tools.py    Project: MarkVdBergh/fb_test
    def get_all_comments(self, id, limit=500):
        if is_blacklisted(id):
            print '\nComment blacklisted: {}'.format(id)
            return [{'blacklisted': id}]
        # ToDo: get likes for comments
        # ToDo: get_object doesn't retrieve enough fields.
        #ToDo: Try to correct malformed comment ids. Ex. '103536906359022:898483933530978:10103399554941441_898491433530228'

        fields = 'id,created_time,comments.limit(%d){attachment,comment_count,from,id,like_count,message,likes{pic,link,name,username,picture{url,is_silhouette}},created_time}' % limit
        try:
            comments = self.graph.get_object(id,
                                             fields=fields,
                                             date_format='U')
        except Exception:
            doc = {'id': id, 'timestamp': utc_now(), 'type': 'message'}
            save_blacklist_doc(doc=doc)
            print '\nBlacklisted: {}'.format(id)
            # Update the blacklist in the FacebookScraping instance
            self.blacklist = get_blacklist()
            return [{'blacklisted': id}]

        allcomments = []
        # The first response nests the comments edge under ['comments']; pages
        # fetched from paging['next'] return the edge ({'data', 'paging'}) directly.
        comment_page = comments.get('comments', {})
        while True:
            try:
                for comment in comment_page['data']:
                    allcomments.append(comment)
                    if comment['comment_count'] > 0:  # Get sub comments
                        sub_comment = self.get_all_comments(comment['id'])
                        allcomments = allcomments + sub_comment
                # When there are no more pages (['paging']['next']), KeyError breaks the loop.
                comment_page = requests.get(comment_page['paging']['next']).json()
            except KeyError:
                break
        return allcomments
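Both get_all_comments above and get_all_posts further down repeat the same pagination idiom: keep following ['paging']['next'] until the key disappears. A small generator capturing that pattern, as a sketch (the helper name is not part of the project):

import requests


def iter_graph_pages(first_page):
    """Yield each page of a Graph API edge response, following paging['next']."""
    page = first_page
    while True:
        yield page
        try:
            next_url = page['paging']['next']
        except KeyError:
            break  # Last page reached.
        page = requests.get(next_url).json()

# Example: for page in iter_graph_pages(chunk): process(page['data'])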
Code Example #3
File: engine.py    Project: MarkVdBergh/fb_test
    def __init__(self, pageidlist=None, access_token=None, since=None, until=None):
        """
        :param pageidlist: [str, str, ...]
        :param access_token:
        :param since: (yyyy,m,d): local date
        :param until: (yyyy,m,d): local date
        """
        MON = MongoDb(collection='facebook')
        if not access_token:
            self.access_token = FB_APP_ID + "|" + FB_APP_SECRET
        else:
            self.access_token = access_token
        if not pageidlist:
            self.pageidlist = PAGE_LIST
        else:
            self.pageidlist = pageidlist

        # convert local date (yyyy,mm,dd) into utc datetime
        if not since:
            self.since = yyyymmdd_to_datetime(2000, 1, 1, 0, 0, 0)
        else:
            self.since = yyyymmdd_to_datetime(since[0], since[1], since[2], 0, 0, 0)
        if not until:
            self.until = utc_now()
        else:
            self.until = yyyymmdd_to_datetime(until[0], until[1], until[2], 23, 59, 59)
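yyyymmdd_to_datetime is only referenced here; a plausible sketch of such a helper is shown below, assuming pytz and a hard-coded local zone (both assumptions, the project's actual implementation may differ).

import datetime

import pytz

LOCAL_TZ = pytz.timezone('Europe/Brussels')  # assumed local timezone, adjust as needed


def yyyymmdd_to_datetime(year, month, day, hour=0, minute=0, second=0):
    """Interpret the arguments as local wall-clock time and return a UTC datetime."""
    local_dt = LOCAL_TZ.localize(datetime.datetime(year, month, day, hour, minute, second))
    return local_dt.astimezone(pytz.utc)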
Code Example #4
    def get_posts_from_page(self, pageid, since=0, until=None, limit=0, filtr=None, projection=None):
        """
        :param pageid: id of the page whose posts are queried
        :param since: lower bound timestamp (currently unused, see commented filter)
        :param until: upper bound timestamp; defaults to utc_now()
        :param limit: maximum number of documents to return (0 = no limit)
        :param filtr: MongoDB query document
        :param projection: MongoDB projection document
        :return: pymongo cursor over the matching posts
        """
        # Resolve defaults per call to avoid mutable/early-bound default arguments.
        if until is None:
            until = utc_now()
        if filtr is None:
            filtr = {}
        # filtr['created_time'] = {'$gte': since, '$lt': until}

        posts = self.collection.find(filtr, projection).limit(limit)
        return posts
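A usage sketch of get_posts_from_page with the date-range filter that the commented-out line hints at; the instance name, page id and timestamps are illustrative.

since_ts = 1483228800   # 2017-01-01 00:00:00 UTC, example value
until_ts = 1485907200   # 2017-02-01 00:00:00 UTC, example value
filtr = {'created_time': {'$gte': since_ts, '$lt': until_ts}}
projection = {'_id': 0, 'id': 1, 'created_time': 1, 'message': 1}
# `pages` is assumed to be an instance of the class this method belongs to.
for post in pages.get_posts_from_page('270994524621', filtr=filtr,
                                      projection=projection, limit=100):
    print(post['id'])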
Code Example #5
    def __init__(self, pageid, since, till):
        self.pageid = pageid
        self.since = since if since else 0
        self.till = till if till else utc_now()
        self.all_posts = None
        # Create document template
        day_stat_data_template = {
            'reactions': {
                'like': 0,
                'love': 0,
                'haha': 0,
                'wow': 0,
                'sad': 0,
                'angry': 0
            },
            'comments': {
                'nr': 0,
                'likes': 0
            }
        }
        # One independent copy per day of the month, keyed '1'..'31'
        # (requires `import copy` at module level).
        day_stat = {str(i): copy.deepcopy(day_stat_data_template) for i in range(1, 32)}
        self.doc = {'id': pageid, 'date': 0, 'day_stats': day_stat}
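Below is a sketch of how a post could be folded into this per-day template; the function is illustrative and assumes the post documents produced by get_all_posts in example #7 (with 'created_time_dt', 'reactions' and 'comments' set).

def add_post_to_day_stats(doc, post):
    """Increment the counters for the calendar day the post was created on."""
    day_stat = doc['day_stats'][str(post['created_time_dt'].day)]  # keys '1'..'31'
    for reaction in post.get('reactions', []):
        rtype = reaction.get('type', '').lower()  # Graph API reaction types are upper case
        if rtype in day_stat['reactions']:
            day_stat['reactions'][rtype] += 1
    comments = post.get('comments', [])
    day_stat['comments']['nr'] += len(comments)
    day_stat['comments']['likes'] += sum(c.get('like_count', 0) for c in comments)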
Code Example #6
File: run_scrape.py    Project: MarkVdBergh/fb_test
    bulkdays = 3
    lst = 'POLITICS'
elif list_nr == 2:
    page_ids = pageids_news
    bulkdays = 1
    lst = 'NEWS'
elif list_nr > 1000:
    page_ids = ['{}'.format(list_nr)]
    bulkdays = int(raw_input('Enter bulkdays: '))
    lst = list_nr
else:
    page_ids = ['37823307325']
    bulkdays = 2
    lst = 'TEST'
i = 0
print '\n{} Start scraping list: {}. Attempt: {}\n'.format(utc_now(), lst, i)
while i < 10:
    try:
        result = ENG.run_scraping(pageidlist=page_ids,
                                  resume=resume,
                                  bulkdays=bulkdays)
        if result:
            print '{} End scraping list {}'.format(utc_now(), lst)
            break
    except Exception:
        e = sys.exc_info()[0]
        print e
        i += 1  # only failed attempts count towards the 10-attempt limit
        time.sleep(600)

# ENG.run_scraping(pageidlist=['446368145415026'], resume=False, bulkdays=30)
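The retry loop above can be expressed as a reusable helper; the sketch below uses illustrative names and sleeps between attempts whether the call raised or returned a falsy result.

import time


def retry(func, attempts=10, wait_sec=600):
    """Call `func` until it returns a truthy result or the attempts run out."""
    for attempt in range(attempts):
        try:
            result = func()
            if result:
                return result
        except Exception as exc:
            print('Attempt {} failed: {!r}'.format(attempt, exc))
        time.sleep(wait_sec)
    return None

# Example: result = retry(lambda: ENG.run_scraping(pageidlist=page_ids,
#                                                  resume=resume, bulkdays=bulkdays))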
Code Example #7
File: scraping_tools.py    Project: MarkVdBergh/fb_test
    def get_all_posts(self,
                      page_id,
                      fields='all',
                      since=None,
                      until=None,
                      limit=100):
        """
        Gets all posts on a page, group or user
        :param page_id: string
        The unique id of the page, group or user
        :param fields: comma separated string, 'all', None
        A description of all fields can be found at: https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
        Can be:
            - Comma separated string: with all fields that need to be retrieved.
            - 'all': comma separated string with default fields
            - None: facebook default fields

        :return: dict
        """
        if fields == 'all':
            # For a list of fields, see:
            # https://developers.facebook.com/docs/graph-api/reference/v2.8/post/
            fields = 'id, name,created_time, from, to, type, status_type, message, link, picture, story, shares'  # , likes,reactions'
            # ToDo: get_connections vs Get_objects. How to use limit

        chunk = self.graph.get_connections(page_id,
                                           connection_name='posts',
                                           fields=fields,
                                           date_format='U',
                                           since=since,
                                           until=until)
        # Add data to each post
        posts = []
        profile = self.get_page_profile(page_id)
        while True:  # get all chunks of 25 posts for a page
            for i, post in enumerate(chunk['data']):
                print '{}/25\t Get data from "{}" for post: {}  \t'.format(
                    i, profile['name'], post['id']),
                post_id = post['id']
                post['profile'] = profile
                post['update_ts'] = datetime_to_timestamp(utc_now())
                post['update_dt'] = datetime.datetime.now()  # ToDo: add tz=pytz.utc ???
                post['comments'] = self.get_all_comments(post_id)
                post['reactions'] = self.get_all_reactions(post_id)
                post['created_time_dt'] = timestamp_to_datetime(
                    post['created_time'])  # local timezone
                posts.append(post)
                print 'comments:{}, \treactions:{}'.format(
                    len(post['comments']), len(post['reactions']))
            # Attempt to make a request to the next page of data, if it exists.
            try:
                chunk = requests.get(chunk['paging']['next']).json()
            except KeyError:  # When there are no more pages (['paging']['next']), break from the loop and end the script.
                break
        # The posts arrive in descending order [NEW, ..., OLD]. That is not ideal:
        # when saving, NEW would be written first, so a crash would leave OLD missing,
        # and a resumed scrape (which starts again from NEW) would never backfill it.
        # Reversing the order avoids having to roll back inserted posts after a crash.
        posts.reverse()
        return posts
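A usage sketch for get_all_posts; the constructor signature of FacebookScraping, the page id and the timestamp are assumptions.

scraper = FacebookScraping(access_token=FB_APP_ID + '|' + FB_APP_SECRET)  # signature assumed
posts = scraper.get_all_posts('270994524621', since=1483228800)  # since 2017-01-01 UTC
print('Fetched {} posts, oldest first'.format(len(posts)))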