def collectPostComments(self, postID):
        self.prompt("(post_id: {}) Collecting post's comments . . .".format(postID))

        comment_list_returned = []

        comments_collected = 0

        base = "https://graph.facebook.com/v3.0"
        node = "/{}".format(postID)
        parameters = "?access_token={}&fields=comments".format(self.access_token)
        nextcommentpage = ''

        url = base + node + parameters + nextcommentpage

        searchnextpage = True
        while searchnextpage:

            data = json.loads(self.requestUntilSucceed(url).decode())

            if 'comments' in data:

                for comment in data['comments']['data']:

                    comment_dict = {}
                    comment_dict['comment_id'] = comment['id']
                    comment_dict['comment_message'] = eu.cleanStrings(comment['message'])
                    comment_dict['comment_createdDate'] = eu.formatFacebookTime(comment['created_time'])

                    comment_list_returned.append(comment_dict)

                    comments_collected += 1

                    if comments_collected % 10 == 0:
                        self.prompt("(post_id: {}) {} Comments collected!".format(postID, comments_collected))

                if 'paging' in data['comments'] and 'next' in data['comments']['paging']:
                    url = data['comments']['paging']['next']
                else:
                    searchnextpage = False

            else:
                searchnextpage = False

        self.prompt("(post_id: {}) All Post's comments collected! total: {}".format(postID, comments_collected))

        return comment_list_returned
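    # Hypothetical sketch of the requestUntilSucceed helper assumed by the Graph API
    # methods in this file: it retries a GET until it succeeds and returns the raw
    # response bytes (hence the .decode() at the call sites). Assumes urllib.request,
    # time, and logging are imported at module level; the wait interval is a guess.
    def requestUntilSucceed(self, url, wait_seconds=5):
        while True:
            try:
                with urllib.request.urlopen(url) as response:
                    return response.read()
            except Exception as e:
                logging.error(e)
                self.prompt("Request failed, retrying in {}s . . .".format(wait_seconds))
                time.sleep(wait_seconds)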
    def collectVideoStatistics(self, videoID):
        self.prompt(
            "(video_id: {}) Collecting video's statistics . . .".format(
                videoID))

        stat_dict_returned = {}

        parameters = {
            "part": "statistics,snippet",
            "id": videoID,
            "key": self.api_key,
            "fields": "items(id,snippet(publishedAt,title),statistics(commentCount,dislikeCount,likeCount,viewCount))"
        }
        url = "https://www.googleapis.com/youtube/v3/videos"

        page = requests.request(method="get", url=url, params=parameters)
        data = self.retryUntilSuccess(page)
        stat_results = json.loads(data)

        stat_dict_returned['video_statistic'] = stat_results['items'][0][
            'statistics']

        video_title = stat_results['items'][0]['snippet']['title']
        stat_dict_returned['video_title'] = eu.cleanStrings(video_title)

        video_createdDate = stat_results['items'][0]['snippet']['publishedAt']
        stat_dict_returned['video_createdDate'] = eu.formatYoutubeTime(
            video_createdDate)

        self.prompt(
            "(video_id: {}) Video's statistics collected!".format(videoID))

        return stat_dict_returned
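    # Hypothetical sketch of the retryUntilSuccess helper assumed by the YouTube
    # methods: it takes the requests.Response already obtained, re-issues the same
    # request until the Data API answers with HTTP 200, and returns the body text
    # for json.loads(). Assumes requests, time, and logging are imported at module
    # level; the retry interval is a guess.
    def retryUntilSuccess(self, page, wait_seconds=5):
        while page.status_code != 200:
            logging.error("Request to {} failed with status {}, retrying . . .".format(
                page.url, page.status_code))
            time.sleep(wait_seconds)
            page = requests.get(page.url)
        return page.text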
    def crawlPost(self, pemdaID, pemdaName, pageType, pageID):
        self.prompt('(pemda_id: {}, pemda_name: {}, page_type: {}, page_id: {}) Crawl Started!'.format(
            pemdaID,
            pemdaName,
            pageType,
            pageID
        ))

        channel_exist = self.fmc.checkAccount(pageID.lower())

        if channel_exist:
            crawllimit = self.crawllimit
        else:
            crawllimit = None

        complete_dict = {}

        complete_dict['account'] = {}
        complete_dict['account']['page_id'] = pageID.lower()
        complete_dict['account']['page_type'] = pageType

        base = "https://graph.facebook.com/v3.0"
        node = "/{}".format(pageID)
        parameters = "?access_token={}&fields=id,name,fan_count,posts.limit({})".format(self.access_token, self.crawllimit)
        since_date = "2016-01-01"
        until_date = ""

        since = ".since({})".format(since_date) if since_date \
            is not '' else ''
        until = ".until({})".format(until_date) if until_date \
            is not '' else ''

        after = ''

        fields = "{message,link,created_time,type,name,id,comments.limit(0).summary(true),shares," + \
                 "reactions.type(LIKE).summary(total_count).limit(0).as(like)," +\
                 "reactions.type(LOVE).summary(total_count).limit(0).as(love)," +\
                 "reactions.type(WOW).summary(total_count).limit(0).as(wow)," +\
                 "reactions.type(HAHA).summary(total_count).limit(0).as(haha)," +\
                 "reactions.type(SAD).summary(total_count).limit(0).as(sad)," +\
                 "reactions.type(ANGRY).summary(total_count).limit(0).as(angry)}"

        posts_crawled = 0

        searchnextpage = True
        while searchnextpage:

            url = base + node + parameters + since + until + after + fields
            j_input = json.loads(self.requestUntilSucceed(url).decode())

            complete_dict['account']['page_id_number'] = j_input['id']
            complete_dict['account']['page_name'] = j_input['name']
            complete_dict['account']['page_fanCount'] = j_input['fan_count']

            if 'posts' in j_input:

                for post in j_input['posts']['data']:

                    complete_dict['post'] = {}
                    complete_dict['post']['post_id'] = post['id']
                    complete_dict['post']['post_type'] = post['type']
                    complete_dict['post']['post_message'] = '' if 'message' not in post else eu.cleanStrings(post['message'])
                    complete_dict['post']['post_createdtime'] = eu.formatFacebookTime(post['created_time'])

                    complete_dict['post']['post_commentCount'] = 0 if 'comments' not in post else post['comments']['summary']['total_count']
                    complete_dict['post']['post_shareCount'] = 0 if 'shares' not in post else post['shares']['count']

                    complete_dict['post']['post_reaction'] = {}
                    complete_dict['post']['post_reaction']['like'] = post['like']['summary']['total_count']
                    complete_dict['post']['post_reaction']['love'] = post['love']['summary']['total_count']
                    complete_dict['post']['post_reaction']['wow'] = post['wow']['summary']['total_count']
                    complete_dict['post']['post_reaction']['haha'] = post['haha']['summary']['total_count']
                    complete_dict['post']['post_reaction']['sad'] = post['sad']['summary']['total_count']
                    complete_dict['post']['post_reaction']['angry'] = post['angry']['summary']['total_count']

                    self.p.pushPostDocument(complete_dict)

                    # returned_comments = []
                    # returned_comments = self.collectPostComments(post['id'])

                    # complete_dict['comment'] = returned_comments

                    # self.p.pushCommentDocument(complete_dict)

                    posts_crawled += 1

                    self.prompt('(page_id: {}) {} Post crawled!'.format(
                            pageID,
                            posts_crawled
                    ))

                    if posts_crawled == crawllimit:
                        searchnextpage = False
                        break

                after = ".after({})".format(j_input['posts']['paging']['cursors']['after'])

            else:
                searchnextpage = False

            self.prompt("(page_id: {}) All Post crawled! total: {}".format(pageID, posts_crawled))

        self.p.pushAccountDocument(complete_dict)

        self.prompt('(pemda_id: {}, pemda_name: {}, page_type: {}, page_id: {}) Finished crawling!'.format(
            pemdaID,
            pemdaName,
            pageType,
            pageID
        ))
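    # Illustrative usage (hypothetical IDs; the crawler constructor is not shown above):
    # crawler = FacebookCrawler(access_token="PAGE_ACCESS_TOKEN", crawllimit=100)
    # crawler.crawlPost(pemdaID="101", pemdaName="Kota Contoh",
    #                   pageType="facebook", pageID="PemkotContoh")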
    def collectPostComments(self, base_url):

        comments_dict = {}

        fields = "{id,comments}"
        url = base_url + fields

        data = json.loads(self.requestUntilSucceed(url).decode())['posts']['data']

        for status in data:
            status_id = status['id']
            comment_list = []

            if 'comments' in status:

                comment_list = status['comments']['data']
                paging = status['comments']['paging']

                for comment in comment_list:

                    comment['comment_id'] = comment['id']
                    comment['comment_message'] = eu.cleanStrings(comment['message'])
                    comment['comment_createdDate'] = eu.formatFacebookTime(comment['created_time'])

                    comment.pop('id', None)
                    comment.pop('message', None)
                    comment.pop('created_time', None)

                if 'next' in paging:
                    nextcommentpageurl = status['comments']['paging']['next']

                    comments_has_next_page = True

                    while comments_has_next_page:

                        data_nextpage = json.loads(self.requestUntilSucceed(nextcommentpageurl).decode())

                        if data_nextpage['data']:

                            for comment in data_nextpage['data']:

                                comment['comment_id'] = comment['id']
                                comment['comment_message'] = eu.cleanStrings(comment['message'])
                                comment['comment_createdDate'] = eu.formatFacebookTime(comment['created_time'])

                                comment.pop('id', None)
                                comment.pop('message', None)
                                comment.pop('created_time', None)

                            comment_list = comment_list + data_nextpage['data']

                            if 'next' in data_nextpage['paging']:
                                nextcommentpageurl = data_nextpage['paging']['next']
                            else:
                                comments_has_next_page = False

                        else:
                            comments_has_next_page = False

            comments_dict[status_id] = comment_list

        return comments_dict
    def crawlTweets(self, pemdaID, pemdaName, accountID):

        self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Crawl Started !'.format(pemdaID, pemdaName, accountID))

        # Check whether this account ID already exists in the database

        account_exist = self.tmc.checkAccount(accountID.lower())

        # If it exists, limit the crawl to self.crawllimit; otherwise crawl the account in full.

        if account_exist:
            crawllimit = self.crawllimit
        else:
            crawllimit = None

        complete_list = []

        tweets_crawled = 0

        complete_dict = {}

        complete_dict['account'] = {}

        complete_dict['account']['account_id'] = accountID.lower()

        try:

            for tweets in tweepy.Cursor(
                    self.twitterAPI.user_timeline,
                    screen_name=accountID,
                    count=100,
                    include_rts=True,
                    tweet_mode='extended').items():

                json_str = json.dumps(tweets._json)
                j_results = json.loads(json_str)

                if 'RT @' not in j_results['full_text']:

                    # account_id_number and followerCount can only be read from the crawled tweet data

                    complete_dict['account']['account_id_number'] = j_results['user']['id_str']
                    complete_dict['account']['account_followerCount'] = j_results['user']['followers_count']

                    complete_dict['post'] = {}
                    complete_dict['post']['tweet_id'] = j_results['id_str']
                    complete_dict['post']['tweet_message'] = eu.cleanStrings(j_results['full_text'])
                    complete_dict['post']['tweet_createdDate'] = eu.formatTwitterTime(j_results['created_at'])
                    complete_dict['post']['tweet_retweetCount'] = j_results['retweet_count']
                    complete_dict['post']['tweet_favoriteCount'] = j_results['favorite_count']
                    complete_dict['post']['tweet_type'] = "text"
                    # Plain-text tweets (as crawled) carry no media entity, so tweet_type is initialized to "text" manually

                    if 'entities' in j_results:
                        if 'media' in j_results['entities']:
                            complete_dict['post']['tweet_type'] = j_results['entities']['media'][0]['type']

                    if 'extended_entities' in j_results:
                        if 'media' in j_results['extended_entities']:
                            complete_dict['post']['tweet_type'] = j_results['extended_entities']['media'][0]['type']

                    complete_dict['post']['tweet_replyCount'] = 0

                    complete_list.append(complete_dict.copy())

                    # Counter

                    tweets_crawled += 1
                    self.prompt('(account_id: {}, tweet_id: {}) Tweets Crawled ! total: {}'.format(accountID, complete_dict['post']['tweet_id'], tweets_crawled))

                    # Stop crawling once the crawl limit is reached

                    if tweets_crawled == crawllimit:
                        break

            if complete_list:

                # Fetch reply counts via the Search API in collectReplies()

                self.collectReplies(complete_list)

                # Push each json/dict to create the post documents

                for one_complete_dict in complete_list:

                    self.p.pushPostDocument(one_complete_dict)

            # Push the json/dict to create the account document

            self.p.pushAccountDocument(complete_dict)

        except tweepy.TweepError as e:
            logging.error(e)
            if e.reason == 'Twitter error response: status code = 404':
                raise NoAccountException

        self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Done Crawling !'.format(pemdaID, pemdaName, accountID))
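    # Hypothetical sketch of the collectReplies helper assumed by the method above
    # (the list-based call): for each crawled tweet it counts replies addressed to
    # the account via the standard Search API (tweepy < 4 interface, as implied by
    # tweepy.TweepError) and fills tweet_replyCount in place. Names, query form,
    # and limits are assumptions, not the original implementation.
    def collectReplies(self, complete_list):
        account_id = complete_list[0]['account']['account_id']
        reply_counts = {d['post']['tweet_id']: 0 for d in complete_list}
        oldest_id = min(int(tweet_id) for tweet_id in reply_counts)

        # One pass over recent tweets addressed to the account; count replies per parent tweet.
        for reply in tweepy.Cursor(self.twitterAPI.search,
                                   q='to:{}'.format(account_id),
                                   since_id=oldest_id,
                                   tweet_mode='extended').items():
            parent_id = reply.in_reply_to_status_id_str
            if parent_id in reply_counts:
                reply_counts[parent_id] += 1

        for one_complete_dict in complete_list:
            tweet_id = one_complete_dict['post']['tweet_id']
            one_complete_dict['post']['tweet_replyCount'] = reply_counts[tweet_id]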
    def crawlTweets(self, pemdaID, pemdaName, accountID):
        '''
            Start crawling with the arguments:
                pemdaID: local-government (pemda) ID
                pemdaName: local-government name
                accountID: the pemda's official Twitter account ID

        '''

        self.prompt(
            '(pemda_id: {}, pemda_name: {}, pemda_account: {}) Crawl Started !'
            .format(pemdaID, pemdaName, accountID))

        # Check whether this account ID already exists in the database

        tmc = TwitterMongoConnector()
        account_exist = tmc.checkAccount(int(pemdaID), accountID.lower())

        # If it exists, limit the crawl to self.crawllimit; otherwise crawl the account in full.

        if account_exist:
            crawllimit = self.crawllimit
        else:
            crawllimit = None
        '''
            Start crawling:
            The crawl results are serialized into the following json/dict structure:
            {
                pemda_id: <pemda ID>
                pemda_name: <pemda name>
                account: {
                    account_id: <account ID>
                    account_id_number: <account ID in numeric form>
                    account_followerCount: <the account's follower count; only available from the crawled tweet data>
                }
                post: {
                    tweet_id: <unique tweet ID>
                    tweet_message: <tweet content/message>
                    tweet_createdDate: <date the tweet was created>
                    tweet_retweetCount: <number of retweets of the tweet>
                    tweet_favoriteCount: <number of favorites of the tweet>
                    tweet_type: <tweet type>
                    tweet_replyCount: <number of replies to the tweet; collected manually via the Search API>
                }
            }
        '''
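        # Illustrative example of one serialized document (all values hypothetical,
        # including the date format produced by eu.formatTwitterTime):
        #
        # {
        #     "pemda_id": 101,
        #     "pemda_name": "Kota Contoh",
        #     "account": {
        #         "account_id": "pemkotcontoh",
        #         "account_id_number": "123456789",
        #         "account_followerCount": 5432
        #     },
        #     "post": {
        #         "tweet_id": "1234567890123456789",
        #         "tweet_message": "Sample announcement text",
        #         "tweet_createdDate": "2018-05-01 10:00:00",
        #         "tweet_retweetCount": 12,
        #         "tweet_favoriteCount": 34,
        #         "tweet_type": "photo",
        #         "tweet_replyCount": 3
        #     }
        # }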

        try:
            tweets_crawled = 0

            complete_dict = {}

            # Convert pemdaID to int (it comes from the spreadsheet as a string)

            complete_dict['pemda_id'] = int(pemdaID)
            complete_dict['pemda_name'] = pemdaName

            complete_dict['account'] = {}

            # Lowercase the account ID taken from the spreadsheet

            complete_dict['account']['account_id'] = accountID.lower()

            complete_dict['post'] = {}

            for tweets in tweepy.Cursor(self.twitterAPI.user_timeline,
                                        screen_name=accountID,
                                        count=100,
                                        include_rts=True,
                                        tweet_mode='extended').items():

                json_str = json.dumps(tweets._json)
                j_results = json.loads(json_str)

                if 'RT @' not in j_results['full_text']:

                    # account_id_number and followerCount can only be read from the crawled tweet data

                    complete_dict['account']['account_id_number'] = j_results['user']['id_str']
                    complete_dict['account']['account_followerCount'] = j_results['user']['followers_count']

                    complete_dict['post']['tweet_id'] = j_results['id_str']
                    complete_dict['post']['tweet_message'] = eu.cleanStrings(j_results['full_text'])
                    complete_dict['post']['tweet_createdDate'] = eu.formatTwitterTime(j_results['created_at'])
                    complete_dict['post']['tweet_retweetCount'] = j_results['retweet_count']
                    complete_dict['post']['tweet_favoriteCount'] = j_results['favorite_count']

                    # Plain-text tweets (as crawled) carry no media entity, so tweet_type is initialized to "text" manually

                    if 'media' in j_results['entities']:
                        complete_dict['post']['tweet_type'] = j_results['entities']['media'][0]['type']
                    else:
                        complete_dict['post']['tweet_type'] = "text"

                    # Counter

                    tweets_crawled += 1
                    self.prompt(
                        '(account_id: {}, tweet_id: {}) Tweets Crawled ! total: {}'
                        .format(accountID, complete_dict['post']['tweet_id'],
                                tweets_crawled))

                    # Fetch the reply count via the Search API in collectReplies()

                    complete_dict['post']['tweet_replyCount'] = self.collectReplies(
                        accountID, j_results['id_str'])

                    # Push the json/dict to create the post document

                    self.p.pushPostDocument(complete_dict)

                    # Stop crawling once the crawl limit is reached

                    if tweets_crawled == crawllimit:
                        break

            # Push the json/dict to create the account document

            self.p.pushAccountDocument(complete_dict)

        except tweepy.TweepError as e:
            logging.error(e)
            if e.reason == 'Twitter error response: status code = 404':
                raise NoAccountException

        self.prompt(
            '(pemda_id: {}, pemda_name: {}, pemda_account: {}) Done Crawling !'
            .format(pemdaID, pemdaName, accountID))
    def collectVideoComments(self, videoID):
        self.prompt(
            "(video_id: {}) Collecting video's comments . . .".format(videoID))

        comm_dict_returned = []
        parameters = {
            "part": "snippet",
            "maxResults": 100,
            "videoId": videoID,
            "key": self.api_key,
            "fields": "items(snippet(topLevelComment(id,snippet(publishedAt,textOriginal)))),nextPageToken"
        }
        url = "https://www.googleapis.com/youtube/v3/commentThreads"
        nextPageToken = ''
        has_next_page = True

        comments_collected = 0

        while has_next_page:
            parameters['pageToken'] = nextPageToken

            page = requests.request(method="get", url=url, params=parameters)
            data = self.retryUntilSuccess(page)
            comment_results = json.loads(data)

            if 'items' in comment_results and comment_results['items']:

                for comment in comment_results['items']:
                    comm_dict = {}
                    comm_dict['comment_id'] = comment['snippet'][
                        'topLevelComment']['id']

                    comment_message = comment['snippet']['topLevelComment'][
                        'snippet']['textOriginal']
                    comm_dict['comment_message'] = eu.cleanStrings(
                        comment_message)

                    comment_createdDate = comment['snippet'][
                        'topLevelComment']['snippet']['publishedAt']
                    comm_dict['comment_createdDate'] = eu.formatYoutubeTime(
                        comment_createdDate)

                    comm_dict_returned.append(comm_dict)

                    comments_collected += 1

                    if comments_collected % 10 == 0:
                        self.prompt(
                            "(video_id: {}) {} Comments collected!".format(
                                videoID, comments_collected))

            if 'nextPageToken' in comment_results:
                nextPageToken = comment_results['nextPageToken']
            else:
                has_next_page = False

        self.prompt(
            "(video_id: {}) All Video's comments collected! total: {}".format(
                videoID, comments_collected))

        return comm_dict_returned
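    # Illustrative usage (hypothetical video ID and wiring): the two YouTube helpers
    # above are typically combined into one document per video before pushing it.
    # stats = self.collectVideoStatistics("dQw4w9WgXcQ")
    # stats['comment'] = self.collectVideoComments("dQw4w9WgXcQ")
    # self.p.pushVideoDocument(stats)   # pushVideoDocument is an assumed pusher method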