def collectPostComments(self, postID):
    # Collect every comment on a single Facebook post, following the comments
    # paging cursor until the last page is reached.
    self.prompt("(post_id: {}) Collecting post's comments . . .".format(postID))
    comment_list_returned = []
    comments_collected = 0
    base = "https://graph.facebook.com/v3.0"
    node = "/{}".format(postID)
    parameters = "?access_token={}&fields=comments".format(self.access_token)
    nextcommentpage = ''
    url = base + node + parameters + nextcommentpage
    searchnextpage = True
    while searchnextpage:
        data = json.loads(self.requestUntilSucceed(url).decode())
        if 'comments' in data:
            for comment in data['comments']['data']:
                comment_dict = {}
                comment_dict['comment_id'] = comment['id']
                comment_dict['comment_message'] = eu.cleanStrings(comment['message'])
                comment_dict['comment_createdDate'] = eu.formatFacebookTime(comment['created_time'])
                comment_list_returned.append(comment_dict)
                comments_collected += 1
                if comments_collected % 10 == 0:
                    self.prompt("(post_id: {}) {} Comments collected!".format(postID, comments_collected))
            if 'next' in data['comments']['paging']:
                url = data['comments']['paging']['next']
            else:
                searchnextpage = False
        else:
            searchnextpage = False
    self.prompt("(post_id: {}) All Post's comments collected! total: {}".format(postID, comments_collected))
    return comment_list_returned
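
# The Facebook collectors in this section call self.requestUntilSucceed, which
# is not included here. A minimal sketch of the assumed contract (GET the Graph
# API URL and return the raw response bytes, retrying on transient failures);
# the urllib calls and the 5-second back-off below are assumptions, not the
# project's actual implementation:
def requestUntilSucceed(self, url):
    import time  # assumed; these imports would normally live at module top
    import urllib.request
    import urllib.error
    while True:
        try:
            with urllib.request.urlopen(url) as response:
                # Callers .decode() the returned bytes and feed them to json.loads().
                return response.read()
        except urllib.error.URLError as e:
            logging.warning("Request to %s failed (%s), retrying in 5s", url, e)
            time.sleep(5)
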
def collectVideoStatistics(self, videoID):
    self.prompt("(video_id: {}) Collecting video's statistics . . .".format(videoID))
    stat_dict_returned = {}
    parameters = {
        "part": "statistics,snippet",
        "id": videoID,
        "key": self.api_key,
        "fields": "items(id,snippet(publishedAt,title),statistics(commentCount,dislikeCount,likeCount,viewCount))"
    }
    url = "https://www.googleapis.com/youtube/v3/videos"
    page = requests.request(method="get", url=url, params=parameters)
    data = self.retryUntilSuccess(page)
    stat_results = json.loads(data)
    stat_dict_returned['video_statistic'] = stat_results['items'][0]['statistics']
    video_title = stat_results['items'][0]['snippet']['title']
    stat_dict_returned['video_title'] = eu.cleanStrings(video_title)
    video_createdDate = stat_results['items'][0]['snippet']['publishedAt']
    stat_dict_returned['video_createdDate'] = eu.formatYoutubeTime(video_createdDate)
    self.prompt("(video_id: {}) Video's statistics collected!".format(videoID))
    return stat_dict_returned
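
# collectVideoStatistics (and collectVideoComments below) rely on
# self.retryUntilSuccess, which is not shown in this section. Judging by the
# call sites, it takes the requests.Response of the first attempt and returns
# the JSON body text; the re-send loop below is an assumed sketch, not the
# project's code:
def retryUntilSuccess(self, page):
    import time  # assumed; would normally live at module top
    while not page.ok:
        logging.warning("YouTube API returned %s, retrying: %s", page.status_code, page.url)
        time.sleep(5)
        # Re-issue the same prepared request.
        page = requests.Session().send(page.request)
    return page.text
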
def crawlPost(self, pemdaID, pemdaName, pageType, pageID):
    self.prompt('(pemda_id: {}, pemda_name: {}, page_type: {}, page_id: {}) Crawl Started!'.format(
        pemdaID, pemdaName, pageType, pageID))
    # If the page already exists in the database, only crawl up to
    # self.crawllimit new posts; otherwise crawl the whole page.
    channel_exist = self.fmc.checkAccount(pageID.lower())
    if channel_exist:
        crawllimit = self.crawllimit
    else:
        crawllimit = None
    complete_dict = {}
    complete_dict['account'] = {}
    complete_dict['account']['page_id'] = pageID.lower()
    complete_dict['account']['page_type'] = pageType
    base = "https://graph.facebook.com/v3.0"
    node = "/{}".format(pageID)
    parameters = "?access_token={}&fields=id,name,fan_count,posts.limit({})".format(self.access_token, self.crawllimit)
    since_date = "2016-01-01"
    until_date = ""
    since = ".since({})".format(since_date) if since_date != '' else ''
    until = ".until({})".format(until_date) if until_date != '' else ''
    after = ''
    fields = ("{message,link,created_time,type,name,id,"
              "comments.limit(0).summary(true),shares,"
              "reactions.type(LIKE).summary(total_count).limit(0).as(like),"
              "reactions.type(LOVE).summary(total_count).limit(0).as(love),"
              "reactions.type(WOW).summary(total_count).limit(0).as(wow),"
              "reactions.type(HAHA).summary(total_count).limit(0).as(haha),"
              "reactions.type(SAD).summary(total_count).limit(0).as(sad),"
              "reactions.type(ANGRY).summary(total_count).limit(0).as(angry)}")
    posts_crawled = 0
    searchnextpage = True
    while searchnextpage:
        url = base + node + parameters + since + until + after + fields
        j_input = json.loads(self.requestUntilSucceed(url).decode())
        complete_dict['account']['page_id_number'] = j_input['id']
        complete_dict['account']['page_name'] = j_input['name']
        complete_dict['account']['page_fanCount'] = j_input['fan_count']
        if 'posts' in j_input:
            for post in j_input['posts']['data']:
                complete_dict['post'] = {}
                complete_dict['post']['post_id'] = post['id']
                complete_dict['post']['post_type'] = post['type']
                complete_dict['post']['post_message'] = '' if 'message' not in post else eu.cleanStrings(post['message'])
                complete_dict['post']['post_createdtime'] = eu.formatFacebookTime(post['created_time'])
                complete_dict['post']['post_commentCount'] = 0 if 'comments' not in post else post['comments']['summary']['total_count']
                complete_dict['post']['post_shareCount'] = 0 if 'shares' not in post else post['shares']['count']
                complete_dict['post']['post_reaction'] = {}
                complete_dict['post']['post_reaction']['like'] = post['like']['summary']['total_count']
                complete_dict['post']['post_reaction']['love'] = post['love']['summary']['total_count']
                complete_dict['post']['post_reaction']['wow'] = post['wow']['summary']['total_count']
                complete_dict['post']['post_reaction']['haha'] = post['haha']['summary']['total_count']
                complete_dict['post']['post_reaction']['sad'] = post['sad']['summary']['total_count']
                complete_dict['post']['post_reaction']['angry'] = post['angry']['summary']['total_count']
                self.p.pushPostDocument(complete_dict)
                # returned_comments = []
                # returned_comments = self.collectPostComments(post['id'])
                # complete_dict['comment'] = returned_comments
                # self.p.pushCommentDocument(complete_dict)
                posts_crawled += 1
                self.prompt('(page_id: {}) {} Post crawled!'.format(pageID, posts_crawled))
                if posts_crawled == crawllimit:
                    searchnextpage = False
                    break
            # Advance the paging cursor for the next request.
            after = ".after({})".format(j_input['posts']['paging']['cursors']['after'])
        else:
            searchnextpage = False
    self.prompt("(page_id: {}) All Post crawled! total: {}".format(pageID, posts_crawled))
    self.p.pushAccountDocument(complete_dict)
    self.prompt('(pemda_id: {}, pemda_name: {}, page_type: {}, page_id: {}) Finished crawling!'.format(
        pemdaID, pemdaName, pageType, pageID))
def collectPostComments(self, base_url):
    # Collect the comments of every post returned by base_url, keyed by post id.
    comments_dict = {}
    fields = "{id,comments}"
    url = base_url + fields
    data = json.loads(self.requestUntilSucceed(url).decode())['posts']['data']
    for status in data:
        status_id = status['id']
        comment_list = []
        if 'comments' in status:
            comment_list = status['comments']['data']
            paging = status['comments']['paging']
            for comment in comment_list:
                comment['comment_id'] = comment['id']
                comment['comment_message'] = eu.cleanStrings(comment['message'])
                comment['comment_createdDate'] = eu.formatFacebookTime(comment['created_time'])
                comment.pop('id', None)
                comment.pop('message', None)
                comment.pop('created_time', None)
            if 'next' in paging:
                nextcommentpageurl = status['comments']['paging']['next']
                comments_has_next_page = True
                while comments_has_next_page:
                    data_nextpage = json.loads(self.requestUntilSucceed(nextcommentpageurl).decode())
                    if data_nextpage['data']:
                        for comment in data_nextpage['data']:
                            comment['comment_id'] = comment['id']
                            comment['comment_message'] = eu.cleanStrings(comment['message'])
                            comment['comment_createdDate'] = eu.formatFacebookTime(comment['created_time'])
                            comment.pop('id', None)
                            comment.pop('message', None)
                            comment.pop('created_time', None)
                        comment_list = comment_list + data_nextpage['data']
                        if 'next' in data_nextpage['paging']:
                            nextcommentpageurl = data_nextpage['paging']['next']
                        else:
                            comments_has_next_page = False
                    else:
                        comments_has_next_page = False
        comments_dict[status_id] = comment_list
    return comments_dict
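
# This variant expects base_url to already point at a page's posts edge, so that
# appending the "{id,comments}" field expansion yields each post together with
# its comments; the response is then read via ['posts']['data'] above. An
# illustrative (assumed) base_url, with placeholder page id and token:
#
#   base_url = ("https://graph.facebook.com/v3.0/<page_id>"
#               "?access_token=<access_token>&fields=posts.limit(25)")
#   comments_by_post_id = self.collectPostComments(base_url)
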
def crawlTweets(self, pemdaID, pemdaName, accountID):
    self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Crawl Started !'.format(pemdaID, pemdaName, accountID))
    # Check whether this account id already exists in the database.
    account_exist = self.tmc.checkAccount(accountID.lower())
    # If it does, limit the crawl to self.crawllimit tweets; otherwise crawl the whole account.
    if account_exist:
        crawllimit = self.crawllimit
    else:
        crawllimit = None
    complete_list = []
    tweets_crawled = 0
    complete_dict = {}
    complete_dict['account'] = {}
    complete_dict['account']['account_id'] = accountID.lower()
    try:
        for tweets in tweepy.Cursor(
                self.twitterAPI.user_timeline,
                screen_name=accountID,
                count=100,
                include_rts=True,
                tweet_mode='extended').items():
            json_str = json.dumps(tweets._json)
            j_results = json.loads(json_str)
            if 'RT @' not in j_results['full_text']:
                # account_id_number and followerCount can only be taken from crawled data (a tweet).
                complete_dict['account']['account_id_number'] = j_results['user']['id_str']
                complete_dict['account']['account_followerCount'] = j_results['user']['followers_count']
                complete_dict['post'] = {}
                complete_dict['post']['tweet_id'] = j_results['id_str']
                complete_dict['post']['tweet_message'] = eu.cleanStrings(j_results['full_text'])
                complete_dict['post']['tweet_createdDate'] = eu.formatTwitterTime(j_results['created_at'])
                complete_dict['post']['tweet_retweetCount'] = j_results['retweet_count']
                complete_dict['post']['tweet_favoriteCount'] = j_results['favorite_count']
                # Plain-text tweets carry no media entity in the crawl result, so tweet_type
                # has to be initialised manually.
                complete_dict['post']['tweet_type'] = "text"
                if 'entities' in j_results:
                    if 'media' in j_results['entities']:
                        complete_dict['post']['tweet_type'] = j_results['entities']['media'][0]['type']
                if 'extended_entities' in j_results:
                    if 'media' in j_results['extended_entities']:
                        complete_dict['post']['tweet_type'] = j_results['extended_entities']['media'][0]['type']
                complete_dict['post']['tweet_replyCount'] = 0
                complete_list.append(complete_dict.copy())
                # Counter
                tweets_crawled += 1
                self.prompt('(account_id: {}, tweet_id: {}) Tweets Crawled ! total: {}'.format(accountID, complete_dict['post']['tweet_id'], tweets_crawled))
                # Stop crawling once crawllimit is reached.
                if tweets_crawled == crawllimit:
                    break
        if complete_list:
            # Fill in the reply counts via the Search API in collectReplies().
            self.collectReplies(complete_list)
            # Push each json/dict to create a post document.
            for one_complete_dict in complete_list:
                self.p.pushPostDocument(one_complete_dict)
            # Push the json/dict to create the account document.
            self.p.pushAccountDocument(complete_dict)
    except tweepy.TweepError as e:
        logging.error(e)
        if e.reason == 'Twitter error response: status code = 404':
            raise NoAccountException
    self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Done Crawling !'.format(pemdaID, pemdaName, accountID))
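
# collectReplies is called above but not included in this section. The standard
# timeline API exposes no reply count, and the comments say it is looked up via
# the Search API. The sketch below is an assumption about how that lookup might
# work (search tweets addressed to the account and match
# in_reply_to_status_id_str against each crawled tweet); it is not the
# project's implementation, and the standard search only covers roughly the
# last 7 days of tweets:
def collectReplies(self, complete_list):
    account_id = complete_list[0]['account']['account_id']
    tweet_ids = {d['post']['tweet_id'] for d in complete_list}
    reply_counts = {tweet_id: 0 for tweet_id in tweet_ids}
    # Search tweets directed at the account since the oldest crawled tweet.
    for reply in tweepy.Cursor(self.twitterAPI.search,
                               q='to:{}'.format(account_id),
                               since_id=min(int(t) for t in tweet_ids),
                               count=100).items():
        parent_id = reply._json.get('in_reply_to_status_id_str')
        if parent_id in reply_counts:
            reply_counts[parent_id] += 1
    # Write the counts back into each post dict.
    for d in complete_list:
        d['post']['tweet_replyCount'] = reply_counts[d['post']['tweet_id']]
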
def crawlTweets(self, pemdaID, pemdaName, accountID):
    '''
    Start crawling with the arguments:
        pemdaID: pemda (local government) ID
        pemdaName: pemda name
        accountID: the pemda's official Twitter account ID
    '''
    self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Crawl Started !'.format(pemdaID, pemdaName, accountID))
    # Check whether this account id already exists in the database.
    tmc = TwitterMongoConnector()
    account_exist = tmc.checkAccount(int(pemdaID), accountID.lower())
    # If it does, limit the crawl to self.crawllimit tweets; otherwise crawl the whole account.
    if account_exist:
        crawllimit = self.crawllimit
    else:
        crawllimit = None
    '''
    Start crawling:
    every crawled tweet is serialised into a json/dict of the following shape:
    {
        pemda_id: <pemda id>
        pemda_name: <pemda name>
        account: {
            account_id: <account id>
            account_id_number: <account id in numeric form>
            account_followerCount: <follower count of the account; only available from the crawled tweet data>
        }
        post: {
            tweet_id: <unique tweet id>
            tweet_message: <text/message of the tweet>
            tweet_createdDate: <date the tweet was created>
            tweet_retweetCount: <retweet count of the tweet>
            tweet_favoriteCount: <favourite count of the tweet>
            tweet_type: <tweet type>
            tweet_replyCount: <reply count of the tweet, looked up manually with the Search API>
        }
    }
    '''
    try:
        tweets_crawled = 0
        complete_dict = {}
        # Convert pemdaID to int (it arrives from the spreadsheet as a string).
        complete_dict['pemda_id'] = int(pemdaID)
        complete_dict['pemda_name'] = pemdaName
        complete_dict['account'] = {}
        # Lower-case the account id taken from the spreadsheet.
        complete_dict['account']['account_id'] = accountID.lower()
        complete_dict['post'] = {}
        for tweets in tweepy.Cursor(self.twitterAPI.user_timeline,
                                    screen_name=accountID,
                                    count=100,
                                    include_rts=True,
                                    tweet_mode='extended').items():
            json_str = json.dumps(tweets._json)
            j_results = json.loads(json_str)
            if 'RT @' not in j_results['full_text']:
                # account_id_number and followerCount can only be taken from crawled data (a tweet).
                complete_dict['account']['account_id_number'] = j_results['user']['id_str']
                complete_dict['account']['account_followerCount'] = j_results['user']['followers_count']
                complete_dict['post']['tweet_id'] = j_results['id_str']
                complete_dict['post']['tweet_message'] = eu.cleanStrings(j_results['full_text'])
                complete_dict['post']['tweet_createdDate'] = eu.formatTwitterTime(j_results['created_at'])
                complete_dict['post']['tweet_retweetCount'] = j_results['retweet_count']
                complete_dict['post']['tweet_favoriteCount'] = j_results['favorite_count']
                # Plain-text tweets carry no media entity in the crawl result, so tweet_type
                # has to be initialised manually.
                if 'media' in j_results['entities']:
                    complete_dict['post']['tweet_type'] = j_results['entities']['media'][0]['type']
                else:
                    complete_dict['post']['tweet_type'] = "text"
                # Counter
                tweets_crawled += 1
                self.prompt('(account_id: {}, tweet_id: {}) Tweets Crawled ! total: {}'.format(accountID, complete_dict['post']['tweet_id'], tweets_crawled))
                # Fetch the reply count via the Search API in collectReplies().
                complete_dict['post']['tweet_replyCount'] = self.collectReplies(accountID, j_results['id_str'])
                # Push the json/dict to create a post document.
                self.p.pushPostDocument(complete_dict)
                # Stop crawling once crawllimit is reached.
                if tweets_crawled == crawllimit:
                    break
        # Push the json/dict to create the account document.
        self.p.pushAccountDocument(complete_dict)
    except tweepy.TweepError as e:
        logging.error(e)
        if e.reason == 'Twitter error response: status code = 404':
            raise NoAccountException
    self.prompt('(pemda_id: {}, pemda_name: {}, pemda_account: {}) Done Crawling !'.format(pemdaID, pemdaName, accountID))
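
# Illustrative call into this crawlTweets variant; the class name, constructor,
# and values below are placeholders, not names from the source. pemdaID arrives
# from the spreadsheet as a string and is converted to int inside the method:
#
#   crawler = TwitterCrawler(twitter_credentials, crawllimit=100)   # hypothetical constructor
#   crawler.crawlTweets(pemdaID='101', pemdaName='Kota Contoh', accountID='PemkotContoh')
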
def collectVideoComments(self, videoID):
    # Collect the top-level comments of a video, following nextPageToken until
    # the last page of commentThreads is reached.
    self.prompt("(video_id: {}) Collecting video's comments . . .".format(videoID))
    comm_dict_returned = []
    parameters = {
        "part": "snippet",
        "maxResults": 100,
        "videoId": videoID,
        "key": self.api_key,
        "fields": "items(snippet(topLevelComment(id,snippet(publishedAt,textOriginal)))),nextPageToken"
    }
    url = "https://www.googleapis.com/youtube/v3/commentThreads"
    nextPageToken = ''
    has_next_page = True
    comments_collected = 0
    while has_next_page:
        parameters['pageToken'] = nextPageToken
        page = requests.request(method="get", url=url, params=parameters)
        data = self.retryUntilSuccess(page)
        comment_results = json.loads(data)
        if 'items' in comment_results and comment_results['items']:
            for comment in comment_results['items']:
                comm_dict = {}
                comm_dict['comment_id'] = comment['snippet']['topLevelComment']['id']
                comment_message = comment['snippet']['topLevelComment']['snippet']['textOriginal']
                comm_dict['comment_message'] = eu.cleanStrings(comment_message)
                comment_createdDate = comment['snippet']['topLevelComment']['snippet']['publishedAt']
                comm_dict['comment_createdDate'] = eu.formatYoutubeTime(comment_createdDate)
                comm_dict_returned.append(comm_dict)
                comments_collected += 1
                if comments_collected % 10 == 0:
                    self.prompt("(video_id: {}) {} Comments collected!".format(videoID, comments_collected))
        if 'nextPageToken' in comment_results:
            nextPageToken = comment_results['nextPageToken']
        else:
            has_next_page = False
    self.prompt("(video_id: {}) All Video's comments collected! total: {}".format(videoID, comments_collected))
    return comm_dict_returned
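
# eu.cleanStrings and the eu.format*Time helpers used throughout this section
# come from a shared utility module that is not included here. Each platform
# returns timestamps in a different format, which is presumably why there are
# three separate formatters. The parsing below is an illustrative sketch of
# those input formats only; the helpers' actual output format is not shown in
# this section, and the function name is hypothetical:
def _timestamp_format_sketch():
    from datetime import datetime
    # Facebook Graph API created_time, e.g. "2018-05-01T10:20:30+0000"
    fb = datetime.strptime("2018-05-01T10:20:30+0000", "%Y-%m-%dT%H:%M:%S%z")
    # Twitter created_at, e.g. "Tue May 01 10:20:30 +0000 2018"
    tw = datetime.strptime("Tue May 01 10:20:30 +0000 2018", "%a %b %d %H:%M:%S %z %Y")
    # YouTube Data API publishedAt, e.g. "2018-05-01T10:20:30.000Z"
    yt = datetime.strptime("2018-05-01T10:20:30.000Z", "%Y-%m-%dT%H:%M:%S.%fZ")
    return fb, tw, yt
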