def getComments(self, postid, limit=100, **kwargs):
    if not isinstance(postid, str):
        postid = str(postid)
    # Cap the request size to stay within reasonable API bounds.
    if limit > 7500:
        limit = 7500
    url = self._get_api_url() + '/{}/comments'.format(postid)
    url = self._prepare_url(url + '?fields={}&limit={}'.format(
        ','.join(COMMENT_FIELDS), limit))
    has_next_page = True
    num_processed = 0
    after = ''
    comments = []
    while has_next_page:
        # `after` holds the cursor from the previous page, if any.
        after = '' if after == '' else "&after={}".format(after)
        base_url = url + after
        response = self._request_until_succeed(base_url)
        trans_comments = translate().process_comments(response['data'])
        comments = comments + trans_comments
        num_processed = num_processed + len(trans_comments)
        if 'paging' in response:
            after = response['paging']['cursors']['after']
        else:
            has_next_page = False
        if num_processed >= limit:
            # Trim to the requested limit; the last page may overshoot it.
            return comments[:limit]
    return comments
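
# Usage (a minimal sketch; the `FacebookScraper` class name and its
# constructor arguments are assumptions, not part of this module):
#
#     fb = FacebookScraper(access_token='<token>')
#     comments = fb.getComments('12345_67890', limit=200)
#     for comment in comments[:5]:
#         print(comment.get('message'))
#
# Each page is fetched with the Graph API "after" cursor, so the call keeps
# paging until `limit` comments are collected or the response no longer
# carries a "paging" object.
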
def getvideos(self, q, n=10, nextPageToken=None):
    flag = True
    self.videos = {}
    self.channels = {}
    video_counter = 0
    # Never ask the API for more results per page than the caller wants.
    page_size = min(n, self.DEFAULT_VIDEO_SIZE)
    while flag:
        # Use a fresh batch for every page: an already-executed batch
        # would re-send its old requests if executed again.
        batch = self.youtube.new_batch_http_request()
        if nextPageToken:
            search_response = self.youtube.search().list(
                q=q, part="id,snippet", maxResults=page_size,
                type='video', pageToken=nextPageToken).execute()
        else:
            search_response = self.youtube.search().list(
                q=q, part="id,snippet", maxResults=page_size,
                type='video').execute()
        nextPageToken = search_response.get('nextPageToken')
        if not nextPageToken:
            flag = False
        items = search_response.get('items', [])
        video_counter += len(items)
        for item in items:
            self.videos[item['id']['videoId']] = item
            self.channels[item['snippet']['channelId']] = item
            # Fan out one batched request per video for its comments,
            # statistics and channel info.
            batch.add(self.youtube.commentThreads().list(
                part=self.commentSnippet, videoId=item['id']['videoId'],
                maxResults=50, textFormat='plainText'),
                callback=self.comments)
            batch.add(self.youtube.videos().list(
                part=self.videoSnippet, id=item['id']['videoId']),
                callback=self.videoStats)
            batch.add(self.youtube.channels().list(
                part=self.channelSnippet, id=item['snippet']['channelId'],
                maxResults=30), callback=self.channelInfo)
        batch.execute()
        if video_counter >= n:
            break
    videos = self.delete_keys_from_dict(self.videos, self.DELETE_FIELDS)
    videos = list(videos.values())
    videos = translate().process_videos(data=videos)
    return videos
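
# A minimal sketch of how a batch callback such as `self.comments` could be
# written. google-api-python-client invokes each batch callback with
# (request_id, response, exception); the body below, which attaches comment
# text to the videos collected in getvideos, is an assumption rather than
# the actual implementation.
def comments_sketch(self, request_id, response, exception):
    if exception is not None:
        # One failed sub-request should not abort the rest of the batch.
        logger.warning("commentThreads request {} failed: {}".format(
            request_id, exception))
        return
    for thread in response.get('items', []):
        video_id = thread['snippet']['videoId']
        text = thread['snippet']['topLevelComment']['snippet']['textDisplay']
        self.videos[video_id].setdefault('comments', []).append(text)
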
def insert_tweet_data(self, data):
    tweets = translate().process_tweets(data)
    # groupby only groups consecutive items, so sort by the key first.
    tweets.sort(key=lambda x: x.get('searchKey'))
    grouped_tweets = groupby(tweets, lambda x: x.get('searchKey'))
    for key, value in grouped_tweets:
        inserted_ids = inserttweets(db=db, data=list(value))
        if len(inserted_ids) > 0:
            min_id = inserted_ids[0]
            max_id = inserted_ids[-1]
            logger.info("Inserted {} documents for search key {}".format(
                len(inserted_ids), key))
            self.change_alert_record(key, min_id, max_id, len(inserted_ids))
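
# itertools.groupby only groups *consecutive* elements, which is why the
# data is sorted first. An illustrative session:
#
#     >>> from itertools import groupby
#     >>> rows = [{'searchKey': 'a'}, {'searchKey': 'b'}, {'searchKey': 'a'}]
#     >>> [k for k, _ in groupby(rows, lambda x: x.get('searchKey'))]
#     ['a', 'b', 'a']        # unsorted input: 'a' is split into two groups
#     >>> rows.sort(key=lambda x: x.get('searchKey'))
#     >>> [k for k, _ in groupby(rows, lambda x: x.get('searchKey'))]
#     ['a', 'b']             # sorted input: one group per key
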
def getTweets(self, q, limit=100, **kwargs):
    logger.info("q param {}".format(q))
    flag = True
    tweets = []
    count = 0
    max_id = None
    kwargs['q'] = q
    kwargs['count'] = 100
    while flag:
        current_tweet_ids = []
        if max_id:
            # max_id is inclusive, so ask for ids strictly below the
            # oldest tweet already fetched.
            kwargs['max_id'] = max_id - 1
        url = self.buildUrl(SEARCH_URL, **kwargs)
        data, resp = self.oauth_req(url)
        data = json.loads(data)
        length = len(data['statuses'])
        count = count + length
        if length == 0:
            flag = False
        else:
            for tweet in data['statuses']:
                current_tweet_ids.append(tweet.get('id'))
            max_id = min(current_tweet_ids)
            tweets = tweets + translate().process_tweets(data['statuses'])
            if count >= limit:
                return tweets[:limit]
    if count < limit:
        logger.warning(
            "Requested {} tweets but got {} tweets from the API".format(
                limit, len(tweets)))
    return tweets
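
# Pagination sketch: the search API returns tweets newest-first and treats
# max_id as inclusive, so each request asks for ids strictly below the
# oldest id already seen (ids are illustrative):
#
#     page 1 -> ids [110, 105, 101]   next max_id = 101 - 1 = 100
#     page 2 -> ids [ 99,  95,  90]   next max_id =  90 - 1 =  89
#     ... until a page comes back empty or `limit` tweets are collected.
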
def get_user_tweets(self, q, ids, limit=50, **kwargs):
    tweets_final = []
    logger.info("userid param {}".format(ids))
    for id in ids:
        logger.info("fetching timeline for user {}".format(id))
        flag = True
        tweets = []
        count = 0
        max_id = None
        kwargs['user_id'] = id
        kwargs['count'] = 50
        while flag:
            if max_id:
                # max_id is inclusive, so ask for ids strictly below the
                # oldest tweet already fetched.
                kwargs['max_id'] = max_id - 1
            url = self.buildUrl(USER_TIME_LINE, **kwargs)
            data, resp = self.oauth_req(url)
            data = json.loads(data)
            if len(data) == 0:
                # The timeline is exhausted for this user.
                flag = False
                continue
            # Take the page cursor from the *unfiltered* page; a page with
            # no matches must not stop the pagination.
            max_id = min(tweet.get('id') for tweet in data)
            # Keep only tweets whose serialized JSON contains the query.
            data = list(filter(lambda x: json.dumps(x).rfind(q) != -1, data))
            count = count + len(data)
            if data:
                tweets = tweets + translate().process_tweets(data)
            if count >= limit:
                flag = False
        if count < limit:
            logger.warning(
                "Requested {} tweets but got {} tweets from the API".format(
                    limit, len(tweets)))
        tweets_final += tweets
    return tweets_final
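
# The `q` filter above is a plain, case-sensitive substring match over the
# serialized tweet, not a tokenized search:
#
#     >>> import json
#     >>> json.dumps({'text': 'I love python'}).rfind('python') != -1
#     True
#     >>> json.dumps({'text': 'I love Python'}).rfind('python') != -1
#     False
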
def getPosts(self, pageid, limit=100, commentLimit=50, **kwargs):
    if not isinstance(pageid, str):
        pageid = str(pageid)
    page = self.getPage(pageid=pageid)
    url = self._get_api_url() + '/{}/posts'.format(pageid)
    # The Graph API caps page size at 100; larger limits are served by
    # paging with the "after" cursor below.
    url = self._prepare_url(url + '?fields={}&limit={}'.format(
        ','.join(POST_FIELDS), min(limit, 100)))
    has_next_page = True
    num_processed = 0
    after = ''
    since = "&since={}".format(kwargs['since']) if 'since' in kwargs else ''
    until = "&until={}".format(kwargs['until']) if 'until' in kwargs else ''
    logger.info("Scraping Facebook page: {}".format(pageid))
    posts = []
    posts_indexed = {}

    def indexposts(posts):
        for post in posts:
            posts_indexed[post.get('id')] = post

    while has_next_page:
        after = '' if after == '' else "&after={}".format(after)
        base_url = url + after + since + until
        response = self._request_until_succeed(base_url)
        if response:
            statuses = translate().process_posts(data=response['data'])
        else:
            statuses = []
        indexposts(statuses)
        # if commentLimit > 50:
        #     post_ids = [status.get('id') for status in statuses]
        #     comments = self.getCommentsPosts(post_ids, limit=commentLimit)
        #     for key, value in comments.items():
        #         post = posts_indexed[key]
        #         post['comments']['data'] = value
        #         posts_indexed[key] = post
        if commentLimit > 50:
            logger.info("User asked for comments above the default size")
        else:
            logger.info("Going with the default comment limit")
        for status in list(posts_indexed.values()):
            comments = translate().process_comments(
                status['comments']['data'])
            status['comments']['data'] = comments
            status['page'] = page
            posts.append(Post(data=status))
            num_processed += 1
            if num_processed % 100 == 0:
                logger.info("{} statuses processed: {}".format(
                    num_processed, datetime.datetime.now()))
        # If there is no next page, we're done.
        if response and 'paging' in response:
            after = response['paging']['cursors']['after']
        else:
            has_next_page = False
        posts_indexed = {}
        if num_processed >= limit:
            return posts[:limit]
    if num_processed < limit:
        logger.warning('Requested {} but got {} posts from page {}'.format(
            limit, num_processed, pageid))
    return posts
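
# Usage (a minimal sketch; the `FacebookScraper` class name, its
# constructor, and the unix timestamps are assumptions):
#
#     fb = FacebookScraper(access_token='<token>')
#     posts = fb.getPosts('somepage', limit=250, commentLimit=50,
#                         since=1514764800, until=1517443200)
#     print(len(posts))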