def extract_cursor_from_payload(payload):
    found_cursor = nested_get(CURSOR_FIRST_POSSIBLE_PATH, payload)

    if found_cursor is None:
        found_cursor = nested_get(CURSOR_SECOND_POSSIBLE_PATH, payload)

    return found_cursor
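# NOTE: a minimal sketch of what `nested_get` is assumed to do across the
# snippets below: walk a dotted-string or list path into nested dicts/lists
# and return a default when any step is missing. The actual minet helper may
# differ; this is only an illustration of the expected behavior.
def nested_get_sketch(path, data, default=None):
    # Accept either 'a.b.c' style strings or ['a', 'b', 0, 'c'] style lists
    keys = path.split('.') if isinstance(path, str) else path

    current = data

    for key in keys:
        try:
            current = current[key]
        except (KeyError, IndexError, TypeError):
            return default

    return current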
def payload_tweets_iter(payload):
    tweet_index = payload['globalObjects']['tweets']
    user_index = payload['globalObjects']['users']

    for instruction in payload['timeline']['instructions']:
        if 'addEntries' in instruction:
            entries = instruction['addEntries']['entries']
        elif 'replaceEntry' in instruction:
            entries = [instruction['replaceEntry']['entry']]
        else:
            continue

        for entry in entries:
            entry_id = entry['entryId']

            # Filtering tweets
            if (
                not entry_id.startswith('sq-I-t-') and
                not entry_id.startswith('tweet-')
            ):
                continue

            tweet_meta = nested_get(['content', 'item', 'content', 'tweet'], entry)

            if tweet_meta is None:
                tweet_meta = nested_get(
                    ['content', 'item', 'content', 'tombstone', 'tweet'],
                    entry
                )

            # Parsing error?
            if tweet_meta is None:
                raise TwitterPublicAPIParsingError

            # Skipping ads
            if 'promotedMetadata' in tweet_meta:
                continue

            tweet = process_single_tweet(tweet_meta['id'], tweet_index, user_index)

            # Additional metadata
            meta = None

            if tweet is not None:
                if 'forwardPivot' in tweet_meta:
                    pivot = tweet_meta['forwardPivot']

                    meta = {
                        'intervention_text': nested_get(['text', 'text'], pivot),
                        'intervention_type': pivot.get('displayType'),
                        'intervention_url': nested_get(['landingUrl', 'url'], pivot)
                    }

                yield tweet, meta
def generator():
    starting_url = forge_comments_url(self.key, video_id)

    queue = deque([(False, video_id, starting_url)])

    while len(queue) != 0:
        is_reply, item_id, url = queue.popleft()

        result = self.request_json(url)

        for item in result['items']:
            comment_id = item['id']
            replies = nested_get(['replies', 'comments'], item, [])
            total_reply_count = nested_get(['snippet', 'totalReplyCount'], item, 0)

            if not raw:
                item = format_comment(item) if not is_reply else format_reply(item, video_id=video_id)

            yield item

            if is_reply:
                continue

            # Getting replies
            if not full_replies or len(replies) >= total_reply_count:
                for reply in replies:
                    if not raw:
                        reply = format_reply(reply)

                    yield reply
            elif total_reply_count > 0:
                replies_url = forge_replies_url(self.key, comment_id)
                queue.append((True, comment_id, replies_url))

        if len(result['items']) == 0:
            break

        # Next page
        token = result.get('nextPageToken')

        if token is not None:
            forge = forge_replies_url if is_reply else forge_comments_url
            next_url = forge(self.key, item_id, token=token)
            queue.append((is_reply, item_id, next_url))
def crowdtangle_summary(http, link, token=None, start_date=None,
                        with_top_posts=False,
                        sort_by=CROWDTANGLE_SUMMARY_DEFAULT_SORT_TYPE,
                        format='csv_dict_row', platforms=None):
    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.summary: unknown `format`.')

    if not isinstance(start_date, str):
        raise TypeError('minet.crowdtangle.summary: expecting a `start_date` kwarg.')

    if sort_by not in CROWDTANGLE_SUMMARY_SORT_TYPES:
        raise TypeError('minet.crowdtangle.summary: unknown `sort_by`.')

    # Fetching
    api_url = url_forge(link, token, start_date, sort_by, platforms, with_top_posts)

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    stats = nested_get(['result', 'summary', 'facebook'], data)
    posts = nested_get(['result', 'posts'], data) if with_top_posts else None

    if stats is not None:
        if format == 'csv_dict_row':
            stats = format_summary(stats, as_dict=True)
        elif format == 'csv_row':
            stats = format_summary(stats)

    if not with_top_posts:
        return stats

    if posts is not None:
        if format == 'csv_dict_row':
            posts = [format_post(post, as_dict=True) for post in posts]
        elif format == 'csv_row':
            posts = [format_post(post) for post in posts]

    return stats, posts
def format_comment(item):
    meta = item['snippet']
    snippet = nested_get(['snippet', 'topLevelComment', 'snippet'], item)

    row = YouTubeComment(
        meta['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        nested_get(['authorChannelId', 'value'], snippet),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        int(meta['totalReplyCount']),
        None
    )

    return row
def crowdtangle_lists(pool, token=None, format='csv_dict_row'):
    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.lists: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % token

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    lists = nested_get(['result', 'lists'], data)

    if format == 'csv_dict_row':
        return [format_list(l, as_dict=True) for l in lists]
    elif format == 'csv_row':
        return [format_list(l) for l in lists]

    return lists
def crowdtangle_post(pool, post_id, token=None, format='csv_dict_row'):
    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.post: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    err, response, data = request_json(api_url, pool=pool)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    post = nested_get(['result', 'posts', 0], data)

    if post is None:
        return

    if format == 'csv_dict_row':
        return format_post(post, as_dict=True)
    elif format == 'csv_row':
        return format_post(post)

    return post
def request_search(self, query, cursor=None, refs=None, dump=False):
    params = forge_search_params(query, cursor=cursor)
    url = '%s?%s' % (TWITTER_PUBLIC_SEARCH_ENDPOINT, params)

    headers = {
        'Authorization': TWITTER_PUBLIC_API_AUTH_HEADER,
        'X-Guest-Token': self.guest_token,
        'Cookie': self.cookie,
        'Accept-Language': 'en'
    }

    err, response, data = self.request_json(url, headers=headers)

    if err:
        raise err

    if response.status == 429:
        self.reset()
        raise TwitterPublicAPIRateLimitError

    if response.status >= 400:
        error = nested_get(['errors', 0], data)

        if error is not None and error.get('code') == 130:
            raise TwitterPublicAPIOverCapacityError

        raise TwitterPublicAPIInvalidResponseError

    cursor = extract_cursor_from_payload(data)
    tweets = []

    if dump:
        return data

    for tweet, meta in payload_tweets_iter(data):
        result = normalize_tweet(
            tweet,
            extract_referenced_tweets=refs is not None,
            collection_source='scraping'
        )

        if refs is not None:
            for is_first, extracted_tweet in with_is_first(result):

                # Casting to int64 to save up memory
                id_int64 = int(extracted_tweet['id'])

                if id_int64 in refs:
                    continue

                if is_first:
                    tweets.append((extracted_tweet, meta))
                else:
                    tweets.append((extracted_tweet, None))

                refs.add(id_int64)
        else:
            tweets.append((result, meta))

    return cursor, tweets
def resolve(self, config):

    # Attempting to resolve env variable
    env_var = rc_key_to_env_var(self.key)
    env_value = os.environ.get(env_var, '').strip()

    if env_value:
        return self.type(env_value)

    return nested_get(self.key, config, self.default)
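# NOTE: an illustrative sketch of how the env-var override above might work.
# The mapping scheme and the MINET_ prefix are assumptions; the real
# rc_key_to_env_var helper may use a different convention.
import os

def rc_key_to_env_var_sketch(key):
    # Join the nested rc key parts into an uppercased env variable name
    return 'MINET_' + '_'.join(str(part).upper() for part in key)

# With e.g. MINET_TWITTER_API_KEY set in the environment, the environment
# value would take precedence over whatever nested_get finds in the config.
print(rc_key_to_env_var_sketch(['twitter', 'api_key']))  # MINET_TWITTER_API_KEY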
def payload_tweets_iter(payload):
    tweet_index = payload['globalObjects']['tweets']
    user_index = payload['globalObjects']['users']

    for instruction in payload['timeline']['instructions']:
        if 'addEntries' in instruction:
            entries = instruction['addEntries']['entries']
        elif 'replaceEntry' in instruction:
            entries = [instruction['replaceEntry']['entry']]
        else:
            continue

        for entry in entries:
            entry_id = entry['entryId']

            # Filtering tweets
            if (
                not entry_id.startswith('sq-I-t-') and
                not entry_id.startswith('tweet-')
            ):
                continue

            tweet_meta = nested_get(['content', 'item', 'content', 'tweet'], entry)

            if tweet_meta is None:
                tweet_meta = nested_get(
                    ['content', 'item', 'content', 'tombstone', 'tweet'],
                    entry
                )

            # Parsing error?
            if tweet_meta is None:
                raise TwitterPublicAPIParsingError

            # Skipping ads
            if 'promotedMetadata' in tweet_meta:
                continue

            tweet = process_single_tweet(tweet_meta['id'], tweet_index, user_index)

            if tweet is not None:
                yield tweet
def format_reply(item, video_id=None):
    snippet = item['snippet']

    row = YouTubeComment(
        video_id if video_id is not None else snippet['videoId'],
        item['id'],
        snippet['authorDisplayName'],
        nested_get(['authorChannelId', 'value'], snippet),
        snippet['textOriginal'],
        int(snippet['likeCount']),
        snippet['publishedAt'],
        snippet['updatedAt'],
        None,
        snippet['parentId']
    )

    return row
def search_hashtag(self, name):
    name = name.lstrip('#')
    cursor = None

    while True:
        url = forge_hashtag_search_url(name, cursor=cursor)
        print(url, cursor)

        data = self.request_json(url)
        data = nested_get(['data', 'hashtag', 'edge_hashtag_to_media'], data)

        edges = data.get('edges')

        for edge in edges:
            yield edge['node']['shortcode']

        print('Found %i posts' % len(edges))

        has_next_page = nested_get(['page_info', 'has_next_page'], data)

        if not has_next_page:
            break

        cursor = nested_get(['page_info', 'end_cursor'], data)
def collect_top_reactions(data):
    edges = nested_get(['top_reactions', 'edges'], data)

    if edges is None:
        return

    index = {}

    for edge in edges:
        emotion = REACTION_KEYS.get(edge['node']['key'])

        if emotion is None:
            print_err('Found unknown emotion %s' % edge)
            continue

        index[emotion] = edge['reaction_count'] or 0

    return index
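# NOTE: a hypothetical example of the payload shape collect_top_reactions
# expects, derived only from the keys it accesses above ('top_reactions',
# 'edges', node 'key', 'reaction_count'); the real GraphQL shape and the
# contents of REACTION_KEYS may differ.
example_reactions_data = {
    'top_reactions': {
        'edges': [
            {'node': {'key': 'LIKE'}, 'reaction_count': 12},
            {'node': {'key': 'LOVE'}, 'reaction_count': 3}
        ]
    }
}

# Assuming REACTION_KEYS maps those keys to emotion names, the function would
# return something like {'like': 12, 'love': 3}, skipping unknown keys.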
def request_json(self, url):
    err, response, data = request_json(url, pool=self.pool)

    if err:
        raise err

    if response.status == 403:
        sleep_time = seconds_to_midnight_pacific_time() + 10

        if callable(self.before_sleep):
            self.before_sleep(sleep_time)

        time.sleep(sleep_time)

        return self.request_json(url)

    if response.status >= 400:
        if data is not None and 'API key not valid' in nested_get(['error', 'message'], data, ''):
            raise YouTubeInvalidAPIKeyError

        raise YouTubeInvalidAPICall(url, response.status, data)

    return data
def apply_scraper(scraper, element, root=None, html=None, context=None):

    # Is this a tail call of item
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_aliases(scraper, ['sel', '$'])
    iterator = get_aliases(scraper, ['iterator', 'it', '$$'])

    # First we need to solve local selection
    if sel is not None:
        element = element.select_one(sel)
    elif 'sel_eval' in scraper:

        # TODO: validate
        element = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            html=html,
            root=root
        )

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        elements = element.select(iterator)
        single_value = False
    elif 'iterator_eval' in scraper:
        elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            html=html,
            root=root
        )
        single_value = False
    else:
        elements = [element]

    # Handling local context
    if 'context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['context'].items():
            local_context[k] = apply_scraper(
                field_scraper,
                element,
                root=root,
                html=html,
                context=context
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = apply_scraper(
                    field_scraper,
                    element,
                    root=root,
                    html=html,
                    context=context
                )

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = apply_scraper(
                scraper['item'],
                element,
                root=root,
                html=html,
                context=context
            )

        else:
            try:
                if 'attr' in scraper:
                    value = element.get(scraper['attr'])
                elif 'extract' in scraper:
                    value = extract(element, scraper['extract'])
                elif 'get' in scraper:
                    value = nested_get(scraper['get'], context)
                elif 'constant' in scraper:
                    value = scraper['constant']
                else:

                    # Default value is text
                    value = extract(element, 'text')

                # Format?
                if 'format' in scraper:
                    value = FORMATTER.format(
                        scraper['format'],
                        value=value,
                        context=context
                    )

                # Eval?
                if 'eval' in scraper:
                    value = eval_expression(
                        scraper['eval'],
                        element=element,
                        elements=elements,
                        value=value,
                        context=context,
                        html=html,
                        root=root
                    )
            except:
                value = None

        # Transform
        if 'transform' in scraper and value is not None:
            value = apply_transform_chain(scraper['transform'], value)

        # Default value?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(
                    scraper['filter_eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    html=html,
                    root=root
                )

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause, str) and not value.get(filtering_clause):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = value.get(uniq_clause)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    # NOTE: this opens a way for reducers
    if not single_value and 'join' in scraper:
        acc = scraper['join'].join(acc)

    return acc
def interpret_scraper(scraper, element, root=None, context=None, path=[], scope=None):
    if scope is None:
        scope = EvaluationScope()

    # Is this a tail call of item?
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_sel(scraper)
    iterator = get_iterator(scraper)

    # First we need to solve local selection
    if sel is not None:
        element = soupsieve.select_one(sel, element)
    elif 'sel_eval' in scraper:
        evaluated_sel = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['sel_eval'],
            expect=(Tag, str),
            allow_none=True,
            scope=scope
        )

        if isinstance(evaluated_sel, str):
            element = soupsieve.select_one(evaluated_sel, element)
        else:
            element = evaluated_sel

    if element is None:
        return None

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        single_value = False
        elements = soupsieve.select(iterator, element)
    elif 'iterator_eval' in scraper:
        single_value = False
        evaluated_elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['iterator_eval'],
            check=is_valid_iterator_eval_output,
            scope=scope
        )

        if isinstance(evaluated_elements, str):
            elements = soupsieve.select(evaluated_elements, element)
        else:
            elements = evaluated_elements
    else:
        elements = [element]

    # Handling local context
    if 'set_context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['set_context'].items():
            local_context[k] = interpret_scraper(
                field_scraper,
                element,
                root=root,
                context=context,
                path=path + ['set_context', k],
                scope=scope
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = interpret_scraper(
                    field_scraper,
                    element,
                    root=root,
                    context=context,
                    path=path + ['fields', k],
                    scope=scope
                )

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = interpret_scraper(
                scraper['item'],
                element,
                root=root,
                context=context,
                path=path + ['item'],
                scope=scope
            )

        else:
            if 'attr' in scraper:
                value = element.get(scraper['attr'])
            elif 'extract' in scraper:
                value = extract(element, scraper['extract'])
            elif 'get_context' in scraper:
                value = nested_get(scraper['get_context'], context)
            elif 'default' not in scraper:

                # Default value is text
                value = extract(element, 'text')

            # Eval?
            if 'eval' in scraper:
                value = eval_expression(
                    scraper['eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['eval'],
                    expect=DATA_TYPES,
                    allow_none=True,
                    scope=scope
                )

        # Default value after all?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(
                    scraper['filter_eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['filter_eval'],
                    expect=bool,
                    allow_none=True,
                    scope=scope
                )

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause, str) and not nested_get(filtering_clause, value):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = nested_get(uniq_clause, value)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    return acc
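# NOTE: a hypothetical example of the kind of declarative scraper definition
# interpret_scraper consumes, based only on the keys the interpreter reads
# above ('iterator', 'fields', 'attr', 'uniq'); the real minet scraper DSL
# may differ in its exact format and available options.
example_scraper = {
    'iterator': 'ul li a',          # select every link in the list
    'fields': {
        'title': 'text',            # tail call: named text extractor
        'url': {'attr': 'href'}     # attribute extraction on the element
    },
    'uniq': 'url'                   # deduplicate collected items on their 'url' field
}

# interpret_scraper(example_scraper, soup) would then yield a list of
# {'title': ..., 'url': ...} dicts, one per unique link.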
def resolve(self, config):
    return nested_get(self.key, config, self.default)
def test_nested_get(self):
    assert nested_get('a.d.e', NESTED_OBJECT) == 5
    assert nested_get('b.d.a.a', NESTED_OBJECT) is None
    assert nested_get(['a', 'b', 0, 'c'], NESTED_OBJECT) == 4
    assert nested_get(['a', 'b', 1, 'c', 2], NESTED_OBJECT) is None
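# NOTE: the NESTED_OBJECT fixture is not shown here. The following is a
# hypothetical value that would satisfy the assertions above, assuming
# nested_get returns None whenever a key or index along the path is missing.
NESTED_OBJECT_EXAMPLE = {
    'a': {
        'd': {'e': 5},       # 'a.d.e' -> 5
        'b': [{'c': 4}]      # ['a', 'b', 0, 'c'] -> 4; index 1 is missing -> None
    },
    'b': {}                  # 'b.d.a.a' dead-ends -> None
}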
def fetch_facebook_page_stats(url):
    err, response = request(http, url, cookie='locale=en_US')

    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    if response.status >= 400:
        return 'http-error', None

    html = response.data

    if CAPTCHA in html:
        die(['Rate limit reached!', 'Last url: %s' % url])

    if (
        CURRENT_AVAILABILITY_DISCLAIMER in html or
        AVAILABILITY_DISCLAIMER in html
    ):
        return 'unavailable', None

    if LOGIN_DISCLAIMER in html:
        return 'private-or-unavailable', None

    # TODO: integrate into ural
    bpost_id = url.rsplit('/', 1)[-1].encode()

    # Extracting metadata
    meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

    match = meta_extractor.search(html)

    if match is None:
        return 'extraction-failed', None

    data = json5.loads(match.group(1).decode())
    data = nested_get([
        'jsmods', 'pre_display_requires', 0, 3, 1, '__bbox',
        'result', 'data', 'feedback'
    ], data)

    if data is None:
        return 'extraction-failed', None

    # TODO: remove, this is here as a test
    # TODO: try to find a post where comments are disabled
    if get_count(data['seen_by_count']):
        print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

    if 'political_figure_data' in data and data['political_figure_data']:
        print_err('Found political_figure_data:')
        print_err(data['political_figure_data'])

    if get_count(data['reaction_count']) != get_count(data['reactors']):
        print_err('Found different reactions/reactors for %s' % url)

    # Extracting data from hidden html
    hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
    match = hidden_html_extractor.search(html)

    if match is not None:
        hidden_html = match.group(1).decode()
        soup = BeautifulSoup(hidden_html, 'lxml')

        # Sometimes fetching a post behaves weirdly
        if soup.select_one('h5 a') is None:
            return 'extraction-failed', None

        data['scraped'] = {}

        timestamp_elem = soup.select_one('[data-utime]')
        timestamp = int(timestamp_elem.get('data-utime'))

        data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
        data['scraped']['timestamp'] = timestamp
        data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

        # TODO: use a context manager
        try:
            data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
        except:
            pass

        try:
            data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
        except:
            pass

        # try:
        #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
        # except:
        #     pass

    return None, data