def search_action(namespace, output_file):
    # Handling output
    output_file = open_output_file(namespace.output)

    edit_namespace_with_csv_io(namespace, 'keyword')

    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit='videos',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    limit = namespace.limit

    for (row, keyword) in enricher.cells(namespace.column, with_rows=True):
        url = URL_template_accurate % {'subject': keyword, 'key': namespace.key}
        next_page = True

        while next_page:
            # A page token of True means we are on the first page
            if next_page is True:
                err, response, result = request_json(http, url)
            else:
                url_next = url + '&pageToken=' + next_page
                err, response, result = request_json(http, url_next)

            if err:
                die(err)
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                die(response.status)

            next_page, data_l = get_data(result)

            for data in data_l:
                if limit is not None:
                    # Stop as soon as the user-provided limit is reached
                    if limit == 0:
                        return True
                    else:
                        limit -= 1
                        enricher.writerow(row, data)
                else:
                    enricher.writerow(row, data)

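# Hypothetical CLI invocation for the action above. The flag names are inferred from the
# namespace attributes it reads (column, file, key, select, limit, output) and are not
# guaranteed to match the actual argument parser:
#
#   minet youtube search keyword queries.csv --key "$YOUTUBE_API_KEY" --limit 100 -o results.csv
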
def crowdtangle_post(http, post_id, token=None, format='csv_dict_row'):
    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.post: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % (post_id, token)

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    post = nested_get(['result', 'posts', 0], data)

    if post is None:
        return

    if format == 'csv_dict_row':
        return format_post(post, as_dict=True)
    elif format == 'csv_row':
        return format_post(post)

    return post

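# Usage sketch for crowdtangle_post (assumptions: an HTTP pool built with create_pool(), as
# in the CLI actions in this section, and a valid CrowdTangle token; the error classes are
# the ones raised in the function body):
#
#   http = create_pool()
#   try:
#       row = crowdtangle_post(http, '1234567_7654321', token=CT_TOKEN, format='csv_dict_row')
#   except CrowdTangleInvalidTokenError:
#       ...  # the token was rejected by the API
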
def videos_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        add=REPORT_HEADERS,
        keep=namespace.select
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    def rows_with_videos_id():
        # Accept either raw video ids or YouTube urls in the given column
        for row, ytb_data in enricher.cells(namespace.column, with_rows=True):
            video_id = None

            if is_youtube_video_id(ytb_data):
                video_id = ytb_data
            elif is_youtube_url(ytb_data):
                video_id = extract_video_id_from_youtube_url(ytb_data)

            yield row, video_id

    # The videos endpoint accepts at most 50 ids per call
    for chunk in chunks_iter(rows_with_videos_id(), 50):
        all_ids = [video_id for _, video_id in chunk if video_id]
        list_id = ','.join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for row, video_id in chunk:
            if video_id is None or video_id in not_available:
                enricher.writerow(row)
            else:
                enricher.writerow(row, data[video_id])

def crowdtangle_lists(http, token=None, format='csv_dict_row'):
    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.lists: unknown `format`.')

    # Fetching
    api_url = URL_TEMPLATE % token

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    lists = nested_get(['result', 'lists'], data)

    if format == 'csv_dict_row':
        return [format_list(l, as_dict=True) for l in lists]
    elif format == 'csv_row':
        return [format_list(l) for l in lists]

    return lists

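# Usage sketch for crowdtangle_lists (same assumptions as above: a create_pool() HTTP pool
# and a valid token). With format='csv_row' each list comes back as a flat row suitable for
# csv.writer:
#
#   rows = crowdtangle_lists(http, token=CT_TOKEN, format='csv_row')
#   for row in rows:
#       writer.writerow(row)
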
def crowdtangle_summary(http, link, token=None, start_date=None,
                        with_top_posts=False,
                        sort_by=CROWDTANGLE_SUMMARY_DEFAULT_SORT_TYPE,
                        format='csv_dict_row', platforms=None):
    if token is None:
        raise CrowdTangleMissingTokenError

    if format not in CROWDTANGLE_OUTPUT_FORMATS:
        raise TypeError('minet.crowdtangle.summary: unknown `format`.')

    if not isinstance(start_date, str):
        raise TypeError('minet.crowdtangle.summary: expecting a `start_date` kwarg.')

    if sort_by not in CROWDTANGLE_SUMMARY_SORT_TYPES:
        raise TypeError('minet.crowdtangle.summary: unknown `sort_by`.')

    # Fetching
    api_url = url_forge(link, token, start_date, sort_by, platforms, with_top_posts)

    err, response, data = request_json(http, api_url)

    if err is not None:
        raise err

    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    if response.status >= 400:
        raise CrowdTangleInvalidRequestError(api_url)

    stats = nested_get(['result', 'summary', 'facebook'], data)
    posts = nested_get(['result', 'posts'], data) if with_top_posts else None

    if stats is not None:
        if format == 'csv_dict_row':
            stats = format_summary(stats, as_dict=True)
        elif format == 'csv_row':
            stats = format_summary(stats)

    if not with_top_posts:
        return stats

    if posts is not None:
        if format == 'csv_dict_row':
            posts = [format_post(post, as_dict=True) for post in posts]
        elif format == 'csv_row':
            posts = [format_post(post) for post in posts]

    return stats, posts

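# Usage sketch for crowdtangle_summary (assumptions: a create_pool() HTTP pool and a valid
# token; start_date must be a string, as enforced above). Note that the return shape changes
# with with_top_posts:
#
#   stats = crowdtangle_summary(http, url, token=CT_TOKEN, start_date='2020-01-01')
#   stats, posts = crowdtangle_summary(http, url, token=CT_TOKEN, start_date='2020-01-01',
#                                      with_top_posts=True)
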
def videos_action(namespace, output_file):
    enricher = CSVEnricher(
        namespace.file,
        namespace.column,
        output_file,
        report_headers=REPORT_HEADERS,
        select=namespace.select.split(',') if namespace.select else None
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for chunk in gen_chunks(enricher):
        all_ids = [row[0] for row in chunk if row[0]]
        list_id = ','.join(all_ids)

        url = URL_TEMPLATE % {'list_id': list_id, 'key': namespace.key}
        err, response, result = request_json(http, url)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        data = get_data(result)

        id_available = set(data)
        not_available = set(all_ids).difference(id_available)

        loading_bar.update(len(chunk))

        for video_id, line in chunk:
            if video_id is None:
                enricher.write_empty(line)
            elif video_id in not_available:
                line_empty = [video_id] + [''] * (len(REPORT_HEADERS) - 1)
                enricher.write(line, line_empty)
            else:
                enricher.write(line, data[video_id])

def comments_action(namespace, output_file):
    output_file = open_output_file(namespace.output)

    writer = csv.writer(output_file)
    writer.writerow(CSV_HEADERS)

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()

    url = URL_TEMPLATE % {'id': namespace.id, 'key': namespace.key}
    next_page = True

    while next_page:
        if next_page is True:
            err, response, result = request_json(http, url)
        else:
            url_next = url + '&pageToken=' + next_page
            err, response, result = request_json(http, url_next)

        if err:
            die(err)
        elif response.status == 403:
            time.sleep(seconds_to_midnight_pacific_time())
            continue
        elif response.status >= 400:
            die(response.status)

        next_page, data = get_data(result)

        for comment in data:
            loading_bar.update()
            writer.writerow(comment)

def mediacloud_topic_stories(http, token, topic_id, link_id=None, media_id=None,
                             from_media_id=None, format='csv_dict_row'):
    while True:
        url = url_forge(
            token,
            topic_id=topic_id,
            link_id=link_id,
            media_id=media_id,
            from_media_id=from_media_id,
        )

        err, _, data = request_json(http, url)

        if err:
            raise err

        if 'stories' not in data or len(data['stories']) == 0:
            return

        next_link_id = get_next_link_id(data)

        for story in data['stories']:
            if format == 'csv_dict_row':
                yield format_topic_story(story, next_link_id, as_dict=True)
            elif format == 'csv_row':
                yield format_topic_story(story, next_link_id)
            else:
                yield story

        if next_link_id is None:
            return

        link_id = next_link_id

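# Usage sketch for mediacloud_topic_stories (assumptions: a create_pool() HTTP pool and a
# Media Cloud API token). The function is a generator that follows link_id pagination until
# the API stops returning stories:
#
#   for story in mediacloud_topic_stories(http, MC_TOKEN, topic_id=4536, format='csv_dict_row'):
#       writer.writerow(story.values())
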
def generator():
    last_processed_stories_id = None

    while True:
        url = url_forge(
            token,
            query,
            collections=collections,
            count=count,
            last_processed_stories_id=last_processed_stories_id
        )

        err, response, data = request_json(http, url)

        if err:
            raise err

        if response.status >= 500:
            raise MediacloudServerError(server_error=data.get('error'))

        # When only the count was requested, yield it and stop
        if count:
            yield data['count']
            return

        for story in data:
            if format == 'csv_dict_row':
                yield format_story(story, as_dict=True)
            elif format == 'csv_row':
                yield format_story(story)
            else:
                yield story

        last_processed_stories_id = get_last_processed_stories_id(data)

        if last_processed_stories_id is None:
            return

def make_requests(current_url, http=http):
    return (request_json(http, current_url), current_url)

def comments_action(namespace, output_file):
    # Handling output
    output_file = open_output_file(namespace.output)

    # Handling input
    if is_youtube_video_id(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_id')
    elif is_youtube_url(namespace.column):
        edit_namespace_with_csv_io(namespace, 'video_url')

    # Enricher
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=CSV_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' comments',
    )

    http = create_pool()
    error_file = DummyTqdmFile(sys.stderr)

    for (row, url_id) in enricher.cells(namespace.column, with_rows=True):
        if is_youtube_url(url_id):
            yt_id = extract_video_id_from_youtube_url(url_id)
            if not yt_id:
                continue
            url = URL_TEMPLATE % {'id': yt_id, 'key': namespace.key}
        elif is_youtube_video_id(url_id):
            url = URL_TEMPLATE % {'id': url_id, 'key': namespace.key}
        else:
            continue

        # Full comment threads: a queue lets us follow both pagination and reply urls
        url_queue = deque([url])

        while len(url_queue) != 0:
            current_url = url_queue.popleft()

            err, response, result = request_json(http, current_url)

            if err:
                error_file.write('{} for {}'.format(err, current_url))
                continue
            elif response.status == 403 and result.get('error').get('errors')[0].get('reason') == 'commentsDisabled':
                error_file.write('Comments are disabled for {}'.format(current_url))
                continue
            elif response.status == 403:
                error_file.write('Running out of API points. You will have to wait until midnight, Pacific time!')
                time.sleep(seconds_to_midnight_pacific_time())
                continue
            elif response.status >= 400:
                error_file.write('Error {} for {}'.format(response.status, current_url))
                continue

            kind = result.get('kind', None)

            # Handling comments pagination
            next_page = result.get('nextPageToken', None)

            if next_page:
                url_next = current_url + '&pageToken=' + next_page
                url_queue.append(url_next)

            if kind == 'youtube#commentThreadListResponse':
                items = result.get('items', None)

                for item in items:
                    snippet = item['snippet']
                    replies = item.get('replies')

                    if replies:
                        # Checking whether YouTube's API sent only a subset of the replies
                        if snippet['totalReplyCount'] != len(replies['comments']) and namespace.full:
                            # We want all replies but the API did not return them all, so we
                            # queue the url targeting this top-level comment's replies and
                            # only write the top-level comment for now
                            new_url = URL_PARENTID_TEMPLATE % {
                                'id': snippet['topLevelComment']['id'],
                                'key': namespace.key
                            }
                            url_queue.append(new_url)

                            data = get_data_full(snippet, True)
                            enricher.writerow(row, data)
                        else:
                            data_top = get_data_full(snippet, True)
                            enricher.writerow(row, data_top)

                            for rep in replies['comments']:
                                enricher.writerow(row, get_data_full(rep, False))
                    else:
                        # No 'replies' key: this is a top-level comment without replies
                        top_comment = get_data_full(snippet, True)
                        enricher.writerow(row, top_comment)
            else:
                # Handling a commentListResponse: plain comments, written one by one
                items = result.get('items', None)

                for item in items:
                    data = get_data_full(item, False)
                    enricher.writerow(row, data)

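# Hypothetical CLI invocation for the action above. The flag names are inferred from the
# namespace attributes it reads (column, file, key, select, full, output) and are not
# guaranteed to match the actual argument parser:
#
#   minet youtube comments video_url videos.csv --key "$YOUTUBE_API_KEY" --full -o comments.csv
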
def request_json(self, url, headers=None):
    return request_json(self.http, url, spoof_ua=True, headers=headers)