def crawl_action(namespace):

    # Loading crawler definition
    queue_path = join(namespace.output_dir, 'queue')
    definition = load_definition(namespace.crawler)

    if namespace.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(namespace.output_dir, exist_ok=True)

    jobs_output_path = join(namespace.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(
        jobs_output_path,
        JOBS_HEADERS,
        resume=namespace.resume
    )

    # Creating crawler
    crawler = Crawler(
        definition,
        throttle=namespace.throttle,
        queue_path=queue_path
    )

    reporter_pool = ScraperReporterPool(
        crawler,
        namespace.output_dir,
        resume=namespace.resume
    )

    # Loading bar
    loading_bar = tqdm(
        desc='Crawling',
        unit=' pages',
        dynamic_ncols=True
    )

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.set_postfix(
            queue=state.jobs_queued,
            spider=result.job.spider
        )

        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)

    loading_bar.close()
    jobs_output.close()
    reporter_pool.close()
def crawl_action(cli_args, defer):

    # Loading crawler definition
    queue_path = join(cli_args.output_dir, 'queue')

    if cli_args.resume:
        print_err('Resuming crawl...')
    else:
        rmtree(queue_path, ignore_errors=True)

    # Scaffolding output directory
    os.makedirs(cli_args.output_dir, exist_ok=True)

    jobs_output_path = join(cli_args.output_dir, 'jobs.csv')
    jobs_output, jobs_writer = open_report(
        jobs_output_path,
        JOBS_HEADERS,
        resume=cli_args.resume
    )
    defer(jobs_output.close)

    # Creating crawler
    crawler = Crawler(
        cli_args.crawler,
        throttle=cli_args.throttle,
        queue_path=queue_path
    )

    reporter_pool = ScraperReporterPool(
        crawler,
        cli_args.output_dir,
        resume=cli_args.resume
    )
    defer(reporter_pool.close)

    # Loading bar
    loading_bar = LoadingBar(desc='Crawling', unit='page')

    def update_loading_bar(result):
        state = crawler.state

        loading_bar.update_stats(
            queued=state.jobs_queued,
            doing=state.jobs_doing + 1,
            spider=result.job.spider
        )

        loading_bar.update()

    # Starting crawler
    crawler.start()

    # Running crawler
    for result in crawler:
        update_loading_bar(result)
        jobs_writer.writerow(format_job_for_csv(result))

        if result.error is not None:
            continue

        reporter_pool.write(result.job.spider, result.scraped)
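# Hedged sketch (not part of the original codebase): one way the `defer`
# callback handed to crawl_action above could be provided by the caller,
# using contextlib.ExitStack so that registered cleanups (jobs_output.close,
# reporter_pool.close) run when the action finishes or raises. The wiring
# below and the `cli_args` value are hypothetical.
from contextlib import ExitStack


def run_with_defer(action, cli_args):
    with ExitStack() as stack:
        # stack.callback registers a function to be called on exit,
        # which matches how crawl_action uses defer(...)
        action(cli_args, defer=stack.callback)


# run_with_defer(crawl_action, cli_args)  # hypothetical usage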
def call(self, route, args={}, tryouts=MAX_TRYOUTS):
    if route not in self.auth:
        self.auth[route] = "user"

    auth = self.auth[route]

    try:
        return self.api[auth].__getattr__("/".join(route.split('.')))(**args)
    except TwitterHTTPError as e:
        if e.e.code == 429:
            now = time()
            reset = int(e.e.headers["x-rate-limit-reset"])

            if route not in self.waits:
                self.waits[route] = {"user": now, "app": now}

            self.waits[route][auth] = reset
            print_err("REACHED API LIMITS on %s %s until %s for auth %s" % (route, args, reset, auth))

            minwait = sorted(
                [(a, w) for a, w in self.waits[route].items()],
                key=lambda x: x[1]
            )[0]

            if minwait[1] > now:
                sleeptime = 5 + max(0, int(minwait[1] - now))
                print_err(" will wait for %s for the next %ss (%s)" % (
                    minwait[0],
                    sleeptime,
                    datetime.fromtimestamp(now + sleeptime).isoformat()[11:19]
                ))
                sleep(sleeptime)

            self.auth[route] = minwait[0]

            return self.call(route, args, tryouts)
        elif tryouts:
            return self.call(route, args, tryouts - 1)
        else:
            print_err("ERROR after %s tryouts for %s %s %s" % (self.MAX_TRYOUTS, route, auth, args))
            print_err("%s: %s" % (type(e), e))
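# Hedged sketch (not part of the original class): a standalone illustration of
# the wait/rotation decision made in call() above when Twitter answers 429.
# The per-auth reset timestamps below are hypothetical; the real code reads
# them from the "x-rate-limit-reset" response header.
from time import time
from datetime import datetime

now = time()
waits = {'user': now + 300, 'app': now + 60}  # hypothetical reset times per auth

# Pick the auth whose rate-limit window resets first, as call() does.
next_auth, reset = min(waits.items(), key=lambda item: item[1])

if reset > now:
    sleeptime = 5 + max(0, int(reset - now))  # same formula as in call()
    print('switching to %s auth, waiting %ss (until %s)' % (
        next_auth,
        sleeptime,
        datetime.fromtimestamp(now + sleeptime).isoformat()[11:19]
    ))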
def collect_top_reactions(data):
    edges = getpath(data, ['top_reactions', 'edges'])

    if edges is None:
        return

    index = {}

    for edge in edges:
        emotion = FACEBOOK_REACTION_KEYS.get(edge['node']['key'])

        if emotion is None:
            print_err('Found unknown emotion %s' % edge)
            continue

        index[emotion] = edge['reaction_count'] or 0

    return index
def captions_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for line, video_id in enricher.cells(namespace.column, with_rows=True):
        url_caption = ''
        url_inf = INFO_URL_TEMPLATE % {'id': video_id}
        err1, info_vid = request(http, url_inf)
        info_vid_dec = unquote(str(info_vid.data))
        captionsTracks = re.findall(get_info, info_vid_dec)

        if captionsTracks:
            dict_captions = json.loads(captionsTracks[0][0] + '}')['captionTracks']

            for i in range(len(dict_captions)):
                if namespace.lang and namespace.lang == dict_captions[i]['languageCode']:
                    url_caption = dict_captions[i]['baseUrl']
                    break

            if not url_caption and dict_captions:
                url_caption = dict_captions[0]['baseUrl']

        else:
            url_vid = VIDEO_CALL_TEMPLATE % {'id': video_id}
            time.sleep(0.01)
            err, result = request(http, url_vid)
            timedtext = re.findall(timed, str(result.data))

            for x in timedtext:
                proper_timed = x.replace("\\\\u0026", "&")

                if proper_timed[-2:] == namespace.lang:
                    url_caption = API_BASE_URL % {'temp': proper_timed}
                    break

            if not url_caption and timedtext and not namespace.lang:
                url_caption = API_BASE_URL % {'temp': timedtext[1].replace("\\\\u0026", "&")}

        if not url_caption:
            print_err('no subtitles for {}'.format(video_id))
            continue

        time.sleep(0.01)
        err, result_caption = request(http, url_caption)

        if err is not None:
            print_err(err)
        elif result_caption.status >= 400:
            print_err(f'error, status : {result_caption.status} for id : {video_id}')
            enricher.writerow(line)
        else:
            soup = BeautifulSoup(result_caption.data, 'lxml')
            caption_text = " ".join(item.get_text() for item in soup.find_all('text'))
            enricher.writerow(line, [caption_text])

        loading_bar.update()
def fetch_facebook_page_stats(url):
    err, response = request(url, cookie='locale=en_US')

    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    if response.status >= 400:
        return 'http-error', None

    html = response.data

    if CAPTCHA in html:
        die([
            'Rate limit reached!',
            'Last url: %s' % url
        ])

    if (
        CURRENT_AVAILABILITY_DISCLAIMER in html or
        AVAILABILITY_DISCLAIMER in html
    ):
        return 'unavailable', None

    if LOGIN_DISCLAIMER in html:
        return 'private-or-unavailable', None

    # TODO: integrate into ural
    bpost_id = url.rsplit('/', 1)[-1].encode()

    # Extracting metadata
    meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

    match = meta_extractor.search(html)

    if match is None:
        return 'extraction-failed', None

    data = json5.loads(match.group(1).decode())
    data = getpath(data, [
        'jsmods', 'pre_display_requires', 0, 3, 1,
        '__bbox', 'result', 'data', 'feedback'
    ])

    if data is None:
        return 'extraction-failed', None

    # TODO: remove, this is here as a test
    # TODO: try to find a post where comments are disabled
    if get_count(data['seen_by_count']):
        print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

    if 'political_figure_data' in data and data['political_figure_data']:
        print_err('Found political_figure_data:')
        print_err(data['political_figure_data'])

    if get_count(data['reaction_count']) != get_count(data['reactors']):
        print_err('Found different reactions/reactors for %s' % url)

    # Extracting data from hidden html
    hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
    match = hidden_html_extractor.search(html)

    if match is not None:
        hidden_html = match.group(1).decode()
        soup = BeautifulSoup(hidden_html, 'lxml')

        # Sometimes fetching a post behaves weirdly
        if soup.select_one('h5 a') is None:
            return 'extraction-failed', None

        data['scraped'] = {}

        timestamp_elem = soup.select_one('[data-utime]')
        timestamp = int(timestamp_elem.get('data-utime'))

        data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
        data['scraped']['timestamp'] = timestamp
        data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

        # TODO: use a context manager
        try:
            data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
        except:
            pass

        try:
            data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
        except:
            pass

        # try:
        #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
        # except:
        #     pass

    return None, data
def action(namespace, output_file):

    # Do we need to resume?
    need_to_resume = False

    if getattr(namespace, "resume", False):
        need_to_resume = True

        if namespace.output is None:
            die(
                "Cannot --resume without knowing the output (use -o/--output rather than stdout).",
            )

        if namespace.sort_by != "date":
            die("Cannot --resume if --sort_by is not `date`.")

        if namespace.format != "csv":
            die("Cannot --resume with jsonl format yet.")

        with open(namespace.output, "r") as f:
            resume_reader = casanova.reader(f)

            last_cell = None
            resume_loader = tqdm(desc="Resuming", unit=" lines")

            for cell in resume_reader.cells("datetime"):
                resume_loader.update()
                last_cell = cell

            resume_loader.close()

            if last_cell is not None:
                last_date = last_cell.replace(" ", "T")
                namespace.end_date = last_date

                print_err("Resuming from: %s" % last_date)

    # Loading bar
    loading_bar = tqdm(
        desc="Fetching %s" % item_name,
        dynamic_ncols=True,
        unit=" %s" % item_name,
        total=namespace.limit,
    )

    if namespace.format == "csv":
        writer = csv.writer(output_file)

        if not need_to_resume:
            writer.writerow(csv_headers(namespace) if callable(csv_headers) else csv_headers)
    else:
        writer = ndjson.writer(output_file)

    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(namespace)

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        partition_strategy=getattr(namespace, "partition_strategy", None),
        limit=namespace.limit,
        format="csv_row" if namespace.format == "csv" else "raw",
        per_call=True,
        detailed=True,
        namespace=namespace
    )

    try:
        for details, items in iterator:
            if details is not None:
                loading_bar.set_postfix(**details)

            for item in items:
                writer.writerow(item)

            loading_bar.update(len(items))

    except CrowdTangleInvalidTokenError:
        loading_bar.close()
        die([
            "Your API token is invalid.",
            "Check that you provided a valid one using the `--token` argument.",
        ])

    loading_bar.close()
def action(namespace, output_file):

    # Do we need to resume?
    need_to_resume = False

    if getattr(namespace, 'resume', False):
        need_to_resume = True

        if namespace.output is None:
            die(
                'Cannot --resume without knowing the output (use -o/--output rather than stdout).',
            )

        if namespace.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if namespace.format != 'csv':
            die('Cannot --resume with jsonl format yet.')

        with open(namespace.output, 'r', encoding='utf-8') as f:
            resume_reader = casanova.reader(f)

            last_cell = None
            resume_loader = tqdm(desc='Resuming', unit=' lines')

            for cell in resume_reader.cells('datetime'):
                resume_loader.update()
                last_cell = cell

            resume_loader.close()

            if last_cell is not None:
                last_date = last_cell.replace(' ', 'T')
                namespace.end_date = last_date

                print_err('Resuming from: %s' % last_date)

    if callable(announce):
        print_err(announce(namespace))

    # Loading bar
    loading_bar = tqdm(
        desc='Fetching %s' % item_name,
        dynamic_ncols=True,
        unit=' %s' % item_name,
        total=namespace.limit
    )

    if namespace.format == 'csv':
        writer = csv.writer(output_file)

        if not need_to_resume:
            writer.writerow(csv_headers(namespace) if callable(csv_headers) else csv_headers)
    else:
        writer = ndjson.writer(output_file)

    client = CrowdTangleAPIClient(namespace.token, rate_limit=namespace.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(namespace)

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, CrowdTangleRateLimitExceeded):
            reason = 'Call failed because of rate limit!'
        elif isinstance(exc, CrowdTangleInvalidJSONError):
            reason = 'Call failed because of invalid JSON payload!'
        else:
            reason = 'Call failed because of server timeout!'

        tqdm.write(
            '%s\nWill wait for %s before attempting again.' % (
                reason,
                prettyprint_seconds(retry_state.idle_for, granularity=2)
            ),
            file=sys.stderr
        )

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        partition_strategy=getattr(namespace, 'partition_strategy', None),
        limit=namespace.limit,
        format='csv_row' if namespace.format == 'csv' else 'raw',
        per_call=True,
        detailed=True,
        namespace=namespace,
        before_sleep=before_sleep
    )

    try:
        for details, items in iterator:
            if details is not None:
                loading_bar.set_postfix(**details)

            for item in items:
                writer.writerow(item)

            loading_bar.update(len(items))

    except CrowdTangleInvalidTokenError:
        loading_bar.close()
        die([
            'Your API token is invalid.',
            'Check that you provided a valid one using the `--token` argument.'
        ])

    loading_bar.close()
def action(namespace, output_file):

    # Do we need to resume?
    need_to_resume = False

    if getattr(namespace, 'resume', False):
        need_to_resume = True

        if namespace.output is None:
            die(
                'Cannot --resume without knowing the output (use -o/--output rather than stdout).',
            )

        if namespace.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if namespace.format != 'csv':
            die('Cannot --resume with jsonl format yet.')

        with open(namespace.output, 'r') as f:
            resume_reader = casanova.reader(f)

            last_cell = None
            resume_loader = tqdm(desc='Resuming', unit=' lines')

            for cell in resume_reader.cells('datetime'):
                resume_loader.update()
                last_cell = cell

            resume_loader.close()

            if last_cell is not None:
                last_date = last_cell.replace(' ', 'T')
                namespace.end_date = last_date

                print_err('Resuming from: %s' % last_date)

    # Loading bar
    loading_bar = tqdm(
        desc='Fetching %s' % item_name,
        dynamic_ncols=True,
        unit=' %s' % item_name,
        total=namespace.limit
    )

    if namespace.format == 'csv':
        writer = csv.writer(output_file)

        if not need_to_resume:
            writer.writerow(csv_headers(namespace) if callable(csv_headers) else csv_headers)
    else:
        writer = ndjson.writer(output_file)

    client = CrowdTangleClient(namespace.token, rate_limit=namespace.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(namespace)

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        partition_strategy=getattr(namespace, 'partition_strategy', None),
        limit=namespace.limit,
        format='csv_row' if namespace.format == 'csv' else 'raw',
        per_call=True,
        detailed=True,
        namespace=namespace
    )

    try:
        for details, items in iterator:
            if details is not None:
                loading_bar.set_postfix(**details)

            for item in items:
                writer.writerow(item)

            loading_bar.update(len(items))

    except CrowdTangleInvalidTokenError:
        loading_bar.close()
        die([
            'Your API token is invalid.',
            'Check that you provided a valid one using the `--token` argument.'
        ])

    loading_bar.close()
def action(cli_args):
    resume = getattr(cli_args, 'resume', False)

    # Validation
    if resume:
        if cli_args.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if cli_args.format != 'csv':
            die('Cannot --resume with jsonl format yet.')

    if cli_args.format == 'csv':
        fieldnames = csv_headers(cli_args) if callable(csv_headers) else csv_headers
        writer = casanova.writer(cli_args.output, fieldnames)
    else:
        writer = ndjson.writer(cli_args.output)

    # Acquiring state from resumer
    if getattr(cli_args, 'resume', False):
        last_date = cli_args.output.pop_state()

        if last_date is not None:
            cli_args.end_date = last_date.replace(' ', 'T')
            print_err('Resuming from: %s' % cli_args.end_date)

    if callable(announce):
        print_err(announce(cli_args))

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching %s' % item_name,
        unit=item_name[:-1],
        total=cli_args.limit
    )

    args = []

    if callable(get_args):
        args = get_args(cli_args)

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        limit=cli_args.limit,
        raw=cli_args.format != 'csv',
        per_call=True,
        detailed=True,
        namespace=cli_args
    )

    try:
        for details, items in iterator:
            loading_bar.update(len(items))

            if details is not None:
                loading_bar.update_stats(**details)

            for item in items:
                if cli_args.format == 'csv':
                    item = item.as_csv_row()

                writer.writerow(item)

    except CrowdTangleInvalidTokenError:
        loading_bar.die([
            'Your API token is invalid.',
            'Check that you provided a valid one using the `--token` argument.'
        ])
def action(cli_args):
    resume = getattr(cli_args, 'resume', False)

    # Validation
    if resume:
        if cli_args.sort_by != 'date':
            die('Cannot --resume if --sort_by is not `date`.')

        if cli_args.format != 'csv':
            die('Cannot --resume with jsonl format yet.')

    if cli_args.format == 'csv':
        fieldnames = csv_headers(cli_args) if callable(csv_headers) else csv_headers
        writer = casanova.writer(cli_args.output, fieldnames)
    else:
        writer = ndjson.writer(cli_args.output)

    # Acquiring state from resumer
    if getattr(cli_args, 'resume', False):
        last_date = cli_args.output.pop_state()

        if last_date is not None:
            cli_args.end_date = last_date.replace(' ', 'T')
            print_err('Resuming from: %s' % cli_args.end_date)

    if callable(announce):
        print_err(announce(cli_args))

    # Loading bar
    loading_bar = LoadingBar(
        desc='Fetching %s' % item_name,
        unit=item_name[:-1],
        total=cli_args.limit
    )

    client = CrowdTangleAPIClient(cli_args.token, rate_limit=cli_args.rate_limit)

    args = []

    if callable(get_args):
        args = get_args(cli_args)

    def before_sleep(retry_state):
        exc = retry_state.outcome.exception()

        if isinstance(exc, CrowdTangleRateLimitExceeded):
            reason = 'Call failed because of rate limit!'
        elif isinstance(exc, CrowdTangleInvalidJSONError):
            reason = 'Call failed because of invalid JSON payload!'
        else:
            reason = 'Call failed because of server timeout!'

        loading_bar.print(
            '%s\nWill wait for %s before attempting again.' % (
                reason,
                prettyprint_seconds(retry_state.idle_for, granularity=2)
            )
        )

    create_iterator = getattr(client, method_name)
    iterator = create_iterator(
        *args,
        limit=cli_args.limit,
        raw=cli_args.format != 'csv',
        per_call=True,
        detailed=True,
        namespace=cli_args,
        before_sleep=before_sleep
    )

    try:
        for details, items in iterator:
            loading_bar.update(len(items))

            if details is not None:
                loading_bar.update_stats(**details)

            for item in items:
                if cli_args.format == 'csv':
                    item = item.as_csv_row()

                writer.writerow(item)

    except CrowdTangleInvalidTokenError:
        loading_bar.die([
            'Your API token is invalid.',
            'Check that you provided a valid one using the `--token` argument.'
        ])
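# Hedged sketch (not from the original module): the before_sleep callback
# defined above follows tenacity's hook signature (it receives a RetryCallState
# exposing .outcome and .idle_for). This shows how such a hook plugs into a
# plain tenacity retry loop around a deliberately flaky call; the flaky_call
# helper and its failure pattern are hypothetical.
from tenacity import Retrying, stop_after_attempt, wait_fixed

attempts = {'n': 0}


def flaky_call():
    # Fails twice, then succeeds, so before_sleep fires between attempts.
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise RuntimeError('transient failure')
    return 'ok'


def log_before_sleep(retry_state):
    print('attempt failed (%r), retrying in %ss' % (
        retry_state.outcome.exception(),
        retry_state.idle_for
    ))


for attempt in Retrying(stop=stop_after_attempt(5),
                        wait=wait_fixed(1),
                        before_sleep=log_before_sleep,
                        reraise=True):
    with attempt:
        result = flaky_call()

print(result)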