def worker(payload):
    http, item, url = payload

    if url is None:
        return FetchWorkerResult(
            url=None,
            item=item,
            response=None,
            error=None,
            meta=None
        )

    kwargs = request_args(url, item) if request_args is not None else {}

    error, response = request(http, url, **kwargs)

    if error:
        return FetchWorkerResult(
            url=url,
            item=item,
            response=response,
            error=error,
            meta=None
        )

    # Forcing urllib3 to read data in thread
    data = response.data

    # Meta
    meta = extract_response_meta(
        response,
        guess_encoding=guess_encoding,
        guess_extension=guess_extension
    )

    return FetchWorkerResult(
        url=url,
        item=item,
        response=response,
        error=error,
        meta=meta
    )
def request_page(target):
    error, result = request(http, target, cookie=cookie)

    if error is not None:
        raise error

    return result.data.decode('utf-8')
def step(http, url, item_key):
    err, result = request(http, url)

    # Debug
    if err:
        raise err

    # Bad auth
    if result.status == 401:
        raise CrowdTangleInvalidTokenError

    # Bad params
    if result.status >= 400:
        raise CrowdTangleInvalidRequestError

    try:
        data = json.loads(result.data)['result']
    except Exception:
        raise CrowdTangleInvalidJSONError

    if item_key not in data or len(data[item_key]) == 0:
        raise CrowdTangleExhaustedPagination

    # Extracting next link
    pagination = data['pagination']
    next_page = pagination['nextPage'] if 'nextPage' in pagination else None

    return data[item_key], next_page
def request_page(self, url):
    error, result = request(
        self.http,
        url,
        cookie=self.cookie,
        headers={'User-Agent': 'curl/7.68.0'}
    )

    if error is not None:
        raise error

    return result.data.decode('utf-8')
def make_request(http, url):
    err, response = request(http, forge_url(url), headers={'Accept-Language': 'en'})

    # Check the error first: `response` may be None when the request failed
    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    return None, response.data
def work(self, job):
    self.state.jobs_queued = self.queue.qsize()

    spider = self.spiders.get(job.spider)

    if spider is None:
        raise UnknownSpiderError('Unknown spider "%s"' % job.spider)

    err, response = request(self.http, job.url)

    if err:
        return CrawlWorkerResult(
            job=job,
            scraped=None,
            error=err,
            response=response,
            meta=None,
            content=None,
            next_jobs=None
        )

    meta = spider.extract_meta_from_response(job, response)

    # Decoding response content
    content = spider.process_content(job, response, meta)

    if isinstance(spider, FunctionSpider):
        next_jobs, scraped = spider.process(job, response, content, meta)
    else:
        # Scraping items
        scraped = spider.scrape(job, response, content, meta)

        # Finding next jobs
        next_jobs = spider.next_jobs(job, response, content, meta)

    # Enqueuing next jobs
    if next_jobs is not None:
        # Consuming so that multiple agents may act on this
        next_jobs = list(next_jobs)
        self.enqueue(next_jobs)

    self.state.jobs_done += 1

    return CrawlWorkerResult(
        job=job,
        scraped=scraped,
        error=None,
        response=response,
        meta=meta,
        content=content,
        next_jobs=next_jobs
    )
def step(http, url, item_key):
    err, result = request(http, url)

    # Debug
    if err:
        raise err

    # Bad auth
    if result.status == 401:
        raise CrowdTangleInvalidTokenError
    elif result.status == 429:
        raise CrowdTangleRateLimitExceeded

    # Bad params
    if result.status >= 400:
        data = json.loads(result.data.decode('utf-8'))
        raise CrowdTangleInvalidRequestError(
            data['message'],
            code=data['code'],
            status=result.status
        )

    try:
        data = json.loads(result.data)['result']
    except Exception:
        raise CrowdTangleInvalidJSONError

    items = None

    if item_key in data:
        items = data[item_key]

        if len(items) == 0:
            items = None

    # Extracting next link
    pagination = data['pagination']
    next_page = pagination['nextPage'] if 'nextPage' in pagination else None

    return items, next_page
def captions_action(namespace, output_file):
    enricher = casanova.enricher(
        namespace.file,
        output_file,
        keep=namespace.select,
        add=REPORT_HEADERS
    )

    loading_bar = tqdm(
        desc='Retrieving',
        dynamic_ncols=True,
        unit=' videos',
    )

    http = create_pool()

    for line, video_id in enricher.cells(namespace.column, with_rows=True):
        url_caption = ''
        url_inf = INFO_URL_TEMPLATE % {'id': video_id}
        err1, info_vid = request(http, url_inf)
        info_vid_dec = unquote(str(info_vid.data))
        captionsTracks = re.findall(get_info, info_vid_dec)

        if captionsTracks:
            dict_captions = json.loads(captionsTracks[0][0] + '}')['captionTracks']

            for i in range(len(dict_captions)):
                if namespace.lang and namespace.lang == dict_captions[i]['languageCode']:
                    url_caption = dict_captions[i]['baseUrl']
                    break

            if not url_caption and dict_captions:
                url_caption = dict_captions[0]['baseUrl']

        else:
            url_vid = VIDEO_CALL_TEMPLATE % {'id': video_id}
            time.sleep(0.01)
            err, result = request(http, url_vid)
            timedtext = re.findall(timed, str(result.data))

            for x in timedtext:
                proper_timed = x.replace("\\\\u0026", "&")

                if proper_timed[-2:] == namespace.lang:
                    url_caption = API_BASE_URL % {'temp': proper_timed}
                    break

            if not url_caption and timedtext and not namespace.lang:
                url_caption = API_BASE_URL % {'temp': timedtext[1].replace("\\\\u0026", "&")}

        if not url_caption:
            print_err('no subtitles for {}'.format(video_id))
            continue

        time.sleep(0.01)
        err, result_caption = request(http, url_caption)

        if err is not None:
            print_err(err)
        elif result_caption.status >= 400:
            print_err(f'error, status: {result_caption.status} for id: {video_id}')
            enricher.writerow(line)
        else:
            soup = BeautifulSoup(result_caption.data, 'lxml')
            caption_text = ' '.join(item.get_text() for item in soup.find_all('text'))
            enricher.writerow(line, [caption_text])

        loading_bar.update()
def fetch_facebook_page_stats(url):
    err, response = request(http, url, cookie='locale=en_US')

    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    if response.status >= 400:
        return 'http-error', None

    html = response.data

    if CAPTCHA in html:
        die([
            'Rate limit reached!',
            'Last url: %s' % url
        ])

    if (
        CURRENT_AVAILABILITY_DISCLAIMER in html or
        AVAILABILITY_DISCLAIMER in html
    ):
        return 'unavailable', None

    if LOGIN_DISCLAIMER in html:
        return 'private-or-unavailable', None

    # TODO: integrate into ural
    bpost_id = url.rsplit('/', 1)[-1].encode()

    # Extracting metadata
    meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

    match = meta_extractor.search(html)

    if match is None:
        return 'extraction-failed', None

    data = json5.loads(match.group(1).decode())

    data = nested_get(
        ['jsmods', 'pre_display_requires', 0, 3, 1, '__bbox', 'result', 'data', 'feedback'],
        data
    )

    if data is None:
        return 'extraction-failed', None

    # TODO: remove, this is here as a test
    # TODO: try to find a post where comments are disabled
    if get_count(data['seen_by_count']):
        print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

    if 'political_figure_data' in data and data['political_figure_data']:
        print_err('Found political_figure_data:')
        print_err(data['political_figure_data'])

    if get_count(data['reaction_count']) != get_count(data['reactors']):
        print_err('Found different reactions/reactors for %s' % url)

    # Extracting data from hidden html
    hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)
    match = hidden_html_extractor.search(html)

    if match is not None:
        hidden_html = match.group(1).decode()
        soup = BeautifulSoup(hidden_html, 'lxml')

        # Sometimes fetching a post behaves weirdly
        if soup.select_one('h5 a') is None:
            return 'extraction-failed', None

        data['scraped'] = {}

        timestamp_elem = soup.select_one('[data-utime]')
        timestamp = int(timestamp_elem.get('data-utime'))

        data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
        data['scraped']['timestamp'] = timestamp
        data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

        # TODO: use a context manager
        try:
            data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
        except Exception:
            pass

        try:
            data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
        except Exception:
            pass

        # try:
        #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
        # except Exception:
        #     pass

    return None, data
def test_bad_protocol(self):
    http = create_pool()
    err, _ = request(http, 'ttps://lemonde.fr')

    assert type(err) is InvalidURLError
def request(self, url):
    return request(self.http, url, spoof_ua=True)
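# The snippets above all follow the same contract. Below is a minimal sketch of that
# shared pattern, assuming only what they demonstrate: `request` returns an
# `(error, response)` pair, and `response` exposes `status` and raw `data` bytes when
# no error occurred. The `check_url` name and the `http` pool argument are illustrative
# only, not part of the original code.
def check_url(http, url):
    err, response = request(http, url)

    # Transport-level failures come back as the first member of the pair
    if err is not None:
        return 'http-error', None

    # HTTP-level failures are read from the response status
    if response.status >= 400:
        return 'http-error', None

    # The raw body is exposed as bytes on `response.data`
    return None, response.data.decode('utf-8')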