def get_video_captions(video_target, langs):
    if not isinstance(langs, list):
        raise TypeError

    video_id = ensure_video_id(video_target)

    if video_id is None:
        raise YouTubeInvalidVideoId

    tracks = get_caption_tracks(video_id)

    if not tracks:
        return

    best_track = select_caption_track(tracks, langs=langs)

    if best_track is None:
        return

    err, response = request(best_track.url, pool=YOUTUBE_SCRAPER_POOL)

    if err:
        raise err

    soup = BeautifulSoup(response.data.decode('utf-8'), 'lxml')

    captions = []

    for item in soup.select('text'):
        captions.append((item.get('start'), item.get('dur'), unescape(item.get_text().strip())))

    return best_track, captions
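# A minimal usage sketch for get_video_captions defined above. Assumptions:
# the supporting helpers (ensure_video_id, get_caption_tracks, etc.) are
# available in the same module and the video URL below is purely illustrative.
result = get_video_captions('https://www.youtube.com/watch?v=9bZkp7q19f0', langs=['en'])

if result is not None:
    best_track, captions = result

    for start, duration, text in captions:
        print(start, duration, text)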
def get_caption_tracks(video_id):

    # First we try to retrieve it from video info
    url = 'https://www.youtube.com/get_video_info?video_id=%s' % video_id

    err, response = request(url)

    if err:
        raise err

    data = unquote(response.data.decode('utf-8'))

    m = CAPTION_TRACKS_RE.search(data)

    if m is not None:
        data = json.loads(m.group(0) + '}')['captionTracks']

        return [
            YouTubeCaptionTrack(item['languageCode'], item['baseUrl'], item.get('kind') == 'asr')
            for item in data
        ]

    # Then we try to scrape it directly from the video page
    # url = 'https://www.youtube.com/watch?v=%s' % video_id
    # err, response = request(url)
    # if err:
    #     raise err
    # timedtexts = TIMEDTEXT_RE.findall(response.data)

    return []
def make_request(url):
    err, response = request(forge_url(url), headers={'Accept-Language': 'en'})

    # Checking the error first, since `response` may be None when `err` is set
    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    return err, response.data
def request_page(self, url):
    error, result = request(
        url,
        pool=self.pool,
        cookie=self.cookie,
        headers={'User-Agent': 'curl/7.68.0'}
    )

    if error is not None:
        raise error

    return result.data.decode('utf-8')
def work(self, job):
    self.state.inc_working()

    spider = self.spiders.get(job.spider)

    if spider is None:
        raise UnknownSpiderError('Unknown spider "%s"' % job.spider)

    err, response = request(job.url, pool=self.pool)

    if err:

        # Decrementing the working counter before returning early on error
        self.state.dec_working()

        return CrawlWorkerResult(
            job=job,
            scraped=None,
            error=err,
            response=response,
            meta=None,
            content=None,
            next_jobs=None
        )

    meta = spider.extract_meta_from_response(job, response)

    # Decoding response content
    content = spider.process_content(job, response, meta)

    if isinstance(spider, FunctionSpider):
        scraped, next_jobs = spider.process(job, response, content, meta)
    else:

        # Scraping items
        scraped = spider.scrape(job, response, content, meta)

        # Finding next jobs
        next_jobs = spider.next_jobs(job, response, content, meta)

    # Enqueuing next jobs
    if next_jobs is not None:

        # Consuming so that multiple agents may act on this
        next_jobs = list(next_jobs)
        self.enqueue(next_jobs)

    self.state.dec_working()

    return CrawlWorkerResult(
        job=job,
        scraped=scraped,
        error=None,
        response=response,
        meta=meta,
        content=content,
        next_jobs=next_jobs
    )
def worker(payload):
    item, url = payload

    if url is None:
        return FetchWorkerResult(
            url=None,
            item=item,
            response=None,
            error=None,
            meta=None
        )

    kwargs = request_args(url, item) if request_args is not None else {}

    error, response = request(
        url,
        pool=pool,
        max_redirects=max_redirects,
        **kwargs
    )

    if error:
        return FetchWorkerResult(
            url=url,
            item=item,
            response=response,
            error=error,
            meta=None
        )

    # Forcing urllib3 to read data in thread
    # TODO: this is probably useless and should be replaced by preload_content at the right place
    data = response.data

    # Meta
    meta = extract_response_meta(
        response,
        guess_encoding=guess_encoding,
        guess_extension=guess_extension
    )

    return FetchWorkerResult(
        url=url,
        item=item,
        response=response,
        error=error,
        meta=meta
    )
def step(pool, url, item_key):
    err, result = request(url, pool=pool)

    # Debug
    if err:
        raise err

    # Bad auth
    if result.status == 401:
        raise CrowdTangleInvalidTokenError

    elif result.status == 429:
        raise CrowdTangleRateLimitExceeded

    # Bad params
    if result.status >= 400:
        data = result.data.decode('utf-8')

        try:
            data = json.loads(data)
        except json.decoder.JSONDecodeError:
            raise CrowdTangleInvalidRequestError(data)

        raise CrowdTangleInvalidRequestError(data['message'], code=data['code'], status=result.status)

    try:
        data = json.loads(result.data)['result']
    except (json.decoder.JSONDecodeError, TypeError, KeyError):
        raise CrowdTangleInvalidJSONError

    items = None

    if item_key in data:
        items = data[item_key]

        if len(items) == 0:
            items = None

    # Extracting next link
    pagination = data['pagination']
    next_page = pagination['nextPage'] if 'nextPage' in pagination else None

    return items, next_page
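# A hedged pagination sketch for step above, kept as a comment because it
# needs a CrowdTangle API token and a pool compatible with the request helper,
# neither of which is shown here. The 'posts' item key, the endpoint URL and
# the <TOKEN> placeholder are assumptions, not confirmed by this snippet.
#
# url = 'https://api.crowdtangle.com/posts?token=<TOKEN>'
#
# while url is not None:
#     items, url = step(pool, url, 'posts')
#
#     if items is None:
#         break
#
#     for item in items:
#         print(item)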
def __request(self, url):
    err, response = request(url, pool=self.pool)

    # Debug
    if err:
        raise err

    # Bad auth
    if response.status == 401:
        raise CrowdTangleInvalidTokenError

    # Rate limited
    if response.status == 429:
        raise CrowdTangleRateLimitExceeded

    # Server error
    if response.status >= 500:
        raise CrowdTangleServerError(url=url, status=response.status)

    # Bad params
    if response.status >= 400:
        data = response.data.decode('utf-8')

        try:
            data = json.loads(data)
        except json.decoder.JSONDecodeError:
            raise CrowdTangleInvalidRequestError(data, url=url, status=response.status)

        raise CrowdTangleInvalidRequestError(data['message'], url=url, code=data.get('code'), status=response.status)

    try:
        data = json.loads(response.data)['result']
    except (json.decoder.JSONDecodeError, TypeError, KeyError):
        raise CrowdTangleInvalidJSONError

    return data
def __call__(self, payload):
    item, domain, url = payload

    result = FetchResult(*payload)

    if url is None:
        return result

    # NOTE: request_args must be threadsafe
    kwargs = {}

    if self.request_args is not None:
        kwargs = self.request_args(domain, url, item)

    error, response = request(
        url,
        pool=self.pool,
        max_redirects=self.max_redirects,
        **kwargs
    )

    if error:
        result.error = error
    else:

        # Forcing urllib3 to read data in thread
        # TODO: this is probably useless and should be replaced by preload_content at the right place
        data = response.data

        # Meta
        meta = extract_response_meta(response)

        result.response = response
        result.meta = meta

    if self.callback is not None:
        self.callback(result)

    return result
def test_bad_protocol(self):
    err, _ = request('ttps://lemonde.fr')

    assert type(err) is InvalidURLError
def fetch_facebook_page_stats(url):
    err, response = request(url, cookie='locale=en_US')

    if err:
        return 'http-error', None

    if response.status == 404:
        return 'not-found', None

    if response.status >= 400:
        return 'http-error', None

    html = response.data

    if CAPTCHA in html:
        die([
            'Rate limit reached!',
            'Last url: %s' % url
        ])

    if (
        CURRENT_AVAILABILITY_DISCLAIMER in html or
        AVAILABILITY_DISCLAIMER in html
    ):
        return 'unavailable', None

    if LOGIN_DISCLAIMER in html:
        return 'private-or-unavailable', None

    # TODO: integrate into ural
    bpost_id = url.rsplit('/', 1)[-1].encode()

    # Extracting metadata
    meta_extractor = re.compile(META_EXTRACTOR_TEMPLATE % bpost_id)

    match = meta_extractor.search(html)

    if match is None:
        return 'extraction-failed', None

    data = json5.loads(match.group(1).decode())

    data = getpath(data, [
        'jsmods',
        'pre_display_requires',
        0,
        3,
        1,
        '__bbox',
        'result',
        'data',
        'feedback'
    ])

    if data is None:
        return 'extraction-failed', None

    # TODO: remove, this is here as a test
    # TODO: try to find a post where comments are disabled
    if get_count(data['seen_by_count']):
        print_err('Found seen_by_count: %i for %s' % (get_count(data['seen_by_count']), url))

    if 'political_figure_data' in data and data['political_figure_data']:
        print_err('Found political_figure_data:')
        print_err(data['political_figure_data'])

    if get_count(data['reaction_count']) != get_count(data['reactors']):
        print_err('Found different reactions/reactors for %s' % url)

    # Extracting data from hidden html
    hidden_html_extractor = re.compile(HTML_EXTRACTOR_TEMPLATE % bpost_id)

    match = hidden_html_extractor.search(html)

    if match is not None:
        hidden_html = match.group(1).decode()

        soup = BeautifulSoup(hidden_html, 'lxml')

        # Sometimes fetching a post behaves weirdly
        if soup.select_one('h5 a') is None:
            return 'extraction-failed', None

        data['scraped'] = {}

        timestamp_elem = soup.select_one('[data-utime]')
        timestamp = int(timestamp_elem.get('data-utime'))

        data['scraped']['account_name'] = soup.select_one('h5 a').get_text().strip()
        data['scraped']['timestamp'] = timestamp
        data['scraped']['time'] = datetime.fromtimestamp(timestamp).isoformat()

        # TODO: use a context manager
        try:
            data['scraped']['aria_label'] = timestamp_elem.parent.get('aria-label')
        except:
            pass

        try:
            data['scraped']['text'] = soup.select_one('[data-testid="post_message"]').get_text()
        except:
            pass

        # try:
        #     data['scraped']['link'] = soup.select_one('[data-lynx-uri]').get('href')
        # except:
        #     pass

    return None, data
def request(self, url):
    return request(url, pool=self.pool, spoof_ua=True)
def export_google_sheets_as_csv(url, cookie=None, authuser=None, max_authuser_attempts=4):
    if is_url(url):
        parsed = parse_google_drive_url(url)

        if parsed is None or parsed.type != 'spreadsheets':
            raise GoogleSheetsInvalidTargetError
    else:
        parsed = GoogleDriveFile('spreadsheets', url)

    base_export_url = parsed.get_export_url()
    export_url = base_export_url

    if authuser is not None:
        if not isinstance(authuser, int) or authuser < 0:
            raise TypeError('authuser should be an int >= 0')

        export_url = append_authuser(export_url, authuser)
        max_authuser_attempts = 1
    else:
        authuser = 0

    if cookie is not None and cookie in COOKIE_BROWSERS:
        jar = getattr(browser_cookie3, cookie)()
        resolver = CookieResolver(jar)

        cookie = resolver(export_url)

        if cookie is None:
            raise GoogleSheetsMissingCookieError

    attempts = max_authuser_attempts

    while True:
        attempts -= 1

        err, response = request(export_url, cookie=cookie)

        if err:
            raise err

        if response.status == 404:
            raise GoogleSheetsNotFoundError

        if response.status == 401:
            raise GoogleSheetsUnauthorizedError

        if response.status == 403:
            authuser += 1

            if attempts != 0:
                export_url = append_authuser(base_export_url, authuser)
                continue

            raise GoogleSheetsMaxAttemptsExceeded

        if 'csv' not in response.headers.get('Content-Type', '').lower():
            raise GoogleSheetsInvalidContentTypeError

        break

    return response.data.decode('utf-8')
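# A hedged usage sketch for export_google_sheets_as_csv defined above: parse
# the returned CSV text with the standard library. Assumptions: the sheet URL
# is a placeholder and the document is publicly shared so no cookie is needed.
import csv
import io

sheet_url = 'https://docs.google.com/spreadsheets/d/<SPREADSHEET_ID>/edit'

csv_text = export_google_sheets_as_csv(sheet_url)

for row in csv.DictReader(io.StringIO(csv_text)):
    print(row)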
from minet.web import request, extract_response_meta, looks_like_html

err, response = request('https://news.ycombinator.com/')

del response.headers['Content-Type']

print(response.status)

meta = extract_response_meta(response)

# print(response.data)
print(meta)