from requests_html import HTMLSession


def get_session(**kwargs) -> HTMLSession:
    """Session factory: optionally attaches a proxy and custom headers."""
    session = HTMLSession()
    # Use .get() so a missing keyword does not raise KeyError.
    if kwargs.get('use_proxy'):
        session.proxies = {'http': 'rproxy:5566', 'https': 'rproxy:5566'}
    if not kwargs.get('default_header', True):
        # get_headers() is assumed to be defined elsewhere in the module.
        session.headers = get_headers()
    return session
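# A minimal usage sketch for the factory above (hypothetical call site;
# get_headers() only needs to exist if default_header=False is passed):
session = get_session(use_proxy=True, default_header=True)
print(session.proxies)  # {'http': 'rproxy:5566', 'https': 'rproxy:5566'}
session.close()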
import datetime

import pandas as pd
from requests_html import HTMLSession


def get_msis_data(municipalities, date=None):
    # Avoid a date default that is evaluated once at import time:
    # resolve "today" on every call instead.
    if date is None:
        date = datetime.date.today()
    # NOTE: the kommune ids and month are currently hard-coded in the URL;
    # the municipalities/date arguments are not yet applied to it.
    url = ('https://statistikk.fhi.no/msis/sykdomshendelser?etter=diagnose'
           '&fordeltPaa=geografi&diagnose=713&diagramtype=tabell&maaned=3'
           '&kommune=3416,3401')
    # Header for Chrome 83 on Windows
    header = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "******",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.no/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }
    session = HTMLSession()
    session.headers = header
    r = session.get(url)
    r.html.render()  # execute the page's JavaScript so the table is in the DOM
    tables = r.html.find('table')
    dfs = pd.read_html(tables[0].html)
    return dfs[0]
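# A hedged usage sketch for get_msis_data, assuming requests_html's Chromium
# backend is available (the first render() call downloads it). The
# municipalities argument is illustrative only, since the kommune ids are
# still hard-coded in the URL above.
df = get_msis_data(municipalities=['3416', '3401'])
print(df.head())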
import cfscrape
from requests_html import HTMLSession


def create_session(url):
    session = HTMLSession()
    session.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0'
    }
    # Wrap the session so cfscrape can solve Cloudflare's anti-bot challenge.
    scraper_sess = cfscrape.create_scraper(sess=session)
    r = scraper_sess.get(url)
    session.close()
    return r
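# Hypothetical call site for create_session; example.com stands in for a real
# Cloudflare-fronted target.
resp = create_session('https://example.com')
print(resp.status_code, len(resp.text))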
from requests_html import HTMLSession


def _get_lyrics_and_title(self, url):
    session = HTMLSession()
    session.headers = self.headers
    response = session.get(url)
    # Title and lyrics are located via page-specific CSS classes;
    # first=True returns a single element instead of a list.
    title = response.html.find(
        ".header_with_cover_art-primary_info-title", first=True).text
    lyrics = response.html.find(".lyrics", first=True).text
    return title, lyrics
from requests_html import HTMLSession

sess = HTMLSession()
# Headers must be a mapping, not a tuple of strings.
sess.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0'
}
response = sess.get('https://leetcode.com/problems/decode-xored-array/')
print(response.headers)
print(response.html.links)
response.html.render(timeout=10, sleep=10)  # run the page's JavaScript
body = response.content
print(body)
import os

from fake_useragent import UserAgent
from requests_html import HTMLSession

ua = UserAgent()
url = os.environ['MORPH_HOST']
selector = os.environ['MORPH_SELECTOR']

session = HTMLSession()
# Headers must be a mapping; ua.random is just a User-Agent string.
session.headers = {'User-Agent': ua.random}
r = session.get(url)  # the request was missing before render() was called
r.html.render()
# requests_html uses .find() with CSS selectors; it has no .select().
vip_companies = r.html.find(selector)
for row in vip_companies:
    company = row.find('a span', first=True).text
    print(company)
    for pos in row.find('ul li'):
        print('has position ', pos.text)
    print('\n')
def auth_html(self, order_id: str):
    count: int = 0
    session = HTMLSession()
    session.proxies = self.proxy_worker.get_proxy_dict()
    session.headers = self.headers_work.get_headers()
    cookies = self.cookies_work.get_cookies()
    while count < self.number_attempts:
        try:
            response = session.get(settings.LOGIN_PAGE, cookies=cookies)
            response.html.render()  # execute JavaScript before reading the DOM
            data = response.html.html
        except requests.exceptions.ConnectionError as error:
            self._send_task_report("target_connect_error", data={
                "message": repr(error),
                "code": '',
                "order": order_id
            })
            return {
                "status": False,
                "error": True,
                "status_code": '0',
                "message": repr(error),
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        try:
            response.raise_for_status()
        except requests.HTTPError as error:
            if response.status_code == 403:
                if self.is_update_proxy:
                    # Ask the system API for a fresh proxy before retrying.
                    proxy = self.api_worker.update_proxy(
                        self.proxy_worker.get_proxy_id())
                    if proxy:
                        self.proxy_worker.set_proxy_data(proxy[1], proxy[0])
                        session.proxies = self.proxy_worker.get_proxy_dict()
                count += 1
                time.sleep(config.DELAY_REQUESTS)
                self._send_task_report("main_content_error", data={
                    "message": repr(error),
                    "code": str(response.status_code),
                    "order": order_id
                })
                continue
            self._send_task_report("main_content_error", data={
                "message": repr(error),
                "code": str(response.status_code),
                "order": order_id
            })
            return {
                "status": False,
                "error": True,
                "status_code": str(response.status_code),
                "message": repr(error),
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        except requests.exceptions.RequestException as error:
            self._send_task_report("main_content_error", data={
                "message": repr(error),
                "code": str(response.status_code),
                "order": order_id
            })
            return {
                "status": False,
                "error": True,
                "status_code": str(response.status_code),
                "message": repr(error),
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        # set cookies
        return {
            "status": True,
            "error": False,
            "status_code": str(response.status_code),
            "page_content": data,
            "type_res": "request_module",
            "proxy": (self.proxy_worker.get_proxy_id(),
                      self.proxy_worker.get_proxy_dict())
        }
    return {
        "status": False,
        "error": True,
        "status_code": "403",
        "message": "Perhaps the proxy server did not respond in time. 403 HTTPError",
        "type_res": "request_module",
        "proxy": (self.proxy_worker.get_proxy_id(),
                  self.proxy_worker.get_proxy_dict())
    }
def get_content(self, link: str, order_id: str):
    """
    Request page content for a given link.

    If the response status is 403, an updated proxy server is requested
    from the system API and the request is retried.

    :param order_id: str
    :param link: str
    :return: dict
    """
    count: int = 0
    session = HTMLSession()
    session.proxies = self.proxy_worker.get_proxy_dict()
    session.headers = settings.LOGIN_HEADERS
    cookies = self.cookies_work.get_cookies()
    while count < self.number_attempts:
        try:
            response = session.get(
                link,
                timeout=(config.REQUEST_TIMEOUT, config.RESPONSE_TIMEOUT),
                cookies=cookies)
            session.close()  # release pooled connections for this attempt
        except requests.exceptions.ConnectionError as error:
            self._send_task_report("target_connect_error", data={
                "message": repr(error),
                "code": '',
                "order": order_id
            })
            return {
                "status": False,
                "error": True,
                "status_code": '0',
                "message": repr(error),
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        try:
            response.raise_for_status()
        except requests.HTTPError as error:
            if response.status_code == 403:
                if self.is_update_proxy:
                    # Ask the system API for a fresh proxy before retrying.
                    proxy = self.api_worker.update_proxy(
                        self.proxy_worker.get_proxy_id())
                    if proxy:
                        self.proxy_worker.set_proxy_data(proxy[1], proxy[0])
                        session.proxies = self.proxy_worker.get_proxy_dict()
                count += 1
                time.sleep(config.DELAY_REQUESTS)
                self._send_task_report("main_content_error", data={
                    "message": repr(error),
                    "code": str(response.status_code),
                    "order": order_id
                })
                continue
            self._send_task_report("main_content_error", data={
                "message": repr(error),
                "code": str(response.status_code),
                "order": order_id
            })
            return {
                "status": False,
                "error": True,
                "status_code": str(response.status_code),
                "message": repr(error),
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        except requests.exceptions.RequestException as error:
            self._send_task_report("main_content_error", data={
                "message": repr(error),
                "code": str(response.status_code),
                "order": order_id
            })
            return {
                "status": False,
                "error": True,
                "status_code": str(response.status_code),
                "message": repr(error),
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        # set cookies
        return {
            "status": True,
            "error": False,
            "status_code": str(response.status_code),
            "message": response.text,
            "type_res": "request_module",
            "proxy": (self.proxy_worker.get_proxy_id(),
                      self.proxy_worker.get_proxy_dict())
        }
    return {
        "status": False,
        "error": True,
        "status_code": "403",
        "message": "Perhaps the proxy server did not respond in time. 403 HTTPError",
        "type_res": "request_module",
        "proxy": (self.proxy_worker.get_proxy_id(),
                  self.proxy_worker.get_proxy_dict())
    }
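# A self-contained sketch of the retry-with-proxy-rotation pattern used by
# auth_html and get_content above, with the worker objects stubbed out. All
# names here (get_fresh_proxy, MAX_ATTEMPTS, DELAY_SECONDS) are illustrative,
# not part of the original classes.
import time

import requests

MAX_ATTEMPTS = 3
DELAY_SECONDS = 2


def get_fresh_proxy():
    # Stub: a real implementation would ask the system API for a new proxy.
    return {}


def fetch_with_retries(url):
    session = requests.Session()
    session.proxies = get_fresh_proxy()
    for _ in range(MAX_ATTEMPTS):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            return {"status": True,
                    "status_code": response.status_code,
                    "message": response.text}
        except requests.HTTPError as error:
            if error.response is not None and error.response.status_code == 403:
                # Blocked: rotate the proxy, wait, and try again.
                session.proxies = get_fresh_proxy()
                time.sleep(DELAY_SECONDS)
                continue
            return {"status": False, "message": repr(error)}
        except requests.exceptions.RequestException as error:
            return {"status": False, "message": repr(error)}
    return {"status": False, "message": "All attempts returned 403"}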
def session(self):
    session = HTMLSession()
    session.headers = self._get_fake_headers()
    return session
import firebase
from requests_html import HTMLSession

LOGIN_URL = "https://news.ycombinator.com/login"
ARTICLE_URL = "https://news.ycombinator.com/item?id=%s"
STREAM_API = "https://hacker-news.firebaseio.com/v0/updates.json"
COMMENT_API = "https://hacker-news.firebaseio.com/v0/item/%s.json"
# %% escapes the literal percent-encoding so %-formatting does not choke on it.
VOTE_URL = "https://news.ycombinator.com/vote?id=%s&how=%s&auth=%s&goto=item%%3Fid%%3D%s"
VOTE_DIRECTION = "up"
UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Iridium/2017.11 Safari/537.36 Chrome/62.0.3202.94")

header = {'User-Agent': UA}
auth_cookie = None

session = HTMLSession()
session.headers = header


def vote(comment_id, direction):
    url = ARTICLE_URL % comment_id
    resp = session.get(url)
    vote_url = None  # guard against the link never being found
    for link in resp.html.absolute_links:
        # str has no .contains(); use the `in` operator instead.
        if "vote?id=%s&how=%s" % (comment_id, direction) in link:
            vote_url = link
    if not vote_url:
        raise RuntimeError("No vote url found")
    vote_resp = session.get(vote_url)
    return vote_resp
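# Hedged usage example for vote(): the item id below is made up, and a real
# upvote also requires authenticating against LOGIN_URL first so the session
# carries a valid cookie; unauthenticated requests just get the login page.
vote("12345678", VOTE_DIRECTION)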