Example #1
0
def get_session(**kwargs) -> HTMLSession:
    """Build an ``HTMLSession`` configured from keyword flags.

    Both keywords are required; omitting either raises ``KeyError``.

    Keyword Args:
        use_proxy: When truthy, HTTP and HTTPS traffic is routed through
            ``rproxy:5566``.
        default_header: When falsy, the session headers are replaced with
            the value returned by ``get_headers()``.

    Returns:
        The configured session.
    """
    new_session = HTMLSession()

    proxy_enabled = kwargs['use_proxy']
    if proxy_enabled:
        new_session.proxies = {'http': 'rproxy:5566', 'https': 'rproxy:5566'}

    keep_default_headers = kwargs['default_header']
    if not keep_default_headers:
        new_session.headers = get_headers()

    return new_session
Example #2
0
def get_msis_data(municipalities, date=None):
    """Fetch the MSIS disease-event table from statistikk.fhi.no.

    The page is downloaded, rendered with a headless browser (the table is
    produced by JavaScript) and the first ``<table>`` element is parsed
    with pandas.

    NOTE: ``municipalities`` and ``date`` are accepted for interface
    compatibility but are not used yet; the query (diagnosis 713, month 3,
    municipalities 3416 and 3401) is hard-coded in the URL below.

    Args:
        municipalities: Currently unused.
        date: Currently unused. Defaults to ``None`` rather than
            ``datetime.date.today()`` because a default expression is
            evaluated only once, when the function is defined, and would
            go stale in a long-running process.

    Returns:
        The first DataFrame parsed from the first table on the page.

    Raises:
        IndexError: If the rendered page contains no ``<table>`` element.
    """
    url = 'https://statistikk.fhi.no/msis/sykdomshendelser?etter=diagnose&fordeltPaa=geografi&diagnose=713&diagramtype=tabell&maaned=3&kommune=3416,3401'

    # Header for Chrome 83 Windows
    header = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "******",
        "Sec-Fetch-Dest": "document",
        "Referer": "https://www.google.no/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9"
    }

    # The context manager closes the session (and the browser that
    # ``render()`` starts) even when the request or the parsing fails.
    with HTMLSession() as session:
        session.headers = header
        r = session.get(url)
        r.html.render()
        tables = r.html.find('table')
        dfs = pd.read_html(tables[0].html)

    return dfs[0]
Example #3
0
def create_session(url):
    """Fetch ``url`` through a cfscrape scraper seeded from an HTMLSession.

    An ``HTMLSession`` carrying a Firefox 68 User-Agent is handed to
    ``cfscrape.create_scraper``; the resulting scraper performs the GET.
    Both the seed session and the scraper are closed before returning,
    including when the request raises.

    Args:
        url: Address to request.

    Returns:
        The response object returned by the scraper's ``get`` call.
    """
    with HTMLSession() as session:
        session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0'
        }
        # The scraper is the object that actually opens connections, so it
        # has to be closed as well as the seed session.
        with cfscrape.create_scraper(session) as scraper_sess:
            return scraper_sess.get(url)
Example #4
0
    def _get_lyrics_and_title(self, url):
        """Download a page and extract the song title and lyrics text.

        The request is sent with ``self.headers``. The title is read from
        the first element matching
        ``.header_with_cover_art-primary_info-title`` and the lyrics from
        the first element matching ``.lyrics``.

        :param url: address of the page to fetch
        :return: ``(title, lyrics)`` tuple of strings
        :raises AttributeError: if either selector matches nothing
            (``find(..., first=True)`` then returns ``None``)
        """
        # The context manager releases the session's connections even when
        # the request or an element lookup fails.
        with HTMLSession() as session:
            session.headers = self.headers
            response = session.get(url)

            title = response.html.find(
                ".header_with_cover_art-primary_info-title", first=True).text

            lyrics = response.html.find(".lyrics", first=True).text

        return title, lyrics
Example #5
0
from requests_html import HTMLSession, HTML
from bs4 import BeautifulSoup

# Fetch a LeetCode problem page, dump its response headers and links, render
# the JavaScript and print the raw response body.
sess = HTMLSession()
# ``headers`` must stay a mapping: assigning a tuple of strings (as the
# garbled original did) makes the first request fail while headers are
# prepared. Set only the User-Agent and keep the other defaults.
sess.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 5.1; rv:41.0) Gecko/20100101 Firefox/41.0'

response = sess.get('https://leetcode.com/problems/decode-xored-array/')

print(response.headers)

print(response.html.links)

response.html.render(timeout=10, sleep=10)

#print(response.html.find('A New Way to Learn'))
# a_hrefs = response.html.xpath('A New Way to Learn')

# NOTE(review): ``response.content`` is the body as downloaded, not the
# rendered DOM; use ``response.html.html`` if the rendered markup is wanted.
body = response.content
print(body)
# soup = BeautifulSoup(body, 'lxml')

# print(response.html.find('A New Way to Learn', first=True))

# print(soup.text)
Example #6
0
import os
import csv
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from fake_useragent import UserAgent

# Scrape the page at $MORPH_HOST and print the positions listed under every
# element matched by the CSS selector in $MORPH_SELECTOR.
ua = UserAgent()
url = os.environ['MORPH_HOST']
selector = os.environ['MORPH_SELECTOR']
session = HTMLSession()
# ``headers`` must be a mapping; a random User-Agent string is stored under
# its header name instead of replacing the whole header set with a string.
session.headers['User-Agent'] = ua.random

# The page has to be fetched before it can be rendered.
r = session.get(url)
r.html.render()

# requests_html exposes CSS selection as ``find`` (``select`` is the
# BeautifulSoup spelling and does not exist on these objects).
vip_companies = r.html.find(selector)

for row in vip_companies:
    # ``find`` returns a list unless ``first=True``; tolerate rows that have
    # no company link.
    company_el = row.find('a span', first=True)
    company = company_el.text if company_el is not None else ''
    for pos in row.find('ul li'):
        print('has position ', pos.text)
    print('\n')

#with requests.Session() as s:
#    download = s.get(CSV_URL,ua)
#
#    decoded_content = download.content.decode('utf-8')
#
#    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
#    my_list = list(cr)
#    for row in my_list:
#        print(row)
Example #7
0
    def auth_html(self, order_id: str):
        """
        Fetch and render ``settings.LOGIN_PAGE``, retrying on HTTP 403.

        The request uses the current proxy, the headers from
        ``self.headers_work`` and the cookies from ``self.cookies_work``.
        On a 403 response the proxy is optionally refreshed through
        ``self.api_worker`` (when ``self.is_update_proxy`` is set), the
        method sleeps ``config.DELAY_REQUESTS`` and tries again, up to
        ``self.number_attempts`` times. Every failure is also reported
        through ``self._send_task_report``.

        The session (and the browser started by ``render()``) is closed on
        every exit path.

        :param order_id: str, forwarded in task reports
        :return: dict with keys ``status``, ``error``, ``status_code``,
            ``type_res``, ``proxy`` and either ``page_content`` (success)
            or ``message`` (failure)
        """

        def proxy_info():
            # Read at call time so a proxy refreshed during a retry is
            # reflected in the result.
            return (self.proxy_worker.get_proxy_id(),
                    self.proxy_worker.get_proxy_dict())

        def failure(status_code, message):
            # Common shape of every unsuccessful result.
            return {
                "status": False,
                "error": True,
                "status_code": status_code,
                "message": message,
                "type_res": "request_module",
                "proxy": proxy_info()
            }

        def report(kind, error, code):
            # Forward an error to the task-report channel.
            self._send_task_report(kind,
                                   data={
                                       "message": repr(error),
                                       "code": code,
                                       "order": order_id
                                   })

        count: int = 0
        with HTMLSession() as session:
            session.proxies = self.proxy_worker.get_proxy_dict()
            session.headers = self.headers_work.get_headers()
            cookies = self.cookies_work.get_cookies()
            while count < self.number_attempts:
                try:
                    response = session.get(settings.LOGIN_PAGE,
                                           cookies=cookies)
                    response.html.render()
                    data = response.html.html
                except requests.exceptions.ConnectionError as error:
                    report("target_connect_error", error, '')
                    return failure('0', repr(error))

                try:
                    response.raise_for_status()
                except requests.HTTPError as error:
                    if response.status_code == 403:
                        if self.is_update_proxy:
                            # update proxy server settings
                            proxy = self.api_worker.update_proxy(
                                self.proxy_worker.get_proxy_id())
                            if proxy:
                                self.proxy_worker.set_proxy_data(
                                    proxy[1], proxy[0])
                                session.proxies = (
                                    self.proxy_worker.get_proxy_dict())
                        count += 1
                        time.sleep(config.DELAY_REQUESTS)
                        report("main_content_error", error,
                               str(response.status_code))
                        continue
                    report("main_content_error", error,
                           str(response.status_code))
                    return failure(str(response.status_code), repr(error))
                except requests.exceptions.RequestException as error:
                    # Kept from the original as a safety net for any other
                    # requests error raised by ``raise_for_status``.
                    report("main_content_error", error,
                           str(response.status_code))
                    return failure(str(response.status_code), repr(error))

                return {
                    "status": True,
                    "error": False,
                    "status_code": str(response.status_code),
                    "page_content": data,
                    "type_res": "request_module",
                    "proxy": proxy_info()
                }

        # Every attempt ended in a 403.
        return failure(
            "403",
            "Perhaps the proxy server did not respond in time. 403 HTTPError")
Example #8
0
    def get_content(self, link: str, order_id: str):
        """
        Request the page content for a given link.

        The request uses the current proxy, ``settings.LOGIN_HEADERS`` and
        the cookies from ``self.cookies_work``. If the response status is
        403, an updated proxy server is requested from the system API (when
        ``self.is_update_proxy`` is set), the method sleeps
        ``config.DELAY_REQUESTS`` and tries again, up to
        ``self.number_attempts`` times. Every failure is also reported
        through ``self._send_task_report``.

        The session is closed once, on every exit path, instead of being
        closed inside the retry loop and then reused.

        :param link: str, address to request
        :param order_id: str, forwarded in task reports
        :return: dict with keys ``status``, ``error``, ``status_code``,
            ``message`` (response text on success, error description on
            failure), ``type_res`` and ``proxy``
        """

        def proxy_info():
            # Read at call time so a proxy refreshed during a retry is
            # reflected in the result.
            return (self.proxy_worker.get_proxy_id(),
                    self.proxy_worker.get_proxy_dict())

        def result(ok, status_code, message):
            # Common shape of every result returned by this method.
            return {
                "status": ok,
                "error": not ok,
                "status_code": status_code,
                "message": message,
                "type_res": "request_module",
                "proxy": proxy_info()
            }

        def report(kind, error, code):
            # Forward an error to the task-report channel.
            self._send_task_report(kind,
                                   data={
                                       "message": repr(error),
                                       "code": code,
                                       "order": order_id
                                   })

        count: int = 0
        with HTMLSession() as session:
            session.proxies = self.proxy_worker.get_proxy_dict()
            session.headers = settings.LOGIN_HEADERS
            cookies = self.cookies_work.get_cookies()
            while count < self.number_attempts:
                try:
                    response = session.get(link,
                                           timeout=(config.REQUEST_TIMEOUT,
                                                    config.RESPONSE_TIMEOUT),
                                           cookies=cookies)
                except requests.exceptions.ConnectionError as error:
                    report("target_connect_error", error, '')
                    return result(False, '0', repr(error))

                try:
                    response.raise_for_status()
                except requests.HTTPError as error:
                    if response.status_code == 403:
                        if self.is_update_proxy:
                            # update proxy server settings
                            proxy = self.api_worker.update_proxy(
                                self.proxy_worker.get_proxy_id())
                            if proxy:
                                self.proxy_worker.set_proxy_data(
                                    proxy[1], proxy[0])
                                session.proxies = (
                                    self.proxy_worker.get_proxy_dict())
                        count += 1
                        time.sleep(config.DELAY_REQUESTS)
                        report("main_content_error", error,
                               str(response.status_code))
                        continue
                    report("main_content_error", error,
                           str(response.status_code))
                    return result(False, str(response.status_code),
                                  repr(error))
                except requests.exceptions.RequestException as error:
                    # Kept from the original as a safety net for any other
                    # requests error raised by ``raise_for_status``.
                    report("main_content_error", error,
                           str(response.status_code))
                    return result(False, str(response.status_code),
                                  repr(error))

                return result(True, str(response.status_code), response.text)

        # Every attempt ended in a 403.
        return result(
            False, "403",
            "Perhaps the proxy server did not respond in time. 403 HTTPError")
 def session(self):
     """Return a new ``HTMLSession`` whose headers are ``self._get_fake_headers()``."""
     fresh_session = HTMLSession()
     fake_headers = self._get_fake_headers()
     fresh_session.headers = fake_headers
     return fresh_session
Example #10
0
import firebase

# Hacker News endpoints. The templates containing ``%s`` are filled in with
# the ``%`` operator.
LOGIN_URL = "https://news.ycombinator.com/login"
ARTICLE_URL = "https://news.ycombinator.com/item?id=%s"
STREAM_API = "https://hacker-news.firebaseio.com/v0/updates.json"
COMMENT_API = "https://hacker-news.firebaseio.com/v0/item/%s.json"
# The URL-encoded "?id=" in the ``goto`` value (%3Fid%3D) must have its
# percent signs doubled; otherwise ``%3F`` / ``%3D`` are parsed as format
# specifiers and ``VOTE_URL % (...)`` raises ValueError.
# Placeholders, in order: item id, direction, auth token, item id.
VOTE_URL = "https://news.ycombinator.com/vote?id=%s&how=%s&auth=%s&goto=item%%3Fid%%3D%s"

VOTE_DIRECTION = "up"
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Iridium/2017.11 Safari/537.36 Chrome/62.0.3202.94"

# Headers sent with every request made through the module-level session.
header = {'User-Agent': UA}
auth_cookie = None

# Module-level session used by vote(); its header set is replaced wholesale
# with `header`, so only the User-Agent defined there is sent explicitly.
session = HTMLSession()
session.headers = header


def vote(comment_id, direction):
    """Locate the vote link for an item on its page and request it.

    The item page (``ARTICLE_URL``) is fetched with the module-level
    session and its absolute links are searched for one containing
    ``vote?id=<comment_id>&how=<direction>``; that link is then requested.

    Args:
        comment_id: Id of the item to vote on.
        direction: Value of the ``how`` query parameter to look for
            (e.g. ``VOTE_DIRECTION``).

    Raises:
        RuntimeError: If the page contains no matching vote link.
    """
    url = ARTICLE_URL % comment_id
    resp = session.get(url)

    wanted = "vote?id=%s&how=%s" % (comment_id, direction)
    # Start from None so that a page without a matching link reaches the
    # RuntimeError below instead of failing with UnboundLocalError.
    vote_url = None
    for link in resp.html.absolute_links:
        # Links are plain strings; substring membership is spelled ``in``
        # (``str`` has no ``contains`` method).
        if wanted in link:
            vote_url = link

    if not vote_url:
        raise RuntimeError("No vote url found")

    # print("Vote URL: %s" % vote_url)
    vote_resp = session.get(vote_url)