Esempio n. 1
0
 def __init__(self, starturl, depth=10, release=False):
     """Initialise the instance from a start URL.

     Args:
         starturl: URL used to seed ``self.urls`` and to derive ``self.host``.
         depth: Stored unchanged on ``self.depth`` (defaults to 10).
         release: Stored unchanged on ``self.release`` (defaults to False).
     """
     # One asynchronous HTML session is created per instance.
     self.session = requests_html.AsyncHTMLSession()
     # ``urls`` starts out holding only the start URL; ``urls_done`` is empty.
     self.urls, self.urls_done = [starturl], []
     self.depth = depth
     self.release = release
     # Keep the network location (host[:port]) of the start URL.
     parsed_start = urlparse.urlparse(starturl)
     self.host = parsed_start.netloc
Esempio n. 2
0
def aggregate_zip_codes(state_links, state_list):
    """Run ``aggregate_helper`` once per state on a shared async session.

    Each state in ``state_list`` gets its own job, built as
    ``partial(aggregate_helper, state_links, state, session)``; all jobs are
    executed together with ``AsyncHTMLSession.run``.

    Args:
        state_links: Passed through unchanged as the first argument of
            ``aggregate_helper``.
        state_list: Iterable of states; one job is scheduled per item.
    """
    asess = requests_html.AsyncHTMLSession()
    try:
        jobs = [
            partial(aggregate_helper, state_links, state, asess)
            for state in state_list
        ]
        # ``run`` hands the tasks to ``asyncio.wait``, which rejects an
        # empty collection, so there is nothing to do without jobs.
        if jobs:
            asess.run(*jobs)
            # Fixed pause kept from the original implementation.
            sleep(20)
    finally:
        # ``AsyncHTMLSession.close`` is a coroutine function: calling it
        # without driving it on the loop would never actually close anything.
        asess.loop.run_until_complete(asess.close())
Esempio n. 3
0
async def main():
    """Fetch and render the Sina rolling-news page, dumping it to ./tmp.html.

    Prints the detected encoding, writes the rendered HTML to ``./tmp.html``
    using that encoding, then prints the final URL and whether the response
    is a redirect. The session is always closed before returning.
    """
    sina_url = 'https://news.sina.com.cn/roll/'

    s = requests_html.AsyncHTMLSession()
    try:
        r = await s.get(sina_url)
        await r.html.arender()
        print(r.html.encoding)
        # The context manager guarantees the file is closed even if the
        # write fails part-way through.
        with open("./tmp.html", "w", encoding=r.html.encoding) as f:
            f.write(r.html.html)
        print(r.url, r.is_redirect)
    finally:
        # Rendering starts a headless browser; closing the session shuts it
        # down instead of leaving it running.
        await s.close()
async def get_news_links_from_page_url(url: str) -> list:
    """Return every valid link found in the rendered page at ``url``.

    The page is fetched and rendered with an ``AsyncHTMLSession``, parsed
    with BeautifulSoup, and the ``href`` of each anchor is kept when
    ``__isValidUrl__`` accepts it.

    Args:
        url: Address of the page to scan; must pass ``__isValidUrl__``.

    Returns:
        A list of the accepted ``href`` values, in document order.

    Raises:
        Exception: If ``url`` itself is not accepted by ``__isValidUrl__``.
    """
    if not __isValidUrl__(url):
        raise Exception(f"not valid url :|{url}|")

    s = requests_html.AsyncHTMLSession()
    try:
        r = await s.get(url)
        await r.html.arender()
        bs = bs4.BeautifulSoup(r.html.html, 'html.parser')
        # ``href=True`` skips anchors that have no href attribute at all
        # (e.g. named anchors), which would otherwise raise KeyError.
        return [
            anchor['href']
            for anchor in bs.find_all('a', href=True)
            if __isValidUrl__(anchor['href'])
        ]
    finally:
        await s.close()
Esempio n. 5
0
async def get_page_html(page_link: str = '',
                        html_element: bool = True) -> lxml.html.Element:
    """Fetch a JS-rendered page and return its parsed HTML or the response.

    Must be awaited.

    Args:
        page_link: URL of the page to fetch. An empty value short-circuits
            and returns ``None`` without any network access.
        html_element: When True, return an lxml element parsed from the
            rendered page's raw HTML; when False, return the response object
            itself.

    Returns:
        The lxml element or the response as selected by ``html_element``, or
        ``None`` when ``page_link`` is empty or the request fails with a
        ``requests`` exception (the exception is printed).
    """
    if not page_link:
        return None

    asession = reqHTML.AsyncHTMLSession()
    # The response stays bound to its session, so the session is only left
    # open when the response itself is handed back to the caller.
    hand_over_session = False
    try:
        r = await asession.get(page_link,
                               headers={'User-Agent': 'Mozilla/5.0'})
        await r.html.arender()
        if html_element:
            return lxml.html.fromstring(r.html.raw_html)
        hand_over_session = True
        return r
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    finally:
        if not hand_over_session:
            # Nothing returned on these paths references the session, so
            # shut it (and any browser started for rendering) down here.
            await asession.close()
Esempio n. 6
0
 def __init__(self, start_session, end_session):
     """Store the session bounds and create empty per-instance state.

     Args:
         start_session: Stored unchanged on ``self._start_session``.
         end_session: Stored unchanged on ``self._end_session``.
     """
     self._start_session, self._end_session = start_session, end_session
     # Asynchronous HTML session owned by this instance.
     self._session = requests_html.AsyncHTMLSession()
     # Starts empty; nothing is cached at construction time.
     self._cached_asset_details = dict()
     # Not known yet at construction time.
     self._trade_file_urls = None
Esempio n. 7
0
from starlette.applications import Starlette
from starlette.templating import Jinja2Templates
from starlette.requests import Request
import json
import requests_html
import asyncio

# Jinja2 template loader rooted at the local "templates" directory.
templates = Jinja2Templates(directory='templates')

# Starlette application object; handlers are registered with @app.route.
app = Starlette()
# Asynchronous HTML session created once at import time.
# NOTE(review): presumably shared by the request handlers — confirm usage.
session = requests_html.AsyncHTMLSession()
# URL of the text-generation prediction service.
# NOTE(review): hard-coded plain-HTTP IP address; consider moving to config.
ENDPOINT = "http://52.35.39.131:1337/text-gen/predict"


@app.route('/', methods=["GET", "POST"])
async def homepage(request: Request):
    """Serve the index page on GET and build the prediction payload otherwise.

    GET renders ``index.html`` with a placeholder result. For any other
    method the submitted form is read and wrapped into a dict whose
    ``"input"`` value is a JSON string with the text, the requested word
    count (converted to ``int``) and a fixed ``num_tries`` of 3.

    NOTE(review): in the code visible here the non-GET path only builds
    ``data`` and then falls off the end (returning ``None``); the remainder
    of the handler appears to be missing from this snippet.
    """
    if request.method == "GET":
        context = {
            'result': ["result will appear here"],
            "request": request,
        }
        return templates.TemplateResponse('index.html', context)

    form = await request.form()
    payload = {
        "text": form["text"],
        "num_words": int(form["num_words"]),
        "num_tries": 3,
    }
    data = {"input": json.dumps(payload)}
Esempio n. 8
0
 def __init__(self, **kwargs):
     """Create the session, copy the required options, then call ``run``.

     Args:
         **kwargs: Must contain ``url`` and ``width``; each is stored on the
             instance under the same name. A missing key raises ``KeyError``.
     """
     self.session = requests_html.AsyncHTMLSession()
     # Copy the required options in order: url first, then width.
     for option in ('url', 'width'):
         setattr(self, option, kwargs[option])
     # Work starts immediately as part of construction.
     self.run()