Example #1
import asyncio

from pyppeteer import launch
from requests_html import AsyncHTMLSession


async def logic(urls):
    try:
        # Create a fresh event loop and register it for this thread.
        new_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(new_loop)
        session = AsyncHTMLSession()
        browser = await launch({
            'ignoreHTTPSErrors': True,
            'headless': True,
            'handleSIGINT': False,
            'handleSIGTERM': False,
            'handleSIGHUP': False,
            'args': ['--no-sandbox', '--disable-setuid-sandbox']
        })
        # Attach the manually launched browser so the session reuses it.
        session._browser = browser
        urls1 = urls.split(',')
        emails1 = []
        for url in urls1:
            # fetch() is a scraping helper defined elsewhere in the
            # original project; it returns the addresses found on one page.
            emails = await fetch(url, session)
            emails1.extend(emails)
        
        # Each address appears twice in the results; keep one copy per pair.
        emails1 = emails1[1::2]

        returndict = {}
        for i, email in enumerate(emails1, start=1):
            returndict['email' + str(i)] = email

        return returndict
    except Exception as e:
        print(e)
        return []
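A hedged sketch of how logic() could be driven from synchronous code, assuming the fetch() helper it relies on is in scope; the run_logic name and the URL list are illustrative, not from the original project:

import asyncio

def run_logic(urls):
    # logic() installs its own event loop, so drive it with a throwaway
    # loop from a plain synchronous caller (e.g. a web-framework view).
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(logic(urls))
    finally:
        loop.close()

print(run_logic('https://example.com,https://example.org'))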
Example #2
import asyncio

import pyppeteer
from requests_html import AsyncHTMLSession


# Method of a class that stores the ticker symbol in self.symbol.
async def get_site(self):
    new_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(new_loop)
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False
    })
    # Attach the manually launched browser so arender() reuses it.
    session._browser = browser
    url = 'https://money.tmx.com/en/quote/' + self.symbol
    resp_page = await session.get(url)
    # Render the page to execute its JavaScript before returning it.
    await resp_page.html.arender()
    return resp_page
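A minimal host-class sketch for the method above; the Quote class and the 'SHOP' symbol are assumptions, since the snippet only implies a self.symbol attribute:

import asyncio

class Quote:
    def __init__(self, symbol):
        self.symbol = symbol

    get_site = get_site  # adopt the coroutine defined above as a method


loop = asyncio.new_event_loop()
try:
    page = loop.run_until_complete(Quote('SHOP').get_site())
    print(page.html.find('title', first=True).text)
finally:
    loop.close()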
Example #3
import asyncio

import pyppeteer
from requests_html import AsyncHTMLSession


async def get_post(your_query_url):
    new_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(new_loop)
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False
    })
    try:
        session._browser = browser
        resp_page = await session.get(your_query_url)
        # Render the page's JavaScript with the headless browser,
        # then close the browser before returning the rendered page.
        await resp_page.html.arender()
        await browser.close()
        return resp_page
    except TimeoutError:
        await browser.close()
        raise  # re-raise the original TimeoutError with its traceback
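A hedged way to drive get_post from synchronous code; the query URL below is a placeholder:

import asyncio

loop = asyncio.new_event_loop()
try:
    page = loop.run_until_complete(get_post('https://example.com/search?q=python'))
    print(page.html.text[:200])  # first 200 characters of the rendered page
finally:
    loop.close()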
Example #4
import logging
from typing import List, Tuple
from urllib.parse import quote_plus

import pyppeteer
from requests_html import HTML, AsyncHTMLSession


# OPTIONS, SEARCH_BASE, RENDER_TIMEOUT_SECS and
# SIMPLE_SEARCH_RESULT_CLASS_FINGERPRINT are module-level constants
# defined elsewhere in the original project.
async def get_search_results_async(query: str) -> List[Tuple[str, str]]:
    encoded_query = quote_plus(query)
    session = None
    try:
        browser = await pyppeteer.launch(ignoreHTTPSErrors=False,
                                         headless=True,
                                         args=['--no-sandbox'],
                                         options=OPTIONS)

        session = AsyncHTMLSession(browser_args=OPTIONS)
        session._browser = browser

        # The library is doing sketchy stuff that breaks static-analysis here.
        # noinspection PyUnresolvedReferences
        response = await session.get(SEARCH_BASE + encoded_query)
        html: HTML = response.html
        await html.arender(timeout=RENDER_TIMEOUT_SECS)
        attrs = [
            link.attrs
            for link in html.find(SIMPLE_SEARCH_RESULT_CLASS_FINGERPRINT)
        ]

        if attrs:
            pairs = ((attr["title"], attr["href"]) for attr in attrs
                     if "title" in attr and "href" in attr)
            # Get the video codes out of the pairs.
            return [(title, href.split("=")[-1]) for title, href in pairs]
        else:
            logging.warning(f"Scraping returned 0 results for {query}.")
            return []
    except TimeoutError:
        logging.warning(
            f"Rendering of request for {query} timed out (max {RENDER_TIMEOUT_SECS} seconds)."
        )
        return []
    finally:
        if session:
            await session.close()
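Example #4 depends on module-level constants that the excerpt does not include. The values below are invented placeholders for illustration; note that OPTIONS is passed both as pyppeteer launch options and as requests_html browser_args in the excerpt, so its exact shape in the original project is unclear:

# All four names are placeholder assumptions, not the original values.
RENDER_TIMEOUT_SECS = 20  # time budget for arender(), in seconds
SEARCH_BASE = 'https://example.com/search?q='  # search endpoint prefix
SIMPLE_SEARCH_RESULT_CLASS_FINGERPRINT = 'a.result-link'  # CSS selector for result links
OPTIONS = {'args': ['--no-sandbox']}  # assumed Chromium launch options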
Example #5
import os
import re

import pyppeteer
from requests_html import AsyncHTMLSession


async def getposts_async():
    htmls = []
    base_url = 'https://mbasic.facebook.com'
    base_page_url = 'https://mbasic.facebook.com/%E8%BB%9F%E9%AB%94%E6%AF%8F%E6%97%A5%E6%96%B0%E8%81%9E-102888707949836/'
    urls = []
    next_page_url = base_page_url

    # collect urls
    regex_story = re.compile(r'^/story\.php.*=%2As-R$')
    regex_next_page = re.compile(r'^/profile\.php\?sectionLoadingID.*$')
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False,
        'args': ['--no-sandbox', '--disable-setuid-sandbox']
    })
    session._browser = browser
    # Parse cookies from the COOKIES_STR environment variable,
    # formatted as 'name1=value1; name2=value2'.
    cookies_str = os.environ.get('COOKIES_STR', '')
    cookie_dict = {}
    for kv_pair in cookies_str.split('; '):
        if kv_pair:
            key, _, value = kv_pair.partition('=')
            cookie_dict[key] = value

    while True:
        resp_page = await session.get(next_page_url, cookies=cookie_dict)
        resp_page.encoding = 'utf-8'
        #await resp_page.html.arender()
        links = resp_page.html.links  # set of every href found on the page
        url_tags = [link for link in links if regex_story.search(link)]
        next_page_tags = [
            link for link in links if regex_next_page.search(link)
        ]

        for url_tag in url_tags:
            urls.append('{base_url}{href}'.format(base_url=base_url,
                                                  href=url_tag))
        if not next_page_tags:
            break
        next_page_url = '{base_url}{href}'.format(base_url=base_url,
                                                  href=next_page_tags[0])
        #print('next_page_url ', next_page_url)

    #print('urls ', urls)
    """
    collect data
    """
    posts = []
    for url in urls:
        #httpproxy=choice(http_proxies)
        #httpsproxy=choice(https_proxies)
        resp = await session.get(url, cookies=cookie_dict)
        #print('host ', request.environ['REMOTE_ADDR'])
        resp.encoding = 'utf-8'
        #htmls.append(resp.html)
        posts.append({
            'title': '',
            'url': '',
            'content': resp.html.find('div.bc')[0].text
        })
        """sleep for 5 sec
        """
        # time.sleep(5)
    #print(htmls)
    #posts=[{'title': '123','url': 'okok','content':'okkk'}, {'title': '222', 'url': '222', 'content': '555'}]
    return posts
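A hedged driver for the crawler above; it assumes COOKIES_STR is already exported in the environment:

import asyncio

loop = asyncio.new_event_loop()
try:
    posts = loop.run_until_complete(getposts_async())
    for post in posts:
        print(post['content'][:80])
finally:
    loop.close()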