Example 1
async def open_driver(proxy=None):
    chromeOptions = {
        "args": [
            "enable-automation",
            "--headless",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-infobars",
            "--disable-dev-shm-usage",
            "--disable-browser-side-navigation",
        ]
    }
    if proxy and False:
        # TODO: proxy support for arsenic. This Selenium-style block is
        # intentionally disabled (note the `and False`) and would fail if
        # enabled, since `capabilities` is never defined in this snippet.
        prox = Proxy()
        prox.proxy_type = ProxyType.MANUAL
        prox.http_proxy = proxy
        prox.socks_proxy = proxy
        prox.ssl_proxy = proxy

        prox.add_to_capabilities(capabilities)

    if "SELENIUM_URL" in os.environ:
        driver = await start_session(
            services.Remote(os.getenv("SELENIUM_URL")),
            browsers.Chrome(chromeOptions=chromeOptions),
        )
    else:
        driver = await start_session(
            services.Chromedriver(),
            browsers.Chrome(chromeOptions=chromeOptions))
    return driver
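A minimal usage sketch for the helper above, assuming the arsenic imports the snippet already relies on (`start_session`, `services`, `browsers`) plus `os`; the URL is a placeholder:

import asyncio
from arsenic import stop_session

async def demo():
    driver = await open_driver()              # optionally pass proxy="host:port"
    try:
        await driver.get("https://example.com")
        print(await driver.get_url())
    finally:
        await stop_session(driver)            # always tear the session down

asyncio.run(demo())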
Example 2
async def create_source_selenium(url: str, proxy_list: list = None) -> str:
    service = services.Chromedriver(binary="./chromedriver")
    if proxy_list is not None:
        browser = browsers.Chrome(chromeOptions={
            'args':
            ['--headless', f"--proxy-server={random.choice(proxy_list)}"]
        })
    else:
        browser = browsers.Chrome(chromeOptions={'args': ['--headless']})
    async with get_session(service, browser) as session:
        await session.get(url)
        return await session.get_page_source()
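A sketch of fanning the coroutine above out over several URLs with asyncio.gather; the URLs are placeholders, and each call starts its own chromedriver process:

import asyncio

async def fetch_all(urls, proxies=None):
    # one headless Chrome session per URL, all driven concurrently
    return await asyncio.gather(
        *(create_source_selenium(url, proxy_list=proxies) for url in urls)
    )

sources = asyncio.run(fetch_all(["https://example.com", "https://example.org"]))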
Example 3
async def session(request):
    global time_wait
    config = load_config(request.config.getoption('--config-file'))
    if 'db_path' in config:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, 'tmp_pythonz_db')
        db_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), config['db_path']))
        if not os.path.isfile(temp_path):
            shutil.copy2(db_path, temp_path)
    if config['browser'] == 'firefox':
        service = services.Geckodriver()
        browser = browsers.Firefox()
    elif config['browser'] == 'chrome':
        service = services.Chromedriver()
        browser = browsers.Chrome()
    else:
        raise ValueError(f"Unrecognized browser {config['browser']}")
    session = await start_session(service, browser, bind=config['base_url'])
    # wait-time setup for each test worker
    slaveinput = getattr(request.config, 'slaveinput', None)
    if slaveinput is None:  # single-worker run
        session.time_wait = 0
    else:
        session.time_wait = slaveinput['time_wait']
    try:
        yield session
    finally:
        await stop_session(session)
        if 'db_path' in config:
            if os.path.isfile(temp_path):
                shutil.copy2(temp_path, db_path)
                os.remove(temp_path)
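This coroutine has the shape of a pytest fixture: it takes `request`, yields the session, and restores the test database afterwards. In the original project it is presumably decorated with `@pytest.fixture` and driven by an async-aware plugin such as pytest-asyncio; a hypothetical test consuming it could look like this:

async def test_front_page_opens(session):
    # '/' is resolved against config['base_url'] because the session was
    # started with bind=config['base_url']
    await session.get('/')
    assert await session.get_page_source()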
Example 4
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> dict:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns a list of dictionaries, with product id, slug, link."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []

        await asyncio.sleep(10)
        body = await session.get_page_source()  # getting raw HTML
        html_r = await get_parsable_html(body)  # converting to parsable HTML
        links = await get_fabric_links(html_r)  # getting relative links
        product_data = await get_product_data(url, html_r)

        dataset = {"links": links, "product_data": product_data}
        #_____________printing time consumption_________________#
        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")

        return dataset
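The `i` and `start` parameters are only meaningful when several of these coroutines run side by side; a sketch of that intended use (URLs are placeholders):

import asyncio
import time

async def run_batch(urls):
    start = time.time()
    # each task prints "<i> took <n> seconds" when it finishes
    tasks = [scraper(url, i=i, start=start) for i, url in enumerate(urls)]
    return await asyncio.gather(*tasks)

datasets = asyncio.run(run_batch(["https://example.com/p/1", "https://example.com/p/2"]))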
Example 5
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> list:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns a list of dictionaries, with product id, slug, link."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {'args': ['--headless', '--disable-gpu']}
    }

    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []

        body = await session.get_page_source()
        links = await get_fabric_links(body)

        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")

        return links
Example 6
async def hello_world():
    service = services.Chromedriver()
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get('http://www.baidu.com/')
        search_box = await session.wait_for_element(5, '#kw')
        await search_box.send_keys('arsenic')
        await search_box.send_keys(keys.ENTER)
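To run the coroutine above you only need an event loop and the usual arsenic imports (chromedriver must be on PATH):

import asyncio
from arsenic import get_session, keys, browsers, services

if __name__ == '__main__':
    asyncio.run(hello_world())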
Example 7
 def __init__(self, bot):
     self.bot: commands.Bot = bot
     self.browser = browsers.Chrome(**{
         "goog:chromeOptions": {"args": ["--headless", "--disable-gpu"]}
     })
     self.service = services.Chromedriver(binary=DRIVER)
Example 8
 async def get_browser(self):
     service = services.Chromedriver(
         binary="D:/Program Files/BitWebV3.0/Chrome/chromedriver.exe")
     browser = browsers.Chrome()
     for i in range(5):
         session = await start_session(service=service, browser=browser)
         info = PageInfo(session)
         self.pages.append(info)
Example 9
	def __init__(self, bot):
		self.bot = bot
		self.responses = {}
		self.google = gassistant
		self.service = services.Chromedriver(log_file=DEVNULL)
		self.browser = browsers.Chrome(chromeOptions={
			'args': ['--headless', '--disable-gpu', '--log-file=/dev/null']
		})
Example 10
async def session(server):
    session = await start_session(
        services.Chromedriver(binary=CHROME_DRIVER_PATH),
        browsers.Chrome(chromeOptions={'args': ['--headless', '--disable-gpu']}),
        bind=SERVER_ADDRESS)
    try:
        yield session
    finally:
        await stop_session(session)
Example 11
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        print(body)
        return body
Example 12
    def __init__(self):
        self.BINARY_PATH = Config.BINARY_PATH

        capabilities = {'acceptSslCerts': True, 'acceptInsecureCerts': True}
        chromeOptions = {'args': [
            '--headless', '--no-sandbox', '--disable-dev-shm-usage', 
            '--disable-gpu', '--disable-setuid-sandbox', '--lang=ko_KR',
        ]}
        
        self.service = services.Chromedriver(binary=self.BINARY_PATH['CHROME_DRIVER'], log_file=os.devnull)
        self.browser = browsers.Chrome(chromeOptions=chromeOptions, **capabilities)
Example 13
async def scraper(url, i=-1, timeout=60, start=None, body_delay=10):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        if body_delay > 0:
            await asyncio.sleep(body_delay)
        body = await session.get_page_source()
        return body
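Note that this version returns an empty list on timeout but a raw HTML string otherwise, so callers have to tell the two cases apart; a small sketch:

import asyncio

async def fetch(url):
    body = await scraper(url, timeout=30, body_delay=5)
    if not body:          # [] when session.get() timed out
        return None
    return body           # raw page source

html = asyncio.run(fetch("https://example.com"))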
Example 14
async def scraper(url: str) -> str:
    """Scrapes the HTML of the passed URL using arsenic webdriver."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
Example 15
async def scraper(url: str):
    """Returns the HTML of the passed URL using arsenic webdriver."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    # creating an arsenic session and running it inside of a context manager.
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
Example 16
async def scraper_all(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.findAll("article", {"class": "serp-item list"})
        for l in box:
            try:
                link = l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('h2').find('a').getText()
                time = l.find('time').getText()
                price = l.find('strong', {'class': 'item-price'}).getText()

            except:
                link = ''
                img = ''
                name = ''
                price = ''
                time = ''

            try:
                region = l.find('div', {
                    'class': 'content'
                }).findAll('p')[1].getText()
            except:
                region = ''

            products.append({
                'link': link,
                'img': img,
                'name': name,
                'price': price,
                'time': time,
                'region': region
            })
        return products
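A sketch of consuming the product list, for instance dumping it to JSON; the URL and output path are placeholders:

import asyncio
import json

async def dump(url, path="products.json"):
    products = await scraper_all(url)
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(products, fh, ensure_ascii=False, indent=2)
    return len(products)

count = asyncio.run(dump("https://example.com/listings"))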
Example 17
async def get_google_answer_text(url_text):
    msg = None
    service = services.Chromedriver()
    browser = browsers.Chrome(**{"goog:chromeOptions": CHROME_OPTIONS})
    try:
        async with get_session(service, browser) as session:
            await session.get(
                f"https://www.google.com/search?hl=en&gl=UK&q={url_text}")
            msg = await get_financial_box_text(session)
            if not msg:
                msg = await get_kp_box_text(session)
            if not msg:
                msg = await get_kc_box_text(session)
    except:
        msg = None
        traceback.print_exc()
    return msg
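`CHROME_OPTIONS` is defined elsewhere in that project; judging from the other examples on this page it is presumably a dict of headless Chrome flags, something like the following guess:

# hypothetical value, mirroring the flags used throughout these examples
CHROME_OPTIONS = {"args": ["--headless", "--disable-gpu", "--no-sandbox"]}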
Example 18
async def incredible(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("div",
                        {"class": "flash-product-wrapper bani-container"})
        li = box.findAll("a", {'class': 'flash-product'})
        for l in li:
            link = l['href']
            img = l.find('div', {'class': 'img-box'}).find('img')['src']
            name = l.find('p', {'class': 'f-p-name'}).getText()
            price = l.find('span', {'class': 'specific-price price'}).getText()
            brand = l.find('p', {'class': 'f-p-logo'}).getText()

            try:
                discount = l.find('div', {
                    'class': 'discount'
                }).find('p').getText()
                last_price = l.find('span', {
                    'class': 'old-price price'
                }).getText()
            except:
                discount = ''
                last_price = ''

            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': last_price,
                'price': price,
                'brand': brand
            })
        return products
Example 19
async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source()  # save this locally??
        content = await get_parsable_html(body)
        links = await get_links(content)
        product_data = await get_product_data(url, content)
        if start is not None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        dataset = {"links": links, "product_data": product_data}
        return dataset
Example 20
async def main():
    load_dotenv()

    service = services.Chromedriver(log_file=os.devnull)
    browser = browsers.Chrome()

    telegram_client = TelegramClient(token=os.getenv("TG_TOKEN"),
                                     default_channel=os.getenv("TG_CHANNEL"))
    loguru_client = LoguruClient()
    messenger = Messenger([loguru_client, telegram_client])

    async with get_session(service, browser) as session:
        extractor = CUExtractor(session)
        memory = Memory()

        while True:
            items = await extractor.extract()
            added: List[Entry] = memory.update(items)[0]
            for entry in added:
                messenger.send(entry.to_markdown())
            await asyncio.sleep(DELAY)
Example 21
    async def qdocs(self, ctx, arg):
        if len(arg) > self.limit:
            return await ctx.send(f'`Query length greater than {self.limit}`')
        query_url = f'https://qiskit.org/documentation/search.html?q={arg}&check_keywords=yes&area=default#'

        service = services.Chromedriver()
        browser = browsers.Chrome()
        browser.capabilities = {
            "goog:chromeOptions": {
                "args": ["--headless", "--disable-gpu"]
            }
        }
        async with get_session(service, browser) as session:
            try:
                await session.get(query_url)
            except asyncio.TimeoutError:
                return await ctx.send('`Failed | Time Limit Exceeded`')
            else:
                source = None
                try:
                    source = await asyncio.wait_for(session.get_page_source(),
                                                    timeout=10)
                except asyncio.TimeoutError:
                    return await ctx.send('`Failed | Time Limit Exceeded`')
                else:
                    soup = BeautifulSoup(source, 'html.parser')
                    summary = soup.select('.search')
                    res = []
                    description = f''
                    for li in summary[0].find_all('li'):
                        link = li.find('a', href=True)
                        res.append(
                            f'[`{link.contents[0]}`]({self.render_link + link["href"]})'
                        )

                    embed = discord.Embed(title=f'`Results for: {arg}`',
                                          description='\n'.join(res),
                                          color=0xe8e3e3)

                    return await ctx.send(embed=embed)
Example 22
 def __init__(
     self, url=None, height=1000, width=1000, file_name="screenshot.png", loop=None
 ):
     fix_arsenic_log()
     self.url = url
     self.height = height
     self.width = width
     self.file_name = file_name
     if loop is None:
         self.loop = asyncio.get_event_loop()
     else:
         self.loop = loop
     service = services.Chromedriver()
     browser = browsers.Chrome(
         **{
             "goog:chromeOptions": {
                 "args": ["--headless", "--no-sandbox", "--disable-dev-shm-usage"],
                 "w3c": True,
             }
         }
     )
     self.session: Session = self.loop.run_until_complete(start_session(service, browser))
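`fix_arsenic_log` is not shown in this snippet; arsenic logs through structlog, and a commonly used silencer looks roughly like this (an assumption about what the helper does, not the original code):

import logging
import structlog

def fix_arsenic_log(level=logging.WARNING):
    # assumed behaviour: route arsenic's structlog output to stdlib logging
    # and raise the level so per-request noise is dropped
    structlog.configure(logger_factory=structlog.stdlib.LoggerFactory())
    logging.getLogger('arsenic').setLevel(level)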
Example 23
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {"args": ["--headless", "--disable-gpu"]}
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url),timeout=100)
        
        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("ul", {"class":"category_styles_product_card_list__1Xocv"})
        li = box.findAll("li")
        for l in li:
            try:
                link = 'https://timcheh.com' + l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('h3').getText()
                price = l.find('div', {'class':'styles_price__cldWW'}).getText()
            except:
                link = ''
                img = ''
                name = ''
                price = ''
            try:
                discount = l.find('div', {'class':'styles_discount_number__39goM'}).find('span').getText()
                old_price = l.find('div', {'class':'styles_old_price__35bDJ'}).getText()
            except:
                discount = ''
                old_price = ''
            try:
                bonous = l.find('span', {'class':'styles_caption__3SE4x'}).getText()
            except:
                bonous = ''
            
            products.append({'link':link, 'img':img, 'name':name, 'discount':discount, 'last_price':old_price,
                             'price':price, 'bonous':bonous})
        return products
Example 24
async def scraper(url: str, i=-1, timeout: int = 60, delay: int = 10) -> str:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns the raw page source (an empty list is returned if the page times out)."""

    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }

    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []

        if delay > 0:
            await asyncio.sleep(delay)
        body = await session.get_page_source()  # getting raw HTML

        return body
Example 25
async def scraper_all(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("div", {"id": "product_list"})
        li = box.findAll("article")
        for l in li:
            try:
                link = 'https://banimode.com' + l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('span', {'class': 'product-card-name'}).getText()
                price = l.find('span', {'class': 'price-disgit'}).getText()
                sizes = [(s.find('a').getText()).replace(' ',
                                                         '').replace('\n', '')
                         for s in l.find('ul', {
                             'class': 'product-card-size'
                         }).findAll('li')]
                brand = l.find('span', {
                    'class': 'product-card-brand'
                }).getText()

            except:
                link = ''
                img = ''
                name = ''
                price = ''
                sizes = []
                brand = ''
            try:
                discount = l.find('span', {
                    'class': 'product-card-discount'
                }).getText()
                last_price = l.find('span', {
                    'class': 'product-card-lastprice'
                }).getText()
            except:
                discount = ''
                last_price = ''
            try:
                bonous = (l.find('span', {
                    'class': 'product-card-size-tag'
                }).getText()).replace('  ', '').replace('\n', '')
            except:
                bonous = ''

            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': last_price,
                'price': price,
                'bonous': bonous,
                'sizes': sizes,
                'brand': brand
            })
        return products
Example 26
 def __init__(self):
     self.service = services.Chromedriver()
     self.browser = browsers.Chrome(chromeOptions={
         'args': ['--headless', '--disable-gpu']
     })
     self.request_semaphore = asyncio.Semaphore(5)
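A sketch of how the semaphore above is typically used to cap the number of concurrent Chrome sessions at five; the method name and URL handling are hypothetical:

    async def fetch(self, url: str) -> str:
        # at most 5 sessions are alive at once because of request_semaphore
        async with self.request_semaphore:
            async with get_session(self.service, self.browser) as session:
                await session.get(url)
                return await session.get_page_source()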
Example 27
async def scraper(url):
	service = services.Chromedriver()
	browser = browsers.Chrome()
	async with get_session(service, browser) as session:
		await session.get(url)
		return await session.get_page_source()
Example 28
    async def on_call(self, ctx, args, **flags):
        m = await ctx.send('Taking screenshot...')

        url = args[1]
        if url.startswith('<') and url.endswith('>'):
            url = url[1:-1]
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url

        proxy = random.choice(list(self.bot.proxies.keys()))

        try:
            async with self.bot.sess.head(url, timeout=15, proxy=proxy) as r:
                if (r.content_length or 0) > 100000000:
                    return await self.bot.edit_message(
                        m, 'Rejected to navigate')
        except Exception:
            return await self.bot.edit_message(m, 'Connection timeout')

        await self.lock.acquire()
 
        try:
            service = services.Chromedriver(log_file=devnull)
            browser = browsers.Chrome(
                chromeOptions={
                    'args': ['--headless', '--disable-gpu', f'--proxy-server={proxy}']
                }
            )

            async with get_session(service, browser) as session:
                await session.set_window_size(1920, 1080)
                await session.get(url)
                opened_url = await session.get_url()
                await asyncio.sleep(2)
                screenshot = await session.get_screenshot()
        except UnknownArsenicError:
            await self.bot.edit_message(
                m, 'Unknown exception happened')
            return
        except Exception:
            return await self.bot.edit_message(
                m, 'Could not open page, please check url and try again')
        finally:
            try:
                self.lock.release()
            except Exception:
                pass

        try:
            title = opened_url.split('/')[2]
        except IndexError:
            title = "Screenshot"

        e = Embed(title=title[:256], colour=Colour.gold(), url=url)
        e.set_image(url='attachment://screenshot.png')

        f = File(screenshot, filename='screenshot.png')
        e.set_footer(
            text=f'Took {round(time.time() - (m.created_at or m.edited_at).timestamp(), 1)} seconds')

        await self.bot.delete_message(m)
        await ctx.send(embed=e, file=f)
Example 29
async def scraper_all(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)

        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.findAll("div", {"class": "cp-card cp-card--product-card"})
        for l in box:
            try:
                link = 'https://www.digistyle.com' + l.find('a',
                                                            href=True)['href']
                img = l.find('img')['src']
                name = l.find('div', {
                    'class': 'cp-card__footer'
                }).find('a').getText().replace('\n', '').replace('  ', '')
                price = l.find(
                    'div', {
                        'class':
                        'c-product-card__selling-price c-product-card__currency'
                    }).getText().replace('\n', '').replace('  ', '')
                # sizes = [(s.find('a').getText()).replace(' ','').replace('\n','') for s in l.find('ul', {'class':'product-card-size'}).findAll('li')]
                brand = l.find('div', {
                    'class': 'c-product-card__brand'
                }).getText()

            except:
                link = ''
                img = ''
                name = ''
                price = ''
                # sizes = []
                brand = ''

            try:
                tak_size = l.find('div', {
                    'class': 'c-product-card__badge'
                }).getText()
            except:
                tak_size = ''

            try:
                discount = l.find('div', {
                    'class': 'c-product-card__discount'
                }).getText().replace('\n', '').replace('  ', '')
                last_price = l.find('del', {
                    'class': 'c-product-card__rrp-price'
                }).getText().replace('\n', '').replace('  ', '')
            except:
                discount = ''
                last_price = ''

            products.append({
                'link': link,
                'img': img,
                'brand': brand,
                'name': name,
                'tak_size': tak_size,
                'price': price,
                'last_price': last_price,
                'discount': discount
            })
        return products