async def session(request):
    global time_wait
    config = load_config(request.config.getoption('--config-file'))
    if 'db_path' in config:
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, 'tmp_pythonz_db')
        db_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), config['db_path']))
        if not os.path.isfile(temp_path):
            shutil.copy2(db_path, temp_path)
    if config['browser'] == 'firefox':
        service = services.Geckodriver()
        browser = browsers.Firefox()
    elif config['browser'] == 'chrome':
        service = services.Chromedriver()
        browser = browsers.Chrome()
    else:
        raise ValueError(f"Unrecognized browser {config['browser']}")
    session = await start_session(service, browser, bind=config['base_url'])
    # configure the wait time for each worker
    slaveinput = getattr(request.config, 'slaveinput', None)
    if slaveinput is None:  # running in a single worker
        session.time_wait = 0
    else:
        session.time_wait = slaveinput['time_wait']
    try:
        yield session
    finally:
        await stop_session(session)
        if 'db_path' in config:
            if os.path.isfile(temp_path):
                shutil.copy2(temp_path, db_path)
                os.remove(temp_path)
async def open_driver(proxy=None):
    chromeOptions = {
        "args": [
            "enable-automation",
            "--headless",
            "--disable-gpu",
            "--no-sandbox",
            "--disable-infobars",
            "--disable-dev-shm-usage",
            "--disable-browser-side-navigation",
        ]
    }
    if proxy and False:  # TODO arsenic proxy?
        prox = Proxy()
        prox.proxy_type = ProxyType.MANUAL
        prox.http_proxy = proxy
        prox.socks_proxy = proxy
        prox.ssl_proxy = proxy
        prox.add_to_capabilities(capabilities)
    if "SELENIUM_URL" in os.environ:
        driver = await start_session(
            services.Remote(os.getenv("SELENIUM_URL")),
            browsers.Chrome(chromeOptions=chromeOptions),
        )
    else:
        driver = await start_session(
            services.Chromedriver(),
            browsers.Chrome(chromeOptions=chromeOptions))
    return driver
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> dict:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns a dict with the extracted links and product data."""
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }
    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source()  # getting raw HTML
        html_r = await get_parsable_html(body)  # converting to parsable HTML
        links = await get_fabric_links(html_r)  # getting relative links
        product_data = await get_product_data(url, html_r)
        dataset = {"links": links, "product_data": product_data}
        # _____________printing time consumption_________________#
        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")
        return dataset
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> list:
    """Scrapes the HTML of the passed URL using arsenic webdriver.
    Returns a list of dictionaries, with product id, slug, link."""
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {'args': ['--headless', '--disable-gpu']}
    }
    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        body = await session.get_page_source()
        links = await get_fabric_links(body)
        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")
        return links
async def get_browser(self):
    service = services.Chromedriver(
        binary="D:/Program Files/BitWebV3.0/Chrome/chromedriver.exe")
    browser = browsers.Chrome()
    for i in range(5):
        session = await start_session(service=service, browser=browser)
        info = PageInfo(session)
        self.pages.append(info)
def __init__(self, bot):
    self.bot = bot
    self.responses = {}
    self.google = gassistant
    self.service = services.Chromedriver(log_file=DEVNULL)
    self.browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu', '--log-file=/dev/null']
    })
async def hello_world():
    service = services.Chromedriver()
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get('http://www.baidu.com/')
        search_box = await session.wait_for_element(5, '#kw')
        await search_box.send_keys('arsenic')
        await search_box.send_keys(keys.ENTER)
def __init__(self, bot):
    self.bot: commands.Bot = bot
    self.browser = browsers.Chrome(
        **{"goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }})
    self.service = services.Chromedriver(binary=DRIVER)
async def session(server):
    session = await start_session(
        services.Chromedriver(binary=CHROME_DRIVER_PATH),
        browsers.Chrome(chromeOptions={'args': ['--headless', '--disable-gpu']}),
        bind=SERVER_ADDRESS)
    try:
        yield session
    finally:
        await stop_session(session)
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        print(body)
        return body
def __init__(self):
    self.BINARY_PATH = Config.BINARY_PATH
    capabilities = {'acceptSslCerts': True, 'acceptInsecureCerts': True}
    chromeOptions = {'args': [
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-setuid-sandbox',
        '--lang=ko_KR',
    ]}
    self.service = services.Chromedriver(
        binary=self.BINARY_PATH['CHROME_DRIVER'], log_file=os.devnull)
    self.browser = browsers.Chrome(chromeOptions=chromeOptions, **capabilities)
async def create_source_selenium(url: str, proxy_list: list = None) -> str:
    service = services.Chromedriver(binary="./chromedriver")
    if proxy_list is not None:
        browser = browsers.Chrome(chromeOptions={
            'args': ['--headless', f"--proxy-server={random.choice(proxy_list)}"]
        })
    else:
        browser = browsers.Chrome(chromeOptions={'args': ['--headless']})
    async with get_session(service, browser) as session:
        await session.get(url)
        return await session.get_page_source()
async def scraper(url, i=-1, timeout=60, start=None, body_delay=10):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        if body_delay > 0:
            await asyncio.sleep(body_delay)
        body = await session.get_page_source()
        return body
async def scraper(url: str) -> str:
    """Scrapes the HTML of the passed URL using arsenic webdriver."""
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
async def scraper(url: str):
    """Returns the HTML of the passed URL using arsenic webdriver."""
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }
    # creating an arsenic session and running it inside of a context manager.
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
async def get_google_answer_text(url_text):
    msg = None
    service = services.Chromedriver()
    browser = browsers.Chrome(**{"goog:chromeOptions": CHROME_OPTIONS})
    try:
        async with get_session(service, browser) as session:
            await session.get(
                f"https://www.google.com/search?hl=en&gl=UK&q={url_text}")
            msg = await get_financial_box_text(session)
            if not msg:
                msg = await get_kp_box_text(session)
            if not msg:
                msg = await get_kc_box_text(session)
    except:
        msg = None
        traceback.print_exc()
    return msg
async def scraper_all(url): service = services.Chromedriver() browser = browsers.Chrome() browser.capabilities = { "goog:chromeOptions": { "args": ["--headless", "--disable-gpu"] } } async with get_session(service, browser) as session: await asyncio.wait_for(session.get(url), timeout=100) body = await session.get_page_source() soup = BeautifulSoup(body, 'html.parser') products = [] box = soup.findAll("article", {"class": "serp-item list"}) for l in box: try: link = l.find('a', href=True)['href'] img = l.find('img')['src'] name = l.find('h2').find('a').getText() time = l.find('time').getText() price = l.find('strong', {'class': 'item-price'}).getText() except: link = '' img = '' name = '' price = '' time = '' try: region = l.find('div', { 'class': 'content' }).findAll('p')[1].getText() except: region = '' products.append({ 'link': link, 'img': img, 'name': name, 'price': price, 'time': time, 'region': region }) return products
async def incredible(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)
        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("div", {"class": "flash-product-wrapper bani-container"})
        li = box.findAll("a", {'class': 'flash-product'})
        for l in li:
            link = l['href']
            img = l.find('div', {'class': 'img-box'}).find('img')['src']
            name = l.find('p', {'class': 'f-p-name'}).getText()
            price = l.find('span', {'class': 'specific-price price'}).getText()
            brand = l.find('p', {'class': 'f-p-logo'}).getText()
            try:
                discount = l.find('div', {
                    'class': 'discount'
                }).find('p').getText()
                last_price = l.find('span', {
                    'class': 'old-price price'
                }).getText()
            except:
                discount = ''
                last_price = ''
            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': last_price,
                'price': price,
                'brand': brand
            })
        return products
async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source()  # save this locally??
        content = await get_parsable_html(body)
        links = await get_links(content)
        product_data = await get_product_data(url, content)
        if start is not None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        dataset = {"links": links, "product_data": product_data}
        return dataset
async def main():
    load_dotenv()
    service = services.Chromedriver(log_file=os.devnull)
    browser = browsers.Chrome()
    telegram_client = TelegramClient(token=os.getenv("TG_TOKEN"),
                                     default_channel=os.getenv("TG_CHANNEL"))
    loguru_client = LoguruClient()
    messenger = Messenger([loguru_client, telegram_client])
    async with get_session(service, browser) as session:
        extractor = CUExtractor(session)
        memory = Memory()
        while True:
            items = await extractor.extract()
            added: List[Entry] = memory.update(items)[0]
            for entry in added:
                messenger.send(entry.to_markdown())
            await asyncio.sleep(DELAY)
async def qdocs(self, ctx, arg):
    if len(arg) > self.limit:
        return await ctx.send(f'`Query length greater than {self.limit}`')
    query_url = f'https://qiskit.org/documentation/search.html?q={arg}&check_keywords=yes&area=default#'
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        try:
            await session.get(query_url)
        except asyncio.TimeoutError:
            return await ctx.send('`Failed | Time Limit Exceeded`')
        else:
            source = None
            try:
                source = await asyncio.wait_for(session.get_page_source(),
                                                timeout=10)
            except asyncio.TimeoutError:
                return await ctx.send('`Failed | Time Limit Exceeded`')
            else:
                soup = BeautifulSoup(source, 'html.parser')
                summary = soup.select('.search')
                res = []
                for li in summary[0].find_all('li'):
                    link = li.find('a', href=True)
                    res.append(
                        f'[`{link.contents[0]}`]({self.render_link + link["href"]})'
                    )
                embed = discord.Embed(title=f'`Results for: {arg}`',
                                      description='\n'.join(res),
                                      color=0xe8e3e3)
                return await ctx.send(embed=embed)
def __init__(
    self, url=None, height=1000, width=1000, file_name="screenshot.png", loop=None
):
    fix_arsenic_log()
    self.url = url
    self.height = height
    self.width = width
    self.file_name = file_name
    if loop is None:
        self.loop = asyncio.get_event_loop()
    else:
        self.loop = loop
    service = services.Chromedriver()
    browser = browsers.Chrome(
        **{
            "goog:chromeOptions": {
                "args": ["--headless", "--no-sandbox", "--disable-dev-shm-usage"],
                "w3c": True,
            }
        }
    )
    # use self.loop so this also works when no loop was passed in
    self.session: Session = self.loop.run_until_complete(
        start_session(service, browser))
async def scraper(url): service = services.Chromedriver() browser = browsers.Chrome() browser.capabilities = { "goog:chromeOptions": {"args": ["--headless", "--disable-gpu"]} } async with get_session(service, browser) as session: await asyncio.wait_for(session.get(url),timeout=100) body = await session.get_page_source() soup = BeautifulSoup(body, 'html.parser') products = [] box = soup.find("ul", {"class":"category_styles_product_card_list__1Xocv"}) li = box.findAll("li") for l in li: try: link = 'https://timcheh.com' + l.find ('a', href=True)['href'] img = l.find('img')['src'] name = l.find('h3').getText() price = l.find('div', {'class':'styles_price__cldWW'}).getText() except: link = '' img = '' name = '' price = '' try: discount = l.find('div', {'class':'styles_discount_number__39goM'}).find('span').getText() old_price = l.find('div', {'class':'styles_old_price__35bDJ'}).getText() except: discount = '' old_price = '' try: bonous = l.find('span', {'class':'styles_caption__3SE4x'}).getText() except: bonous = '' products.append({'link':link, 'img':img, 'name':name, 'discount':discount, 'last_price':old_price, 'price':price, 'bonous':bonous}) return products
async def scraper(url: str, i=-1, timeout: int = 60, delay: int = 10) -> str:
    """Scrapes the passed URL using arsenic webdriver and returns its raw HTML."""
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }
    async with get_session(service, browser) as session:
        # if the page doesn't respond, return an empty URLs list.
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        if delay > 0:
            await asyncio.sleep(delay)
        body = await session.get_page_source()  # getting raw HTML
        return body
async def scraper_all(url): service = services.Chromedriver() browser = browsers.Chrome() browser.capabilities = { "goog:chromeOptions": { "args": ["--headless", "--disable-gpu"] } } async with get_session(service, browser) as session: await asyncio.wait_for(session.get(url), timeout=100) body = await session.get_page_source() soup = BeautifulSoup(body, 'html.parser') products = [] box = soup.find("div", {"id": "product_list"}) li = box.findAll("article") for l in li: try: link = 'https://banimode.com' + l.find('a', href=True)['href'] img = l.find('img')['src'] name = l.find('span', {'class': 'product-card-name'}).getText() price = l.find('span', {'class': 'price-disgit'}).getText() sizes = [(s.find('a').getText()).replace(' ', '').replace('\n', '') for s in l.find('ul', { 'class': 'product-card-size' }).findAll('li')] brand = l.find('span', { 'class': 'product-card-brand' }).getText() except: link = '' img = '' name = '' price = '' sizes = [] brand = '' try: discount = l.find('span', { 'class': 'product-card-discount' }).getText() last_price = l.find('span', { 'class': 'product-card-lastprice' }).getText() except: discount = '' last_price = '' try: bonous = (l.find('span', { 'class': 'product-card-size-tag' }).getText()).replace(' ', '').replace('\n', '') except: bonous = '' products.append({ 'link': link, 'img': img, 'name': name, 'discount': discount, 'last_price': last_price, 'price': price, 'bonous': bonous, 'sizes': sizes, 'brand': brand }) return products
def __init__(self):
    self.service = services.Chromedriver()
    self.browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    self.request_semaphore = asyncio.Semaphore(5)
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
async def on_call(self, ctx, args, **flags):
    m = await ctx.send('Taking screenshot...')
    url = args[1]
    if url.startswith('<') and url.endswith('>'):
        url = url[1:-1]
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    proxy = random.choice(list(self.bot.proxies.keys()))
    try:
        async with self.bot.sess.head(url, timeout=15, proxy=proxy) as r:
            if (r.content_length or 0) > 100000000:
                return await self.bot.edit_message(
                    m, 'Rejected to navigate')
    except Exception:
        return await self.bot.edit_message(m, 'Connection timeout')
    await self.lock.acquire()
    try:
        service = services.Chromedriver(log_file=devnull)
        browser = browsers.Chrome(
            chromeOptions={
                'args': ['--headless', '--disable-gpu', f'proxy-server={proxy}']
            }
        )
        async with get_session(service, browser) as session:
            await session.set_window_size(1920, 1080)
            await session.get(url)
            opened_url = await session.get_url()
            await asyncio.sleep(2)
            screenshot = await session.get_screenshot()
    except UnknownArsenicError:
        await self.bot.edit_message(
            m, 'Unknown exception happened')
        return
    except Exception:
        return await self.bot.edit_message(
            m, 'Could not open page, please check url and try again')
    finally:
        try:
            self.lock.release()
        except Exception:
            pass
    try:
        title = opened_url.split('/')[2]
    except IndexError:
        title = "Screenshot"
    e = Embed(title=title[:256], colour=Colour.gold(), url=url)
    e.set_image(url='attachment://screenshot.png')
    f = File(screenshot, filename='screenshot.png')
    e.set_footer(
        text=f'Took {round(time.time() - (m.created_at or m.edited_at).timestamp(), 1)} seconds')
    await self.bot.delete_message(m)
    await ctx.send(embed=e, file=f)
async def scraper_all(url): service = services.Chromedriver() browser = browsers.Chrome() browser.capabilities = { "goog:chromeOptions": { "args": ["--headless", "--disable-gpu"] } } async with get_session(service, browser) as session: await asyncio.wait_for(session.get(url), timeout=100) body = await session.get_page_source() soup = BeautifulSoup(body, 'html.parser') products = [] box = soup.findAll("div", {"class": "cp-card cp-card--product-card"}) for l in box: try: link = 'https://www.digistyle.com' + l.find('a', href=True)['href'] img = l.find('img')['src'] name = l.find('div', { 'class': 'cp-card__footer' }).find('a').getText().replace('\n', '').replace(' ', '') price = l.find( 'div', { 'class': 'c-product-card__selling-price c-product-card__currency' }).getText().replace('\n', '').replace(' ', '') # sizes = [(s.find('a').getText()).replace(' ','').replace('\n','') for s in l.find('ul', {'class':'product-card-size'}).findAll('li')] brand = l.find('div', { 'class': 'c-product-card__brand' }).getText() except: link = '' img = '' name = '' price = '' # sizes = [] brand = '' try: tak_size = l.find('div', { 'class': 'c-product-card__badge' }).getText() except: tak_size = '' try: discount = l.find('div', { 'class': 'c-product-card__discount' }).getText().replace('\n', '').replace(' ', '') last_price = l.find('del', { 'class': 'c-product-card__rrp-price' }).getText().replace('\n', '').replace(' ', '') except: discount = '' last_price = '' products.append({ 'link': link, 'img': img, 'brand': brand, 'name': name, 'tak_size': tak_size, 'price': price, 'last_price': last_price, 'discount': discount }) return products