async def build_profile(self, device, headless):
    scenario = self.scenario
    profile = self.env.profile
    customization_data = self.customization_data
    scenario_func = scenarii[scenario]
    if scenario in customization_data.get("scenario", {}):
        options = customization_data["scenario"][scenario]
        LOG("Loaded options for that scenario %s" % str(options))
    else:
        options = {}

    # Adding general options
    options["platform"] = self.env.target_platform

    if not self.force_new:
        try:
            custom_name = customization_data["name"]
            get_profile(profile, self.env.target_platform, scenario, custom_name)
        except ProfileNotFoundError:
            # XXX we'll use a fresh profile for now
            fresh_profile(profile, customization_data)
    else:
        fresh_profile(profile, customization_data)

    LOG("Updating profile located at %r" % profile)
    metadata = Metadata(profile)

    LOG("Starting the Gecko app...")
    self.env.prepare(logfile=self._log_filename("adb"))
    geckodriver_logs = self._log_filename("geckodriver")
    LOG("Writing geckodriver logs in %s" % geckodriver_logs)
    try:
        firefox_instance = Firefox(**self.env.get_browser_args(headless))
        with open(geckodriver_logs, "w") as glog:
            async with get_session(
                self.env.get_geckodriver(log_file=glog), firefox_instance
            ) as session:
                self.env.check_session(session)
                LOG("Running the %s scenario" % scenario)
                metadata.update(await scenario_func(session, options))
                LOG("%s scenario done." % scenario)
    except Exception:
        ERROR("%s scenario broke!" % scenario)

    self.env.stop_browser()
    self.env.collect_profile()

    # writing metadata
    metadata.write(
        name=self.scenario,
        customization=self.customization_data["name"],
        version=self.env.get_browser_version(),
        platform=self.env.target_platform,
    )

    LOG("Profile at %s" % profile)
    LOG("Done.")
    return metadata
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> list:
    """Scrapes the HTML of the passed URL using the arsenic webdriver.

    Returns a list of dictionaries with product id, slug and link.
    """
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {'args': ['--headless', '--disable-gpu']}
    }
    async with get_session(service, browser) as session:
        # if the page doesn't respond in time, return an empty list of links
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        body = await session.get_page_source()
        links = await get_fabric_links(body)
        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")
        return links
async def scraper(url: str, i=-1, timeout: int = 60, start=None) -> dict:
    """Scrapes the HTML of the passed URL using the arsenic webdriver.

    Returns a dict with the relative links and the product data found on the page.
    """
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }
    async with get_session(service, browser) as session:
        # if the page doesn't respond in time, return an empty result
        try:
            # use the timeout parameter (it was previously hard-coded to 60)
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)  # give client-side rendering time to finish
        body = await session.get_page_source()           # raw HTML
        html_r = await get_parsable_html(body)            # converting to parsable HTML
        links = await get_fabric_links(html_r)            # getting relative links
        product_data = await get_product_data(url, html_r)
        dataset = {"links": links, "product_data": product_data}
        # --------------- printing time consumption ---------------
        if start is not None:
            end = time.time() - start
            print(f"{i} took {end} seconds")
        return dataset
async def google(self, ctx, *, query):
    await ctx.channel.trigger_typing()
    try:
        await self.bot.loop.run_in_executor(
            None, func=functools.partial(self.assist, ctx.author.id, query))
    except Exception as e:
        if 'text_query too long.' in str(e):
            return await ctx.error('That query is too long. Try something shorter')
        return await ctx.error('Something went wrong.')
    if ctx.author.id not in self.responses:
        return await ctx.send('<a:okaygoogle:661951491082551306> Something went wrong. Try again later')
    async with get_session(self.service, self.browser) as session:
        await session.set_window_size(1920, 1080)
        sub = 'devapi' if self.bot.dev else 'api'
        await session.get(f'https://{sub}.gaminggeek.dev/assist/{ctx.author.id}')
        try:
            await session.execute_script(
                'document.body.style.backgroundImage = \'url("https://picsum.photos/1920/1080")\';')
            namere = '<div class=\"show_text_content\">Your name is .*\.<\/div>'
            namesub = f'<div class=\'show_text_content\'>Your name is {ctx.author.name}.</div>'
            await session.execute_script(
                f'document.body.innerHTML = document.body.innerHTML.replace(/{namere}/gm, "{namesub}");')
            namere = '<div class=\"show_text_content\">I remember you telling me your name was .*\.<\/div>'
            namesub = f'<div class=\'show_text_content\'>I remember you telling me your name was {ctx.author.name}.</div>'
            await session.execute_script(
                f'document.body.innerHTML = document.body.innerHTML.replace(/{namere}/gm, "{namesub}");')
        except Exception:
            pass  # await ctx.error('script did an oopsie')
        await asyncio.sleep(1.5)
        await ctx.send(file=discord.File((await session.get_screenshot()), filename='google.png'))
        return await session.close()
    return await ctx.error('If you\'re seeing this, something went wrong I guess ¯\_(ツ)_/¯')
async def make_snapshot(self, website: str):
    if self.session is None:
        await self.init_session()
    while self.busy:
        await asyncio.sleep(1)
    async with get_session(self.service, self.browser) as session:
        self.busy = True
        await session.get(website)
        image = await session.get_screenshot()
        image.seek(0)
        # the webdriver session is closed automatically when the context manager exits
        headers = {"Authorization": "Client-ID 6656d64547a5031"}
        data = {"image": image}
        async with self.session.post("https://api.imgur.com/3/image",
                                     data=data, headers=headers) as r:
            link = (await r.json())["data"]["link"]
        del image
        self.busy = False
        return link
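# A hedged sketch of the init_session() helper that make_snapshot() above
# relies on; an aiohttp.ClientSession is an assumption inferred from the
# "async with self.session.post(...)" usage, not code from the original class.
import aiohttp


async def init_session(self):
    # shared HTTP session reused for the imgur upload requests
    self.session = aiohttp.ClientSession()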
async def visit(config):
    service = services.Geckodriver()
    browser = browsers.Firefox(
        **{"moz:firefoxOptions": {"args": ["-headless"]}})
    logging.info("Hitting url " + config["url"])
    try:
        async with get_session(service, browser) as session:
            await session.delete_all_cookies()
            await session.get(config["url"])
            for k, c in config.get("cookies", {}).items():
                value = c.get("value", "")
                domain = c.get("domain", None)
                path = c.get("path", "/")
                secure = c.get("secure", False)
                await session.add_cookie(k, value, path=path,
                                         domain=domain, secure=secure)
            await session.get(config["url"])
    except Exception as e:
        # exceptions have no .message attribute in Python 3; use str(e)
        logging.info("Exception hitting url " + str(config) +
                     " with exception " + str(e))
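# A hedged illustration (not from the original code) of the config shape that
# visit() above appears to expect: a "url" key plus an optional "cookies"
# mapping of cookie name -> {value, domain, path, secure}.
example_config = {
    "url": "https://example.com/",
    "cookies": {
        "consent": {
            "value": "accepted",
            "domain": "example.com",
            "path": "/",
            "secure": True,
        },
    },
}
# asyncio.run(visit(example_config))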
async def hello_world():
    service = services.Chromedriver()
    browser = browsers.Chrome()
    async with get_session(service, browser) as session:
        await session.get('http://www.baidu.com/')
        search_box = await session.wait_for_element(5, '#kw')
        await search_box.send_keys('arsenic')
        await search_box.send_keys(keys.ENTER)
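# A minimal driver sketch for the example above. The import line mirrors the
# names (services, browsers, get_session, keys) used throughout these snippets
# and assumes the standard arsenic package layout.
import asyncio

from arsenic import browsers, get_session, keys, services

if __name__ == '__main__':
    asyncio.run(hello_world())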
async def build_profile(args):
    scenarii = scenario[args.scenarii]

    # getting the latest archive from the server
    if TASK_CLUSTER:
        url = TC_LINK % args.scenarii
        basename = 'today-%s.tgz' % args.scenarii
    else:
        basename = '%s-latest.tar.gz' % args.scenarii
        url = args.archives_server + '/%s' % basename

    exists, headers = check_exists(url)
    metadata = {}
    if exists:
        target = os.path.join(args.archives_dir, basename)
        archive = download_file(url, target=target, check_file=False)
        with tarfile.open(archive, "r:gz") as tar:
            logger.msg("Checking the tarball content...")
            size = len(list(tar))
            with progress.Bar(expected_size=size) as bar:

                def _extract(self, *args, **kw):
                    if not TASK_CLUSTER:
                        bar.show(bar.last_progress + 1)
                    try:
                        return self.old(*args, **kw)
                    finally:
                        pass
                        # if args[0].name == ".hp.json":
                        #     import pdb; pdb.set_trace()

                tar.old = tar.extract
                tar.extract = functools.partial(_extract, tar)
                tar.extractall(args.profile)

    logger.msg("Updating profile located at %r" % args.profile)
    f_args = ["-profile", args.profile]
    if platform.system() != 'Darwin':
        f_args.append('-headless')
    caps = {"moz:firefoxOptions": {"args": f_args}}
    if args.firefox is not None:
        caps['moz:firefoxOptions']['binary'] = args.firefox

    logger.msg("Starting the Fox...")
    with open('gecko.log', 'a+') as glog:
        async with get_session(CustomGeckodriver(log_file=glog),
                               Firefox(**caps)) as session:
            metadata = await scenarii(session, args)

    # writing metadata
    logger.msg("Creating metadata...")
    metadata['name'] = args.scenarii
    with open(os.path.join(args.profile, '.hp.json'), 'w') as f:
        f.write(json.dumps(metadata))

    logger.msg("Done.")
async def fetch_content(self, queue=queue):
    """Coroutine that collects data from a vacancy page.

    The coroutine receives a queue of vacancy data. Its purpose is to gather
    vacancy information asynchronously and write it to the database.

    First it creates a database table for the data collected on the current
    date. It then enters an infinite loop waiting for an item (a list of
    vacancy data) from the queue. For each item it loads the vacancy page via
    the item's link and asynchronously extracts the company name, the required
    candidate experience, the employment type and the full vacancy
    description, then writes the collected data to the database. Receiving a
    None item from the queue is the loop's exit condition.

    This method works together with the get_links method: having obtained a
    link from the search-results page, get_links hands it to fetch_content,
    which starts loading the vacancy page and collecting information. Without
    waiting for that to finish, fetch_content picks up the next item from the
    queue and opens a new page from its link, and so on while the queue has
    items. Data is thus collected from all the links in the list at once,
    which cuts the total scraping time dramatically.
    """
    # create the table that stores today's vacancy data
    await engine.execute(CreateTable(HeadHunter_db))
    while True:
        item = await queue.get()
        if item is None:  # a None item means the queue is exhausted
            break
        async with get_session(self.service, self.browser) as web_session:
            # load the page for the link from the item and grab its elements
            await web_session.get(item[0])
            company_object = await web_session.get_element('span[itemprop=name]')
            # company name for the current link
            company = await company_object.get_text()
            experience_object = await web_session.get_element(
                'span[data-qa=vacancy-experience]')
            # required candidate experience for the current link
            experience = await experience_object.get_text()
            employment_mode_object = await web_session.get_element(
                'p[data-qa=vacancy-view-employment-mode]')
            # employment type for the current link
            employment_mode = await employment_mode_object.get_text()
            description_object = await web_session.get_element(
                'div[data-qa=vacancy-description]')  # closing bracket was missing in the selector
            # full vacancy description for the current link
            description = await description_object.get_text()
            async with engine.connect() as conn:
                async with conn.begin() as trans:
                    # write the collected data to the database
                    await conn.execute(HeadHunter_db.insert().values(
                        link=item[0],
                        title=item[1],
                        salary=item[2],
                        responsibilites_short=item[3],
                        requirements_short=item[4],
                        company=company,
                        experience=experience,
                        employment_mode=employment_mode,
                        description=description))
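# Assumed wiring (not from the original class) for the producer/consumer flow
# the docstring above describes: get_links fills the queue with vacancy items
# and ends the stream with None, while fetch_content consumes items as they
# arrive; both run concurrently.
import asyncio


async def run_scraper(scraper, queue):
    # the exact signatures of get_links/fetch_content are assumptions here
    await asyncio.gather(scraper.get_links(queue), scraper.fetch_content(queue))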
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        print(body)
        return body
async def hello_world():
    service = services.Geckodriver(binary=GECKODRIVER)
    browser = browsers.Firefox()
    async with get_session(service, browser) as session:
        await session.get("https://images.google.com/")
        search_box = await session.wait_for_element(5, "input[name=q]")
        await search_box.send_keys("Cats")
        await search_box.send_keys(keys.ENTER)
        await asyncio.sleep(10)
async def _fetch_articles(self, urls):
    async with get_session(self.service, self.browser) as session:
        tasks = []
        for url in urls:
            tasks.append(self._request_url(url, session))
        htmls = await asyncio.gather(*tasks)
        return htmls
async def crawl_news(self, news_site_url):
    async with get_session(self.service, self.browser) as session:
        html, _ = await self._request_url(news_site_url, session)
        tree = self._parse_etree_from_html(html)
        newslinks = self._parse_interesting_links_from_tree(tree)
        validated_newslinks = self._validate_links('https://yle.fi', newslinks)
        article_htmls = await self._fetch_articles(validated_newslinks)
        news_articles = self._parse_articles_from_htmls(article_htmls)
        return news_articles
async def create_source_selenium(url: str, proxy_list: list = None) -> str:
    service = services.Chromedriver(binary="./chromedriver")
    if proxy_list is not None:
        browser = browsers.Chrome(chromeOptions={
            'args': ['--headless', f"--proxy-server={random.choice(proxy_list)}"]
        })
    else:
        browser = browsers.Chrome(chromeOptions={'args': ['--headless']})
    async with get_session(service, browser) as session:
        await session.get(url)
        return await session.get_page_source()
async def scraper(url, i=-1, timeout=60, start=None, body_delay=10):
    service = services.Chromedriver()
    browser = browsers.Chrome(chromeOptions={
        'args': ['--headless', '--disable-gpu']
    })
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        if body_delay > 0:
            await asyncio.sleep(body_delay)
        body = await session.get_page_source()
        return body
async def scraper(url: str) -> str:
    """Scrapes the HTML of the passed URL using the arsenic webdriver."""
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        'goog:chromeOptions': {
            'args': ['--headless', '--disable-gpu']
        }
    }
    async with get_session(service, browser) as session:
        await session.get(url)
        body = await session.get_page_source()
        return body
async def scraper(url: str): """Returns the HTML of the passed URL using arsenic webdriver.""" service = services.Chromedriver() browser = browsers.Chrome() browser.capabilities = { 'goog:chromeOptions': { 'args': ['--headless', '--disable-gpu'] } } # creating an arsenic session and running it inside of a context manager. async with get_session(service, browser) as session: await session.get(url) body = await session.get_page_source() return body
async def search(self, quest):
    async with get_session(self.service, self.browser) as session:
        await session.get(self.link + quote(quest))
        await session.wait_for_element(3, '.sg-layout__box')
        source = await session.get_page_source()
        soup = BeautifulSoup(source, features="lxml")
        links = [
            link.get('href') for link in soup.find_all('a')
            if '/task/' in link.get('href')
        ]
        tasks = [links[n] for n in range(0, len(links), 2)]
        screen = await session.get_screenshot()
        await session.close()
        return tasks, self.crop(screen)
async def scraper_all(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)
        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.findAll("article", {"class": "serp-item list"})
        for l in box:
            try:
                link = l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('h2').find('a').getText()
                time = l.find('time').getText()
                price = l.find('strong', {'class': 'item-price'}).getText()
            except:
                link = ''
                img = ''
                name = ''
                price = ''
                time = ''
            try:
                region = l.find('div', {
                    'class': 'content'
                }).findAll('p')[1].getText()
            except:
                region = ''
            products.append({
                'link': link,
                'img': img,
                'name': name,
                'price': price,
                'time': time,
                'region': region
            })
        return products
async def get_google_answer_text(url_text):
    msg = None
    service = services.Chromedriver()
    browser = browsers.Chrome(**{"goog:chromeOptions": CHROME_OPTIONS})
    try:
        async with get_session(service, browser) as session:
            await session.get(
                f"https://www.google.com/search?hl=en&gl=UK&q={url_text}")
            msg = await get_financial_box_text(session)
            if not msg:
                msg = await get_kp_box_text(session)
            if not msg:
                msg = await get_kc_box_text(session)
    except:
        msg = None
        traceback.print_exc()
    return msg
async def get_remote_session(root_url: str): if "REMOTE_BROWSER" not in os.environ: raise pytest.skip("No remote browser configured (REMOTE_BROWSER)") if "REMOTE_SERVICE" not in os.environ: raise pytest.skip("No remote service configured (REMOTE_SERVICE)") if "BROWSERSTACK_API_KEY" not in os.environ: raise pytest.skip( "No browserstack api key configured (BROWSERSTACK_API_KEY)") remote_browser = json.loads(os.environ["REMOTE_BROWSER"]) browser_cls = getattr(browsers, remote_browser["browserName"]) with bsl_context(): async with get_session( services.Remote(url=os.environ["REMOTE_SERVICE"]), browser_cls(**remote_browser), root_url, ) as session: yield session
async def incredible(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)
        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("div", {"class": "flash-product-wrapper bani-container"})
        li = box.findAll("a", {'class': 'flash-product'})
        for l in li:
            link = l['href']
            img = l.find('div', {'class': 'img-box'}).find('img')['src']
            name = l.find('p', {'class': 'f-p-name'}).getText()
            price = l.find('span', {'class': 'specific-price price'}).getText()
            brand = l.find('p', {'class': 'f-p-logo'}).getText()
            try:
                discount = l.find('div', {
                    'class': 'discount'
                }).find('p').getText()
                last_price = l.find('span', {
                    'class': 'old-price price'
                }).getText()
            except:
                discount = ''
                last_price = ''
            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': last_price,
                'price': price,
                'brand': brand
            })
        return products
async def get_remote_session(root_url: str): if "REMOTE_BROWSER" not in os.environ: raise pytest.skip("No remote browser configured (REMOTE_BROWSER)") if "REMOTE_SERVICE" not in os.environ: raise pytest.skip("No remote service configured (REMOTE_SERVICE)") if "BROWSERSTACK_API_KEY" in os.environ: context = bsl_context else: context = null_context remote_browser = json.loads(os.environ["REMOTE_BROWSER"]) browser_cls = getattr(browsers, remote_browser.pop("type")) with context(): async with get_session( services.Remote(url=os.environ["REMOTE_SERVICE"]), browser_cls(**remote_browser), root_url, ) as session: yield session
async def google(self, ctx, *, query):
    await ctx.channel.trigger_typing()
    await self.bot.loop.run_in_executor(
        None, func=functools.partial(self.assist, ctx.author.id, query))
    if ctx.author.id not in self.responses:
        return await ctx.send('<a:okaygoogle:661951491082551306> Something went wrong. Try again later')
    async with get_session(self.service, self.browser) as session:
        await session.set_window_size(1366, 768)
        sub = 'devapi' if self.bot.dev else 'api'
        await session.get(f'https://{sub}.gaminggeek.dev/assist/{ctx.author.id}')
        try:
            await session.execute_script(
                'document.body.style.backgroundImage = \'url("https://picsum.photos/1366/768")\';')
        except Exception:
            pass  # await ctx.error('script did an oopsie')
        await asyncio.sleep(1.5)
        await ctx.send(file=discord.File((await session.get_screenshot()), filename='google.png'))
        return await session.close()
    return await ctx.error('If you\'re seeing this, something went wrong I guess ¯\_(ツ)_/¯')
async def fetch_content(self):
    """Coroutine that collects data from a vacancy page.

    The coroutine receives a queue of vacancy data. Its purpose is to gather
    vacancy information asynchronously and write it to the database.

    First it creates a database table for the data collected on the current
    date. It then enters an infinite loop waiting for an item (a list of
    vacancy data) from the queue. For each item it loads the vacancy page via
    the item's link and asynchronously extracts the full vacancy description.
    The collected data (vacancy link, title, salary, candidate requirements,
    company name, employment type and full description) is written to the
    database. Receiving a None item from the queue is the loop's exit
    condition.

    This method works together with the get_links method: having obtained a
    link from the search-results page, get_links hands it to fetch_content,
    which starts loading the vacancy page and collecting information. Without
    waiting for that to finish, fetch_content picks up the next item from the
    queue and opens a new page from its link, and so on while the queue has
    items. Data is thus collected from all the links in the list at once,
    which cuts the total scraping time dramatically.
    """
    # create the table that stores today's vacancy data
    await engine.execute(CreateTable(MoiKrug_db))
    while True:
        # wait for the next item to appear in the queue
        item = await self.queue.get()
        if item is None:  # a None item marks the end of the queue
            break
        async with get_session(self.service, self.browser) as web_session:
            await web_session.get(item[0])  # load the vacancy page
            description_object = await web_session.get_element(
                'div[class=vacancy_description]')
            # full vacancy description
            description = await description_object.get_text()
            async with engine.connect() as conn:
                async with conn.begin() as trans:
                    # write the collected data to the database
                    await conn.execute(MoiKrug_db.insert().values(
                        link=item[0],
                        title=item[1],
                        salary=item[2],
                        skills=item[3],
                        company=item[4],
                        occupation=item[5],
                        description=description))
async def build_profile(args):
    scenarii = scenario[args.scenarii]
    if not args.force_new:
        get_profile(args)

    logger.msg("Updating profile located at %r" % args.profile)
    metadata_file = os.path.join(args.profile, ".hp.json")
    with open(metadata_file) as f:
        metadata = json.loads(f.read())

    f_args = ["-profile", args.profile]
    if platform.system() != "Darwin":
        f_args.append("-headless")
    caps = {"moz:firefoxOptions": {"args": f_args}}
    if args.firefox is not None:
        caps["moz:firefoxOptions"]["binary"] = args.firefox

    logger.msg("Starting the Fox...")
    with open("gecko.log", "a+") as glog:
        async with get_session(CustomGeckodriver(log_file=glog),
                               Firefox(**caps)) as session:
            logger.msg("Running the %s scenario" % args.scenarii)
            metadata.update(await scenarii(session, args))

    # writing metadata
    logger.msg("Creating metadata...")
    ts = str(datetime.datetime.now())
    if "created" not in metadata:
        metadata["created"] = ts
    metadata["updated"] = ts
    metadata["name"] = args.scenarii
    metadata["platform"] = sys.platform
    metadata["age"] = get_age(metadata)
    metadata["version"] = "69.0a1"  # add the build id XXX
    metadata["customization"] = "vanilla"  # add themes
    with open(metadata_file, "w") as f:
        f.write(json.dumps(metadata))

    logger.msg("Profile at %s" % args.profile)
    logger.msg("Done.")
async def scraper(url, i=-1, timeout=60, start=None):
    service = services.Chromedriver()
    browser = browsers.Chrome(
        chromeOptions={'args': ['--headless', '--disable-gpu']})
    async with get_session(service, browser) as session:
        try:
            await asyncio.wait_for(session.get(url), timeout=timeout)
        except asyncio.TimeoutError:
            return []
        await asyncio.sleep(10)
        body = await session.get_page_source()
        # save this locally??
        content = await get_parsable_html(body)
        links = await get_links(content)
        product_data = await get_product_data(url, content)
        if start is not None:
            end = time.time() - start
            print(f'{i} took {end} seconds')
        # print(body)
        dataset = {"links": links, "product_data": product_data}
        return dataset
async def main():
    load_dotenv()
    service = services.Chromedriver(log_file=os.devnull)
    browser = browsers.Chrome()
    telegram_client = TelegramClient(token=os.getenv("TG_TOKEN"),
                                     default_channel=os.getenv("TG_CHANNEL"))
    loguru_client = LoguruClient()
    messenger = Messenger([loguru_client, telegram_client])
    async with get_session(service, browser) as session:
        extractor = CUExtractor(session)
        memory = Memory()
        while True:
            items = await extractor.extract()
            added: List[Entry] = memory.update(items)[0]
            for entry in added:
                messenger.send(entry.to_markdown())
            await asyncio.sleep(DELAY)
async def qdocs(self, ctx, arg):
    if len(arg) > self.limit:
        # this message needs to be an f-string for {self.limit} to be filled in
        return await ctx.send(f'`Query length greater than {self.limit}`')
    query_url = f'https://qiskit.org/documentation/search.html?q={arg}&check_keywords=yes&area=default#'
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {
            "args": ["--headless", "--disable-gpu"]
        }
    }
    async with get_session(service, browser) as session:
        try:
            await session.get(query_url)
        except asyncio.TimeoutError:
            return await ctx.send('`Failed | Time Limit Exceeded`')
        else:
            source = None
            try:
                source = await asyncio.wait_for(session.get_page_source(), timeout=10)
            except asyncio.TimeoutError:
                return await ctx.send('`Failed | Time Limit Exceeded`')
            else:
                soup = BeautifulSoup(source, 'html.parser')
                summary = soup.select('.search')
                res = []
                description = ''
                for li in summary[0].find_all('li'):
                    link = li.find('a', href=True)
                    res.append(
                        f'[`{link.contents[0]}`]({self.render_link + link["href"]})'
                    )
                embed = discord.Embed(title=f'`Results for: {arg}`',
                                      description='\n'.join(res),
                                      color=0xe8e3e3)
                return await ctx.send(embed=embed)
async def scraper(url):
    service = services.Chromedriver()
    browser = browsers.Chrome()
    browser.capabilities = {
        "goog:chromeOptions": {"args": ["--headless", "--disable-gpu"]}
    }
    async with get_session(service, browser) as session:
        await asyncio.wait_for(session.get(url), timeout=100)
        body = await session.get_page_source()
        soup = BeautifulSoup(body, 'html.parser')
        products = []
        box = soup.find("ul", {"class": "category_styles_product_card_list__1Xocv"})
        li = box.findAll("li")
        for l in li:
            try:
                link = 'https://timcheh.com' + l.find('a', href=True)['href']
                img = l.find('img')['src']
                name = l.find('h3').getText()
                price = l.find('div', {'class': 'styles_price__cldWW'}).getText()
            except:
                link = ''
                img = ''
                name = ''
                price = ''
            try:
                discount = l.find('div', {'class': 'styles_discount_number__39goM'}).find('span').getText()
                old_price = l.find('div', {'class': 'styles_old_price__35bDJ'}).getText()
            except:
                discount = ''
                old_price = ''
            try:
                bonous = l.find('span', {'class': 'styles_caption__3SE4x'}).getText()
            except:
                bonous = ''
            products.append({
                'link': link,
                'img': img,
                'name': name,
                'discount': discount,
                'last_price': old_price,
                'price': price,
                'bonous': bonous
            })
        return products