def _get_hours_info(self, hours_node):
    """Map each lower-cased day label under ``hours_node`` to its hours text.

    Every ``<tr>`` below ``hours_node`` contributes one entry. The key is the
    lower-cased text of the ``<td>`` whose text contains "day"; the value is
    the text of the row's ``<strong>`` element. Rows producing the same key
    overwrite earlier ones.
    """
    day_xpath = './td[contains(text(),"day")]/text()'
    hours_xpath = './/strong/text()'
    schedule = {}
    for row in HtmlUtils.get_elements(hours_node, './/tr'):
        day = HtmlUtils.get_element_value(row, day_xpath).lower()
        schedule[day] = HtmlUtils.get_element_value(row, hours_xpath)
    return schedule
def _get_social_media_info(self, html_doc):
    """Collect the brand page's social-media link values keyed by network.

    For each of facebook, twitter, instagram and google-plus, looks up the
    ``href`` of the anchor wrapping an ``<i>`` icon whose class contains the
    network name, inside the brand page's "list-unstyled" list.
    """
    # One template for all networks; only the icon class fragment varies.
    link_xpath = (
        '//div[@class="brand-page"]//ul[contains(@class,"list-unstyled")]'
        '//a/i[contains(@class,"{}")]/parent::a/@href'
    )
    links = {}
    for network in ('facebook', 'twitter', 'instagram', 'google-plus'):
        links[network] = HtmlUtils.get_element_value(
            html_doc, link_xpath.format(network))
    return links
def _get_menu_info(self, html_doc):
    """Return a dict mapping menu category name to that category's items.

    Each ``div.panel.panel-default`` is treated as one category; panels whose
    title comes back as an empty string are skipped. Item extraction is
    delegated to ``_get_category_from_menu_info``.
    """
    panels = HtmlUtils.get_elements(
        html_doc, '//div[@class="panel panel-default"]')
    menu = {}
    for panel in panels:
        title = HtmlUtils.get_element_value(
            panel, './/h2[@class="panel-title"]/a/text()')
        if title == '':
            continue
        menu[title] = self._get_category_from_menu_info(panel)
    return menu
def _get_menu_item_prices_info(self, item_node):
    """Return a list of single-entry ``{unit: price}`` dicts for a menu item.

    Only ``<li>`` price entries that are not marked "price-empty" are
    considered, and entries whose unit text is an empty string are dropped.
    """
    priced_entry_xpath = (
        './/div[contains(@class,"menu-item-prices hidden")]'
        '//li[not(contains(@class,"price-empty"))'
        ' and contains(@class, "rounded gram-price")]'
    )
    prices = []
    for entry in HtmlUtils.get_elements(item_node, priced_entry_xpath):
        unit = HtmlUtils.get_element_value(
            entry, './/div[@class="unit"]/text()')
        if unit == '':
            continue
        amount = HtmlUtils.get_element_value(
            entry, './/div[@class="price rounded"]/text()')
        prices.append({unit: amount})
    return prices
def _get_category_from_menu_info(self, category_node):
    """Return the items of one menu category as ``{'name', 'prices'}`` dicts.

    Items whose name comes back as an empty string are skipped. Prices are
    gathered by ``_get_menu_item_prices_info``.
    """
    entries = HtmlUtils.get_elements(
        category_node, './/div[@class="menu-item"]')
    collected = []
    for entry in entries:
        name = HtmlUtils.get_element_value(
            entry, './/h3[@class="menu-item-name"]/text()')
        if name == '':
            continue
        collected.append({
            'name': name,
            'prices': self._get_menu_item_prices_info(entry),
        })
    return collected
def _get_city_urls(self, page_html, host):
    """Return absolute URLs for the cities accepted by the dispensary filter.

    Parses ``page_html``, walks the anchors of the first "dropdown-menu" list
    inside ``#maincolumn``, and keeps those whose link text passes
    ``self._dispensary_filter.match_city``. Each kept ``href`` is prefixed
    with ``host``.
    """
    document = html.fromstring(page_html)
    anchors = HtmlUtils.get_elements(
        document,
        '(//div[@id="maincolumn"]//ul[@class="dropdown-menu"])[1]/li/a')
    urls = []
    for anchor in anchors:
        label = HtmlUtils.get_element_value(anchor, './text()')
        if not self._dispensary_filter.match_city(label):
            continue
        urls.append(host + HtmlUtils.get_element_value(anchor, './@href'))
    return urls
def get_about_info(self, url):
    """Fetch ``url`` and return the text of its ``div.store-about`` element.

    Returns an empty string when the HTTP response is not successful.
    """
    response = self._http_client.get(url)
    if not response.success:
        return ''
    document = html.fromstring(response.content)
    about_xpath = "//div[@class='store-about']/text()"
    return HtmlUtils.get_element_value(document, about_xpath)
def get_list_page(self, url):
    """Download ``url``, retrying until it succeeds or attempts run out.

    Calls ``HtmlUtils.download_html`` with ``self.headers`` up to 100 times,
    stopping at the first truthy status.

    Returns:
        The ``(st, content)`` pair from the last download attempt.
    """
    # Kept for backward compatibility: the original bound ``content`` as a
    # module-level global, so other code may read it after this call.
    global content
    max_attempts = 100
    attempts = 0
    st = 0  # falsy: download failed; truthy: download succeeded
    while not st and attempts < max_attempts:
        st, content = HtmlUtils.download_html(url, headers=self.headers)
        # The original never advanced its counter, so a persistent failure
        # looped forever; count each attempt so the limit takes effect.
        attempts += 1
    return st, content
def get_rank_status(self, item_node):
    """Classify the rank trend from the rank image URL of ``item_node``.

    Returns 'down' if the image source contains "rank-down", 'up' if it
    contains "rank-up" (checked in that order), otherwise 'unkown'.
    """
    image_src = HtmlUtils.get_element_value(
        item_node, './/div[@class="bs-product-rank-image"]/img/@src')
    for marker, status in (('rank-down', 'down'), ('rank-up', 'up')):
        if marker in image_src:
            return status
    return 'unkown'
def produce(self, state_name):
    """Return the category link hrefs found on the page for ``state_name``.

    The page URL is ``self._url`` formatted with the lower-cased state name.
    Returns an empty list when the HTTP response is not successful.
    """
    page_url = self._url.format(state_name.lower())
    response = self._http_client.get(page_url)
    if not response.success:
        return []
    document = html.fromstring(response.content)
    return HtmlUtils.get_elements(
        document,
        './/div[@class="w-dyn-items"]//div[@class="w-embed"]/a/@href')
def produce(self, state_name):
    """Yield ``(store_url, host)`` pairs for every store listed in a state.

    Fetches the state page, then each city page returned by
    ``_get_city_urls``. City pages whose request is not successful are
    skipped. Every listing ``href`` found is prefixed with ``host``. Nothing
    is yielded when the state request itself is not successful.
    """
    resp, host = self._get_state_response(state_name)
    if not resp.success:
        return
    listing_xpath = (
        '//div[contains(@class,"-listing")]//*[self::h3 or self::h4]/a/@href'
    )
    for city_url in self._get_city_urls(resp.content, host):
        city_page = self._http_client.get(city_url)
        if not city_page.success:
            continue
        document = html.fromstring(city_page.content)
        for store_path in HtmlUtils.get_elements(document, listing_xpath):
            yield host + store_path, host
def parse(self):
    """Scrape the news listing at ``self.url`` and return it as a dict.

    Each news entry found on the listing page is followed to its article
    page, from which the image URL, date and text (HTML and plain) are
    extracted. Entries without a URL, or whose article page yields no HTML,
    are skipped.

    Returns:
        ``site_result.to_dict()``, or ``None`` when the listing page itself
        yields no HTML.
    """
    site_result = NewsSite(self.url)
    listing_html = self.html_getter.get(self.url)
    if not listing_html:
        return None

    listing = html.fromstring(listing_html)
    news_xpath = self.xpaths_container.get_news_xpath()
    for entry in HtmlUtils.get_elements(listing, news_xpath):
        title = self._get_title(entry)
        url = self._get_url(entry)
        if not url:
            continue

        article_html = self.html_getter.get(url)
        if not article_html:
            continue

        article = html.fromstring(article_html)
        # Image and date are read before remove_elements runs; the text is
        # extracted afterwards. Keep this order.
        image_url = self._get_image_url(article)
        date = self._get_date(article)
        unwanted_xpaths = self.xpaths_container.get_elements_to_remove_xpaths()
        self.remove_elements(article, unwanted_xpaths)
        text_html = self._get_text_html(article)
        text_plain = self._get_text_plain(article)
        site_result.add_article(
            title, url, image_url, date, text_html, text_plain)

    return site_result.to_dict()
def get_brand_image(self, item_node):
    """Return the brand image URL of ``item_node``, or '' if not an http URL.

    The extracted ``src`` is returned only when it contains the substring
    "http"; any other value is replaced by an empty string.
    """
    src = HtmlUtils.get_element_value(
        item_node, './/div[@class="bs-product-brand-image"]//img/@src')
    if 'http' in src:
        return src
    return ''
def get_rank(self, item_node):
    """Return the rank text from the ``<h2>`` of the item's rank box.

    The lookup is relative to ``item_node``'s direct
    ``div.bs-product-rank`` child.
    """
    rank_xpath = './div[@class="bs-product-rank"]/h2/text()'
    rank_text = HtmlUtils.get_element_value(item_node, rank_xpath)
    return rank_text
def get_category_name(self, html_doc):
    """Return the best-seller category title text found in ``html_doc``."""
    title_xpath = (
        '//div[@class="flex-horiz-product-list"]'
        '//h2[@class="best-seller-cat-title"]/text()'
    )
    category_title = HtmlUtils.get_element_value(html_doc, title_xpath)
    return category_title
def get_items(self, html_doc):
    """Return ``map(self.parse_item, ...)`` over the product listing nodes.

    The nodes are the ``div.product-listing`` elements inside
    ``div.w-dyn-list``. The result is the ``map`` object itself, not a list.
    """
    listing_xpath = '//div[@class="w-dyn-list"]//div[@class="product-listing"]'
    listings = HtmlUtils.get_elements(html_doc, listing_xpath)
    parsed_items = map(self.parse_item, listings)
    return parsed_items
def _strip_quantity(self, quantity):
    """Return the quantity as text.

    Args:
        quantity: Either a string, or a node whose ``./text()`` value is
            looked up through ``HtmlUtils.get_element_value``.

    Returns:
        The stripped string when ``quantity`` is a string; otherwise the
        value extracted from the node.
    """
    # The original tested ``quantity is str``, which compares against the
    # type object itself and is never true for an actual string value.
    if isinstance(quantity, str):
        return quantity.strip()
    return HtmlUtils.get_element_value(quantity, './text()')
def get_image_chart(self, item_node):
    """Return the ``src`` of the chart image inside ``item_node``."""
    chart_xpath = './/div[@class="bs-product-chart"]//img/@src'
    chart_src = HtmlUtils.get_element_value(item_node, chart_xpath)
    return chart_src
def _get_image_url(self, deal_node):
    """Return the deal image URL, prefixed with the leafbuyer.com origin."""
    # NOTE(review): this XPath starts with '//' while sibling lookups use
    # './/' — confirm whether a document-wide search is intended here.
    image_xpath = '//div[contains(@class, "img-block")]/a/img/@src'
    image_path = HtmlUtils.get_element_value(deal_node, image_xpath)
    return 'https://www.leafbuyer.com' + image_path
def _get_hours(self, html_doc):
    """Return the opening hours parsed from the first striped table.

    Delegates to ``_get_hours_info`` for the first
    ``table.table.table-striped`` found. Returns an empty dict when no such
    table exists.
    """
    tables = HtmlUtils.get_elements(
        html_doc, '//table[@class="table table-striped"]')
    if len(tables) == 0:
        return {}
    first_table = tables[0]
    return self._get_hours_info(first_table)
def _get_dispensary_address(self, deal_node):
    """Return the dispensary address text with surrounding whitespace removed."""
    address_xpath = './/div[@class="text-box"]//span[@class="txt"]/text()'
    raw_address = HtmlUtils.get_element_value(deal_node, address_xpath)
    return raw_address.strip()
def _get_dispensary_phone_number(self, deal_node):
    """Return the text of the ``span.tel-link`` inside the deal's text box."""
    phone_xpath = './/div[@class="text-box"]//span[@class="tel-link"]/text()'
    phone_number = HtmlUtils.get_element_value(deal_node, phone_xpath)
    return phone_number
def _get_dispensary_minimum_age(self, deal_node):
    """Return '21' when a retail icon is present, otherwise 'unkown'.

    Presence is decided by whether the "icon-retail" span lookup returns
    something other than an empty string.
    """
    # NOTE(review): this XPath starts with '//' rather than './/' — confirm
    # whether a document-wide search is intended here.
    icon_xpath = (
        '//ul[@class="detail-list"]/li/span[contains(@class, "icon-retail")]'
    )
    retail_icon = HtmlUtils.get_element_value(deal_node, icon_xpath)
    if retail_icon != '':
        return '21'
    return 'unkown'
def is_top_deal(self, deal_node):
    """Return True when the ``div.deal-box`` lookup is not an empty string."""
    badge_xpath = './/div[@class="deal-box"]/text()'
    badge_text = HtmlUtils.get_element_value(deal_node, badge_xpath)
    return badge_text != ''
def get_brand_name(self, item_node):
    """Return the text of the inner ``<div>`` of the product-name box."""
    name_xpath = './/div[@class="bs-product-name"]/div/text()'
    brand_name = HtmlUtils.get_element_value(item_node, name_xpath)
    return brand_name
def _get_dispensary_name(self, deal_node):
    """Return the text of the ``div.profile-link`` inside ``deal_node``."""
    name_xpath = './/div[@class="profile-link"]/text()'
    dispensary_name = HtmlUtils.get_element_value(deal_node, name_xpath)
    return dispensary_name
def get_product_price(self, item_node):
    """Return the product price as the second ``<h4>`` text plus the first.

    Inside ``div.bs-product-price`` the second ``<h4>`` holds the value and
    the first holds the currency. The result is ``value + currency``.
    """
    price_box = './/div[@class="bs-product-price"]'
    value = HtmlUtils.get_element_value(
        item_node, price_box + '/h4[2]/text()')
    currency = HtmlUtils.get_element_value(
        item_node, price_box + '/h4[1]/text()')
    return value + currency
def _get_dispensary_url(self, deal_node):
    """Return the dispensary link, prefixed with the leafbuyer.com origin."""
    link_xpath = './/div[@class="loc-name-addr"]/strong/a/@href'
    link_path = HtmlUtils.get_element_value(deal_node, link_xpath)
    return 'https://www.leafbuyer.com' + link_path
def _has_menu(self, html_doc):
    """Return True when ``html_doc`` has a ``<div>`` whose id contains "cpg-menu"."""
    menu_nodes = HtmlUtils.get_elements(
        html_doc, '//div[contains(@id,"cpg-menu")]')
    menu_count = len(menu_nodes)
    return menu_count > 0
def _get_deal_name(self, deal_node):
    """Return the ``<h1>`` text inside the deal's ``div.text-wrap``."""
    title_xpath = './/div[@class="text-wrap"]/h1/text()'
    deal_name = HtmlUtils.get_element_value(deal_node, title_xpath)
    return deal_name