def _get_hours_info(self, hours_node):
    """Collect opening hours from the rows of an hours table.

    Every ``tr`` found under ``hours_node`` contributes one entry. The key
    is the lower-cased value of the row's ``td`` whose text contains "day";
    the value is the text of the row's ``strong`` descendant. Rows that
    resolve to the same key overwrite earlier ones.

    :param hours_node: element whose descendant rows are scanned.
    :return: dict mapping the day label to the hours text.
    """
    day_xpath = './td[contains(text(),"day")]/text()'
    hours_xpath = './/strong/text()'
    schedule = {}
    for row in HtmlUtils.get_elements(hours_node, './/tr'):
        # NOTE(review): no guard for rows without a "day" cell; presumably
        # get_element_value returns '' there - confirm against HtmlUtils.
        day_label = HtmlUtils.get_element_value(row, day_xpath).lower()
        schedule[day_label] = HtmlUtils.get_element_value(row, hours_xpath)
    return schedule
def produce(self, state_name):
    """Fetch the page for a state and return the category links on it.

    The lower-cased ``state_name`` is substituted into ``self._url`` and the
    resulting URL is requested through ``self._http_client``.

    :param state_name: name of the state; lower-cased before formatting.
    :return: the ``href`` values matched on the page, or an empty list when
        the response is not successful.
    """
    state_url = self._url.format(state_name.lower())
    response = self._http_client.get(state_url)
    if not response.success:
        return []
    page = html.fromstring(response.content)
    return HtmlUtils.get_elements(
        page,
        './/div[@class="w-dyn-items"]//div[@class="w-embed"]/a/@href')
def _get_menu_info(self, html_doc):
    """Build the menu as a mapping of category name to its items.

    Each ``div`` with class ``panel panel-default`` is treated as one
    category. Panels whose title resolves to an empty string are skipped;
    a repeated title replaces the earlier entry.

    :param html_doc: parsed document to search.
    :return: dict of category name -> list produced by
        ``_get_category_from_menu_info``.
    """
    title_xpath = './/h2[@class="panel-title"]/a/text()'
    panels = HtmlUtils.get_elements(
        html_doc, '//div[@class="panel panel-default"]')
    menu = {}
    for panel in panels:
        title = HtmlUtils.get_element_value(panel, title_xpath)
        if title == '':
            continue
        menu[title] = self._get_category_from_menu_info(panel)
    return menu
def _get_menu_item_prices_info(self, item_node):
    """Extract the per-unit prices listed for one menu item.

    Only ``li`` entries that carry the ``rounded gram-price`` class and are
    not marked ``price-empty`` are considered. Entries whose unit resolves
    to an empty string are skipped.

    :param item_node: element of a single menu item.
    :return: list of single-entry dicts, each mapping a unit to its price.
    """
    entries_xpath = (
        './/div[contains(@class,"menu-item-prices hidden")]'
        '//li[not(contains(@class,"price-empty"))'
        ' and contains(@class, "rounded gram-price")]'
    )
    prices = []
    for entry in HtmlUtils.get_elements(item_node, entries_xpath):
        unit = HtmlUtils.get_element_value(entry, './/div[@class="unit"]/text()')
        if unit == '':
            continue
        amount = HtmlUtils.get_element_value(
            entry, './/div[@class="price rounded"]/text()')
        prices.append({unit: amount})
    return prices
def _get_category_from_menu_info(self, category_node):
    """List the items of one menu category together with their prices.

    Items are the ``div`` elements with class ``menu-item`` under
    ``category_node``. Items whose name resolves to an empty string are
    skipped.

    :param category_node: element of a single menu category.
    :return: list of dicts with the keys ``name`` and ``prices``.
    """
    name_xpath = './/h3[@class="menu-item-name"]/text()'
    entries = []
    for item in HtmlUtils.get_elements(category_node, './/div[@class="menu-item"]'):
        name = HtmlUtils.get_element_value(item, name_xpath)
        if name == '':
            continue
        entries.append({
            'name': name,
            'prices': self._get_menu_item_prices_info(item),
        })
    return entries
def _get_city_urls(self, page_html, host):
    """Return absolute URLs of the cities accepted by the dispensary filter.

    City links are the anchors of the first ``dropdown-menu`` list inside
    the ``maincolumn`` div. A link is kept when
    ``self._dispensary_filter.match_city`` accepts its text.

    :param page_html: raw HTML of the page holding the city dropdown.
    :param host: prefix prepended to every matched ``href``.
    :return: list of ``host + href`` strings, in document order.
    """
    document = html.fromstring(page_html)
    anchors = HtmlUtils.get_elements(
        document,
        '(//div[@id="maincolumn"]//ul[@class="dropdown-menu"])[1]/li/a')
    return [
        host + HtmlUtils.get_element_value(anchor, './@href')
        for anchor in anchors
        if self._dispensary_filter.match_city(
            HtmlUtils.get_element_value(anchor, './text()'))
    ]
def produce(self, state_name):
    """Yield store URLs for every matching city of a state.

    The state page is fetched through ``_get_state_response``; each city URL
    returned by ``_get_city_urls`` is then requested, and the listing links
    on it are yielded. Unsuccessful responses (state or city) are skipped
    silently.

    :param state_name: state passed through to ``_get_state_response``.
    :return: generator of ``(host + store_href, host)`` tuples.
    """
    state_resp, host = self._get_state_response(state_name)
    if not state_resp.success:
        return
    store_xpath = '//div[contains(@class,"-listing")]//*[self::h3 or self::h4]/a/@href'
    for city_url in self._get_city_urls(state_resp.content, host):
        city_resp = self._http_client.get(city_url)
        if not city_resp.success:
            continue
        city_doc = html.fromstring(city_resp.content)
        for store_href in HtmlUtils.get_elements(city_doc, store_xpath):
            yield host + store_href, host
def parse(self):
    """Download the news listing at ``self.url`` and parse each article.

    Listing entries without a URL, and articles whose page cannot be
    fetched, are skipped.

    :return: ``NewsSite.to_dict()`` for the collected articles, or ``None``
        when the listing page itself yields no HTML.
    """
    site = NewsSite(self.url)
    listing_html = self.html_getter.get(self.url)
    if not listing_html:
        return
    listing_doc = html.fromstring(listing_html)
    entries = HtmlUtils.get_elements(
        listing_doc, self.xpaths_container.get_news_xpath())
    for entry in entries:
        title = self._get_title(entry)
        url = self._get_url(entry)
        if not url:
            continue
        article_html = self.html_getter.get(url)
        if not article_html:
            continue
        article = html.fromstring(article_html)
        # Image and date are read before remove_elements is applied to the
        # article; the text extraction runs afterwards.
        image_url = self._get_image_url(article)
        date = self._get_date(article)
        self.remove_elements(
            article, self.xpaths_container.get_elements_to_remove_xpaths())
        text_html = self._get_text_html(article)
        text_plain = self._get_text_plain(article)
        site.add_article(title, url, image_url, date, text_html, text_plain)
    return site.to_dict()
def get_items(self, html_doc):
    """Apply ``self.parse_item`` to every product listing in the document.

    :param html_doc: parsed document to search.
    :return: the result of ``map`` over the ``product-listing`` divs found
        inside the ``w-dyn-list`` div.
    """
    listing_xpath = '//div[@class="w-dyn-list"]//div[@class="product-listing"]'
    listing_nodes = HtmlUtils.get_elements(html_doc, listing_xpath)
    return map(self.parse_item, listing_nodes)
def _get_hours(self, html_doc):
    """Return the opening hours parsed from the first striped table.

    :param html_doc: parsed document to search.
    :return: result of ``_get_hours_info`` for the first
        ``table table-striped`` element, or an empty dict when none exists.
    """
    tables = HtmlUtils.get_elements(
        html_doc, '//table[@class="table table-striped"]')
    if len(tables) == 0:
        return {}
    # Only the first matching table is used; later ones are ignored.
    return self._get_hours_info(tables[0])
def _has_menu(self, html_doc):
    """Tell whether the document contains a menu container.

    :param html_doc: parsed document to search.
    :return: ``True`` when at least one ``div`` whose id contains
        ``cpg-menu`` is found, ``False`` otherwise.
    """
    menu_nodes = HtmlUtils.get_elements(
        html_doc, '//div[contains(@id,"cpg-menu")]')
    return len(menu_nodes) != 0
def _get_deal_nodes(self, page_html):
    """Parse raw HTML and return the elements that hold deal details.

    :param page_html: raw HTML of the page.
    :return: the ``div`` elements whose class contains ``detail-holder``.
    """
    deal_xpath = '//div[contains(@class,"detail-holder")]'
    document = html.fromstring(page_html)
    return HtmlUtils.get_elements(document, deal_xpath)