Beispiel #1
0
 def _get_hours_info(self, hours_node):
     """Build a mapping of day label to opening hours from an hours table.

     Every ``tr`` below ``hours_node`` is inspected: the text of the ``td``
     whose text contains "day" becomes the lower-cased key, and the value
     of ``.//strong/text()`` in the same row becomes the value.

     Rows without a day cell (for example header or spacer rows) are
     skipped so they do not add a meaningless empty key to the result.

     :param hours_node: element that contains the hours rows.
     :return: dict mapping lower-cased day label to the hours text.
     """
     hours_info = {}
     for day_node in HtmlUtils.get_elements(hours_node, './/tr'):
         day_label = HtmlUtils.get_element_value(
             day_node, './td[contains(text(),"day")]/text()')
         # NOTE(review): other extractors in this file compare
         # get_element_value() against '' for "nothing found"; any falsy
         # result is treated as "no day cell" here -- confirm against
         # HtmlUtils.
         if not day_label:
             continue
         hours_info[day_label.lower()] = HtmlUtils.get_element_value(
             day_node, './/strong/text()')
     return hours_info
 def produce(self, state_name):
     """Return the category links found on the page for ``state_name``.

     The state name is lower-cased and substituted into ``self._url``.
     An empty list is returned when the HTTP response is not successful.

     :param state_name: name of the state to look up.
     :return: whatever ``HtmlUtils.get_elements`` yields for the ``@href``
         XPath below, or ``[]`` on a failed request.
     """
     page_url = self._url.format(state_name.lower())
     response = self._http_client.get(page_url)
     if not response.success:
         return []

     links_xpath = './/div[@class="w-dyn-items"]//div[@class="w-embed"]/a/@href'
     return HtmlUtils.get_elements(html.fromstring(response.content), links_xpath)
Beispiel #3
0
 def _get_menu_info(self, html_doc):
     """Collect the menu categories found in ``html_doc``.

     Each ``div`` with class ``panel panel-default`` is treated as one
     category. Panels whose title resolves to an empty string are ignored.

     :param html_doc: parsed document to search.
     :return: dict mapping category title to the result of
         ``_get_category_from_menu_info`` for that panel.
     """
     menu = {}
     panels = HtmlUtils.get_elements(html_doc, '//div[@class="panel panel-default"]')
     for panel in panels:
         title = HtmlUtils.get_element_value(panel, './/h2[@class="panel-title"]/a/text()')
         if title == '':
             # Untitled panel: nothing to key the category on.
             continue
         menu[title] = self._get_category_from_menu_info(panel)
     return menu
Beispiel #4
0
 def _get_menu_item_prices_info(self, item_node):
     """Extract the price entries listed for a single menu item.

     Only ``li`` nodes selected by the XPath below are considered (it
     excludes classes containing ``price-empty``). Entries whose unit text
     is an empty string are dropped.

     :param item_node: element of one menu item.
     :return: list of single-entry dicts, each mapping a unit to its price
         text.
     """
     entries_xpath = './/div[contains(@class,"menu-item-prices hidden")]//li[not(contains(@class,"price-empty")) and contains(@class, "rounded gram-price")]'
     prices = []
     for entry in HtmlUtils.get_elements(item_node, entries_xpath):
         unit = HtmlUtils.get_element_value(entry, './/div[@class="unit"]/text()')
         if unit == '':
             continue
         amount = HtmlUtils.get_element_value(entry, './/div[@class="price rounded"]/text()')
         prices.append({unit: amount})
     return prices
Beispiel #5
0
 def _get_category_from_menu_info(self, category_node):
     """List the menu items contained in one category node.

     Items whose name resolves to an empty string are skipped.

     :param category_node: element of one menu category.
     :return: list of dicts with the keys ``name`` and ``prices``; the
         latter comes from ``_get_menu_item_prices_info``.
     """
     entries = []
     for menu_item in HtmlUtils.get_elements(category_node, './/div[@class="menu-item"]'):
         name = HtmlUtils.get_element_value(menu_item, './/h3[@class="menu-item-name"]/text()')
         if name == '':
             continue
         entries.append({
             'name': name,
             'prices': self._get_menu_item_prices_info(menu_item),
         })
     return entries
 def _get_city_urls(self, page_html, host):
     """Return the URLs of the cities accepted by the dispensary filter.

     The anchors of the first ``dropdown-menu`` list inside ``#maincolumn``
     are read; an anchor is kept when ``self._dispensary_filter.match_city``
     accepts its text.

     :param page_html: raw HTML of the state page.
     :param host: prefix prepended to every anchor ``href``.
     :return: list of ``host + href`` strings.
     """
     anchors = HtmlUtils.get_elements(
         html.fromstring(page_html),
         '(//div[@id="maincolumn"]//ul[@class="dropdown-menu"])[1]/li/a')
     return [
         host + HtmlUtils.get_element_value(anchor, './@href')
         for anchor in anchors
         if self._dispensary_filter.match_city(
             HtmlUtils.get_element_value(anchor, './text()'))
     ]
 def produce(self, state_name):
     """Yield ``(store_url, host)`` pairs for every store of a state.

     The state page is fetched through ``_get_state_response``; each city
     URL derived from it is then requested, and every listing link found
     on a successfully fetched city page is yielded with the host
     prepended. Failed requests are skipped silently.

     :param state_name: name of the state to crawl.
     """
     resp, host = self._get_state_response(state_name)
     if not resp.success:
         return

     listing_xpath = '//div[contains(@class,"-listing")]//*[self::h3 or self::h4]/a/@href'
     for city_url in self._get_city_urls(resp.content, host):
         city_resp = self._http_client.get(city_url)
         if not city_resp.success:
             continue
         city_doc = html.fromstring(city_resp.content)
         for store_path in HtmlUtils.get_elements(city_doc, listing_xpath):
             yield host + store_path, host
Beispiel #8
0
    def parse(self):
        """Fetch the news listing at ``self.url`` and each linked article.

        News entries without a URL, and articles whose page yields no HTML,
        are skipped.

        :return: ``to_dict()`` of the populated ``NewsSite``, or ``None``
            when the listing page itself yields no HTML.
        """
        site = NewsSite(self.url)

        listing_html = self.html_getter.get(self.url)
        if not listing_html:
            return

        listing = html.fromstring(listing_html)
        news_nodes = HtmlUtils.get_elements(listing, self.xpaths_container.get_news_xpath())

        for news_node in news_nodes:
            title = self._get_title(news_node)
            url = self._get_url(news_node)
            if not url:
                continue

            article_html = self.html_getter.get(url)
            if not article_html:
                continue

            article = html.fromstring(article_html)

            # Image and date are read before the unwanted elements are
            # removed from the article tree.
            image_url = self._get_image_url(article)
            date = self._get_date(article)

            self.remove_elements(article, self.xpaths_container.get_elements_to_remove_xpaths())

            # The text is extracted only after the removal step above.
            site.add_article(
                title,
                url,
                image_url,
                date,
                self._get_text_html(article),
                self._get_text_plain(article),
            )

        return site.to_dict()
 def get_items(self, html_doc):
     """Map ``self.parse_item`` over every product listing in ``html_doc``.

     :param html_doc: parsed document to search.
     :return: the result of ``map(self.parse_item, ...)`` over the matched
         ``product-listing`` nodes.
     """
     listing_xpath = '//div[@class="w-dyn-list"]//div[@class="product-listing"]'
     listings = HtmlUtils.get_elements(html_doc, listing_xpath)
     return map(self.parse_item, listings)
Beispiel #10
0
 def _get_hours(self, html_doc):
     """Return the opening hours parsed from the first striped table.

     :param html_doc: parsed document to search.
     :return: result of ``_get_hours_info`` for the first matching table,
         or an empty dict when no table matches.
     """
     tables = HtmlUtils.get_elements(html_doc, '//table[@class="table table-striped"]')
     if len(tables) == 0:
         return {}
     return self._get_hours_info(tables[0])
Beispiel #11
0
 def _has_menu(self, html_doc):
     """Tell whether ``html_doc`` has a ``div`` whose id contains ``cpg-menu``.

     :param html_doc: parsed document to search.
     :return: ``True`` when at least one such node is found.
     """
     menu_nodes = HtmlUtils.get_elements(html_doc, '//div[contains(@id,"cpg-menu")]')
     return len(menu_nodes) != 0
 def _get_deal_nodes(self, page_html):
     """Parse ``page_html`` and return its deal container nodes.

     :param page_html: raw HTML of the page.
     :return: result of ``HtmlUtils.get_elements`` for every ``div`` whose
         class contains ``detail-holder``.
     """
     deals_xpath = '//div[contains(@class,"detail-holder")]'
     document = html.fromstring(page_html)
     return HtmlUtils.get_elements(document, deals_xpath)