Esempio n. 1
0
    def parse_wizard_image(self, element, block_xpath, sample):
        """Build a ``WIZARD`` / ``WIZARD_IMAGE`` component from *element*.

        Every XPath stored on *sample* is run through ``extract_xpath``,
        has its first ``len(extract_xpath(block_xpath))`` items dropped, is
        rebuilt with ``combine_xpath(..., True)`` and is then evaluated
        against *element*.

        Args:
            element: Node exposing an ``xpath()`` method.
            block_xpath: XPath of the enclosing block, in the form accepted
                by ``extract_xpath``.
            sample: Object providing ``media_links`` (indexable, at least one
                entry), ``page_url`` and ``title``; each entry exposes
                ``xpath`` and ``attr``.

        Returns:
            A ``Component`` with ``type``, ``wizard_type``, ``alignment``,
            ``media_links``, ``page_url`` and ``title`` set.
        """
        component = Component()
        component.type = "WIZARD"
        component.wizard_type = "WIZARD_IMAGE"
        component.alignment = "LEFT"

        block_steps = self.extract_xpath(block_xpath)

        def read_field(field):
            # NOTE(review): dropping the block's leading items presumably
            # makes the sample path relative to `element` -- confirm against
            # extract_xpath / combine_xpath.
            relative = self.extract_xpath(field.xpath)[len(block_steps):]
            nodes = element.xpath(self.combine_xpath(relative, True))
            return self.get_attr(nodes, field.attr)

        # Reduce the image paths of the sample to a single shared path by
        # folding them pairwise through great_common_prefix, seeded with the
        # first one.
        shared = self.extract_xpath(sample.media_links[0].xpath)
        for link in sample.media_links:
            shared = self.great_common_prefix(
                shared, self.extract_xpath(link.xpath))
        image_query = self.combine_xpath(shared[len(block_steps):], True)

        # Every matched node is read with the attribute of the first sample
        # image.
        component.media_links = [
            self.get_attr(node, sample.media_links[0].attr)
            for node in element.xpath(image_query)
        ]

        component.page_url = read_field(sample.page_url)
        component.title = read_field(sample.title)
        return component
Esempio n. 2
0
    def parse_search_result(self, element, block_xpath, sample):
        """Build a ``SEARCH_RESULT`` component from *element*.

        For each of ``page_url``, ``title``, ``snippet`` and ``view_url`` the
        matching entry of *sample* supplies an ``xpath`` and an ``attr``. The
        XPath is run through ``extract_xpath``, loses as many leading items
        as ``extract_xpath(block_xpath)`` has, is rebuilt with
        ``combine_xpath(..., True)`` and evaluated against *element*; the
        result is handed to ``get_attr`` together with ``attr``.

        Args:
            element: Node exposing an ``xpath()`` method.
            block_xpath: XPath of the enclosing block, in the form accepted
                by ``extract_xpath``.
            sample: Object with ``page_url``, ``title``, ``snippet`` and
                ``view_url`` attributes, each exposing ``xpath`` and ``attr``.

        Returns:
            The populated ``Component``.
        """
        result = Component()
        result.type = "SEARCH_RESULT"
        result.alignment = "LEFT"

        prefix_len = len(self.extract_xpath(block_xpath))

        # Fields are processed in the same order as they are listed here.
        for name in ("page_url", "title", "snippet", "view_url"):
            relative = self.extract_xpath(
                getattr(sample, name).xpath)[prefix_len:]
            nodes = element.xpath(self.combine_xpath(relative, True))
            setattr(result, name,
                    self.get_attr(nodes, getattr(sample, name).attr))
        return result
 def parse_image(self, element):
     """Build an ``IMAGE`` component from *element*.

     Each field is obtained with ``self.get_from_page(element, xpath, what)``.
     NOTE(review): ``what`` looks like either an attribute name or the
     special value ``"string"`` -- confirm in get_from_page.

     Returns:
         A ``Component`` with ``type``, ``alignment``, ``page_url``,
         ``view_url`` and ``title`` set.
     """
     component = Component()
     component.type = "IMAGE"
     component.alignment = "LEFT"
     # (attribute on the component, xpath relative to element, what to read)
     lookups = (
         ("page_url", "./a", "href"),
         ("view_url", "./cite", "title"),
         ("title", ".", "string"),
     )
     for field, path, what in lookups:
         setattr(component, field, self.get_from_page(element, path, what))
     return component
 def parse_wizard_news(self, element):
     """Build a ``WIZARD`` / ``WIZARD_NEWS`` component from *element*.

     Both the page URL and the title are read from *element* itself (xpath
     ``"."``) via ``self.get_from_page``, asking for ``"href"`` and
     ``"string"`` respectively.

     Returns:
         The populated ``Component``.
     """
     read = self.get_from_page

     news = Component()
     news.type = "WIZARD"
     news.wizard_type = "WIZARD_NEWS"
     news.alignment = "LEFT"

     link = read(element, ".", "href")
     news.page_url = link
     caption = read(element, ".", "string")
     news.title = caption
     return news
Esempio n. 5
0
 def parse_component(self, element):
     """Build a ``WIKI`` component from the JSON-like object *element*.

     The JSONPath expressions ``$.pageid``, ``$.title`` and ``$.snippet`` are
     evaluated against *element*; the first match of each is stored as
     ``page_url``, ``title`` and ``snippet`` respectively.

     A field whose expression matches nothing is set to ``None`` instead of
     raising: ``jsonpath.jsonpath`` signals "no match" with a falsy value
     rather than a list, so indexing its result directly fails with an
     unhelpful ``TypeError``.

     Args:
         element: Object accepted by ``jsonpath.jsonpath`` as the document
             to query.

     Returns:
         A ``Component`` with ``type``, ``alignment``, ``page_url``,
         ``title`` and ``snippet`` set.
     """
     def first_match(expression):
         # Guard on truthiness so that both ``False`` and an empty list are
         # treated as "nothing found".
         found = jsonpath.jsonpath(element, expression)
         return found[0] if found else None

     component = Component()
     component.type = "WIKI"
     component.alignment = "JSON"
     component.page_url = first_match("$.pageid")
     component.title = first_match("$.title")
     component.snippet = first_match("$.snippet")
     return component
 def parse_search_result(self, element):
     """Build a ``SEARCH_RESULT`` component from *element*.

     Each field is read with ``self.get_from_page(element, xpath, what)``
     using fixed XPaths relative to *element*.
     NOTE(review): the snippet is requested as ``"strings"`` while the other
     text fields use ``"string"`` -- the difference is defined by
     get_from_page; confirm there.

     Returns:
         A ``Component`` with ``type``, ``alignment``, ``page_url``,
         ``title``, ``snippet`` and ``view_url`` set.
     """
     result = Component()
     result.type = "SEARCH_RESULT"
     result.alignment = "LEFT"
     # (attribute on the component, xpath relative to element, what to read)
     lookups = (
         ("page_url", "./h3/a", "href"),
         ("title", "./h3/a", "string"),
         ("snippet", "./div/span", "strings"),
         ("view_url", "./div/div/cite", "string"),
     )
     for field, path, what in lookups:
         setattr(result, field, self.get_from_page(element, path, what))
     return result
 def parse_adv(self, element):
     """Build an ``ADV`` component from *element*.

     The link (``./h3/a``) supplies both ``page_url`` (``"href"``) and
     ``title`` (``"string"``); ``snippet`` comes from ``./div[2]`` and
     ``view_url`` from ``./div/cite``. All reads go through
     ``self.get_from_page``.

     Returns:
         The populated ``Component``.
     """
     heading_link = "./h3/a"

     advert = Component()
     advert.type = "ADV"
     advert.alignment = "LEFT"

     target = self.get_from_page(element, heading_link, "href")
     advert.page_url = target
     heading = self.get_from_page(element, heading_link, "string")
     advert.title = heading
     body = self.get_from_page(element, "./div[2]", "string")
     advert.snippet = body
     shown_url = self.get_from_page(element, "./div/cite", "string")
     advert.view_url = shown_url
     return advert
Esempio n. 8
0
 def parse_adv(self, element):
     """Build an ``ADV`` component from *element*.

     The link under ``./h2/a`` supplies ``page_url`` (``"href"``) and
     ``title`` (``"string"``). ``snippet`` and ``view_url`` are read as
     ``"string"`` from nodes selected by exact ``class`` attribute values.
     All reads go through ``self.get_from_page``.

     Returns:
         The populated ``Component``.
     """
     # The class predicates compare the whole attribute value, so the
     # literals must be kept exactly as written.
     heading_link = "./h2/a"
     snippet_path = "./div[@class='organic__content-wrapper clearfix']/div[1]"
     view_url_path = "./div[@class='organic__subtitle typo typo_type_greenurl']/div[1]/a"

     advert = Component()
     advert.type = "ADV"
     advert.alignment = "LEFT"
     advert.page_url = self.get_from_page(element, heading_link, "href")
     advert.title = self.get_from_page(element, heading_link, "string")
     advert.snippet = self.get_from_page(element, snippet_path, "string")
     advert.view_url = self.get_from_page(element, view_url_path, "string")
     return advert
Esempio n. 9
0
 def parse_actor(self, element):
     """Build an ``Actor`` component from *element*.

     ``page_url`` and ``title`` come from the link ``./div[2]/p/a``,
     ``snippet`` from ``./div[2]/span[2]`` and ``image`` from the ``src`` of
     ``./p/a/img``; every value is read with ``self.get_from_page``.

     Returns:
         The populated ``Component``.
     """
     actor = Component()
     actor.type = "Actor"
     actor.alignment = "LEFT"
     # (attribute on the component, xpath relative to element, what to read)
     lookups = (
         ("page_url", "./div[2]/p/a", "href"),
         ("title", "./div[2]/p/a", "string"),
         ("snippet", "./div[2]/span[2]", "string"),
         ("image", "./p/a/img", "src"),
     )
     for field, path, what in lookups:
         setattr(actor, field, self.get_from_page(element, path, what))
     return actor
 def parse_wizard_image(self, element):
     """Build a ``WIZARD`` / ``WIZARD_IMAGE`` component from *element*.

     ``media_links`` holds one entry per node matched by ``./div/a/img``,
     each obtained by asking ``self.get_from_page`` for that node's
     ``"src"``. ``page_url`` and ``title`` are read from ``./h3/a``.

     Returns:
         The populated ``Component``.
     """
     component = Component()
     component.type = "WIZARD"
     component.wizard_type = "WIZARD_IMAGE"
     component.alignment = "LEFT"
     component.media_links = [
         self.get_from_page(node, ".", "src")
         for node in element.xpath("./div/a/img")
     ]
     heading_link = "./h3/a"
     component.page_url = self.get_from_page(element, heading_link, "href")
     component.title = self.get_from_page(element, heading_link, "string")
     return component
Esempio n. 11
0
 def parse_evaluated_cinema(self, element):
     """Build an ``EvaluatedCinema`` component from *element*.

     Seven fields are filled through ``self.get_from_page`` using fixed
     XPaths relative to *element*: ``page_url``, ``title``, ``snippet``,
     ``actors``, ``year``, ``value`` and ``image``.

     Returns:
         The populated ``Component``.
     """
     movie = Component()
     movie.type = "EvaluatedCinema"
     movie.alignment = "LEFT"
     # (attribute on the component, xpath relative to element, what to read)
     lookups = (
         ("page_url", "./div[2]/p/a", "href"),
         ("title", "./div[2]/p/a", "string"),
         ("snippet", "./div[2]/span[2]", "string"),
         ("actors", "./div[2]/span[3]", "string"),
         ("year", "./div[2]/p/span", "string"),
         ("value", "./div[1]/div", "string"),
         ("image", "./p/a/img", "src"),
     )
     for field, path, what in lookups:
         setattr(movie, field, self.get_from_page(element, path, what))
     return movie
Esempio n. 12
0
 def get_substitution_component(self, tree, component):
     """Create a new component by resolving *component* against *tree*.

     ``type`` and ``alignment`` are copied over; ``page_url`` and ``title``
     are obtained by passing the corresponding values of *component* to
     ``self.get_from_page(tree, ...)``. The result is then handed to the
     type-specific ``get_substitution_*`` method, if one applies.
     NOTE(review): ``component.page_url`` / ``component.title`` presumably
     hold locators rather than final values -- confirm with the caller.

     Args:
         tree: Document passed as first argument to ``get_from_page`` and
             to the type-specific helpers.
         component: Source component; ``wizard_type`` is only read when
             ``type`` is ``"WIZARD"``.

     Returns:
         The new component, as returned by the type-specific helper when
         one was called, otherwise with only the four common fields set.
     """
     resolved = Component()
     resolved.type = component.type
     resolved.alignment = component.alignment
     resolved.page_url = self.get_from_page(tree, component.page_url)
     resolved.title = self.get_from_page(tree, component.title)

     if component.type == "SEARCH_RESULT":
         return self.get_substitution_search_result(
             tree, component, resolved)
     if component.type != "WIZARD":
         return resolved

     # Only wizards reach this point; dispatch on their sub-type.
     if component.wizard_type == "WIZARD_IMAGE":
         return self.get_substitution_wizard_image(
             tree, component, resolved)
     if component.wizard_type == "WIZARD_NEWS":
         return self.get_substitution_wizard_news(
             tree, component, resolved)
     return resolved
Esempio n. 13
0
    def parse_wizard_news(self, element, block_xpath, sample):
        """Build a ``WIZARD`` / ``WIZARD_NEWS`` component from *element*.

        The ``page_url`` and ``title`` entries of *sample* each supply an
        ``xpath`` and an ``attr``. The XPath is run through
        ``extract_xpath``, loses as many leading items as
        ``extract_xpath(block_xpath)`` has, is rebuilt with
        ``combine_xpath(..., True)`` and evaluated against *element*; the
        result goes to ``get_attr`` together with ``attr``.

        Args:
            element: Node exposing an ``xpath()`` method.
            block_xpath: XPath of the enclosing block, in the form accepted
                by ``extract_xpath``.
            sample: Object with ``page_url`` and ``title`` attributes, each
                exposing ``xpath`` and ``attr``.

        Returns:
            The populated ``Component``.
        """
        news = Component()
        news.type = "WIZARD"
        news.wizard_type = "WIZARD_NEWS"
        news.alignment = "LEFT"

        prefix_len = len(self.extract_xpath(block_xpath))

        url_steps = self.extract_xpath(sample.page_url.xpath)[prefix_len:]
        url_nodes = element.xpath(self.combine_xpath(url_steps, True))
        news.page_url = self.get_attr(url_nodes, sample.page_url.attr)

        title_steps = self.extract_xpath(sample.title.xpath)[prefix_len:]
        title_nodes = element.xpath(self.combine_xpath(title_steps, True))
        news.title = self.get_attr(title_nodes, sample.title.attr)
        return news