Beispiel #1
0
    def parse_search_result(self, element, block_xpath, sample):
        """Build a ``SEARCH_RESULT`` component from one result block.

        For each of the four fields, the xpath stored on ``sample`` is run
        through ``self.extract_xpath``, its leading ``len(block)`` items are
        dropped (``block`` being the extracted form of ``block_xpath``), and
        the remainder is recombined with ``self.combine_xpath`` and evaluated
        against ``element``. The resulting nodes and the sample field's
        ``attr`` are handed to ``self.get_attr``.

        Args:
            element: Node for the result block; must provide ``xpath()``.
            block_xpath: XPath of the block, in whatever form
                ``self.extract_xpath`` accepts.
            sample: Object whose ``page_url``, ``title``, ``snippet`` and
                ``view_url`` attributes each expose ``xpath`` and ``attr``.

        Returns:
            The populated ``Component``.
        """
        result = Component()
        result.type = "SEARCH_RESULT"
        result.alignment = "LEFT"

        # Only the length of the extracted block path is used below.
        # NOTE(review): presumably every sample xpath starts with the block
        # xpath, so slicing by this length makes it block-relative -- confirm.
        prefix_len = len(self.extract_xpath(block_xpath))

        for field in ("page_url", "title", "snippet", "view_url"):
            spec = getattr(sample, field)
            relative = self.extract_xpath(spec.xpath)[prefix_len:]
            # The meaning of the ``True`` flag is defined by combine_xpath,
            # which is not visible here.
            nodes = element.xpath(self.combine_xpath(relative, True))
            setattr(result, field, self.get_attr(nodes, spec.attr))
        return result
Beispiel #2
0
 def parse_component(self, element):
     """Build a ``WIKI`` component from one JSON result entry.

     Looks up ``$.pageid``, ``$.title`` and ``$.snippet`` in ``element``
     with ``jsonpath.jsonpath`` and stores the first match of each.

     NOTE(review): ``page_url`` receives the ``pageid`` value rather than a
     URL -- confirm that consumers expect this.
     NOTE(review): the ``[0]`` index assumes every path matches; check what
     ``jsonpath.jsonpath`` returns when a key is missing.

     Args:
         element: Parsed JSON object describing one result.

     Returns:
         The populated ``Component``.
     """
     wiki = Component()
     wiki.type = "WIKI"
     wiki.alignment = "JSON"
     lookups = (
         ("page_url", "$.pageid"),
         ("title", "$.title"),
         ("snippet", "$.snippet"),
     )
     for attr_name, path in lookups:
         setattr(wiki, attr_name, jsonpath.jsonpath(element, path)[0])
     return wiki
 def parse_search_result(self, element):
     """Build a left-aligned ``SEARCH_RESULT`` component from ``element``.

     Every field is obtained by calling ``self.get_from_page`` with the
     element, an xpath relative to it, and a mode string.

     Args:
         element: Node for a single search result.

     Returns:
         The populated ``Component``.
     """
     # (attribute, relative xpath, mode passed to get_from_page)
     # NOTE(review): the snippet uses mode "strings" while the other text
     # fields use "string"; presumably intentional -- confirm against
     # get_from_page.
     layout = (
         ("page_url", "./h3/a", "href"),
         ("title", "./h3/a", "string"),
         ("snippet", "./div/span", "strings"),
         ("view_url", "./div/div/cite", "string"),
     )
     result = Component()
     result.type = "SEARCH_RESULT"
     result.alignment = "LEFT"
     for attr_name, xpath, mode in layout:
         setattr(result, attr_name, self.get_from_page(element, xpath, mode))
     return result
 def parse_adv(self, element):
     """Build a left-aligned ``ADV`` component from ``element``.

     The link target and title both come from ``./h3/a``, the snippet from
     ``./div[2]`` and the displayed URL from ``./div/cite``; each value is
     fetched through ``self.get_from_page``.

     Args:
         element: Node for a single advertisement block.

     Returns:
         The populated ``Component``.
     """
     def pick(xpath, mode):
         # Shorthand for the lookup that every field repeats.
         return self.get_from_page(element, xpath, mode)

     advert = Component()
     advert.type = "ADV"
     advert.alignment = "LEFT"
     advert.page_url = pick("./h3/a", "href")
     advert.title = pick("./h3/a", "string")
     advert.snippet = pick("./div[2]", "string")
     advert.view_url = pick("./div/cite", "string")
     return advert
Beispiel #5
0
 def parse_adv(self, element):
     """Build a left-aligned ``ADV`` component from ``element``.

     The link target and title come from ``./h2/a``; the snippet and the
     displayed URL are located through class-qualified ``div`` xpaths. All
     values are fetched with ``self.get_from_page``.

     Args:
         element: Node for a single advertisement block.

     Returns:
         The populated ``Component``.
     """
     heading_link = "./h2/a"
     snippet_path = "./div[@class='organic__content-wrapper clearfix']/div[1]"
     view_url_path = (
         "./div[@class='organic__subtitle typo typo_type_greenurl']"
         "/div[1]/a"
     )

     advert = Component()
     advert.type = "ADV"
     advert.alignment = "LEFT"
     advert.page_url = self.get_from_page(element, heading_link, "href")
     advert.title = self.get_from_page(element, heading_link, "string")
     advert.snippet = self.get_from_page(element, snippet_path, "string")
     advert.view_url = self.get_from_page(element, view_url_path, "string")
     return advert
Beispiel #6
0
 def parse_actor(self, element):
     """Build a left-aligned ``Actor`` component from ``element``.

     The link target and title are read from ``./div[2]/p/a``, the snippet
     from ``./div[2]/span[2]`` and the image source from ``./p/a/img``, all
     through ``self.get_from_page``.

     Args:
         element: Node for a single actor entry.

     Returns:
         The populated ``Component``.
     """
     name_link = "./div[2]/p/a"

     actor = Component()
     actor.type = "Actor"
     actor.alignment = "LEFT"
     actor.page_url = self.get_from_page(element, name_link, "href")
     actor.title = self.get_from_page(element, name_link, "string")
     actor.snippet = self.get_from_page(
         element, "./div[2]/span[2]", "string")
     actor.image = self.get_from_page(element, "./p/a/img", "src")
     return actor
Beispiel #7
0
 def parse_evaluated_cinema(self, element):
     """Build a left-aligned ``EvaluatedCinema`` component from ``element``.

     Fills ``page_url``, ``title``, ``snippet``, ``actors``, ``year``,
     ``value`` and ``image`` by calling ``self.get_from_page`` once per
     field, in that order.

     Args:
         element: Node for a single rated-film entry.

     Returns:
         The populated ``Component``.
     """
     # (attribute, relative xpath, mode passed to get_from_page)
     layout = (
         ("page_url", "./div[2]/p/a", "href"),
         ("title", "./div[2]/p/a", "string"),
         ("snippet", "./div[2]/span[2]", "string"),
         ("actors", "./div[2]/span[3]", "string"),
         ("year", "./div[2]/p/span", "string"),
         ("value", "./div[1]/div", "string"),
         ("image", "./p/a/img", "src"),
     )
     film = Component()
     film.type = "EvaluatedCinema"
     film.alignment = "LEFT"
     for attr_name, xpath, mode in layout:
         setattr(film, attr_name, self.get_from_page(element, xpath, mode))
     return film