Esempio n. 1
0
    def xpath_for_products_next(self):
        """Find an XPath for the link leading to the next products page.

        Uses the first three entries of ``self.products_first_second_third``
        as ``first_url``, ``second_url`` and ``third_url``:

        1. The first page is fetched with ``wget_root`` and candidate
           elements are obtained from ``e_by_url_from_page(first, second_url)``.
        2. For each candidate, ``EInfo(e).search_for_xpath_ng`` is asked for
           XPaths on the first page whose matched elements all compare equal
           (per ``cmp_urls``) to ``second_url``.
        3. Each XPath is re-checked on the second page, where its matches
           must all compare equal to ``third_url``.

        The first XPath that passes step 3 is stored in
        ``self.next_products_xpath``.  If none passes, the attribute is not
        assigned.  When ``self.debug`` is truthy, intermediate results are
        printed.

        Raises ``ValueError`` (from the unpacking) if fewer than three URLs
        are available.
        """

        def is_only_url_in_es(url, es):
            # True when every element's href is accepted by cmp_urls for `url`.
            # NOTE(review): an empty `es` also yields True, so an XPath that
            # matches nothing on a page counts as a pass -- confirm intended.
            return all(cmp_urls(a.get("href"), url) for a in es)

        first_url, second_url, third_url = (
            self.products_first_second_third[0:3])
        _, first = wget_root(first_url)
        candidates = e_by_url_from_page(first, second_url)

        if self.debug:
            # Two spaces reproduce the output of the former
            # `print "label: ", value` statement exactly.
            print("XPath raw candidates:  %s" % pformat(candidates))

        # Pair each candidate element with the XPaths found for it.
        candidates = [
            (e, EInfo(e).search_for_xpath_ng(
                first, lambda es: is_only_url_in_es(second_url, es)))
            for e in candidates
        ]
        if self.debug:
            print("XPath first candidates:  %s" % pformat(candidates))

        # Re-verify on the second page: the XPath must now point to third_url.
        _, second = wget_root(second_url)

        if self.debug:
            for e, xpaths in candidates:
                for xpath in xpaths:
                    print(xpath)
                    print(third_url)
                    es = second.xpath(xpath)
                    print_es(es)

        # Keep the first XPath that actually passed the re-check.  Previously
        # the first XPath of the first surviving candidate was stored even if
        # a different XPath of that candidate was the one that verified.
        for _e, xpaths in candidates:
            for xpath in xpaths:
                if is_only_url_in_es(third_url, second.xpath(xpath)):
                    self.next_products_xpath = xpath
                    return
Esempio n. 2
0
 def xpath_for_categories(self):
     """Compute ``self.categories_xpath`` from the categories page.

     The page at ``self.categories_url`` is fetched with ``wget_root``.  The
     result of ``grouped_hrefs_from_page_sets`` is passed through
     ``map_list_list`` with a function taking the last item of its argument,
     then filtered with ``is_grouped_hrefs_has_urls`` against
     ``self.products``.  ``xpath_for_es`` is called on every remaining group
     and the list of results is stored in ``self.categories_xpath``.
     """
     _, page_root = wget_root(self.categories_url)

     def last_item(entry):
         return entry[-1]

     groups = map_list_list(last_item,
                            grouped_hrefs_from_page_sets(page_root))

     # Filter first, then build the XPaths, so helper calls happen in the
     # same two phases as before.
     matching_groups = [
         group for group in groups
         if is_grouped_hrefs_has_urls(group, self.products)
     ]
     self.categories_xpath = [
         xpath_for_es(group, page_root) for group in matching_groups
     ]
Esempio n. 3
0
 def xpath_for_products(self):
     """Run ``traversal`` on the first product page and keep the result.

     Fetches ``self.products[0]`` with ``wget_root``, calls ``traversal`` on
     the returned root with fixed tuning values, stores the outcome in
     ``self.products_ess`` and prints it through ``print_ess``.
     """
     first_product_url = self.products[0]
     _, page_root = wget_root(first_product_url)

     traversal_options = dict(
         min_elements=4,
         mintreeheight=3,
         maxtreeheight=4,
         maxmismatch=0.28,
     )
     found = traversal(page_root, 1, **traversal_options)
     # TODO: grouping by level
     # TODO: finding xpath
     self.products_ess = found
     print_ess(found, root=page_root)
Esempio n. 4
0
 def xpath_for_categories(self):
     """Derive the category XPaths and store them on the instance.

     Steps, in order: fetch ``self.categories_url`` via ``wget_root``; build
     href groups with ``grouped_hrefs_from_page_sets``; apply
     ``map_list_list`` with a take-the-last-item function; keep the groups
     accepted by ``is_grouped_hrefs_has_urls(group, self.products)``; call
     ``xpath_for_es`` on each kept group.  The resulting list is assigned to
     ``self.categories_xpath``.
     """
     _, document = wget_root(self.categories_url)

     def tail(item):
         return item[-1]

     href_groups = grouped_hrefs_from_page_sets(document)
     href_groups = map_list_list(tail, href_groups)

     # Selection and XPath construction stay two separate passes.
     selected = [
         hrefs for hrefs in href_groups
         if is_grouped_hrefs_has_urls(hrefs, self.products)
     ]
     self.categories_xpath = [
         xpath_for_es(hrefs, document) for hrefs in selected
     ]
Esempio n. 5
0
    def xpath_for_products_next(self):
        """Find an XPath for the link leading to the next products page.

        The first three entries of ``self.products_first_second_third`` are
        taken as the first, second and third page URLs.  Candidate elements
        come from ``e_by_url_from_page`` on the first page; for each one,
        ``EInfo(e).search_for_xpath_ng`` is asked for XPaths whose matches on
        the first page all compare equal (per ``cmp_urls``) to the second
        URL.  Every such XPath is then evaluated on the second page, where
        its matches must all compare equal to the third URL.

        The first XPath passing that second check is stored in
        ``self.next_products_xpath``; the attribute is not assigned when no
        XPath passes.  With a truthy ``self.debug``, intermediate results
        are printed.

        Raises ``ValueError`` (from the unpacking) if fewer than three URLs
        are available.
        """

        def is_only_url_in_es(url, es):
            # True when every element's href is accepted by cmp_urls for `url`.
            # NOTE(review): an empty `es` also yields True, so an XPath that
            # matches nothing on a page counts as a pass -- confirm intended.
            return all(cmp_urls(a.get("href"), url) for a in es)

        urls = self.products_first_second_third[0:3]
        first_url, second_url, third_url = urls
        _, first = wget_root(first_url)
        candidates = e_by_url_from_page(first, second_url)

        if self.debug:
            # Two spaces reproduce the output of the former
            # `print "label: ", value` statement exactly.
            print("XPath raw candidates:  %s" % pformat(candidates))

        def points_to_second(es):
            return is_only_url_in_es(second_url, es)

        # Pair each candidate element with the XPaths found for it.
        candidates = [
            (e, EInfo(e).search_for_xpath_ng(first, points_to_second))
            for e in candidates
        ]
        if self.debug:
            print("XPath first candidates:  %s" % pformat(candidates))

        # Re-verify on the second page: the XPath must now point to third_url.
        _, second = wget_root(second_url)

        if self.debug:
            for e, xpaths in candidates:
                for xpath in xpaths:
                    print(xpath)
                    print(third_url)
                    es = second.xpath(xpath)
                    print_es(es)

        # Store the first XPath that actually passed the re-check.  The old
        # code stored the first XPath of the first surviving candidate, which
        # was not necessarily the XPath that had verified.
        for _e, xpaths in candidates:
            for xpath in xpaths:
                if is_only_url_in_es(third_url, second.xpath(xpath)):
                    self.next_products_xpath = xpath
                    return
Esempio n. 6
0
 def xpath_for_products(self):
     """Traverse the first product page and remember what was found.

     ``self.products[0]`` is fetched with ``wget_root``; the returned root is
     handed to ``traversal`` with fixed tuning values.  The result is saved
     as ``self.products_ess`` and printed with ``print_ess``.
     """
     _, document = wget_root(self.products[0])

     tuning = {
         "min_elements": 4,
         "mintreeheight": 3,
         "maxtreeheight": 4,
         "maxmismatch": 0.28,
     }
     element_sets = traversal(document, 1, **tuning)
     # TODO: grouping by level
     # TODO: finding xpath
     self.products_ess = element_sets
     print_ess(element_sets, root=document)