def xpath_for_products_next(self):
    """Discover an XPath for the "next page" link of a product listing.

    Reads the first three URLs of ``self.products_first_second_third``
    (first, second and third listing pages):

    1. Downloads the first page and asks ``e_by_url_from_page`` for the
       candidate elements related to the second URL.
    2. For every candidate collects XPaths from
       ``EInfo(e).search_for_xpath_ng``, passing a predicate that accepts
       an element set only when ``is_only_url_in_es(second_url, es)``
       holds.
    3. Downloads the second page and re-checks each XPath there against
       the third URL.

    The first XPath that passes step 3 is stored in
    ``self.next_products_xpath``.  When nothing passes, the attribute is
    left untouched.

    Raises:
        ValueError: from the tuple unpacking when fewer than three URLs
            are available in ``self.products_first_second_third``.
    """
    def is_only_url_in_es(url, es):
        # True when cmp_urls() is truthy for the href of every element in
        # `es` (presumably "same URL" -- confirm against cmp_urls).
        # NOTE(review): this is vacuously True for an empty `es`.
        return all(cmp_urls(a.get("href"), url) for a in es)

    first_url, second_url, third_url = self.products_first_second_third[0:3]

    _, first = wget_root(first_url)
    candidates = e_by_url_from_page(first, second_url)
    if self.debug:
        # Single-argument print keeps the Python 2 output byte-identical
        # while also parsing under Python 3.
        print("%s %s" % ("XPath raw candidates: ", pformat(candidates)))

    # Pair every candidate element with the XPaths found for it on the
    # first page.
    candidates = [
        (e, EInfo(e).search_for_xpath_ng(
            first, lambda es: is_only_url_in_es(second_url, es)))
        for e in candidates
    ]
    if self.debug:
        print("%s %s" % ("XPath first candidates: ", pformat(candidates)))

    # Re-verify on the second page: the XPath must lead to third_url there.
    _, second = wget_root(second_url)
    if self.debug:
        for e, xpaths in candidates:
            for xpath in xpaths:
                print(xpath)
                print(third_url)
                es = second.xpath(xpath)
                print_es(es)

    # Store the first XPath that really passed the second -> third check.
    # The previous code kept a candidate if *any* of its XPaths verified
    # but then always stored that candidate's first XPath, which could be
    # one that had failed the check.
    for e, xpaths in candidates:
        for xpath in xpaths:
            if is_only_url_in_es(third_url, second.xpath(xpath)):
                self.next_products_xpath = xpath
                return
def xpath_for_categories(self):
    """Compute the XPaths of the category link groups.

    Downloads ``self.categories_url``, obtains grouped hrefs from
    ``grouped_hrefs_from_page_sets``, applies ``x[-1]`` to them through
    ``map_list_list``, keeps only the groups for which
    ``is_grouped_hrefs_has_urls(group, self.products)`` is truthy, and
    stores one ``xpath_for_es(group, root)`` result per remaining group in
    ``self.categories_xpath`` (a list).
    """
    _page_text, page_root = wget_root(self.categories_url)

    # presumably map_list_list maps element-wise over a list of lists --
    # confirm against its definition
    href_groups = map_list_list(
        lambda item: item[-1],
        grouped_hrefs_from_page_sets(page_root))

    # Filtering and XPath generation stay two separate passes so the
    # helpers are called in the same order as before.
    matching_groups = [
        group for group in href_groups
        if is_grouped_hrefs_has_urls(group, self.products)
    ]
    self.categories_xpath = [
        xpath_for_es(group, page_root) for group in matching_groups
    ]
def xpath_for_products(self):
    """Run ``traversal`` on the first products page and record the result.

    Downloads ``self.products[0]``, calls ``traversal`` on the page root
    with fixed thresholds, stores the returned value in
    ``self.products_ess`` and prints it through ``print_ess``.
    """
    _page_text, page_root = wget_root(self.products[0])

    # Fixed thresholds handed to traversal(); their exact meaning is
    # defined by traversal() itself.
    traversal_options = {
        "min_elements": 4,
        "mintreeheight": 3,
        "maxtreeheight": 4,
        "maxmismatch": 0.28,
    }
    element_sets = traversal(page_root, 1, **traversal_options)

    # TODO: grouping by level
    # TODO: finding xpath
    self.products_ess = element_sets
    print_ess(element_sets, root=page_root)
def xpath_for_categories(self):
    """Compute the XPaths of the category link groups.

    Downloads ``self.categories_url``, obtains grouped hrefs from
    ``grouped_hrefs_from_page_sets``, applies ``x[-1]`` to them through
    ``map_list_list``, keeps only the groups for which
    ``is_grouped_hrefs_has_urls(group, self.products)`` is truthy, and
    stores one ``xpath_for_es(group, root)`` result per remaining group in
    ``self.categories_xpath`` (a list).
    """
    _page_text, page_root = wget_root(self.categories_url)

    # presumably map_list_list maps element-wise over a list of lists --
    # confirm against its definition
    href_groups = map_list_list(
        lambda item: item[-1],
        grouped_hrefs_from_page_sets(page_root))

    # Filtering and XPath generation stay two separate passes so the
    # helpers are called in the same order as before.
    matching_groups = [
        group for group in href_groups
        if is_grouped_hrefs_has_urls(group, self.products)
    ]
    self.categories_xpath = [
        xpath_for_es(group, page_root) for group in matching_groups
    ]
def xpath_for_products_next(self):
    """Discover an XPath for the "next page" link of a product listing.

    Reads the first three URLs of ``self.products_first_second_third``
    (first, second and third listing pages):

    1. Downloads the first page and asks ``e_by_url_from_page`` for the
       candidate elements related to the second URL.
    2. For every candidate collects XPaths from
       ``EInfo(e).search_for_xpath_ng``, passing a predicate that accepts
       an element set only when ``is_only_url_in_es(second_url, es)``
       holds.
    3. Downloads the second page and re-checks each XPath there against
       the third URL.

    The first XPath that passes step 3 is stored in
    ``self.next_products_xpath``.  When nothing passes, the attribute is
    left untouched.

    Raises:
        ValueError: from the tuple unpacking when fewer than three URLs
            are available in ``self.products_first_second_third``.
    """
    def is_only_url_in_es(url, es):
        # True when cmp_urls() is truthy for the href of every element in
        # `es` (presumably "same URL" -- confirm against cmp_urls).
        # NOTE(review): this is vacuously True for an empty `es`.
        return all(cmp_urls(a.get("href"), url) for a in es)

    first_url, second_url, third_url = self.products_first_second_third[0:3]

    _, first = wget_root(first_url)
    candidates = e_by_url_from_page(first, second_url)
    if self.debug:
        # Single-argument print keeps the Python 2 output byte-identical
        # while also parsing under Python 3.
        print("%s %s" % ("XPath raw candidates: ", pformat(candidates)))

    # Pair every candidate element with the XPaths found for it on the
    # first page.
    candidates = [
        (e, EInfo(e).search_for_xpath_ng(
            first, lambda es: is_only_url_in_es(second_url, es)))
        for e in candidates
    ]
    if self.debug:
        print("%s %s" % ("XPath first candidates: ", pformat(candidates)))

    # Re-verify on the second page: the XPath must lead to third_url there.
    _, second = wget_root(second_url)
    if self.debug:
        for e, xpaths in candidates:
            for xpath in xpaths:
                print(xpath)
                print(third_url)
                es = second.xpath(xpath)
                print_es(es)

    # Store the first XPath that really passed the second -> third check.
    # The previous code kept a candidate if *any* of its XPaths verified
    # but then always stored that candidate's first XPath, which could be
    # one that had failed the check.
    for e, xpaths in candidates:
        for xpath in xpaths:
            if is_only_url_in_es(third_url, second.xpath(xpath)):
                self.next_products_xpath = xpath
                return