def looking_for_next_page(root, href_sets):
    """Gather candidate "next page" links for a page.

    NOTE(review): another definition with this exact name appears later in
    the file; if both sit at module level, the later one is the binding
    callers actually get.

    Args:
        root: Page root handed straight to ``grouped_hrefs_from_page``.
        href_sets: Iterable of groups; each group is an iterable of
            indexable paths whose last item exposes ``.get("href")``
            (presumably lxml anchor elements -- confirm against caller).

    Returns:
        dict with two keys:
        ``'urls_with_next_somehow'`` -- result of ``grouped_hrefs_from_page``
        for anchors whose id/class contains ``next`` or whose text contains
        ``Next``/``next``;
        ``'urls_grouped_by_levei'`` -- the ``[element, href]`` pairs after
        grouping with ``grouping_list_by_levenstein`` (keyed on the href),
        dropping falsy groups and flattening with ``reduce_list_nesting``;
        every entry additionally gets ``EInfo(entry[0]).create_xpath()``
        appended through ``map_list_list``.
    """
    next_link_xpath = "//a[contains(@id, 'next')] | //a[contains(@class, 'next')] | //a[contains(text(), 'Next')] | //a[contains(text(), 'next')] "

    found = {}
    found['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root, xpath=next_link_xpath)

    # Pair the last element of every path with its href.
    element_href_pairs = [
        [[path[-1], path[-1].get("href")] for path in href_set]
        for href_set in href_sets
    ]
    # Group each set's pairs by href (second item of the pair).
    grouped = [
        grouping_list_by_levenstein(pairs, key=lambda pair: pair[1])
        for pairs in element_href_pairs
    ]
    # Discard empty/falsy grouping results before flattening.
    non_empty = [group for group in grouped if group]
    flattened = reduce_list_nesting(non_empty)

    # Entries are mutated in place: an xpath built from the entry's first
    # item is appended to each one.
    map_list_list(
        lambda entry: entry.append(EInfo(entry[0]).create_xpath()),
        flattened)

    found['urls_grouped_by_levei'] = flattened
    return found
def grouped_hrefs_from_page_sets(root, xpath="//a"):
    """Group the link elements of a page by the shape of their parent path.

    Args:
        root: Page root passed to ``e_with_url_from_root``.
        xpath: XPath selecting the candidate elements (default ``"//a"``).

    Returns:
        Whatever ``reduce_list_nesting`` yields for the per-length groups
        produced by ``grouping_sorted_paths_by_xpath`` (presumably a flat
        list of href sets -- confirm against that helper).
    """
    elements = e_with_url_from_root(root, xpath=xpath)
    paths = [EInfo(element).parentpath() for element in elements]

    # itertools.groupby only merges adjacent items, so the paths have to be
    # sorted by length before they are bucketed by length.
    paths_by_length = sorted(paths, key=len)

    # Unpack (key, group) in the for-target instead of a tuple-parameter
    # lambda: that lambda form is a SyntaxError on Python 3, while this
    # comprehension produces the same lists on Python 2 and 3.
    same_length_groups = [
        list(group)
        for _, group in itertools.groupby(paths_by_length, key=len)
    ]

    grouped = [
        grouping_sorted_paths_by_xpath(group) for group in same_length_groups
    ]
    href_sets = reduce_list_nesting(grouped)
    return href_sets
def looking_for_next_page(root, href_sets):
    """Build a dict of "next page" link candidates for a page.

    NOTE(review): an earlier definition with the same name and the same
    logic appears above in this file; if both are module-level, this later
    one is the binding that survives.

    Args:
        root: Page root forwarded to ``grouped_hrefs_from_page``.
        href_sets: Iterable of groups of indexable paths; the last item of
            each path must support ``.get("href")`` (presumably an lxml
            element -- confirm against caller).

    Returns:
        dict with ``'urls_with_next_somehow'`` (anchors matched by the
        "next" XPath, as returned by ``grouped_hrefs_from_page``) and
        ``'urls_grouped_by_levei'`` (``[element, href]`` entries grouped via
        ``grouping_list_by_levenstein``, flattened via
        ``reduce_list_nesting``, each with a generated xpath appended).
    """
    result = {}
    result['urls_with_next_somehow'] = grouped_hrefs_from_page(
        root,
        xpath="//a[contains(@id, 'next')] | //a[contains(@class, 'next')] | //a[contains(text(), 'Next')] | //a[contains(text(), 'next')] ",
    )

    # Phase 1: for every path keep its last element together with its href.
    paired_sets = []
    for href_set in href_sets:
        pairs = []
        for path in href_set:
            last = path[-1]
            pairs.append([last, last.get("href")])
        paired_sets.append(pairs)

    # Phase 2: group each set by href; kept as a separate pass so helper
    # calls happen in the same order as in the original map() pipeline.
    def href_of(pair):
        return pair[1]

    grouped_sets = []
    for pairs in paired_sets:
        grouped_sets.append(grouping_list_by_levenstein(pairs, key=href_of))

    # Phase 3: drop falsy groupings, then flatten.
    kept = []
    for grouping in grouped_sets:
        if grouping:
            kept.append(grouping)
    flat = reduce_list_nesting(kept)

    # Phase 4: append an xpath derived from each entry's first item
    # (in-place mutation performed through map_list_list).
    def add_xpath(entry):
        return entry.append(EInfo(entry[0]).create_xpath())

    map_list_list(add_xpath, flat)

    result['urls_grouped_by_levei'] = flat
    return result