def _get_result_with_stack(self, stack, soup, url, attr_fuzz_ratio, **kwargs):
    """Resolve a stack rule against ``soup``, following every matching branch.

    The entries of ``stack['content']`` are applied level by level: item 0
    and item 1 of an entry (tag name and attrs) are handed to ``findAll``
    with ``recursive=False`` on each node kept from the previous level.

    Args:
        stack: Rule mapping. Reads ``'content'``, ``'wanted_attr'``,
            ``'is_full_url'`` and, optionally, ``'is_non_rec_text'``
            (defaults to ``False``).
        soup: Node the walk starts from; must provide ``findAll``.
        url: Forwarded unchanged to ``_fetch_result_from_child``.
        attr_fuzz_ratio: When below ``1.0`` the attrs of each entry are
            first passed through ``_get_fuzzy_attrs``.
        **kwargs: Only ``contain_sibling_leaves`` is read (default
            ``False``).

    Returns:
        A list of ``ResultItem`` objects, one per node reached at the last
        level, excluding those whose ``text`` is falsy.
    """
    path = stack['content']
    keep_siblings = kwargs.get('contain_sibling_leaves', False)
    last_level = len(path) - 1

    frontier = [soup]
    for depth, step in enumerate(path):
        next_frontier = []
        for node in frontier:
            matcher = step[1]
            if attr_fuzz_ratio < 1.0:
                matcher = self._get_fuzzy_attrs(matcher, attr_fuzz_ratio)

            matches = node.findAll(step[0], matcher, recursive=False)
            if not matches:
                continue

            if depth == last_level and not keep_siblings:
                # Keep a single leaf per parent: the position recorded in
                # the previous entry, clamped to the last available match.
                position = min(len(matches) - 1, path[depth - 1][2])
                matches = [matches[position]]

            next_frontier.extend(matches)

        frontier = next_frontier

    wanted_attr = stack['wanted_attr']
    is_full_url = stack['is_full_url']
    is_non_rec_text = stack.get('is_non_rec_text', False)

    def build_item(node):
        fetched = self._fetch_result_from_child(
            node, wanted_attr, is_full_url, url, is_non_rec_text
        )
        return ResultItem(fetched, getattr(node, 'child_index', 0))

    items = [build_item(node) for node in frontier]
    return [entry for entry in items if entry.text]
def _get_result_with_stack_index_based(self, stack, soup, url, attr_fuzz_ratio, **kwargs):
    """Resolve a stack rule by following exactly one child per level.

    The walk starts at the first direct child of ``soup``. For each pair of
    consecutive entries in ``stack["content"]``, the later entry's tag name
    and attrs (items 0 and 1) are looked up with ``findAll`` and
    ``recursive=False`` on the current node, and the earlier entry's item 2
    selects which match to descend into (clamped to the last match).
    Entries whose tag name is ``"[document]"`` are skipped.

    Args:
        stack: Rule mapping. Reads ``"content"``, ``"wanted_attr"``,
            ``"is_full_url"`` and, optionally, ``"is_non_rec_text"``.
        soup: Parsed document; must provide ``findChildren``.
        url: Forwarded unchanged to ``_fetch_result_from_child``.
        attr_fuzz_ratio: When below ``1.0`` the attrs of each entry are
            first passed through ``_get_fuzzy_attrs``.
        **kwargs: Accepted for signature compatibility; not read here.

    Returns:
        A list holding the single ``ResultItem`` for the node reached, or
        an empty list when the document has no child element, when some
        level has no match, or when the item's ``text`` is falsy.
    """
    roots = soup.findChildren(recursive=False)
    if not roots:
        # Nothing to descend from; report "no result" like any other
        # unmatched level instead of raising IndexError.
        return []
    node = roots[0]

    stack_content = stack["content"]
    for item, content in zip(stack_content, stack_content[1:]):
        if item[0] == "[document]":
            continue
        attrs = content[1]
        if attr_fuzz_ratio < 1.0:
            attrs = self._get_fuzzy_attrs(attrs, attr_fuzz_ratio)
        matches = node.findAll(content[0], attrs, recursive=False)
        if not matches:
            return []
        # Clamp so a shorter sibling list still yields its last match.
        node = matches[min(len(matches) - 1, item[2])]

    result_item = ResultItem(
        self._fetch_result_from_child(
            node,
            stack["wanted_attr"],
            stack["is_full_url"],
            url,
            # Optional key: rules that lack it are treated as False, the
            # same way _get_result_with_stack reads it.
            stack.get("is_non_rec_text", False),
        ),
        getattr(node, "child_index", 0),
    )
    return [result_item] if result_item.text else []
def _get_result_with_stack_index_based(self, stack, soup, url, **kwargs):
    """Resolve a stack rule by following exactly one child per level.

    The walk starts at the first direct child of ``soup``. For each pair of
    consecutive entries in ``stack['content']``, the later entry's tag name
    and attrs (items 0 and 1) are looked up with ``findAll`` and
    ``recursive=False`` on the current node, and the earlier entry's item 2
    selects which match to descend into (clamped to the last match).

    Args:
        stack: Rule mapping. Reads ``'content'``, ``'wanted_attr'`` and
            ``'is_full_url'``.
        soup: Parsed document; must provide ``findChildren``.
        url: Forwarded unchanged to ``_fetch_result_from_child``.
        **kwargs: Accepted for signature compatibility; not read here.

    Returns:
        A list holding the single ``ResultItem`` for the node reached, or
        an empty list when the document has no child element, when some
        level has no match, or when the item's ``text`` is falsy.
    """
    roots = soup.findChildren(recursive=False)
    if not roots:
        # Nothing to descend from; report "no result" like any other
        # unmatched level instead of raising IndexError.
        return []
    node = roots[0]

    stack_content = stack['content']
    for item, content in zip(stack_content, stack_content[1:]):
        matches = node.findAll(content[0], content[1], recursive=False)
        if not matches:
            return []
        # Clamp so a shorter sibling list still yields its last match.
        node = matches[min(len(matches) - 1, item[2])]

    result_item = ResultItem(
        self._fetch_result_from_child(
            node, stack['wanted_attr'], stack['is_full_url'], url
        ),
        getattr(node, 'child_index', 0),
    )
    return [result_item] if result_item.text else []