def _child_has_text(child, text, url, text_fuzz_ratio):
    """Report whether ``child`` carries ``text`` and record where it was found.

    The lookup proceeds in order and stops at the first hit:

    1. The stripped result of ``child.getText()``.
    2. The value returned by ``get_non_rec_text(child)``
       (presumably the element's own, non-recursive text -- helper is
       defined elsewhere; confirm).
    3. Every string-valued attribute of the element; for ``href`` / ``src``
       the value resolved against ``url`` is additionally compared to
       ``text`` with exact equality.

    Side effects on a hit: ``child.wanted_attr`` is set to ``None`` (text
    hit) or to the matching attribute name; ``child.is_non_rec_text`` is set
    to ``True`` for case 2; ``child.is_full_url`` is set to ``True`` when the
    resolved URL matched.

    Args:
        child: Tag-like object exposing ``getText()``, ``parent`` and ``attrs``.
        text: The value being searched for.
        url: Base URL used to resolve ``href`` / ``src`` attribute values.
        text_fuzz_ratio: Passed through unchanged to ``text_match``.

    Returns:
        ``True`` when a match was found and recorded on ``child``, otherwise
        ``False``.
    """
    own_text = child.getText().strip()

    if text_match(text, own_text, text_fuzz_ratio):
        # When the parent yields exactly the same text, this element is
        # rejected outright instead of being recorded as a match.
        if own_text == child.parent.getText().strip():
            return False
        child.wanted_attr = None
        return True

    if text_match(text, get_non_rec_text(child), text_fuzz_ratio):
        child.is_non_rec_text = True
        child.wanted_attr = None
        return True

    for attr_name, raw_value in child.attrs.items():
        # Non-string attribute values are ignored.
        if isinstance(raw_value, str):
            attr_value = raw_value.strip()

            if text_match(text, attr_value, text_fuzz_ratio):
                child.wanted_attr = attr_name
                return True

            # Link-like attributes may hold a relative URL; the resolved
            # absolute form is compared with plain equality, not text_match.
            if attr_name in {'href', 'src'} and text == urljoin(url, attr_value):
                child.wanted_attr = attr_name
                child.is_full_url = True
                return True

    return False
def _fetch_result_from_child(child, wanted_attr, is_full_url, url, is_non_rec_text):
    """Extract the value from ``child`` described by the given lookup flags.

    Args:
        child: Tag-like object exposing ``getText()`` and ``attrs``.
        wanted_attr: Name of the attribute to read, or ``None`` to read the
            element's text instead.
        is_full_url: When reading an attribute, resolve its value against
            ``url`` before returning it.
        url: Base URL used when ``is_full_url`` is truthy.
        is_non_rec_text: When reading text, return
            ``get_non_rec_text(child)`` instead of the stripped
            ``child.getText()``.

    Returns:
        The extracted text or attribute value, or ``None`` when
        ``wanted_attr`` is not present on the element.
    """
    if wanted_attr is not None:
        attributes = child.attrs
        if wanted_attr not in attributes:
            return None
        attr_value = attributes[wanted_attr]
        return urljoin(url, attr_value) if is_full_url else attr_value

    # No attribute requested: fall back to one of the two text flavours.
    return get_non_rec_text(child) if is_non_rec_text else child.getText().strip()