def search_regulars(self):
    """
        Search urls inside the <A> tags

        Parses self.response.raw_html and visits every <a> element that
        carries an href attribute. An href whose parsed form has no
        network location is first passed through self._fix_url; every
        href is then passed through self._normalize_url before being
        collected.

        Returns a set with the resulting urls (duplicates collapse).
    """
    document = XPathExtractor().get_object(self.response.raw_html)
    found = set()

    for anchor in document.xpath("//a"):
        attributes = anchor.attrib

        # Anchors without an href have nothing to follow
        if "href" not in attributes:
            continue

        href = attributes["href"]

        # No netloc means the href is not a full url on its own
        # (presumably a relative link that _fix_url completes)
        if urlparse.urlparse(href).netloc:
            candidate = href
        else:
            candidate = self._fix_url(href)

        found.add(self._normalize_url(candidate))

    return found
def get_urls(self, response):
    """
        Returns a list of urls found in the current html page

        Two passes are made over response.raw_html:

        1. Every match of self._url_regex anywhere in the markup is
           collected as-is.
        2. Every <a> tag whose href does NOT match self._url_regex at
           its start is resolved against response.url and collected.
           Hrefs that do match are not appended in this pass
           (presumably the first pass already picked them up).

        The result is a list, so duplicates are kept.
    """
    html = response.raw_html

    # First pass: raw regex scan over the whole document
    urls = [url_match.group(0) for url_match in self._url_regex.finditer(html)]

    # Second pass: complete the links the regex does not recognise
    tree = XPathExtractor().get_object(html)

    for link_tag in tree.xpath("//a"):

        if "href" not in link_tag.attrib:
            continue

        url = link_tag.attrib["href"]

        if not self._url_regex.match(url):
            # urljoin resolves the reference against the page url.
            # Plain "scheme://netloc" + href concatenation is only right
            # for root-relative paths: "page.html" would be glued onto
            # the host name, and "../x" or "//host/x" would come out
            # wrong as well.
            urls.append(urlparse.urljoin(response.url, url))

    return urls
def _highlight_nodes(self, html, nodes):
    """
        Highlights the nodes selected by the user in the current page

        html is parsed into a tree and each entry of nodes is evaluated
        against it as an xpath expression. For the first element matched
        by an expression, SELECTED_CLASS is appended to its class
        attribute and its id attribute is overwritten with the
        expression itself. Expressions without matches are ignored.

        Returns the modified tree serialized as pretty-printed html.
    """
    document = XPathExtractor().get_object(html)

    for expression in nodes:
        matches = document.xpath(expression)

        # Nothing in the page corresponds to this expression
        if not matches:
            continue

        # Only the first match gets marked
        element = matches[0]

        # strip() removes the leading space left when there was no
        # previous class value
        previous = element.attrib.get("class", "")
        element.attrib["class"] = ("%s %s" % (previous, SELECTED_CLASS)).strip()
        element.attrib["id"] = expression

    root = document.getroot()
    return etree.tostring(root, pretty_print=True, method="html")
class HTMLFixer(object):
    """
        Rewrites the resource references of an html document so that
        the ones not recognised by url_regex are prefixed with url
    """

    def __init__(self, url_regex, url, html):
        """
            url_regex: compiled pattern; a reference matching it at its
                       start is left untouched
            url: prefix prepended to every other reference
            html: markup to be parsed and fixed
        """

        self._url_regex = url_regex
        self.url = url
        self.html_tree = XPathExtractor().get_object(html)

    def get_fixed_html(self):
        """
            Fixes the href of the <link> tags and the src of the <img>
            tags, then returns the tree serialized as pretty-printed
            html
        """

        self._fix_tags("link", "href")
        self._fix_tags("img", "src")

        return etree.tostring(self.html_tree.getroot(), pretty_print=True, method="html")

    def _fix_tags(self, tag, attrib):
        """
            Prefixes with self.url the given attribute of every <tag>
            element whose current value does not match the url regex.
            The tree is modified in place.
        """

        for element in self.html_tree.xpath("//%s" % tag):

            value = element.attrib.get(attrib)

            # Elements lacking the attribute (an <img> without src, a
            # <link> without href) are left alone instead of raising
            # KeyError
            if value is None:
                continue

            if not self._url_regex.match(value):
                # NOTE(review): a value starting with "/" yields a double
                # slash after self.url - confirm whether that matters
                element.attrib[attrib] = "%s/%s" % (self.url, value)