def search_regulars(self):
    """Search for URLs inside the <a> tags of the current response.

    Relative links (those whose parsed form has no netloc) are first
    made absolute with ``_fix_url``; every URL is then normalized
    before being collected.

    Returns:
        set: the normalized URLs found in the page.
    """
    urls = set()
    tree = XPathExtractor().get_object(self.response.raw_html)
    for link_tag in tree.xpath("//a"):
        # Anchors without an href attribute carry no URL to follow.
        if 'href' not in link_tag.attrib:
            continue
        url = link_tag.attrib["href"]
        if not urlparse.urlparse(url).netloc:
            # No network location means a relative URL: absolutize it.
            url = self._fix_url(url)
        url = self._normalize_url(url)
        urls.add(url)
    return urls
def get_urls(self, response):
    """Return a list of URLs found in the current HTML page.

    First collects every absolute URL matched by ``_url_regex`` in the
    raw HTML, then resolves the relative ``href`` values of <a> tags
    against the page's own URL.

    Args:
        response: crawler response exposing ``raw_html`` and ``url``.

    Returns:
        list: absolute URLs found in the page.
    """
    urls = []
    for url_match in self._url_regex.finditer(response.raw_html):
        urls.append(url_match.group(0))
    tree = XPathExtractor().get_object(response.raw_html)
    for link_tag in tree.xpath("//a"):
        if 'href' not in link_tag.attrib:
            continue
        url = link_tag.attrib["href"]
        if not self._url_regex.match(url):
            # Relative link: resolve it against the page URL.
            # urljoin handles hrefs both with and without a leading
            # "/" correctly, unlike the naive "%s://%s%s" string
            # concatenation (which produced e.g. "http://hostfoo.html"
            # for href "foo.html").
            urls.append(urlparse.urljoin(response.url, url))
    return urls
def _highlight_nodes(self, html, nodes):
    """Mark the user-selected nodes in the given page.

    For every xpath in ``nodes``, the first matching element gets the
    SELECTED_CLASS CSS class appended to its class list and its id set
    to the xpath itself, so the element can be located again later.

    Returns the highlighted document serialized as pretty-printed HTML.
    """
    document = XPathExtractor().get_object(html)
    for xpath in nodes:
        matches = document.xpath(xpath)
        if not matches:
            continue
        node = matches[0]
        existing = node.attrib.get("class", "")
        node.attrib["class"] = ("%s %s" % (existing, SELECTED_CLASS)).strip()
        node.attrib["id"] = xpath
    return etree.tostring(document.getroot(), pretty_print=True, method="html")
class HTMLFixer(object):
    """Rewrites relative resource URLs inside an HTML tree to absolute ones.

    ``url_regex`` recognizes already-absolute URLs; anything it does not
    match is prefixed with the base ``url``.
    """

    def __init__(self, url_regex, url, html):
        # Pattern that recognizes already-absolute URLs.
        self._url_regex = url_regex
        # Base URL prepended to relative references.
        self.url = url
        self.html_tree = XPathExtractor().get_object(html)

    def get_fixed_html(self):
        """Absolutize <link href> and <img src> URLs and return the HTML.

        Returns:
            str: the fixed document, pretty-printed as HTML.
        """
        self._fix_tags("link", "href")
        self._fix_tags("img", "src")
        return etree.tostring(self.html_tree.getroot(), pretty_print=True, method="html")

    def _fix_tags(self, tag, attrib):
        """Prefix ``self.url`` to every relative ``attrib`` value of ``tag`` elements."""
        # NOTE: loop variable renamed from "tag" to "node" so it no
        # longer shadows the "tag" parameter.
        for node in self.html_tree.xpath("//%s" % tag):
            value = node.attrib.get(attrib)
            if value is None:
                # Tag lacks the attribute entirely (e.g. <link> without
                # href) -- the original unconditional indexing raised
                # KeyError here.
                continue
            if not self._url_regex.match(value):
                node.attrib[attrib] = "%s/%s" % (self.url, value)
def execute(self):
    """Fetch the URL given as first argument and open an IPython shell.

    The embedded shell exposes the crawled ``response`` and the parsed
    ``html`` tree for interactive exploration. Exits with an error
    message if IPython is not installed.
    """
    try:
        import IPython
    except ImportError:
        exit_with_error("Please install the ipython console")
    url = self.args[0]
    crawler = BaseCrawler()
    response = crawler._get_response(url)
    html = XPathExtractor().get_object(response)
    # Publish the parsed tree too: it was computed but never made
    # available to the shell in the original code.
    shell = IPython.Shell.IPShellEmbed(argv=[],
                                       user_ns={'response': response,
                                                'html': html})
    shell()
def __init__(self, url_regex, url, html):
    """Store the base url, the absolute-URL pattern and the parsed tree."""
    self.url = url
    self._url_regex = url_regex
    self.html_tree = XPathExtractor().get_object(html)