Exemple #1
0
    def extract(self, source, url):
        """Collect the distinct ``href`` targets found in *source*.

        The markup is parsed with a recovering XML parser, every element of
        the resulting tree is inspected for an ``href`` attribute, and each
        accepted value is joined with *url* and passed through ``urlquote``.

        Args:
            source: Markup to parse.
            url: Base URL that relative ``href`` values are joined with.

        Returns:
            A list of the unique quoted links, in no particular order.  The
            list is empty when the markup could not be parsed.
        """
        try:
            tree = etree.fromstring(
                source, etree.XMLParser(recover=True, ns_clean=True))
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            self.logger.error('Could not parse HTML %s' %
                              traceback.format_exc())
            return []

        if tree is None:
            # With recover=True the parser may hand back no root element
            # instead of raising; there is nothing to walk in that case.
            self.logger.error('Could not parse HTML %s' %
                              'parser recovered no document')
            return []

        links = set()

        # iter() is the supported replacement for the deprecated
        # getiterator().
        for element in tree.iter():
            href = element.get('href')
            # Skip elements without an href and hrefs that self.expr matches.
            if not href or self.expr.match(href):
                continue

            try:
                # Joining is kept inside the try as well: one malformed href
                # must not abort the extraction of the whole page.
                links.add(urlquote(urljoin(url, href)))
            except Exception:
                # The link (e.g. its scheme) is malformed; log it and move on.
                self.logger.error('XHtmlExtractor %s ' %
                                  traceback.format_exc())

        return list(links)
Exemple #2
0
    def extract(self, source, url):
        """Return the unique, quoted ``href`` links contained in *source*.

        *source* is parsed with a recovering XML parser.  Each element's
        ``href`` attribute is read; empty values and values matched by
        ``self.expr`` are skipped, the rest are joined with *url* and passed
        through ``urlquote``.

        Args:
            source: Markup to parse.
            url: Base URL used to resolve relative ``href`` values.

        Returns:
            A list of distinct links in arbitrary order; an empty list when
            parsing fails.
        """
        parser = etree.XMLParser(recover=True, ns_clean=True)
        try:
            tree = etree.fromstring(source, parser)
        except Exception:
            # Narrower than a bare ``except``: interpreter-exit signals such
            # as KeyboardInterrupt are no longer swallowed here.
            self.logger.error('Could not parse HTML %s' % traceback.format_exc())
            return []

        if tree is None:
            # A recovering parser can return no root element rather than
            # raising; treat that like any other parse failure.
            self.logger.error('Could not parse HTML %s' %
                              'parser recovered no document')
            return []

        links = set()

        # getiterator() is deprecated; iter() walks the same elements.
        for element in tree.iter():
            href = element.get('href')
            if not href or self.expr.match(href):
                continue

            try:
                # urljoin is guarded too so that a single broken href cannot
                # abort processing of the remaining links.
                links.add(urlquote(urljoin(url, href)))
            except Exception:
                # The link (e.g. its scheme) is malformed; log and continue.
                self.logger.error('XHtmlExtractor %s ' % traceback.format_exc())

        return list(links)
Exemple #3
0
    def extract(self, source, url):
        """Collect the quoted URIs referenced by link attributes in *source*.

        The markup is parsed with ``html.fromstring``; its links are passed
        through ``self.ignore`` via ``rewrite_links`` and then made absolute
        against *url*.  For every element whose tag has an entry in ``LINK``,
        each attribute name listed there is read and its non-empty value is
        passed through ``urlquote``.

        Args:
            source: HTML markup to parse.
            url: Base URL used to make links absolute.

        Returns:
            A ``Set`` of quoted URIs; an empty ``Set`` when the markup could
            not be parsed or its links could not be rewritten.
        """
        try:
            tree = html.fromstring(source)
            tree.rewrite_links(self.ignore)
            tree.make_links_absolute(url, resolve_base_href=True)
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.logger.error('Could not parse HTML %s' %
                              traceback.format_exc())
            return Set([])

        uris = Set([])

        # iter() replaces the deprecated getiterator().
        for link in tree.iter():
            keywords = LINK.get(link.tag)
            if not keywords:
                continue

            # One tag may carry several URI attributes, e.g. img: src, lowsrc.
            for word in keywords:
                uri = link.get(word)
                if not uri:
                    continue
                try:
                    uris.add(urlquote(uri))
                except Exception:
                    # The link (e.g. its scheme) is malformed.  Still best
                    # effort, but the failure is logged instead of vanishing.
                    self.logger.error('Could not quote link %s' %
                                      traceback.format_exc())
        return uris
Exemple #4
0
    def extract(self, source, url):
        """Return the set of quoted URIs found in link attributes of *source*.

        After parsing with ``html.fromstring``, the document's links are run
        through ``self.ignore`` (``rewrite_links``) and made absolute relative
        to *url*.  Elements whose tag appears in ``LINK`` contribute the
        non-empty values of the attributes listed for that tag, each passed
        through ``urlquote``.

        Args:
            source: HTML markup to parse.
            url: Base URL used to make links absolute.

        Returns:
            A ``Set`` of quoted URIs, empty when parsing or link rewriting
            fails.
        """
        try:
            tree = html.fromstring(source)
            tree.rewrite_links(self.ignore)
            tree.make_links_absolute(url, resolve_base_href=True)
        except Exception:
            # Not a bare ``except``: interpreter-exit signals must propagate.
            self.logger.error('Could not parse HTML %s' % traceback.format_exc())
            return Set([])

        uris = Set([])

        # getiterator() is deprecated; iter() yields the same elements.
        for link in tree.iter():
            keywords = LINK.get(link.tag)
            if not keywords:
                continue

            # A tag can have more than one URI attribute, e.g. img: src, lowsrc.
            for word in keywords:
                uri = link.get(word)
                if not uri:
                    continue
                try:
                    uris.add(urlquote(uri))
                except Exception:
                    # The link (e.g. its scheme) is malformed; skip it, but
                    # leave a trace in the log rather than failing silently.
                    self.logger.error('Could not quote link %s' % traceback.format_exc())
        return uris