Example #1
0
def test_dlurljoin():
    """Check that dlurljoin resolves relative and absolute references against a base URL."""
    cases = [
        # (base, reference, expected result)
        ('http://a.b/', 'f', 'http://a.b/f'),
        ('http://a.b/page', 'f', 'http://a.b/f'),
        ('http://a.b/dir/', 'f', 'http://a.b/dir/f'),
        ('http://a.b/dir/', 'http://url', 'http://url'),
        ('http://a.b/dir/', '/', 'http://a.b/'),
        ('http://a.b/dir/', '/x/y', 'http://a.b/x/y'),
    ]
    for base, ref, expected in cases:
        eq_(dlurljoin(base, ref), expected)
Example #2
0
    def _select_and_extract(self, selector, query, data):
        """Yield ``(element, data_)`` pairs for ``<a>`` links matching `query`.

        Parameters
        ----------
        selector :
          Parsed page to search for ``<a>`` elements (presumably a
          parsel/scrapy ``Selector`` — TODO confirm against callers).
        query : str
          Regular expression matched (anchored at the start, via ``match``)
          against either the resolved href or the link text, depending on the
          class-level ``_TARGET`` setting ('href' or 'text').
        data : dict
          Accumulated record; ``data['url']``, if present, serves as the base
          URL for resolving relative hrefs via ``dlurljoin``.

        Yields
        ------
        (url_e, data_)
          The matching ``<a>`` element and a copy of `data` enriched with the
          regex's named groups plus 'url', 'url_href' and 'url_text'.

        Raises
        ------
        ValueError
          If ``self._TARGET`` is neither 'href' nor 'text'.
        """
        prev_url = data.get('url', None)
        url_query = re.compile(query)  # compile once, match per link
        for url_e in selector.xpath('//a'):
            url = url_href = url_e.xpath('@href').extract_first()
            if not url:
                # it was an <a> without href
                continue

            # make it a full URL, if there was an original URL
            if prev_url:
                url = dlurljoin(prev_url, url_href)

            # extract the link text once; reused for matching and for output
            url_text = url_e.xpath('text()').extract_first()

            if self._TARGET == 'href':
                regex_target = url
            elif self._TARGET == 'text':
                regex_target = url_text
            else:
                raise ValueError("Unknown _TARGET=%r" % (self._TARGET, ))

            # BUG FIX: an <a> with an href but no text node yields None for
            # url_text; re.match(None) would raise TypeError — skip instead
            if regex_target is None:
                continue

            regex = url_query.match(regex_target)
            if not regex:
                continue

            # enrich data with extracted keywords
            data_ = data.copy()
            data_.update(regex.groupdict())

            # TODO: such actions we might want to perform also in other cases,
            # e.g. operating on some extracted with XPATH content
            data_['url'] = url
            data_['url_href'] = url_href
            data_['url_text'] = url_text
            lgr.log(5, "Matched %(url)s" % data_)

            yield url_e, data_