def test_dlurljoin():
    """dlurljoin must resolve references against a base URL like an RFC 3986 join."""
    # (base, reference, expected resolved URL)
    cases = [
        ('http://a.b/', 'f', 'http://a.b/f'),
        ('http://a.b/page', 'f', 'http://a.b/f'),
        ('http://a.b/dir/', 'f', 'http://a.b/dir/f'),
        ('http://a.b/dir/', 'http://url', 'http://url'),
        ('http://a.b/dir/', '/', 'http://a.b/'),
        ('http://a.b/dir/', '/x/y', 'http://a.b/x/y'),
    ]
    for base, ref, expected in cases:
        eq_(dlurljoin(base, ref), expected)
def _select_and_extract(self, selector, query, data):
    """Yield ``(anchor_element, enriched_data)`` for matching ``<a>`` links.

    Iterates every ``<a>`` element in *selector*, resolves its ``@href``
    against ``data['url']`` (when present), and matches *query* against
    either the resolved URL or the anchor text, depending on
    ``self._TARGET`` (``'href'`` or ``'text'``).  For each match, a copy
    of *data* is enriched with the regex's named groups plus the keys
    ``'url'``, ``'url_href'`` and ``'url_text'``.

    Parameters
    ----------
    selector : object
        XPath-capable selector (parsel/scrapy style — TODO confirm) whose
        ``xpath(...).extract_first()`` yields strings.
    query : str
        Regex with named groups, matched from the start of the target.
    data : dict
        Carry-over data; ``data.get('url')`` is used as the base URL.

    Raises
    ------
    ValueError
        If ``self._TARGET`` is neither ``'href'`` nor ``'text'``.
    """
    base_url = data.get('url', None)
    pattern = re.compile(query)
    for anchor in selector.xpath('//a'):
        href = anchor.xpath('@href').extract_first()
        # an <a> without an href attribute carries no link to follow
        if not href:
            continue
        # resolve relative references against the originating URL, if known
        full_url = dlurljoin(base_url, href) if base_url else href
        if self._TARGET == 'href':
            candidate = full_url
        elif self._TARGET == 'text':
            candidate = anchor.xpath('text()').extract_first()
        else:
            raise ValueError("Unknown _TARGET=%r" % (self._TARGET, ))
        matched = pattern.match(candidate)
        if not matched:
            continue
        # enrich a copy of data with the regex's named groups
        enriched = data.copy()
        enriched.update(matched.groupdict())
        # TODO: such actions we might want to perform also in other cases,
        # e.g. operating on some extracted with XPATH content
        enriched['url'] = full_url
        enriched['url_href'] = href
        enriched['url_text'] = anchor.xpath('text()').extract_first()
        lgr.log(5, "Matched %(url)s" % enriched)
        yield anchor, enriched