def test_generate_start_urls_from_params(self):
    """Check the URLs generated from the 'params' spec.

    The expected list is the full query x location grid, with the query
    varying slowest and the location varying fastest.
    """
    url_template = (
        "https://encrypted.google.com/search?hl=en&q=%s&location=%s")
    queries = ("nosetests", "tox")
    locations = ("dublin", "cork")
    expected = [
        url_template % (query, location)
        for query in queries
        for location in locations
    ]

    generator = UrlGenerator()
    first_spec = self.specs['params'][0]
    self.assertEqual(expected, list(generator(first_spec)))
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    """Initialise the spider from its specification.

    The start-URL generators and the generic form helper are created
    before the base class initialiser runs. All further configuration is
    applied to a deep copy of ``spec``, so the caller's object is not the
    one handed to the helper methods below.

    :param name: spider name, forwarded to the base class.
    :param spec: spider specification mapping (copied before use).
    :param item_schemas: forwarded to ``_configure_plugins``.
    :param all_extractors: forwarded to ``_configure_plugins``.
    :param settings: optional settings object, passed to the URL
        generator, the JS configuration and the plugin configuration.
    :param kw: extra spider arguments.
    """
    # Keyword-form dict keeps the same keys and construction order.
    self.start_url_generators = dict(
        start_urls=IdentityGenerator(),
        generated_urls=UrlGenerator(settings, kw),
        url=IdentityGenerator(),
        feed=FeedGenerator(self.parse),
        generated=FragmentGenerator(),
    )
    self.generic_form = GenericForm(**kw)
    super(IblSpider, self).__init__(name, **kw)

    # Work on a private copy from here on.
    spider_spec = deepcopy(spec)
    self._add_spider_args_to_spec(spider_spec, kw)
    self._configure_js(spider_spec, settings)
    self.plugins = self._configure_plugins(
        settings, spider_spec, item_schemas, all_extractors)

    self.login_requests = []
    self.form_requests = []
    self._start_urls = self._create_start_urls(spider_spec)
    self._start_requests = self._create_start_requests(spider_spec)
    self._create_init_requests(spider_spec)
    self._add_allowed_domains(spider_spec)
    self.page_actions = spider_spec.get('page_actions', [])
def setUp(self):
    """Create the mapping of start-URL type names to generator objects."""
    generators = {}
    generators['start_urls'] = IdentityGenerator()
    generators['generated_urls'] = UrlGenerator()
    # 'url' gets its own IdentityGenerator instance, not a shared one.
    generators['url'] = IdentityGenerator()
    generators['generated'] = FragmentGenerator()
    self.generators = generators
def test_generate_start_urls_from_date(self):
    """Check the 'dates' spec against a URL built from today's year/month."""
    today = datetime.now()
    generator = UrlGenerator()
    first_spec = self.specs['dates'][0]

    # Month is zero-padded to two digits; the year is inserted as-is.
    expected_url = "http://www.commitstrip.com/en/{}/{:02}".format(
        today.year, today.month)

    generated = list(generator(first_spec))
    self.assertEqual([expected_url], generated)
def test_generate_start_urls_from_spider_arg(self):
    """Check the 'spider_args' spec using values passed as spider args."""
    spider_args = {
        'categories': ['cars-for-sale', 'houses-for-sale'],
        'sections': ['pets-for-sale', 'kitchens-for-sale'],
    }
    generator = UrlGenerator(spider_args=spider_args)
    first_spec = self.specs['spider_args'][0]

    generated = list(generator(first_spec))
    self.assertEqual(self.donedeal_start_urls, generated)
def test_generate_start_urls_from_params_range(self):
    """Check the 'params_range' spec against p = 20 and 25."""
    url_template = "http://www.smbc-comics.com/index.php?p=%s&q=comic"
    expected = []
    for page in range(20, 30, 5):
        expected.append(url_template % page)

    generator = UrlGenerator()
    first_spec = self.specs['params_range'][0]
    self.assertEqual(expected, list(generator(first_spec)))
def test_generate_start_urls_from_range(self):
    """Check the 'range' spec against a descending run of ten ids."""
    url_template = "https://www.donedeal.ie/cars-for-sale/i/%s"
    # Counts down from 100000010 to 100000001 inclusive.
    descending_ids = range(100000010, 100000000, -1)
    expected = [url_template % item_id for item_id in descending_ids]

    generator = UrlGenerator()
    first_spec = self.specs['range'][0]
    self.assertEqual(expected, list(generator(first_spec)))
def test_generate_start_urls_from_setting(self):
    """Check the 'settings' spec using values supplied through Settings.

    'categories' is given as a comma-separated string while 'sections'
    is given as a list.
    """
    setting_values = {
        'categories': 'cars-for-sale,houses-for-sale',
        'sections': ['pets-for-sale', 'kitchens-for-sale'],
    }
    settings = Settings(values=setting_values)
    generator = UrlGenerator(settings)
    first_spec = self.specs['settings'][0]

    generated = list(generator(first_spec))
    self.assertEqual(self.donedeal_start_urls, generated)
def test_misconfigured_start_urls_spec_type(self):
    """A path entry of type 'defaults' is expected to yield no URLs."""
    path_entry = {
        "type": "defaults",
        "values": ["index.php"],
    }
    url_spec = {
        "template": "http://www.smbc-comics.com/{}",
        "paths": [path_entry],
        "params": [],
        "params_template": {},
    }
    generator = UrlGenerator()
    self.assertEqual([], list(generator(url_spec)))
def test_missing_arg_for_start_urls_spec(self):
    """A value looked up in the wrong source is expected to yield no URLs.

    'home' is supplied only through settings and 'index' only through
    spider args; each spec below asks for the name from the other source.
    """
    def build_spec(path_type, value_name):
        # Same template each time; only the path source and name differ.
        return {
            "template": "http://www.smbc-comics.com/{}",
            "paths": [{
                "type": path_type,
                "values": [value_name],
            }],
            "params": [],
            "params_template": {},
        }

    settings = Settings(values={'home': 'home.php'})
    generator = UrlGenerator(settings, {'index': 'index.php'})

    from_spider_args = build_spec("spider_args", "home")
    self.assertEqual([], list(generator(from_spider_args)))

    from_settings = build_spec("settings", "index")
    self.assertEqual([], list(generator(from_settings)))
def __init__(self, name, spec, item_schemas, all_extractors, settings=None,
             **kw):
    """Initialise the spider from its specification.

    The start-URL generators and the generic form helper are created
    before the base class initialiser runs. All further configuration is
    applied to a deep copy of ``spec``.

    :param name: spider name, forwarded to the base class.
    :param spec: spider specification mapping (deep-copied before use).
    :param item_schemas: forwarded to ``_configure_plugins``.
    :param all_extractors: forwarded to ``_configure_plugins``.
    :param settings: optional settings object, passed to the URL
        generator, the JS configuration and the plugin configuration.
    :param kw: extra spider arguments.
    """
    self.start_url_generators = {
        'start_urls': IdentityGenerator(),
        'generated_urls': UrlGenerator(settings, kw),
        'url': IdentityGenerator(),
        'feed': FeedGenerator(self.parse),
        'generated': FragmentGenerator(),
    }
    self.generic_form = GenericForm(**kw)
    super(IblSpider, self).__init__(name, **kw)

    spec = deepcopy(spec)
    self._add_spider_args_to_spec(spec, kw)

    # NOTE: configured actions are only stored here. They do not force
    # 'js_enabled' or extend 'js_enable_patterns'; JS handling is decided
    # solely by _configure_js() below.
    self.actions = spec.get('actions', [])

    self._configure_js(spec, settings)
    self.plugins = self._configure_plugins(
        settings, spec, item_schemas, all_extractors)

    self.login_requests, self.form_requests = [], []
    self._start_urls = self._create_start_urls(spec)
    self._start_requests = self._create_start_requests(spec)
    self._create_init_requests(spec)
    self._add_allowed_domains(spec)
    self.page_actions = spec.get('page_actions', [])
def test_generate_start_urls_from_options(self):
    """Check the 'options' spec against the stored GitHub start URLs."""
    generator = UrlGenerator()
    first_spec = self.specs['options'][0]
    generated = list(generator(first_spec))
    self.assertEqual(self.github_start_urls, generated)
def test_generate_start_urls_from_defaults(self):
    """Check the 'defaults' spec against a single fixed URL."""
    expected = ["https://github.com/scrapinghub"]
    generator = UrlGenerator()
    first_spec = self.specs['defaults'][0]
    generated = list(generator(first_spec))
    self.assertEqual(expected, generated)