def test_extract_repeated_field(self):
    sample = {
        'plugins': {'annotations-plugin': {}},
        'url': 'https://stackoverflow.com',
        'original_body': re.sub(
            'data-scrapy-annotate=".*"', '', html_page._body),
        'scrapes': 'default',
        'page_id': '507f520c3bf361f4c5cd55c44307a271bccb2218',
        'version': '0.13.0'
    }
    data = open_spec('so_annotations.json')
    annos, items, results = data['annos'], data['items'], data['results']
    sample['plugins']['annotations-plugin']['extracts'] = annos
    spider = IblSpider('so', make_spider(sample=sample), items, {},
                       Settings())
    page = HtmlResponse('http://url', body=sample['original_body'],
                        encoding='utf-8')
    items = [i for i in spider.parse(page) if not isinstance(i, Request)]
    keys = {(u'_index', u'_template', u'_type', u'answered', u'tags',
             u'title', 'url')}
    self.assertEqual({tuple(sorted(i.keys())) for i in items}, keys)
    self.assertEqual([items[0], items[52], items[-1]], results)
    self.assertEqual(len(items), 96)

    spider, page, results = open_spider_page_and_results(
        'autoevolution.json')
    items = [i for i in spider.parse(page) if not isinstance(i, Request)]
    self.assertEqual(items, results)
def test_trained(self):
    base = 'http://www.daft.ie/ireland/houses-for-sale/?offset={}'.format
    daft_url = base(10)
    spec = {
        'start_urls': [daft_url],
        'links_to_follow': 'auto',
        'respect_nofollow': False,
        'follow_patterns': [],
        'exclude_patterns': [],
        'init_requests': [],
        'templates': [daft_sample]
    }
    settings = Settings()
    settings.set('LOADED_PLUGINS', load_plugins(settings))
    spider = IblSpider('hn', spec, {}, {}, settings=settings)
    request = Request(daft_url)
    response = HtmlResponse(url=daft_url, body=daft_body, request=request,
                            encoding="utf-8")
    data = {
        r.url for r in spider.handle_html(response)
        if isinstance(r, Request)
    }
    self.assertEqual({base(i) for i in (90, 80, 70)}, data)
def test_extract_multiple_item_types(self):
    spider = IblSpider('xceed', xceed_spider, xceed_spider['items'], {},
                       Settings())
    data = list(spider.parse(
        HtmlResponse('http://url',
                     body=xceed_spider['templates'][0]['original_body'],
                     encoding='utf-8')
    ))
    self.assertEqual(data[:6], xceed_spider['results'])
def test_extract_multiple_item_types(self):
    spider = IblSpider('xceed', xceed_spider, xceed_spider['items'], {},
                       Settings())
    data = list(spider.parse(
        HtmlResponse('http://url',
                     body=xceed_spider['templates'][0]['original_body'],
                     encoding='utf-8')
    ))
    items = [d for d in data if not isinstance(d, Request)]
    self.assertEqual(items, xceed_spider['results'])
def test_extract_multiple_item_types(self):
    spider = IblSpider('xceed', xceed_spider, xceed_spider['items'], {},
                       Settings())
    data = list(spider.parse(
        HtmlResponse('http://url',
                     body=xceed_spider['templates'][0]['original_body'],
                     encoding='utf-8')
    ))
    # Order the extracted items by type so the comparison is deterministic.
    items = sorted(
        [d for d in data if not isinstance(d, Request)],
        key=lambda x: ('ticket', 'venue', 'event').index(x['_type']))
    self.assertEqual(items, xceed_spider['results'])
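# A hypothetical helper (not part of slybot) capturing the filtering pattern
# used throughout these tests: spider.parse() yields both extracted items and
# follow-up Requests, so tests separate the two before comparing results.
# A minimal sketch, assuming scrapy.Request is the only non-item type yielded.
def split_output(parsed):
    """Split spider output into (items, requests)."""
    items, requests = [], []
    for obj in parsed:
        (requests if isinstance(obj, Request) else items).append(obj)
    return items, requests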
def create_spider(self, project, auth_info, params, **kwargs):
    spider = params.get('spider')
    if spider is None:
        return None, None
    pspec = self.bot.spec_manager.project_spec(project, auth_info)
    try:
        spider_spec = pspec.resource('spiders', spider)
        spider_spec['templates'] = []
        for template in spider_spec.get('template_names', []):
            try:
                spider_spec['templates'].append(
                    pspec.resource('spiders', spider, template))
            except TypeError:
                # Template names not consistent with templates
                pspec.remove_template(spider, template)
        items_spec = pspec.resource('items')
        extractors = pspec.resource('extractors')
        return (IblSpider(spider, spider_spec, items_spec, extractors,
                          self.bot.runner.settings, **kwargs),
                spider_spec['templates'])
    except IOError as ex:
        if ex.errno == errno.ENOENT:
            log.msg("skipping extraction, no spec: %s" % ex.filename)
            return None, None
        else:
            raise
def open_spider(self, meta, storage=None, project=None):
    if not (meta.get('project') and meta.get('spider')):
        return {'error': 4005, 'reason': 'No project specified'}
    if (self.user.authorized_projects is not None and
            meta['project'] not in self.user.authorized_projects and
            not self.user.staff):
        return {
            'error': 4004,
            'reason': 'Project "%s" not found' % meta['project']
        }
    spider_name = meta['spider']
    if project is None:
        project = Project(storage, id=meta.get('project'))
    try:
        spider_model = project.spiders[spider_name]
    except (IOError, KeyError):
        return {
            'error': 4004,
            'reason': 'Spider "%s" not found' % spider_name
        }
    spider_name, spider, items, extractors = load_spider_data(spider_model)
    if not self.settings.get('SPLASH_URL'):
        self.settings.set('SPLASH_URL', 'portia')
    self.factory[self].spider = IblSpider(spider_name, spider, items,
                                          extractors, self.settings)
    self.factory[self].spiderspec = SpiderSpec(project, spider_name, spider,
                                               items, extractors)
def load_spider(storage, model):
    items = json.load(storage.open_with_default('items.json', '{}'))
    extractors = json.load(storage.open_with_default('extractors.json', '{}'))
    spider = json.loads(model.dumps())
    samples = [json.loads(sample.dumps()) for sample in model.samples]
    spider['templates'] = samples
    return IblSpider(model.id, spider, items, extractors, Settings())
def update_spider(self, meta, spider=None, template=None, items=None,
                  extractors=None):
    if not hasattr(self.factory[self], 'spiderspec'):
        return self.open_spider(meta)
    spec = self.factory[self].spiderspec
    if spec is None or spec.name != meta.get('spider'):
        return self.open_spider(meta)
    items = items or spec.items
    extractors = extractors or spec.extractors
    if spider:
        spider['templates'] = spec.spider['templates']
    else:
        spider = spec.spider
    if template:
        # Replace the matching template in place; if no template has the
        # same original_body, append it as a new one (for/else).
        for idx, tmpl in enumerate(spider['templates']):
            if template['original_body'] == tmpl['original_body']:
                spider['templates'][idx] = template
                break
        else:
            spider['templates'].append(template)
        spider['template_names'] = [t['name'] for t in spider['templates']]
    self.factory[self].spider = IblSpider(meta['spider'], spider, items,
                                          extractors, self.settings)
    self.factory[self].spiderspec = SpiderSpec(meta['spider'], spider, items,
                                               extractors)
def open_spider(self, meta):
    if ('project' not in meta or 'spider' not in meta or
            (self.user.authorized_projects is not None and
             meta['project'] not in self.user.authorized_projects and
             not self.user.staff)):
        return {'error': 4004,
                'reason': 'Project "%s" not found' % meta['project']}
    spider_name = meta['spider']
    spec = self.spec_manager.project_spec(meta['project'], self.user.auth)
    spider = spec.resource('spiders', spider_name)
    items = spec.resource('items')
    extractors = spec.resource('extractors')
    templates = []
    for template in spider.get('template_names', []):
        try:
            templates.append(spec.resource('spiders', spider_name, template))
        except TypeError:
            # Template names not consistent with templates
            spec.remove_template(spider_name, template)
    spider['templates'] = templates
    if not self.settings.get('SPLASH_URL'):
        self.settings.set('SPLASH_URL', 'portia')
    self.factory[self].spider = IblSpider(spider_name, spider, items,
                                          extractors, self.settings)
    self.factory[self].spiderspec = SpiderSpec(spider_name, spider, items,
                                               extractors)
def load_project_data(open_func, spiders_list_func, project_dir):
    """Load project data using provided open_func and project directory."""
    # Load items and extractors from project
    schemas = open_func(project_dir, 'items')
    extractors = open_func(project_dir, 'extractors')
    # Load spiders and templates
    spiders = {}
    spiders_list = spiders_list_func(project_dir)
    for spider_name in spiders_list:
        spider = open_func(project_dir, 'spiders', spider_name)
        if not spider:
            log.warning('Skipping "%s" spider as there is no data',
                        spider_name)
            continue
        if 'template_names' in spider:
            samples = spider.get('template_names', [])
            spider['templates'] = []
            for sample_name in samples:
                sample = open_func(project_dir, 'spiders', spider_name,
                                   sample_name)
                _build_sample(sample)
                spider['templates'].append(sample)
        else:
            for sample in spider.get('templates', []):
                _build_sample(sample)
        spiders[spider_name] = (IblSpider(spider_name, spider, schemas,
                                          extractors, Settings()),
                                spider)
    return schemas, extractors, spiders
def open_spider_page_and_results(name):
    sample_spec = open_spec(name)
    schemas = sample_spec['schemas']
    results = sample_spec['results']
    page = UTF8HtmlResponse('http://url', body=sample_spec['original_body'])
    spider = IblSpider(name, make_spider(sample=sample_spec), schemas, {},
                       Settings())
    return spider, page, results
def open_spider_page_and_results(name):
    sample_spec = open_spec(name)
    schemas = sample_spec['schemas']
    results = sample_spec['results']
    if 'original_body' not in sample_spec:
        # Fall back to a sibling HTML fixture named after the JSON spec.
        sample_spec['original_body'] = open_spec('{}.html'.format(
            name[:-len('.json')]))
    page = UTF8HtmlResponse('http://url', body=sample_spec['original_body'])
    spider = IblSpider(name, make_spider(sample=sample_spec), schemas, {},
                       Settings())
    return spider, page, results
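# Usage sketch for the helper above, mirroring the call already made in
# test_extract_repeated_field; 'autoevolution.json' is assumed to be one of
# the JSON fixtures shipped alongside these tests.
spider, page, results = open_spider_page_and_results('autoevolution.json')
items = [i for i in spider.parse(page) if not isinstance(i, Request)]
assert items == results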
def test_trained(self):
    base = 'http://www.daft.ie/ireland/houses-for-sale/?offset={}'.format
    daft_url = base(10)
    spec = {
        'start_urls': [daft_url],
        'links_to_follow': 'auto',
        'respect_nofollow': False,
        'follow_patterns': [],
        'exclude_patterns': [],
        'init_requests': [],
        'templates': [daft_sample]
    }
    settings = Settings()
    settings.set('LOADED_PLUGINS', load_plugins(settings))
    spider = IblSpider('hn', spec, {}, {}, settings=settings)
    request = Request(daft_url)
    response = UTF8HtmlResponse(url=daft_url, body=daft_body, request=request)
    data = {r.url for r in spider.handle_html(response)
            if isinstance(r, Request)}
    self.assertEqual({base(i) for i in (90, 80, 70)}, data)
def load_spider(spec, spider_name):
    try:
        spider = spec.spider_with_templates(spider_name)
    except (TypeError, ValueError):
        raise BadRequest('The spider %s could not be found' % spider_name)
    try:
        items = spec.resource('items')
    except (TypeError, ValueError):
        items = {}
    try:
        extractors = spec.resource('extractors')
    except (TypeError, ValueError):
        extractors = {}
    return IblSpider(spider_name, spider, items, extractors, Settings())
def create_spider(self, project, params, **kwargs):
    spider = params.get('spider')
    if spider is None:
        return
    pspec = self.bot.spec_manager.project_spec(project)
    try:
        spider_spec = pspec.resource('spiders', spider)
        items_spec = pspec.resource('items')
        extractors = pspec.resource('extractors')
        return IblSpider(spider, spider_spec, items_spec, extractors,
                         **kwargs)
    except IOError as ex:
        if ex.errno == errno.ENOENT:
            log.msg("skipping extraction, no spec: %s" % ex.filename)
        else:
            raise
def load_project_data(storage):
    """Load project data from the given storage backend."""
    # Load items and extractors from project
    schemas = storage.open('items.json')
    extractors = storage.open('extractors.json')
    # Load spiders and templates
    spider_loader = SpiderLoader(storage)
    spiders = {}
    for spider_name in spider_loader.spider_names:
        spider = spider_loader[spider_name]
        crawler = IblSpider(spider_name, spider, schemas, extractors,
                            Settings())
        spiders[spider_name] = (crawler, spider)
    return schemas, extractors, spiders
def create_spider(self, project, auth_info, params, **kwargs):
    spider = params.get('spider')
    if spider is None:
        return None, None
    pspec = self.bot.spec_manager.project_spec(project, auth_info)
    try:
        spider_spec = pspec.spider_with_templates(spider)
        items_spec = pspec.resource('items')
        extractors = pspec.resource('extractors')
        return (IblSpider(spider, spider_spec, items_spec, extractors,
                          self.bot.runner.settings, **kwargs),
                spider_spec['templates'])
    except IOError as ex:
        if ex.errno == errno.ENOENT:
            log.msg("skipping extraction, no spec: %s" % ex.filename)
            return None, None
        else:
            raise
def open_spider(self, meta):
    if ('project' not in meta or 'spider' not in meta or
            (self.user.authorized_projects is not None and
             meta['project'] not in self.user.authorized_projects and
             not self.user.staff)):
        return {'error': 4004,
                'reason': 'Project "%s" not found' % meta['project']}
    spider_name = meta['spider']
    spec = self.spec_manager.project_spec(meta['project'], self.user.auth)
    spider = spec.spider_with_templates(spider_name)
    items = spec.resource('items')
    extractors = spec.resource('extractors')
    if not self.settings.get('SPLASH_URL'):
        self.settings.set('SPLASH_URL', 'portia')
    self.factory[self].spider = IblSpider(spider_name, spider, items,
                                          extractors, self.settings)
    self.factory[self].spiderspec = SpiderSpec(spider_name, spider, items,
                                               extractors)
def open_spider(self, meta, project=None):
    if not (meta.get('project') and meta.get('spider')):
        return {'error': 4005, 'reason': 'No project specified'}
    if (self.user.authorized_projects is not None and
            meta['project'] not in self.user.authorized_projects and
            not self.user.staff):
        return {
            'error': 4004,
            'reason': 'Project "%s" not found' % meta['project']
        }
    spider_name = meta['spider']
    if project is None:
        project = Project(self.storage, id=meta.get('project'))
    try:
        spider_model = project.spiders[spider_name]
    except IOError:
        return {
            'error': 4003,
            'reason': 'Spider "%s" not found' % spider_name
        }
    spider = spider_model.dump()
    spider['templates'] = []
    for sample in spider_model.samples:
        sample = sample.dump()
        # Guard against empty bodies so extraction has valid HTML to work on.
        for key in ('original_body', 'rendered_body'):
            if not (sample.get(key) or '').strip():
                sample[key] = u'<html></html>'
        spider['templates'].append(sample)
    items, extractors = project.schemas.dump(), project.extractors.dump()
    if not self.settings.get('SPLASH_URL'):
        self.settings.set('SPLASH_URL', 'portia')
    self.factory[self].spider = IblSpider(spider_name, spider, items,
                                          extractors, self.settings)
    self.factory[self].spiderspec = SpiderSpec(project, spider_name, spider,
                                               items, extractors)
def load_spider(spec, spider_name):
    spider = spec.spider_with_templates(spider_name)
    items = spec.resource('items')
    extractors = spec.resource('extractors')
    return IblSpider(spider_name, spider, items, extractors, Settings())
def load_spider(model):
    name, spider, items, extractors = load_spider_data(model)
    return IblSpider(name, spider, items, extractors, Settings())
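# Minimal sketch of the construction pattern shared by every snippet above:
# IblSpider(name, spec, item_schemas, extractors, settings). The spec fields
# are copied from test_trained; the import paths are assumed (slybot/Scrapy)
# and 'example' is a placeholder spider name, not from the original code.
from scrapy.settings import Settings
from slybot.spider import IblSpider

spec = {
    'start_urls': ['http://example.com/'],
    'links_to_follow': 'auto',
    'respect_nofollow': True,
    'follow_patterns': [],
    'exclude_patterns': [],
    'init_requests': [],
    'templates': [],  # annotated samples (templates) drive extraction
}
spider = IblSpider('example', spec, {}, {}, Settings())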