def open_spider(self, meta, storage=None, project=None):
    """Load the spider described by ``meta`` and attach it to this socket.

    ``meta`` must carry truthy ``'project'`` and ``'spider'`` entries.  When
    ``project`` is not supplied, one is built from ``storage`` and the
    project id found in ``meta``.

    Returns an error dict (``{'error': ..., 'reason': ...}``) when the
    request is incomplete, the user may not access the project, or the
    spider cannot be looked up.  On success nothing is returned and
    ``self.factory[self]`` receives ``spider`` and ``spiderspec``.
    """
    project_id = meta.get('project')
    if not project_id or not meta.get('spider'):
        return {'error': 4005, 'reason': 'No project specified'}

    user = self.user
    allowed = user.authorized_projects
    if allowed is not None and project_id not in allowed and not user.staff:
        # Unauthorized projects are reported as missing.
        return {
            'error': 4004,
            'reason': 'Project "%s" not found' % project_id
        }

    spider_name = meta['spider']
    if project is None:
        project = Project(storage, id=project_id)

    try:
        spider_model = project.spiders[spider_name]
    except (IOError, KeyError):
        return {
            'error': 4004,
            'reason': 'Spider "%s" not found' % spider_name
        }

    spider_name, spider, items, extractors = load_spider_data(spider_model)

    settings = self.settings
    if not settings.get('SPLASH_URL'):
        # Fall back to a default when no Splash URL has been configured.
        settings.set('SPLASH_URL', 'portia')

    ibl_spider = IblSpider(spider_name, spider, items, extractors, settings)
    self.factory[self].spider = ibl_spider
    self.factory[self].spiderspec = SpiderSpec(
        project, spider_name, spider, items, extractors)
def __init__(self, command, project, spider=None, sample=None):
    """Bind the checker to ``command`` and make sure a spider is open.

    ``project`` may be a project id (string) or an already built project
    object.  A string is resolved to a ``Project`` using the command's
    storage, with its display name taken from the socket user's
    ``project_map`` (falling back to the id itself).

    If the socket has no spider yet, ``spider`` is opened immediately; a
    ``KeyError`` from that first attempt is deliberately ignored.  When a
    spider name was given and the socket still has no spider, or has a
    different one, the spider is (re)opened.
    """
    self.command, self.socket = command, command.socket
    if isinstance(project, six.string_types):
        project_name = self.socket.user.project_map.get(project, project)
        project = Project(command.storage, id=project, name=project_name)
    self.project = project

    if not self.socket.spider:
        try:
            self.socket.open_spider(
                {'project': self.project.id, 'spider': spider},
                project=project)
        except KeyError:
            pass  # Ignore extraction as it is not fully set up yet

    self.spider = spider
    self.sample = sample

    needs_open = self.spider and (
        not self.socket.spider or
        self.socket.spiderspec.name != spider)
    if needs_open:
        # Send the project *id*, as in the first call: open_spider checks
        # meta['project'] against the user's authorized project ids and
        # formats it into its error message.
        self.socket.open_spider(
            {'project': self.project.id, 'spider': self.spider},
            project=project)
def project(self):
    """Return the ``Project`` named by ``self.kwargs['project_id']``.

    The display name is looked up in ``self.projects``.  Any ``KeyError``
    raised while resolving or building the project is reported as
    ``JsonApiNotFoundError``.
    """
    wanted_id = self.kwargs.get('project_id')
    try:
        display_name = self.projects[wanted_id]
        found = Project(self.storage, id=wanted_id, name=display_name)
    except KeyError:
        raise JsonApiNotFoundError()
    else:
        return found
def create(self, request):
    """Create a new project from the provided attributes.

    The project name is read from ``self.data['data']['attributes']['name']``
    and is used as both the id and the display name of the new project.

    Raises ``JsonApiBadRequestError`` when the name is missing, is not a
    valid storage filename, contains a ``.``, or is already present in
    ``self.projects``.

    Returns a ``Response`` with the serialized project and status
    ``HTTP_201_CREATED``.
    """
    try:
        name = self.data['data']['attributes']['name']
    except (KeyError, TypeError):
        # TypeError: `data` or `attributes` is not a mapping (e.g. null or
        # a list).  That is still a malformed request, not a server error.
        raise JsonApiBadRequestError('No `name` provided')

    self.kwargs['project_id'] = name
    projects = self.projects
    if not self.storage.is_valid_filename(name) or '.' in name:
        raise JsonApiBadRequestError(
            '"{}" is not a valid project name,\nProject names may only '
            'contain letters and numbers'.format(name))
    if name in projects:
        raise JsonApiBadRequestError(
            'A project with the name "{}" already exists'.format(name))

    # Bootstrap project
    storage = self.storage
    storage.commit()
    project = Project(storage, id=name, name=name)

    serializer = self.get_serializer(project, storage=storage)
    data = serializer.data
    headers = self.get_success_headers(data)
    return Response(data, status=HTTP_201_CREATED, headers=headers)
def save_html(data, socket, item_checker=None):
    """Update the sample identified by ``data`` and return the result.

    ``data`` must provide ``'project'``, ``'spider'`` and ``'sample'``.
    A fresh ``Project`` is built from ``socket.storage`` and stored on
    ``socket.spiderspec.project`` before the sample is looked up.

    Returns whatever ``_update_sample`` returns.
    """
    if item_checker is None:
        # The checker is not used again in this function; it is presumably
        # built for what its constructor does -- confirm before removing.
        item_checker = ItemChecker(
            socket, data['project'], data['spider'], data['sample'])

    current_project = Project(socket.storage, id=data['project'])
    socket.spiderspec.project = current_project

    spider_samples = current_project.spiders[data['spider']].samples
    target_sample = spider_samples[data['sample']]
    return _update_sample(data, socket, target_sample)
def __init__(self, project, storage, from_project_id):
    """Set up the destination project and the project being copied from.

    The source project is opened through a new storage object of the same
    class as ``storage``, created for ``from_project_id`` with the same
    author.
    """
    self.project = project
    self.storage = storage

    source_storage = storage.__class__(from_project_id,
                                       author=storage.author)
    self.from_storage = source_storage
    self.from_project = Project(source_storage, id=from_project_id,
                                name=from_project_id)

    # Populating projects to avoid overwrites
    for loaded in (self.project, self.from_project):
        for collection in ('schemas', 'extractors'):
            getattr(loaded, collection)
def save_html(self, item_checker=None):
    """Update the sample named in ``self.data``; always report success.

    A fresh ``Project`` is built from ``self.storage`` and stored on
    ``self.socket.spiderspec.project``.  ``IOError`` and ``KeyError`` raised
    while fetching or updating the sample are ignored.

    Returns ``{'ok': True}``.
    """
    payload = self.data
    if item_checker is None:
        # Not used again in this method; presumably built for what its
        # constructor does -- confirm before removing.
        item_checker = ItemChecker(
            self, payload['project'], payload['spider'], payload['sample'])

    current_project = Project(self.storage, id=payload['project'])
    self.socket.spiderspec.project = current_project
    spider_samples = current_project.spiders[payload['spider']].samples

    try:
        self._update_sample(spider_samples[payload['sample']])
    except (IOError, KeyError):
        pass  # Sample doesn't exist or may not exist yet
    return {'ok': True}
def save_html(self, item_checker=None):
    """Update the sample named in ``self.data`` and report success.

    A fresh ``Project`` is built from ``self.storage`` and stored on
    ``self.socket.spiderspec.project``.  The sample lookup itself is not
    guarded; only an ``IOError`` raised by ``_update_sample`` is ignored.

    Returns ``{'ok': True}``.
    """
    payload = self.data
    if item_checker is None:
        # Not used again in this method; presumably built for what its
        # constructor does -- confirm before removing.
        item_checker = ItemChecker(
            self, payload['project'], payload['spider'], payload['sample'])

    current_project = Project(self.storage, id=payload['project'])
    self.socket.spiderspec.project = current_project

    spider_samples = current_project.spiders[payload['spider']].samples
    target_sample = spider_samples[payload['sample']]
    try:
        self._update_sample(target_sample)
    except IOError:
        pass
    return {'ok': True}
def __init__(self, socket, project, spider=None, sample=None):
    """Bind the checker to ``socket`` and make sure a spider is open.

    ``project`` may be a project id (string) or an already built project
    object.  A string is resolved to a ``Project`` using the socket's
    storage, with its display name taken from the socket user's
    ``project_map`` (falling back to the id itself).

    If the socket has no spider yet, ``spider`` is opened immediately.
    When a spider name was given and the socket still has no spider, or has
    a different one, the spider is (re)opened.
    """
    self.socket = socket
    if isinstance(project, six.string_types):
        project_name = socket.user.project_map.get(project, project)
        project = Project(socket.storage, id=project, name=project_name)
    self.project = project

    # `project` is passed by keyword so it cannot be bound to another
    # positional parameter of open_spider.
    if not socket.spider:
        socket.open_spider(
            {'project': self.project.id, 'spider': spider},
            project=project)

    self.spider = spider
    self.sample = sample

    needs_open = self.spider and (
        not self.socket.spider or
        self.socket.spiderspec.name != spider)
    if needs_open:
        # Send the project *id*, as in the first call: open_spider checks
        # meta['project'] against the user's authorized project ids and
        # formats it into its error message.
        self.socket.open_spider(
            {'project': self.project.id, 'spider': self.spider},
            project=project)
def __init__(self, project, storage, from_project_id):
    """Set up the destination project, the source project and copy state.

    The source project is opened through a new storage object of the same
    class as ``storage``, created for ``from_project_id`` with the same
    author.  The ids of the spiders already in the destination project are
    recorded, and the ``copied_*`` mappings start out empty.
    """
    self.project = project
    self.storage = storage

    source_storage = storage.__class__(from_project_id,
                                       author=storage.author)
    self.from_storage = source_storage
    self.from_project = Project(source_storage, id=from_project_id,
                                name=from_project_id)

    # Populating projects to avoid overwrites
    for loaded in (self.project, self.from_project):
        for collection in ('schemas', 'extractors'):
            getattr(loaded, collection)

    self.spider_ids = {existing.id for existing in self.project.spiders}
    self.copied_fields = {}
    self.copied_schemas = {}
    self.copied_extractors = {}
def open_spider(self, meta, project=None):
    """Load the spider described by ``meta`` and attach it to this socket.

    ``meta`` must carry truthy ``'project'`` and ``'spider'`` entries.  When
    ``project`` is not supplied, one is built from ``self.storage`` using
    ``meta['project']`` as its id.

    Returns an error dict (``{'error': ..., 'reason': ...}``) when the
    request is incomplete, the user may not access the project, or looking
    up the spider raises ``IOError``.  On success nothing is returned and
    ``self.factory[self]`` receives ``spider`` and ``spiderspec``.
    """
    project_id = meta.get('project')
    if not project_id or not meta.get('spider'):
        return {'error': 4005, 'reason': 'No project specified'}

    user = self.user
    allowed = user.authorized_projects
    if allowed is not None and project_id not in allowed and not user.staff:
        # Unauthorized projects are reported as missing.
        return {
            'error': 4004,
            'reason': 'Project "%s" not found' % project_id
        }

    spider_name = meta['spider']
    if project is None:
        project = Project(self.storage, id=project_id)

    try:
        spider_model = project.spiders[spider_name]
    except IOError:
        return {
            'error': 4003,
            'reason': 'Spider "%s" not found' % spider_name
        }

    spider = spider_model.dump()
    templates = []
    spider['templates'] = templates
    for sample_model in spider_model.samples:
        template = sample_model.dump()
        # Blank or missing bodies are replaced with an empty HTML document.
        for body_key in ('original_body', 'rendered_body'):
            if not (template.get(body_key) or '').strip():
                template[body_key] = u'<html></html>'
        templates.append(template)

    items = project.schemas.dump()
    extractors = project.extractors.dump()

    settings = self.settings
    if not settings.get('SPLASH_URL'):
        # Fall back to a default when no Splash URL has been configured.
        settings.set('SPLASH_URL', 'portia')

    ibl_spider = IblSpider(spider_name, spider, items, extractors, settings)
    self.factory[self].spider = ibl_spider
    self.factory[self].spiderspec = SpiderSpec(
        project, spider_name, spider, items, extractors)
def get_collection(self):
    """Return a ``Project.collection`` covering every known project.

    Each ``(id, name)`` pair in ``self.projects`` becomes a ``Project``;
    all of them share a single ``self.FakeStorage()`` instance.
    """
    shared_storage = self.FakeStorage()
    lazy_projects = (
        Project(shared_storage, id=pid, name=display_name)
        for pid, display_name in iteritems(self.projects)
    )
    return Project.collection(lazy_projects)