Example #1
0
    def open_spider(self, meta, storage=None, project=None):
        """Load the spider named in ``meta`` and attach it to this connection.

        ``meta`` must provide non-empty ``project`` and ``spider`` entries.
        When ``project`` is not supplied, a ``Project`` is built from
        ``storage`` and the project id found in ``meta``.

        Returns an error dict (``error`` and ``reason`` keys) when an entry
        is missing, when a non-staff user is not authorized for the project,
        or when looking the spider up raises ``IOError``/``KeyError``.
        Otherwise the built ``IblSpider`` and ``SpiderSpec`` are stored on
        ``self.factory[self]`` and nothing is returned.
        """
        project_id = meta.get('project')
        if not project_id or not meta.get('spider'):
            return {'error': 4005, 'reason': 'No project specified'}

        # An ``authorized_projects`` of None means no restriction applies.
        allowed = self.user.authorized_projects
        if allowed is not None and project_id not in allowed:
            if not self.user.staff:
                return {
                    'error': 4004,
                    'reason': 'Project "%s" not found' % project_id
                }

        spider_name = meta['spider']
        if project is None:
            project = Project(storage, id=project_id)

        try:
            spider_model = project.spiders[spider_name]
        except (IOError, KeyError):
            return {
                'error': 4004,
                'reason': 'Spider "%s" not found' % spider_name
            }

        loaded = load_spider_data(spider_model)
        spider_name, spider, items, extractors = loaded

        # Make sure a Splash URL is configured before building the spider.
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')

        ibl_spider = IblSpider(spider_name, spider, items, extractors,
                               self.settings)
        self.factory[self].spider = ibl_spider
        spec = SpiderSpec(project, spider_name, spider, items, extractors)
        self.factory[self].spiderspec = spec
Example #2
0
 def __init__(self, command, project, spider=None, sample=None):
     """Bind this checker to ``command``'s socket and open the spider.

     ``project`` is either a project object or a project id string. A
     string is turned into a ``Project`` on ``command.storage``; its name
     is looked up in the socket user's ``project_map`` and defaults to the
     id itself.

     When the socket has no spider, ``open_spider`` is attempted and a
     ``KeyError`` raised by it is ignored. When ``spider`` is given and
     the socket still has no spider, or its spiderspec names a different
     spider, ``open_spider`` is called again without suppressing errors.
     """
     self.command, self.socket = command, command.socket
     if isinstance(project, six.string_types):
         project_name = self.socket.user.project_map.get(project, project)
         project = Project(command.storage, id=project, name=project_name)
     self.project = project
     if not self.socket.spider:
         try:
             self.socket.open_spider(
                 {
                     'project': self.project.id,
                     'spider': spider
                 },
                 project=project)
         except KeyError:
             pass  # Ignore extraction as it is not fully set up yet
     self.spider = spider
     self.sample = sample
     if (self.spider and
         (not self.socket.spider or self.socket.spiderspec.name != spider)):
         # Pass the project id (as the first call does) rather than the
         # Project object: open_spider tests meta['project'] for membership
         # in the user's authorized projects and formats it into errors.
         self.socket.open_spider(
             {
                 'project': self.project.id,
                 'spider': self.spider
             },
             project=project)
Example #3
0
 def project(self):
     """Return the ``Project`` for the ``project_id`` entry of ``self.kwargs``.

     The project name is read from ``self.projects``. A ``KeyError`` raised
     while looking the id up or while building the project is reported as
     ``JsonApiNotFoundError``.
     """
     wanted_id = self.kwargs.get('project_id')
     try:
         project_name = self.projects[wanted_id]
         selected = Project(self.storage, id=wanted_id, name=project_name)
     except KeyError:
         raise JsonApiNotFoundError()
     return selected
Example #4
0
    def create(self, request):
        """Create a new project named by the request payload.

        The name is read from ``self.data['data']['attributes']['name']`` and
        stored as ``self.kwargs['project_id']``. Raises
        ``JsonApiBadRequestError`` when the name is missing, is rejected by
        the storage's filename check or contains a dot, or is already present
        in ``self.projects``. On success the storage is committed and the
        serialized project is returned with an HTTP 201 status.
        """
        try:
            name = self.data['data']['attributes']['name']
        except KeyError:
            raise JsonApiBadRequestError('No `name` provided')
        self.kwargs['project_id'] = name

        existing = self.projects
        name_is_valid = (self.storage.is_valid_filename(name)
                         and '.' not in name)
        if not name_is_valid:
            raise JsonApiBadRequestError(
                '"{}" is not a valid project name,\nProject names may only '
                'contain letters and numbers'.format(name))
        if name in existing:
            raise JsonApiBadRequestError(
                'A project with the name "{}" already exists'.format(name))

        # Bootstrap project
        storage = self.storage
        storage.commit()

        new_project = Project(storage, id=name, name=name)
        payload = self.get_serializer(new_project, storage=storage).data
        success_headers = self.get_success_headers(payload)
        return Response(payload, status=HTTP_201_CREATED,
                        headers=success_headers)
Example #5
0
def save_html(data, socket, item_checker=None):
    """Update the sample identified by ``data`` and return the result.

    ``data`` supplies the ``project``, ``spider`` and ``sample`` ids. An
    ``ItemChecker`` is built when none is passed in. A fresh ``Project`` on
    ``socket.storage`` replaces ``socket.spiderspec.project`` before the
    sample is looked up and handed to ``_update_sample``.
    """
    if item_checker is None:
        item_checker = ItemChecker(
            socket, data['project'], data['spider'], data['sample'])
    current_project = Project(socket.storage, id=data['project'])
    socket.spiderspec.project = current_project
    spider_samples = current_project.spiders[data['spider']].samples
    target_sample = spider_samples[data['sample']]
    return _update_sample(data, socket, target_sample)
Example #6
0
 def __init__(self, project, storage, from_project_id):
     """Remember the target project and open the source project.

     The source storage is created with the same class as ``storage`` and
     the same author; the source ``Project`` uses ``from_project_id`` as
     both its id and its name.
     """
     self.project = project
     self.storage = storage
     storage_cls = storage.__class__
     self.from_storage = storage_cls(from_project_id, author=storage.author)
     self.from_project = Project(
         self.from_storage, id=from_project_id, name=from_project_id)
     # Populating projects to avoid overwrites: the attribute reads below
     # are intentional and their values are discarded.
     for touched in (self.project, self.from_project):
         touched.schemas
         touched.extractors
Example #7
0
 def save_html(self, item_checker=None):
     """Update the sample named in ``self.data`` and report success.

     An ``ItemChecker`` is built when none is passed in. A fresh ``Project``
     on ``self.storage`` replaces ``self.socket.spiderspec.project``. An
     ``IOError`` or ``KeyError`` raised while fetching or updating the
     sample is ignored, so ``{'ok': True}`` is returned either way.
     """
     payload = self.data
     if item_checker is None:
         item_checker = ItemChecker(
             self, payload['project'], payload['spider'], payload['sample'])
     current_project = Project(self.storage, id=payload['project'])
     self.socket.spiderspec.project = current_project
     spider_samples = current_project.spiders[payload['spider']].samples
     try:
         target_sample = spider_samples[payload['sample']]
         self._update_sample(target_sample)
     except (IOError, KeyError):
         # Sample doesn't exist or may not exist yet
         pass
     return {'ok': True}
Example #8
0
 def save_html(self, item_checker=None):
     """Update the sample named in ``self.data`` and report success.

     An ``ItemChecker`` is built when none is passed in. A fresh ``Project``
     on ``self.storage`` replaces ``self.socket.spiderspec.project``. The
     sample lookup itself is not guarded; only an ``IOError`` raised by
     ``_update_sample`` is ignored before ``{'ok': True}`` is returned.
     """
     payload = self.data
     if item_checker is None:
         item_checker = ItemChecker(
             self, payload['project'], payload['spider'], payload['sample'])
     current_project = Project(self.storage, id=payload['project'])
     self.socket.spiderspec.project = current_project
     spider_samples = current_project.spiders[payload['spider']].samples
     target_sample = spider_samples[payload['sample']]
     try:
         self._update_sample(target_sample)
     except IOError:
         # Best effort: a failed update is deliberately not reported.
         pass
     return {'ok': True}
Example #9
0
 def __init__(self, socket, project, spider=None, sample=None):
     """Bind this checker to ``socket`` and make sure a spider is open.

     ``project`` is either a project object or a project id string. A
     string is turned into a ``Project`` on ``socket.storage``; its name
     is looked up in the socket user's ``project_map`` and defaults to the
     id itself.

     ``open_spider`` is called when the socket has no spider, and again
     when ``spider`` is given and the socket still has no spider or its
     spiderspec names a different spider.
     """
     self.socket = socket
     if isinstance(project, six.string_types):
         project_name = socket.user.project_map.get(project, project)
         project = Project(socket.storage, id=project, name=project_name)
     self.project = project
     if not socket.spider:
         # ``project`` is passed by keyword so it cannot be bound to some
         # other optional positional parameter of open_spider.
         socket.open_spider({'project': self.project.id, 'spider': spider},
                            project=project)
     self.spider = spider
     self.sample = sample
     if (self.spider and (not self.socket.spider or
                          self.socket.spiderspec.name != spider)):
         # Pass the project id (as the first call does) rather than the
         # Project object: open_spider tests meta['project'] for membership
         # in the user's authorized projects and formats it into errors.
         self.socket.open_spider({'project': self.project.id,
                                  'spider': self.spider},
                                 project=project)
Example #10
0
    def __init__(self, project, storage, from_project_id):
        """Remember the target project and open the source project.

        The source storage is created with the same class as ``storage`` and
        the same author; the source ``Project`` uses ``from_project_id`` as
        both its id and its name. The ids of the target project's spiders
        are collected, and empty maps are set up for copied fields, schemas
        and extractors.
        """
        self.project = project
        self.storage = storage
        storage_cls = storage.__class__
        self.from_storage = storage_cls(from_project_id,
                                        author=storage.author)
        self.from_project = Project(
            self.from_storage, id=from_project_id, name=from_project_id)
        # Populating projects to avoid overwrites: the attribute reads below
        # are intentional and their values are discarded.
        for touched in (self.project, self.from_project):
            touched.schemas
            touched.extractors
        self.spider_ids = {existing.id for existing in self.project.spiders}

        self.copied_fields = {}
        self.copied_schemas = {}
        self.copied_extractors = {}
Example #11
0
    def open_spider(self, meta, project=None):
        """Load the spider named in ``meta`` and attach it to this connection.

        ``meta`` must provide non-empty ``project`` and ``spider`` entries.
        When ``project`` is not supplied, a ``Project`` is built from
        ``self.storage`` and the project id found in ``meta``.

        Returns an error dict (``error`` and ``reason`` keys) when an entry
        is missing, when a non-staff user is not authorized for the project,
        or when looking the spider up raises ``IOError``. Otherwise the built
        ``IblSpider`` and ``SpiderSpec`` are stored on ``self.factory[self]``
        and nothing is returned.
        """
        project_id = meta.get('project')
        if not project_id or not meta.get('spider'):
            return {'error': 4005, 'reason': 'No project specified'}

        # An ``authorized_projects`` of None means no restriction applies.
        allowed = self.user.authorized_projects
        if allowed is not None and project_id not in allowed:
            if not self.user.staff:
                return {
                    'error': 4004,
                    'reason': 'Project "%s" not found' % project_id
                }

        spider_name = meta['spider']
        if project is None:
            project = Project(self.storage, id=project_id)

        try:
            spider_model = project.spiders[spider_name]
        except IOError:
            return {
                'error': 4003,
                'reason': 'Spider "%s" not found' % spider_name
            }

        spider = spider_model.dump()
        templates = spider['templates'] = []
        for sample_model in spider_model.samples:
            dumped = sample_model.dump()
            # Blank or missing bodies are replaced by an empty HTML document.
            for body_key in ('original_body', 'rendered_body'):
                body = dumped.get(body_key) or ''
                if not body.strip():
                    dumped[body_key] = u'<html></html>'
            templates.append(dumped)

        items = project.schemas.dump()
        extractors = project.extractors.dump()

        # Make sure a Splash URL is configured before building the spider.
        if not self.settings.get('SPLASH_URL'):
            self.settings.set('SPLASH_URL', 'portia')

        ibl_spider = IblSpider(spider_name, spider, items, extractors,
                               self.settings)
        self.factory[self].spider = ibl_spider
        spec = SpiderSpec(project, spider_name, spider, items, extractors)
        self.factory[self].spiderspec = spec
Example #12
0
 def get_collection(self):
     """Return a ``Project`` collection covering every known project.

     Each ``(id, name)`` pair in ``self.projects`` becomes a ``Project``
     sharing one ``self.FakeStorage()`` instance. The projects are handed
     to ``Project.collection`` as a lazy generator.
     """
     shared_storage = self.FakeStorage()
     return Project.collection(
         Project(shared_storage, id=ident, name=label)
         for ident, label in iteritems(self.projects)
     )
Example #13
0
 def get_collection(self):
     """Return a ``Project`` collection covering every known project.

     Each ``(id, name)`` pair in ``self.projects`` becomes a ``Project``
     sharing one ``self.FakeStorage()`` instance. The projects are handed
     to ``Project.collection`` as a lazy generator.
     """
     shared_storage = self.FakeStorage()
     return Project.collection(
         Project(shared_storage, id=ident, name=label)
         for ident, label in iteritems(self.projects)
     )