def dump_templates(self, data):
    """Split inline item templates out of a spider into sample files.

    When the spider carries no inline templates, fall back to listing
    the sample files already present in storage. Returns the (mutated)
    ``data`` dict with a ``samples`` list of sample ids.
    """
    storage = self.context['storage']
    if not data.get('templates'):
        # No inline templates: derive sample names from files on disk.
        spider_dir = '/'.join(
            strip_json(self.context['path']).split('/')[:2])
        try:
            listed = storage.listdir(spider_dir)[1]
        except OSError:
            # Directory does not exist
            data['samples'] = []
        else:
            # OrderedDict keyed by name acts as an ordered de-dup set.
            data['samples'] = list(
                OrderedDict((strip_json(fname), 1) for fname in listed))
        return data

    sample_ids = []
    base = strip_json(self.context['path']).strip('/')
    for template in data['templates']:
        # Only migrate item templates
        if template.get('page_type') != 'item':
            continue
        template['id'] = template.get('page_id') or template.get('name')
        sample_ids.append(template['id'])
        sample_path = '/'.join((base, '{}.json'.format(template['id'])))
        serialized = json.dumps(template, sort_keys=True, indent=4)
        storage.save(sample_path, ContentFile(serialized, sample_path))
    data['samples'] = sample_ids

    spider_path = self.context['path']
    spider = json.dumps(data, indent=4, sort_keys=True)
    storage.save(spider_path, ContentFile(spider, spider_path))
    return data
def _update_sample(self, sample=None, project=None, data=None):
    """Recompile sample with latest annotations.

    Loads the sample from ``data``/``project`` when one is not supplied,
    then ensures both ``original_body`` and ``rendered_body`` are set on
    it — read from storage when the .html file exists, otherwise
    regenerated from the live browser tab and persisted.

    :param sample: sample object/dict; looked up when ``None``.
    :param project: project used for the lookup (only when ``sample`` is
        ``None``).
    :param data: payload carrying spider/sample ids (only when ``sample``
        is ``None``).
    :returns: the sample as a plain dict with both body fields populated.
    :raises IOError: when a body file is missing and ``self.tab`` is not
        available to regenerate it.
    """
    if sample is None:
        sample = self._load_sample(data, project)
        path = 'spiders/{}/templates/{}/{{}}.html'.format(
            self.data['spider'], self.data['sample'])
    else:
        path = _html_path(sample)
    # Normalize model objects into plain dicts.
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    html_path = path.format
    for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
        try:
            path = html_path(name)
            html = decode(self.storage.open(path).read())
        except IOError:
            # Body file missing: regenerate from the live tab if we have
            # one; otherwise propagate the original IOError unchanged.
            if not self.tab:
                six.reraise(*sys.exc_info())
            html = decoded_html(self.tab, type_)
            if html:
                self.storage.save(path, ContentFile(html, path))
            else:
                # Tab produced nothing; fall back to an empty document.
                html = '<html></html>'
        sample[name] = html
    return sample
def open_with_default(name, default=None, *args, **kwargs):
    """Open ``name`` via ``open_``, substituting ``default`` when missing.

    :param name: file name passed through to ``open_``.
    :param default: value serialized to JSON and wrapped in a
        ``ContentFile`` when the file does not exist.
    :returns: the opened file, or a ``ContentFile`` of the default.
    :raises IOError: re-raised unchanged for any error other than ENOENT.
    """
    try:
        return open_(name, *args, **kwargs)
    except IOError as error:
        if error.errno == errno.ENOENT:
            # Missing file: behave as if it contained the default value.
            return ContentFile(json.dumps(default), name)
        # Bare raise preserves the original traceback (``raise error``
        # would reset it on Python 2).
        raise
def _commit_delete(self, collector, saved_paths=None, deleted_paths=None):
    """Apply a delete collector: re-save owned parents, delete the rest.

    :param collector: exposes ``save`` (model -> fields to re-stage) and
        ``delete`` (models whose storage files are removed).
    :param saved_paths: paths already written during this commit; shared
        to avoid duplicate writes.
    :param deleted_paths: paths already removed during this commit.
    """
    if saved_paths is None:
        saved_paths = set()
    if deleted_paths is None:
        deleted_paths = set()
    # Re-stage the fields of models that must be re-saved.
    for model, fields in iteritems(collector.save):
        model.resolve_attributes(snapshots=('committed',))
        model._stage_changes(fields)
    for model in collector.delete:
        path = model.storage_path(model, snapshots=('committed',))
        if model.opts.owner:
            # Owned models live inside their owner's file: rewrite the
            # owner's dump instead of deleting a file.
            if path and path not in saved_paths and path not in deleted_paths:
                to_save = self._get_object_to_dump(
                    model, parent_snapshots=('committed',))
                model.storage.save(path, ContentFile(
                    to_save.dumps(state='staged'), path))
                saved_paths.add(path)
        else:
            if path not in deleted_paths:
                try:
                    model.storage.delete(path)
                except IOError as ex:
                    # Assume missing files are already deleted — matches
                    # the ENOENT handling used by _commit_changes.
                    if ex.errno != errno.ENOENT:
                        raise
                deleted_paths.add(path)
    # Commit the re-staged saves now that deletions are resolved.
    for model, fields in iteritems(collector.save):
        model._commit_changes(saved_paths, deleted_paths)
    # Reset snapshots of deleted models back to a clean working state.
    for model in collector.delete:
        store = model.data_store
        store.update_snapshot('working', ('working', 'staged', 'committed'))
        store.clear_snapshot('staged')
        store.clear_snapshot('committed')
def dump_templates_to_file(self, data):
    """Move inline item templates out of ``data`` into per-sample files.

    Each item template is written as ``<spider-dir>/<id>.json``; the
    spider itself is re-saved with a ``template_names`` list replacing
    the inline ``templates``. Already-migrated data is returned as-is.
    """
    if 'template_names' in data or 'templates' not in data:
        return data
    template_names = []
    for template in data['templates']:
        # Only migrate item templates
        if template.get('page_type') != 'item':
            continue
        template['id'] = template.get('page_id') or template.get('name')
        template_names.append(template['id'])
        path = self.context['path']
        # strip_json removes the extension safely instead of blindly
        # slicing off the last 5 characters (consistent with
        # dump_templates / dump_actions).
        path = '/'.join((strip_json(path).strip('/'),
                         '{}.json'.format(template['id'])))
        sample = json.dumps(template, sort_keys=True, indent=4)
        self.context['storage'].save(path, ContentFile(sample, path))
    data['template_names'] = template_names
    del data['templates']
    path, storage = self.context['path'], self.context['storage']
    spider = json.dumps(data, indent=4, sort_keys=True)
    storage.save(path, ContentFile(spider, path))
    return data
def create_project(self, name):
    """Create the standard on-disk layout for a new project ``name``."""
    self.validate_project_name(name)
    root = self.project_filename(name)
    # (relative path, rendered content) pairs for every generated file.
    manifest = (
        ('project.json', templates['PROJECT']),
        ('scrapy.cfg', templates['SCRAPY']),
        ('setup.py', templates['SETUP'] % str(name)),
        ('items.json', templates['ITEMS']),
        (join('spiders', '__init__.py'), ''),
        (join('spiders', 'settings.py'), templates['SETTINGS']),
    )
    for relative, content in manifest:
        target = join(root, relative)
        self.storage.save(target, ContentFile(content, target))
def dump_actions(self, data):
    """Split inline spider actions out into per-action JSON files.

    When the spider carries no inline actions, fall back to listing the
    action files already present in storage. Returns the (mutated)
    ``data`` dict with ``actions`` holding the action ids.
    """
    storage = self.context['storage']
    if not data.get('actions'):
        # No inline actions: derive action names from files on disk.
        actions_dir = '/'.join(
            strip_json(self.context['path']).split('/')[:2]) + '/actions'
        try:
            listed = storage.listdir(actions_dir)[1]
        except OSError:
            # Directory does not exist
            data['actions'] = []
        else:
            data['actions'] = list(
                OrderedDict((strip_json(fname), 1) for fname in listed))
        return data

    action_ids = []
    base = strip_json(self.context['path'] + '/actions').strip('/')
    for action in data['actions']:
        action['id'] = action.get('id') or action.get('name')
        action_ids.append(action['id'])
        file_path = '/'.join((base, '{}.json'.format(action['id'])))
        serialized = json.dumps(action, sort_keys=True, indent=4)
        storage.save(file_path, ContentFile(serialized, file_path))
    data['actions'] = action_ids

    spider_path = self.context['path']
    spider = json.dumps(data, indent=4, sort_keys=True)
    storage.save(spider_path, ContentFile(spider, spider_path))
    return data
def _migrate_html(self, sample):
    """Extract the ``*_body`` HTML fields of ``sample`` into .html files.

    ``original_body`` is back-filled from a cleaned ``annotated_body``
    when absent; every non-empty body field except ``annotated_body`` is
    then written to ``<sample-dir>/<field>.html`` if not already stored.
    Returns the (possibly mutated) sample.
    """
    base_path = strip_json(self.context['path']).strip('/')
    # Clean and use annotated body if there is no original body present.
    # BUG FIX: the condition previously tested the key 'annotation_body',
    # which is never the key read below ('annotated_body'), so the
    # back-fill either never ran or raised KeyError.
    if 'annotated_body' in sample and not sample.get('original_body'):
        sample['original_body'] = self._clean(sample['annotated_body'])
    storage = self.context['storage']
    for key, value in sample.items():
        if (not value or not key.endswith('_body') or
                key == 'annotated_body'):
            continue
        path = '/'.join((base_path, '{}.html'.format(key)))
        html = value.encode('utf-8')
        # NOTE(review): after .encode() above, ``html`` is bytes, so this
        # text_type check can never be true — presumably leftover from a
        # py2/py3 transition; kept to avoid any behavior change. TODO:
        # confirm and remove.
        if hasattr(html, 'encode') and isinstance(html, six.text_type):
            html = encode(html).decode('utf-8')
        if not storage.exists(path):
            storage.save(path, ContentFile(html, path))
    return sample
def _commit_changes(self, saved_paths=None, deleted_paths=None):
    """Flush staged changes for this model and its staged references.

    First pass writes every dirty (or moved) model to storage and
    removes files left behind by path changes; second pass promotes the
    'staged' snapshot into 'committed' and clears matching working data.

    :param saved_paths: paths already written during this commit; shared
        across nested commits to avoid duplicate writes.
    :param deleted_paths: paths already removed during this commit.
    """
    if saved_paths is None:
        saved_paths = set()
    if deleted_paths is None:
        deleted_paths = set()
    # First pass: persist dirty or moved models.
    for model in chain([self], (model for model, _ in
                                self._staged_model_references())):
        store = model.data_store
        # Dirty when a file-backed field is staged, or 'project' differs
        # between the working and committed snapshots.
        dirty = (model._file_fields.intersection(iterkeys(store['staged']))
                 or 'project' in store.dirty_fields(
                     'working', ('committed', )))
        # Current path (staged first) vs. previous path (committed first)
        # — a mismatch means the model moved on disk.
        path = model.storage_path(model, snapshots=('staged', 'committed'))
        old_path = model.storage_path(model,
                                      snapshots=('committed', 'staged'))
        if dirty or old_path != path:
            if path not in saved_paths and path not in deleted_paths:
                to_save = self._get_object_to_dump(
                    model, parent_snapshots=('staged', 'committed'))
                model.storage.save(
                    path, ContentFile(to_save.dumps(state='staged'), path))
                saved_paths.add(path)
            if old_path != path and old_path not in deleted_paths:
                try:
                    model.storage.delete(old_path)
                except IOError as ex:
                    # Assume missing files are already deleted
                    if ex.errno != errno.ENOENT:
                        six.reraise(*sys.exc_info())
                deleted_paths.add(old_path)
    # Second pass: fold staged data into the committed snapshot.
    for model in chain([self], (model for model, _ in
                                self._staged_model_references())):
        store = model.data_store
        dirty = set(iterkeys(store['staged']))
        if dirty:
            store.update_snapshot('committed', ('staged', ), fields=dirty)
            store.clear_snapshot('staged')
            # Only clear working fields that were actually committed.
            store.clear_snapshot('working', fields=dirty.intersection(
                iterkeys(store['working'])))
def _commit_delete(self, collector, saved_paths=None, deleted_paths=None):
    """Apply a delete collector: re-save owned parents, delete the rest.

    :param collector: exposes ``save`` (model -> fields to re-stage) and
        ``delete`` (models whose storage files are removed).
    :param saved_paths: paths already written during this commit; shared
        across nested commits to avoid duplicate writes.
    :param deleted_paths: paths already removed during this commit.
    """
    if saved_paths is None:
        saved_paths = set()
    if deleted_paths is None:
        deleted_paths = set()
    # Re-stage the fields of models that must be re-saved.
    for model, fields in iteritems(collector.save):
        model.resolve_attributes(snapshots=('committed', ))
        model._stage_changes(fields)
    for model in collector.delete:
        path = model.storage_path(
            model, snapshots=('committed', 'staged', 'working'))
        if model.opts.owner:
            # Owned models live inside their owner's file: rewrite the
            # owner's dump instead of deleting a file.
            if path and path not in saved_paths and path not in deleted_paths:
                to_save = self._get_object_to_dump(
                    model, parent_snapshots=('committed', ))
                model.storage.save(
                    path, ContentFile(to_save.dumps(state='staged'), path))
                saved_paths.add(path)
        else:
            if path not in deleted_paths:
                try:
                    model.storage.delete(path)
                except IOError as ex:
                    # Assume missing files are already deleted
                    if ex.errno != errno.ENOENT:
                        six.reraise(*sys.exc_info())
                deleted_paths.add(path)
    # Commit the re-staged saves now that deletions are resolved.
    for model, fields in iteritems(collector.save):
        model._commit_changes(saved_paths, deleted_paths)
    # Reset snapshots of deleted models back to a clean working state.
    for model in collector.delete:
        store = model.data_store
        store.update_snapshot('working', ('working', 'staged', 'committed'))
        store.clear_snapshot('staged')
        store.clear_snapshot('committed')
def _update_sample(data, socket, sample=None, project=None):
    """Recompile a sample so its stored bodies match the latest annotations.

    Looks the sample up via the socket's project when one is not given,
    then fills ``original_body``/``rendered_body`` from storage, falling
    back to the live tab (and persisting the result) when missing.
    """
    if sample is None:
        owner = project or socket.spiderspec.project
        spider = owner.spiders[data['spider']]
        sample = spider.samples[data['sample']]
        template = 'spiders/{}/{}/{{}}.html'.format(
            data['spider'], data['sample'])
    else:
        template = _html_path(sample)
    # Normalize model objects into plain dicts.
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    for field, body_type in (('original_body', 'raw'),
                             ('rendered_body', None)):
        target = template.format(field)
        try:
            body = decode(socket.storage.open(target).read())
        except IOError:
            # Body file missing: regenerate from the live tab and persist.
            body = decoded_html(socket.tab, body_type)
            socket.storage.save(target, ContentFile(body, target))
        sample[field] = body
    return sample
def save_file(self, name, file_path, file_contents):
    """Write ``file_contents`` as pretty-printed JSON to ``file_path``
    inside the repository called ``name``."""
    self._open_repo(name)
    serialized = json.dumps(file_contents, sort_keys=True, indent=4)
    self.storage.save(file_path, ContentFile(serialized, file_path))
def save_raw(serializer, data):
    """Persist ``data`` — minus any ``*_body`` fields, which are stored
    separately as HTML files — as JSON via the serializer's context."""
    context = serializer.context
    stripped = {key: value for key, value in data.items()
                if not key.endswith('_body')}
    serialized = json.dumps(stripped, indent=4, sort_keys=True)
    context['storage'].save(context['path'],
                            ContentFile(serialized, context['path']))
def open_(name, *args, **kwargs):
    """Return the in-memory file registered under ``name``.

    Raises an ENOENT-style ``IOError`` when no such entry exists,
    mirroring the real ``open`` for missing files.
    """
    if name not in files:
        raise IOError(2, 'No file or directory', name)
    return ContentFile(files[name], name)
def savejson(self, obj, resources):
    """Serialize ``obj`` for ``resources`` as deterministic JSON."""
    # convert to json in a way that will make sense in diffs
    filename = self._rfilename(*resources)
    serialized = json.dumps(obj, sort_keys=True, indent=4)
    self.storage.save(filename, ContentFile(serialized, filename))