def end_response(self, request, original_url, referer, connection_status,
                 tabid, reply):
    if connection_status["finished"]:
        return
    if hasattr(reply, 'readAll'):
        # QNetworkReply path: read the buffered body and HTTP status.
        content = bytes(reply.readAll())
        status_code = to_py(
            reply.attribute(QNetworkRequest.HttpStatusCodeAttribute))
        if status_code == 400:
            return self._load_resource(request, original_url, referer)
        request.setResponseCode(status_code or 500)
    else:
        # requests-style reply: stream the body in chunks.
        content = b''.join(chunk for chunk in reply.iter_content(65535))
        request.setResponseCode(reply.status_code)
    headers = {
        b'cache-control': b'private',
        b'pragma': b'no-cache',
        b'content-type': b'application/octet-stream',
    }
    for header in (b'content-type', b'cache-control', b'pragma',
                   b'vary', b'max-age'):
        if hasattr(reply, 'hasRawHeader') and reply.hasRawHeader(header):
            headers[header] = bytes(reply.rawHeader(header))
        elif hasattr(reply, 'headers') and header in reply.headers:
            headers[header] = bytes(reply.headers.get(header))
        if header in headers:
            request.setHeader(header, headers[header])
    if bytes(headers[b'content-type']).strip().startswith(b'text/css'):
        # Rewrite url()/@import references so they go through the proxy.
        content = encode(process_css(content, tabid, original_url))
    request.write(content)
    request.finish()
def _update_sample(self, sample=None, project=None, data=None):
    """Recompile sample with latest annotations"""
    if sample is None:
        sample = self._load_sample(data, project)
        path = 'spiders/{}/{}/{{}}.html'.format(
            self.data['spider'], self.data['sample'])
    else:
        path = _html_path(sample)
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    html_path = path.format
    for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
        try:
            # Prefer the copy already persisted in storage.
            path = html_path(name)
            html = decode(self.storage.open(path).read())
            assert html
        except (AssertionError, IOError):
            if not self.tab:
                six.reraise(*sys.exc_info())
            # Fall back to the live tab and persist whatever it returns.
            html = None
            if type_ == 'raw':
                html = self.tab._raw_html
            if not html:
                html = self.tab.html()
            if html:
                self.storage.save(path, ContentFile(encode(html), path))
                html = decode(html)
            else:
                html = '<html></html>'
        sample[name] = decode(html)
    return sample
def process_css(css_source, tabid, base_uri):
    """ Wraps urls in css source.

    >>> url = 'http://scrapinghub.com/style.css'
    >>> process_css('@import "{}"'.format(url), 0, url) # doctest: +ELLIPSIS
    '@import "/proxy?..."'
    """
    def _absolutize_css_import(match):
        return '@import "{}"'.format(
            wrap_url(match.group(1), tabid, base_uri).replace('"', '%22'))

    def _absolutize_css_url(match):
        url = match.group(1).strip("\"'")
        return 'url("{}")'.format(
            wrap_url(url, tabid, base_uri).replace('"', '%22'))

    # Decode to text so the regex substitutions (whose replacements are text)
    # apply cleanly; encode back to bytes on return.
    css_source = decode(css_source)
    css_source = CSS_IMPORT.sub(_absolutize_css_import, css_source)
    css_source = CSS_URL.sub(_absolutize_css_url, css_source)
    css_source = BAD_CSS.sub('portia-blocked', css_source)
    return encode(css_source)
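# Usage sketch (not part of the original source): the proxied URL shown in
# the comment below is only approximate, since its exact shape is decided by
# wrap_url, which is defined elsewhere in this module.
def _demo_process_css():
    css = 'body { background: url("/img/bg.png"); }'
    rewritten = process_css(css, 0, 'http://example.com/style.css')
    # `rewritten` is bytes and looks roughly like:
    #     b'body { background: url("/proxy?...&tabid=0"); }'
    return rewritten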
def extract_data(url, html, spider, templates):
    items, links = [], []
    if isinstance(html, six.text_type):
        html = encode(html)
    for value in spider.parse(page(url, html)):
        if isinstance(value, Request):
            links.append(value.url)
        elif isinstance(value, DictItem):
            value['_template_name'] = _get_template_name(value['_template'],
                                                         templates)
            items.append(value._values)
        else:
            raise ValueError("Unexpected type %s from spider" % type(value))
    return items, links
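# Usage sketch (not part of the original source): `spider` and `templates`
# would normally come from the project's spider manager, which is outside
# this snippet; the function below only illustrates how the return values
# are consumed.
def _demo_extract_data(url, html, spider, templates):
    items, links = extract_data(url, html, spider, templates)
    # `items` holds the extracted field dicts, each tagged with the name of
    # the template that matched; `links` holds URLs the spider would follow.
    for item in items:
        print(item.get('_template_name'), sorted(item))
    return items, links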
def _migrate_html(self, sample):
    base_path = strip_json(self.context['path']).strip('/')
    # Clean and use annotated body if there is no original body present
    if 'annotated_body' in sample and not sample.get('original_body'):
        sample['original_body'] = self._clean(sample['annotated_body'])
    storage = self.context['storage']
    for key, value in sample.items():
        if (not value or not key.endswith('_body') or
                key == 'annotated_body'):
            continue
        path = '/'.join((base_path, '{}.html'.format(key)))
        html = value.encode('utf-8')
        if hasattr(html, 'encode') and isinstance(html, six.text_type):
            html = encode(html).decode('utf-8')
        if not storage.exists(path):
            storage.save(path, ContentFile(html, path))
    return sample
def start_scrapy_project(project_name):
    """Bootstrap a portia project with default scrapy files."""
    if PY2:
        project_name = encode(project_name)
    files = find_files(project_name)
    out_files = {}
    for path, contents in files.items():
        contents = string.Template(contents).substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name))
        if path.endswith('.tmpl'):
            path = path[:-len('.tmpl')]
        if path.endswith('scrapy.cfg'):
            path = 'scrapy.cfg'
        out_files[path] = contents
    out_files['setup.py'] = SETUP(project_name)
    return out_files
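# Usage sketch (not part of the original source): start_scrapy_project only
# returns a mapping of relative paths to file contents, so persisting the
# bootstrapped project is left to the caller, roughly like this.
def _demo_write_project(target_dir, project_name='example_project'):
    import errno
    import os

    for rel_path, contents in start_scrapy_project(project_name).items():
        full_path = os.path.join(target_dir, rel_path)
        try:
            os.makedirs(os.path.dirname(full_path))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        with open(full_path, 'w') as f:
            f.write(contents)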