Example #1
    def end_response(self, request, original_url, referer, connection_status,
                     tabid, reply):
        """Write the proxied reply's status, headers and body back to the
        client request. The reply may be a QNetworkReply (readAll) or a
        requests.Response (iter_content)."""
        if connection_status["finished"]:
            return

        if hasattr(reply, 'readAll'):
            # QNetworkReply: read the full body and pull the HTTP status
            # code out of the reply attributes.
            content = bytes(reply.readAll())
            status_code = to_py(
                reply.attribute(QNetworkRequest.HttpStatusCodeAttribute))
            if status_code == 400:
                # The proxied request was rejected; retry by loading the
                # resource directly.
                return self._load_resource(request, original_url, referer)
            request.setResponseCode(status_code or 500)
        else:
            # requests.Response: assemble the body from streamed chunks.
            content = b''.join(chunk for chunk in reply.iter_content(65535))
            request.setResponseCode(reply.status_code)

        # Defaults, overridden below by whatever the upstream reply sent.
        headers = {
            b'cache-control': b'private',
            b'pragma': b'no-cache',
            b'content-type': b'application/octet-stream',
        }
        for header in (b'content-type', b'cache-control', b'pragma', b'vary',
                       b'max-age'):
            if hasattr(reply, 'hasRawHeader') and reply.hasRawHeader(header):
                headers[header] = bytes(reply.rawHeader(header))
            elif hasattr(reply, 'headers') and header in reply.headers:
                headers[header] = bytes(reply.headers.get(header))
            if header in headers:
                request.setHeader(header, headers[header])

        # Rewrite URLs inside stylesheets so they keep going through the proxy.
        if headers[b'content-type'].strip().startswith(b'text/css'):
            content = encode(process_css(content, tabid, original_url))
        request.write(content)
        request.finish()
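Note that this method drains two different reply types: the hasattr checks distinguish a Qt QNetworkReply (read via readAll and attribute) from a requests.Response (read via iter_content and status_code). A 400 status triggers a direct reload through _load_resource, and CSS bodies are rewritten by process_css (Example #3) so URLs inside stylesheets stay behind the proxy.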
Example #2
    def _update_sample(self, sample=None, project=None, data=None):
        """Recompile sample with latest annotations."""
        if sample is None:
            sample = self._load_sample(data, project)
            path = 'spiders/{}/{}/{{}}.html'.format(self.data['spider'],
                                                    self.data['sample'])
        else:
            path = _html_path(sample)
        if hasattr(sample, 'dump'):
            sample = sample.dump()
        # Bind the format method before `path` is reused inside the loop.
        html_path = path.format
        for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
            try:
                path = html_path(name)
                html = decode(self.storage.open(path).read())
                assert html
            except (AssertionError, IOError):
                # No stored copy: fall back to the live tab, preferring the
                # raw (unrendered) HTML for 'original_body'.
                if not self.tab:
                    six.reraise(*sys.exc_info())
                html = None
                if type_ == 'raw':
                    html = self.tab._raw_html
                if not html:
                    html = self.tab.html()
                if html:
                    self.storage.save(path, ContentFile(encode(html), path))
                    html = decode(html)
                else:
                    html = '<html></html>'
            sample[name] = decode(html)
        return sample
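The fallback chain: each body is read from storage first; if that fails and a live tab exists, the tab supplies it (raw, unrendered HTML for original_body, the rendered DOM otherwise), the recovered copy is saved back to storage, and an empty <html></html> document is the last resort.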
Example #3
def process_css(css_source, tabid, base_uri):
    """
    Wraps urls in css source.

    >>> url = 'http://scrapinghub.com/style.css'
    >>> process_css('@import "{}"'.format(url), 0, url) # doctest: +ELLIPSIS
    '@import "/proxy?..."'
    """
    def _absolutize_css_import(match):
        return '@import "{}"'.format(wrap_url(match.group(1), tabid,
                                              base_uri).replace('"', '%22'))

    def _absolutize_css_url(match):
        url = match.group(1).strip("\"'")
        return 'url("{}")'.format(wrap_url(url, tabid,
                                           base_uri).replace('"', '%22'))
    # Decode to text before substituting: the replacement callbacks return
    # str, so running them over bytes would fail on Python 3.
    css_source = decode(css_source)
    css_source = CSS_IMPORT.sub(_absolutize_css_import, css_source)
    css_source = CSS_URL.sub(_absolutize_css_url, css_source)
    css_source = BAD_CSS.sub('portia-blocked', css_source)
    return encode(css_source)
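To make the substitution step concrete, here is a minimal, self-contained sketch. CSS_IMPORT and wrap_url are hypothetical stand-ins; the real pattern and helper are defined elsewhere in slyd and may differ:

import re

# Hypothetical stand-in for slyd's CSS_IMPORT pattern.
CSS_IMPORT = re.compile(r'@import\s*["\']([^"\']+)["\']')

def wrap_url(url, tabid, base_uri):
    # Stand-in: the real helper resolves url against base_uri and routes
    # it through the /proxy endpoint for the given tab.
    return '/proxy?tabid={}&url={}'.format(tabid, url)

def _absolutize_css_import(match):
    return '@import "{}"'.format(
        wrap_url(match.group(1), 0, 'http://scrapinghub.com')
        .replace('"', '%22'))

print(CSS_IMPORT.sub(_absolutize_css_import,
                     '@import "http://scrapinghub.com/style.css"'))
# -> @import "/proxy?tabid=0&url=http://scrapinghub.com/style.css"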
Example #4
def extract_data(url, html, spider, templates):
    items, links = [], []
    if isinstance(html, six.text_type):
        html = encode(html)
    for value in spider.parse(page(url, html)):
        if isinstance(value, Request):
            # Follow-up request: keep only the link.
            links.append(value.url)
        elif isinstance(value, DictItem):
            # Extracted item: record which template produced it.
            value['_template_name'] = _get_template_name(
                value['_template'], templates)
            items.append(value._values)
        else:
            raise ValueError("Unexpected type %s from spider" % type(value))
    return items, links
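The spider's output is partitioned by type: Request objects contribute their URLs to links, DictItem objects are tagged with the name of the matching template and collected into items, and any other value is treated as a programming error.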
Example #5
    def _migrate_html(self, sample):
        base_path = strip_json(self.context['path']).strip('/')
        # Clean and use annotated body if there is no original body present
        if 'annotated_body' in sample and not sample.get('original_body'):
            sample['original_body'] = self._clean(sample['annotated_body'])
        storage = self.context['storage']
        for key, value in sample.items():
            # Only persist non-empty *_body fields; annotated_body stays
            # inside the sample itself.
            if (not value or not key.endswith('_body') or
                    key == 'annotated_body'):
                continue
            path = '/'.join((base_path, '{}.html'.format(key)))
            html = value
            # Normalize text bodies by round-tripping through utf-8.
            if hasattr(html, 'encode') and isinstance(html, six.text_type):
                html = encode(html).decode('utf-8')
            if not storage.exists(path):
                storage.save(path, ContentFile(html, path))
        return sample
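This migration writes each non-empty *_body field of a legacy sample to its own .html file under the sample's path, synthesizing original_body from the cleaned annotated body when only the latter is present; annotated_body itself is never written out.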
Example #6
def start_scrapy_project(project_name):
    """Bootstrap a portia project with default scrapy files."""
    if PY2:
        project_name = encode(project_name)
    files = find_files(project_name)
    out_files = {}
    for path, contents in files.items():
        # Fill the project-name placeholders in each template file.
        contents = string.Template(contents).substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name))
        # Strip the .tmpl suffix and normalize the config file's location.
        if path.endswith('.tmpl'):
            path = path[:-len('.tmpl')]
        if path.endswith('scrapy.cfg'):
            path = 'scrapy.cfg'
        out_files[path] = contents
    out_files['setup.py'] = SETUP(project_name)

    return out_files
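A hedged usage sketch: assuming only that the function returns a mapping of relative paths to file contents, as the loop above builds, the generated project can be written to disk like this (Python 3):

import os

for rel_path, contents in start_scrapy_project('my_project').items():
    directory = os.path.dirname(rel_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(rel_path, 'w') as f:
        f.write(contents)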