Example #1
0
 def _update_sample(self, sample=None, project=None, data=None):
     """Recompile sample with latest annotations.

     Sets ``original_body`` and ``rendered_body`` on the sample from the
     html files kept in ``self.storage``. When a stored file cannot be read
     or is empty, the html is taken from ``self.tab`` instead and saved
     back to storage; if the tab yields nothing either, a stub
     ``<html></html>`` document is used.

     :param sample: sample to update. When ``None`` it is loaded with
         ``self._load_sample(data, project)`` and the storage path is built
         from ``self.data['spider']`` and ``self.data['sample']``.
     :param project: forwarded to ``self._load_sample``.
     :param data: forwarded to ``self._load_sample``.
     :returns: the sample (the result of ``sample.dump()`` when the sample
         has a ``dump`` attribute) with both body fields set.
     :raises IOError: the stored html is unreadable and there is no tab.
     :raises AssertionError: the stored html is empty and there is no tab.
     """
     if sample is None:
         sample = self._load_sample(data, project)
         path = 'spiders/{}/{}/{{}}.html'.format(
             self.data['spider'], self.data['sample'])
     else:
         path = _html_path(sample)
     if hasattr(sample, 'dump'):
         sample = sample.dump()
     html_path = path.format
     for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
         path = html_path(name)
         try:
             html = decode(self.storage.open(path).read())
             if not html:
                 # Raised explicitly instead of using ``assert``: assert
                 # statements are removed under ``python -O``, which would
                 # silently disable the fallback below for empty files.
                 raise AssertionError('empty html in {}'.format(path))
         except (AssertionError, IOError):
             if not self.tab:
                 # Nothing to fall back on: propagate the original error.
                 raise
             html = None
             if type_ == 'raw':
                 html = self.tab._raw_html
             if not html:
                 html = self.tab.html()
             if html:
                 self.storage.save(path, ContentFile(encode(html), path))
                 # NOTE(review): html is decoded again after the try block;
                 # presumably decode() is a no-op on text - confirm before
                 # removing either call.
                 html = decode(html)
             else:
                 html = '<html></html>'
         sample[name] = decode(html)
     return sample
Example #2
0
 def _update_sample(self, sample=None, project=None, data=None):
     """Recompile sample with latest annotations.

     For each of ``original_body`` and ``rendered_body`` the html is read
     from ``self.storage``. If reading fails with ``IOError`` or the file
     is empty, the html comes from ``self.tab`` (``_raw_html`` first for
     the original body, then ``html()``) and is written back to storage.
     If the tab has no html, ``<html></html>`` is stored on the sample.

     :param sample: sample to update; loaded through
         ``self._load_sample(data, project)`` when ``None``, in which case
         the html location is derived from ``self.data``.
     :param project: passed to ``self._load_sample``.
     :param data: passed to ``self._load_sample``.
     :returns: the updated sample; ``sample.dump()`` is used when the
         sample provides ``dump``.
     :raises IOError: no tab is available and the stored html is missing.
     :raises AssertionError: no tab is available and the stored html is
         empty.
     """
     if sample is None:
         sample = self._load_sample(data, project)
         path = 'spiders/{}/{}/{{}}.html'.format(self.data['spider'],
                                                 self.data['sample'])
     else:
         path = _html_path(sample)
     if hasattr(sample, 'dump'):
         sample = sample.dump()
     html_path = path.format
     for name, type_ in (('original_body', 'raw'), ('rendered_body', None)):
         path = html_path(name)
         try:
             html = decode(self.storage.open(path).read())
             if not html:
                 # An ``assert`` here would vanish under ``python -O`` and
                 # let empty html through, so the check raises explicitly
                 # while keeping the exception type callers already see.
                 raise AssertionError('empty html in {}'.format(path))
         except (AssertionError, IOError):
             if not self.tab:
                 # No browser tab to recover from: re-raise unchanged.
                 raise
             # Only the original body may use the tab's raw response.
             html = self.tab._raw_html if type_ == 'raw' else None
             if not html:
                 html = self.tab.html()
             if html:
                 self.storage.save(path, ContentFile(encode(html), path))
                 # NOTE(review): decoded once more below; presumably
                 # decode() leaves text untouched - confirm.
                 html = decode(html)
             else:
                 html = '<html></html>'
         sample[name] = decode(html)
     return sample
Example #3
0
 def open(self, *args, **kwargs):
     """Read the file at ``self.rel_path(*args)``.

     With a truthy ``raw`` keyword the content is read through
     ``self.storage.open`` and returned via ``decode()``. Otherwise the
     file is opened with ``self.storage.open_with_default(path, {})`` and
     its content is parsed as JSON.
     """
     want_raw = kwargs.get('raw')
     target = self.rel_path(*args)
     if want_raw:
         return decode(self.storage.open(target).read())
     return json.loads(self.storage.open_with_default(target, {}).read())
Example #4
0
 def open(self, *args, **kwargs):
     """Return the content of the file located by ``self.rel_path(*args)``.

     Unless the ``raw`` keyword is truthy, the file is opened with
     ``open_with_default`` (default ``{}``) and decoded from JSON. With
     ``raw`` the file is opened directly and its content is returned
     through ``decode()``.
     """
     raw = kwargs.get('raw')
     path = self.rel_path(*args)
     if not raw:
         handle = self.storage.open_with_default(path, {})
         return json.loads(handle.read())
     handle = self.storage.open(path)
     return decode(handle.read())
Example #5
0
def start_scrapy_project(project_name):
    """Bootstrap a portia project with default scrapy files.

    Every file returned by ``find_files(project_name)`` is rendered as a
    ``string.Template`` with ``project_name`` and ``ProjectName`` (the
    camel-cased name) substituted. A trailing ``.tmpl`` is dropped from the
    path and any path ending in ``scrapy.cfg`` is stored as ``scrapy.cfg``.
    A ``setup.py`` entry built by ``SETUP(project_name)`` is added last.

    Returns a dict mapping output paths to file contents.
    """
    template_suffix = '.tmpl'
    generated = {}
    for source_path, raw in find_files(project_name).items():
        template = string.Template(decode(raw))
        rendered = template.substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name))
        target = source_path
        if target.endswith(template_suffix):
            target = target[:-len(template_suffix)]
        if target.endswith('scrapy.cfg'):
            target = 'scrapy.cfg'
        generated[target] = rendered
    generated['setup.py'] = SETUP(project_name)

    return generated
Example #6
0
def start_scrapy_project(project_name):
    """Bootstrap a portia project with default scrapy files.

    The templates come from ``find_files(project_name)``. Each one is
    decoded and has ``$project_name`` / ``$ProjectName`` substituted, then
    it is stored under its path without the ``.tmpl`` suffix (paths ending
    in ``scrapy.cfg`` collapse to ``scrapy.cfg``). ``setup.py`` is
    generated with ``SETUP(project_name)``.

    Returns the mapping of output path to rendered content.
    """
    def _output_path(path):
        # Drop the template suffix, then flatten the scrapy config path.
        if path.endswith('.tmpl'):
            path = path[:-len('.tmpl')]
        return 'scrapy.cfg' if path.endswith('scrapy.cfg') else path

    out_files = {}
    for path, contents in find_files(project_name).items():
        text = string.Template(decode(contents)).substitute(
            project_name=project_name,
            ProjectName=string_camelcase(project_name)
        )
        out_files[_output_path(path)] = text
    out_files['setup.py'] = SETUP(project_name)

    return out_files
Example #7
0
def process_css(css_source, tabid, base_uri):
    """
    Wraps urls in css source.

    Matches of ``CSS_IMPORT`` and ``CSS_URL`` are rewritten so that their
    target goes through ``wrap_url(url, tabid, base_uri)``, with any double
    quote in the wrapped url replaced by ``%22``. Matches of ``BAD_CSS``
    are replaced by ``portia-blocked``. The source is passed through
    ``decode()`` first and the result through ``encode()``.

    >>> url = 'http://scrapinghub.com/style.css'
    >>> process_css('@import "{}"'.format(url), 0, url) # doctest: +ELLIPSIS
    '@import "/proxy?..."'
    """
    def _quoted_proxy_url(target):
        # Keep the wrapped url safe inside a double-quoted css string.
        return wrap_url(target, tabid, base_uri).replace('"', '%22')

    def _absolutize_css_import(match):
        return '@import "{}"'.format(_quoted_proxy_url(match.group(1)))

    def _absolutize_css_url(match):
        bare = match.group(1).strip("\"'")
        return 'url("{}")'.format(_quoted_proxy_url(bare))

    text = decode(css_source)
    substitutions = (
        (CSS_IMPORT, _absolutize_css_import),
        (CSS_URL, _absolutize_css_url),
        (BAD_CSS, 'portia-blocked'),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return encode(text)
Example #8
0
def process_css(css_source, tabid, base_uri):
    """
    Wraps urls in css source.

    The decoded source has every ``CSS_IMPORT`` and ``CSS_URL`` match
    rewritten to point at ``wrap_url(url, tabid, base_uri)`` (double quotes
    in the wrapped url become ``%22``) and every ``BAD_CSS`` match replaced
    with ``portia-blocked``. The result of ``encode()`` is returned.

    >>> url = 'http://scrapinghub.com/style.css'
    >>> process_css('@import "{}"'.format(url), 0, url) # doctest: +ELLIPSIS
    '@import "/proxy?..."'
    """
    def _wrapped(template, target):
        # Fill the css snippet with the proxied, quote-escaped url.
        proxied = wrap_url(target, tabid, base_uri)
        return template.format(proxied.replace('"', '%22'))

    def _absolutize_css_import(match):
        return _wrapped('@import "{}"', match.group(1))

    def _absolutize_css_url(match):
        return _wrapped('url("{}")', match.group(1).strip("\"'"))

    stylesheet = decode(css_source)
    stylesheet = CSS_IMPORT.sub(_absolutize_css_import, stylesheet)
    stylesheet = CSS_URL.sub(_absolutize_css_url, stylesheet)
    return encode(BAD_CSS.sub('portia-blocked', stylesheet))
Example #9
0
def _update_sample(data, socket, sample=None, project=None):
    """Recompile sample with latest annotations.

    Sets ``original_body`` and ``rendered_body`` on the sample from the
    html files in ``socket.storage``. When a file cannot be opened
    (``IOError``) the html is taken from ``socket.tab`` through
    ``decoded_html`` and saved to storage, or replaced by ``<html></html>``
    when the tab yields nothing. Without a tab the ``IOError`` is
    re-raised.

    When ``sample`` is ``None`` it is loaded with
    ``_load_sample(data, socket, project)`` and the html location is built
    from ``data['spider']`` and ``data['sample']``. Returns the sample, or
    its ``dump()`` result when it has a ``dump`` attribute.
    """
    if sample is not None:
        location_template = _html_path(sample)
    else:
        sample = _load_sample(data, socket, project)
        location_template = 'spiders/{}/{}/{{}}.html'.format(
            data['spider'], data['sample'])
    if hasattr(sample, 'dump'):
        sample = sample.dump()
    bodies = (('original_body', 'raw'), ('rendered_body', None))
    for field, kind in bodies:
        try:
            location = location_template.format(field)
            body = decode(socket.storage.open(location).read())
        except IOError:
            if not socket.tab:
                six.reraise(*sys.exc_info())
            body = decoded_html(socket.tab, kind)
            if not body:
                body = '<html></html>'
            else:
                socket.storage.save(location, ContentFile(body, location))
        sample[field] = body
    return sample
Example #10
0
def decoded_html(tab, type_=None):
    """Return the html of ``tab``.

    For ``type_ == 'raw'`` the tab's ``_raw_html`` (or ``tab.html()`` when
    that is falsy) is decoded, using the page's ``document.characterSet``
    as the default encoding. For any other ``type_`` the result of
    ``tab.html()`` is returned as is.
    """
    if type_ != 'raw':
        return tab.html()
    charset = tab.evaljs('document.characterSet')
    source = tab._raw_html or tab.html()
    return decode(source, default=charset)
Example #11
0
def decoded_html(tab, type_=None):
    """Return the html of ``tab``.

    When ``type_`` is ``'raw'`` the html kept on
    ``tab.network_manager._raw_html`` is decoded with the page's
    ``document.characterSet`` as default encoding, falling back to
    ``tab.html()`` when no raw html is stored. Otherwise ``tab.html()`` is
    returned unchanged.
    """
    wants_raw = type_ == 'raw'
    if wants_raw:
        page_charset = tab.evaljs('document.characterSet')
        markup = tab.network_manager._raw_html or tab.html()
        return decode(markup, default=page_charset)
    return tab.html()
Example #12
0
 def _set_tab_html(self, reply, har, content):
     """Remember the raw body of the reply that loaded the tab's page.

     When ``content`` is not ``None`` and the reply url equals
     ``self.tab.url``, the decoded content and url are stored on the tab as
     ``_raw_html`` and ``_raw_url``. Nothing is changed otherwise.

     :param reply: network reply; only ``reply.url().toString()`` is read.
     :param har: not used by this method.
     :param content: body of the reply, or ``None`` when there is none.
     """
     if content is None:
         return
     # Decode before comparing, so that the value compared is the same one
     # stored as ``_raw_url``.
     # NOTE(review): presumably ``self.tab.url`` is already decoded text -
     # confirm against the tab implementation.
     url = decode(reply.url().toString())
     if url == self.tab.url:
         self.tab._raw_html = decode(content)
         self.tab._raw_url = url
Example #13
0
 def url(self):
     """ Current URL

     The main frame url of ``self.web_page`` passed through ``decode()``,
     or an empty string while ``self._closing`` is truthy.
     """
     if not self._closing:
         return decode(self.web_page.mainFrame().url().toString())
     return ''
Example #14
0
 def _set_tab_html(self, reply, har, content):
     """Store the raw html and url of the tab's page on ``self.tab``.

     The decoded reply url is compared with ``self.tab.url``. Only when
     they are equal and ``content`` is not ``None`` are ``_raw_html`` (the
     decoded content) and ``_raw_url`` set on the tab. ``har`` is not used.
     """
     reply_url = decode(reply.url().toString())
     if content is None or reply_url != self.tab.url:
         return
     self.tab._raw_html = decode(content)
     self.tab._raw_url = reply_url
Example #15
0
 def url(self):
     """ Current URL

     Empty string while ``self._closing`` is truthy, otherwise the decoded
     url of the web page's main frame.
     """
     return (
         '' if self._closing
         else decode(self.web_page.mainFrame().url().toString())
     )