Exemple #1
0
 def fetch_links(self, document, base_url):
     """Record every matching link of ``document`` in the global ``fetchlinks``.

     Each ``(element, attribute, value)`` triple yielded by
     ``self._iter_links(document)`` that passes ``self._match_element`` is
     turned into a URL, cleaned, and stored in ``fetchlinks`` under the key
     ``self.get_md5(cleaned_url)`` as ``(tag, attribute, text, raw_url)``.

     Args:
         document: object handed unchanged to ``self._iter_links``.
         base_url: URL used to resolve relative and scheme-relative links.

     Returns:
         None. Results are written to the module-level ``fetchlinks`` mapping.

     Note:
         An ``AttributeError`` raised anywhere in the loop is reported on
         stdout and ends processing of the remaining links.
     """
     global fetchlinks
     try:
         for el, attr, attr_val in self._iter_links(document):
             if not self._match_element(el.tag, attr):
                 continue

             http_at = attr_val.find('http://')
             https_at = attr_val.find('https://')
             if http_at > 0:
                 # An absolute URL embedded after some prefix: keep only the URL.
                 url = attr_val[http_at:]
             elif https_at > 0:
                 url = attr_val[https_at:]
             elif attr_val[:2] == '//':
                 # Scheme-relative link: reuse the "scheme:" prefix of base_url.
                 url = base_url[:base_url.find('://') + 1] + attr_val
             else:
                 # Covers relative links and values that already start with a scheme.
                 url = urljoin(base_url, attr_val)

             # The stored URL is the one obtained before escaping and
             # fragment removal.
             _url = str(url)
             url = escape_ajax(safe_url_string(url))

             # Drop the fragment so the md5 key ignores "#..." suffixes.
             n = url.find('#')
             if n != -1:
                 url = url[:n]

             urlmd5 = self.get_md5(url)
             _tag = str(el.tag)
             _attr = str(attr if attr is not None else '')
             _txt = str(el.text if el.text is not None else '')
             fetchlinks[urlmd5] = (_tag, _attr, _txt, _url)
     except AttributeError as e:
         # Written as a function call so the module also parses on Python 3;
         # the text matches what the former print statement produced.
         print('Exception:  %s %s' % (e, base_url))
    def _set_url(self, url):
        """Validate ``url`` and store its escaped form on ``self._url``.

        The value is passed through ``safe_url_string`` (with
        ``self.encoding``) and then ``escape_ajax`` before being stored.

        Raises:
            TypeError: ``url`` is not an instance of ``six.string_types``.
            ValueError: the stored URL contains no ``':'``.
        """
        if isinstance(url, six.string_types):
            self._url = escape_ajax(safe_url_string(url, self.encoding))
            if ':' in self._url:
                return
            raise ValueError('Missing scheme in request url: %s' % self._url)

        raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
Exemple #3
0
    def _set_url(self, url):
        """Store the escaped form of ``url`` on ``self._url`` after validation.

        ``url`` goes through ``safe_url_string`` (with ``self.encoding``) and
        ``escape_ajax``; the result is kept in ``self._url``.

        Raises:
            TypeError: ``url`` is not an instance of ``six.string_types``.
            ValueError: the stored URL contains no ``":"``.
        """
        if isinstance(url, six.string_types):
            cleaned = safe_url_string(url, self.encoding)
            self._url = escape_ajax(cleaned)
        else:
            raise TypeError(f'请求必须为 str 或者 unicode,而实际内容为 {type(url).__name__} 类型')

        # A URL without any colon cannot carry a scheme.
        if not (":" in self._url):
            raise ValueError(f"缺失请求:{self._url}")
Exemple #4
0
    def _set_url(self, url):
        """Convert, escape and store ``url`` on ``self._url``.

        The value is first converted with ``to_native_str`` (using
        ``self.encoding``), then passed through ``safe_url_string`` and
        ``escape_ajax``.

        Raises:
            TypeError: ``url`` is not an instance of ``six.string_types``.
            ValueError: the stored URL contains no ``':'``.
        """
        if isinstance(url, six.string_types):
            native = to_native_str(url, self.encoding)
            self._url = escape_ajax(safe_url_string(native))
            if ':' not in self._url:
                raise ValueError('Missing scheme in request url: {0!s}'.format(self._url))
            return

        raise TypeError('Request url must be str or unicode, got {0!s}:'.format(type(url).__name__))
Exemple #5
0
    def _set_url(self, url):
        if not isinstance(url, str):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

        s = safe_url_string(url, self.encoding)
        self._url = escape_ajax(s)

        if ('://' not in self._url) and (not self._url.startswith('data:')):
            raise ValueError('Missing scheme in request url: %s' % self._url)
Exemple #6
0
    def _set_url(self, url):
        """Check the type of ``url``, escape it and assign it to ``self._url``.

        Raises:
            TypeError: ``url`` is not an instance of ``six.string_types``.
            ValueError: the escaped URL contains no ``':'``.
        """
        url_is_text = isinstance(url, six.string_types)
        if not url_is_text:
            type_name = type(url).__name__
            raise TypeError('Request url must be str or unicode, got %s:' % type_name)

        # Two-step clean-up: safe_url_string (with the instance encoding),
        # then escape_ajax.
        safe = safe_url_string(url, self.encoding)
        escaped = escape_ajax(safe)
        self._url = escaped

        has_colon = ':' in self._url
        if not has_colon:
            raise ValueError('Missing scheme in request url: %s' % self._url)
Exemple #7
0
def createMetaResources(md5v, dataset):
    """Build one ``MetaResource`` per distinct distribution URL of ``dataset``.

    The access URLs and download URLs of the dataset are concatenated. Each
    one is normalised with ``urlnorm.norm`` and later passed through
    ``safe_url_string`` / ``escape_ajax``. A URL that equals an already
    processed one after that clean-up is logged at debug level and skipped.

    Args:
        md5v: value stored as ``md5`` on every created resource and attached
            to the log records.
        dataset: object handed to the ``getDistribution*`` helpers; its
            ``id`` attribute is logged for duplicates.

    Returns:
        list: the created ``MetaResource`` objects, in first-seen order.
    """
    with Timer(key='createMetaResources'):
        res = (getDistributionAccessURLs(dataset)
               + getDistributionDownloadURLs(dataset))
        bulk_mr = []
        # Set of final URIs already emitted; only used for membership tests.
        seen_uris = set()
        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                # str(e) rather than e.message: the ``message`` attribute
                # does not exist on Python 3 exceptions.
                log.debug("URIFormat", uri=uri, md5=md5v, msg=str(e))
                # Keep the raw URI but flag the resource as invalid.
                valid = False

            # Metadata lookups use the normalised (not yet escaped) URI.
            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)
            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                # Best effort: report the failure and continue with the
                # unescaped URI. ``md5v`` is the checksum in scope here.
                ErrorHandler.handleError(log,
                                         "safe_url_string",
                                         exception=exc,
                                         md5=md5v,
                                         uri=uri,
                                         exc_info=True)

            if uri in seen_uris:
                log.debug("WARNING, duplicate URI",
                          dataset=dataset.id,
                          md5=md5v,
                          uri=uri,
                          format=f,
                          media=m)
                continue

            # Sizes may arrive as numeric strings such as "12.0"; anything
            # that cannot be converted is stored as None.
            try:
                s = int(float(s)) if s is not None else None
            except Exception:
                s = None

            MR = MetaResource(uri=uri,
                              md5=md5v,
                              media=m,
                              valid=valid,
                              format=normaliseFormat(f),
                              size=s,
                              created=toDatetime(c),
                              modified=toDatetime(mod))
            bulk_mr.append(MR)
            seen_uris.add(uri)
        return bulk_mr
Exemple #8
0
    def _set_url(self, url: str) -> None:
        if not isinstance(url, str):
            raise TypeError(
                f"Request url must be str, got {type(url).__name__}")

        s = safe_url_string(url, self.encoding)
        self._url = escape_ajax(s)

        if ('://' not in self._url and not self._url.startswith('about:')
                and not self._url.startswith('data:')):
            raise ValueError(f'Missing scheme in request url: {self._url}')
Exemple #9
0
 def _set_url(self, url):
     """Store the escaped form of ``url`` on ``self._url``.

     A ``str`` value is passed through ``safe_url_string`` and
     ``escape_ajax``. A ``six.text_type`` value is encoded with
     ``self.encoding`` and fed back into ``_set_url``.
     NOTE(review): the two branches look written for Python 2, where
     ``str`` and ``six.text_type`` differ; confirm the target version.

     Raises:
         TypeError: ``url`` is neither ``str`` nor ``six.text_type``, or a
             ``six.text_type`` value arrives while ``self.encoding`` is None.
         ValueError: the stored URL contains no ``':'``.
     """
     if not isinstance(url, str):
         if not isinstance(url, six.text_type):
             raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
         if self.encoding is None:
             raise TypeError('Cannot convert unicode url - %s has no encoding' % type(self).__name__)
         # Re-enter with the encoded value; that call assigns self._url.
         self._set_url(url.encode(self.encoding))
     else:
         self._url = escape_ajax(safe_url_string(url))

     # Checked on every path, including after the recursive call above.
     if ':' not in self._url:
         raise ValueError('Missing scheme in request url: %s' % self._url)
Exemple #10
0
 def _set_url(self, url):
     """Validate ``url`` by type, escape it and assign it to ``self._url``.

     ``str`` input goes through ``safe_url_string`` and ``escape_ajax``.
     ``six.text_type`` input is encoded with ``self.encoding`` and handed
     back to ``_set_url``.

     Raises:
         TypeError: unsupported type, or ``six.text_type`` input while
             ``self.encoding`` is None.
         ValueError: the stored URL contains no ``':'``.
     """
     if isinstance(url, str):
         safe = safe_url_string(url)
         self._url = escape_ajax(safe)
     elif not isinstance(url, six.text_type):
         raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
     elif self.encoding is None:
         raise TypeError('Cannot convert unicode url - %s has no encoding' % type(self).__name__)
     else:
         encoded = url.encode(self.encoding)
         self._set_url(encoded)

     # Runs for both the direct and the re-encoded path.
     if ':' not in self._url:
         raise ValueError('Missing scheme in request url: %s' % self._url)