def fetch_links(self, document, base_url):
    """Record every matching link of ``document`` in the global ``fetchlinks``.

    Iterates the ``(element, attribute, attribute value)`` triples produced by
    ``self._iter_links(document)`` and, for those accepted by
    ``self._match_element``, resolves the attribute value to an absolute URL.

    Each link is stored as ``fetchlinks[md5] = (tag, attr, text, url)`` where
    ``md5`` is ``self.get_md5`` of the cleaned, fragment-less URL and ``url``
    is the resolved URL as it was *before* cleaning.

    :param document: object handed to ``self._iter_links``.
    :param base_url: URL used to resolve relative and scheme-relative links.
    :returns: ``None``; results are written to the module-level ``fetchlinks``.

    An ``AttributeError`` raised anywhere in the loop aborts the remaining
    links and is reported on stdout instead of being propagated.
    """
    global fetchlinks
    try:
        for el, attr, attr_val in self._iter_links(document):
            if not self._match_element(el.tag, attr):
                continue

            http_pos = attr_val.find('http://')
            https_pos = attr_val.find('https://')
            if http_pos > 0:
                # An absolute URL embedded after some prefix: keep only the URL.
                # Position 0 is deliberately left to urljoin() below.
                url = attr_val[http_pos:]
            elif https_pos > 0:
                url = attr_val[https_pos:]
            elif attr_val[:2] == '//':
                # Scheme-relative link: borrow "<scheme>:" from the base URL.
                url = base_url[:base_url.find('://') + 1] + attr_val
            else:
                url = urljoin(base_url, attr_val)

            # Keep the resolved URL as-is for the stored record.
            _url = str(url)

            # The lookup key is computed on the cleaned URL without fragment.
            url = escape_ajax(safe_url_string(url)).partition('#')[0]
            urlmd5 = self.get_md5(url)

            _tag = str(el.tag)
            _attr = str(attr if attr is not None else '')
            _txt = str(el.text if el.text is not None else '')
            fetchlinks[urlmd5] = (_tag, _attr, _txt, _url)
    except AttributeError as e:
        # Formatted call form is valid on both Python 2 and Python 3 and
        # produces the same text as the former ``print a, b, c`` statement.
        print('Exception:  %s %s' % (e, base_url))
def _set_url(self, url):
    """Validate ``url`` and store its processed form in ``self._url``.

    The value is passed through ``safe_url_string`` (with ``self.encoding``)
    and then ``escape_ajax`` before being stored.

    :raises TypeError: if ``url`` is not an instance of ``six.string_types``.
    :raises ValueError: if the stored URL contains no ``':'``.
    """
    if not isinstance(url, six.string_types):
        type_name = type(url).__name__
        raise TypeError('Request url must be str or unicode, got %s:' % type_name)

    self._url = escape_ajax(safe_url_string(url, self.encoding))

    # Note: ``self._url`` is already assigned when the check below fails.
    if ':' in self._url:
        return
    raise ValueError('Missing scheme in request url: %s' % self._url)
def _set_url(self, url):
    """Validate ``url`` and store its processed form in ``self._url``.

    The value is passed through ``safe_url_string`` (with ``self.encoding``)
    and then ``escape_ajax`` before being stored.

    :raises TypeError: if ``url`` is not an instance of ``six.string_types``.
    :raises ValueError: if the stored URL contains no ``":"``.
    """
    is_text = isinstance(url, six.string_types)
    if not is_text:
        raise TypeError(f'请求必须为 str 或者 unicode,而实际内容为 {type(url).__name__} 类型')

    safe = safe_url_string(url, self.encoding)
    self._url = escape_ajax(safe)

    # Note: ``self._url`` is already assigned when the check below fails.
    if ":" in self._url:
        return
    raise ValueError(f"缺失请求:{self._url}")
def _set_url(self, url):
    """Validate ``url`` and store its processed form in ``self._url``.

    The value is converted with ``to_native_str`` (using ``self.encoding``),
    then passed through ``safe_url_string`` and ``escape_ajax``.

    :raises TypeError: if ``url`` is not an instance of ``six.string_types``.
    :raises ValueError: if the stored URL contains no ``':'``.
    """
    if not isinstance(url, six.string_types):
        type_name = type(url).__name__
        raise TypeError('Request url must be str or unicode, got {0!s}:'.format(type_name))

    native_url = to_native_str(url, self.encoding)
    cleaned = safe_url_string(native_url)
    self._url = escape_ajax(cleaned)

    # Note: ``self._url`` is already assigned when the check below fails.
    if ':' in self._url:
        return
    raise ValueError('Missing scheme in request url: {0!s}'.format(self._url))
def _set_url(self, url): if not isinstance(url, str): raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__) s = safe_url_string(url, self.encoding) self._url = escape_ajax(s) if ('://' not in self._url) and (not self._url.startswith('data:')): raise ValueError('Missing scheme in request url: %s' % self._url)
def createMetaResources(md5v, dataset):
    """Build one ``MetaResource`` per distinct distribution URL of ``dataset``.

    The access URLs and download URLs of the dataset are concatenated and
    processed in order. Each URL is normalised with ``urlnorm.norm``; if that
    fails the original URL is kept and the resource is flagged ``valid=False``.
    Format, media type, size, creation date and modification date are looked
    up with the normalised URL, which is then passed through
    ``safe_url_string`` / ``escape_ajax``. URLs already seen (after that final
    step) are skipped with a debug log entry.

    :param md5v: digest stored on every created ``MetaResource`` and attached
        to log records.
    :param dataset: dataset object handed to the ``getDistribution*`` helpers;
        its ``id`` attribute is used for logging.
    :returns: list of ``MetaResource`` objects in first-seen order.
    """
    with Timer(key='createMetaResources'):
        res = (getDistributionAccessURLs(dataset)
               + getDistributionDownloadURLs(dataset))
        bulk_mr = []
        seen_uris = set()  # O(1) duplicate detection

        for uri in res:
            valid = True
            try:
                uri = urlnorm.norm(uri.strip())
            except Exception as e:
                # ``e.message`` only exists on Python 2 exceptions; fall back
                # to ``str(e)`` so the handler itself can never raise.
                reason = getattr(e, 'message', None) or str(e)
                log.debug("URIFormat", uri=uri, md5=md5v, msg=reason)
                valid = False

            # Metadata lookups use the normalised (not yet escaped) URI.
            f = getDistributionFormatWithURL(dataset, uri)
            m = getDistributionMediaTypeWithURL(dataset, uri)
            s = getDistributionSizeWithURL(dataset, uri)
            c = getDistributionCreationDateWithURL(dataset, uri)
            mod = getDistributionModificationDateWithURL(dataset, uri)

            try:
                s_uri = safe_url_string(uri, 'utf-8')
                uri = escape_ajax(s_uri)
            except Exception as exc:
                # Best effort: keep the normalised URI and report the failure.
                # (Previously passed the undefined name ``md5`` here.)
                ErrorHandler.handleError(log, "safe_url_string", exception=exc,
                                         md5=md5v, uri=uri, exc_info=True)

            if uri in seen_uris:
                log.debug("WARNING, duplicate URI", dataset=dataset.id,
                          md5=md5v, uri=uri, format=f, media=m)
                continue

            # Sizes may arrive as strings such as "1024.0"; unparsable -> None.
            try:
                s = int(float(s)) if s is not None else None
            except (TypeError, ValueError, OverflowError):
                s = None

            bulk_mr.append(MetaResource(
                uri=uri,
                md5=md5v,
                media=m,
                valid=valid,
                format=normaliseFormat(f),
                size=s,
                created=toDatetime(c),
                modified=toDatetime(mod),
            ))
            seen_uris.add(uri)

        return bulk_mr
def _set_url(self, url: str) -> None: if not isinstance(url, str): raise TypeError( f"Request url must be str, got {type(url).__name__}") s = safe_url_string(url, self.encoding) self._url = escape_ajax(s) if ('://' not in self._url and not self._url.startswith('about:') and not self._url.startswith('data:')): raise ValueError(f'Missing scheme in request url: {self._url}')
def _set_url(self, url):
    """Validate ``url`` and store its processed form in ``self._url``.

    A ``str`` is passed through ``safe_url_string`` and ``escape_ajax`` and
    stored. A ``six.text_type`` value is encoded with ``self.encoding`` and
    handed back to this method.

    :raises TypeError: if ``url`` is neither ``str`` nor ``six.text_type``, or
        if it is ``six.text_type`` and ``self.encoding`` is ``None``.
    :raises ValueError: if the stored URL contains no ``':'``.
    """
    if isinstance(url, str):
        self._url = escape_ajax(safe_url_string(url))
    else:
        if not isinstance(url, six.text_type):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
        if self.encoding is None:
            raise TypeError('Cannot convert unicode url - %s has no encoding' % type(self).__name__)
        # Re-enter with the encoded value so it takes the ``str`` branch.
        self._set_url(url.encode(self.encoding))

    # Runs after either branch; ``self._url`` is already assigned here.
    if ':' in self._url:
        return
    raise ValueError('Missing scheme in request url: %s' % self._url)
def _set_url(self, url):
    """Validate ``url`` and store its processed form in ``self._url``.

    ``str`` input goes through ``safe_url_string`` and ``escape_ajax``.
    ``six.text_type`` input is first encoded with ``self.encoding`` and then
    processed by a recursive call to this method.

    :raises TypeError: if ``url`` is neither ``str`` nor ``six.text_type``, or
        if it is ``six.text_type`` while ``self.encoding`` is ``None``.
    :raises ValueError: if the stored URL contains no ``':'``.
    """
    if isinstance(url, str):
        cleaned = safe_url_string(url)
        self._url = escape_ajax(cleaned)
    elif not isinstance(url, six.text_type):
        raise TypeError('Request url must be str or unicode, got %s:' %
                        type(url).__name__)
    elif self.encoding is None:
        owner = type(self).__name__
        raise TypeError(
            'Cannot convert unicode url - %s has no encoding' % owner)
    else:
        encoded = url.encode(self.encoding)
        self._set_url(encoded)

    # Runs after either successful branch; ``self._url`` is already assigned.
    missing_scheme = ':' not in self._url
    if missing_scheme:
        raise ValueError('Missing scheme in request url: %s' % self._url)