def check_and_correct_url(url, method='GET', keep_fragments=False):
    """Check a url for issues, record exceptions, and attempt to correct the url.

    :param url: URL to check and correct
    :param method: http method to use, as a string. Default is 'GET'
    :param keep_fragments: whether to keep the URL fragment in the corrected URL
    """
    returnval = ResultDict({'initial_url': url})
    try:
        logger.info('Checking URL: {0}'.format(url))
        scheme, netloc, path, params, query, fragments = urlparse(str(url))
        if not scheme:
            # Maybe it is an http url without the scheme?
            scheme, netloc, path, params, query, fragments = urlparse(
                "http://{0}".format(str(url)))
        elif not (scheme.startswith('http') or scheme.startswith('sftp')
                  or scheme.startswith('ftp')):
            # Not a typical 'web' scheme
            raise InvalidURL('Invalid scheme (not http(s) or (s)ftp)')
        if not netloc:
            raise InvalidURL('Invalid network location')
        corrected_url = urlunparse((scheme, netloc, path, params, query,
                                    fragments if keep_fragments else ''))
        returnval['valid_url'] = True
        returnval['corrected_url'] = corrected_url
    except Exception as e:
        logger.warning("Error validating url '{url}'".format(url=url))
        returnval.add_error(e)
        returnval['valid_url'] = False
    return returnval

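# Hedged aside (not part of the original function): the retry-with-a-scheme
# trick above works because urlparse puts a scheme-less URL entirely into the
# path component, leaving netloc empty. A minimal, stdlib-only sketch:
from urllib.parse import urlparse

assert urlparse("example.com/data").netloc == ""
assert urlparse("http://example.com/data").netloc == "example.com"
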
def get_root_uri(service_endpoint: str) -> str:
    provider_uri = service_endpoint
    api_version = DataServiceProvider.get_api_version()
    if api_version in provider_uri:
        i = provider_uri.find(api_version)
        provider_uri = provider_uri[:i]
    parts = provider_uri.split("/")
    if len(parts) < 2:
        raise InvalidURL(f"InvalidURL {service_endpoint}.")
    if parts[-2] == "services":
        provider_uri = "/".join(parts[:-2])
    result = DataServiceProvider._remove_slash(provider_uri)
    if not result:
        raise InvalidURL(f"InvalidURL {service_endpoint}.")
    try:
        root_result = "/".join(parts[0:3])
        response = requests.get(root_result).json()
    except (requests.exceptions.RequestException, JSONDecodeError):
        raise InvalidURL(f"InvalidURL {service_endpoint}.")
    if "providerAddress" not in response:
        raise InvalidURL(
            f"Invalid Provider URL {service_endpoint}, no providerAddress."
        )
    return result

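# Hedged, standalone walk-through of the string surgery in get_root_uri; the
# endpoint and api_version values below are made up for illustration (the real
# value comes from DataServiceProvider.get_api_version()):
endpoint = "https://provider.example.com/api/v1/services/compute"
api_version = "/api/v1"  # hypothetical
uri = endpoint[:endpoint.find(api_version)] if api_version in endpoint else endpoint
parts = uri.split("/")
if len(parts) >= 2 and parts[-2] == "services":
    uri = "/".join(parts[:-2])
print(uri.rstrip("/"))  # https://provider.example.com
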
def __download_climate_observations_data(remote_file: str) -> bytes:
    try:
        zip_file = download_file_from_dwd(remote_file)
    except InvalidURL as e:
        raise InvalidURL(
            f"Error: the station data {remote_file} could not be reached."
        ) from e
    except Exception:
        raise FailedDownload(f"Download failed for {remote_file}")

    try:
        zip_file_opened = ZipFile(zip_file)

        # Files of archive
        archive_files = zip_file_opened.namelist()

        for file in archive_files:
            # If the product file is found, read it as bytes, close the
            # zipfile and return the bytes
            if file.startswith(PRODUCT_FILE_IDENTIFIER):
                file_in_bytes = zip_file_opened.open(file).read()
                zip_file_opened.close()
                return file_in_bytes

        # If no product file was found, raise an exception
        raise ProductFileNotFound(
            f"The archive of {remote_file} does not hold a 'produkt' file.")
    except BadZipFile as e:
        raise BadZipFile(
            f"The archive of {remote_file} seems to be corrupted.") from e

def send(self, request, stream=None, timeout=None, verify=None, cert=None,
         proxies=None):
    parsed_url = urlparse.urlparse(request.url)

    # We only work for requests with a host of localhost
    if parsed_url.netloc.lower() != "localhost":
        raise InvalidURL("Invalid URL %r: Only localhost is allowed" %
                         request.url)

    real_url = urlparse.urlunparse(parsed_url[:1] + ("",) + parsed_url[2:])
    pathname = url_to_path(real_url)

    resp = Response()
    resp.status_code = 200
    resp.url = real_url

    stats = os.stat(pathname)
    modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
    resp.headers = CaseInsensitiveDict({
        "Content-Type": mimetypes.guess_type(pathname)[0] or "text/plain",
        "Content-Length": stats.st_size,
        "Last-Modified": modified,
    })

    resp.raw = LocalFSResponse(open(pathname, "rb"))
    resp.close = resp.raw.close

    return resp

def _create_meta_index_for_climate_observations(
    parameter_set: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """Function used to create meta index DataFrame parsed from the text files
    that are located in each data section of the station data directory of the
    weather service.

    Args:
        parameter_set: observation measure
        resolution: frequency/granularity of measurement interval
        period: current, recent or historical files

    Return:
        DataFrame with parsed columns of the corresponding text file. Columns
        are translated into English and data is not yet complete as file
        existence is not checked.
    """
    parameter_path = build_path_to_parameter(parameter_set, resolution, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files(url, recursive=True)

    # Find the one meta file from the files listed on the server
    meta_file = _find_meta_file(files_server, url)

    try:
        file = download_file_from_dwd(meta_file)
    except InvalidURL as e:
        raise InvalidURL(
            f"Error: reading metadata {meta_file} file failed.") from e

    meta_index = pd.read_fwf(
        filepath_or_buffer=file,
        colspecs=METADATA_FIXED_COLUMN_WIDTH,
        skiprows=[1],
        dtype=str,
        encoding="ISO-8859-1",
    )

    # Fix column names, as header is not aligned to fixed column widths
    meta_index.columns = "".join([
        column for column in meta_index.columns
        if "unnamed" not in column.lower()
    ]).split(" ")

    meta_index = meta_index.rename(columns=str.lower)
    meta_index = meta_index.rename(columns=GERMAN_TO_ENGLISH_COLUMNS_MAPPING)

    return meta_index

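# Hedged mini-example of the fixed-width parsing step above; the column specs
# and sample rows here are invented stand-ins for METADATA_FIXED_COLUMN_WIDTH
# and the real DWD description file:
import io
import pandas as pd

sample = "00001 19370101\n00044 19690101\n"
df = pd.read_fwf(
    io.StringIO(sample),
    colspecs=[(0, 5), (6, 14)],
    names=["station_id", "from_date"],
    dtype=str,
)
print(df)
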
def test_raises_unexpected_error(self):
    responses.add(
        responses.GET,
        self.FETCH_URL,
        body=InvalidURL('Unittest Mock InvalidURL!!'),
        status=200
    )
    self.assertRaises(InvalidURL, self.stk.fetch, 2015, 5)

def test_deepsea_invalid_url_error(self):
    with mock.patch("rest_client.TimeoutRequestsSession") as mock_requests_session:
        mock_requests_session().post.side_effect = InvalidURL("test")

        api = DeepSea('', 8000, 'auto', 'hello', 'world')
        with self.assertRaisesRegexp(RequestException, "^test$") as context:
            api._login()
        self.assertEqual(context.exception.status_code, None)

        self.assertTrue(mock_requests_session().post.called)

def test_raises_unexpected_error(self):
    realtime.proxies_list = []

    responses.add(responses.GET,
                  'http://mis.twse.com.tw/stock/index.jsp',
                  status=200)
    responses.add(responses.GET,
                  self.FETCH_URL['2330'],
                  body=InvalidURL('Unittest Mock InvalidURL!!'),
                  status=200)

    self.assertRaises(InvalidURL, realtime.get, '2330')

def get_status_code(url):
    """
    Gets the current status code of a full url or short url.
    Short urls will be prepended with https.
    """
    if valid_url(url):
        return get(url).status_code
    elif valid_url("https://" + url):
        return get("https://" + url).status_code
    else:
        raise InvalidURL(f"{url} isn't a valid URL!")

def get_root_uri(service_endpoint):
    provider_uri = service_endpoint
    api_version = DataServiceProvider.get_api_version()
    if api_version in provider_uri:
        i = provider_uri.find(api_version)
        provider_uri = provider_uri[:i]
    parts = provider_uri.split("/")
    if len(parts) < 2:
        raise InvalidURL(f"InvalidURL {service_endpoint}.")
    if parts[-2] == "services":
        provider_uri = "/".join(parts[:-2])
    result = DataServiceProvider._remove_slash(provider_uri)
    if not result:
        raise InvalidURL(f"InvalidURL {service_endpoint}.")
    return result

def csv_request(self, url):
    """ Get and parse csv file from url """
    print "Get {}".format(url)
    r = requests.get(url)
    if r.status_code != 200:
        if r.status_code == 404:
            raise InvalidURL()
        else:
            raise RequestException(r.content)
    return r.content

def _create_meta_index_for_subdaily_extreme_wind(period: Period) -> pd.DataFrame:
    """Create metadata DataFrame for subdaily wind extreme

    :param period: period for which metadata is acquired
    :return: pandas.DataFrame with combined information for both 3hourly (fx3)
        and 6hourly (fx6) wind extremes
    """
    parameter_path = build_path_to_parameter(
        DwdObservationDataset.WIND_EXTREME, Resolution.SUBDAILY, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files_fsspec(url, ttl=CacheExpiry.METAINDEX)

    # Find the one meta file from the files listed on the server
    meta_file_fx3 = _find_meta_file(files_server, url, ["fx3", "beschreibung", "txt"])
    meta_file_fx6 = _find_meta_file(files_server, url, ["fx6", "beschreibung", "txt"])

    try:
        meta_file_fx3 = download_file(meta_file_fx3, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file_fx3} file failed.") from e

    try:
        meta_file_fx6 = download_file(meta_file_fx6, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file_fx6} file failed.") from e

    df_fx3 = _read_meta_df(meta_file_fx3)
    df_fx6 = _read_meta_df(meta_file_fx6)

    df_fx6 = df_fx6.loc[
        df_fx6[Columns.STATION_ID.value].isin(
            df_fx3[Columns.STATION_ID.value].tolist()),
        :,
    ]

    return pd.concat([df_fx3, df_fx6])

def test_log__invalid_url(self, logger_mock, delete_mock):
    # Mock a URL error response
    delete_mock.return_value.json.side_effect = ValueError
    delete_mock.return_value.raise_for_status.side_effect = InvalidURL(
        'something happened',
    )

    purge_all_from_cache()

    logger_mock.error.assert_called_once_with(
        'Couldn\'t purge from Cloudflare with data: %s. InvalidURL: %s',
        '{}',
        'something happened'
    )

def fix_url(url):
    """
    Any Unicode characters in a URL become encoded in UTF8.
    It does this by unquoting, encoding as UTF8, and quoting again.
    It must not get tripped up on '+' characters which are encoded spaces.
    e.g. This should be left unchanged or the "+" changed to "%20":
    http://data.defra.gov.uk/inspire/UK+MR_indicators_Report_2011V6.csv
    """
    from requests.exceptions import InvalidURL

    # resource.url is type unicode, but if it isn't for some reason, decode
    if not isinstance(url, unicode):
        url = url.decode('utf8')

    # parse it
    parsed = urlparse.urlsplit(url)

    # divide the netloc further
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    host, colon2, port = hostport.partition(':')

    def fix_common_host_problems(host):
        return host.replace('..', '.')

    # encode each component
    scheme = parsed.scheme.encode('utf8')
    user = urllib.quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = urllib.quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    try:
        host = fix_common_host_problems(host).encode('idna')
    except UnicodeError:
        # This is an invalid URL, so we should complain by abusing the
        # requests InvalidURL exception
        raise InvalidURL("URL is not valid")
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        urllib.quote(urllib.unquote_plus(pce).encode('utf8'), '')
        for pce in parsed.path.split('/'))
    query = urllib.quote(urllib.unquote(parsed.query).encode('utf8'), '=&?/')
    fragment = urllib.quote(urllib.unquote(parsed.fragment).encode('utf8'))

    # put it back together
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))

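# Hedged aside (Python 3 syntax, not from the original module): the 'idna'
# codec used in fix_url both encodes internationalized hostnames and rejects
# malformed ones, which is exactly what gets turned into InvalidURL above:
try:
    print("münchen.example".encode("idna"))  # b'xn--mnchen-3ya.example'
    print("bad..host".encode("idna"))        # empty label -> UnicodeError
except UnicodeError as err:
    print("invalid host:", err)
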
def _init_model_from_server(model_server):
    # type: (EndpointConfig) -> Optional[Tuple[Text, Text]]
    """Initialise a Rasa Core model from a URL."""
    if not is_url(model_server.url):
        raise InvalidURL(model_server.url)

    model_directory = tempfile.mkdtemp()

    fingerprint = _pull_model_and_fingerprint(model_server,
                                              model_directory,
                                              fingerprint=None)
    return fingerprint, model_directory

def download_file_from_url(url: Text) -> Text:
    """Download a story file from a URL and persist it into a temp file.

    Returns the file path of the temp file that contains the
    downloaded content."""
    if not nlu_utils.is_url(url):
        raise InvalidURL(url)

    response = requests.get(url)
    response.raise_for_status()
    filename = nlu_utils.create_temporary_file(response.content,
                                               mode="w+b")
    return filename

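# Hedged, dependency-free sketch of the same download-to-temp-file pattern,
# using only requests and the standard library (the URL is a placeholder;
# nlu_utils.create_temporary_file above is Rasa's own helper):
import tempfile
import requests

response = requests.get("https://example.com/stories.md")
response.raise_for_status()
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".md", delete=False) as f:
    f.write(response.content)
print("saved to", f.name)
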
def _init_model_from_server(model_server: EndpointConfig
                            ) -> Optional[typing.Tuple[Text, Text]]:
    """Initialise a Rasa Core model from a URL."""
    if not is_url(model_server.url):
        raise InvalidURL(model_server.url)

    model_directory = tempfile.mkdtemp()

    fingerprint = _pull_model_and_fingerprint(model_server,
                                              model_directory,
                                              fingerprint=None)
    return fingerprint, model_directory

def _download_climate_observations_data_parallel(
        remote_file: Union[str, Path]) -> BytesIO:
    """
    This function downloads the station data for which the link is
    provided by the 'select_dwd' function. It checks the shortened filepath
    (just the zipfile) for its parameters, creates the full filepath and
    downloads the file(s) according to the set up folder.

    Args:
        remote_file: contains path to file that should be downloaded
            and the path to the folder to store the files

    Returns:
        The content of the 'produkt' file inside the archive, as BytesIO.
    """
    try:
        zip_file = download_file_from_dwd(remote_file,
                                          DWDCDCBase.CLIMATE_OBSERVATIONS)
    except InvalidURL as e:
        raise InvalidURL(
            f"Error: the station data {remote_file} couldn't be reached."
        ) from e
    except Exception:
        raise FailedDownload(f"Download failed for {remote_file}")

    try:
        zip_file_opened = ZipFile(zip_file)

        # Files of archive
        archive_files = zip_file_opened.namelist()

        for file in archive_files:
            # If the product file is found, read it as bytes, close the
            # zipfile and return the bytes
            if file.startswith(PRODUCT_FILE_IDENTIFIER):
                file_in_bytes = BytesIO(zip_file_opened.open(file).read())
                zip_file_opened.close()
                return file_in_bytes

        # If no product file was found, raise an exception
        raise ProductFileNotFound(
            f"The archive of {remote_file} does not hold a 'produkt' file.")
    except BadZipFile as e:
        raise BadZipFile(
            f"The archive of {remote_file} seems to be corrupted.") from e

def _update_model_from_server(model_server: EndpointConfig,
                              agent: 'Agent') -> None:
    """Load a zipped Rasa Core model from a URL and update the passed agent."""
    if not is_url(model_server.url):
        raise InvalidURL(model_server.url)

    model_directory = tempfile.mkdtemp()

    new_model_fingerprint = _pull_model_and_fingerprint(
        model_server, model_directory, agent.fingerprint)

    if new_model_fingerprint:
        _load_and_set_updated_model(agent, model_directory,
                                    new_model_fingerprint)
    else:
        logger.debug("No new model found at "
                     "URL {}".format(model_server.url))

def _download_metadata_file_for_1minute_precipitation(
        metadata_file: str) -> BytesIO:
    """Download a metadata file and return its content as a BytesIO object.

    As this opens lots of requests (there are approx. 1000 different files
    to open for 1minute data), the download is attempted at most three times
    per file to assure the file is read successfully.

    Args:
        metadata_file (str) - the file that shall be downloaded and
            returned as bytes.

    Return:
        A BytesIO object to which the opened file was written beforehand.
    """
    try:
        return download_file(metadata_file, ttl=CacheExpiry.NO_CACHE)
    except InvalidURL as e:
        raise InvalidURL(f"Reading metadata {metadata_file} file failed.") from e

def testGetImageFromUrlNotWorthRetry(self, mock_io, mock_requests):
    # We should give up quickly for non-timeout errors.
    image_url = 'https://some.url.com/image.jpg'
    orig_img = Image.new('RGBA', (200, 200))
    img_bytes = io.BytesIO()
    orig_img.save(img_bytes, format='png')

    mock_response = mock.MagicMock()
    mock_response.raw.read.return_value = 'something'
    mock_io.BytesIO.return_value = img_bytes
    mock_requests.get.side_effect = [InvalidURL(), ReadTimeout(),
                                     mock_response]

    with self.assertRaises(InvalidURL):
        atlasmaker_io.get_image(image_url, request_timeout=30,
                                http_max_retries=3,
                                http_retry_interval=0)

    self.assertEqual(mock_requests.get.call_count, 1)

def _download_image(url=None):
    try:
        IMAGE_SIZE = int(requests.head(url).headers['content-length'])
        if not CheckImage.allowed_file(
                requests.head(url).headers['content-type']):
            raise ValueError('NOT_SUPPORTED')
        if IMAGE_SIZE > MAX_SIZE:
            raise Exception('FILE_SIZE')
        response = requests.get(url)
        return load_image_file(BytesIO(response.content))
    except InvalidURL:
        raise InvalidURL(ERR['INV_URL'])
    except MissingSchema:
        raise MissingSchema(ERR['INV_SCHEMA'])
    except Timeout:
        raise Timeout(ERR['TIMEOUT'])
    except AttributeError:
        raise

def _update_model_from_server(model_server: EndpointConfig,
                              project: 'Project') -> None:
    """Load a zipped Rasa NLU model from a URL and update the passed project."""
    if not is_url(model_server.url):
        raise InvalidURL(model_server.url)

    model_directory = tempfile.mkdtemp()

    new_model_fingerprint, filename = _pull_model_and_fingerprint(
        model_server, model_directory, project.fingerprint)

    if new_model_fingerprint:
        model_name = _get_remote_model_name(filename)
        project.fingerprint = new_model_fingerprint
        project.update_model_from_dir_and_unload_others(model_directory,
                                                        model_name)
    else:
        logger.debug("No new model found at URL {}".format(model_server.url))

def get(z, x, y):
    "downloads data to z/x/y.json, relative to the current working directory"
    url = template.format(server=random.choice(["a", "b", "c"]),
                          archive=archive, z=z, x=x, y=y)
    tile = requests.get(url)
    if tile.status_code != 200:
        raise InvalidURL("Couldn't download", z, x, y, tile.status_code)
    if not ('features' in tile.json() and tile.json()['features']):
        # OSM generates a geojson file that looks valid but is empty if you
        # ask for an area it doesn't know about; skip saving these
        # print("\tempty")
        return
    print(url)
    os.makedirs(os.path.join(str(z), str(x)), exist_ok=True)
    with open(os.path.join(*[str(e) for e in [z, x, y]]) + ".json", "w") as j:
        j.write(tile.text)

def send(
    self,
    request: requests.models.PreparedRequest,
    stream: bool = False,
    timeout: Union[None, float, Tuple[float, float], Tuple[float, None]] = None,
    verify: Union[bool, str] = True,
    cert: Union[None, Union[bytes, Text], Container[Union[bytes, Text]]] = None,
    proxies: Optional[Mapping[str, str]] = None,
) -> requests.models.Response:
    try:
        # This is very similar to the parser in
        # urllib.request.DataHandler in the standard library.
        assert request.url
        scheme, data_str = request.url.split(':', 1)
        mediatype, data_str = data_str.split(',', 1)
        data_bytes = unquote_to_bytes(data_str)
        if mediatype.endswith(';base64'):
            data_bytes = base64.decodebytes(data_bytes)
            mediatype = mediatype[:-len(';base64')]
        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"
    except BaseException as err:
        raise InvalidURL(err, request=request)

    # Now pack that info in to a urllib3.response.HTTPResponse.
    u3resp = HTTPResponse(
        status=200,
        reason='OK',
        headers={
            'Content-Type': mediatype,
            'Content-Length': str(len(data_bytes)),
        },
        body=io.BytesIO(data_bytes),
    )

    # Now pack that info in to a requests.models.Response.
    return HTTPAdapter.build_response(cast(HTTPAdapter, self), request, u3resp)

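# Hedged, standalone version of the data-URL parsing done by the adapter
# above, run on a small base64 payload (the URL is made up):
import base64
from urllib.parse import unquote_to_bytes

url = "data:text/plain;base64,aGVsbG8="
_scheme, rest = url.split(':', 1)
mediatype, payload = rest.split(',', 1)
body = unquote_to_bytes(payload)
if mediatype.endswith(';base64'):
    body = base64.decodebytes(body)
    mediatype = mediatype[:-len(';base64')]
assert (mediatype, body) == ("text/plain", b"hello")
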
def _update_model_from_server(model_server: EndpointConfig,
                              agent: 'Agent') -> None:
    """Load a zipped Rasa Core model from a URL and update the passed agent."""
    if not is_url(model_server.url):
        raise InvalidURL(model_server.url)

    model_directory = tempfile.mkdtemp()

    new_model_fingerprint = _pull_model_and_fingerprint(
        model_server, model_directory, agent.fingerprint)

    if new_model_fingerprint:
        domain_path = os.path.join(os.path.abspath(model_directory),
                                   "domain.yml")
        domain = Domain.load(domain_path)
        policy_ensemble = PolicyEnsemble.load(model_directory)
        agent.update_model(domain, policy_ensemble, new_model_fingerprint)
    else:
        logger.debug("No new model found at "
                     "URL {}".format(model_server.url))

def __download_climate_observations_data(remote_file: str) -> bytes:
    try:
        file = download_file(remote_file, ttl=CacheExpiry.FIVE_MINUTES)
    except InvalidURL as e:
        raise InvalidURL(
            f"Error: the station data {remote_file} could not be reached."
        ) from e
    except Exception:
        raise FailedDownload(f"Download failed for {remote_file}")

    try:
        zfs = ZipFileSystem(file)
    except BadZipFile as e:
        raise BadZipFile(
            f"The archive of {remote_file} seems to be corrupted.") from e

    product_file = zfs.glob("produkt*")

    if len(product_file) != 1:
        raise ProductFileNotFound(
            f"The archive of {remote_file} does not hold a 'produkt' file.")

    return zfs.open(product_file[0]).read()

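# Hedged illustration (assumes fsspec is installed; not part of the original
# module): build a tiny archive in memory and read its 'produkt' member the
# same way __download_climate_observations_data does:
from io import BytesIO
from zipfile import ZipFile

from fsspec.implementations.zip import ZipFileSystem

buffer = BytesIO()
with ZipFile(buffer, "w") as archive:
    archive.writestr("produkt_klima_tag.txt", "STATIONS_ID;MESS_DATUM\n")
buffer.seek(0)

zfs = ZipFileSystem(buffer)
product_file = zfs.glob("produkt*")
assert len(product_file) == 1
print(zfs.open(product_file[0]).read())
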
def _create_meta_index_for_climate_observations(
    dataset: DwdObservationDataset,
    resolution: Resolution,
    period: Period,
) -> pd.DataFrame:
    """Function used to create meta index DataFrame parsed from the text files
    that are located in each data section of the station data directory of the
    weather service.

    Args:
        dataset: observation measure
        resolution: frequency/granularity of measurement interval
        period: current, recent or historical files

    Return:
        DataFrame with parsed columns of the corresponding text file. Columns
        are translated into English and data is not yet complete as file
        existence is not checked.
    """
    parameter_path = build_path_to_parameter(dataset, resolution, period)

    url = reduce(
        urljoin,
        [
            DWD_SERVER,
            DWD_CDC_PATH,
            DWDCDCBase.CLIMATE_OBSERVATIONS.value,
            parameter_path,
        ],
    )

    files_server = list_remote_files_fsspec(url, ttl=CacheExpiry.METAINDEX)

    # Find the one meta file from the files listed on the server
    meta_file = _find_meta_file(files_server, url, ["beschreibung", "txt"])

    try:
        file = download_file(meta_file, ttl=CacheExpiry.METAINDEX)
    except InvalidURL as e:
        raise InvalidURL(f"Error: reading metadata {meta_file} file failed.") from e

    return _read_meta_df(file)

async def _update_model_from_server(
    model_server: EndpointConfig,
    nlu_model: "NLUModel",
    component_builder: ComponentBuilder,
) -> None:
    """Load a tar.gz Rasa NLU model from a URL and update the passed nlu model."""
    if not is_url(model_server.url):
        raise InvalidURL(model_server.url)

    model_directory = tempfile.mkdtemp()

    new_model_fingerprint, filename = await _pull_model_and_fingerprint(
        model_server, model_directory, nlu_model.fingerprint
    )

    if new_model_fingerprint:
        model_name = _get_remote_model_name(filename)
        nlu_model.fingerprint = new_model_fingerprint
        nlu_model.update_model(component_builder, model_directory, model_name)
    else:
        logger.debug("No new model found at URL '{}'".format(model_server.url))

def get(url, timeout=None, headers=None, encoding=None):
    """
    Send a GET request.

    :param url: URL
    :param timeout: timeout in seconds
    :param headers: request headers
    :param encoding: response encoding
    :return:
    """
    # Validate parameters: the URL must not be empty
    if not url:
        raise InvalidURL("Invalid URL %r" % url)

    response = requests.get(url, headers=headers, timeout=timeout)
    if not response:
        return None
    if encoding:
        response.encoding = encoding
    if response.status_code == 200:
        r_text = response.text
        response.close()
        return r_text
    response.close()
    return None