def url_get_mock(url):
    if url in mock_data_url:
        return MockRequestReturn(mock_data_url[url])
    elif url in mock_create_data:
        return MockRequestReturn(mock_create_data[url])
    else:
        raise MissingSchema('Invalid URL')
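# A minimal sketch of how url_get_mock above could back a patched requests.get
# in a test. MockRequestReturn, mock_data_url, and mock_create_data are not
# defined in the snippet, so the stand-ins below are assumptions for
# illustration only.
import requests
from unittest import mock


class MockRequestReturn:
    """Assumed stand-in for requests.Response holding a canned body."""

    def __init__(self, text):
        self.text = text
        self.status_code = 200


mock_data_url = {'https://example.com/feed': '<html>feed</html>'}
mock_create_data = {}

with mock.patch('requests.get', side_effect=url_get_mock):
    assert requests.get('https://example.com/feed').text == '<html>feed</html>'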
def find_end_point(source_website):
    """Uses regular expressions to find a site's webmention endpoint

    Args:
        source_website: a string which represents the website we want to parse.

    Returns:
        A webmention link parsed from the source website as a string, or None
        if the page does not advertise one.

    Raises:
        MissingSchema: if the request raises MissingSchema, we catch it and
            re-raise it with a description.
    """
    try:
        # get the source website
        r = requests.get(source_website)
    except MissingSchema:
        raise MissingSchema(
            "Source website was malformed; could not complete request: {0}".
            format(source_website))
    # find tags with rel="webmention", indicating a link to a webmention
    # endpoint; match against r.text (str) so the pattern and subject types agree
    search_result = re.search(
        r'rel *= *"*webmention"*.*href *=" *(.*)"', r.text)
    if search_result:
        # pick up the captured group which corresponds to the href url
        return search_result.group(1)
    # if we couldn't find any rel="webmention" tags, there's no endpoint
    return None
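# Hypothetical call site for find_end_point; the URL is illustrative. A page
# advertising <link rel="webmention" href="..."> yields the href, while a
# page without the tag yields None.
endpoint = find_end_point("https://example.com/some-post")
if endpoint:
    print("send webmentions to", endpoint)
else:
    print("no webmention endpoint advertised")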
def get_recipe_data(url):
    def _find_recipe(c):
        # recursively walk dicts and lists looking for a schema.org Recipe node
        if isinstance(c, dict):
            if c.get("@type") == "Recipe":
                return c
            for i in c.values():
                res = _find_recipe(i)
                if res:
                    return res
        if isinstance(c, list):
            for i in c:
                res = _find_recipe(i)
                if res:
                    return res
        return []

    html = requests.get(url, headers=HEADERS, cookies=COOKIES)
    data_list = extract(html.text, uniform=True)
    recipe_data = _find_recipe(data_list)
    if not recipe_data:
        raise MissingSchema(
            "Website does not provide a schema.org Recipe schema in a json-ld format"
        )
    return recipe_data
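# Hypothetical usage of get_recipe_data; the URL is illustrative and the
# printed fields are standard schema.org Recipe properties that a site may
# or may not publish.
recipe = get_recipe_data("https://example.com/banana-bread")
print(recipe.get("name"))
print(recipe.get("recipeIngredient"))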
def test_read_profile_page_invalid_url(self, mock_get):
    mock_response = mock.Mock()
    mock_response.status_code = 500
    mock_response.raise_for_status.side_effect = MissingSchema("Invalid URL")
    mock_get.return_value = mock_response
    response = GithubProfileScrapper.read_profile_page(
        user_profile_page=self.github_wrong_profile_mock)
    self.assertEqual(len(response), 0)
def test_from_url_with_invalid_url(self, get):
    response = Response()
    response.status_code = 200
    response._content = b""  # Response bodies are bytes
    get.side_effect = MissingSchema()
    file_url = "lihkdjfh"
    with self.assertRaises(LoadingCustomersFailedException):
        LoadCustomers.from_url(file_url)
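# The test above implies that LoadCustomers.from_url translates low-level
# requests errors into the domain-level LoadingCustomersFailedException. A
# minimal sketch of that translation, assuming nothing about the project's
# real implementation beyond what the test asserts:
import requests
from requests.exceptions import RequestException


class LoadingCustomersFailedException(Exception):
    pass


class LoadCustomers:
    @staticmethod
    def from_url(file_url):
        try:
            # MissingSchema subclasses RequestException, so it is covered here
            return requests.get(file_url)
        except RequestException as ex:
            raise LoadingCustomersFailedException(str(ex)) from ex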
def build_gtfs_representation(dataset_infos):
    try:
        dataset = gtfs_kit.read_feed(dataset_infos.zip_path, dist_units="km")
    except TypeError as te:
        raise TypeError(
            f"Exception '{te}' occurred while reading the GTFS dataset with the GTFS kit library. "
            f"The dataset must be a valid GTFS zip file or URL.\n")
    except MissingSchema as ms:
        raise MissingSchema(
            f"Exception '{ms}' occurred while opening the GTFS dataset with the GTFS kit library. "
            f"The dataset must be a valid GTFS zip file or URL.\n")
    metadata = GtfsMetadata(dataset_infos)
    representation = GtfsRepresentation(
        dataset_infos.entity_code, dataset, metadata)
    return representation
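# A sketch of calling build_gtfs_representation. DatasetInfos here is a
# hypothetical stand-in exposing only the attributes the function reads
# (zip_path, entity_code); the real project object may carry more fields
# that GtfsMetadata expects.
from dataclasses import dataclass


@dataclass
class DatasetInfos:
    zip_path: str
    entity_code: str


infos = DatasetInfos(zip_path="sample-feed.zip", entity_code="us-sample-feed")
representation = build_gtfs_representation(infos)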
def test_send_request_fakey_url(ensembl_rest_client_37):
    """Successful requests are tested by other tests in this file.
    This test will trigger errors instead.
    """
    # GIVEN a completely invalid URL
    url = "fakeyurl"
    # GIVEN a client
    client = ensembl_rest_client_37
    # GIVEN a mocked transport that raises MissingSchema for that URL
    responses.add(
        responses.GET,
        url,
        body=MissingSchema(),
        status=404,
    )
    # WHEN sending the request, THEN the exception is returned, not raised
    data = client.send_request(url)
    assert isinstance(data, MissingSchema)
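# The assertion above implies that send_request returns the caught exception
# object instead of raising it. A minimal sketch of that pattern, which is an
# assumption rather than the project's actual client code:
import requests
from requests.exceptions import HTTPError, MissingSchema


def send_request(url):
    try:
        resp = requests.get(url)
        resp.raise_for_status()
        return resp.json()
    except (MissingSchema, HTTPError) as ex:
        # hand the exception back so the caller can inspect its type
        return ex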
def _download_image(url=None):
    try:
        # issue one HEAD request and reuse it for both the size and type checks
        head = requests.head(url)
        image_size = int(head.headers['content-length'])
        if not CheckImage.allowed_file(head.headers['content-type']):
            raise ValueError('NOT_SUPPORTED')
        if image_size > MAX_SIZE:
            raise Exception('FILE_SIZE')
        response = requests.get(url)
        return load_image_file(BytesIO(response.content))
    except InvalidURL:
        raise InvalidURL(ERR['INV_URL'])
    except MissingSchema:
        raise MissingSchema(ERR['INV_SCHEMA'])
    except Timeout:
        raise Timeout(ERR['TIMEOUT'])
    except AttributeError:
        raise
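# Hypothetical call site for _download_image; the URL is illustrative, and
# ERR, MAX_SIZE, CheckImage, and load_image_file are module-level names the
# function above assumes.
from requests.exceptions import InvalidURL, MissingSchema, Timeout

try:
    image = _download_image("https://example.com/face.jpg")
except (InvalidURL, MissingSchema, Timeout) as ex:
    print("download rejected:", ex)
except Exception as ex:
    # covers ValueError('NOT_SUPPORTED') and Exception('FILE_SIZE')
    print("image validation failed:", ex)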
def __init__(self, url="http://python.org", parser="html.parser", refresh=True,
             save_path=None, stop_words=None, split_string=None):
    """__init__

    Parameters
    ----------
    refresh : bool
        Specifies if page should be read from source. Defaults to True
    save_path : str
        Specifies folder to save the text file of scraped page
    stop_words : list
        A list of words to not include in the output of words()
    split_string : list
        A list of strings with which to split the words on the page
    """
    self.url = url  # .strip()
    self.split_string = self.word_splitters(split_string)
    self.stop_words = self.stop_words(stop_words)
    self.refresh = refresh
    self.parser = parser
    if save_path:
        self.save_path = save_path
    else:
        self.save_path = self.get_save_path()
    try:
        req_text = requests.get(self.url, timeout=(10.0, 10.0)).text
    except MissingSchema:
        raise MissingSchema(
            "url should be in the form <http://domain.extension>")
    except InvalidSchema:
        raise InvalidSchema(
            "Your url, {}, has an invalid schema".format(self.url))
    if self.refresh:
        self.soup = BeautifulSoup(req_text, self.parser)
        self.save_page(self.soup, self.get_save_path())
    else:
        try:
            with open(self.get_save_path(), 'r+') as rh:
                self.soup = BeautifulSoup(rh.read(), self.parser)
        except FileNotFoundError:
            raise FileNotFoundError(
                "File may have been moved. Try again with 'refresh=True'")
    self.raw_links = self.soup.find_all('a', href=True)
def request(self, method, url,
            params=None, data=None, headers=None, cookies=None, files=None,
            auth=None, timeout=None, allow_redirects=True, proxies=None,
            hooks=None, stream=None, verify=None, cert=None, **kwargs):
    """Constructs a :class:`Request <Request>`, prepares it and sends it.
    Returns :class:`Response <Response>` object.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary or bytes to be sent in the query
        string for the :class:`Request`.
    :param data: (optional) Dictionary or bytes to send in the body of the
        :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the
        :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the
        :class:`Request`.
    :param files: (optional) Dictionary of 'filename': file-like-objects
        for multipart encoding upload.
    :param auth: (optional) Auth tuple or callable to enable
        Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) Float describing the timeout of the request.
    :param allow_redirects: (optional) Boolean. Set to True by default.
    :param proxies: (optional) Dictionary mapping protocol to the URL of
        the proxy.
    :param stream: (optional) whether to immediately download the response
        content. Defaults to ``False``.
    :param verify: (optional) if ``True``, the SSL cert will be verified.
        A CA_BUNDLE path can also be provided.
    :param cert: (optional) if String, path to ssl client cert file (.pem).
        If Tuple, ('cert', 'key') pair.
    """
    #===========================================================================
    # add by mz
    error_type = kwargs.get("error_type")
    if error_type:
        from requests.exceptions import InvalidURL, URLRequired, ConnectTimeout, ConnectionError, SSLError, ReadTimeout
        from requests.exceptions import InvalidSchema, MissingSchema, ChunkedEncodingError, ContentDecodingError
        from requests.exceptions import RequestException, HTTPError, ProxyError, Timeout, RetryError, StreamConsumedError
        from requests.exceptions import TooManyRedirects
        get_error = {
            "InvalidURL": InvalidURL(),
            "URLRequired": URLRequired(),
            "ConnectTimeout": ConnectTimeout(),
            "ConnectionError": ConnectionError(),
            "SSLError": SSLError(),
            "ReadTimeout": ReadTimeout(),
            "InvalidSchema": InvalidSchema(),
            "MissingSchema": MissingSchema(),
            "ChunkedEncodingError": ChunkedEncodingError(),
            "ContentDecodingError": ContentDecodingError(),
            "StreamConsumedError": StreamConsumedError(),
            "TooManyRedirects": TooManyRedirects(),
            "RequestException": RequestException(),
            "HTTPError": HTTPError(),
            "ProxyError": ProxyError(),
            "Timeout": Timeout(),
            "RetryError": RetryError(),
        }
        error_ = get_error[error_type]
        raise error_
    #===========================================================================
    method = builtin_str(method)

    # Create the Request.
    req = Request(
        method=method.upper(),
        url=url,
        headers=headers,
        files=files,
        data=data or {},
        params=params or {},
        auth=auth,
        cookies=cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    proxies = proxies or {}

    # Gather clues from the surrounding environment.
    if self.trust_env:
        # Set environment's proxies.
        env_proxies = get_environ_proxies(url) or {}
        for (k, v) in env_proxies.items():
            proxies.setdefault(k, v)

        # Look for configuration.
        if not verify and verify is not False:
            verify = os.environ.get('REQUESTS_CA_BUNDLE')

        # Curl compatibility.
        if not verify and verify is not False:
            verify = os.environ.get('CURL_CA_BUNDLE')

    # Merge all the kwargs.
    proxies = merge_setting(proxies, self.proxies)
    stream = merge_setting(stream, self.stream)
    verify = merge_setting(verify, self.verify)
    cert = merge_setting(cert, self.cert)

    # Send the request.
    send_kwargs = {
        'stream': stream,
        'timeout': timeout,
        'verify': verify,
        'cert': cert,
        'proxies': proxies,
        'allow_redirects': allow_redirects,
    }
    resp = self.send(prep, **send_kwargs)
    return resp
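# The "add by mz" block lets callers simulate a specific requests exception
# without any network I/O. Hypothetical usage, assuming this patched request
# method has been installed on requests.Session:
import requests
from requests.exceptions import ConnectTimeout

session = requests.Session()
try:
    session.request("GET", "https://example.com", error_type="ConnectTimeout")
except ConnectTimeout:
    print("simulated connect timeout")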
def search_url(url):
    '''
    Method to search and extract the text information from a Census Bureau url

    @param url - the url to access and from which to access information
    @return - the text from the html document
    '''
    # try catch block to check for errors in accessing web
    # driver = webdriver.Chrome("/home/dsam99/Downloads/chromedriver")
    error_urls = []
    corpus = []
    html_corpus = []
    # tuple list showing the result for each linked url and the corresponding links
    results = []
    try:
        # using request module to access html
        if url.startswith("https://") or url.startswith("http://"):
            result = requests.get(url)
            print("Accessed webpage")
            # checking if valid request
            if result.status_code == 200:
                print("Extracted html information")
                soup = BeautifulSoup(result.content, 'html.parser')
                links = soup.findAll('a')
                # return links
                pages = []
                # getting links for other pages
                for link in links:
                    if link.text == "View Clip":
                        pages.append(link.attrs['href'])
                print("Extracted all " + str(len(pages)) + " links")
                # searching up other pages
                for page in pages:
                    doc = []
                    try:
                        # checking if all schema is present
                        if page.startswith("https://") or page.startswith("http://"):
                            inner_page = requests.get(page)
                            # checking if valid request
                            if inner_page.status_code == 200:
                                print("Valid page")
                                page_text = BeautifulSoup(
                                    inner_page.content, 'html.parser')
                                # results and html code for whole page
                                results.append((page, 200))
                                html_corpus.append(page_text)
                                # iterating through each page to find p tags for text
                                for p in page_text.findAll('p'):
                                    doc.append(p)
                            else:
                                print("Invalid linked URL")
                                error_urls.append(page)
                                results.append((page, inner_page.status_code))
                                html_corpus.append(None)
                        else:
                            print("Invalid linked URL, does not start with correct Schema")
                            error_urls.append(page)
                            results.append((page, None))
                            html_corpus.append(None)
                        print("Finished page")
                        corpus.append(doc)
                    except socket.error:
                        print("Error with connecting sockets")
                        error_urls.append(page)
                        results.append((page, None))
                    # except BadStatusLine:
                    #     print("Error in status response")
                    #     error_urls.append(page)
                    #     results.append((page, None))
                    except MissingSchema:
                        print("Missing Schema Error")
                        results.append((page, None))
                        error_urls.append(page)
                    except WebDriverException:
                        print("Error in web driver")
                    except TimeoutException:
                        print("Page did not load in given time")
            else:
                print("Invalid URI")
                error_urls.append(url)
                results.append(None)
        else:
            print("Invalid URI")
            error_urls.append(url)
            results.append(None)
    # except block to catch TimeoutException
    except TimeoutException:
        print("Page did not load in given time")
    except socket.error:
        print("Error with connecting sockets")
    # except BadStatusLine:
    #     print("Error in status response")
    except WebDriverException:
        print("Error in web driver")
    # finally block to close web driver
    finally:
        print("reached finally block")
    return corpus, error_urls, results, html_corpus
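# Hypothetical invocation of search_url; the landing URL is illustrative. The
# four return values line up with the lists built above.
corpus, error_urls, results, html_corpus = search_url(
    "https://www.census.gov/library/publications.html")
print(len(corpus), "documents scraped,", len(error_urls), "failing links")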
def request(self, method, url,
            params=None, data=None, headers=None, cookies=None, files=None,
            auth=None, timeout=None, allow_redirects=True, proxies=None,
            hooks=None, stream=None, verify=None, cert=None, json=None,
            **kwargs):
    """Constructs a :class:`Request <Request>`, prepares it and sends it.
    Returns :class:`Response <Response>` object.

    :param method: method for the new :class:`Request` object.
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary or bytes to be sent in the query
        string for the :class:`Request`.
    :param data: (optional) Dictionary, bytes, or file-like object to send
        in the body of the :class:`Request`.
    :param json: (optional) json to send in the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the
        :class:`Request`.
    :param cookies: (optional) Dict or CookieJar object to send with the
        :class:`Request`.
    :param files: (optional) Dictionary of ``'filename': file-like-objects``
        for multipart encoding upload.
    :param auth: (optional) Auth tuple or callable to enable
        Basic/Digest/Custom HTTP Auth.
    :param timeout: (optional) How long to wait for the server to send data
        before giving up, as a float, or a :ref:`(connect timeout, read
        timeout) <timeouts>` tuple.
    :type timeout: float or tuple
    :param allow_redirects: (optional) Set to True by default.
    :type allow_redirects: bool
    :param proxies: (optional) Dictionary mapping protocol or protocol and
        hostname to the URL of the proxy.
    :param stream: (optional) whether to immediately download the response
        content. Defaults to ``False``.
    :param verify: (optional) whether the SSL cert will be verified.
        A CA_BUNDLE path can also be provided. Defaults to ``True``.
    :param cert: (optional) if String, path to ssl client cert file (.pem).
        If Tuple, ('cert', 'key') pair.
    :rtype: requests.Response
    """
    #===========================================================================
    # add by mz
    error_type = kwargs.get("error_type")
    if error_type:
        from requests.exceptions import InvalidURL, URLRequired, ConnectTimeout, ConnectionError, SSLError, ReadTimeout
        from requests.exceptions import InvalidSchema, MissingSchema, ChunkedEncodingError, ContentDecodingError
        from requests.exceptions import RequestException, HTTPError, ProxyError, Timeout, RetryError, StreamConsumedError
        from requests.exceptions import TooManyRedirects
        get_error = {
            "InvalidURL": InvalidURL(),
            "URLRequired": URLRequired(),
            "ConnectTimeout": ConnectTimeout(),
            "ConnectionError": ConnectionError(),
            "SSLError": SSLError(),
            "ReadTimeout": ReadTimeout(),
            "InvalidSchema": InvalidSchema(),
            "MissingSchema": MissingSchema(),
            "ChunkedEncodingError": ChunkedEncodingError(),
            "ContentDecodingError": ContentDecodingError(),
            "StreamConsumedError": StreamConsumedError(),
            "TooManyRedirects": TooManyRedirects(),
            "RequestException": RequestException(),
            "HTTPError": HTTPError(),
            "ProxyError": ProxyError(),
            "Timeout": Timeout(),
            "RetryError": RetryError(),
        }
        error_ = get_error[error_type]
        raise error_
    #===========================================================================
    # Create the Request.
    req = Request(
        method=method.upper(),
        url=url,
        headers=headers,
        files=files,
        data=data or {},
        json=json,
        params=params or {},
        auth=auth,
        cookies=cookies,
        hooks=hooks,
    )
    prep = self.prepare_request(req)

    proxies = proxies or {}

    settings = self.merge_environment_settings(
        prep.url, proxies, stream, verify, cert)

    # Send the request.
    send_kwargs = {
        'timeout': timeout,
        'allow_redirects': allow_redirects,
    }
    send_kwargs.update(settings)
    resp = self.send(prep, **send_kwargs)

    return resp
def prepare_url(self, url, params):
    """Prepares the given HTTP URL."""
    #: Accept objects that have string representations.
    #: We're unable to blindly call unicode/str functions
    #: as this will include the bytestring indicator (b'')
    #: on python 3.x.
    #: https://github.com/requests/requests/pull/2238
    if isinstance(url, bytes):
        url = url.decode('utf8')
    else:
        url = str(url)

    # Remove leading whitespaces from url
    url = url.lstrip()

    need_quote = True
    if url.startswith(key_unquote):
        need_quote = False
        url = url.replace(key_unquote, "")

    # Don't do any URL preparation for non-HTTP schemes like `mailto`,
    # `data` etc to work around exceptions from `url_parse`, which
    # handles RFC 3986 only.
    if ':' in url and not url.lower().startswith('http'):
        self.url = url
        return

    # Support for unicode domain names and paths.
    try:
        scheme, auth, host, port, path, query, fragment = parse_url(url)
    except LocationParseError as e:
        raise InvalidURL(*e.args)

    if not scheme:
        error = ("Invalid URL {0!r}: No schema supplied. "
                 "Perhaps you meant http://{0}?")
        error = error.format(to_native_string(url, 'utf8'))
        raise MissingSchema(error)

    if not host:
        raise InvalidURL("Invalid URL %r: No host supplied" % url)

    # In general, we want to try IDNA encoding the hostname if the string
    # contains non-ASCII characters. This allows users to automatically get
    # the correct IDNA behaviour. For strings containing only ASCII
    # characters, we need to also verify it doesn't start with a wildcard
    # (*), before allowing the unencoded hostname.
    if not unicode_is_ascii(host):
        try:
            host = self._get_idna_encoded_host(host)
        except UnicodeError:
            raise InvalidURL('URL has an invalid label.')
    elif host.startswith(u'*'):
        raise InvalidURL('URL has an invalid label.')

    # Carefully reconstruct the network location
    netloc = auth or ''
    if netloc:
        netloc += '@'
    netloc += host
    if port:
        netloc += ':' + str(port)

    # Bare domains aren't valid URLs.
    if not path:
        path = '/'

    if isinstance(params, (str, bytes)):
        params = to_native_string(params)

    enc_params = self._encode_params(params)
    if enc_params:
        if query:
            query = '%s&%s' % (query, enc_params)
        else:
            query = enc_params

    if need_quote:
        url = requote_uri(
            urlunparse([scheme, netloc, path, None, query, fragment]))
    else:
        url = urlunparse([scheme, netloc, path, None, query, fragment])
    self.url = url
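# prepare_url is where a scheme-less URL turns into MissingSchema; the hint
# in the message comes from the format string above. Illustrative only, and
# the exact wording may vary across requests versions:
import requests
from requests.exceptions import MissingSchema

try:
    requests.get("example.com")  # no http:// or https:// prefix
except MissingSchema as ex:
    print(ex)
    # Invalid URL 'example.com': No schema supplied.
    # Perhaps you meant http://example.com?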
def test_monitor_sends_exception_data_and_hb_on_expected_exceptions(
        self, mock_get_data) -> None:
    errors_exceptions_dict = {
        ReqConnectionError('test'): SystemIsDownException(
            self.test_monitor.system_config.system_name),
        ReadTimeout('test'): SystemIsDownException(
            self.test_monitor.system_config.system_name),
        IncompleteRead('test'): DataReadingException(
            self.test_monitor.monitor_name,
            self.test_monitor.system_config.system_name),
        ChunkedEncodingError('test'): DataReadingException(
            self.test_monitor.monitor_name,
            self.test_monitor.system_config.system_name),
        ProtocolError('test'): DataReadingException(
            self.test_monitor.monitor_name,
            self.test_monitor.system_config.system_name),
        InvalidURL('test'): InvalidUrlException(
            self.test_monitor.system_config.node_exporter_url),
        InvalidSchema('test'): InvalidUrlException(
            self.test_monitor.system_config.node_exporter_url),
        MissingSchema('test'): InvalidUrlException(
            self.test_monitor.system_config.node_exporter_url),
        MetricNotFoundException('test_metric', 'test_endpoint'):
            MetricNotFoundException('test_metric', 'test_endpoint')
    }
    try:
        self.test_monitor._initialise_rabbitmq()
        for error, data_ret_exception in errors_exceptions_dict.items():
            mock_get_data.side_effect = error
            expected_output_data = {
                'error': {
                    'meta_data': {
                        'monitor_name': self.test_monitor.monitor_name,
                        'system_name':
                            self.test_monitor.system_config.system_name,
                        'system_id':
                            self.test_monitor.system_config.system_id,
                        'system_parent_id':
                            self.test_monitor.system_config.parent_id,
                        'time': datetime(2012, 1, 1).timestamp()
                    },
                    'message': data_ret_exception.message,
                    'code': data_ret_exception.code,
                }
            }
            expected_output_hb = {
                'component_name': self.test_monitor.monitor_name,
                'is_alive': True,
                'timestamp': datetime(2012, 1, 1).timestamp()
            }
            # Delete the queue before to avoid messages in the queue on
            # error.
            self.test_monitor.rabbitmq.queue_delete(self.test_queue_name)

            res = self.test_monitor.rabbitmq.queue_declare(
                queue=self.test_queue_name, durable=True, exclusive=False,
                auto_delete=False, passive=False
            )
            self.assertEqual(0, res.method.message_count)
            self.test_monitor.rabbitmq.queue_bind(
                queue=self.test_queue_name, exchange=RAW_DATA_EXCHANGE,
                routing_key='system')
            self.test_monitor.rabbitmq.queue_bind(
                queue=self.test_queue_name, exchange=HEALTH_CHECK_EXCHANGE,
                routing_key='heartbeat.worker')
            self.test_monitor._monitor()

            # By re-declaring the queue again we can get the number of
            # messages in the queue.
            res = self.test_monitor.rabbitmq.queue_declare(
                queue=self.test_queue_name, durable=True, exclusive=False,
                auto_delete=False, passive=True
            )
            # There must be 2 messages in the queue, the heartbeat and the
            # processed data.
            self.assertEqual(2, res.method.message_count)

            # Check that the message received is actually the processed data
            _, _, body = self.test_monitor.rabbitmq.basic_get(
                self.test_queue_name)
            self.assertEqual(expected_output_data, json.loads(body))

            # Check that the message received is actually the HB
            _, _, body = self.test_monitor.rabbitmq.basic_get(
                self.test_queue_name)
            self.assertEqual(expected_output_hb, json.loads(body))
    except Exception as e:
        self.fail("Test failed: {}".format(e))