Code example #1
def url_get_mock(url):
    # Membership tests work directly on the dicts; .keys() is unnecessary.
    if url in mock_data_url:
        return MockRequestReturn(mock_data_url[url])
    elif url in mock_create_data:
        return MockRequestReturn(mock_create_data[url])
    else:
        raise MissingSchema('Invalid URL')
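A minimal sketch of how a helper like this is typically wired into a test, assuming `url_get_mock` and its data dicts are importable from the module above and that `mock_data_url` has no entry for the test URL:

import unittest
from unittest import mock

import requests
from requests.exceptions import MissingSchema


class UrlGetMockTest(unittest.TestCase):
    # mock.patch accepts any callable as side_effect; url_get_mock then
    # decides per-URL whether to return canned data or raise MissingSchema.
    @mock.patch("requests.get", side_effect=url_get_mock)
    def test_unknown_url_raises(self, _mock_get):
        with self.assertRaises(MissingSchema):
            requests.get("https://unknown.example")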
Code example #2
def find_end_point(source_website):
    """Uses regular expressions to find a site's webmention endpoint
    
    Args:
        source_website: a string which represents the website we want to parse.
    Returns:
        A webmention link parsed from the source website as a string.
    Raises:
        MissingSchema: re-raised with a description when the request fails
            because the source URL is malformed.
    """

    try:
        #  get the source website
        r = requests.get(source_website)
    except MissingSchema:
        raise MissingSchema(
            "Source website was malformed; could not complete request: {0}".
            format(source_website))

    # find tags with rel="webmention", indicating a link to a webmention endpoint;
    # search r.text (str) rather than r.content (bytes) so the str pattern matches
    search_result = re.search(
        r'rel *= *"*webmention"*.*href *=" *(.*)"', r.text)
    if search_result:
        # return the captured regular expression group corresponding to the
        # href url; re.Match.group(1) always returns a string
        return search_result.group(1)
    else:
        # if we couldn't find any rel="webmention" tags, there's no endpoint; return None
        return None
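A quick way to exercise the MissingSchema path, assuming the function above is importable (the scheme-less URL below is illustrative):

from requests.exceptions import MissingSchema

try:
    find_end_point("example.com/post")  # no http:// or https:// prefix
except MissingSchema as exc:
    print(exc)
    # Source website was malformed; could not complete request: example.com/post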
Code example #3
def get_recipe_data(url):
    def _find_recipe(c):
        # Depth-first search for the first JSON-LD node typed as a Recipe.
        if isinstance(c, dict):
            if c.get("@type") == "Recipe":
                return c
            for i in c.values():
                res = _find_recipe(i)
                if res:
                    return res
        if isinstance(c, list):
            for i in c:
                res = _find_recipe(i)
                if res:
                    return res
        return []

    html = requests.get(url, headers=HEADERS, cookies=COOKIES)
    data_list = extract(html.text, uniform=True)

    recipe_data = _find_recipe(data_list)
    if not recipe_data:
        raise MissingSchema(
            "Website does not provide a schema.org Recipe schema in a json-ld format"
        )
    return recipe_data
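The nested `_find_recipe` helper is a plain depth-first search, so the idea can be tested without any HTTP call; a standalone sketch on an illustrative JSON-LD payload (the `extract` call above presumably comes from a JSON-LD extraction library such as extruct, which is an assumption):

def find_recipe(node):
    # Depth-first search for the first dict whose @type is "Recipe".
    if isinstance(node, dict):
        if node.get("@type") == "Recipe":
            return node
        node = list(node.values())
    if isinstance(node, list):
        for child in node:
            found = find_recipe(child)
            if found:
                return found
    return None


data_list = [{"@graph": [
    {"@type": "WebPage", "name": "Dinner ideas"},
    {"@type": "Recipe", "name": "Lentil soup"},
]}]
assert find_recipe(data_list)["name"] == "Lentil soup"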
Code example #4
    def test_read_profile_page_invalid_url(self, mock_get):
        mock_response = mock.Mock()
        mock_response.status_code = 500
        mock_response.raise_for_status.side_effect = MissingSchema(
            "Invalid URL")
        mock_get.return_value = mock_response
        response = GithubProfileScrapper.read_profile_page(
            user_profile_page=self.github_wrong_profile_mock)
        self.assertEqual(len(response), 0)
Code example #5
    def test_from_url_with_invalid_url(self, get):
        response = Response()
        response.status_code = 200
        response._content = ""

        get.side_effect = MissingSchema()

        file_url = "lihkdjfh"

        with self.assertRaises(LoadingCustomersFailedException):
            LoadCustomers.from_url(file_url)
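The two mock-based tests above (code examples #4 and #5) lean on the same unittest.mock behaviour: assigning an exception instance to side_effect makes the mocked call raise it. A self-contained illustration:

from unittest import mock
from requests.exceptions import MissingSchema

fake_get = mock.Mock(side_effect=MissingSchema("Invalid URL"))
try:
    fake_get("lihkdjfh")
except MissingSchema as exc:
    print("raised:", exc)  # raised: Invalid URL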
Code example #6
def build_gtfs_representation(dataset_infos):
    try:
        dataset = gtfs_kit.read_feed(dataset_infos.zip_path, dist_units="km")
    except TypeError as te:
        raise TypeError(
            f"Exception '{te}' occurred while reading the GTFS dataset with the GTFS kit library."
            f"The dataset must be a valid GTFS zip file or URL.\n")
    except MissingSchema as ms:
        raise MissingSchema(
            f"Exception '{ms}' occurred while opening the GTFS dataset with the GTFS kit library."
            f"The dataset must be a valid GTFS zip file or URL.\n")
    metadata = GtfsMetadata(dataset_infos)
    representation = GtfsRepresentation(dataset_infos.entity_code, dataset,
                                        metadata)
    return representation
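If preserving the original traceback matters, the same re-raise can use explicit exception chaining; a minimal sketch under the same structure (the zip path below is illustrative):

import gtfs_kit
from requests.exceptions import MissingSchema

try:
    dataset = gtfs_kit.read_feed("feed.zip", dist_units="km")  # illustrative path
except MissingSchema as ms:
    # "from ms" chains the original exception instead of discarding it
    raise MissingSchema(
        f"Exception '{ms}' occurred while opening the GTFS dataset with "
        f"the GTFS kit library. The dataset must be a valid GTFS zip file or URL."
    ) from ms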
Code example #7
def test_send_request_fakey_url(ensembl_rest_client_37):
    """Successful requests are tested by other tests in this file.
    This test will trigger errors instead.
    """
    # GIVEN a completely invalid URL
    url = "fakeyurl"
    # GIVEN a client
    client = ensembl_rest_client_37
    responses.add(
        responses.GET,
        url,
        body=MissingSchema(),
        status=404,
    )
    data = client.send_request(url)
    assert isinstance(data, MissingSchema)
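The responses library treats an exception instance passed as body specially: the stubbed request raises it instead of returning data, which is why send_request ends up handling a MissingSchema here. A self-contained demonstration (the URL is illustrative):

import requests
import responses
from requests.exceptions import MissingSchema


@responses.activate
def demo():
    responses.add(
        responses.GET,
        "http://fake.example/endpoint",
        body=MissingSchema("no scheme"),
    )
    try:
        requests.get("http://fake.example/endpoint")
    except MissingSchema as exc:
        print("raised:", exc)


demo()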
Code example #8
    def _download_image(url=None):
        try:
            # issue a single HEAD request and reuse its headers for both checks
            headers = requests.head(url).headers
            image_size = int(headers['content-length'])
            if not CheckImage.allowed_file(headers['content-type']):
                raise ValueError('NOT_SUPPORTED')
            if image_size > MAX_SIZE:
                raise Exception('FILE_SIZE')
            response = requests.get(url)
            return load_image_file(BytesIO(response.content))
        except InvalidURL:
            raise InvalidURL(ERR['INV_URL'])
        except MissingSchema:
            raise MissingSchema(ERR['INV_SCHEMA'])
        except Timeout:
            raise Timeout(ERR['TIMEOUT'])
        except AttributeError:
            raise
Code example #9
    def __init__(self, url="http://python.org", parser="html.parser", refresh=True, save_path=None, stop_words=None, split_string=None):
        """__init__
        Parameters
        -----------
        bool : refresh
            Specifies if page should be read from source. Defaults to True
        str : save_path
            Specifies folder to save the text file of scraped page
        list : stop_words
            A list of words to not include in the output of words()
        list : split_string
            A list of strings with which to split the words on the page
        """
        self.url = url
        self.split_string = self.word_splitters(split_string)
        self.stop_words = self.stop_words(stop_words)
        self.refresh = refresh
        self.parser = parser
        if save_path:
            self.save_path = save_path
        else:
            self.save_path = self.get_save_path()

        try:
            req_text = requests.get(self.url, timeout=(10.0, 10.0)).text
        except MissingSchema:
            raise MissingSchema("url should be in the form <http://domain.extension>")
        except InvalidSchema:
            raise InvalidSchema("Your url, {}, has an invalid schema".format(self.url))

        if self.refresh:
            self.soup = BeautifulSoup(req_text, self.parser)
            # honour the save_path chosen above rather than recomputing it
            self.save_page(self.soup, self.save_path)
        else:
            try:
                with open(self.save_path, 'r+') as rh:
                    self.soup = BeautifulSoup(rh.read(), self.parser)
            except FileNotFoundError:
                raise FileNotFoundError("File may have been moved. Try again with 'refresh=True'")
        self.raw_links = self.soup.find_all('a', href=True)
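One detail worth noting in this constructor is timeout=(10.0, 10.0): requests interprets a two-tuple as (connect timeout, read timeout). A compact illustration of the tuple form alongside the two exceptions this constructor translates:

import requests
from requests.exceptions import MissingSchema, Timeout

try:
    # (connect timeout, read timeout) in seconds
    requests.get("http://python.org", timeout=(10.0, 10.0))
except Timeout:
    print("connect or read phase exceeded 10 seconds")
except MissingSchema:
    print("url lacked an http:// or https:// prefix")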
Code example #10
    def request(self,
                method,
                url,
                params=None,
                data=None,
                headers=None,
                cookies=None,
                files=None,
                auth=None,
                timeout=None,
                allow_redirects=True,
                proxies=None,
                hooks=None,
                stream=None,
                verify=None,
                cert=None,
                **kwargs):
        """Constructs a :class:`Request <Request>`, prepares it and sends it.
        Returns :class:`Response <Response>` object.

        :param method: method for the new :class:`Request` object.
        :param url: URL for the new :class:`Request` object.
        :param params: (optional) Dictionary or bytes to be sent in the query
            string for the :class:`Request`.
        :param data: (optional) Dictionary or bytes to send in the body of the
            :class:`Request`.
        :param headers: (optional) Dictionary of HTTP Headers to send with the
            :class:`Request`.
        :param cookies: (optional) Dict or CookieJar object to send with the
            :class:`Request`.
        :param files: (optional) Dictionary of 'filename': file-like-objects
            for multipart encoding upload.
        :param auth: (optional) Auth tuple or callable to enable
            Basic/Digest/Custom HTTP Auth.
        :param timeout: (optional) Float describing the timeout of the
            request.
        :param allow_redirects: (optional) Boolean. Set to True by default.
        :param proxies: (optional) Dictionary mapping protocol to the URL of
            the proxy.
        :param stream: (optional) whether to immediately download the response
            content. Defaults to ``False``.
        :param verify: (optional) if ``True``, the SSL cert will be verified.
            A CA_BUNDLE path can also be provided.
        :param cert: (optional) if String, path to ssl client cert file (.pem).
            If Tuple, ('cert', 'key') pair.
        """
        #===============================================================================================================
        # add by mz
        error_type = kwargs.get("error_type")
        if error_type:
            from requests.exceptions import InvalidURL, URLRequired, ConnectTimeout, ConnectionError, SSLError, ReadTimeout
            from requests.exceptions import InvalidSchema, MissingSchema, ChunkedEncodingError, ContentDecodingError
            from requests.exceptions import RequestException, HTTPError, ProxyError, Timeout, RetryError, StreamConsumedError
            from requests.exceptions import TooManyRedirects

            get_error = {
                "InvalidURL": InvalidURL(),
                "URLRequired": URLRequired(),
                "ConnectTimeout": ConnectTimeout(),
                "ConnectionError": ConnectionError(),
                "SSLError": SSLError(),
                "ReadTimeout": ReadTimeout(),
                "InvalidSchema": InvalidSchema(),
                "MissingSchema": MissingSchema(),
                "ChunkedEncodingError": ChunkedEncodingError(),
                "ContentDecodingError": ContentDecodingError(),
                "StreamConsumedError": StreamConsumedError(),
                "TooManyRedirects": TooManyRedirects(),
                "RequestException": RequestException(),
                "HTTPError": HTTPError(),
                "ProxyError": ProxyError(),
                "Timeout": Timeout(),
                "RetryError": RetryError
            }

            error_ = get_error[error_type]
            raise error_
        #===============================================================================================================

        method = builtin_str(method)

        # Create the Request.
        req = Request(
            method=method.upper(),
            url=url,
            headers=headers,
            files=files,
            data=data or {},
            params=params or {},
            auth=auth,
            cookies=cookies,
            hooks=hooks,
        )
        prep = self.prepare_request(req)

        proxies = proxies or {}

        # Gather clues from the surrounding environment.
        if self.trust_env:
            # Set environment's proxies.
            env_proxies = get_environ_proxies(url) or {}
            for (k, v) in env_proxies.items():
                proxies.setdefault(k, v)

            # Look for configuration.
            if not verify and verify is not False:
                verify = os.environ.get('REQUESTS_CA_BUNDLE')

            # Curl compatibility.
            if not verify and verify is not False:
                verify = os.environ.get('CURL_CA_BUNDLE')

        # Merge all the kwargs.
        proxies = merge_setting(proxies, self.proxies)
        stream = merge_setting(stream, self.stream)
        verify = merge_setting(verify, self.verify)
        cert = merge_setting(cert, self.cert)

        # Send the request.
        send_kwargs = {
            'stream': stream,
            'timeout': timeout,
            'verify': verify,
            'cert': cert,
            'proxies': proxies,
            'allow_redirects': allow_redirects,
        }
        resp = self.send(prep, **send_kwargs)

        return resp
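With this instrumented request method, a test can force any mapped exception through the error_type keyword, which is an addition by this project and not part of the real requests API; a usage sketch assuming the patched Session class above is importable:

from requests.exceptions import MissingSchema

session = Session()  # the patched Session defined in this module
try:
    session.request("GET", "http://example.com", error_type="MissingSchema")
except MissingSchema:
    print("forced MissingSchema for testing")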
Code example #11
def search_url(url):
    '''
    Method to search and extract the text information from a Census Bureau url
    @param url - the url to access and from which to access information
    @return - a tuple (corpus, error_urls, results, html_corpus): extracted
              paragraph text, urls that failed, (url, status) results for
              each linked page, and the raw html documents
    '''

    # try catch block to check for errors in accessing web
    # driver = webdriver.Chrome("/home/dsam99/Downloads/chromedriver")

    error_urls = []
    corpus = []
    html_corpus = []

    # tuple list showing the result for each linked url and the corresponding
    # links
    results = []

    try:
        # using request module to access html
        if url.startswith("https://", 0) or url.startswith("http://", 0):
            result = requests.get(url)

            print("Accessed webpage")

            # checking if valid request
            if result.status_code == 200:

                print("Extracted html information")

                soup = BeautifulSoup(result.content, 'html.parser')
                links = soup.findAll('a')
                # return links

                pages = []

                # getting links for other pages
                for link in links:
                    if link.text == "View Clip":
                        pages.append(link.attrs['href'])

                print("Extracted all " + str(len(pages)) + " links")

                # searching up other pages
                for page in pages:
                    doc = []
                    try:
                        # checking if all schema is present
                        if page.startswith("https://", 0) or page.startswith(
                                "http://", 0):

                            inner_page = requests.get(page)

                            # checking if valid request
                            if inner_page.status_code == 200:

                                print("Valid page")
                                page_text = BeautifulSoup(
                                    inner_page.content, 'html.parser')

                                # results and html code for whole page
                                results.append((page, 200))
                                html_corpus.append(page_text)

                                # iterating through each page to find p tags
                                # for text
                                for p in page_text.findAll('p'):
                                    doc.append(p)
                            else:
                                print("Invalid linked URL")
                                error_urls.append(page)
                                results.append((page, inner_page.status_code))
                                html_corpus.append(None)
                        else:
                            print(
                                "Invalid linked URL, does not start with correct Schema"
                            )
                            error_urls.append(page)
                            results.append((page, None))
                            html_corpus.append(None)

                        print("Finished page")
                        corpus.append(doc)

                    except socket.error:
                        print("Error with connecting sockets")
                        error_urls.append(page)
                        results.append((page, None))
                    # except BadStatusLine(line):
                    #     print("Error in status response")
                    #     error_urls.append(page)
                    #     results.append((page, None))
                    except MissingSchema:
                        print("Missing Schema Error")
                        results.append((page, None))
                        error_urls.append(page)
                    except WebDriverException:
                        print("Error in web driver")
                    except TimeoutException:
                        print("Page did not load in given time")

            else:
                print("Invalid URI")
                error_urls.append(url)
                results.append(None)

        else:
            print("Invalid URI")
            error_urls.append(url)
            results.append(None)

    # except block to catch TimeoutException
    except TimeoutException:
        print("Page did not load in given time")
    except socket.error:
        print("Error with connecting sockets")
    # except BadStatusLine(line):
    #     print("Error in status response")
    except WebDriverException:
        print("Error in web driver")

    # finally block to close web driver
    finally:
        print("reached finally block")
        return corpus, error_urls, results, html_corpus
Code example #12
File: sessions.py  Project: xuning992/tfty
    def request(self,
                method,
                url,
                params=None,
                data=None,
                headers=None,
                cookies=None,
                files=None,
                auth=None,
                timeout=None,
                allow_redirects=True,
                proxies=None,
                hooks=None,
                stream=None,
                verify=None,
                cert=None,
                json=None,
                **kwargs):
        """Constructs a :class:`Request <Request>`, prepares it and sends it.
        Returns :class:`Response <Response>` object.

        :param method: method for the new :class:`Request` object.
        :param url: URL for the new :class:`Request` object.
        :param params: (optional) Dictionary or bytes to be sent in the query
            string for the :class:`Request`.
        :param data: (optional) Dictionary, bytes, or file-like object to send
            in the body of the :class:`Request`.
        :param json: (optional) json to send in the body of the
            :class:`Request`.
        :param headers: (optional) Dictionary of HTTP Headers to send with the
            :class:`Request`.
        :param cookies: (optional) Dict or CookieJar object to send with the
            :class:`Request`.
        :param files: (optional) Dictionary of ``'filename': file-like-objects``
            for multipart encoding upload.
        :param auth: (optional) Auth tuple or callable to enable
            Basic/Digest/Custom HTTP Auth.
        :param timeout: (optional) How long to wait for the server to send
            data before giving up, as a float, or a :ref:`(connect timeout,
            read timeout) <timeouts>` tuple.
        :type timeout: float or tuple
        :param allow_redirects: (optional) Set to True by default.
        :type allow_redirects: bool
        :param proxies: (optional) Dictionary mapping protocol or protocol and
            hostname to the URL of the proxy.
        :param stream: (optional) whether to immediately download the response
            content. Defaults to ``False``.
        :param verify: (optional) whether the SSL cert will be verified.
            A CA_BUNDLE path can also be provided. Defaults to ``True``.
        :param cert: (optional) if String, path to ssl client cert file (.pem).
            If Tuple, ('cert', 'key') pair.
        :rtype: requests.Response
    """
        #===============================================================================================================
        # add by mz
        error_type = kwargs.get("error_type")
        if error_type:
            from requests.exceptions import InvalidURL, URLRequired, ConnectTimeout, ConnectionError, SSLError, ReadTimeout
            from requests.exceptions import InvalidSchema, MissingSchema, ChunkedEncodingError, ContentDecodingError
            from requests.exceptions import RequestException, HTTPError, ProxyError, Timeout, RetryError, StreamConsumedError
            from requests.exceptions import TooManyRedirects

            get_error = {
                "InvalidURL": InvalidURL(),
                "URLRequired": URLRequired(),
                "ConnectTimeout": ConnectTimeout(),
                "ConnectionError": ConnectionError(),
                "SSLError": SSLError(),
                "ReadTimeout": ReadTimeout(),
                "InvalidSchema": InvalidSchema(),
                "MissingSchema": MissingSchema(),
                "ChunkedEncodingError": ChunkedEncodingError(),
                "ContentDecodingError": ContentDecodingError(),
                "StreamConsumedError": StreamConsumedError(),
                "TooManyRedirects": TooManyRedirects(),
                "RequestException": RequestException(),
                "HTTPError": HTTPError(),
                "ProxyError": ProxyError(),
                "Timeout": Timeout(),
                "RetryError": RetryError
            }

            error_ = get_error[error_type]
            raise error_
        #===============================================================================================================

        # Create the Request
        req = Request(
            method=method.upper(),
            url=url,
            headers=headers,
            files=files,
            data=data or {},
            json=json,
            params=params or {},
            auth=auth,
            cookies=cookies,
            hooks=hooks,
        )
        prep = self.prepare_request(req)

        proxies = proxies or {}

        settings = self.merge_environment_settings(prep.url, proxies, stream,
                                                   verify, cert)

        # Send the request.
        send_kwargs = {
            'timeout': timeout,
            'allow_redirects': allow_redirects,
        }
        send_kwargs.update(settings)
        resp = self.send(prep, **send_kwargs)
        return resp
Code example #13
def prepare_url(self, url, params):
    """Prepares the given HTTP URL."""
    #: Accept objects that have string representations.
    #: We're unable to blindly call unicode/str functions
    #: as this will include the bytestring indicator (b'')
    #: on python 3.x.
    #: https://github.com/requests/requests/pull/2238
    if isinstance(url, bytes):
        url = url.decode('utf8')
    else:
        url = str(url)
    # Remove leading whitespaces from url
    url = url.lstrip()
    need_quote = True
    if url.startswith(key_unquote):
        need_quote = False
        url = url.replace(key_unquote, "")
    # Don't do any URL preparation for non-HTTP schemes like `mailto`,
    # `data` etc to work around exceptions from `url_parse`, which
    # handles RFC 3986 only.
    if ':' in url and not url.lower().startswith('http'):
        self.url = url
        return

    # Support for unicode domain names and paths.
    try:
        scheme, auth, host, port, path, query, fragment = parse_url(url)
    except LocationParseError as e:
        raise InvalidURL(*e.args)

    if not scheme:
        error = ("Invalid URL {0!r}: No schema supplied. Perhaps you meant http://{0}?")
        error = error.format(to_native_string(url, 'utf8'))

        raise MissingSchema(error)

    if not host:
        raise InvalidURL("Invalid URL %r: No host supplied" % url)

    # In general, we want to try IDNA encoding the hostname if the string contains
    # non-ASCII characters. This allows users to automatically get the correct IDNA
    # behaviour. For strings containing only ASCII characters, we need to also verify
    # it doesn't start with a wildcard (*), before allowing the unencoded hostname.
    if not unicode_is_ascii(host):
        try:
            host = self._get_idna_encoded_host(host)
        except UnicodeError:
            raise InvalidURL('URL has an invalid label.')
    elif host.startswith(u'*'):
        raise InvalidURL('URL has an invalid label.')

    # Carefully reconstruct the network location
    netloc = auth or ''
    if netloc:
        netloc += '@'
    netloc += host
    if port:
        netloc += ':' + str(port)

    # Bare domains aren't valid URLs.
    if not path:
        path = '/'
    if isinstance(params, (str, bytes)):
        params = to_native_string(params)

    enc_params = self._encode_params(params)
    if enc_params:
        if query:
            query = '%s&%s' % (query, enc_params)
        else:
            query = enc_params
    if need_quote:
        url = requote_uri(urlunparse([scheme, netloc, path, None, query, fragment]))
    else:
        url = urlunparse([scheme, netloc, path, None, query, fragment])
    self.url = url
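This prepare_url logic (a project-modified copy of the method in requests, with the key_unquote extension) is where requests itself raises MissingSchema for scheme-less URLs, so the behaviour is reproducible against the stock library in a few lines (the exact message wording varies across requests versions):

import requests

try:
    requests.get("example.com")  # no http:// or https:// scheme
except requests.exceptions.MissingSchema as exc:
    print(exc)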
Code example #14
    def test_monitor_sends_exception_data_and_hb_on_expected_exceptions(
            self, mock_get_data) -> None:
        errors_exceptions_dict = {
            ReqConnectionError('test'): SystemIsDownException(
                self.test_monitor.system_config.system_name),
            ReadTimeout('test'): SystemIsDownException(
                self.test_monitor.system_config.system_name),
            IncompleteRead('test'): DataReadingException(
                self.test_monitor.monitor_name,
                self.test_monitor.system_config.system_name),
            ChunkedEncodingError('test'): DataReadingException(
                self.test_monitor.monitor_name,
                self.test_monitor.system_config.system_name),
            ProtocolError('test'): DataReadingException(
                self.test_monitor.monitor_name,
                self.test_monitor.system_config.system_name),
            InvalidURL('test'): InvalidUrlException(
                self.test_monitor.system_config.node_exporter_url),
            InvalidSchema('test'): InvalidUrlException(
                self.test_monitor.system_config.node_exporter_url),
            MissingSchema('test'): InvalidUrlException(
                self.test_monitor.system_config.node_exporter_url),
            MetricNotFoundException('test_metric', 'test_endpoint'):
                MetricNotFoundException('test_metric', 'test_endpoint')
        }
        try:
            self.test_monitor._initialise_rabbitmq()
            for error, data_ret_exception in errors_exceptions_dict.items():
                mock_get_data.side_effect = error
                expected_output_data = {
                    'error': {
                        'meta_data': {
                            'monitor_name': self.test_monitor.monitor_name,
                            'system_name':
                                self.test_monitor.system_config.system_name,
                            'system_id':
                                self.test_monitor.system_config.system_id,
                            'system_parent_id':
                                self.test_monitor.system_config.parent_id,
                            'time': datetime(2012, 1, 1).timestamp()
                        },
                        'message': data_ret_exception.message,
                        'code': data_ret_exception.code,
                    }
                }
                expected_output_hb = {
                    'component_name': self.test_monitor.monitor_name,
                    'is_alive': True,
                    'timestamp': datetime(2012, 1, 1).timestamp()
                }
                # Delete the queue before to avoid messages in the queue on
                # error.
                self.test_monitor.rabbitmq.queue_delete(self.test_queue_name)

                res = self.test_monitor.rabbitmq.queue_declare(
                    queue=self.test_queue_name, durable=True, exclusive=False,
                    auto_delete=False, passive=False
                )
                self.assertEqual(0, res.method.message_count)
                self.test_monitor.rabbitmq.queue_bind(
                    queue=self.test_queue_name, exchange=RAW_DATA_EXCHANGE,
                    routing_key='system')
                self.test_monitor.rabbitmq.queue_bind(
                    queue=self.test_queue_name, exchange=HEALTH_CHECK_EXCHANGE,
                    routing_key='heartbeat.worker')

                self.test_monitor._monitor()

                # By re-declaring the queue again we can get the number of
                # messages in the queue.
                res = self.test_monitor.rabbitmq.queue_declare(
                    queue=self.test_queue_name, durable=True, exclusive=False,
                    auto_delete=False, passive=True
                )
                # There must be 2 messages in the queue, the heartbeat and the
                # processed data
                self.assertEqual(2, res.method.message_count)

                # Check that the message received is actually the processed data
                _, _, body = self.test_monitor.rabbitmq.basic_get(
                    self.test_queue_name)
                self.assertEqual(expected_output_data, json.loads(body))

                # Check that the message received is actually the HB
                _, _, body = self.test_monitor.rabbitmq.basic_get(
                    self.test_queue_name)
                self.assertEqual(expected_output_hb, json.loads(body))
        except Exception as e:
            self.fail("Test failed: {}".format(e))