Example #1
def _defaultFetcher(url):
    """Retrieve data from ``url``. css_parser default implementation of fetch
    URL function.

    Returns ``(encoding, string)`` or ``None``
    """
    try:
        request = urllib_Request(url)
        request.add_header('User-agent',
                           'css_parser %s (http://www.cthedot.de/css_parser/)' % VERSION)
        res = urllib_urlopen(request)
    except urllib_HTTPError as e:
        # http error, e.g. 404, e can be raised
        log.warn('HTTPError opening url=%s: %s %s' %
                 (url, e.code, e.msg), error=e)
    except urllib_URLError as e:
        # URLError like mailto: or other IO errors, e can be raised
        log.warn('URLError, %s' % e.reason, error=e)
    except OSError as e:
        # e.g. if a file URL is not found
        log.warn(e, error=OSError)
    except ValueError as e:
        # invalid url, e.g. "1"
        log.warn('ValueError, %s' % e.args[0], error=ValueError)
    else:
        if res:
            mimeType, encoding = encutils.getHTTPInfo(res)
            if mimeType != 'text/css':
                log.error('Expected "text/css" mime type for url=%r but found: %r' %
                          (url, mimeType), error=ValueError)
            content = res.read()
            if hasattr(res, 'close'):
                res.close()
            return encoding, content
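The (encoding, content) return value above is the contract a custom fetcher has to honour. Below is a minimal sketch of plugging one in, assuming css_parser keeps cssutils' CSSParser(fetcher=...) keyword and parseUrl method; the stylesheet URL is only illustrative.

import css_parser
from urllib.request import urlopen


def utf8_fetcher(url):
    # Force UTF-8 instead of trusting the server-reported encoding;
    # returning (encoding, content) matches the default fetcher's contract.
    return 'utf-8', urlopen(url).read()


parser = css_parser.CSSParser(fetcher=utf8_fetcher)
sheet = parser.parseUrl('https://example.com/style.css')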
Example #2
    def _doRequest(self, url):
        """Do an HTTP request

        Return (url, rawcontent)
            url might have been changed by the server due to redirects etc.
        """
        self._log.debug('    CSSCapture._doRequest\n        * URL: %s' % url)

        req = urllib_Request(url)
        if self._ua:
            req.add_header('User-agent', self._ua)
            self._log.info('        * Using User-Agent: %s', self._ua)

        try:
            res = urllib_urlopen(req)
        except urllib_HTTPError as e:
            self._log.critical('    %s\n%s %s\n%s' % (
                e.geturl(), e.code, e.msg, e.headers))
            return None, None

        # get real url
        if url != res.geturl():
            url = res.geturl()
            self._log.info('        URL retrieved: %s', url)

        return url, res
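The redirect handling here relies entirely on urlopen, which follows redirects transparently and exposes the final address via geturl(). A standalone sketch of that behaviour; the target URL is only illustrative.

from urllib.request import Request, urlopen

req = Request('http://example.com/', headers={'User-agent': 'CSSCapture sketch'})
res = urlopen(req)  # urlopen follows redirects transparently
if res.geturl() != 'http://example.com/':
    # this is the "real url" _doRequest reports and returns
    print('redirected to', res.geturl())
print(len(res.read()), 'bytes')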
Example #3
def download_url(url, header=None):
    # Try the download up to five times before giving up.
    last_error = None
    for retries in range(0, 5):
        try:
            r = urllib_Request(url)
            r.add_header('User-Agent', UA)
            if header:
                for h_key, h_value in header.items():
                    r.add_header(h_key, h_value)
            http_handler = HTTPHandler(debuglevel=0)
            https_handler = HTTPSHandler(debuglevel=0)
            opener = build_opener(http_handler, https_handler)
            install_opener(opener)
            u = urlopen(r)
            contents = u.read()
            u.close()
            return contents
        except Exception as e:
            # Remember the failure and fall through to the next attempt.
            last_error = e
    raise RuntimeError('Could not open URL: {}: {}'.format(url, last_error))
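An illustrative call, assuming UA is defined at module level as the snippet implies; any extra headers are layered on top of that default User-Agent.

contents = download_url('https://example.com/data.json',
                        header={'Accept': 'application/json'})
print(len(contents), 'bytes downloaded')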
Example #4
    def _fetch_and_parse(self, job_id, url, depth):

        """
        Fetch a webpage and parse it for links and images.

        Arguments:
            job_id: integer job id.
            url: string URL.
            depth: integer current depth.

        Returns: None.
        """

        html_parser = MyHtmlParser(url)
        request_headers = {'User-Agent': self.user_agent}
        request = urllib_Request(url, headers=request_headers)

        try:
            webpage = urlopen(request).read().decode()
        except Exception as error:
            data.redis.set(url, 'failed')
            return

        try:
            html_parser.feed(webpage)
        except (HTMLParseError) as error:
            data.redis.set(url, 'failed')
            return

        data.add_webpages(url, html_parser.hyperlinks, depth)
        data.redis.set(url, 'complete')
        data.complete_crawl(url)

        if 0 < depth and self._active and not data.job_is_aborted(job_id):
            if html_parser.hyperlinks:
                data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
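MyHtmlParser is project-specific and not shown. Below is a hypothetical minimal stand-in built on the standard library, just enough to provide the hyperlinks attribute used above; the real class presumably tracks images and more.

from html.parser import HTMLParser
from urllib.parse import urljoin


class MyHtmlParser(HTMLParser):
    """Hypothetical stand-in: collects absolute link targets while parsing."""

    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        # Resolve every <a href="..."> against the page URL it came from.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.hyperlinks.append(urljoin(self.base_url, value))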
Example #5
def _defaultFetcher(url):
    """Retrieve data from ``url``. css_parser default implementation of fetch
    URL function.

    Returns ``(encoding, string)`` or ``None``
    """
    try:
        request = urllib_Request(url)
        request.add_header(
            'User-agent',
            'css_parser %s (http://www.cthedot.de/css_parser/)' % VERSION)
        res = urllib_urlopen(request)
    except urllib_HTTPError as e:
        # http error, e.g. 404, e can be raised
        log.warn('HTTPError opening url=%s: %s %s' % (url, e.code, e.msg),
                 error=e)
    except urllib_URLError as e:
        # URLError like mailto: or other IO errors, e can be raised
        log.warn('URLError, %s' % e.reason, error=e)
    except OSError as e:
        # e.g. if a file URL is not found
        log.warn(e, error=OSError)
    except ValueError as e:
        # invalid url, e.g. "1"
        log.warn('ValueError, %s' % e.args[0], error=ValueError)
    else:
        if res:
            mimeType, encoding = encutils.getHTTPInfo(res)
            if mimeType != 'text/css':
                log.error(
                    'Expected "text/css" mime type for url=%r but found: %r' %
                    (url, mimeType),
                    error=ValueError)
            content = res.read()
            if hasattr(res, 'close'):
                res.close()
            return encoding, content
Example #6
    def _fetch_and_parse(self, job_id, url, depth):
        """
        Fetch a webpage and parse it for links and images.

        Arguments:
            job_id: integer job id.
            url: string URL.
            depth: integer current depth.

        Returns: None.
        """

        html_parser = MyHtmlParser(url)
        request_headers = {'User-Agent': self.user_agent}
        request = urllib_Request(url, headers=request_headers)

        try:
            webpage = urlopen(request).read().decode()
        except Exception as error:
            data.redis.set(url, 'failed')
            return

        try:
            html_parser.feed(webpage)
        except (HTMLParseError) as error:
            data.redis.set(url, 'failed')
            return

        data.add_webpages(url, html_parser.hyperlinks, depth)
        data.redis.set(url, 'complete')
        data.complete_crawl(url)

        if 0 < depth and self._active and not data.job_is_aborted(job_id):
            if html_parser.hyperlinks:
                data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
Example #7
tracert_line = re_sub(' +', ' ', tracert_line)
counter = 1
while len(tracert_line) > 5:
    # The Russian literals match localized Windows ``tracert`` output:
    # 'Ошибка передачи' means "Transmission error" and the second string
    # means "Request timed out."; they must stay in Russian for the match.
    if 'Ошибка передачи' in tracert_line:
        print('Ошибка передачи')
        sys_exit(ERR_TRANSITION_ERROR)
    if "Превышен интервал ожидания для запроса." in tracert_line:
        print(str(counter) + "\t" + " ".join(tracert_line.split(' ')[5:]))
        break

    node_ip = tracert_line.split(' ')[-2]
    if node_ip.startswith('['):
        node_ip = node_ip[1:-1]

    url_iptoasn = 'https://api.iptoasn.com/v1/as/ip/' + node_ip
    request = urllib_Request(url_iptoasn,
                             headers={'User-Agent': 'Mozilla/5.0'})
    json_answer = {}
    try:
        json_answer = json_loads(urlopen(request).read().decode('utf-8'))
        time_sleep(0.5)
    except URLError as error:
        print("Internet connection problem occurred:")
        print(str(error))
        sys_exit(ERR_INTERNET_CONNECTION_ERROR)

    if counter == 1:
        print('№\tIP\t\tASN\tCOUNTRY\tPROVIDER')

    if 'announced' not in json_answer:
        print(f'{counter}\tIncorrect data in json_answer')
    elif not json_answer['announced']:
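The loop above mixes the lookup with traceroute parsing; isolated, the iptoasn.com call looks like the sketch below. Only the 'announced' field is taken from the snippet itself; the other field names (as_number, as_country_code, as_description) are assumptions about the API response.

import json
from urllib.request import Request, urlopen


def lookup_asn(node_ip):
    request = Request('https://api.iptoasn.com/v1/as/ip/' + node_ip,
                      headers={'User-Agent': 'Mozilla/5.0'})
    answer = json.loads(urlopen(request).read().decode('utf-8'))
    if not answer.get('announced'):
        return None  # the IP is not announced by any AS
    # Field names below are assumptions about the iptoasn.com response.
    return answer['as_number'], answer['as_country_code'], answer['as_description']


print(lookup_asn('8.8.8.8'))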
Example #8
def get_data(url, forceFetch=False, decrypt=False, useCache=True):
    if not url:
        return url

    start = datetime.datetime.now()
    tag = ''
    data = ''
    forceFetch = forceFetch or not useCache

    cache = common_cache.get(url)
    if cache:
        try:
            tag = cache.get('tag')
            data = cache.get('data')
        except Exception:
            # cache entry stored as a plain value rather than a dict
            data = cache

        if data and not forceFetch:
            log(
                'getData Cache (' + str(
                    int((datetime.datetime.now() - start).total_seconds() *
                        1000)) + 'ms) ' + str(url), 'Debug')
            return json.loads(data)

    new_headers = {}
    if tag != '':
        new_headers.update({'If-None-Match': tag})
    new_headers.update({'User-Agent': 'okhttp/3.10.0'})
    new_headers.update({'Accept-Encoding': 'gzip'})

    try:
        request = urllib_urlopen(urllib_Request(url, headers=new_headers))
    except urllib_HTTPError as e:
        if e.code == 304:
            log(
                'getData 304 (' + str(
                    int((datetime.datetime.now() - start).total_seconds() *
                        1000)) + 'ms) ' + str(url), 'Debug')
            return json.loads(data)
        failure = str(e)
        if hasattr(e, 'code') or hasattr(e, 'reason'):
            log('get_data ERROR: ' + url + ' / ' + failure)

        log(
            'getData RequestErr (' +
            str(int(
                (datetime.datetime.now() - start).total_seconds() * 1000)) +
            'ms) ' + str(url), 'Debug')
        return json.loads(data)

    if request.info().get('Content-Encoding') == 'gzip':
        buffer = StringIO(request.read())
        deflatedContent = gzip.GzipFile(fileobj=buffer)
        data = deflatedContent.read()
    else:
        data = request.read()

    # if the server sent an ETag, remember it and cache the entry much longer
    exp = datetime.timedelta(minutes=_cacheMinutes)
    if request.info().get('ETag'):
        tag = request.info().get('ETag')
        exp = datetime.timedelta(days=200)

    if decrypt:
        data = decryptBase64StringToStringss(data, _xxtea_key)

    common_cache.set(url, {'data': data, 'tag': tag}, expiration=exp)

    log(
        'getData (' +
        str(int((datetime.datetime.now() - start).total_seconds() * 1000)) +
        'ms) ' + str(url), 'Debug')
    return json.loads(data)
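get_data layers caching, gzip handling, and optional decryption over one core idea: revalidate with the stored ETag and treat HTTP 304 as "the cached body is still good". Below is a minimal sketch of just that conditional-request pattern; the User-Agent mirrors the example, everything else is illustrative.

from urllib.request import Request, urlopen
from urllib.error import HTTPError


def fetch_if_changed(url, etag=None):
    headers = {'User-Agent': 'okhttp/3.10.0'}
    if etag:
        headers['If-None-Match'] = etag
    try:
        response = urlopen(Request(url, headers=headers))
    except HTTPError as e:
        if e.code == 304:
            return None, etag  # unchanged on the server, reuse the cached body
        raise
    return response.read(), response.headers.get('ETag')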