Example #1
0
def load(use_cache_server=True):
    """Build the browser data set, falling back to the cache server on error.

    Each ``(browser, percent)`` pair produced by ``get_browsers()`` is turned
    into a normalised key (``settings.REPLACEMENTS`` applied, then
    lower-cased).  Two dictionaries are filled from it:

    * ``browsers``: normalised key -> result of ``get_browser_versions``.
    * ``randomize``: consecutive stringified integers -> normalised key,
      where every key is repeated ``int(float(percent) * 10)`` times.

    If any exception is raised while collecting and ``use_cache_server`` is
    true, the failure is logged and the data is read as JSON from
    ``settings.CACHE_SERVER`` instead.  With ``use_cache_server`` false the
    exception is re-raised.

    :param use_cache_server: allow the cache-server fallback.
    :returns: dict holding non-empty dicts under ``'browsers'`` and
        ``'randomize'``.
    :raises FakeUserAgentError: if the cache server payload cannot be decoded
        or the resulting data fails validation.
    """
    versions_by_key = {}
    weighted_slots = {}

    try:
        for name, share in get_browsers():
            key = name
            for needle, substitute in settings.REPLACEMENTS.items():
                key = key.replace(needle, substitute)
            key = key.lower()

            versions_by_key[key] = get_browser_versions(name)

            # Slot names continue the running count, so after this browser
            # the mapping holds "0" .. str(total - 1) without gaps.
            first_slot = len(weighted_slots)
            for offset in range(int(float(share) * 10)):
                weighted_slots[str(first_slot + offset)] = key
    except Exception as exc:
        if not use_cache_server:
            raise exc

        logger.warning(
            'Error occurred during loading data. '
            'Trying to use cache server %s',
            settings.CACHE_SERVER,
            exc_info=exc,
        )
        try:
            payload = get(settings.CACHE_SERVER)
            data = json.loads(payload.decode('utf-8'))
        except (TypeError, ValueError):
            raise FakeUserAgentError('Can not load data from cache server')
    else:
        data = {
            'browsers': versions_by_key,
            'randomize': weighted_slots,
        }

    # The cache server answer is external input: check its shape before use.
    if not isinstance(data, dict):
        raise FakeUserAgentError('Data is not dictionary ', data)

    for section in ('browsers', 'randomize'):
        if section not in data:
            raise FakeUserAgentError('Missing data param: ', section)

        value = data[section]

        if not isinstance(value, dict):
            raise FakeUserAgentError('Data param is not dictionary', value)  # noqa

        if not value:
            raise FakeUserAgentError('Data param is empty', value)

    return data
Example #2
0
    def __getattr__(self, attr):
        """Resolve an unknown attribute to a user agent string.

        Names listed in ``self.safe_attrs`` are delegated to the parent
        class.  Any other name is normalised (``settings.REPLACEMENTS``
        applied, then lower-cased); ``'random'`` picks a browser from
        ``self.data_randomize``, everything else is mapped through
        ``settings.SHORTCUTS``.  The result is a random entry of
        ``self.data_browsers[browser]``.

        :raises FakeUserAgentError: when the lookup fails with ``KeyError``
            or ``IndexError`` and ``self.fallback`` is ``None``; otherwise
            ``self.fallback`` is returned after logging a warning.
        """
        if attr in self.safe_attrs:
            # NOTE(review): this relies on a base class providing
            # ``__getattr__`` -- the class header is not visible here, confirm.
            return super(UserAgent, self).__getattr__(attr)

        try:
            name = attr
            for needle, substitute in settings.REPLACEMENTS.items():
                name = name.replace(needle, substitute)
            name = name.lower()

            browser = (
                random.choice(self.data_randomize)
                if name == 'random'
                else settings.SHORTCUTS.get(name, name)
            )

            return random.choice(self.data_browsers[browser])
        except (KeyError, IndexError):
            if self.fallback is not None:
                logger.warning(
                    'Error occurred during getting browser, '
                    'but was suppressed with fallback.',
                )
                return self.fallback

            raise FakeUserAgentError(
                'Error occurred during getting browser')  # noqa
Example #3
0
def get_browser_versions(browser, verify_ssl=True):
    """
    Return the version strings listed on the page for ``browser``.

    very very hardcoded/dirty re/split stuff, but no dependencies

    The page is fetched with ``get``, decoded as ISO-8859-1, cut down to the
    contents of the ``<div id='liste'>`` element and scanned with a regular
    expression for link texts.  Entries whose text contains "more" are
    skipped and at most ``settings.BROWSERS_COUNT_LIMIT`` entries are kept.

    :param browser: browser name; URL-quoted into
        ``settings.BROWSER_BASE_PAGE``.
    :param verify_ssl: forwarded to ``get``.
    :returns: list of version strings, in page order.
    :raises IndexError: if the ``<div id='liste'>`` marker is not in the page.
    :raises FakeUserAgentError: if no usable entry is found.
    """
    html = get(
        settings.BROWSER_BASE_PAGE.format(browser=quote_plus(browser)),
        verify_ssl=verify_ssl,
    )
    html = html.decode('iso-8859-1')
    html = html.split('<div id=\'liste\'>')[1]
    html = html.split('</div>')[0]

    pattern = r'\?id=\d+\'>(.+?)</a'

    versions = []

    # The loop variable is deliberately NOT named ``browser``: reusing that
    # name would overwrite the parameter, and the error message below would
    # then show a regex match object instead of the requested browser name.
    for match in re.finditer(pattern, html, re.UNICODE):
        version = match.group(1)

        if 'more' in version.lower():
            continue

        versions.append(version)

        if len(versions) == settings.BROWSERS_COUNT_LIMIT:
            break

    if not versions:
        raise FakeUserAgentError(
            'No browsers version found for {browser}'.format(browser=browser))

    return versions
Example #4
0
def get(url):
    """Fetch ``url`` and return the raw response body, retrying on failure.

    Each attempt uses ``settings.HTTP_TIMEOUT`` as the socket timeout.  When
    an attempt fails with ``URLError`` or ``OSError`` the error is logged at
    debug level and, unless the retry budget is used up, the function sleeps
    for ``settings.HTTP_TIMEOUT`` seconds before trying again.

    :param url: address to fetch.
    :returns: the response body as returned by ``read()``.
    :raises FakeUserAgentError: once ``settings.HTTP_RETRIES`` attempts have
        failed.
    """
    request = Request(url)

    attempt = 0

    while True:
        attempt += 1

        try:
            response = urlopen(request, timeout=settings.HTTP_TIMEOUT)
            try:
                return response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except (URLError, OSError) as exc:
            logger.debug(
                'Error occurred during fetching %s',
                url,
                exc_info=exc,
            )

            # ``>=`` rather than ``==``: a retry setting of zero or less must
            # end the loop after the first failure instead of never matching.
            if attempt >= settings.HTTP_RETRIES:
                raise FakeUserAgentError('Maximum amount of retries reached')

            logger.debug(
                'Sleeping for %s seconds',
                settings.HTTP_TIMEOUT,
            )
            sleep(settings.HTTP_TIMEOUT)
Example #5
0
    def get_ua(self, proxy=None):
        """Return a user agent selected by walking ``self.ua_type``.

        Starting from ``self.uas``, every element of ``self.ua_type`` is used
        as an index into the previous result.  If the final value is a list a
        random element is chosen, otherwise the value itself is returned.

        When ``proxy`` is truthy the chosen value is remembered in
        ``self.proxy2ua`` and returned again on later calls with that proxy.

        If a lookup step raises ``KeyError``/``IndexError`` or yields
        ``None``, ``self.fallback`` is returned (with a warning) when it is
        set; otherwise ``FakeUserAgentError`` is raised.
        """
        if proxy and proxy in self.proxy2ua:
            return self.proxy2ua[proxy]

        selection = self.uas
        for step in self.ua_type:
            try:
                selection = selection[step]
            except (KeyError, IndexError):
                selection = None

            if selection is not None:
                continue

            if self.fallback is not None:
                logger.warning(
                    'Error occurred during getting browser for type "%s", '
                    'but was suppressed with fallback.',
                    '.'.join(self.ua_type))
                return self.fallback

            raise FakeUserAgentError(
                'Error occurred during getting browser')

        if isinstance(selection, list):
            ua = random.choice(selection)
        else:
            ua = selection

        if proxy:
            self.proxy2ua[proxy] = ua

        return ua
Example #6
0
def get(url, verify_ssl=True):
    """Fetch ``url`` and return the raw response body, retrying on failure.

    Every attempt first uses ``urlopen`` with ``settings.HTTP_TIMEOUT``.  If
    that fails with ``URLError``/``OSError`` and attempts remain, the same
    URL is tried once through ``requests``; if that fails as well, the
    function sleeps for ``settings.HTTP_DELAY`` seconds and starts the next
    attempt.

    :param url: address to fetch.
    :param verify_ssl: when false, certificate verification is disabled for
        ``urlopen`` (if it supports an SSL context) and for the ``requests``
        fallback.
    :returns: the response body as bytes.
    :raises FakeUserAgentError: once ``settings.HTTP_RETRIES`` attempts have
        failed.
    """
    request = Request(url)

    urlopen_kwargs = {'timeout': settings.HTTP_TIMEOUT}
    if urlopen_has_ssl_context:
        urlopen_kwargs['context'] = (
            None if verify_ssl else ssl._create_unverified_context()
        )
    # else: ssl context is not supported ;(

    attempt = 0

    while True:
        attempt += 1

        try:
            with contextlib.closing(
                urlopen(request, **urlopen_kwargs)
            ) as response:
                return response.read()
        except (URLError, OSError) as exc:
            logger.debug(
                'Error occurred during fetching %s',
                url,
                exc_info=exc,
            )

            # ``>=`` rather than ``==``: a retry setting of zero or less must
            # end the loop after the first failure instead of never matching.
            if attempt >= settings.HTTP_RETRIES:
                raise FakeUserAgentError('Maximum amount of retries reached')

            # Second chance through ``requests``.  It gets the same timeout
            # and certificate policy as the primary path, and an HTTP error
            # status counts as a failure so an error page is never returned
            # as if it were data.
            try:
                fallback = requests.get(
                    url,
                    timeout=settings.HTTP_TIMEOUT,
                    verify=verify_ssl,
                )
                fallback.raise_for_status()
                return fallback.content
            except Exception as fallback_exc:
                # Best effort only: log and continue with the retry loop.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                logger.debug(
                    'Fallback fetching of %s via requests failed',
                    url,
                    exc_info=fallback_exc,
                )

            logger.debug(
                'Sleeping for %s seconds',
                settings.HTTP_DELAY,
            )
            sleep(settings.HTTP_DELAY)