Beispiel #1
0
def download_file(url, filename, session=None):
    """
    Download `url` and write the response body to `filename`.

    SSL certificate verification is disabled for the request and the body is
    streamed to disk in chunks of 1024 bytes.

    :param url: address to download
    :param filename: path of the file the response body is written to
    :param session: optional requests session to reuse, one is created if omitted
    :return: True if the file was saved, False on a not ok response or a HTTP error
    """
    # create session
    if None is session:
        # always hand the cache wrapper a real session to wrap
        session = requests.session()
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session,
                           cache=caches.FileCache(
                               os.path.join(cache_dir, 'sessions')))

    # request session headers
    session.headers.update({
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip,deflate'
    })

    # request session ssl verify
    session.verify = False

    # request session streaming
    session.stream = True

    # request session proxies
    if sickbeard.PROXY_SETTING:
        logger.log("Using proxy for url: " + url, logger.DEBUG)
        session.proxies = {
            "http": sickbeard.PROXY_SETTING,
            "https": sickbeard.PROXY_SETTING,
        }

    try:
        resp = session.get(url)
        if not resp.ok:
            # not every status code has a description, so look it up defensively
            if resp.status_code in clients.http_error_code:
                http_err_text = clients.http_error_code[resp.status_code]
            else:
                http_err_text = 'Custom HTTP error code'
            logger.log(
                u"Requested url " + url + " returned status code is " +
                str(resp.status_code) + ': ' + http_err_text, logger.DEBUG)
            # the streamed body is not going to be read, so release the connection
            resp.close()
            return False

        with open(filename, 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    fp.write(chunk)
                    fp.flush()

        chmodAsParent(filename)
    except requests.exceptions.HTTPError as e:
        _remove_file_failed(filename)
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url,
                   logger.WARNING)
        return False

    return True
Beispiel #2
0
    def _load_url(self, url, params=None, language=None):
        """
        Request `url` as JSON and normalise the keys of the returned data.

        Resets self.not_found (and self.show_not_found for series info urls)
        before the request, and sets them again on a HTTP 404 response.

        :param url: address to request, surrounding whitespace is stripped
        :param params: optional query parameters handed to getURL
        :param language: optional language, only sent as Accept-Language when it
            is listed in self.config['valid_languages']
        :return: the parsed response with its 'data' passed through
            map_show_keys, or {'data': None} when there is no response
        :raises tvdb_tokenexpired: on HTTP 401, after a new token was requested
        :raises tvdb_error: on any other failure that is not a HTTP 404
        """
        log().debug('Retrieving URL %s' % url)

        session = requests.session()

        if self.config['cache_enabled']:
            session = CacheControl(session,
                                   cache=caches.FileCache(
                                       self.config['cache_location']))

        if self.config['proxy']:
            log().debug('Using proxy for URL: %s' % url)
            session.proxies = {
                'http': self.config['proxy'],
                'https': self.config['proxy']
            }

        headers = {
            'Accept-Encoding': 'gzip,deflate',
            'Authorization': 'Bearer %s' % self.get_token(),
            'Accept': 'application/vnd.thetvdb.v%s' % __api_version__
        }

        if None is not language and language in self.config['valid_languages']:
            headers.update({'Accept-Language': language})

        resp = None
        if self._match_url_pattern('url_seriesInfo', url):
            self.show_not_found = False
        self.not_found = False
        try:
            resp = getURL(url.strip(),
                          params=params,
                          session=session,
                          headers=headers,
                          json=True,
                          raise_status_code=True,
                          raise_exceptions=True)
        except requests.exceptions.HTTPError as e:
            if 401 == e.response.status_code:
                # token expired, get new token, raise error to retry
                sickbeard.THETVDB_V2_API_TOKEN = self.get_new_token()
                raise tvdb_tokenexpired
            elif 404 == e.response.status_code:
                if self._match_url_pattern('url_seriesInfo', url):
                    self.show_not_found = True
                elif self._match_url_pattern('url_epInfo', url):
                    # a missing episode list is returned as an empty list
                    resp = {'data': []}
                self.not_found = True
            else:
                raise tvdb_error
        except (StandardError, Exception):
            raise tvdb_error

        # lower-cased source key -> key name used in the returned data
        map_show = {
            'airstime': 'airs_time',
            'airsdayofweek': 'airs_dayofweek',
            'imdbid': 'imdb_id',
            'writers': 'writer'
        }

        # list values that are flattened to a '|a|b|' string, mapped to the key
        # under which the unflattened list is kept as well
        list_keys = {
            'genre': 'genre_list',
            'gueststars': 'gueststars_list',
            'writers': 'writers'
        }

        def map_show_keys(data):
            """Lower-case and rename the keys of `data` and tidy its values, in place."""
            keep_data = {}
            del_keys = []
            new_data = {}
            for k, v in data.iteritems():
                k_org = k
                k = k.lower()
                if None is not v:
                    if k in ['banner', 'fanart', 'poster'] and v:
                        v = self.config['url_artworkPrefix'] % v
                    elif k in list_keys:
                        keep_data[list_keys[k]] = v
                        v = '|%s|' % '|'.join([
                            clean_data(c)
                            for c in v if isinstance(c, basestring)
                        ])
                    elif 'firstaired' == k:
                        if v:
                            try:
                                v = parse(v, fuzzy=True).strftime('%Y-%m-%d')
                            except (StandardError, Exception):
                                v = None
                        else:
                            v = None
                    elif 'imdbid' == k:
                        if v:
                            if re.search(r'^(tt)?\d{1,7}$', v, flags=re.I):
                                v = clean_data(v)
                            else:
                                v = ''
                    else:
                        v = clean_data(v)
                if k in map_show:
                    k = map_show[k]
                # compare by value, an identity test on strings is unreliable
                if k_org != k:
                    # renamed keys are swapped after the loop so that the size
                    # of the dict does not change while it is iterated
                    del_keys.append(k_org)
                    new_data[k] = v
                else:
                    data[k] = v
            for d in del_keys:
                del data[d]
            data.update(new_data)
            data.update(keep_data)
            return data

        if resp:
            if isinstance(resp['data'], dict):
                resp['data'] = map_show_keys(resp['data'])
            elif isinstance(resp['data'], list):
                for idx, row in enumerate(resp['data']):
                    if isinstance(row, dict):
                        resp['data'][idx] = map_show_keys(row)
            return resp
        return dict([(u'data', None)])
Beispiel #3
0
def getURL(url,
           post_data=None,
           params=None,
           headers=None,
           timeout=30,
           session=None,
           json=False):
    """
    Returns a byte-string retrieved from the url provider.

    :param url: address to request, repeated slashes in its path are collapsed
    :param post_data: data to POST, a GET request is made if this is empty
    :param params: query parameters set on the session
    :param headers: optional headers added to the default request headers
    :param timeout: request timeout in seconds
    :param session: optional requests session to reuse, one is created if omitted
    :param json: return the response parsed as JSON instead of its raw content
    :return: response content, or parsed JSON if `json` is set,
        or None on a not ok response or a HTTP error
    """

    # request session
    if None is session:
        # always hand the cache wrapper a real session to wrap
        session = requests.session()
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session,
                           cache=caches.FileCache(
                               os.path.join(cache_dir, 'sessions')))

    # request session headers
    req_headers = {'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'}
    if headers:
        req_headers.update(headers)
    session.headers.update(req_headers)

    # request session ssl verify
    session.verify = False

    # request session parameters
    session.params = params

    try:
        # Remove double-slashes from url
        parsed = list(urlparse.urlparse(url))
        parsed[2] = re.sub("/{2,}", "/",
                           parsed[2])  # replace two or more / with one
        url = urlparse.urlunparse(parsed)

        # request session proxies
        if sickbeard.PROXY_SETTING:
            logger.log("Using proxy for url: " + url, logger.DEBUG)
            session.proxies = {
                "http": sickbeard.PROXY_SETTING,
                "https": sickbeard.PROXY_SETTING,
            }

        # decide if we get or post data to server
        if post_data:
            resp = session.post(url, data=post_data, timeout=timeout)
        else:
            resp = session.get(url, timeout=timeout)

        if not resp.ok:
            # not every status code has a description, so look it up defensively
            if resp.status_code in clients.http_error_code:
                http_err_text = clients.http_error_code[resp.status_code]
            else:
                http_err_text = 'Custom HTTP error code'
            logger.log(
                u"Requested url " + url + " returned status code is " +
                str(resp.status_code) + ': ' + http_err_text, logger.DEBUG)
            return

    except requests.exceptions.HTTPError as e:
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url,
                   logger.WARNING)
        return

    # hand the fetched data to the caller
    if json:
        return resp.json()

    return resp.content
Beispiel #4
0
def get_url(
        url,  # type: AnyStr
        post_data=None,  # type: Optional
        params=None,  # type: Optional
        headers=None,  # type: Optional[Dict]
        timeout=30,  # type: int
        session=None,  # type: Optional[requests.Session]
        parse_json=False,  # type: bool
        raise_status_code=False,  # type: bool
        raise_exceptions=False,  # type: bool
        as_binary=False,  # type: bool
        encoding=None,  # type: Optional[AnyStr]
        **kwargs):
    # type: (...) -> Optional[Union[AnyStr, bool, bytes, Dict, Tuple[Union[Dict, List], requests.Session]]]
    """
    Either
    1) Returns a byte-string retrieved from the url provider.
    2) Return True if success after using kwargs 'savename' set to file pathname.
    3) Returns Tuple response, session if success after setting kwargs 'resp_sess' True.
    4) JSON Dict if parse_json=True.

    None is returned on failure unless an exception is raised.

    :param url: url
    :param post_data: post data, True makes a POST request without a data body
    :param params: parameters set on the session
    :param headers: headers to add
    :param timeout: timeout
    :param session: optional session object
    :param parse_json: return JSON Dict
    :param raise_status_code: raise exception for status codes
    :param raise_exceptions: raise exceptions
    :param as_binary: return bytes instead of text
    :param encoding: overwrite encoding return header if as_binary is False
    :param kwargs: these keys are consumed here, anything else is passed on to the request;
        'savename' (file pathname to save the response to),
        'resp_sess' (also return the session),
        'nocache' (do not wrap the session with the file cache),
        'provider' (object whose `_has_signature` is tested against the response text),
        'post_json' (sent as the `json` argument of a POST request),
        'json' (legacy name of `parse_json`),
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout', 'mute_http_error'
        (suppress the related log message)
    :return: see the list above
    """

    # a conditional is used so that any truthy flag value selects the content
    response_attr = 'content' if as_binary else 'text'

    # selectively mute some errors
    mute = filter_list(lambda x: kwargs.pop(x, False), [
        'mute_connect_err', 'mute_read_timeout', 'mute_connect_timeout',
        'mute_http_error'
    ])

    # reuse or instantiate request session
    resp_sess = kwargs.pop('resp_sess', None)
    if None is session:
        session = CloudflareScraper.create_scraper()
        session.headers.update({'User-Agent': USER_AGENT})

    # download and save file or simply fetch url
    savename = kwargs.pop('savename', None)
    if savename:
        # session streaming
        session.stream = True

    if not kwargs.pop('nocache', False):
        cache_dir = CACHE_DIR or get_system_temp_dir()
        session = CacheControl(sess=session,
                               cache=caches.FileCache(
                                   ek.ek(os.path.join, cache_dir, 'sessions')))

    provider = kwargs.pop('provider', None)

    # handle legacy uses of `json` param
    # it is always removed, so that a falsy flag is not sent on as a request body
    legacy_json = kwargs.pop('json', None)
    if legacy_json:
        parse_json = legacy_json

    # session master headers
    req_headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate'
    }
    if headers:
        req_headers.update(headers)
    if hasattr(session, 'reserved') and 'headers' in session.reserved:
        req_headers.update(session.reserved['headers'] or {})
    session.headers.update(req_headers)

    # session parameters
    session.params = params

    # session ssl verify
    session.verify = False

    # don't trust os environments (auth, proxies, ...)
    session.trust_env = False

    response = None
    try:
        # sanitise url
        parsed = list(urlparse(url))
        parsed[2] = re.sub('/{2,}', '/',
                           parsed[2])  # replace two or more / with one
        url = urlunparse(parsed)

        # session proxies
        if PROXY_SETTING:
            (proxy_address, pac_found) = proxy_setting(PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % ('PAC parsed ' if pac_found else '', url)
            if None is proxy_address:
                logger.debug('Proxy error, aborted the request using %s' % msg)
                return
            elif proxy_address:
                logger.debug('Using %s' % msg)
                session.proxies = {
                    'http': proxy_address,
                    'https': proxy_address
                }

        # decide if we get or post data to server
        if post_data or 'post_json' in kwargs:
            if True is post_data:
                post_data = None

            if post_data:
                kwargs.setdefault('data', post_data)

            if 'post_json' in kwargs:
                kwargs.setdefault('json', kwargs.pop('post_json'))

            response = session.post(url, timeout=timeout, **kwargs)
        else:
            response = session.get(url, timeout=timeout, **kwargs)
            # follow a `Refresh` header that is sent with an empty body
            # NOTE(review): the header is lower-cased before it is split,
            # so the followed url is lower-cased as well
            if response.ok and not response.content and 'url=' in response.headers.get(
                    'Refresh', '').lower():
                url = response.headers.get('Refresh').lower().split(
                    'url=')[1].strip('/')
                if not url.startswith('http'):
                    parsed[2] = '/%s' % url
                    url = urlunparse(parsed)
                response = session.get(url, timeout=timeout, **kwargs)

        # if encoding is not in header try to use best guess
        # ignore downloads with savename
        if not savename and not as_binary:
            if encoding:
                response.encoding = encoding
            elif not response.encoding or 'charset' not in response.headers.get(
                    'Content-Type', ''):
                response.encoding = response.apparent_encoding

        # noinspection PyProtectedMember
        if provider and provider._has_signature(response.text):
            return getattr(response, response_attr)

        if raise_status_code:
            response.raise_for_status()

        if not response.ok:
            http_err_text = ''
            if 'CloudFlare Ray ID' in response.text:
                http_err_text = 'CloudFlare reports, "Website is offline"; '
            if response.status_code in http_error_code:
                http_err_text += http_error_code[response.status_code]
            elif response.status_code in range(520, 527):
                http_err_text += 'Origin server connection failure'
            else:
                http_err_text += 'Custom HTTP error code'
            # report every response that is not ok, whatever its status code
            if 'mute_http_error' not in mute:
                logger.debug(
                    u'Response not ok. %s: %s from requested url %s' %
                    (response.status_code, http_err_text, url))
            return

    except requests.exceptions.HTTPError as e:
        if raise_status_code:
            # re-raise the caught error, `response` may be unset or ok at this point
            raise
        logger.warning(u'HTTP error %s while loading URL%s' %
                       (e.errno, _maybe_request_url(e)))
        return
    except requests.exceptions.ConnectionError as e:
        if 'mute_connect_err' not in mute:
            logger.warning(u'Connection error msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except requests.exceptions.ReadTimeout as e:
        if 'mute_read_timeout' not in mute:
            logger.warning(u'Read timed out msg:%s while loading URL%s' %
                           (ex(e), _maybe_request_url(e)))
        if raise_exceptions:
            raise e
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        if 'mute_connect_timeout' not in mute:
            logger.warning(
                u'Connection timed out msg:%s while loading URL %s' %
                (ex(e), _maybe_request_url(e, url)))
        if raise_exceptions:
            raise e
        return
    except (BaseException, Exception) as e:
        if ex(e):
            logger.warning(
                u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s'
                % (url, ex(e), traceback.format_exc()))
        else:
            logger.warning(
                u'Unknown exception while loading URL %s\r\nDetail... %s' %
                (url, traceback.format_exc()))
        if raise_exceptions:
            raise e
        return

    if parse_json:
        try:
            data_json = response.json()
            # anything other than a JSON object or array is returned as an empty dict
            if not isinstance(data_json, (dict, list)):
                data_json = {}
            if resp_sess:
                return data_json, session
            return data_json
        except (TypeError, Exception) as e:
            logger.warning(u'JSON data issue from URL %s\r\nDetail... %s' %
                           (url, ex(e)))
            if raise_exceptions:
                raise e
            return None

    if savename:
        try:
            write_file(savename,
                       response,
                       raw=True,
                       raise_exceptions=raise_exceptions)
        except (BaseException, Exception) as e:
            if raise_exceptions:
                raise e
            return
        return True

    if resp_sess:
        return getattr(response, response_attr), session

    return getattr(response, response_attr)
Beispiel #5
0
    def _load_url(self, url, params=None, language=None):
        """
        Request `url` as JSON and normalise the keys of the returned data.

        Resets self.not_found (and self.show_not_found for series info urls)
        before the request, and sets them again on a HTTP 404 response or when
        the returned series name starts with three asterisks.

        :param url: address to request, surrounding whitespace is stripped
        :param params: optional query parameters handed to get_url
        :param language: optional language, only sent as Accept-Language when it
            is listed in self.config['valid_languages']
        :return: the parsed response with its 'data' passed through
            map_show_keys, or {'data': None} when there is no response
        :raises TvdbTokenexpired: on HTTP 401, after a new token was requested
        :raises TvdbError: on any other failure that is not a HTTP 404
        """
        log.debug('Retrieving URL %s' % url)

        session = requests.session()

        if self.config['cache_enabled']:
            session = CacheControl(session,
                                   cache=caches.FileCache(
                                       self.config['cache_location']))

        if self.config['proxy']:
            log.debug('Using proxy for URL: %s' % url)
            session.proxies = {
                'http': self.config['proxy'],
                'https': self.config['proxy']
            }

        headers = {
            'Accept-Encoding': 'gzip,deflate',
            'Authorization': 'Bearer %s' % self.get_token(),
            'Accept': 'application/vnd.thetvdb.v%s' % __api_version__
        }

        if None is not language and language in self.config['valid_languages']:
            headers.update({'Accept-Language': language})

        resp = None
        is_series_info = self._match_url_pattern('url_series_info', url)
        if is_series_info:
            self.show_not_found = False
        self.not_found = False
        try:
            resp = get_url(url.strip(),
                           params=params,
                           session=session,
                           headers=headers,
                           parse_json=True,
                           raise_status_code=True,
                           raise_exceptions=True)
        except requests.exceptions.HTTPError as e:
            if 401 == e.response.status_code:
                # token expired, get new token, raise error to retry
                global THETVDB_V2_API_TOKEN
                THETVDB_V2_API_TOKEN = self.get_new_token()
                raise TvdbTokenexpired
            elif 404 == e.response.status_code:
                if is_series_info:
                    self.show_not_found = True
                elif self._match_url_pattern('url_series_episodes_info', url):
                    # a missing episode list is returned as an empty list
                    resp = {'data': []}
                self.not_found = True
            else:
                raise TvdbError
        except (BaseException, Exception):
            raise TvdbError

        # a series name that starts with three asterisks is treated as not found
        if is_series_info and isinstance(resp, dict) and isinstance(resp.get('data'), dict) and \
                isinstance(resp['data'].get('seriesName'), string_types) and \
                re.search(r'^[*]\s*[*]\s*[*]', resp['data'].get('seriesName', ''), flags=re.I):
            self.show_not_found = True
            self.not_found = True

        # lower-cased source key -> key name used in the returned data
        map_show = {
            'airstime': 'airs_time',
            'airsdayofweek': 'airs_dayofweek',
            'imdbid': 'imdb_id',
            'writers': 'writer',
            'siterating': 'rating'
        }

        # list values that are flattened to a '|a|b|' string, mapped to the key
        # under which the unflattened list is kept as well
        list_keys = {
            'genre': 'genre_list',
            'gueststars': 'gueststars_list',
            'writers': 'writers'
        }

        def map_show_keys(data):
            """
            Lower-case and rename the keys of `data` and tidy its values, in place.

            Returns None instead of `data` for an entry that has no series name.
            """
            keep_data = {}
            del_keys = []
            new_data = {}
            for k, v in iteritems(data):
                k_org = k
                k = k.lower()
                if None is not v:
                    if k in ['banner', 'fanart', 'poster'] and v:
                        v = self.config['url_artworks'] % v
                    elif k in list_keys:
                        keep_data[list_keys[k]] = v
                        v = '|%s|' % '|'.join([
                            clean_data(c)
                            for c in v if isinstance(c, string_types)
                        ])
                    elif 'rating' == k:
                        # also expose this value as `contentrating`, the `rating`
                        # key is replaced by `siterating` when that is present
                        new_data['contentrating'] = v
                    elif 'firstaired' == k:
                        if v:
                            try:
                                v = parse(v, fuzzy=True).strftime('%Y-%m-%d')
                            except (BaseException, Exception):
                                v = None
                        else:
                            v = None
                    elif 'imdbid' == k:
                        if v:
                            if re.search(r'^(tt)?\d{1,7}$', v, flags=re.I):
                                v = clean_data(v)
                            else:
                                v = ''
                    else:
                        v = clean_data(v)
                elif 'seriesname' == k:
                    # fall back to the first alias for a missing series name
                    if isinstance(data.get('aliases'),
                                  list) and 0 < len(data.get('aliases')):
                        v = data['aliases'].pop(0)
                    # this is a invalid show, it has no Name
                    if None is v:
                        return None

                if k in map_show:
                    k = map_show[k]
                # compare by value, an identity test on strings is unreliable
                if k_org != k:
                    # renamed keys are swapped after the loop so that the size
                    # of the dict does not change while it is iterated
                    del_keys.append(k_org)
                    new_data[k] = v
                else:
                    data[k] = v
            for d in del_keys:
                del data[d]
            data.update(new_data)
            data.update(keep_data)
            return data

        if resp:
            if isinstance(resp['data'], dict):
                resp['data'] = map_show_keys(resp['data'])
            elif isinstance(resp['data'], list):
                data_list = []
                for idx, row in enumerate(resp['data']):
                    if isinstance(row, dict):
                        cr = map_show_keys(row)
                        if None is not cr:
                            data_list.append(cr)
                resp['data'] = data_list
            return resp
        return dict([(u'data', None)])
Beispiel #6
0
def download_file(url, filename, session=None):
    """
    Download `url` and write the response body to `filename`.

    SSL certificate verification is disabled for the request and the body is
    streamed to disk in chunks of 1024 bytes. `_remove_file_failed` is called
    for `filename` when fetching or saving raises an exception.

    :param url: address to download
    :param filename: path of the file the response body is written to
    :param session: optional requests session to reuse, one is created if omitted
    :return: True if the file was saved, False on a not ok response or an error
        while fetching or saving, None if proxy_setting gives no proxy address
    """
    # create session
    if None is session:
        session = requests.session()
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(cache_dir, 'sessions')))

    # request session headers
    session.headers.update({'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'})

    # request session ssl verify
    session.verify = False

    # request session streaming
    session.stream = True

    # request session proxies
    if sickbeard.PROXY_SETTING:
        (proxy_address, pac_found) = proxy_setting(sickbeard.PROXY_SETTING, url)
        msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
        if None is proxy_address:
            logger.log('Proxy error, aborted the request using %s' % msg, logger.DEBUG)
            return
        elif proxy_address:
            logger.log('Using %s' % msg, logger.DEBUG)
            session.proxies = {
                'http': proxy_address,
                'https': proxy_address
            }

    try:
        resp = session.get(url)
        if not resp.ok:
            # look the description up defensively, a status code that is not listed
            # must not end up in the generic handler that removes `filename`
            if resp.status_code in clients.http_error_code:
                http_err_text = clients.http_error_code[resp.status_code]
            else:
                http_err_text = 'Custom HTTP error code'
            logger.log(u"Requested url " + url + " returned status code is " + str(
                resp.status_code) + ': ' + http_err_text, logger.DEBUG)
            # the streamed body is not going to be read, so release the connection
            resp.close()
            return False

        with open(filename, 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    fp.write(chunk)
                    fp.flush()

        chmodAsParent(filename)
    except requests.exceptions.HTTPError as e:
        _remove_file_failed(filename)
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return False
    except requests.exceptions.ConnectionError as e:
        _remove_file_failed(filename)
        logger.log(u"Connection error " + str(e.message) + " while loading URL " + url, logger.WARNING)
        return False
    except requests.exceptions.Timeout as e:
        _remove_file_failed(filename)
        logger.log(u"Connection timed out " + str(e.message) + " while loading URL " + url, logger.WARNING)
        return False
    except EnvironmentError as e:
        _remove_file_failed(filename)
        logger.log(u"Unable to save the file: " + ex(e), logger.ERROR)
        return False
    except Exception:
        _remove_file_failed(filename)
        logger.log(u"Unknown exception while loading URL " + url + ": " + traceback.format_exc(), logger.WARNING)
        return False

    return True
Beispiel #7
0
def getURL(url, post_data=None, params=None, headers=None, timeout=30, session=None, json=False, **kwargs):
    """
    Fetch `url` through a file cached requests session.

    A POST request is made when `post_data` is given, otherwise a GET request.
    Extra keyword arguments are passed on to the request call.

    :return: the response content, or the parsed JSON when `json` is set.
        None when the proxy setting gives no address, the response is not ok,
        or the request raised an exception.
    """

    # reuse the given session or start one, then wrap it with the file cache
    if session is None:
        session = requests.session()
    sessions_dir = os.path.join(sickbeard.CACHE_DIR or _getTempDir(), 'sessions')
    session = CacheControl(sess=session, cache=caches.FileCache(sessions_dir))

    # default headers, extended or overridden by those of the caller
    merged_headers = {'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'}
    merged_headers.update(headers or {})
    session.headers.update(merged_headers)

    # no ssl verification, and the query parameters live on the session
    session.verify = False
    session.params = params

    try:
        # collapse runs of slashes in the path component of the url
        url_parts = list(urlparse.urlparse(url))
        url_parts[2] = re.sub("/{2,}", "/", url_parts[2])
        url = urlparse.urlunparse(url_parts)

        # route the request through the configured proxy
        if sickbeard.PROXY_SETTING:
            proxy_address, pac_found = proxy_setting(sickbeard.PROXY_SETTING, url)
            msg = '%sproxy for url: %s' % (('', 'PAC parsed ')[pac_found], url)
            if proxy_address is None:
                logger.log('Proxy error, aborted the request using %s' % msg, logger.DEBUG)
                return
            if proxy_address:
                logger.log('Using %s' % msg, logger.DEBUG)
                session.proxies = dict(http=proxy_address, https=proxy_address)

        # post when there is data to send, otherwise get
        if not post_data:
            resp = session.get(url, timeout=timeout, **kwargs)
        else:
            resp = session.post(url, data=post_data, timeout=timeout, **kwargs)

        if not resp.ok:
            # describe the status code for the log
            status = resp.status_code
            if status in clients.http_error_code:
                http_err_text = clients.http_error_code[status]
            elif status in range(520, 527):
                http_err_text = 'CloudFlare to origin server connection failure'
            else:
                http_err_text = 'Custom HTTP error code'
            log_text = u'Requested url %s returned status code is %s: %s' % (url, status, http_err_text)
            logger.log(log_text, logger.DEBUG)
            return

    except requests.exceptions.HTTPError as e:
        log_text = u'HTTP error %s while loading URL %s' % (e.errno, url)
        logger.log(log_text, logger.WARNING)
        return
    except requests.exceptions.ConnectionError as e:
        detail = str(e.message)
        logger.log(u'Internet connection error msg:%s while loading URL %s' % (detail, url), logger.WARNING)
        return
    except requests.exceptions.ReadTimeout as e:
        detail = str(e.message)
        logger.log(u'Read timed out msg:%s while loading URL %s' % (detail, url), logger.WARNING)
        return
    except (requests.exceptions.Timeout, socket.timeout) as e:
        detail = str(e.message)
        logger.log(u'Connection timed out msg:%s while loading URL %s' % (detail, url), logger.WARNING)
        return
    except Exception as e:
        if not e.message:
            log_text = u'Unknown exception while loading URL %s\r\nDetail... %s' % (url, traceback.format_exc())
        else:
            log_text = u'Exception caught while loading URL %s\r\nDetail... %s\r\n%s' % (
                url, str(e.message), traceback.format_exc())
        logger.log(log_text, logger.WARNING)
        return

    return resp.json() if json else resp.content