Example #1
class IMDbURLopener:
    """Fetch web pages and handle errors."""
    _logger = logger.getChild('urlopener')

    def __init__(self, *args, **kwargs):
        self._last_url = ''
        self.https_handler = IMDbHTTPSHandler(logger=self._logger)
        self.proxies = {}
        self.addheaders = []
        for header in ('User-Agent', 'User-agent', 'user-agent'):
            self.del_header(header)
        self.set_header('User-Agent', 'Mozilla/5.0')
        lang = kwargs.get('languages', 'en-us,en;q=0.5')
        self.set_header('Accept-Language', lang)

    def get_proxy(self):
        """Return the used proxy, or an empty string."""
        return self.proxies.get('http', '')

    def set_proxy(self, proxy):
        """Set the proxy."""
        if not proxy:
            if 'http' in self.proxies:
                del self.proxies['http']
        else:
            if not proxy.lower().startswith('http://'):
                proxy = 'http://%s' % proxy
            self.proxies['http'] = proxy

    def set_header(self, header, value, _overwrite=True):
        """Set a default header."""
        if _overwrite:
            self.del_header(header)
        self.addheaders.append((header, value))

    def get_header(self, header):
        """Return the first value of a header, or None if not present."""
        for name, value in self.addheaders:
            if name == header:
                return value
        return None

    def del_header(self, header):
        """Remove a default header."""
        for index, (name, _) in enumerate(self.addheaders):
            if name == header:
                del self.addheaders[index]
                break

    def retrieve_unicode(self, url, size=-1):
        """Retrieve the given URL and return a unicode string, trying
        to guess the encoding of the data (falling back to utf8 by
        default)."""
        encode = None
        try:
            if size != -1:
                self.set_header('Range', 'bytes=0-%d' % size)
            handlers = []
            if 'http' in self.proxies:
                proxy_handler = ProxyHandler({
                    'http': self.proxies['http'],
                    'https': self.proxies['http']
                })
                handlers.append(proxy_handler)
            handlers.append(self.https_handler)
            uopener = build_opener(*handlers)
            uopener.addheaders = list(self.addheaders)
            response = uopener.open(url)
            content = response.read()
            self._last_url = response.url
            # Maybe the server is nice enough to tell us the charset...
            if PY2:
                server_encode = response.headers.getparam('charset') or None
            else:
                server_encode = response.headers.get_content_charset(None)
            # Otherwise, look at the content-type HTML meta tag.
            if server_encode is None and content:
                begin_h = content.find(b'text/html; charset=')
                if begin_h != -1:
                    end_h = content[begin_h + 19:].find(b'"')
                    if end_h != -1:
                        server_encode = content[begin_h + 19:begin_h + 19 + end_h]
                        # codecs.lookup() below expects a string, not bytes.
                        server_encode = server_encode.decode('ascii', 'replace')
            if server_encode:
                try:
                    if lookup(server_encode):
                        encode = server_encode
                except (LookupError, ValueError, TypeError):
                    pass
            if size != -1:
                self.del_header('Range')
            response.close()
        except IOError as e:
            if size != -1:
                # Ensure that the Range header is removed.
                self.del_header('Range')
            raise IMDbDataAccessError({
                'errcode': e.errno,
                'errmsg': str(e.strerror),
                'url': url,
                'proxy': self.get_proxy(),
                'exception type': 'IOError',
                'original exception': e
            })
        if encode is None:
            encode = 'utf8'
            # The detection of the encoding is error prone...
            self._logger.warning(
                'Unable to detect the encoding of the retrieved page [%s];'
                ' falling back to default utf8.', url)
        if isinstance(content, str):
            return content
        return str(content, encode, 'replace')
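
A minimal usage sketch for the opener above, assuming the surrounding module provides IMDbHTTPSHandler, IMDbDataAccessError, logger and the urllib imports shown in Example #2:

opener = IMDbURLopener()
opener.set_proxy('localhost:8080')   # normalized to 'http://localhost:8080'
opener.set_header('Accept-Language', 'de-de,de;q=0.5')
page = opener.retrieve_unicode('https://www.imdb.com/', size=2048)
print(opener.get_header('User-Agent'))   # 'Mozilla/5.0'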
Example #2
from imdb._exceptions import IMDbDataAccessError, IMDbParserError

from . import (companyParser, movieParser, personParser, searchMovieParser,
               searchMovieAdvancedParser, searchPersonParser,
               searchCompanyParser, searchKeywordParser, topBottomParser,
               listParser)

if PY2:
    from urllib import quote_plus
    from urllib2 import HTTPSHandler, ProxyHandler, build_opener
else:
    from urllib.parse import quote_plus
    from urllib.request import HTTPSHandler, ProxyHandler, build_opener

# Logger for miscellaneous functions.
_aux_logger = logger.getChild('aux')


class _ModuleProxy:
    """A proxy to instantiate and access parsers."""
    def __init__(self, module, defaultKeys=None):
        """Initialize a proxy for the given module; defaultKeys, if set,
        muste be a dictionary of values to set for instanced objects."""
        if defaultKeys is None:
            defaultKeys = {}
        self._defaultKeys = defaultKeys
        self._module = module

    def __getattr__(self, name):
        """Called only when no look-up is found."""
        _sm = self._module
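
The listing cuts off inside __getattr__; a hypothetical completion of this lazy-instantiation pattern (a sketch, not necessarily the project's exact code) fetches the parser class from the wrapped module, instantiates it, applies the default keys, and caches the instance on the proxy so later accesses bypass __getattr__:

    def __getattr__(self, name):
        """Called only when normal attribute lookup fails."""
        _sm = self._module
        obj = getattr(_sm, name)()   # instantiate the parser class
        for key, value in self._defaultKeys.items():
            setattr(obj, key, value)
        setattr(self, name, obj)     # cache it; next access skips __getattr__
        return obj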
Example #3
            elif isinstance(d[i], (list, dict)):
                _putRefs(d[i], re_titles, re_names, lastKey=lastKey)
    elif isinstance(d, dict):
        for k, v in list(d.items()):
            lastKey = k
            if isinstance(v, str):
                if lastKey in _modify_keys:
                    if re_names:
                        d[k] = re_names.sub(r"'\1' (qv)", v)
                    if re_titles:
                        # Substitute on d[k] rather than v, so a names
                        # substitution just applied is not discarded.
                        d[k] = re_titles.sub(r'_\1_ (qv)', d[k])
            elif isinstance(v, (list, dict)):
                _putRefs(d[k], re_titles, re_names, lastKey=lastKey)


_b_p_logger = logger.getChild('build_person')


def build_person(txt,
                 personID=None,
                 billingPos=None,
                 roleID=None,
                 accessSystem='http',
                 modFunct=None,
                 headshot=None):
    """Return a Person instance from the tipical <tr>...</tr> strings
    found in the IMDb's web site."""
    # if personID is None:
    #     _b_p_logger.debug('empty name or personID for "%s"', txt)
    notes = ''
    role = ''
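
A self-contained demonstration of the (qv) substitution that _putRefs performs above; the title 'The Matrix' and the key 'plot' are illustrative stand-ins (in the real code the key would have to appear in _modify_keys):

import re

re_titles = re.compile(r'(The Matrix)', re.U)
data = {'plot': 'It references The Matrix throughout.'}
data['plot'] = re_titles.sub(r'_\1_ (qv)', data['plot'])
print(data['plot'])   # It references _The Matrix_ (qv) throughout.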
Example #4
class DOMParserBase(object):
    """Base parser to handle HTML data from the IMDb's web server."""
    _defGetRefs = False
    _containsObjects = False

    preprocessors = []
    rules = []

    _logger = logger.getChild('domparser')

    def __init__(self):
        """Initialize the parser."""
        self._modFunct = None
        self._as = 'http'
        self._cname = self.__class__.__name__
        self._init()
        self.reset()

    def reset(self):
        """Reset the parser."""
        # Names and titles references.
        self._namesRefs = {}
        self._titlesRefs = {}
        self._reset()

    def _init(self):
        """Subclasses can override this method, if needed."""
        pass

    def _reset(self):
        """Subclasses can override this method, if needed."""
        pass

    def parse(self, html_string, getRefs=None, **kwds):
        """Return the dictionary generated from the given html string;
        getRefs can be used to force the gathering of movies/persons
        references."""
        self.reset()
        if getRefs is not None:
            self.getRefs = getRefs
        else:
            self.getRefs = self._defGetRefs
        if PY2 and isinstance(html_string, str):
            html_string = html_string.decode('utf-8')
        # Temporary fix: self.parse_dom must work even for empty strings.
        html_string = self.preprocess_string(html_string)
        if html_string:
            html_string = html_string.replace('&nbsp;', ' ')
            dom = self.get_dom(html_string)
            try:
                dom = self.preprocess_dom(dom)
            except Exception:
                self._logger.error('%s: caught exception preprocessing DOM',
                                   self._cname,
                                   exc_info=True)
            if self.getRefs:
                try:
                    self.gather_refs(dom)
                except Exception:
                    self._logger.warning('%s: unable to gather refs',
                                         self._cname,
                                         exc_info=True)
            data = self.parse_dom(dom)
        else:
            data = {}
        try:
            data = self.postprocess_data(data)
        except Exception:
            self._logger.error('%s: caught exception postprocessing data',
                               self._cname,
                               exc_info=True)
        if self._containsObjects:
            self.set_objects_params(data)
        data = self.add_refs(data)
        return data

    def get_dom(self, html_string):
        """Return a dom object, from the given string."""
        try:
            if not _USE_LXML:
                html_string = html_to_xhtml(html_string, omit_tags={"script"})
            dom = build_tree(html_string, force_html=True)
            if dom is None:
                dom = build_tree('')
                self._logger.error('%s: using a fake empty DOM', self._cname)
            return dom
        except Exception:
            self._logger.error('%s: caught exception parsing DOM',
                               self._cname,
                               exc_info=True)
            return build_tree('')

    def xpath(self, element, path):
        """Return elements matching the given XPath."""
        try:
            return piculet_xpath(element, path)
        except Exception:
            self._logger.error('%s: caught exception extracting XPath "%s"',
                               self._cname,
                               path,
                               exc_info=True)
            return []

    def tostring(self, element):
        """Convert the element to a string."""
        if isinstance(element, str):
            return str(element)
        else:
            try:
                return ElementTree.tostring(element, encoding='utf8')
            except Exception:
                self._logger.error('%s: unable to convert to string',
                                   self._cname,
                                   exc_info=True)
                return ''

    def clone(self, element):
        """Clone an element."""
        return build_tree(self.tostring(element))

    def preprocess_string(self, html_string):
        """Here we can modify the text, before it's parsed."""
        if not html_string:
            return html_string
        try:
            preprocessors = self.preprocessors
        except AttributeError:
            return html_string
        for src, sub in preprocessors:
            # re._pattern_type is present only since Python 2.5.
            if isinstance(getattr(src, 'sub', None), Callable):
                html_string = src.sub(sub, html_string)
            # On Python 3 the name 'unicode' does not exist; the PY2 guard
            # short-circuits so it is never evaluated there.
            elif isinstance(src, str) or (PY2 and isinstance(src, unicode)):
                html_string = html_string.replace(src, sub)
            elif isinstance(src, Callable):
                try:
                    html_string = src(html_string)
                except Exception:
                    _msg = '%s: caught exception preprocessing html'
                    self._logger.error(_msg, self._cname, exc_info=True)
                    continue
        return html_string

    def gather_refs(self, dom):
        """Collect references."""
        grParser = GatherRefs()
        grParser._as = self._as
        grParser._modFunct = self._modFunct
        refs = grParser.parse_dom(dom)
        refs = grParser.postprocess_data(refs)
        self._namesRefs = refs['names refs']
        self._titlesRefs = refs['titles refs']

    def preprocess_dom(self, dom):
        """Last chance to modify the dom, before the rules are applied."""
        return dom

    def parse_dom(self, dom):
        """Parse the given dom according to the rules specified in self.rules."""
        return Rules(self.rules).extract(dom)

    def postprocess_data(self, data):
        """Here we can modify the data."""
        return data

    def set_objects_params(self, data):
        """Set parameters of Movie/Person/... instances, since they are
        not always set in the parser's code."""
        for obj in flatten(data, yieldDictKeys=True, scalar=_Container):
            obj.accessSystem = self._as
            obj.modFunct = self._modFunct

    def add_refs(self, data):
        """Modify data according to the expected output."""
        if self.getRefs:
            titl_re = r'(%s)' % '|'.join(
                [re.escape(x) for x in list(self._titlesRefs.keys())])
            if titl_re != r'()':
                re_titles = re.compile(titl_re, re.U)
            else:
                re_titles = None
            nam_re = r'(%s)' % '|'.join(
                [re.escape(x) for x in list(self._namesRefs.keys())])
            if nam_re != r'()':
                re_names = re.compile(nam_re, re.U)
            else:
                re_names = None
            _putRefs(data, re_titles, re_names)
        return {
            'data': data,
            'titlesRefs': self._titlesRefs,
            'namesRefs': self._namesRefs
        }
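
A hypothetical sketch of how a concrete parser plugs into this base class; the class name and hook bodies are illustrative, and real parsers also declare piculet extraction rules via the rules class attribute, omitted here:

class DOMHTMLExampleParser(DOMParserBase):
    _defGetRefs = True

    def _init(self):
        # One-time setup, invoked from __init__().
        self._seen = set()

    def _reset(self):
        # Per-parse state, cleared by reset() before each parse() call.
        self._seen.clear()

    def postprocess_data(self, data):
        # Final shaping of the extracted dictionary.
        return {k: v for k, v in data.items() if v}

parser = DOMHTMLExampleParser()
result = parser.parse('<html><body>...</body></html>')
# result is {'data': ..., 'titlesRefs': ..., 'namesRefs': ...}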
Example #5
    from contextlib import contextmanager

    @contextmanager
    def redirect_stdout(new_stdout):
        """Context manager for temporarily redirecting stdout."""
        old_stdout, sys.stdout = sys.stdout, new_stdout
        try:
            yield new_stdout
        finally:
            sys.stdout = old_stdout
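
    # A quick usage sketch; both this backport and the stdlib version
    # imported below behave the same:
    #     import io
    #     with redirect_stdout(io.StringIO()) as out:
    #         print('captured')
    #     assert out.getvalue() == 'captured\n'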
else:
    from contextlib import redirect_stdout

from imdb.parser.http.logging import logger

_logger = logger.getChild('piculet')


###########################################################
# HTML OPERATIONS
###########################################################


# TODO: this is too fragile
_CHARSET_TAGS = [
    b'<meta http-equiv="content-type" content="text/html; charset=',
    b'<meta charset="'
]
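

# A hypothetical sketch of how these tag prefixes are typically used to
# sniff the charset (the body of decode_html below is truncated in this
# listing; _sniff_charset is not part of the original module): find a
# known prefix, then read up to the closing quote.
def _sniff_charset(content):
    for tag in _CHARSET_TAGS:
        start = content.find(tag)
        if start >= 0:
            end = content.find(b'"', start + len(tag))
            if end >= 0:
                return content[start + len(tag):end].decode('ascii', 'replace')
    return None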


def decode_html(content, charset=None, fallback_charset='utf-8'):