Ejemplo n.º 1
0
def mysql_query(query, params=None, dbname=None, verbose=None):
    """Run a MySQL query and yield its result rows one by one.

    A query yielding every page of namespace 0 could look like::

        SELECT
         page_namespace,
         page_title
        FROM page
        WHERE page_namespace = 0;

    From MediaWiki 1.5, all projects use Unicode (UTF-8) character encoding,
    so the cursor charset is utf8.

    The connection and the cursor are closed as soon as the generator is
    exhausted, closed or garbage collected.

    @param query: MySQL query to execute
    @type query: str (unicode in py2)
    @param params: input parameters for the query, if needed.
        For a list or tuple use %s as placeholder in the query string,
        for a dict use %(key)s as placeholder in the query string.
    @type params: tuple, list or dict of str (unicode in py2)
    @param dbname: db name, inserted into config.db_name_format
    @type dbname: str
    @param verbose: if True, print the query before executing it;
        if None, config.verbose_output decides.
    @type verbose: None or bool
    @return: generator which yields tuples
    """
    if verbose is None:
        verbose = config.verbose_output

    # Connection settings come from config2.py or user-config.py
    if config.db_connect_file is not None:
        credentials = {'read_default_file': config.db_connect_file}
    else:
        credentials = {
            'user': config.db_username,
            'passwd': config.db_password
        }

    connection = pymysql.connect(config.db_hostname,
                                 db=config.db_name_format.format(dbname),
                                 port=config.db_port,
                                 charset='utf8',
                                 **credentials)

    with closing(connection) as conn:
        with closing(conn.cursor()) as cursor:

            if verbose:
                # mogrify() renders the statement exactly as it will be sent
                rendered = cursor.mogrify(query, params)
                if not isinstance(rendered, UnicodeType):
                    rendered = UnicodeType(rendered, encoding='utf-8')

                indented = ['    {0}'.format(line)
                            for line in rendered.strip().splitlines()]
                pywikibot.output('Executing query:\n' + '\n'.join(indented))

            cursor.execute(query, params)

            for row in cursor:
                yield row
Ejemplo n.º 2
0
    def xml(self, table):
        """
        Fetch and parse XML for a table.

        The parsed result is memoized in C{self._data['xml']}, so the raw
        data is only fetched and parsed once per table.

        Each C{row} element becomes a dict mapping the C{name} attribute of
        its C{field} children to the text of that field; both are converted
        with UnicodeType.

        @param table: table of data to fetch
        @type table: basestring
        @return: one dict per row of the table
        @rtype: list
        """
        cache = self._data.setdefault('xml', {})
        if table in cache:
            return cache[table]

        # cElementTree was removed in Python 3.9; the plain ElementTree
        # module picks up the C accelerator on its own since Python 3.3.
        try:
            from xml.etree import cElementTree as ElementTree
        except ImportError:
            from xml.etree import ElementTree

        raw = self.raw_cached(table, 'xml')
        tree = ElementTree.parse(BytesIO(raw))

        rows = []
        for row in tree.findall('row'):
            site = {}
            for field in row.findall('field'):
                site[UnicodeType(field.get('name'))] = UnicodeType(field.text)
            rows.append(site)

        self._data['xml'][table] = rows

        return rows
Ejemplo n.º 3
0
 def _convert_bytes(self, result):
     """Return result as a unicode string.

     On Python 2 a byte string is only expected when it is empty; it is
     then replaced by an empty unicode string. Unicode input is handed back
     untouched and anything else is passed through UnicodeType.
     """
     if PY2 and isinstance(result, str):
         assert result == b''
         return ''  # the literal is unicode, unlike the incoming b''
     if isinstance(result, UnicodeType):
         return result
     return UnicodeType(result)
Ejemplo n.º 4
0
 def test_unicode_method(self):
     """Test __unicode__() method.

     On Python 2 the method is reached through UnicodeType(); on Python 3
     it is called directly.
     """
     djvu = DjVuFile(self.file_djvu)
     expected = "DjVuFile('{}')".format(self.file_djvu)
     actual = UnicodeType(djvu) if PY2 else djvu.__unicode__()
     self.assertEqual(actual, expected)
Ejemplo n.º 5
0
 def changeUrl(self, url):
     """Change url.

     Splits url and stores the parts in self.scheme, self.host, self.path,
     self.query and self.fragment. An empty path becomes '/', a non-empty
     query gets its leading '?', and self.protocol is the text before the
     first ':'.

     If path or query contain non-ASCII characters, both are encoded with
     the encoding reported by getEncodingUsedByServer() and then
     percent-quoted.

     @param url: the new URL to check
     @type url: str (unicode in py2)
     """
     self.url = url
     # we ignore the fragment
     (self.scheme, self.host, self.path, self.query,
      self.fragment) = urlparse.urlsplit(self.url)
     if not self.path:
         self.path = '/'
     if self.query:
         self.query = '?' + self.query
     self.protocol = url.split(':', 1)[0]
     # check if there are non-ASCII characters inside path or query, and if
     # so, encode them in an encoding that hopefully is the right one.
     try:
         self.path.encode('ascii')
         self.query.encode('ascii')
     except UnicodeEncodeError:
         encoding = self.getEncodingUsedByServer()
         self.path = UnicodeType(urllib.quote(self.path.encode(encoding)))
         # '?' has to stay literal: quoting it to %3F would glue the query
         # onto the path and the server would see no query string at all.
         self.query = UnicodeType(
             urllib.quote(self.query.encode(encoding), '?=&'))
Ejemplo n.º 6
0
    def pressedOK(self):
        """
        Handle a click on the OK button.

        Copies the whole content of the edit box into self.text and then
        destroys the parent window.
        """
        self.text = self.editbox.get('1.0', Tkinter.END)
        # On Python 2 get() hands back a byte string whenever the content
        # is pure ASCII and unicode otherwise; normalize to unicode so that
        # self.text always has the same type.
        needs_conversion = PY2 and isinstance(self.text, str)
        if needs_conversion:
            self.text = UnicodeType(self.text)
        self.parent.destroy()
 def args(self):
     """Expose args: self.reason converted with UnicodeType."""
     reason = self.reason
     return UnicodeType(reason)
Ejemplo n.º 8
0
class LinkChecker(object):
    """
    Check links.

    Given a HTTP URL, tries to load the page from the Internet and checks if it
    is still online.

    check() returns a (boolean, string) tuple saying if the page is online
    and including a status reason.

    Per-domain user-agent faking is not supported in this deprecated class.

    Warning: Also returns false if your Internet connection isn't working
    correctly! (This will give a Socket Error)

    """

    def __init__(self,
                 url,
                 redirectChain=None,
                 serverEncoding=None,
                 HTTPignore=None):
        """
        Initializer.

        @param url: the URL to check
        @param redirectChain: list of redirects which were resolved by
            resolveRedirect(). This is needed to detect redirect loops.
            The given list is never modified; None means an empty chain.
        @param serverEncoding: encoding used by the server, if already
            known; otherwise it is detected on demand
        @param HTTPignore: HTTP status codes which mark a page as dead even
            though they are outside the 4xx range; None means no codes.
        """
        self.url = url
        self.serverEncoding = serverEncoding

        fake_ua_config = config.fake_user_agent_default.get(
            'weblinkchecker', False)
        if fake_ua_config and isinstance(fake_ua_config, str):
            user_agent = fake_ua_config
        elif fake_ua_config:
            user_agent = comms.http.fake_user_agent()
        else:
            user_agent = comms.http.user_agent()
        self.header = {
            'user-agent': user_agent,
            'Accept': 'text/xml,application/xml,application/xhtml+xml,'
            'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '30',
            'Connection': 'keep-alive',
        }
        # None instead of a shared mutable [] default; always build a new
        # list so the caller's chain is left untouched.
        self.redirectChain = list(redirectChain or []) + [url]
        self.changeUrl(url)
        self.HTTPignore = [] if HTTPignore is None else HTTPignore

    def getConnection(self):
        """
        Get a connection matching the scheme of the current URL.

        @raise NotAnURLError: the scheme is neither http nor https
        """
        if self.scheme == 'http':
            return httplib.HTTPConnection(self.host)
        elif self.scheme == 'https':
            return httplib.HTTPSConnection(self.host)
        else:
            raise NotAnURLError(self.url)

    def getEncodingUsedByServer(self):
        """
        Get encoding used by server.

        If no encoding is known yet, a HEAD request for '/' is sent and the
        charset of its Content-Type header is used. When that fails,
        ISO 8859-1 is assumed. The result is kept in self.serverEncoding.
        """
        if not self.serverEncoding:
            try:
                pywikibot.output(
                    'Contacting server %s to find out its default encoding...'
                    % self.host)
                conn = self.getConnection()
                conn.request('HEAD', '/', None, self.header)
                self.response = conn.getresponse()
                self.readEncodingFromResponse(self.response)
            except Exception:
                # Deliberately best effort: any failure just leads to the
                # fallback below.
                pass
            if not self.serverEncoding:
                # TODO: We might also load a page, then check for an encoding
                # definition in a HTML meta tag.
                pywikibot.output("Error retrieving server's default charset. "
                                 'Using ISO 8859-1.')
                # most browsers use ISO 8859-1 (Latin-1) as the default.
                self.serverEncoding = 'iso8859-1'
        return self.serverEncoding

    def readEncodingFromResponse(self, response):
        """
        Read encoding from response.

        Stores the charset of the Content-Type header in
        self.serverEncoding unless an encoding is already known. A missing
        header or charset is silently ignored.
        """
        if not self.serverEncoding:
            try:
                ct = response.getheader('Content-Type')
                charsetR = re.compile('charset=(.+)')
                charset = charsetR.search(ct).group(1)
                self.serverEncoding = charset
            except Exception:
                # no header or no charset in it; keep the encoding unknown
                pass

    def changeUrl(self, url):
        """
        Change url.

        Splits url and stores the parts in self.scheme, self.host,
        self.path, self.query and self.fragment. An empty path becomes '/',
        a non-empty query gets its leading '?', and self.protocol is the
        text before the first ':'. Non-ASCII path or query parts are
        encoded with the server's encoding and percent-quoted.
        """
        self.url = url
        # we ignore the fragment
        (self.scheme, self.host, self.path, self.query,
         self.fragment) = urlparse.urlsplit(self.url)
        if not self.path:
            self.path = '/'
        if self.query:
            self.query = '?' + self.query
        self.protocol = url.split(':', 1)[0]
        # check if there are non-ASCII characters inside path or query, and if
        # so, encode them in an encoding that hopefully is the right one.
        try:
            self.path.encode('ascii')
            self.query.encode('ascii')
        except UnicodeEncodeError:
            encoding = self.getEncodingUsedByServer()
            self.path = UnicodeType(urllib.quote(self.path.encode(encoding)))
            # '?' has to stay literal: quoting it to %3F would glue the
            # query onto the path and the server would see no query string.
            self.query = UnicodeType(
                urllib.quote(self.query.encode(encoding), '?=&'))

    def resolveRedirect(self, useHEAD=False):
        """
        Follow one HTTP redirect, if the current URL is redirected.

        On a redirect the URL of this checker is changed to the redirect
        target via changeUrl().

        If useHEAD is true, uses the HTTP HEAD method, which saves bandwidth
        by not downloading the body. Otherwise, the HTTP GET method is used.

        @return: True if the URL was changed to a redirect target, False if
            the response is no redirect or carries no Location header
        @rtype: bool
        """
        conn = self.getConnection()
        method = 'HEAD' if useHEAD else 'GET'
        try:
            conn.request(method, '%s%s' % (self.path, self.query), None,
                         self.header)
            self.response = conn.getresponse()
            # read the server's encoding, in case we need it later
            self.readEncodingFromResponse(self.response)
        except httplib.BadStatusLine:
            # Some servers don't seem to handle HEAD requests properly,
            # e.g. http://www.radiorus.ru/ which is running on a very old
            # Apache server. Using GET instead works on these (but it uses
            # more bandwidth).
            if useHEAD:
                return self.resolveRedirect(useHEAD=False)
            else:
                raise

        if not 300 <= self.response.status <= 399:
            return False  # not a redirect

        # to debug, print response.getheaders()
        redirTarget = self.response.getheader('Location')
        if not redirTarget:
            return False  # redirect status without a target

        try:
            redirTarget.encode('ascii')
        except UnicodeError:
            encoding = self.getEncodingUsedByServer()
            if isinstance(redirTarget, bytes):
                # Python 2: header values are raw byte strings
                redirTarget = redirTarget.decode(encoding)
            else:
                # Python 3: http.client decodes headers as ISO 8859-1, so
                # going back through that codec recovers the raw bytes.
                redirTarget = redirTarget.encode(
                    'iso-8859-1').decode(encoding)

        if redirTarget.startswith(('http://', 'https://')):
            self.changeUrl(redirTarget)
        elif redirTarget.startswith('/'):
            self.changeUrl('{0}://{1}{2}'.format(
                self.protocol, self.host, redirTarget))
        else:  # redirect to relative position
            # cut off filename
            directory = self.path[:self.path.rindex('/') + 1]
            # handle redirect to parent directory
            while redirTarget.startswith('../'):
                redirTarget = redirTarget[3:]
                # some servers redirect to .. although we are already
                # in the root directory; ignore this.
                if directory != '/':
                    # change /foo/bar/ to /foo/
                    directory = directory[:-1]
                    directory = directory[:directory.rindex('/') + 1]
            self.changeUrl('{0}://{1}{2}{3}'.format(
                self.protocol, self.host, directory, redirTarget))
        return True

    @staticmethod
    def _socket_error_message(error):
        """Return the most descriptive part of a socket error.

        The value accompanying a socket error is either a string telling
        what went wrong or a pair (errno, string). Exceptions cannot be
        indexed on Python 3, hence error.args is inspected instead.
        """
        args = getattr(error, 'args', ())
        if len(args) > 1:
            return args[1]
        if args:
            return args[0]
        return UnicodeType(error)

    def check(self, useHEAD=False):
        """
        Return True and the server status message if the page is alive.

        Redirects are followed by creating a new LinkChecker per hop. A
        redirect loop or a chain of 19 or more hops first triggers a restart
        with GET requests if HEAD was used, and is reported as dead link
        otherwise.

        @param useHEAD: resolve redirects with HEAD instead of GET requests
        @type useHEAD: bool
        @rtype: tuple of (bool, unicode)
        """
        try:
            wasRedirected = self.resolveRedirect(useHEAD=useHEAD)
        except UnicodeError as error:
            return False, 'Encoding Error: {0} ({1})'.format(
                error.__class__.__name__, error)
        except httplib.error as error:
            return False, 'HTTP Error: {}'.format(error.__class__.__name__)
        except socket.error as error:
            # TODO: decode msg. On Linux, it's encoded in UTF-8.
            # How is it encoded in Windows? Or can we somehow just
            # get the English message?
            return False, 'Socket Error: {}'.format(
                repr(self._socket_error_message(error)))

        if wasRedirected:
            looped = self.url in self.redirectChain
            if looped or len(self.redirectChain) >= 19:
                if useHEAD:
                    # Some servers don't seem to handle HEAD requests
                    # properly, which leads to a cyclic or a long (or
                    # infinite) list of redirects.
                    # We simply start from the beginning, but this time,
                    # we don't use HEAD, but GET requests.
                    redirChecker = LinkChecker(
                        self.redirectChain[0],
                        serverEncoding=self.serverEncoding,
                        HTTPignore=self.HTTPignore)
                    return redirChecker.check(useHEAD=False)

                urlList = [
                    '[{0}]'.format(url)
                    for url in self.redirectChain + [self.url]
                ]
                if looped:
                    template = 'HTTP Redirect Loop: {0}'
                else:
                    template = 'Long Chain of Redirects: {0}'
                return (False, template.format(' -> '.join(urlList)))

            redirChecker = LinkChecker(self.url,
                                       self.redirectChain,
                                       self.serverEncoding,
                                       HTTPignore=self.HTTPignore)
            return redirChecker.check(useHEAD=useHEAD)

        try:
            conn = self.getConnection()
        except httplib.error as error:
            return False, 'HTTP Error: {0}'.format(
                error.__class__.__name__)
        try:
            conn.request('GET', '{0}{1}'.format(self.path, self.query),
                         None, self.header)
        except socket.error as error:
            return False, 'Socket Error: {0}'.format(
                repr(self._socket_error_message(error)))
        try:
            self.response = conn.getresponse()
        except Exception as error:
            return False, 'Error: {0}'.format(error)
        # read the server's encoding, in case we need it later
        self.readEncodingFromResponse(self.response)
        # site down if the server status is between 400 and 499
        alive = not (400 <= self.response.status < 500)
        if self.response.status in self.HTTPignore:
            alive = False
        return alive, '{0} {1}'.format(self.response.status,
                                       self.response.reason)
Ejemplo n.º 9
0
 def ip_address_patched(IP):
     """Safe ip_address: hand IP to orig_ip_address as unicode text."""
     text = UnicodeType(IP)
     return orig_ip_address(text)
Ejemplo n.º 10
0
def mysql_query(query, params=None, dbname=None, verbose=None):
    """Yield rows from a MySQL query.

    An example query that yields all ns0 pages might look like::

        SELECT
         page_namespace,
         page_title
        FROM page
        WHERE page_namespace = 0;

    From MediaWiki 1.5, all projects use Unicode (UTF-8) character encoding.
    Cursor charset is utf8.

    The cursor and the connection are closed when the generator is
    exhausted, closed early or aborted by an exception.

    @param query: MySQL query to execute
    @type query: str (unicode in py2)
    @param params: input parameters for the query, if needed
        if list or tuple, %s shall be used as placeholder in the query string.
        if a dict, %(key)s shall be used as placeholder in the query string.
    @type params: tuple, list or dict of str (unicode in py2)
    @param dbname: db name
    @type dbname: str
    @param verbose: if True, print query to be executed;
        if None, config.verbose_output will be used.
    @type verbose: None or bool
    @return: generator which yield tuples
    """
    # These are specified in config2.py or user-config.py
    if verbose is None:
        verbose = config.verbose_output

    if config.db_connect_file is None:
        credentials = {'user': config.db_username,
                       'passwd': config.db_password}
    else:
        credentials = {'read_default_file': config.db_connect_file}

    conn = mysqldb.connect(config.db_hostname,
                           db=config.db_name_format.format(dbname),
                           port=config.db_port,
                           charset='utf8',
                           **credentials)

    # try/finally releases cursor and connection even when the consumer
    # stops iterating early or the query fails.
    try:
        cursor = conn.cursor()
        try:
            if verbose:
                try:
                    _query = cursor.mogrify(query, params)
                except AttributeError:  # if MySQLdb is used.
                    # Not exactly the same encoding handling as
                    # cursor.execute(). Here it is just for the sake of
                    # verbose.
                    _query = query
                    if params is not None:
                        # The placeholders are %s / %(key)s, so they need
                        # %-interpolation; a list must become a tuple.
                        if isinstance(params, list):
                            _query = query % tuple(params)
                        else:
                            _query = query % params

                if not isinstance(_query, UnicodeType):
                    _query = UnicodeType(_query, encoding='utf-8')
                _query = _query.strip()
                _query = '\n'.join('    {0}'.format(line)
                                   for line in _query.splitlines())
                pywikibot.output('Executing query:\n%s' % _query)

            cursor.execute(query, params)

            for row in cursor:
                yield row
        finally:
            cursor.close()
    finally:
        conn.close()