def mysql_query(query, params=None, dbname=None, verbose=None):
    """Yield rows from a MySQL query.

    An example query that yields all ns0 pages might look like::

        SELECT
         page_namespace,
         page_title
        FROM page
        WHERE page_namespace = 0;

    From MediaWiki 1.5, all projects use Unicode (UTF-8) character
    encoding; the cursor charset is therefore utf8.

    @param query: MySQL query to execute
    @type query: str (unicode in py2)
    @param params: input parameters for the query, if needed
        if list or tuple, %s shall be used as placeholder in the query
        string. if a dict, %(key)s shall be used as placeholder in the
        query string.
    @type params: tuple, list or dict of str (unicode in py2)
    @param dbname: db name
    @type dbname: str
    @param verbose: if True, print query to be executed;
        if None, config.verbose_output will be used.
    @type verbose: None or bool
    @return: generator which yield tuples
    """
    # Connection settings come from config2.py or user-config.py.
    if verbose is None:
        verbose = config.verbose_output

    if config.db_connect_file is not None:
        credentials = {'read_default_file': config.db_connect_file}
    else:
        credentials = {'user': config.db_username,
                       'passwd': config.db_password}

    connection = pymysql.connect(config.db_hostname,
                                 db=config.db_name_format.format(dbname),
                                 port=config.db_port,
                                 charset='utf8',
                                 **credentials)
    # closing() guarantees connection and cursor are released even when
    # the caller abandons the generator early.
    with closing(connection) as conn, closing(conn.cursor()) as cursor:
        if verbose:
            shown = cursor.mogrify(query, params)
            if not isinstance(shown, UnicodeType):
                shown = UnicodeType(shown, encoding='utf-8')
            indented = '\n'.join(' {0}'.format(part)
                                 for part in shown.strip().splitlines())
            pywikibot.output('Executing query:\n' + indented)

        cursor.execute(query, params)
        for row in cursor:
            yield row
def xml(self, table):
    """
    Fetch and parse XML for a table, memoizing the result.

    @param table: table of data to fetch
    @type table: basestring
    @rtype: list of dict (one dict of field name -> text per <row>)
    """
    # Serve from the per-instance cache when available.
    if table in self._data.setdefault('xml', {}):
        return self._data['xml'][table]

    # cElementTree is deprecated since Python 3.3 and was removed in
    # Python 3.9; plain ElementTree uses the C accelerator automatically.
    from xml.etree import ElementTree

    data = self.raw_cached(table, 'xml')

    f = BytesIO(data)
    tree = ElementTree.parse(f)

    data = []
    for row in tree.findall('row'):
        site = {}
        for field in row.findall('field'):
            name = UnicodeType(field.get('name'))
            site[name] = UnicodeType(field.text)
        data.append(site)

    self._data['xml'][table] = data
    return data
def _convert_bytes(self, result):
    """Convert everything into unicode."""
    if PY2 and isinstance(result, str):
        # Only the empty byte string is expected on Python 2.
        assert result == b''
        return ''  # the unicode equivalent
    if isinstance(result, UnicodeType):
        return result
    return UnicodeType(result)
def test_unicode_method(self):
    """Test __unicode__() method."""
    djvu = DjVuFile(self.file_djvu)
    expected = "DjVuFile('{}')".format(self.file_djvu)
    # On Python 2 unicode() dispatches to __unicode__(); on Python 3
    # the dunder must be invoked explicitly.
    actual = UnicodeType(djvu) if PY2 else djvu.__unicode__()
    self.assertEqual(actual, expected)
def changeUrl(self, url):
    """Change url."""
    self.url = url
    # The fragment is deliberately discarded.
    (self.scheme, self.host, self.path, self.query,
     self.fragment) = urlparse.urlsplit(self.url)
    self.path = self.path or '/'
    if self.query:
        self.query = '?' + self.query
    self.protocol = url.split(':', 1)[0]
    # Percent-encode non-ASCII characters in path/query using an
    # encoding that hopefully matches the server's.
    try:
        self.path.encode('ascii')
        self.query.encode('ascii')
    except UnicodeEncodeError:
        encoding = self.getEncodingUsedByServer()
        self.path = UnicodeType(urllib.quote(self.path.encode(encoding)))
        self.query = UnicodeType(
            urllib.quote(self.query.encode(encoding), '=&'))
def pressedOK(self):
    """
    Perform OK operation.

    Called when user pushes the OK button. Saves the buffer into a
    variable, and closes the window.
    """
    text = self.editbox.get('1.0', Tkinter.END)
    # On Python 2, get() returns a byte string for pure-ASCII content
    # and unicode otherwise; normalise so callers always see unicode.
    if PY2 and isinstance(text, str):
        text = UnicodeType(text)
    self.text = text
    self.parent.destroy()
def args(self):
    """Expose args."""
    reason_text = UnicodeType(self.reason)
    return reason_text
class LinkChecker(object):

    """
    Check links.

    Given a HTTP URL, tries to load the page from the Internet and checks
    if it is still online.

    Returns a (boolean, string) tuple saying if the page is online and
    including a status reason.

    Per-domain user-agent faking is not supported in this deprecated class.

    Warning: Also returns false if your Internet connection isn't working
    correctly! (This will give a Socket Error)
    """

    def __init__(self, url, redirectChain=None, serverEncoding=None,
                 HTTPignore=None):
        """
        Initializer.

        redirectChain is a list of redirects which were resolved by
        resolveRedirect(). This is needed to detect redirect loops.
        """
        # Mutable default arguments would be shared between instances;
        # use None sentinels instead (backwards compatible with callers
        # that passed lists explicitly).
        if redirectChain is None:
            redirectChain = []
        if HTTPignore is None:
            HTTPignore = []

        self.url = url
        self.serverEncoding = serverEncoding

        fake_ua_config = config.fake_user_agent_default.get(
            'weblinkchecker', False)
        if fake_ua_config and isinstance(fake_ua_config, str):
            user_agent = fake_ua_config
        elif fake_ua_config:
            user_agent = comms.http.fake_user_agent()
        else:
            user_agent = comms.http.user_agent()

        self.header = {
            'user-agent': user_agent,
            'Accept': 'text/xml,application/xml,application/xhtml+xml,'
                      'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '30',
            'Connection': 'keep-alive',
        }
        self.redirectChain = redirectChain + [url]
        self.changeUrl(url)
        self.HTTPignore = HTTPignore

    def getConnection(self):
        """Get a connection.

        @raise NotAnURLError: scheme is neither http nor https
        """
        if self.scheme == 'http':
            return httplib.HTTPConnection(self.host)
        elif self.scheme == 'https':
            return httplib.HTTPSConnection(self.host)
        else:
            raise NotAnURLError(self.url)

    def getEncodingUsedByServer(self):
        """Get encoding used by server, caching it in self.serverEncoding."""
        if not self.serverEncoding:
            try:
                pywikibot.output(
                    'Contacting server %s to find out its default encoding...'
                    % self.host)
                conn = self.getConnection()
                conn.request('HEAD', '/', None, self.header)
                self.response = conn.getresponse()
                self.readEncodingFromResponse(self.response)
            except Exception:
                # Best-effort probe; fall back to Latin-1 below.
                pass
        if not self.serverEncoding:
            # TODO: We might also load a page, then check for an encoding
            # definition in a HTML meta tag.
            pywikibot.output("Error retrieving server's default charset. "
                            'Using ISO 8859-1.')
            # most browsers use ISO 8859-1 (Latin-1) as the default.
            self.serverEncoding = 'iso8859-1'
        return self.serverEncoding

    def readEncodingFromResponse(self, response):
        """Read encoding from response."""
        if not self.serverEncoding:
            try:
                ct = response.getheader('Content-Type')
                charsetR = re.compile('charset=(.+)')
                charset = charsetR.search(ct).group(1)
                self.serverEncoding = charset
            except Exception:
                # Missing/odd Content-Type header; leave encoding unset.
                pass

    def changeUrl(self, url):
        """Change url."""
        self.url = url
        # we ignore the fragment
        (self.scheme, self.host, self.path, self.query,
         self.fragment) = urlparse.urlsplit(self.url)
        if not self.path:
            self.path = '/'
        if self.query:
            self.query = '?' + self.query
        self.protocol = url.split(':', 1)[0]
        # check if there are non-ASCII characters inside path or query,
        # and if so, encode them in an encoding that hopefully is the
        # right one.
        try:
            self.path.encode('ascii')
            self.query.encode('ascii')
        except UnicodeEncodeError:
            encoding = self.getEncodingUsedByServer()
            self.path = UnicodeType(urllib.quote(self.path.encode(encoding)))
            self.query = UnicodeType(
                urllib.quote(self.query.encode(encoding), '=&'))

    def resolveRedirect(self, useHEAD=False):
        """
        Return the redirect target URL as a string, if it is a HTTP redirect.

        If useHEAD is true, uses the HTTP HEAD method, which saves bandwidth
        by not downloading the body. Otherwise, the HTTP GET method is used.

        @rtype: str or None
        """
        conn = self.getConnection()
        try:
            if useHEAD:
                conn.request('HEAD', '%s%s' % (self.path, self.query), None,
                             self.header)
            else:
                conn.request('GET', '%s%s' % (self.path, self.query), None,
                             self.header)
            self.response = conn.getresponse()
            # read the server's encoding, in case we need it later
            self.readEncodingFromResponse(self.response)
        except httplib.BadStatusLine:
            # Some servers don't seem to handle HEAD requests properly,
            # e.g. http://www.radiorus.ru/ which is running on a very old
            # Apache server. Using GET instead works on these (but it uses
            # more bandwidth).
            if useHEAD:
                return self.resolveRedirect(useHEAD=False)
            else:
                raise
        if self.response.status >= 300 and self.response.status <= 399:
            # to debug, print response.getheaders()
            redirTarget = self.response.getheader('Location')
            if redirTarget:
                try:
                    redirTarget.encode('ascii')
                except UnicodeError:
                    redirTarget = redirTarget.decode(
                        self.getEncodingUsedByServer())
                if redirTarget.startswith(('http://', 'https://')):
                    self.changeUrl(redirTarget)
                    return True
                elif redirTarget.startswith('/'):
                    self.changeUrl('{0}://{1}{2}'.format(
                        self.protocol, self.host, redirTarget))
                    return True
                else:  # redirect to relative position
                    # cut off filename
                    directory = self.path[:self.path.rindex('/') + 1]
                    # handle redirect to parent directory
                    while redirTarget.startswith('../'):
                        redirTarget = redirTarget[3:]
                        # some servers redirect to .. although we are already
                        # in the root directory; ignore this.
                        if directory != '/':
                            # change /foo/bar/ to /foo/
                            directory = directory[:-1]
                            directory = directory[:directory.rindex('/') + 1]
                    self.changeUrl('{0}://{1}{2}{3}'.format(
                        self.protocol, self.host, directory, redirTarget))
                    return True
        else:
            return False  # not a redirect

    def check(self, useHEAD=False):
        """
        Return True and the server status message if the page is alive.

        @rtype: tuple of (bool, unicode)
        """
        try:
            wasRedirected = self.resolveRedirect(useHEAD=useHEAD)
        except UnicodeError as error:
            return False, 'Encoding Error: {0} ({1})'.format(
                error.__class__.__name__, error)
        except httplib.error as error:
            return False, 'HTTP Error: {}'.format(error.__class__.__name__)
        except socket.error as error:
            # https://docs.python.org/3/library/socket.html :
            # socket.error :
            # The accompanying value is either a string telling what went
            # wrong or a pair (errno, string) representing an error
            # returned by a system call, similar to the value
            # accompanying os.error
            if isinstance(error, UnicodeType):
                msg = error
            else:
                try:
                    msg = error[1]
                except IndexError:
                    pywikibot.output('### DEBUG information for T57282')
                    raise IndexError(type(error))
            # TODO: decode msg. On Linux, it's encoded in UTF-8.
            # How is it encoded in Windows? Or can we somehow just
            # get the English message?
            return False, 'Socket Error: {}'.format(repr(msg))
        if wasRedirected:
            if self.url in self.redirectChain:
                if useHEAD:
                    # Some servers don't seem to handle HEAD requests properly,
                    # which leads to a cyclic list of redirects.
                    # We simply start from the beginning, but this time,
                    # we don't use HEAD, but GET requests.
                    redirChecker = LinkChecker(
                        self.redirectChain[0],
                        serverEncoding=self.serverEncoding,
                        HTTPignore=self.HTTPignore)
                    return redirChecker.check(useHEAD=False)
                else:
                    urlList = ['[{0}]'.format(url)
                               for url in self.redirectChain + [self.url]]
                    return (False,
                            'HTTP Redirect Loop: {0}'.format(
                                ' -> '.join(urlList)))
            elif len(self.redirectChain) >= 19:
                if useHEAD:
                    # Some servers don't seem to handle HEAD requests properly,
                    # which leads to a long (or infinite) list of redirects.
                    # We simply start from the beginning, but this time,
                    # we don't use HEAD, but GET requests.
                    redirChecker = LinkChecker(
                        self.redirectChain[0],
                        serverEncoding=self.serverEncoding,
                        HTTPignore=self.HTTPignore)
                    return redirChecker.check(useHEAD=False)
                else:
                    urlList = ['[{0}]'.format(url)
                               for url in self.redirectChain + [self.url]]
                    return (False,
                            'Long Chain of Redirects: {0}'.format(
                                ' -> '.join(urlList)))
            else:
                redirChecker = LinkChecker(self.url, self.redirectChain,
                                           self.serverEncoding,
                                           HTTPignore=self.HTTPignore)
                return redirChecker.check(useHEAD=useHEAD)
        else:
            try:
                conn = self.getConnection()
            except httplib.error as error:
                return False, 'HTTP Error: {0}'.format(
                    error.__class__.__name__)
            try:
                conn.request('GET', '{0}{1}'.format(self.path, self.query),
                             None, self.header)
            except socket.error as error:
                return False, 'Socket Error: {0}'.format(repr(error[1]))
            try:
                self.response = conn.getresponse()
            except Exception as error:
                return False, 'Error: {0}'.format(error)
            # read the server's encoding, in case we need it later
            self.readEncodingFromResponse(self.response)
            # site down if the server status is between 400 and 499
            alive = not (400 <= self.response.status < 500)
            if self.response.status in self.HTTPignore:
                alive = False
            return alive, '{0} {1}'.format(self.response.status,
                                           self.response.reason)
def ip_address_patched(IP):
    """Safe ip_address."""
    # Coerce to unicode first so byte strings are accepted too.
    unicode_ip = UnicodeType(IP)
    return orig_ip_address(unicode_ip)
def mysql_query(query, params=None, dbname=None, verbose=None):
    """Yield rows from a MySQL query.

    An example query that yields all ns0 pages might look like::

        SELECT
         page_namespace,
         page_title
        FROM page
        WHERE page_namespace = 0;

    From MediaWiki 1.5, all projects use Unicode (UTF-8) character encoding.
    Cursor charset is utf8.

    @param query: MySQL query to execute
    @type query: str (unicode in py2)
    @param params: input parameters for the query, if needed
        if list or tuple, %s shall be used as placeholder in the query
        string. if a dict, %(key)s shall be used as placeholder in the
        query string.
    @type params: tuple, list or dict of str (unicode in py2)
    @param dbname: db name
    @type dbname: str
    @param verbose: if True, print query to be executed;
        if None, config.verbose_output will be used.
    @type verbose: None or bool
    @return: generator which yield tuples
    """
    # These are specified in config2.py or user-config.py
    if verbose is None:
        verbose = config.verbose_output

    if config.db_connect_file is None:
        credentials = {'user': config.db_username,
                       'passwd': config.db_password}
    else:
        credentials = {'read_default_file': config.db_connect_file}

    conn = mysqldb.connect(config.db_hostname,
                           db=config.db_name_format.format(dbname),
                           port=config.db_port,
                           charset='utf8',
                           **credentials)
    cursor = conn.cursor()
    # try/finally guarantees the cursor and connection are released even
    # when the caller abandons the generator early or an exception is
    # raised while iterating.
    try:
        if verbose:
            try:
                _query = cursor.mogrify(query, params)
            except AttributeError:
                # if MySQLdb is used.
                # Not exactly the same encoding handling as cursor.execute()
                # Here it is just for the sake of verbose.
                _query = query
                if params is not None:
                    # The query uses %s / %(key)s placeholders, so use
                    # %-interpolation (str.format would look for {}).
                    _query = query % params
            if not isinstance(_query, UnicodeType):
                _query = UnicodeType(_query, encoding='utf-8')
            _query = _query.strip()
            _query = '\n'.join(' {0}'.format(line)
                               for line in _query.splitlines())
            pywikibot.output('Executing query:\n%s' % _query)

        cursor.execute(query, params)
        for row in cursor:
            yield row
    finally:
        cursor.close()
        conn.close()