Example #1
def getInternetArchiveURL(url, timestamp=None):
    """Return archived URL by Internet Archive.

    Parameters:
        url - url to search an archived version for
        timestamp - requested archive date. The version closest to that moment
                    is returned. Format: YYYYMMDDhhmmss or part thereof.

    See [[:mw:Archived Pages]] and https://archive.org/help/wayback_api.php
    for more details.
    """
    import json
    from urllib import urlencode  # Python 2; on Python 3: from urllib.parse import urlencode
    uri = u'https://archive.org/wayback/available?'

    query = {'url': url}

    if timestamp is not None:
        query['timestamp'] = timestamp

    uri = uri + urlencode(query)
    jsontext = http.request(uri=uri, site=None)
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
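A minimal usage sketch (hypothetical URL and timestamp, assuming the imports above are in place):

archived = getInternetArchiveURL('http://example.com', timestamp='20150701')
if archived is not None:
    print(archived)  # closest snapshot, e.g. an https://web.archive.org/web/... URL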
Example #2
def getWebCitationURL(url, timestamp=None):
    """Return archived URL by Web Citation.

    Parameters:
        url - url to search an archived version for
        timestamp - requested archive date. The version closest to that moment
                    is returned. Format: YYYYMMDDhhmmss or part thereof.

    See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
    for more details.
    """
    import xml.etree.ElementTree as ET
    from urllib import urlencode  # Python 2; on Python 3: from urllib.parse import urlencode
    uri = u'http://www.webcitation.org/query?'

    query = {'returnxml': 'true', 'url': url}

    if timestamp is not None:
        query['date'] = timestamp

    uri = uri + urlencode(query)
    xmltext = http.request(uri=uri, site=None)
    if "success" in xmltext:
        data = ET.fromstring(xmltext)
        return data.find('.//webcite_url').text
    else:
        return None
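A similar sketch for the WebCite variant (illustrative URL and date):

archived = getWebCitationURL('http://example.com', timestamp='20150701')
if archived is None:
    print('No WebCite snapshot found for that URL.')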
Example #3
def getWebCitationURL(url, timestamp=None):
    """Return archived URL by Web Citation.

    Parameters:
        url - url to search an archived version for
        timestamp - requested archive date. The version closest to that moment
                    is returned. Format: YYYYMMDDhhmmss or part thereof.

    See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
    for more details.
    """
    import xml.etree.ElementTree as ET
    from urllib import urlencode  # Python 2; on Python 3: from urllib.parse import urlencode
    uri = u'http://www.webcitation.org/query?'

    query = {'returnxml': 'true',
             'url': url}

    if timestamp is not None:
        query['date'] = timestamp

    uri = uri + urlencode(query)
    xmltext = http.request(uri=uri, site=None)
    if "success" in xmltext:
        data = ET.fromstring(xmltext)
        return data.find('.//webcite_url').text
    else:
        return None
Example #4
    def test_https(self):
        """Test http.request using https://www.wikiquote.org/."""
        r = http.request(site=None, uri='https://www.wikiquote.org/')
        self.assertIsInstance(r, unicode)
        self.assertIn('<html lang="mul"', r)
        self.assertOneDeprecationParts(
            'Invoking http.request without argument site', 'http.fetch()')
Example #6
def github_svn_rev2hash(tag, rev):
    """Convert a Subversion revision to a Git hash using Github.

    @param tag: name of the Subversion repo on Github
    @param rev: Subversion revision identifier
    @return: the git hash
    @rtype: str
    """
    from io import StringIO
    import xml.dom.minidom
    from pywikibot.comms import http

    uri = 'https://github.com/wikimedia/%s/!svn/vcc/default' % tag
    data = http.request(site=None,
                        uri=uri,
                        method='PROPFIND',
                        body="<?xml version='1.0' encoding='utf-8'?>"
                        "<propfind xmlns=\"DAV:\"><allprop/></propfind>",
                        headers={
                            'label': str(rev),
                            'user-agent': 'SVN/1.7.5 {pwb}'
                        })

    dom = xml.dom.minidom.parse(StringIO(data))
    hsh = dom.getElementsByTagName("C:git-commit")[0].firstChild.nodeValue
    return hsh
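A hypothetical invocation (the repository tag and revision number are illustrative):

# Map Subversion revision r10000 of the mirrored repo to its git hash.
git_hash = github_svn_rev2hash('pywikibot-core', 10000)
print(git_hash)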
Example #7
def getInternetArchiveURL(url, timestamp=None):
    """Return archived URL by Internet Archive.

    Parameters:
        url - url to search an archived version for
        timestamp - requested archive date. The version closest to that moment
                    is returned. Format: YYYYMMDDhhmmss or part thereof.

    See [[:mw:Archived Pages]] and https://archive.org/help/wayback_api.php
    for more details.
    """
    import json
    from urllib import urlencode  # Python 2; on Python 3: from urllib.parse import urlencode
    uri = u'https://archive.org/wayback/available?'

    query = {'url': url}

    if timestamp is not None:
        query['timestamp'] = timestamp

    uri = uri + urlencode(query)
    jsontext = http.request(uri=uri, site=None)
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
Example #8
    def test_https_ignore_cert_error(self):
        """Test http.request ignoring invalid vikidia SSL certificate."""
        # As the connection is cached, the above test will cause
        # subsequent requests to go to the existing, broken, connection.
        # So, this uses a different host, which hopefully hasn't been
        # connected previously by other tests.
        r = http.request(site=None,
                         uri='https://en.vikidia.org/wiki/Main_Page',
                         disable_ssl_certificate_validation=True)
        self.assertIsInstance(r, unicode)
        self.assertIn('<title>Vikidia</title>', r)
Example #10
def postForm(site, address, predata, method="POST"):
    """POST predata to the site's api.php and return the response."""
    # replaces:
    # data = pywikibot.getSite().postForm(address, predata=predata)

    # note: the incoming address argument is overridden by the API path
    address = site.family.apipath(site.lang)

    from pywikibot.comms import http
    from urllib import urlencode

    urldata = urlencode(predata)
    data = http.request(site, uri=address, method=method, body=urldata)
    return data
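A sketch of a call, given a pywikibot site object (parameters are illustrative; the address argument is recomputed internally):

predata = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}
data = postForm(site, None, predata)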
Example #11
def getversion_onlinerepo(repo=None):
    """Retrieve current framework revision number from online repository.

    @param repo: (optional) Online repository location
    @type repo: URL or string
    """
    from pywikibot.comms import http

    url = repo or 'https://git.wikimedia.org/feed/pywikibot/core'
    hsh = None
    buf = http.request(site=None, uri=url)
    buf = buf.split('\r\n')
    try:
        # fragile: relies on the fixed layout of the gitweb Atom feed
        hsh = buf[13].split('/')[5][:-1]
    except Exception as e:
        raise ParseError(repr(e) + ' while parsing ' + repr(buf))
    return hsh
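For instance, with the default repository (assuming the feed is reachable):

rev = getversion_onlinerepo()  # raises ParseError if the feed cannot be parsed
print(rev)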
Example #12
def getInternetArchiveURL(site, url, timestamp=None):
    """Return archived URL by Internet Archive."""
    # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
    import json
    query = u'http://archive.org/wayback/available?'
    query += u'url='
    query += url
    if timestamp is not None:
        query += u'&timestamp='
        query += timestamp
    if pywikibot.verbose:
        pywikibot.output(u"Requesting query from Internet Archive: %s" % query)
    jsontext = http.request(uri=query, site=site, retry=False, no_hostname=True)
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
Example #13
def getWebCitationURL(site, url, timestamp=None):
    """Return archived URL by Web Citation."""
    # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
    from BeautifulSoup import BeautifulStoneSoup
    query = u'http://www.webcitation.org/query?'
    query += u'returnxml=true'
    query += u'&url='
    query += url
    if timestamp is not None:
        query += u'&date='
        query += timestamp
    if pywikibot.verbose:
        pywikibot.output(u"Requesting query from Web Citation: %s" % query)
    xmltext = http.request(uri=query, site=site, retry=False, no_hostname=True)
    if "success" in xmltext:
        data = BeautifulStoneSoup(xmltext)
        return data.find('webcite_url').string
    else:
        return None
Example #14
    def getDataFromHost(self, queryStr):
        """
        Go and fetch a query from the host's API.
        """
        url = self.getUrl(queryStr)

        try:
            resp = http.request(None, url)
        except Exception:
            pywikibot.warning(u"Failed to retrieve %s" % url)
            raise

        try:
            data = json.loads(resp)
        except ValueError:
            pywikibot.warning(u"Data received from host but no JSON could be decoded")
            raise pywikibot.ServerError

        return data
Example #16
def getInternetArchiveURL(site, url, timestamp=None):
    """Return archived URL by Internet Archive."""
    # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
    import json
    query = u'http://archive.org/wayback/available?'
    query += u'url='
    query += url
    if timestamp is not None:
        query += u'&timestamp='
        query += timestamp
    if pywikibot.verbose:
        pywikibot.output(u"Requesting query from Internet Archive: %s" % query)
    jsontext = http.request(uri=query,
                            site=site,
                            retry=False,
                            no_hostname=True)
    if "closest" in jsontext:
        data = json.loads(jsontext)
        return data['archived_snapshots']['closest']['url']
    else:
        return None
Example #17
def github_svn_rev2hash(tag, rev):
    """Convert a Subversion revision to a Git hash using Github.

    @param tag: name of the Subversion repo on Github
    @param rev: Subversion revision identifier
    @return: the git hash
    @rtype: str
    """
    from io import StringIO
    import xml.dom.minidom
    from pywikibot.comms import http

    uri = 'https://github.com/wikimedia/%s/!svn/vcc/default' % tag
    data = http.request(site=None, uri=uri, method='PROPFIND',
                        body="<?xml version='1.0' encoding='utf-8'?>"
                        "<propfind xmlns=\"DAV:\"><allprop/></propfind>",
                        headers={'label': str(rev), 'user-agent': 'SVN/1.7.5 {pwb}'})

    dom = xml.dom.minidom.parse(StringIO(data))
    hsh = dom.getElementsByTagName("C:git-commit")[0].firstChild.nodeValue
    return hsh
Example #18
    def test_https(self):
        """Test http.request using https://www.wikiquote.org/."""
        r = http.request(site=None, uri='https://www.wikiquote.org/')
        self.assertIsInstance(r, unicode)
        self.assertIn('<html lang="mul"', r)
Example #19
    def submit(self):
        """Submit a query and parse the response.

        @return: a dict containing data retrieved from api.php

        """
        while True:
            paramstring = self.http_params()
            action = self.params.get("action", "")
            simulate = self._simulate(action)
            if simulate:
                return simulate
            if self.throttle:
                self.site.throttle(write=self.write)
            else:
                pywikibot.log("Action '{0}' is submitted without throttling.".format(action))
            uri = self.site.scriptpath() + "/api.php"
            try:
                if self.mime:
                    # construct a MIME message containing all API key/values
                    container = MIMEMultipart(_subtype='form-data')
                    for key in self.params:
                        # key "file" requires special treatment in a multipart
                        # message
                        if key == "file":
                            local_filename = self.params[key]
                            filetype = mimetypes.guess_type(local_filename)[0] \
                                or 'application/octet-stream'
                            file_content = open(local_filename, "rb").read()
                            submsg = Request._generate_MIME_part(
                                key, file_content, filetype.split('/'),
                                {'filename': local_filename})
                        else:
                            submsg = Request._generate_MIME_part(
                                key, self.params[key], None, None)
                        container.attach(submsg)
                    for key, value in self.mime_params.items():
                        container.attach(Request._generate_MIME_part(key, *value))
                    # strip the headers to get the HTTP message body
                    body = container.as_string()
                    marker = "\n\n"  # separates headers from body
                    eoh = body.find(marker)
                    body = body[eoh + len(marker):]
                    # retrieve the headers from the MIME object
                    headers = dict(list(container.items()))
                else:
                    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
                    body = paramstring

                rawdata = http.request(
                    self.site, uri, method="POST",
                    headers=headers, body=body)

#                import traceback
#                traceback.print_stack()
#                print rawdata
            except Server504Error:
                pywikibot.log(u"Caught HTTP 504 error; retrying")
                self.wait()
                continue
            except FatalServerError:
                # This error is not going to be fixed by just waiting
                pywikibot.error(traceback.format_exc())
                raise
            # TODO: what other exceptions can occur here?
            except Exception:
                # for any other error on the http request, wait and retry
                pywikibot.error(traceback.format_exc())
                pywikibot.log(u"%s, %s" % (uri, paramstring))
                self.wait()
                continue
            if not isinstance(rawdata, unicode):
                rawdata = rawdata.decode(self.site.encoding())
            pywikibot.debug(u"API response received:\n" + rawdata, _logger)
            if rawdata.startswith(u"unknown_action"):
                raise APIError(rawdata[:14], rawdata[16:])
            try:
                result = json.loads(rawdata)
            except ValueError:
                # if the result isn't valid JSON, there must be a server
                # problem.  Wait a few seconds and try again
                pywikibot.warning(
                    "Non-JSON response received from server %s; the server may be down."
                    % self.site)
                pywikibot.debug(rawdata, _logger)
                # there might also be an overflow, so try a smaller limit
                for param in self.params:
                    if param.endswith("limit"):
                        value = self.params[param]
                        try:
                            self.params[param] = str(int(value) // 2)
                            pywikibot.output(u"Set %s = %s"
                                             % (param, self.params[param]))
                        except (TypeError, ValueError):
                            pass
                self.wait()
                continue
            if not result:
                result = {}
            if not isinstance(result, dict):
                raise APIError("Unknown",
                               "Unable to process query response of type %s."
                               % type(result),
                               data=result)
            if self['action'] == 'query':
                if 'userinfo' in result.get('query', ()):
                    if hasattr(self.site, '_userinfo'):
                        self.site._userinfo.update(result['query']['userinfo'])
                    else:
                        self.site._userinfo = result['query']['userinfo']
                status = self.site._loginstatus  # save previous login status
                if (("error" in result
                     and result["error"]["code"].endswith("limit"))
                    or (status >= 0
                        and self.site._userinfo['name'] != self.site._username[status])):
                    # user is no longer logged in (session expired?)
                    # reset userinfo, then make user log in again
                    del self.site._userinfo
                    self.site._loginstatus = -1
                    if status < 0:
                        status = 0  # default to non-sysop login
                    self.site.login(status)
                    # retry the previous query
                    continue
            self._handle_warnings(result)
            if "error" not in result:
                return result

            if "*" in result["error"]:
                # help text returned
                result['error']['help'] = result['error'].pop("*")
            code = result["error"].pop("code", "Unknown")
            info = result["error"].pop("info", None)
            if code == "maxlag":
                lag = lagpattern.search(info)
                if lag:
                    pywikibot.log(
                        u"Pausing due to database lag: " + info)
                    self.site.throttle.lag(int(lag.group("lag")))
                    continue

            if code.startswith(u'internal_api_error_'):
                class_name = code[len(u'internal_api_error_'):]
                if class_name in ['DBConnectionError',  # r 4984 & r 4580
                                  'DBQueryError',  # bug 58158
                                  'ReadOnlyError'  # bug 59227
                                  ]:

                    pywikibot.log(u'MediaWiki exception %s; retrying.'
                                  % class_name)
                    self.wait()
                    continue

                pywikibot.log(u"MediaWiki exception %s: query=\n%s"
                              % (class_name,
                                 pprint.pformat(self.params)))
                pywikibot.log(u"           response=\n%s" % result)

                raise APIMWException(class_name, info, **result["error"])

            # bugs 46535, 62126, 64494, 66619
            # may be removed when bug 46535 is solved
            if code == "failed-save" and \
               action == 'wbeditentity' and \
               self._is_wikibase_error_retryable(result["error"]):
                self.wait()
                continue
            # raise error
            try:
                pywikibot.log(u"API Error: query=\n%s"
                              % pprint.pformat(self.params))
                pywikibot.log(u"           response=\n%s"
                              % result)

                raise APIError(code, info, **result["error"])
            except TypeError:
                raise RuntimeError(result)
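A minimal sketch of how such a Request is driven from calling code, given a site object (assuming the era's pywikibot.data.api.Request constructor, which accepted API parameters as keyword arguments):

from pywikibot.data import api

# submit() loops internally until it returns the parsed JSON dict
# or raises APIError / RuntimeError.
req = api.Request(site=site, action='query', meta='userinfo')  # hypothetical parameters
result = req.submit()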
Example #21
    def test_get(self):
        r = http.request(site=None, uri='http://www.wikipedia.org/')
        self.assertIsInstance(r, str if sys.version_info[0] >= 3 else unicode)
        self.assertIn('<html lang="mul"', r)
Example #23
    def test_http(self):
        """Test http request function."""
        r = http.request(site=None, uri='http://www.wikipedia.org/')
        self.assertIsInstance(r, unicode)
        self.assertIn('<html lang="mul"', r)
Example #24
    def subTemplate(self, content, param):
        """Substitute the template tags in content according to param.

           @param content: Content with tags to substitute.
           @type  content: string
           @param param: Param with data how to substitute tags.
           @type  param: dict

           Returns a tuple containing the new content with tags
           substituted and a list of those tags.
        """

        substed_tags = []  # DRTRIGON-73
        metadata     = {'mw-signature': u'~~~~',
                        'mw-timestamp': u'~~~~~',}  # DRTRIGON-132

        # 0.2.) check for 'simple' mode and get additional params
        if param['simple']:
            p = self.site.getExpandedString(param['simple'])
            param.update(pywikibot.extract_templates_and_params(p)[0][1])

        # 0.5.) check cron/date
        if param['cron']:
            # [min] [hour] [day of month] [month] [day of week]
            # (date supported only, thus [min] and [hour] dropped)
            if not (param['cron'][0] == '@'):
                param['cron'] = '* * ' + param['cron']
            entry = crontab.CronTab(param['cron'])
            # find the delay from midnight (does not return 0.0 - but next)
            delay = entry.next(datetime.datetime.now().replace(hour=0,
                                                               minute=0,
                                                               second=0,
                                                               microsecond=0)- \
                               datetime.timedelta(microseconds=1))

            pywikibot.output(u'CRON delay for execution: %.3f (<= %i)'
                             % (delay, self._bot_config['CRONMaxDelay']))

            if not (delay <= self._bot_config['CRONMaxDelay']):
                return (content, substed_tags, metadata)

        # 1.) getUrl or wiki text
        # (security: check url not to point to a local file on the server,
        #  e.g. 'file://' - same as used in xsalt.py)
        secure = False
        for item in [u'http://', u'https://',
                     u'mail://', u'local://', u'wiki://']:
            secure = secure or (param['url'][:len(item)] == item)
        param['zip'] = ast.literal_eval(param['zip'])
        if not secure:
            return (content, substed_tags, metadata)
        if   (param['url'][:7] == u'wiki://'):
            url = param['url'][7:].strip('[]')              # enable wiki-links
            if ast.literal_eval(param['expandtemplates']):  # DRTRIGON-93 (only with 'wiki://')
                external_buffer = pywikibot.Page(self.site,
                                                 url).get(expandtemplates=True)
            else:
                external_buffer = self.load( pywikibot.Page(self.site, url) )
        elif (param['url'][:7] == u'mail://'):              # DRTRIGON-101
            url = param['url'].replace(u'{{@}}', u'@')     # e.g. nlwiki
            mbox = SubsterMailbox(
              pywikibot.config.datafilepath(self._bot_config['data_path'],
                                            self._bot_config['mbox_file'], ''))
            external_buffer = mbox.find_data(url)
            mbox.close()
        elif (param['url'][:8] == u'local://'):             # DRTRIGON-131
            if (param['url'][8:] == u'cache/state_bots'):
                # filename hard-coded
                d = shelve.open(pywikibot.config.datafilepath('cache',
                                                              'state_bots'))
                external_buffer = pprint.pformat(
                    ast.literal_eval(pprint.pformat(d)))
                d.close()
            else:
                external_buffer = u'n/a'
        else:
            # consider using 'expires', 'last-modified', 'etag' in order to
            # make the updating data requests more efficient! use those stored
            # on page, if the user placed them, else use the conventional mode.
            # http://www.diveintopython.net/http_web_services/etags.html
            f_url, external_buffer = http.request(self.site, param['url'],
                                                  no_hostname = True,
                                                  back_response = True)
            headers = f_url.headers # same like 'f_url.info()'
            #if param['zip']:
            if ('text/' not in headers['content-type']):
                pywikibot.output(u'Source is of non-text content-type, '
                                 u'using raw data instead.')
                external_buffer = f_url.read()
            del f_url               # free some memory (no need to keep copy)

            for h in ['content-length', 'date', 'last-modified', 'expires']:
                if h in headers:
                    metadata['url-%s' % h] = headers[h]

        # some intermediate processing (unzip, xlsx2csv, ...)
        if param['zip']:    # 'application/zip', ...
            fileno          = 0 if (param['zip'] == True) else (param['zip']-1)
            external_buffer = self.unzip(external_buffer, fileno)
        if param['xlsx']:   # 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            external_buffer = self.xlsx2csv(external_buffer, param['xlsx'])
        if param['ods']:    # 'application/vnd.oasis.opendocument.spreadsheet'
            external_buffer = self.ods2csv(external_buffer, param['ods'])

        if not ast.literal_eval(param['beautifulsoup']):    # DRTRIGON-88
            # 2.) regexp
            #for subitem in param['regex']:
            subitem = param['regex']
            regex = re.compile(subitem, re.S | re.I)

            # 3.) subst in content
            external_data = regex.search(external_buffer)

            external_data_dict = {}
            if external_data:    # not None
                external_data = external_data.groups()

                pywikibot.output(u'Groups found by regex: %i'
                                 % len(external_data))

                # DRTRIGON-114: Support for named groups in regexs
                if regex.groupindex:
                    for item in regex.groupindex:
                        external_data_dict[u'%s-%s' % (param['value'], item)] = external_data[regex.groupindex[item]-1]
                elif (len(external_data) == 1):
                    external_data_dict = {param['value']: external_data[0]}
                else:
                    external_data_dict = {param['value']: str(external_data)}
            pywikibot.debug( str(external_data_dict) )

            param['postproc'] = eval(param['postproc'])
            # should be secured as given below, but needs code changes in wiki too
            #param['postproc'] = ast.literal_eval(param['postproc'])
            for value in external_data_dict:
                external_data = external_data_dict[value]

                # 4.) postprocessing
                func  = param['postproc'][0]    # needed by exec call of self._code
                DATA  = [ external_data ]       #
                args  = param['postproc'][1:]   #
                scope = {}                      # (scope to run in)
                scope.update( locals() )        # (add DATA, *args, ...)
                scope.update( globals() )       # (add imports and else)
                if func:
                    exec(self._code + (self._bot_config['CodeTemplate'] % func), scope, scope)
                    external_data = DATA[0]
                pywikibot.debug( external_data )

                # 5.) subst content
                (content, tags) = self.subTag(content, value, external_data, int(param['count']))
                substed_tags += tags
        else:
            # DRTRIGON-105: Support for multiple BS template configurations
            value = param['value']
            if value:
                value += u'-'

            # DRTRIGON-88: Enable Beautiful Soup power for Subster
            BS_tags = self.get_BS_regex(value).findall(content)

            pywikibot.output(u'BeautifulSoup tags found by regex: %i' % len(BS_tags))

            prev_content = content

            BS = BeautifulSoup.BeautifulSoup(external_buffer)
            for item in BS_tags:
                external_data = eval('BS.%s' % item[1])
                external_data = self._BS_regex_str%{'var1':value+'BS:'+item[1],'var2':value,'cont':external_data}
                content = content.replace(item[0], external_data, 1)

            if (content != prev_content):
                substed_tags.append(value+'BS')

        metadata['bot-timestamp'] = pywikibot.Timestamp.now().isoformat(' ')

        return (content, substed_tags, metadata)
Example #25
    def test_get(self):
        r = http.request(site=None, uri='http://www.wikipedia.org/')
        self.assertIsInstance(r, str)
        self.assertIn('<html lang="mul"', r)
Example #26
    def submit(self):
        """Submit a query and parse the response.

        @return:  The data retrieved from api.php (a dict)

        """
        while True:
            paramstring = self.http_params()
            action = self.params.get("action", "")
            simulate = self._simulate(action)
            if simulate:
                return simulate
            self.site.throttle(write=self.write)
            uri = self.site.scriptpath() + "/api.php"
            ssl = False
            if self.site.family.name in config.available_ssl_project:
                if action == "login" and config.use_SSL_onlogin:
                    ssl = True
                elif config.use_SSL_always:
                    ssl = True
            try:
                if self.mime:
                    # construct a MIME message containing all API key/values
                    container = MIMEMultipart(_subtype='form-data')
                    for key in self.params:
                        # key "file" requires special treatment in a multipart
                        # message
                        if key == "file":
                            local_filename = self.params[key]
                            filetype = mimetypes.guess_type(local_filename)[0] \
                                or 'application/octet-stream'
                            file_content = open(local_filename, "rb").read()
                            submsg = MIMENonMultipart(*filetype.split("/"))
                            submsg.add_header("Content-disposition",
                                              "form-data",
                                              name=key,
                                              filename=local_filename)
                            submsg.set_payload(file_content)
                        else:
                            try:
                                self.params[key].encode("ascii")
                                keytype = ("text", "plain")
                            except UnicodeError:
                                keytype = ("application", "octet-stream")
                            submsg = MIMENonMultipart(*keytype)
                            submsg.add_header("Content-disposition",
                                              "form-data",
                                              name=key)
                            submsg.set_payload(self.params[key])
                        container.attach(submsg)
                    # strip the headers to get the HTTP message body
                    body = container.as_string()
                    marker = "\n\n"  # separates headers from body
                    eoh = body.find(marker)
                    body = body[eoh + len(marker):]
                    # retrieve the headers from the MIME object
                    mimehead = dict(list(container.items()))
                    rawdata = http.request(self.site,
                                           uri,
                                           ssl,
                                           method="POST",
                                           headers=mimehead,
                                           body=body)
                else:
                    rawdata = http.request(
                        self.site,
                        uri,
                        ssl,
                        method="POST",
                        headers={
                            'Content-Type': 'application/x-www-form-urlencoded'
                        },
                        body=paramstring)


#                import traceback
#                traceback.print_stack()
#                print rawdata
            except Server504Error:
                pywikibot.log(u"Caught HTTP 504 error; retrying")
                self.wait()
                continue
            except FatalServerError:
                # This error is not going to be fixed by just waiting
                pywikibot.error(traceback.format_exc())
                raise
            # TODO: what other exceptions can occur here?
            except Exception:
                # for any other error on the http request, wait and retry
                pywikibot.error(traceback.format_exc())
                pywikibot.log(u"%s, %s" % (uri, paramstring))
                self.wait()
                continue
            if not isinstance(rawdata, unicode):
                rawdata = rawdata.decode(self.site.encoding())
            pywikibot.debug(u"API response received:\n" + rawdata, _logger)
            if rawdata.startswith(u"unknown_action"):
                raise APIError(rawdata[:14], rawdata[16:])
            try:
                result = json.loads(rawdata)
            except ValueError:
                # if the result isn't valid JSON, there must be a server
                # problem.  Wait a few seconds and try again
                pywikibot.warning(
                    "Non-JSON response received from server %s; the server may be down."
                    % self.site)
                pywikibot.debug(rawdata, _logger)
                # there might also be an overflow, so try a smaller limit
                for param in self.params:
                    if param.endswith("limit"):
                        value = self.params[param]
                        try:
                            self.params[param] = str(int(value) // 2)
                            pywikibot.output(u"Set %s = %s" %
                                             (param, self.params[param]))
                        except (TypeError, ValueError):
                            pass
                self.wait()
                continue
            if not result:
                result = {}
            if not isinstance(result, dict):
                raise APIError("Unknown",
                               "Unable to process query response of type %s." %
                               type(result),
                               data=result)
            if self['action'] == 'query':
                if 'userinfo' in result.get('query', ()):
                    if hasattr(self.site, '_userinfo'):
                        self.site._userinfo.update(result['query']['userinfo'])
                    else:
                        self.site._userinfo = result['query']['userinfo']
                status = self.site._loginstatus  # save previous login status
                if (("error" in result
                     and result["error"]["code"].endswith("limit"))
                        or (status >= 0 and self.site._userinfo['name'] !=
                            self.site._username[status])):
                    # user is no longer logged in (session expired?)
                    # reset userinfo, then make user log in again
                    del self.site._userinfo
                    self.site._loginstatus = -1
                    if status < 0:
                        status = 0  # default to non-sysop login
                    self.site.login(status)
                    # retry the previous query
                    continue
            if "warnings" in result:
                modules = [k for k in result["warnings"] if k != "info"]
                for mod in modules:
                    if '*' in result["warnings"][mod]:
                        text = result["warnings"][mod]['*']
                    elif 'html' in result["warnings"][mod]:
                        # Bugzilla 49978
                        text = result["warnings"][mod]['html']['*']
                    else:
                        # This is just a warning, we shouldn't raise an
                        # exception because of it
                        continue
                    pywikibot.warning(u"API warning (%s): %s" % (mod, text))
            if "error" not in result:
                return result
            if "*" in result["error"]:
                # help text returned
                result['error']['help'] = result['error'].pop("*")
            code = result["error"].pop("code", "Unknown")
            info = result["error"].pop("info", None)
            if code == "maxlag":
                lag = lagpattern.search(info)
                if lag:
                    pywikibot.log(u"Pausing due to database lag: " + info)
                    self.site.throttle.lag(int(lag.group("lag")))
                    continue
            if code.startswith(u'internal_api_error_'):
                self.wait()
                continue
            # bugs 46535, 62126, 64494
            # may be removed when bug 46535 is solved
            if code == "failed-save" and action == 'wbeditentity':
                try:
                    message = result["error"]["messages"]["0"]["name"]
                except KeyError:
                    message = None
                if message == u'edit-already-exists':
                    self.wait()
                    continue
            # raise error
            try:
                pywikibot.log(u"API Error: query=\n%s" %
                              pprint.pformat(self.params))
                pywikibot.log(u"           response=\n%s" % result)
                raise APIError(code, info, **result["error"])
            except TypeError:
                raise RuntimeError(result)
Example #27
    def submit(self):
        """Submit a query and parse the response.

        @return:  The data retrieved from api.php (a dict)

        """
        paramstring = self.http_params()
        while True:
            action = self.params.get("action", "")
            simulate = self._simulate(action)
            if simulate:
                return simulate
            self.site.throttle(write=self.write)
            uri = self.site.scriptpath() + "/api.php"
            ssl = False
            if self.site.family.name in config.available_ssl_project:
                if action == "login" and config.use_SSL_onlogin:
                    ssl = True
                elif config.use_SSL_always:
                    ssl = True
            try:
                if self.mime:
                    # construct a MIME message containing all API key/values
                    container = MIMEMultipart(_subtype='form-data')
                    for key in self.params:
                        # key "file" requires special treatment in a multipart
                        # message
                        if key == "file":
                            local_filename = self.params[key]
                            filetype = mimetypes.guess_type(local_filename)[0] \
                                       or 'application/octet-stream'
                            file_content = file(local_filename, "rb").read()
                            submsg = MIMENonMultipart(*filetype.split("/"))
                            submsg.add_header("Content-disposition",
                                              "form-data", name=key,
                                              filename=local_filename)
                            submsg.set_payload(file_content)
                        else:
                            try:
                                self.params[key].encode("ascii")
                                keytype = ("text", "plain")
                            except UnicodeError:
                                keytype = ("application", "octet-stream")
                            submsg = MIMENonMultipart(*keytype)
                            submsg.add_header("Content-disposition", "form-data",
                                              name=key)
                            submsg.set_payload(self.params[key])
                        container.attach(submsg)
                    # strip the headers to get the HTTP message body
                    body = container.as_string()
                    marker = "\n\n" # separates headers from body
                    eoh = body.find(marker)
                    body = body[eoh + len(marker):]
                    # retrieve the headers from the MIME object
                    mimehead = dict(container.items())
                    rawdata = http.request(self.site, uri, ssl, method="POST",
                                           headers=mimehead, body=body)
                else:
                    rawdata = http.request(self.site, uri, ssl, method="POST",
                                headers={'Content-Type':
                                         'application/x-www-form-urlencoded'},
                                body=paramstring)
##                import traceback
##                traceback.print_stack()
##                print rawdata
            except Server504Error:
                pywikibot.log(u"Caught HTTP 504 error; retrying")
                self.wait()
                continue
            # TODO: what other exceptions can occur here?
            except Exception:
                # for any other error on the http request, wait and retry
                pywikibot.error(traceback.format_exc())
                pywikibot.log(u"%s, %s" % (uri, paramstring))
                self.wait()
                continue
            if not isinstance(rawdata, unicode):
                rawdata = rawdata.decode(self.site.encoding())
            pywikibot.debug(u"API response received:\n" + rawdata, _logger)
            if rawdata.startswith(u"unknown_action"):
                raise APIError(rawdata[:14], rawdata[16:])
            try:
                result = json.loads(rawdata)
            except ValueError:
                # if the result isn't valid JSON, there must be a server
                # problem.  Wait a few seconds and try again
                pywikibot.warning(
                    "Non-JSON response received from server %s; the server may be down."
                    % self.site)
                pywikibot.debug(rawdata, _logger)
                # there might also be an overflow, so try a smaller limit
                for param in self.params:
                    if param.endswith("limit"):
                        value = self.params[param]
                        try:
                            self.params[param] = str(int(value) // 2)
                            pywikibot.output(u"Set %s = %s"
                                             % (param, self.params[param]))
                        except (TypeError, ValueError):
                            pass
                self.wait()
                continue
            if not result:
                result = {}
            if not isinstance(result, dict):
                raise APIError("Unknown",
                               "Unable to process query response of type %s."
                                   % type(result),
                               {'data': result})
            if self['action'] == 'query':
                if 'userinfo' in result.get('query', ()):
                    if hasattr(self.site, '_userinfo'):
                        self.site._userinfo.update(result['query']['userinfo'])
                    else:
                        self.site._userinfo = result['query']['userinfo']
                status = self.site._loginstatus  # save previous login status
                if ( ("error" in result
                            and result["error"]["code"].endswith("limit"))
                      or (status >= 0
                            and self.site._userinfo['name']
                                != self.site._username[status])):
                    # user is no longer logged in (session expired?)
                    # reset userinfo, then make user log in again
                    del self.site._userinfo
                    self.site._loginstatus = -1
                    if status < 0:
                        status = 0  # default to non-sysop login
                    self.site.login(status)
                    # retry the previous query
                    continue
            if "warnings" in result:
                modules = [k for k in result["warnings"] if k != "info"]
                for mod in modules:
                    if '*' in result["warnings"][mod]:
                        text = result["warnings"][mod]['*']
                    elif 'html' in result["warnings"][mod]:
                        # Bugzilla 49978
                        text = result["warnings"][mod]['html']['*']
                    else:
                        # This is just a warning, we shouldn't raise an
                        # exception because of it
                        continue
                    pywikibot.warning(
                        u"API warning (%s): %s"
                        % (mod, text))
            if "error" not in result:
                return result
            if "*" in result["error"]:
                # help text returned
                result['error']['help'] = result['error'].pop("*")
            code = result["error"].pop("code", "Unknown")
            info = result["error"].pop("info", None)
            if code == "maxlag":
                lag = lagpattern.search(info)
                if lag:
                    pywikibot.log(
                        u"Pausing due to database lag: " + info)
                    self.site.throttle.lag(int(lag.group("lag")))
                    continue
            if code in (u'internal_api_error_DBConnectionError', ):
                self.wait()
                continue
            # raise error
            try:
                pywikibot.log(u"API Error: query=\n%s"
                               % pprint.pformat(self.params))
                pywikibot.log(u"           response=\n%s"
                               % result)
                raise APIError(code, info, **result["error"])
            except TypeError:
                raise RuntimeError(result)
Example #28
    def test_get(self):
        r = http.request(site=None, uri='http://www.wikipedia.org/')
        self.assertIsInstance(r, str if sys.version_info[0] >= 3 else unicode)
        self.assertIn('<html lang="mul"', r)
Example #29
def is_translation(page):
    url = "%s/index.php?title=%s" % (SITE.scriptpath(), page.title(asUrl=True))
    return '"wgTranslatePageTranslation":"translation"' in http.request(
        SITE, url)
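A hypothetical check using this helper (assuming pywikibot is imported and SITE is the module's site object; the page title is illustrative):

page = pywikibot.Page(SITE, 'Main_Page/de')
print(is_translation(page))  # True for pages created by the Translate extension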