    def procesPage(self, page):
        """
        Process a single page.
        """
        item = pywikibot.DataPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a wikidata item :(' % page)
            # TODO FIXME: We should provide an option to create the page
        else:
            pagetext = page.get()
            pagetext = pywikibot.removeDisabledParts(pagetext)
            templates = pywikibot.extract_templates_and_params(pagetext)
            for (template, fielddict) in templates:
                # We found the template we were looking for
                if template.replace(u'_', u' ') == self.templateTitle:
                    for field, value in fielddict.items():
                        # This field contains something useful for us
                        if field in self.fields:
                            # Check if the property isn't already set
                            claim = self.fields[field]
                            if claim in item.get().get('claims'):
                                pywikibot.output(
                                    u'A claim for %s already exists. Skipping'
                                    % (claim,))
                                # TODO FIXME: This is a very crude way of
                                # dupe checking
                            else:
                                # Try to extract a valid page link
                                match = re.search(
                                    r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]',
                                    value)
                                if match:
                                    try:
                                        link = match.group('title')
                                        linkedPage = pywikibot.Page(self.site,
                                                                    link)
                                        if linkedPage.isRedirectPage():
                                            linkedPage = \
                                                linkedPage.getRedirectTarget()
                                        linkedItem = pywikibot.DataPage(
                                            linkedPage)
                                        pywikibot.output(
                                            'Adding %s --> %s'
                                            % (claim, linkedItem.getID()))
                                        # self.site is an attribute, not a
                                        # callable; compute the source once
                                        source = self.setSource(
                                            self.site.language())
                                        if source:
                                            item.editclaim(str(claim),
                                                           linkedItem.getID(),
                                                           refs={source})
                                        else:
                                            item.editclaim(str(claim),
                                                           linkedItem.getID())
                                    except pywikibot.NoPage:
                                        pywikibot.output(
                                            "[[%s]] doesn't exist so I can't"
                                            " link to it"
                                            % linkedItem.title())
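The harvesting logic above hinges on pywikibot.extract_templates_and_params, which parses wikitext into (template name, parameter dict) pairs. A minimal sketch of what it yields for a hypothetical infobox; the template and field names are invented for illustration:

import pywikibot

# Hypothetical wikitext: one infobox whose field value holds a wikilink.
pagetext = u'{{Infobox person|birth_place=[[Berlin|the German capital]]}}'

for template, fielddict in pywikibot.extract_templates_and_params(pagetext):
    # template  -> u'Infobox person'
    # fielddict -> {u'birth_place': u'[[Berlin|the German capital]]'}
    pywikibot.output(u'%s: %s' % (template, fielddict))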
Example #2
    def loadTemplates(self, page, template, default=None):
        """Get the operating mode from a page by searching for the template.

           @param page: The user (page) for which the data should be retrieved.

           Returns a list of dicts with the template parameters found.
        """

        if default is None:  # avoid the mutable default-argument pitfall
            default = {}
        self._content = self.load(page)  # None if the page does not exist

        templates = []
        if not self._content:
            return templates  # catch an empty or non-existing page

        for tmpl in pywikibot.extract_templates_and_params(self._content):
            if tmpl[0] == template:
                param_default = {}
                param_default.update(default)
                param_default.update(tmpl[1])
                templates.append(param_default)
        return templates
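A sketch of how loadTemplates might be called from inside the same bot class; the configuration page title, template name, and parameter names are hypothetical:

# Hypothetical: read per-user settings stored as {{UserSettings|...}} on a
# configuration page, falling back to a default when a parameter is omitted.
config_page = pywikibot.Page(self.site, u'User:ExampleBot/config')
for params in self.loadTemplates(config_page, u'UserSettings',
                                 default={u'enabled': u'no'}):
    pywikibot.output(u'enabled = %s' % params[u'enabled'])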
Example #3
    def subTemplate(self, content, param):
        """Substitute the template tags in content according to param.

           @param content: Content with tags to substitute.
           @type  content: string
           @param param: Param with data how to substitute tags.
           @type  param: dict

           Returns a tuple containig the new content with tags
           substituted and a list of those tags.
        """

        substed_tags = []  # DRTRIGON-73
        metadata     = {'mw-signature': u'~~~~',
                        'mw-timestamp': u'~~~~~',}  # DRTRIGON-132

        # 0.2.) check for 'simple' mode and get additional params
        if param['simple']:
            p = self.site.getExpandedString(param['simple'])
            param.update(pywikibot.extract_templates_and_params(p)[0][1])

        # 0.5.) check cron/date
        if param['cron']:
            # [min] [hour] [day of month] [month] [day of week]
            # (date supported only, thus [min] and [hour] dropped)
            if not (param['cron'][0] == '@'):
                param['cron'] = '* * ' + param['cron']
            entry = crontab.CronTab(param['cron'])
            # find the delay from midnight (.next() never returns 0.0, only the next match)
            delay = entry.next(datetime.datetime.now().replace(hour=0,
                                                               minute=0,
                                                               second=0,
                                                               microsecond=0)- \
                               datetime.timedelta(microseconds=1))

            pywikibot.output(u'CRON delay for execution: %.3f (<= %i)'
                             % (delay, self._bot_config['CRONMaxDelay']))

            if not (delay <= self._bot_config['CRONMaxDelay']):
                return (content, substed_tags, metadata)

        # 1.) getUrl or wiki text
        # (security: check url not to point to a local file on the server,
        #  e.g. 'file://' - same as used in xsalt.py)
        secure = False
        for item in [u'http://', u'https://',
                     u'mail://', u'local://', u'wiki://']:
            secure = secure or (param['url'][:len(item)] == item)
        param['zip'] = ast.literal_eval(param['zip'])
        if not secure:
            return (content, substed_tags, metadata)
        if   (param['url'][:7] == u'wiki://'):
            url = param['url'][7:].strip('[]')              # enable wiki-links
            if ast.literal_eval(param['expandtemplates']):  # DRTRIGON-93 (only with 'wiki://')
                external_buffer = pywikibot.Page(self.site,
                                                 url).get(expandtemplates=True)
            else:
                external_buffer = self.load( pywikibot.Page(self.site, url) )
        elif (param['url'][:7] == u'mail://'):              # DRTRIGON-101
            url = param['url'].replace(u'{{@}}', u'@')     # e.g. nlwiki
            mbox = SubsterMailbox(
              pywikibot.config.datafilepath(self._bot_config['data_path'],
                                            self._bot_config['mbox_file'], ''))
            external_buffer = mbox.find_data(url)
            mbox.close()
        elif (param['url'][:8] == u'local://'):             # DRTRIGON-131
            if (param['url'][8:] == u'cache/state_bots'):
                # filename hard-coded
                d = shelve.open(pywikibot.config.datafilepath('cache',
                                                              'state_bots'))
                external_buffer = pprint.pformat(
                    ast.literal_eval(pprint.pformat(d)))
                d.close()
            else:
                external_buffer = u'n/a'
        else:
            # consider using 'expires', 'last-modified', 'etag' in order to
            # make the updating data requests more efficient! use those stored
            # on page, if the user placed them, else use the conventional mode.
            # http://www.diveintopython.net/http_web_services/etags.html
            f_url, external_buffer = http.request(self.site, param['url'],
                                                  no_hostname = True,
                                                  back_response = True)
            headers = f_url.headers # same like 'f_url.info()'
            #if param['zip']:
            if ('text/' not in headers['content-type']):
                pywikibot.output(u'Source is of non-text content-type, '
                                 u'using raw data instead.')
                external_buffer = f_url.read()
            del f_url               # free some memory (no need to keep copy)

            for h in ['content-length', 'date', 'last-modified', 'expires']:
                if h in headers:
                    metadata['url-%s' % h] = headers[h]

        # some intermediate processing (unzip, xlsx2csv, ...)
        if param['zip']:    # 'application/zip', ...
            fileno          = 0 if (param['zip'] == True) else (param['zip']-1)
            external_buffer = self.unzip(external_buffer, fileno)
        if param['xlsx']:   # 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            external_buffer = self.xlsx2csv(external_buffer, param['xlsx'])
        if param['ods']:    # 'application/vnd.oasis.opendocument.spreadsheet'
            external_buffer = self.ods2csv(external_buffer, param['ods'])

        if not ast.literal_eval(param['beautifulsoup']):    # DRTRIGON-88
            # 2.) regexp
            #for subitem in param['regex']:
            subitem = param['regex']
            regex = re.compile(subitem, re.S | re.I)

            # 3.) subst in content
            external_data = regex.search(external_buffer)

            external_data_dict = {}
            if external_data:    # not None
                external_data = external_data.groups()

                pywikibot.output(u'Groups found by regex: %i'
                                 % len(external_data))

                # DRTRIGON-114: Support for named groups in regexs
                if regex.groupindex:
                    for item in regex.groupindex:
                        external_data_dict[u'%s-%s' % (param['value'], item)] = \
                            external_data[regex.groupindex[item] - 1]
                elif (len(external_data) == 1):
                    external_data_dict = {param['value']: external_data[0]}
                else:
                    external_data_dict = {param['value']: str(external_data)}
            pywikibot.debug( str(external_data_dict) )

            param['postproc'] = eval(param['postproc'])
            # should be secured as given below, but needs code changes in wiki too
            #param['postproc'] = ast.literal_eval(param['postproc'])
            for value in external_data_dict:
                external_data = external_data_dict[value]

                # 4.) postprocessing
                func  = param['postproc'][0]    # needed by exec call of self._code
                DATA  = [ external_data ]       #
                args  = param['postproc'][1:]   #
                scope = {}                      # (scope to run in)
                scope.update( locals() )        # (add DATA, *args, ...)
                scope.update( globals() )       # (add imports and else)
                if func:
                    exec(self._code + (self._bot_config['CodeTemplate'] % func), scope, scope)
                    external_data = DATA[0]
                pywikibot.debug( external_data )

                # 5.) subst content
                (content, tags) = self.subTag(content, value, external_data, int(param['count']))
                substed_tags += tags
        else:
            # DRTRIGON-105: Support for multiple BS template configurations
            value = param['value']
            if value:
                value += u'-'

            # DRTRIGON-88: Enable Beautiful Soup power for Subster
            BS_tags = self.get_BS_regex(value).findall(content)

            pywikibot.output(u'BeautifulSoup tags found by regex: %i' % len(BS_tags))

            prev_content = content

            BS = BeautifulSoup.BeautifulSoup(external_buffer)
            for item in BS_tags:
                external_data = eval('BS.%s' % item[1])
                external_data = self._BS_regex_str % {
                    'var1': value + 'BS:' + item[1],
                    'var2': value,
                    'cont': external_data}
                content = content.replace(item[0], external_data, 1)

            if (content != prev_content):
                substed_tags.append(value+'BS')

        metadata['bot-timestamp'] = pywikibot.Timestamp.now().isoformat(' ')

        return (content, substed_tags, metadata)
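Step 0.5 above leans on the crontab module's CronTab.next(), which returns the number of seconds until the next matching time. A standalone sketch of that midnight-delay computation, assuming the same crontab package is available; the cron entry itself is hypothetical:

import datetime
import crontab

# Date-only entry: [min] and [hour] are padded with '* *', as in subTemplate.
entry = crontab.CronTab('* * 1 1 *')  # hypothetical: matches every 1 January

midnight = datetime.datetime.now().replace(hour=0, minute=0,
                                           second=0, microsecond=0)
# .next() never returns 0.0, so step back one microsecond to catch 'today'
delay = entry.next(midnight - datetime.timedelta(microseconds=1))
print('CRON delay for execution: %.3f seconds' % delay)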
Example #4
def iterate_monuments_on_page(page):
    for (tpl, params) in pywikibot.extract_templates_and_params(page.get()):
        if tpl != u'műemlék':
            continue
        yield params
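A sketch of driving this generator; the site and page are hypothetical, and pywikibot.Site follows the core API (older compat scripts would use pywikibot.getSite instead):

site = pywikibot.Site('hu', 'wikipedia')
page = pywikibot.Page(site, u'Szeged')  # hypothetical page using {{műemlék}}
for params in iterate_monuments_on_page(page):
    pywikibot.output(u'%s' % params)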