Example #1
0
    def get_table_rows(self, conn):
        """Yield report rows for articles that use a ticker-symbol template
        in their bolded lead sentence.

        Each yielded row is ``[u'{{dbr link|1=<title>}}', '<count>']``,
        where the count is the number of ticker-template transclusions in
        the page's lead section; at most 1000 rows are produced.

        ``conn`` is unused here but kept for interface compatibility.
        """
        # Collect all template names in the ticker-symbol category
        # (namespace 10), with the "Template:" prefix stripped.
        templates_in_cat = set()
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': 'Category:Ticker symbol templates',
            'cmnamespace': 10,
            'cmlimit': 'max',
            'format': 'json'
        }
        request = wikitools.APIRequest(self.wiki, params)
        response = request.query(querycontinue=True)
        for member in response['query']['categorymembers']:
            templates_in_cat.add(member[u'title'].split(':', 1)[1])

        # Add every redirect to each template so the regexes below match
        # any spelling editors might have used in articles.
        template_variations = set(templates_in_cat)
        for template in templates_in_cat:
            template_variations |= self.get_template_redirects(template)

        # Fetch the lead section (section 0) of every article (namespace 0)
        # that transcludes one of the templates.
        page_texts = {}
        for template in templates_in_cat:
            params = {
                'action': 'query',
                'generator': 'embeddedin',
                'geititle': 'Template:%s' % template,
                'geinamespace': 0,
                'geilimit': 'max',
                'prop': 'revisions',
                'rvprop': 'content',
                'rvsection': 0,
                'format': 'json'
            }
            request = wikitools.APIRequest(self.wiki, params)
            response = request.query(querycontinue=True)
            try:
                pages = response['query']['pages']
            except KeyError:
                # This means no transclusions
                continue
            for page_id, page_data in pages.iteritems():
                page_texts[page_data['title']] = \
                    page_data['revisions'][0]['*']

        # Fix: template titles may contain regex metacharacters (dots,
        # parentheses, ...), so each alternative must be escaped before
        # being joined into the alternation pattern.
        alternatives = '|'.join(re.escape(t) for t in template_variations)
        ticker_templates_re = re.compile(
            r"\{\{(%s)\|" % alternatives, re.I)
        ticker_templates_in_lead_re = re.compile(
            r"'''.+\{\{(%s)\|" % alternatives, re.I)
        i = 1
        for title, text in page_texts.iteritems():
            if i > 1000:
                break
            instances = len(ticker_templates_re.findall(text))
            if ticker_templates_in_lead_re.search(text):
                yield [u'{{dbr link|1=%s}}' % title, str(instances)]
                i += 1
Example #2
0
    def getCatMembers(self):
        """Get the members of the specified category and their metadata.

        Example:
        http://meta.wikimedia.org/w/api.php?action=query&list=categorymembers&cmtype=page&cmtitle=Category:IEG/Proposals/IdeaLab&cmnamespace=200&cmprop=title|timestamp|ids&cmsort=timestamp&cmdir=desc&format=jsonfm

        Returns a list of dicts like
        {'page id': someid, 'page path': 'somepath', 'timestamp': 'sometimestamp'}
        (each augmented by getPageMetaData), or None when mem_type is not
        supported yet.
        """
        if self.mem_type != 'page':
            # Only page members are implemented so far.  Parenthesized
            # print works identically on Python 2 and 3 for one argument.
            print("not set up to get " + self.mem_type + " category members yet")
            return None

        query_params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': self.cat_title,
            'cmtype': self.mem_type,
            'cmnamespace': self.mem_namespace,
            'cmprop': 'title|timestamp|ids',
            'cmsort': 'timestamp',
            'cmdir': 'desc',
            'rawcontinue': '1',
        }
        req = wikitools.APIRequest(self.wiki, query_params)
        response = req.query()
        mem_list = [{
            'page id': str(x['pageid']),
            'page path': x['title'],
            'timestamp': x['timestamp']
        } for x in response['query']['categorymembers']]
        # Fix: the original ``for mem in mem_list: mem = ...`` rebinding was
        # a no-op that only worked because getPageMetaData mutates each dict
        # in place.  Collect the returned dicts explicitly instead.
        mem_list = [self.getPageMetaData(mem) for mem in mem_list]
        return mem_list
def last_log_entry(page):
    """Return the most recent protection log entry for *page*.

    The result is ``{'timestamp': ..., 'user': ..., 'comment': ...}``.
    Each value falls back to '' when the log is empty or the field is
    missing/hidden (e.g. suppressed).  The timestamp is reformatted from
    ISO 8601 to the compact MediaWiki form YYYYMMDDHHMMSS.
    """
    params = {
        'action': 'query',
        'list': 'logevents',
        'lelimit': '1',
        'letitle': page,
        'format': 'json',
        'ledir': 'older',
        'letype': 'protect',
        'leprop': 'user|timestamp|comment'
    }
    request = wikitools.APIRequest(wiki, params)
    response = request.query(querycontinue=False)
    lastlog = response['query']['logevents']
    # An empty log list behaves like an entry with no fields at all.
    entry = lastlog[0] if lastlog else {}
    try:
        timestamp = datetime.datetime.strptime(
            entry['timestamp'],
            '%Y-%m-%dT%H:%M:%SZ').strftime('%Y%m%d%H%M%S')
    except (KeyError, ValueError):
        # Fix: narrowed from a bare ``except:`` — only a missing or
        # malformed timestamp should fall back to ''.
        timestamp = ''
    user = entry.get('user', '')
    comment = entry.get('comment', '')
    return {'timestamp': timestamp, 'user': user, 'comment': comment}
Example #4
0
 def _resolve_redirects_to_templates(self, templates):
     """Return *templates* plus the names of all templates that redirect
     to any of the configured citation-needed templates.

     Names are returned without the namespace prefix.  When no wiki is
     configured (testing mode) the input set is returned unchanged.
     """
     templates = set(templates)
     if self._wikipedia is None:
         # Testing
         return templates
     params = {
         'action': 'query',
         'format': 'json',
         'prop': 'redirects',
         # The API resolves "Template:" to the relevant per-language prefix.
         'titles': '|'.join(
             'Template:' + tplname
             for tplname in self._cfg.citation_needed_templates),
         # Fix: prop=redirects uses the "rd" parameter prefix; the original
         # 'rnamespace' was not a valid parameter and was silently ignored.
         'rdnamespace': 10,
     }
     request = wikitools.APIRequest(self._wikipedia, params)
     # We could fall back to just using self._cfg.citation_needed_templates
     # if the API request fails, but for now let's just crash
     for result in request.queryGen():
         for page in result['query']['pages'].values():
             for redirect in page.get('redirects', []):
                 # TODO We technically only need to keep the templates that
                 # mwparserfromhell will consider different from one another
                 # (e.g., no need to have both Cn and CN)
                 if ':' not in redirect['title']:
                     # Not a template?
                     continue
                 tplname = redirect['title'].split(':', 1)[1]
                 templates.add(tplname)
     return templates
Example #5
0
    def _to_html(self, snippet):
        """Render *snippet* (wikitext) to cleaned-up HTML via the parse API.

        Returns '' if the API request fails, and returns the snippet
        unchanged when no wiki is configured (testing mode).
        """
        if self._wikipedia is None:
            # Testing
            return snippet

        params = {
            'action': 'parse',
            'format': 'json',
            'text': snippet,
        }
        request = wikitools.APIRequest(self._wikipedia, params)
        # FIXME Sometimes the request fails because the text is too long;
        # in that case, the API response is HTML, not JSON, which raises
        # an exception when wikitools tries to parse it.
        #
        # Normally this would cause wikitools to happily retry forever
        # (https://github.com/alexz-enwp/wikitools/blob/b71481796c350/wikitools/api.py#L304),
        # which is a bug, but due to our use of a custom opener, wikitools'
        # handling of the exception raises its own exception: the object returned
        # by our opener doesnt support seek().
        #
        # We use that interesting coincidence to catch the exception and move
        # on, bypassing wikitools' faulty retry, but this is obviously a terrible
        # "solution".
        try:
            html = request.query()['parse']['text']['*']
        except Exception:
            # Fix: narrowed from a bare ``except:`` so that SystemExit and
            # KeyboardInterrupt still propagate; the deliberate best-effort
            # fallback to '' is preserved for ordinary failures.
            return ''
        return self._cleanup_snippet_html(html)
Example #6
0
 def get_template_redirects(self, template_title):
     """Return the set of template names (namespace prefix stripped)
     whose pages redirect to ``Template:<template_title>``."""
     query = {
         'action': 'query',
         'list': 'backlinks',
         'bltitle': 'Template:%s' % template_title,
         'blnamespace': 10,
         'blfilterredir': 'redirects',
         'format': 'json'
     }
     api_request = wikitools.APIRequest(self.wiki, query)
     result = api_request.query(querycontinue=True)
     # Each backlink title looks like "Template:Foo"; keep only "Foo".
     return set(link[u'title'].split(':', 1)[1]
                for link in result['query']['backlinks'])
Example #7
0
def query_pageids(wiki, pageids):
    """Yield ``(page_id, title, text)`` for each requested page id.

    ``pageids`` may be any iterable of ints or strings.  Pages that are
    missing (no 'title' in the response) or whose latest revision text is
    empty are skipped.  Title and text are decoded with the module-level
    helper ``d``.
    """
    params = {
        'action': 'query',
        'pageids': '|'.join(map(str, pageids)),
        'prop': 'revisions',
        'rvprop': 'content'
    }

    request = wikitools.APIRequest(wiki, params)
    for response in request.queryGen():
        # Fix: renamed the loop variable so it no longer shadows the
        # builtin ``id``.
        for page_id, page in response['query']['pages'].items():
            if 'title' not in page:
                # Invalid or missing page id.
                continue
            title = d(page['title'])

            text = page['revisions'][0]['*']
            if not text:
                continue
            yield (page_id, title, d(text))
Example #8
0
    def queryApi(self, apiurl, query):
        """
        This function queries the API by running query on apiurl and outputs
        the result in JSON format.

        - apiurl (string): The URL to the API's base.
        - query (dict): A dictionary of API parameters.

        Returns: Dict with the API results.

        Raises: TypeError if query is not a dict.

        TODO: The API query should be reimplemented here so that we do not have
        the wikitools library requirement.
        """
        # Fix: validate before constructing the Wiki object, so a bad query
        # fails fast without performing any site-setup side effects, and use
        # isinstance() rather than an exact type() comparison.
        if not isinstance(query, dict):
            raise TypeError('Query parameter should be type dict'
                            ', got %s instead' % (type(query)))
        Wiki = wikitools.Wiki(apiurl)
        API = wikitools.APIRequest(Wiki, query)
        return API.query(querycontinue=False)
Example #9
0
    def getPageMetaData(self, mempage):
        # TODO: make this a call to profiles.py.
        """Attach extra metadata to *mempage* and return it.

        Currently only the local talk-page id is looked up (via prop=info
        with talkid|subjectid|url); the dict is mutated in place.
        """
        info_query = {
            'action': 'query',
            'titles': mempage['page path'],
            'prop': 'info',
            'inprop': 'talkid|subjectid|url',
            'rawcontinue': '1',
        }
        api_request = wikitools.APIRequest(self.wiki, info_query)
        result = api_request.query()
        page_key = str(mempage['page id'])
        try:
            talk_id = result['query']['pages'][page_key]['talkid']
            mempage['talkpage id'] = str(talk_id)
        except KeyError:
            # Pages without a talk page carry no 'talkid' key; probably not
            # necessary anymore if default params are added everywhere.
            mempage['talkpage id'] = ""
        return mempage
    # NOTE(review): this is the tail of a list comprehension — its opening
    # line (presumably ``chunks = [``) lies above this excerpt; it slices
    # wikidataCodes into chunkSize-sized batches for the API.
    wikidataCodes[x:x + chunkSize]
    for x in xrange(0, len(wikidataCodes), chunkSize)
]
#Fetch data
for c in chunks:
    print("Getting %d items from Wikidata..." % len(c))

    cQueryString = '|'.join(c)
    # wbgetentities returns labels and claims for all requested item ids
    # in a single request.
    params = {
        'action': 'wbgetentities',
        'languages': languagesQueryString,
        'props': 'labels|claims',
        'ids': cQueryString,
        'format': 'json'
    }
    request = wikitools.APIRequest(site, params)
    result = request.query()

    if "entities" in result:
        for qid, e in result["entities"].items():
            nation = {}

            # Look for translations (labels)
            nation["n"] = getLabelsFromEntity(e)

            # Look for properties
            if "claims" in e:

                # Look for entity properties.
                # These will be pointers to other WD entities, from which we
                # will get the names later.
                # NOTE(review): the body of the following loop is truncated
                # in this excerpt.
                for pkey, (pid, abbr) in settings.entityProperties.items():
Example #11
0
# Build report rows for up to 1000 random articles (namespace 0) whose
# wikitext contains no obvious image markup.
i = 1
output = []
params = {
    'action': 'query',
    'generator': 'random',
    'grnnamespace': 0,
    'grnlimit': 'max',
    'prop': 'revisions',
    'rvprop': 'content',
    'format': 'json'
}
# Keep requesting fresh random batches until 1000 matching pages have been
# collected; each APIRequest returns a new random set.
while True:
    if i > 1000:
        break
    request = wikitools.APIRequest(wiki, params)
    response = request.query(querycontinue=False)
    pages = response['query']['pages']
    for page_id, page_data in pages.iteritems():
        if i > 1000:
            break
        page_title = page_data['title']
        page_text = page_data['revisions'][0]['*']
        # Skip pages that appear to contain images: [[file:/[[image: links
        # or bare .jpg/.png/.gif mentions, case-insensitively.
        if not re.search(r'(\[\[(file:|image:)|\.jpg|\.png|\.gif)', page_text,
                         re.I):
            table_row = u"""\
|-
| %d
| {{dbr link|1=%s}}""" % (i, page_title)
            output.append(table_row)
            i += 1