Example #1
def get_captions(title):
    params = {
        'action': 'query',
        'list': 'allpages',
        'apfrom': title.split(':', 1)[1],
        'aplimit': '100',
        'apnamespace': '10'
    }
    data = api(**params)
    langs = {}
    prefix = title + ' '
    for item in data['query']['allpages']:
        if item['title'].startswith(prefix):
            lang = item['title'].split('(')[1].split(')')[0]
            langs[lang] = item['title']
    text = ''
    for lang in sorted(langs):
        lang_name = get_language_name(lang)
        content = page_content(langs[lang])
        if content.strip().startswith('#REDIRECT'):
            # redirects carry no caption of their own, so skip them
            continue
        code = mwparserfromhell.parse(content)
        try:
            temp = code.filter_templates()[0]
        except IndexError:
            continue
        caption_code = temp.get(1).value
        # We want templates like {{w|FooBar}} to render, so expand them
        expanded = expand_templates(unicode(caption_code))
        caption = unicode(mwparserfromhell.parse(expanded).strip_code())
        text += '%s: %s\n' % (lang_name, caption)

    return text
Example #2
def parse():
    text = rfd.get()
    code = mwparserfromhell.parse(text)
    requests = []
    for section in code.get_sections()[1:]:
        data = {'section': section}
        header = unicode(section.filter_headings()[0])
        data['header'] = header
        text = mwparserfromhell.parse(unicode(section).replace(header + '\n', ''))
        data['text'] = text
        #print text
        item = None
        for template in text.filter_templates():
            if unicode(template.name).startswith('Rfd group'):
                data['type'] = 'bulk'
                break
            elif template.name == 'rfd links':
                data['type'] = 'single'
                item = template.get(1).value
                break
        if item:
            item = pywikibot.ItemPage(repo, item)
        data['item'] = item
        requests.append(data)
    return requests
Example #3
 def update(self, push=True):
     self.fetch_info()
     self.parse_info()
     print self.LOCATION
     print self.CATEGORY
     print self.ABOUT
     print self.MOVEMENT
     print self.PRESSURE
     print self.WINDS
     #print self.UTC_TIMESTAMP
     # now actually update the wiki page
     text = self.wikipage.get()
     code = mwparserfromhell.parse(text)
     main = pywikibot.Page(self.wikipage.site, '2012 Atlantic hurricane season')
     main_text = main.get()
     main_code = mwparserfromhell.parse(main_text)
     for template in code.filter_templates():
         name = template.name.lower().strip()
         if name == 'Infobox hurricane current'.lower():
             if template.get('name').value.strip() == 'Hurricane Sandy':
                 template.get('time').value = self.UTC_TIMESTAMP
                 template.get('category').value = self.CATEGORY
                 template.get('gusts').value = self.format_wind(self.WINDS)
                 template.get('lat').value = self.LOCATION['latc']
                 template.get(1).value = self.LOCATION['latd']
                 template.get('lon').value = self.LOCATION['lonc']
                 template.get(2).value = self.LOCATION['lond']
                 template.get('movement').value = self.format_movement(self.MOVEMENT)
                 template.get('pressure').value = self.format_pressure(self.PRESSURE)
     pywikibot.showDiff(text, unicode(code))
     if push:
         self.wikipage.put(unicode(code), 'Bot: Updating hurricane infobox. Errors? [[User talk:Legoktm|report them!]]')
Example #4
 def test_multiple_nodes_spaces(self):
     snippet = "foo [[link1]] [[link2]] [[link3]] bar"
     wikicode = mwparserfromhell.parse(snippet)
     self._do_test(wikicode, "[[link1]]", "foo [[link2]] [[link3]] bar")
     wikicode = mwparserfromhell.parse(snippet)
     self._do_test(wikicode, "[[link2]]", "foo [[link1]] [[link3]] bar")
     wikicode = mwparserfromhell.parse(snippet)
     self._do_test(wikicode, "[[link3]]", "foo [[link1]] [[link2]] bar")
Example #5
 def test_multiple_nodes_newlines(self):
     snippet = "[[link1]]\n[[link2]]\n[[link3]]"
     wikicode = mwparserfromhell.parse(snippet)
     self._do_test(wikicode, "[[link1]]", "[[link2]]\n[[link3]]")
     wikicode = mwparserfromhell.parse(snippet)
     self._do_test(wikicode, "[[link2]]", "[[link1]]\n[[link3]]")
     wikicode = mwparserfromhell.parse(snippet)
     self._do_test(wikicode, "[[link3]]", "[[link1]]\n[[link2]]")
Example #6
    def cleanup_sectionlink(self, section_title):
        code = mwparser.parse(section_title)
        template = code.filter_templates()
        if len(template) == 1 and template[0].name.matches(('Erl', 'erl')):
            section_title = template[0].get(1)

        title = mwparser.parse(unicode(section_title))
        clean_title = title.strip_code(normalize=True, collapse=True).strip()
        return clean_title
Example #7
	def __init__(self, title=None, text=None):
		super(Article, self).__init__(text=text)
		self.title = title
		self.paragraphs = None
		self.readable_text = None
		self.lede_length = 1
		if title is not None:
			self.page = pwb.Page(site, title)
			self.text = mwp.parse(self.page.text)
			self.wikitext = mwp.parse(self.page.text) 
Example #8
 def test_contains(self):
     """test Wikicode.contains()"""
     code = parse("Here is {{aaa|{{bbb|xyz{{ccc}}}}}} and a [[page|link]]")
     tmpl1, tmpl2, tmpl3 = code.filter_templates()
     tmpl4 = parse("{{ccc}}").filter_templates()[0]
     self.assertTrue(code.contains(tmpl1))
     self.assertTrue(code.contains(tmpl3))
     self.assertFalse(code.contains(tmpl4))
     self.assertTrue(code.contains(str(tmpl4)))
     self.assertTrue(code.contains(tmpl2.params[0].value))
Example #9
 def wikicode(self):
     """
     Return the parsed wikitext (mwparserfromhell.wikicode.Wikicode object)
     """
     if not self._wikicode:
         try:
             self._wikicode = mwparserfromhell.parse(self.wikitext)
         except SystemError:
             self._wikicode = mwparserfromhell.parse('')
     return self._wikicode
Example #10
def _parse_revs_into_wcode(rev_text_dict):
    result = []
    for rev_id in rev_text_dict:
        try:
            result.append((rev_id, mwp.parse(rev_text_dict[rev_id])))
        except mwp.parser.ParserError as e:
            logger.warning(e)
            logger.warning('Error parsing {0}'.format(rev_id))
            result.append((rev_id, mwp.parse('')))
    return result
Example #11
def page_f(pg):
    count = 0
    text = pg.get()
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates(recursive=True):
        if template.name.lower().strip() in CITE_TEMPLATES:
            url = template.get('url').value.strip()
            if 'msnbc.com' in url:
                continue
            isup = is_up(url)
            if isup:
                continue
            if template.has_param('archiveurl'):
                #if template.has_param('deadurl'):
                #    if template.get('deadurl').value.strip() == 'no':
                #        template.remove('deadurl')
                #        template.add('deadurl', 'yes')
                #        continue
                continue
            #find it on archive.org
            ai_url = archive_page(url)
            if not ai_url:
                print 'Not found. :('
                continue
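            # web.archive.org URLs embed a 14-digit YYYYMMDDhhmmss snapshot timestamp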
            raw_date = ai_url[27:27+14]
            year = int(raw_date[:4])
            day = int(raw_date[6:8])
            month_num = int(raw_date[4:6])
            month = MONTH_NAMES[month_num-1]
            template.add('archiveurl', ai_url)
            template.add('deadurl', 'yes')
            template.add('archivedate', '%s %s %s' % (day, month, year))
            count += 1

    #lets remove all the {{dead link}} now
    code = unicode(code)
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', code):
        p = mwparserfromhell.parse(tag.group(2))
        has_archive = False  # whether this ref's cite template already has an archiveurl
        for template in p.filter_templates():
            if template.name.lower().strip() in CITE_TEMPLATES:
                if template.has_param('archiveurl'):
                    has_archive = True
            elif template.name.lower().strip() in DEAD_LINK:
                if has_archive:
                    del p.nodes[p.nodes.index(unicode(template))]
                    code = code.replace(tag.group(2), unicode(p))
    if text == code:
        print 'No changes made on %s' % pg.title(asLink=True)
        return
    pywikibot.showDiff(text, unicode(code))
    if raw_input('Save?').lower() == 'y':
        pg.put(unicode(code), 'Manually-assisted archive url fetching.')
Example #12
 def test_matches(self):
     """test Wikicode.matches()"""
     code1 = parse("Cleanup")
     code2 = parse("\nstub<!-- TODO: make more specific -->")
     self.assertTrue(code1.matches("Cleanup"))
     self.assertTrue(code1.matches("cleanup"))
     self.assertTrue(code1.matches("  cleanup\n"))
     self.assertFalse(code1.matches("CLEANup"))
     self.assertFalse(code1.matches("Blah"))
     self.assertTrue(code2.matches("stub"))
     self.assertTrue(code2.matches("Stub<!-- no, it's fine! -->"))
     self.assertFalse(code2.matches("StuB"))
Example #13
def process_page(page):
    text = page.get()
    text, blah = AWB.do_page(text, date=False)
    code = mwparserfromhell.parse(text)
    urls = []
    for m in urlregex.MATCH_URL.finditer(unicode(code)):
        u = m.group(0)
        if u.startswith(('http://ap.google', 'https://ap.google')):
            urls.append(u)
    """
    buffer = unicode(code)
    for template in code.filter_templates():
        for url in urls:
            if url in template:
                if template.has_param('archiveurl'):
                    urls.remove(url)
                else:
                    buffer = buffer.replace(unicode(template), unicode(template)+TAG)
                    urls.remove(url)
    code = buffer
    """
    #find ref tags
    loop1 = False
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', unicode(code)):
        for url in urls:
            if url in tag.group(2):
                for template in mwparserfromhell.parse(tag.group(2)).filter_templates():
                    if template.has_param('archiveurl'):
                        try:
                            urls.remove(url)
                        except ValueError:
                            pass
                        loop1 = True
                if loop1:
                    break
                if 'dead link' in tag.group(0).lower():
                    urls.remove(url)
                else:
                    code = unicode(code).replace(tag.group(0), '<ref'+tag.group(1)+'>'+tag.group(2)+TAG+'</ref>')
                    urls.remove(url)
            if loop1:
                loop1 = False
                break
    if urls:
        print 'STILL HAVE THESE LEFT: '+', '.join(urls)

    pywikibot.showDiff(text, unicode(code))
    if text != unicode(code):
        page.put(unicode(code), 'Bot: Tagging ap.google.* links with {{dead link}}')
        return True
    else:
        return None
Example #14
 def load_stub_templates(self):
     self.stub_templates = []
     st = pywikibot.Page(self.site, 'Wikipedia:WikiProject Stub sorting/Stub types')
     text = st.get()
     code = mwparserfromhell.parse(text)
     for template in code.filter_templates():
         if template.name.startswith('Wikipedia:WikiProject Stub sorting/Stub types/'):
             st_page = pywikibot.Page(self.site, unicode(template.name))
             text = st_page.get()
             code = mwparserfromhell.parse(text)
             for template in code.filter_templates():
                 if template.name.lower() == 'tl':
                     self.stub_templates.append(unicode(template.get(1).value).lower())
Example #15
    def test_transform(self):
        wcode_list = [mwp.parse('{{Infobox something | thing}}'
                                '{{not-one else}}'
                                '{{infobox again}}'),
                      mwp.parse('{{Infobox num1 | thing}}'
                                '{{not-one else}}'
                                '{{infobox num2}}')]

        result = ifb._transform(wcode_list)

        self.assertEqual(len(result), 2)
        self.assertEqual(result[0], 'infobox-something infobox-again')
        self.assertEqual(result[1], 'infobox-num1 infobox-num2')
Example #16
    def process_page(self, page):
        text = page.get()
        text, blah = self.AWB.do_page(text, date=False)
        code = mwparserfromhell.parse(text)
        urls = []
        for m in urlregex.MATCH_URL.finditer(unicode(code)):
            u = m.group(0)
            if self.matching.search(u):
                urls.append(u)
            else:
                pass
                #print 'Did not match: '+u
        #find ref tags
        loop1 = False
        for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', unicode(code)):
            for url in urls:
                if url in tag.group(2):
                    for template in mwparserfromhell.parse(tag.group(2)).filter_templates():
                        if template.has_param('archiveurl'):
                            try:
                                urls.remove(url)
                            except ValueError:
                                pass
                            loop1 = True
                    if loop1:
                        break
                    if 'dead link' in tag.group(0).lower():
                        urls.remove(url)
                    elif 'wayback' in tag.group(0).lower():
                        urls.remove(url)
                    elif 'webcite' in tag.group(0).lower():
                        urls.remove(url)
                    else:
                        code = unicode(code).replace(tag.group(0), '<ref'+tag.group(1)+'>'+tag.group(2)+self.tag+'</ref>')
                        urls.remove(url)
                if loop1:
                    loop1 = False
                    break
        if urls:
            print 'STILL HAVE THESE LEFT: '+', '.join(urls)

        pywikibot.showDiff(text, unicode(code))
        if text != unicode(code):
            if self.simulate:
                print 'Not editing, just simulating.'
                return None
            page.put(unicode(code), 'Bot: Tagging %s links with {{dead link}}' % self.domain)
            return True
        else:
            return None
Example #17
    def _test_search(self, meth, expected):
        """Base test for insert_before(), insert_after(), and replace()."""
        code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
        func = partial(meth, code)
        func("{{b}}", "x", recursive=True)
        func("{{d}}", "[[y]]", recursive=False)
        func(code.get(2), "z")
        self.assertEqual(expected[0], code)
        self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True)
        self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False)
        fake = parse("{{a}}").get(0)
        self.assertRaises(ValueError, func, fake, "n", recursive=True)
        self.assertRaises(ValueError, func, fake, "n", recursive=False)

        code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}")
        func = partial(meth, code2)
        func(code2.get(1), "c", recursive=False)
        func("{{a}}", "d", recursive=False)
        func(code2.get(-1), "e", recursive=True)
        func("{{b}}", "f", recursive=True)
        self.assertEqual(expected[1], code2)

        code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}")
        func = partial(meth, code3)
        obj = code3.get(0).params[0].value.get(0)
        self.assertRaises(ValueError, func, obj, "x", recursive=False)
        func(obj, "x", recursive=True)
        self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False)
        func("{{f}}", "y", recursive=True)
        self.assertEqual(expected[2], code3)

        code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}")
        func = partial(meth, code4)
        fake = parse("{{b}}{{c}}")
        self.assertRaises(ValueError, func, fake, "q", recursive=False)
        self.assertRaises(ValueError, func, fake, "q", recursive=True)
        func("{{b}}{{c}}", "w", recursive=False)
        func("{{d}}{{e}}", "x", recursive=True)
        func(wrap(code4.nodes[-2:]), "y", recursive=False)
        func(wrap(code4.nodes[-2:]), "z", recursive=True)
        self.assertEqual(expected[3], code4)
        self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False)
        self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True)

        code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}")
        func = partial(meth, code5)
        self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False)
        func("{{b}}{{c}}", "x", recursive=True)
        obj = code5.get(0).params[1].value.get(0).params[0].value
        self.assertRaises(ValueError, func, obj, "y", recursive=False)
        func(obj, "y", recursive=True)
        self.assertEqual(expected[4], code5)

        code6 = parse("here is {{some text and a {{template}}}}")
        func = partial(meth, code6)
        self.assertRaises(ValueError, func, "text and", "ab", recursive=False)
        func("text and", "ab", recursive=True)
        self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False)
        func("is {{some", "cd", recursive=True)
        self.assertEqual(expected[5], code6)
Example #18
    def test_get_ancestors_parent(self):
        """test Wikicode.get_ancestors() and Wikicode.get_parent()"""
        code = parse("{{a|{{b|{{d|{{e}}{{f}}}}{{g}}}}}}{{c}}")
        tmpl = code.filter_templates(matches=lambda n: n.name == "f")[0]
        parent1 = code.filter_templates(matches=lambda n: n.name == "d")[0]
        parent2 = code.filter_templates(matches=lambda n: n.name == "b")[0]
        parent3 = code.filter_templates(matches=lambda n: n.name == "a")[0]
        fake = parse("{{f}}").get(0)

        self.assertEqual([parent3, parent2, parent1], code.get_ancestors(tmpl))
        self.assertIs(parent1, code.get_parent(tmpl))
        self.assertEqual([], code.get_ancestors(parent3))
        self.assertIs(None, code.get_parent(parent3))
        self.assertRaises(ValueError, code.get_ancestors, fake)
        self.assertRaises(ValueError, code.get_parent, fake)
Example #19
    def section_to_request(enumerated_section_tuple):
        enum_number, section_tuple = enumerated_section_tuple
        section_header, section_wikitext = section_tuple
        section = mwparserfromhell.parse(section_wikitext)
        r = Request()
        r.row_number = enum_number + 1
        r.title = section_header
        r.replies = unicode(section).count(u"(UTC)") - 1
        signatures = []
        for index, each_node in enumerate(section.nodes):
            if type(each_node) == mwparserfromhell.nodes.text.Text and "(UTC)" in each_node:

                # Get the last timestamp-looking thing (trick from http://stackoverflow.com/a/2988680/1757964)
                for timestamp_match in TIMESTAMP.finditer(unicode(each_node)): pass
                try:
                    timestamp = datetime.datetime.strptime(timestamp_match.group(0), SIGNATURE_TIME_FORMAT)
                except ValueError:
                    timestamp = "{{unknown}}"

                # Use the last user talk page link before the timestamp
                for user_index in itertools.count(index - 1, -1):
                    user = USER.search(unicode(section.get(user_index)))
                    if user:
                        user = user.group(1)
                        break

                # Check for user renames/redirects
                # likely intent of the redacted line: resolve renamed/redirected user pages
                user_page = pywikibot.Page(wiki, "User:" + user)
                if user_page.isRedirectPage():
                    user = user_page.getRedirectTarget().title().split(":")[1]

                signatures.append((user, timestamp))
        # Process usernames by removing anchors
        signatures = [(x.partition('#')[0], y) for x, y in signatures]

        # Default values for everything
        r.last_editor, r.last_edit_time = r.last_botop_editor, r.last_botop_time = "{{no result|None}}", "{{n/a}}"

        if signatures:
            r.last_editor, r.last_edit_time = signatures[-1]
            for user, timestamp in reversed(signatures):
                if is_botop(wiki, user):
                    r.last_botop_editor, r.last_botop_time = user, timestamp
                    break
        return r
Example #20
 def mapper_final(self):
     for entry in self.d:
         links_list = []
         working_text = self.d[entry][1]            
         try:
             wikicode = mw.parse(working_text)
             links_list = wikicode.filter_wikilinks() 
          except Exception:
             pass
         article_links = 0
         links_title = []
         if links_list != []:
             for link in links_list:
                 links_title.append(link.title)
             for h in range(0, len(links_title)):
                 links_title[h]=links_title[h].strip().lower()
             title_set = Set(links_title)
             article_links=len(title_set)
         yield "a", article_links
         yield "s", article_links*article_links
         yield "n", 1
     for k in range(0, self.e):
         yield "a", 1
         yield "s", 1
         yield "n", 1
Example #21
  def parse_infobox(self, title, page):
    '''Parse out the nice mediawiki markup to get birth and death
    Input:
      mediawiki unicode page string
    Returns:
      a dictionary with name (string), birth_date:DateTime, death_date:DateTime
    '''
    code = mwparserfromhell.parse(page)
    for template in code.filter_templates():
      if 'Infobox' in template.name:
        # Found the right template -- attempting to extract data
        output = dict(title=title)
        
        for key in ['name', 'birth_name']:
          if template.has(key):
            output['name'] = template.get(key).value.strip()
        
        for date in ['birth_date', 'death_date']:
          try:
            item = self.parse_date(template.get(date))
          except ValueError:
            item = None
          output[date] = item

        # ok we are done here
        return output
    raise InfoError()
Example #22
def get_template_info(template_checker, commonscat_mapper, text, monument_id=''):
    if not monument_id:
        return {
            "id_not_found": True,
            "category": get_most_specific_category(commonscat_mapper, text),
            "missing_monument_id": True
        }
    id_count = 0
    info = {}
    templates = mwparserfromhell.parse(text).filter_templates()
    for template in template_checker.filter_allowed_templates(templates):
        if template_checker.get_id(template) != monument_id:
            continue
        if id_count:
            id_count += 1
            continue
        id_count = 1
        info = {
            "template": unicode(template),
            "category": get_most_specific_category(commonscat_mapper, text, template),
            "valid_id": template_checker.has_valid_id(template),
            "image_exists": image_exists(template)
        }
    if info:
        info["duplicate_ids"] = id_count > 1
    else:
        info["id_not_found"] = True
        info["category"] = get_most_specific_category(commonscat_mapper, text)
    return info
Example #23
def extract_templates_and_params(text):
    """Return a list of templates found in text.

    Return value is a list of tuples. There is one tuple for each use of a
    template in the page, with the template title as the first entry and a
    dict of parameters as the second entry.  Parameters are indexed by
    strings; as in MediaWiki, an unnamed parameter is given a parameter name
    with an integer value corresponding to its position among the unnamed
    parameters, and if this results in multiple parameters with the same name
    only the last value provided will be returned.

    This uses a third party library (mwparserfromhell) if it is installed
    and enabled in the user-config.py. Otherwise it falls back on a
    regex based function defined below.

    @param text: The wikitext from which templates are extracted
    @type text: unicode or string

    """

    if not (config.use_mwparserfromhell and mwparserfromhell):
        return extract_templates_and_params_regex(text)
    code = mwparserfromhell.parse(text)
    result = []
    for template in code.filter_templates(recursive=True):
        params = {}
        for param in template.params:
            params[unicode(param.name)] = unicode(param.value)
        result.append((unicode(template.name.strip()), params))
    return result
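
A minimal usage sketch for the function above (hypothetical wikitext; output shown approximately):

text = u"{{cite web|url=http://example.org|title=Example}}"
for name, params in extract_templates_and_params(text):
    print name, params
# -> cite web {u'url': u'http://example.org', u'title': u'Example'}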
Example #24
def ensure_flagged_by_template(wikicode, node, template_name, *template_parameters, overwrite_parameters=True):
    """
    Makes sure that ``node`` in ``wikicode`` is immediately (except for
    whitespace) followed by a template with ``template_name`` and optional
    ``template_parameters``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag
    :param template_parameters: optional template parameters
    :returns: the template flag, as a
        :py:class:`mwparserfromhell.nodes.template.Template` object
    """
    parent = get_parent_wikicode(wikicode, node)
    adjacent = get_adjacent_node(parent, node, ignore_whitespace=True)

    if template_parameters:
        flag = "{{%s}}" % "|".join([template_name, *template_parameters])
    else:
        flag = "{{%s}}" % template_name
    flag = mwparserfromhell.parse(flag).nodes[0]
    assert(isinstance(flag, mwparserfromhell.nodes.Template))

    if isinstance(adjacent, mwparserfromhell.nodes.Template) and adjacent.name.matches(template_name):
        # in case of {{Dead link}} we want to preserve the original parameters
        if overwrite_parameters is True:
            wikicode.replace(adjacent, flag)
        else:
            flag = adjacent
    else:
        wikicode.insert_after(node, flag)

    assert(get_parent_wikicode(wikicode, flag) is parent)
    return flag
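
A sketch of the intended effect, assuming the module's get_parent_wikicode and get_adjacent_node helpers are available:

wikicode = mwparserfromhell.parse("[http://example.org dead site]")
link = wikicode.filter_external_links()[0]
ensure_flagged_by_template(wikicode, link, "Dead link", "date=May 2021")
print(wikicode)  # -> [http://example.org dead site]{{Dead link|date=May 2021}}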
Example #25
def main():
    wikidata = pywikibot.getSite('wikidata','wikidata')
    page = pywikibot.Page(wikidata, 'User:Legobot/Dupes')
    text = page.get()
    code = mwparser.parse(text)
    templates = code.filter_templates()
    for template in templates:
        if template.name != 'rfd links':
            continue
        qid = str(template.get(1).value)
        reason = str(template.get(2).value)
        dupe = link.search(reason)
        if not dupe:
            print 'Error: Cannot parse the deletion reason, skipping.'
            continue
        other = pywikibot.Page(wikidata, dupe.group(1))
        if not other.exists():
            print 'Uhoh, the other copy doesn\'t exist. Won\'t delete.'
            continue
        print 'Will delete {0} because: {1}'.format(qid, reason)
        page = pywikibot.Page(wikidata, qid)
        if not page.exists():
            print 'Uhoh, someone already deleted it!'
            continue
        page.delete(reason, prompt=False)
        print 'Destroyed. Will sleep a bit.'
Example #26
def section_processing(section, tpl_header, categories, articleTitle):
	# tpl_header = copy.deepcopy(tpl_header)
	tom_tpl = section[0]
	content = section[1]
	section_tpl_wikicode = mwp.parse(tom_tpl)
	for tpl in section_tpl_wikicode.filter_templates():
		if tpl.name.matches('tom'):
			num_izd = str(tpl.get(1).value).strip()
			# {{tom}} with no counterpart among the editions: no 2nd parameter, or a 3rd one present
			tpls_noTerm = []
			# for tpl in wikicode.filter_templates():
			# 	if tpl.name.matches('tom'):
			if not tpl.has(2) or tpl.get(2).value == '' \
					or (tpl.has(3) and tpl.get(3).value != ''):
				# tpls_noTerm.append(tpl)
				# tpl_header.add(tpls_noTerm)
				tpl_header.add('-ТСД%s' % num_izd, '%s<!-- временный шаблон для бота -->' % str(tpl))  # the embedded wikitext comment reads "temporary template for the bot"
				return None

			title_ed = '%s%s/%s' % (PAGENAME_PREFIX, num_izd, articleTitle)
			# articleName = articleTitle.partition('/')[0]
			tpl_header.add('ТСД%s' % num_izd, articleTitle.partition('/')[0])
			# page_new = makePage(num_izd, title_ed, tpl_header, content, categories)
			page_new = [num_izd, title_ed, tpl_header, content]
			return page_new
Example #27
def extract_plain_text(wiki_body):
    wikicode = mwparserfromhell.parse(wiki_body)
    plain_text = ""
    for node in wikicode.nodes:
        type_of_node = type(node)
        if type_of_node == mwparserfromhell.nodes.template.Template:
            continue
        if type_of_node == mwparserfromhell.nodes.argument.Argument:
            continue
        if type_of_node == mwparserfromhell.nodes.comment.Comment:
            continue
        if type_of_node == mwparserfromhell.nodes.html_entity.HTMLEntity:
            continue
        if type_of_node != mwparserfromhell.nodes.text.Text:
            if type(node) == mwparserfromhell.nodes.tag.Tag:
                str_node = node.contents
            elif type(node) == mwparserfromhell.nodes.external_link.ExternalLink:
                str_node = node.title
            elif type(node) == mwparserfromhell.nodes.heading.Heading:
                str_node = node.title
            elif type(node) == mwparserfromhell.nodes.wikilink.Wikilink:
                str_node = node.title
            plain_text += extract_plain_text(str_node)
        else:
            plain_text += str(node)
    return re.sub(r'\([^)]*\)', '', plain_text)
Example #28
    def get_assessments(self, rev_content):
        '''
        For the given revision content, get all assessments.

        @param rev_content: wikitext content of the given talk page revision
                            we're assessing
        @type rev_content: unicode
        '''

        parsed_code = mwp.parse(rev_content)
        templates = parsed_code.filter_templates()
        assessments = []
        for temp in templates:
            if re.match(r'wikiproject\s+',
                        unicode(temp.name),
                        re.I) \
                or unicode(temp.name) in self.translations \
                or temp.has_param('class'):
                project = unicode(temp.name).lower()
                try:
                    rating = unicode(temp.get('class').value).strip().lower()
                except ValueError:
                    continue # no assessment class in template
                importance = None
                if temp.has_param('importance'):
                    importance = unicode(temp.get('importance').value).strip().lower()
                assessments.append(Assessment(rating,
                                              importance,
                                              project))
        # return all assessments
        return assessments
Example #29
    def add_template(self):
        if not self.adtTitle:
            return  # silently fail

        adtPage = pywikibot.Page(self.site, self.adtTitle, ns=1)
        code = mwparserfromhell.parse(adtPage.text)

        war_adt_added = False
        for template in code.filter_templates(recursive=False):
            if template.name.matches("AdT-Vorschlag Hinweis"):
                code.remove(template)
                pywikibot.output(u'D:AdT: {{AdT-Vorschlag Hinweis}} gefunden,'
                                 u'entfernt')
            if template.name.matches("War AdT"):
                if not any(self.snapDate in p for p in template.params):
                    template.add(str(len(template.params)+1), self.snapDate)
                    pywikibot.output(u'D:AdT: {{War AdT}} '
                                     u'gefunden, füge heute hinzu')
                war_adt_added = True
        text = unicode(code)
        if not war_adt_added:
            template = u'{{War AdT|1=' + self.snapDate + u'}}\n'
            text = self.__add_templ(text, template)

        if adtPage.text != text:
            pywikibot.showDiff(adtPage.text, text)  # debug
            adtPage.text = text
            if not self.dry:
                adtPage.save(comment=templateComment, botflag=True, minor=True)
Example #30
    def test_index(self):
        """test Wikicode.index()"""
        code = parse("Have a {{template}} and a [[page|link]]")
        self.assertEqual(0, code.index("Have a "))
        self.assertEqual(3, code.index("[[page|link]]"))
        self.assertEqual(1, code.index(code.get(1)))
        self.assertRaises(ValueError, code.index, "foo")

        code = parse("{{foo}}{{bar|{{baz}}}}")
        self.assertEqual(1, code.index("{{bar|{{baz}}}}"))
        self.assertEqual(1, code.index("{{baz}}", recursive=True))
        self.assertEqual(1, code.index(code.get(1).get(1).value,
                                       recursive=True))
        self.assertRaises(ValueError, code.index, "{{baz}}", recursive=False)
        self.assertRaises(ValueError, code.index,
                          code.get(1).get(1).value, recursive=False)
Example #31
 def test_readme_4(self):
     """test a block of example code in the README"""
     text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}"
     code = mwparserfromhell.parse(text)
     for template in code.filter_templates():
         if template.name.matches("Cleanup") and not template.has("date"):
             template.add("date", "July 2012")
     res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}}"
     self.assertPrint(code, res)
     code.replace("{{uncategorized}}", "{{bar-stub}}")
     res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
     self.assertPrint(code, res)
     if py3k:
         res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']"
     else:
         res = "[u'{{cleanup|date=July 2012}}', u'{{bar-stub}}']"
     self.assertPrint(code.filter_templates(), res)
     text = str(code)
     res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
     self.assertPrint(text, res)
     self.assertEqual(text, code)
Example #32
def process_wikipedia_article(title, text, template='Infobox film'):
    wikipedia_article_parser = mwparserfromhell.parse(text)

    # Determine whether the article is about a movie or not based on whether it includes the film infobox.
    movie_infobox_matches = wikipedia_article_parser.filter_templates(
        matches=template)

    if len(movie_infobox_matches) >= 1:
        # Extract information from infobox.
        # Don't actually need, but keep in case it's useful later.
        # properties = {param.name.strip_code().strip(): param.value.strip_code().strip() for param in movie_infobox_matches[0].params if param.value.strip_code().strip()}

        # Extract internal wikilinks.
        internal_links = [
            link.title.strip_code().strip()
            for link in wikipedia_article_parser.filter_wikilinks()
        ]

        internal_links = filter_out_most_common_links(internal_links)

        return {'title': title, 'internal_links': internal_links}
Example #33
    def mapper_get_page(self, _, line):
        if '<page>' in line:
            self.page_single = []
            self.page_status = 1

        if self.page_status == 1:
            self.page_single.append(line)

        if '</page>' in line:
            #self.page_single.append(line)
            if self.page_status == 1:
                page = ''.join(self.page_single)
                root = etree.XML(page)
                content = root.xpath("//revision")[-1].xpath(".//text")[0].text
                self.page_status = 0
                if content:
                    content = mwparserfromhell.parse(content).strip_code()
                    yield None, content
            else:
                self.page_status = 0
                self.page_single = []
Example #34
def keep_only_includes(
    wikicode: mwparserfromhell.wikicode.Wikicode
) -> mwparserfromhell.wikicode.Wikicode:
    """Keeps only the onlyincludes tags if any"""
    only_include_present = False
    to_remove = list()
    for tag in wikicode.filter_tags(
            recursive=False):  # select only the most external one
        if tag.tag.matches('onlyinclude'):
            only_include_present = True
        else:
            to_remove.append(tag)
    if only_include_present:
        for tag in to_remove:
            try:
                wikicode.remove(tag)
            except ValueError:
                pass
    wikicode = mwparserfromhell.parse(
        re.sub(onlyinclude_tag, '', str(wikicode)))
    return wikicode
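
A usage sketch, assuming the module-level onlyinclude_tag regex matches the literal <onlyinclude> markers:

wikicode = mwparserfromhell.parse("intro <onlyinclude>kept part</onlyinclude> outro")
print(keep_only_includes(wikicode))  # -> intro kept part outro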
Example #35
def process_text(text):
    parsed = mwparserfromhell.parse(text)
    titles = {}
    editions = []

    for tag in parsed.filter():
        if isinstance(tag, mwparserfromhell.nodes.heading.Heading):
            titles[tag.level] = tag.title
        elif isinstance(tag, mwparserfromhell.nodes.wikilink.Wikilink):
            if link_is_pdf(tag) and titles.get(2) == 'Music files':
                editions[-1].add_pdf(tag)
        elif isinstance(tag, mwparserfromhell.nodes.template.Template):
            if tag.name == 'CPDLno':
                editions.append(Edition())
                editions[-1].cpdlno_params = strparams(tag)
            elif tag.name == 'Editor':
                editions[-1].editor_params = strparams(tag)
            elif tag.name == 'ScoreInfo':
                editions[-1].score_info = strparams(tag)

    return editions
Example #36
    def __init__(self, element: Element):
        self.ignored = False

        for child in element.getchildren():
            if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}redirect":
                self.ignored = True
                return
            if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}ns" and child.text != "0":
                self.ignored = True
                return
            if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}title":
                self.title = child.text
            if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}id":
                self.id = child.text
            if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}revision":
                for revision in child.getchildren():
                    if revision.tag == "{http://www.mediawiki.org/xml/export-0.10/}text":
                        self.body = mwparserfromhell.parse(revision.text).strip_code()
                        break

        element.clear()
Example #37
 def mapper_final(self):
     for entry in self.d:
         working_text = self.d[entry][1]
         wikicode = mw.parse(working_text)
         links_list = wikicode.filter_wikilinks()
         article_links = 0
         links_title = []
         if links_list != []:
             for link in links_list:
                 links_title.append(link.title)
             for h in range(0, len(links_title)):
                 links_title[h] = links_title[h].strip().lower()
             title_set = Set(links_title)
             article_links = len(title_set)
         yield "a", article_links
         yield "s", article_links * article_links
         yield "n", 1
     for k in range(0, self.e):
         yield "a", 1
         yield "s", 1
         yield "n", 1
Example #38
def make_new_wikicode(text, form_data, page_name):
    wikicode = mwparserfromhell.parse(text)
    change_made = False
    for template in wikicode.filter_templates():
        edit = main.TemplateEdit(template, page_name)
        if edit.classification == 'ignored' or edit.classification == 'rejected':
            continue
        proposed_addition = form_data.get(edit.orig_hash)
        user_checked = form_data.get(edit.orig_hash + '-addlink')
        if proposed_addition and user_checked == 'checked':
            # Go through one or more suggestions separated by pipe
            for proposed_parameter in proposed_addition.split("|"):
                try:
                    # Get the new wikitext for the template with this parameter added
                    edit.update_template(proposed_parameter)
                    change_made = True
                except ValueError:
                    app.logger.exception(
                        'update_template failed on {}'.format(page_name))
                    pass  # TODO report to the user
    return unicode(wikicode), change_made
Example #39
def get_bill_for_page(page):
    for template in mwparserfromhell.parse(page["text"]).filter_templates():
        if template.name.strip() == "Infobox U.S. legislation":
            #print page["title"].encode("utf8")
            billref = get_bill_from_infobox(template)
            if billref:
                try:
                    if billref[0] == "PL":
                        # Get by public law number.
                        return Bill.objects.get(congress=billref[1],
                                                sliplawpubpriv="PUB",
                                                sliplawnum=billref[2])
                    elif billref[0] == "BILL":
                        # It's a bill number.
                        return Bill.objects.get(congress=billref[1],
                                                bill_type=BillType.by_slug(
                                                    billref[2]),
                                                number=billref[3])
                except Bill.DoesNotExist:
                    return None
    return None
Example #40
def expand_templates(defn, keep_lb=True, rm_gram=False):
    from mwparserfromhell import parse
    from wikiparse.assoc.identispan import identispan_text_rm

    wikicode = parse(defn)

    for t in wikicode.filter_templates(recursive=False):
        if t.name in ("l", "link"):
            wikicode.replace(t, "[[{}]]".format(expand_templates(t.get(2))))
        elif t.name in ("lb",) and keep_lb:
            wikicode.replace(t, "({})".format(expand_templates(t.get(2))))
        elif t.name in ("gloss", "qualifier"):
            wikicode.replace(t, "({})".format(expand_templates(t.get(1))))
        else:
            wikicode.remove(t)

    defn = str(wikicode)
    if rm_gram:
        defn = identispan_text_rm(defn)

    return defn
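
A usage sketch (hypothetical definition text; output shown approximately):

print(expand_templates("{{lb|en|informal}} {{l|en|dog}}"))  # -> (informal) [[dog]]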
Example #41
    def parse_proposal_page(self, page_name):
        """
        Parses a proposal page to extract metadata about the property to create.

        :param page_name: the name of the proposal page
        """
        self.page_name = page_name
        text = self.get_page_over_api(PROPERTY_PROPOSAL_PREFIX + page_name)
        wikicode = mwparserfromhell.parse(cleanup_text(text.encode('utf-8')))

        for node in wikicode.filter(forcetype=(Template, Heading)):
            if isinstance(node, Heading):
                self.latest_labels = self.parse_translatable(node.title)
            elif isinstance(node, Template):
                template = node
                if (unicode(template.name).strip() == 'Property proposal'
                        and template.get('status').value.strip() == 'ready'):
                    self.parse_proposal_template(template)
                    self.users = self.extract_users(wikicode)
                    break
        self.orig_wikicode = wikicode
Example #42
def location_from_grid(grid,
                       e,
                       n,
                       digits,
                       view_direction,
                       use6fig,
                       mapit=None):
    latstr, lonstr, prec = latlon_from_grid(grid, e, n, digits, use6fig)
    precstr = "{:g}".format(prec)
    paramstr = "source:" + source_from_grid(grid, e, n, digits)
    region = region_of(grid, e, n, latstr, lonstr, mapit)
    if region is not None:
        paramstr += "_region:{}".format(region)
    if view_direction is not None:
        paramstr += "_heading:{}".format(view_direction)
    t = Template(mwparserfromhell.parse('Location'))
    t.add(1, latstr)
    t.add(2, lonstr)
    t.add(3, paramstr)
    t.add('prec', precstr)
    return t
Example #43
    def update_and_save(self, page, lookup):
        text = page.text()
        wikitext = mwparserfromhell.parse(text)
        for template in wikitext.filter_templates():
            if template.name.matches(['Listplayer/Current']):
                player = template.get('1').value.strip()
                if player not in lookup:
                    template.add('squad', '')
                    continue
                template.add('squad', lookup[player])

        newtext = str(wikitext)
        if text != newtext:
            # print('Saving page %s...' % page.name)
            try:
                self.site.save(page, newtext, summary=self.SUMMARY)
            except EditError:
                self.site.log_error_content(
                    page.name, 'Spam filter prohibited squad point update')
        else:
            pass
Example #44
def handle_existing_page(player_page, player_name):
    """
    :type player_page: pywikibot.page.Page
    :type player_name: str
    """

    parsed_mw_text = mwparserfromhell.parse(player_page.text)
    football_player_template = parsed_mw_text.filter_templates(matches=player_template_name)[0]

    arguments = __get_football_player_template(player_name)

    for argument_name, argument_value in arguments.items():
        if str(argument_value) != football_player_template.get(argument_name).value and SHOULD_SHOW_DIFF:
            logger.info("Found diff between arguments on this argument_name: {arg_name}\n"
                        "existing value: {existing_value}\nnew_value: {new_value}".
                        format(arg_name=argument_name, existing_value=football_player_template.get(argument_name).value,
                               new_value=argument_value))

            football_player_template.add(argument_name, argument_value)

    player_page.text = str(parsed_mw_text)
Example #45
    def process_article(self, title, text, template='Infobox person'):
        """Process a wikipedia article looking for template"""

        # Create a parsing object
        wikicode = mwparserfromhell.parse(text)

        # Search through templates for the template
        matches = wikicode.filter_templates(matches=template)
        raw_year_string = 'EMPTY'
        birth_year = 'EMPTY'
        infobox = ''
        if len(matches) >= 1:
            # Extract information from infobox
            for match in matches:
                infobox = str(match)
                for param in match.params:
                    if param.name.strip_code().strip() == 'birth_date':
                        raw_year_string = str(param.value)
                        birth_year = self.get_birth_year(raw_year_string)
            summary = self.get_summary(wikicode.strip_code().strip())
            return (title, birth_year, summary, raw_year_string, infobox)
Example #46
 def mapper(self, _, line):
     try:
         self._chunk += line.strip()
         if re.search(r"</page>", line):
             text = ''
             self._slurping = False
             root = etree.fromstring(self._chunk, parser)
             texts = root and root.xpath('//text')
             if texts:
                 text = texts[0].text
             if text:
                 lset = set()
                 mwp = mwparserfromhell.parse(text)
                 links = mwp.filter_wikilinks()
                 for link in links:
                     match = parselink.search(unicode(link))
                     lset.add(match.groups()[0])
                 yield None, len(lset)
             self._chunk = ''
      except Exception:
         self._chunk = ''
Example #47
 def test_readme_5(self):
     """test a block of example code in the README; includes a web call"""
     url1 = "https://en.wikipedia.org/w/api.php"
     url2 = "https://en.wikipedia.org/w/index.php?title={0}&action=raw"
     title = "Test"
     data = {"action": "query", "prop": "revisions", "rvprop": "content",
             "rvslots": "main", "rvlimit": 1, "titles": title,
             "format": "json", "formatversion": "2"}
     try:
         raw = urlopen(url1, urlencode(data).encode("utf8")).read()
     except IOError:
         self.skipTest("cannot continue because of unsuccessful web call")
     res = json.loads(raw.decode("utf8"))
     revision = res["query"]["pages"][0]["revisions"][0]
     text = revision["slots"]["main"]["content"]
     try:
         expected = urlopen(url2.format(title)).read().decode("utf8")
     except IOError:
         self.skipTest("cannot continue because of unsuccessful web call")
     actual = mwparserfromhell.parse(text)
     self.assertEqual(expected, actual)
Example #48
def count_in_article(article_name):
    soup = bsoup(
        requests.get('https://en.wikipedia.org/wiki/Special:Export/' +
                     article_name).text, 'lxml')
    pagetext = soup.find('text').text

    wikilinks = mwph.parse(pagetext).filter_wikilinks()
    for wl in wikilinks:
        if wl[:7].lower() == '[[file:' or wl[:11].lower() == '[[category:':
            pagetext = pagetext.replace(str(wl), '')

    for wl in wikilinks:
        if wl[:7].lower() != '[[file:' and wl[:11].lower() != '[[category:':
            pagetext = pagetext.replace(str(wl), str(wl)[2:-2])

    wikitemplates = mwph.parse(pagetext).filter_templates()
    for wt in wikitemplates:
        pagetext = pagetext.replace(str(wt), '')

    comments = mwph.parse(pagetext).filter_comments()
    for comment in comments:
        pagetext = pagetext.replace(str(comment), '')

    external_links = mwph.parse(pagetext).filter_external_links()
    for ex_l in external_links:
        pagetext = pagetext.replace(str(ex_l), '')

    headings = mwph.parse(pagetext).filter_headings()
    for heading in headings:
        pagetext = pagetext.replace(str(heading), str(heading).strip('='))

    html_entities = mwph.parse(pagetext).filter_html_entities()
    for h_ent in html_entities:
        pagetext = pagetext.replace(str(h_ent), '')

    pagetext = filter_elements(pagetext, '{| class="wikitable', '|}')
    pagetext = filter_elements(pagetext, '{| class="infobox', '|}')
    pagetext = filter_elements(pagetext, '{| class="floatright', '|}')
    pagetext = filter_elements(pagetext, '{{cite', '}}')
    pagetext = filter_elements(pagetext, '<', '>')

    count_dict = {
        'wikilinks': len(wikilinks),
        'words': len(remove_punctuation(nltk.word_tokenize(pagetext))),
        'sentences': len(nltk.sent_tokenize(pagetext))
    }

    return count_dict
Example #49
 def run(self):
     while True:
         page, qid = self.queue.get()
         code = mwparser.parse(page.get())
         found = False
         template = ''  # to make pycharm shut up
         for template in code.filter_templates():
             if template.name.lower().strip() in redirects:
                 found = True
                 break
         if not found:
             print 'Could not find template on ' + page.title()
             return
         data = dictify(template)
         d = list()
         for nm in data:
             if not data[nm] or len(data[nm]) > 250:
                 continue
             d.append((None, page.title(), deqid(qid), nm, data[nm]))
         parsed.put(d)
         self.queue.task_done()
Example #50
def allow_bots(text, user):
    user = user.lower().strip()
    text = mwparserfromhell.parse(text)
    for tl in text.filter_templates():
        if tl.name in ('bots', 'nobots'):
            break
    else:
        return True
    for param in tl.params:
        bots = [x.lower().strip() for x in param.value.split(",")]
        if param.name == 'allow':
            if ''.join(bots) == 'none': return False
            for bot in bots:
                if bot in (user, 'all'):
                    return True
        elif param.name == 'deny':
            if ''.join(bots) == 'none': return True
            for bot in bots:
                if bot in (user, 'all'):
                    return False
    return True
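
A quick usage sketch (hypothetical page text):

text = "{{bots|deny=SpamBot,OtherBot}} Lorem ipsum."
allow_bots(text, 'SpamBot')   # -> False
allow_bots(text, 'ThirdBot')  # -> True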
Example #51
def fix_text(text):
    code = mwparserfromhell.parse(text)
    newtext = ''
    for index, x in enumerate(code.nodes):
        flag = False
        #print repr(x)
        if index != 0 and isinstance(x, mwparserfromhell.nodes.Text):
            if x.endswith('\n\n'):
                if isinstance(code.nodes[index - 1],
                              mwparserfromhell.nodes.Tag) and str(
                                  code.nodes[index - 1]) == '*':
                if len(code.nodes) > index + 1:  # make sure nodes[index + 1] exists
                        if isinstance(code.nodes[index + 1],
                                      mwparserfromhell.nodes.Tag) and str(
                                          code.nodes[index + 1]) == '*':
                            #print 'trimming'
                            flag = True
                            newtext += unicode(x)[:-1]
        if not flag:
            newtext += unicode(x)
    return newtext
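
A quick sketch of the collapse this performs (a blank line between two list items):

fix_text('* one\n\n* two')  # -> '* one\n* two'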
Example #52
    def WIR_member_parse_wikilinks(self, page):
        set_members = set()
        try:
            query = self.url_page + page
            response = requests.get(query).json()
            pages = response['query']['pages']
            for page in pages:
                page_text = pages[page]['revisions'][0]['*']
                wikicode = mwp.parse(page_text)
                for link in wikicode.filter_wikilinks():
                    # likely intent of the redacted lines:
                    if link.startswith("[[User:"):
                        user_text = link.replace("[[User:", "").replace("]]", "")
                        set_members.add(user_text)

        except Exception:
            print("Error when parsing WIR pages")

        print("Identified {} members from the page: {}.".format(
            len(set_members), page))
        return set_members
Example #53
    def process_template(self, template):
        for page in template.getReferences(onlyTemplateInclusion=True,
                                           namespaces=0):
            try:
                text = page.get()
            except pywikibot.Error:
                continue
            else:
                code = mwparserfromhell.parse(text)

            for t in code.ifilter_templates():
                name = t.name.lower().strip()
                if name == self.singles_template.title(withNamespace=False).lower():
                    for p in t.params:
                        if "date" in p.name:
                            for t2 in p.value.ifilter_templates():
                                if t2.name.lower().strip() in self.start_date_template_titles:
                                    date = self._get_date(t2, False)
                                    if date is not None:
                                        p.value.replace(t2, date)
                elif name == self.episode_list_template.title(withNamespace=False).lower():
                    if t.has_param("AltDate"):
                        for t2 in t.get("AltDate").value.ifilter_templates():
                            if t2.name.lower().strip() in self.start_date_template_titles:
                                date = self._get_date(t2)
                                if date is not None:
                                    t.get("AltDate").value.replace(t2, date)
            if text != code:
                try:
                    page.put(
                        code,
                        "[[Wikipedia:Bots|Bot]]: Replacing {{[[Template:Start date|start date]]}} with the actual date"
                        " (it should only be used once in a template that emits microformats;"
                        " see [[Template:Start date/doc]])")
                except pywikibot.Error:
                    continue
Example #54
def parse_person(rec):
    wikitext = rec['wikitext']
    parsed = mwparserfromhell.parse(wikitext)

    words = [w.lower() for w in WORD_RE.findall(parsed.strip_code())]
    word_count = len(words)
    word_counts = Counter(words)
    gender_words = {w: word_counts[w] for w in ('him', 'his', 'he', 'her', 'she')}

    res = {}
    for template in parsed.filter_templates():
        if template.name.lower().startswith('infobox'):
            for param in template.params:
                res[param.name.strip().lower()] = param.value
    wikilinks = [str(x.title) for x in parsed.filter_wikilinks()]
    locations = []
    for k in 'birth_place', 'death_place':
        if k in res:
            locations += [str(x.title) for x in res[k].filter_wikilinks()]

    born = None
    died = None
    for wl in parsed.filter_wikilinks():
        title = str(wl.title)
        if title.startswith(CAT_PREFIX):
            if title.endswith(BIRTH_POSTFIX):
                born = tolerant_int(title[len(CAT_PREFIX): -len(BIRTH_POSTFIX)])
            if title.endswith(DIED_POSTFIX):
                died = tolerant_int(title[len(CAT_PREFIX): -len(DIED_POSTFIX)])

    return {'person_name': rec['title'],
            'wiki_id': rec['wiki_id'],
            'infobox': rec['infobox'],
            'locations': locations,
            'word_count': word_count,
            'gender_words': gender_words,
            'view_count': rec['viewcount'],
            'wikilinks': wikilinks,
            'born': born,
            'died': died}
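
parse_person depends on several module-level names defined elsewhere (WORD_RE, CAT_PREFIX, BIRTH_POSTFIX, DIED_POSTFIX, tolerant_int). One plausible set of definitions, offered only as an assumption so the example is self-contained:

import re
from collections import Counter  # used by parse_person above

WORD_RE = re.compile(r"\w+")   # crude word tokenizer for the counts
CAT_PREFIX = "Category:"       # wikilink prefix marking category pages
BIRTH_POSTFIX = " births"      # e.g. [[Category:1879 births]]
DIED_POSTFIX = " deaths"       # e.g. [[Category:1955 deaths]]

def tolerant_int(s):
    # Category years are sometimes fuzzy ("c. 1879"); give up gracefully.
    try:
        return int(s.strip())
    except ValueError:
        return None
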
Example #55
    def run(self):
        where_condition = ' OR '.join(
            ['MSG.{} IS NOT NULL'.format(_) for _ in self.vod_params])
        vod_options = ['MSG.{}'.format(_) for _ in self.vod_params]
        fields = [
            'COALESCE({})=Vod'.format(', '.join(vod_options)),
            'MSG._pageName=MSGPage', 'SG._pageName=SBPage',
            'SG.N_MatchInPage=N_MatchInPage', 'SG.N_GameInMatch=N_GameInMatch'
        ]
        result = self.site.cargo_client.query(
            tables="MatchScheduleGame=MSG,ScoreboardGames=SG",
            join_on="MSG.GameId=SG.GameId",
            where=f"(SG.VOD IS NULL AND SG._pageName IS NOT NULL AND ({where_condition}))"
                  f" OR (SG.VOD != COALESCE(MSG.Vod, MSG.VodPB, MSG.VodGameStart, MSG.VodPostgame))",
            fields=', '.join(fields),
            order_by='SG._pageName, SG.N_MatchInPage',  # group same pages consecutively
        )

        current_page = {
            'page': None,
            'wikitext': None,
            'page_name': None,
        }
        for item in result:
            if current_page['page_name'] != item['SBPage']:
                if current_page['page'] is not None:
                    self.save_page(current_page)
                current_page['page_name'] = item['SBPage']
                current_page['page'] = self.site.client.pages[
                    current_page['page_name']]
                current_page['wikitext'] = mwparserfromhell.parse(
                    current_page['page'].text())
                # print('Discovered page {}'.format(current_page['page_name']))
            self.add_vod_to_page(item, current_page['wikitext'])

        # we need to catch the last iteration too (assuming we actually did anything)
        if current_page['page'] is not None:
            self.save_page(current_page)
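
save_page is not shown. Assuming self.site.client is an mwclient Site (as the pages[...] lookup suggests), a minimal sketch might be the following; the method name, edit summary, and API choice are assumptions:

    def save_page(self, current_page):
        # Hypothetical: push the edited wikitext back (classic mwclient API).
        current_page['page'].save(str(current_page['wikitext']),
                                  summary='Updating VOD links')
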
Example #56
    def do_page(self, page):
        #print page.title(asLink=True).encode('utf-8')
        if page.namespace() != 6:
            return
        text = page.get()
        text, gen_fix_summary = self.AWBGenFixes.do_page(text)
        code = mwparserfromhell.parse(text)
        tag = False
        log = '* '
        summary = 'Bot: Updating license tag(s) with image has rationale=yes (errors? [[User:Legobot/Stop/22|stop me]])'
        for template in code.filter_templates(recursive=True):
            name = pywikibot.removeDisabledParts(template.name.lower()).strip()
            #print self.NFURs
            #time.sleep(5)
            if name in self.NFURs:
                tag = True
        if tag:
            for template in code.filter_templates(recursive=True):
                name = pywikibot.removeDisabledParts(template.name.lower()).strip()
                if name in self.licenses:
                    template.add('image has rationale', 'yes')
                    log += '[[:%s]]: Adding <code>|image has rationale=yes</code>' % page.title()
        else:
            #print 'Skipping ' + page.title(asLink=True).encode('utf-8')
            return
        #if gen_fix_summary:
        #    summary += ', also dating ' + gen_fix_summary
        puttext = unicode(code).lstrip('\n')
        pywikibot.showDiff(text, puttext)
        self.output(log)
        self.check_page()
        try:
            page.text = puttext
            page.save(summary, async=True, nocreate=True)
        except pywikibot.exceptions.PageNotSaved:
            pass
        except pywikibot.exceptions.LockedPage:
            pass
Example #57
class test_parented_ifilter:
    wikicode = mwparserfromhell.parse("""<span>
            foo {{bar|some text and {{another|template}}}}
            </span>
            {{foo|bar}}
            """)

    def test_recursive(self):
        nodes = []
        for parent, node in parented_ifilter(self.wikicode,
                                             recursive=True):
            nodes.append(node)
            assert parent.index(node) >= 0
        assert nodes == self.wikicode.filter(recursive=True)

    def test_nonrecursive(self):
        nodes = []
        for parent, node in parented_ifilter(self.wikicode,
                                             recursive=False):
            nodes.append(node)
            assert parent.index(node) >= 0
        assert nodes == self.wikicode.filter(recursive=False)

    def test_recursive_templates(self):
        templates = []
        for parent, template in parented_ifilter(self.wikicode,
                                                 forcetype=mwparserfromhell.nodes.template.Template,
                                                 recursive=True):
            templates.append(template)
            assert parent.index(template) >= 0
        assert templates == self.wikicode.filter_templates(recursive=True)

    def test_nonrecursive_templates(self):
        templates = []
        for parent, template in parented_ifilter(self.wikicode,
                                                 forcetype=mwparserfromhell.nodes.template.Template,
                                                 recursive=False):
            templates.append(template)
            assert parent.index(template) >= 0
        assert templates == self.wikicode.filter_templates(recursive=False)
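
parented_ifilter itself is not part of mwparserfromhell. A minimal sketch of an implementation these tests could pass, assuming mwparserfromhell's internal Node.__children__() generator (which yields each node's child Wikicode objects in document order):

import mwparserfromhell

def parented_ifilter(wikicode, recursive=True, forcetype=None):
    # Yield (parent, node) pairs in the same preorder that Wikicode.ifilter()
    # uses, so callers can mutate the tree through the parent Wikicode.
    for node in wikicode.nodes:
        if forcetype is None or isinstance(node, forcetype):
            yield wikicode, node
        if recursive:
            for child in node.__children__():
                yield from parented_ifilter(child, recursive=True,
                                            forcetype=forcetype)
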
Example #58
    def run(self):
        matches = self.parser.run()
        i = 0
        match: Match = matches[i]
        cur_page = None  # trailing index for printing at the end
        for page in self.data_pages:
            cur_page = page
            text = page.text()
            wikitext = mwparserfromhell.parse(text)
            for template in wikitext.filter_templates():
                template: Template
                if template.name.matches('MatchSchedule'):
                    # allow for the possibility of partially updating an event
                    # that starts in the latter half of a toornament scrape, e.g. playoffs
                    # n.b. we can only do this if we added correct page and n_in_page tagging
                    # when we first created the event
                    if template.has('page', ignore_empty=True) and \
                            template.has('n_in_page', ignore_empty=True):
                        while match.page < int(template.get('page').value.strip()) \
                                or match.index_in_page < int(template.get('n_in_page').value.strip()):
                            i += 1
                            if i >= len(matches):
                                break
                            match = matches[i]
                    team1 = template.get('team1').value.strip()
                    team2 = template.get('team2').value.strip()
                    # TODO: some team validation? however remember there can be disambiguation,
                    # TODO: so parse out anything in () when doing validation
                    if match.completed:
                        match.merge_into(template)

                    # do a normal increment here; this is necessary for legacy
                    # behavior in case the in_page indices aren't defined
                    i += 1
                    if i >= len(matches):
                        break
                    match = matches[i]
            self.site.save(page, str(wikitext), summary=self.summary)
        return 'https://lol.gamepedia.com/' + cur_page.name.replace(' ', '_')
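
The Match objects consumed above come from self.parser.run() and are not defined here. Their apparent shape, reconstructed purely as an assumption from the attributes used:

from dataclasses import dataclass

@dataclass
class Match:
    # Hypothetical shape inferred from usage; the real class may differ.
    page: int            # which data page the match was tagged with
    index_in_page: int   # ordinal of the match within that page
    completed: bool      # only completed matches are merged

    def merge_into(self, template):
        # Would copy scores/results into the MatchSchedule template.
        raise NotImplementedError
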
Example #59
def actuallista(pllista, diccipa, pagprova=False):
    resultat = u""
    origen = pllista.title()
    text = pllista.get()
    text0 = text
    code = mwparserfromhell.parse(text)
    t = code.filter_templates()
    #print(t)
    for template in t:
        #print(template.name)
        if template.name.matches("Filera IPA"):
            if template.has("wikidata"):
                wd = template.get("wikidata").value.strip()
                wd = re.sub("<!-- no ?[Ww][Dd] ?auto -->", "", wd)
                #print(wd)
            else:
                wd = ""
            if wd == "" and template.has("nomcoor"):
                nombusca = template.get("nomcoor").value.strip()
                nombusca = nombusca.split("(")[0].strip()
                print("For", template.get("nomcoor").value.strip(),
                      "we look up the name:", nombusca)
                if nombusca in diccipa:
                    print(diccipa[nombusca])
                    wdposar = diccipa[nombusca]
                    #print(wdposar)
                    template.add("wikidata", wdposar)
                else:
                    print("Not found")
    text = str(code)
    if text != text0:
        print("Saving", pllista)
        pllista.put(
            text,
            u"Bot: update the wikidata parameter from the monument names")
    else:
        print("No changes")
    return ()
Example #60
def process_article(title, text, timestamp, template='Infobox film'):
    """Process a wikipedia article looking for template"""

    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)

    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)

    # Filter out errant matches
    matches = [
        x for x in matches
        if x.name.strip_code().strip().lower() == template.lower()
    ]

    if len(matches) >= 1:
        # template_name = matches[0].name.strip_code().strip()

        # Extract information from infobox
        properties = {
            param.name.strip_code().strip(): param.value.strip_code().strip()
            for param in matches[0].params if param.value.strip_code().strip()
        }

        # Extract internal wikilinks
        wikilinks = [
            x.title.strip_code().strip() for x in wikicode.filter_wikilinks()
        ]

        # Extract external links
        exlinks = [
            x.url.strip_code().strip()
            for x in wikicode.filter_external_links()
        ]

        # Find approximate length of article
        text_length = len(wikicode.strip_code().strip())

        return (title, properties, wikilinks, exlinks, timestamp, text_length)
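
A quick usage sketch with made-up sample wikitext (the article text, title, and timestamp below are illustrative, not from the original source):

sample = """{{Infobox film
| name     = Example Film
| director = [[Jane Doe]]
| released = 1999
}}
'''Example Film''' is a 1999 film directed by [[Jane Doe]].
[http://example.org Official site]"""

result = process_article('Example Film', sample, '2020-01-01T00:00:00Z')
title, properties, wikilinks, exlinks, timestamp, text_length = result
print(properties['director'])  # Jane Doe
print(exlinks)                 # ['http://example.org']
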