def get_captions(title):
    params = {
        'action': 'query',
        'list': 'allpages',
        'apfrom': title.split(':', 1)[1],
        'aplimit': '100',
        'apnamespace': '10'
    }
    data = api(**params)
    langs = {}
    prefix = title + ' '
    for item in data['query']['allpages']:
        if item['title'].startswith(prefix):
            lang = item['title'].split('(')[1].split(')')[0]
            langs[lang] = item['title']
    text = ''
    for lang in sorted(langs):
        lang_name = get_language_name(lang)
        content = page_content(langs[lang])
        if content.strip().startswith('#REDIRECT'):
            # skip redirect pages
            continue
        code = mwparserfromhell.parse(content)
        try:
            temp = code.filter_templates()[0]
        except IndexError:
            continue
        caption_code = temp.get(1).value
        # We want templates like {{w|FooBar}} to render, so expand them
        expanded = expand_templates(unicode(caption_code))
        caption = unicode(mwparserfromhell.parse(expanded).strip_code())
        text += '%s: %s\n' % (lang_name, caption)
    return text
def parse():
    text = rfd.get()
    code = mwparserfromhell.parse(text)
    requests = []
    section = code.get_sections()[2]
    for section in code.get_sections()[1:]:
        #print section
        #print type(section)
        data = {'section': section}
        header = unicode(section.filter_headings()[0])
        data['header'] = header
        text = mwparserfromhell.parse(unicode(section).replace(header + '\n', ''))
        data['text'] = text
        #print text
        item = None
        for template in text.filter_templates():
            if unicode(template.name).startswith('Rfd group'):
                data['type'] = 'bulk'
                break
            elif template.name == 'rfd links':
                data['type'] = 'single'
                item = template.get(1).value
                break
        if item:
            item = pywikibot.ItemPage(repo, item)
            data['item'] = item
        requests.append(data)
    return requests
def update(self, push=True):
    self.fetch_info()
    self.parse_info()
    print self.LOCATION
    print self.CATEGORY
    print self.ABOUT
    print self.MOVEMENT
    print self.PRESSURE
    print self.WINDS
    #print self.UTC_TIMESTAMP
    #actually update crap
    #return
    text = self.wikipage.get()
    code = mwparserfromhell.parse(text)
    main = pywikibot.Page(self.wikipage.site, '2012 Atlantic hurricane season')
    main_text = main.get()
    main_code = mwparserfromhell.parse(main_text)
    for template in code.filter_templates():
        name = template.name.lower().strip()
        if name == 'Infobox hurricane current'.lower():
            if template.get('name').value.strip() == 'Hurricane Sandy':
                template.get('time').value = self.UTC_TIMESTAMP
                template.get('category').value = self.CATEGORY
                template.get('gusts').value = self.format_wind(self.WINDS)
                template.get('lat').value = self.LOCATION['latc']
                template.get(1).value = self.LOCATION['latd']
                template.get('lon').value = self.LOCATION['lonc']
                template.get(2).value = self.LOCATION['lond']
                template.get('movement').value = self.format_movement(self.MOVEMENT)
                template.get('pressure').value = self.format_pressure(self.PRESSURE)
    pywikibot.showDiff(text, unicode(code))
    if push:
        self.wikipage.put(unicode(code), 'Bot: Updating hurricane infobox. Errors? [[User talk:Legoktm|report them!]]')
def test_multiple_nodes_spaces(self):
    snippet = "foo [[link1]] [[link2]] [[link3]] bar"
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link1]]", "foo [[link2]] [[link3]] bar")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link2]]", "foo [[link1]] [[link3]] bar")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link3]]", "foo [[link1]] [[link2]] bar")
def test_multiple_nodes_newlines(self):
    snippet = "[[link1]]\n[[link2]]\n[[link3]]"
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link1]]", "[[link2]]\n[[link3]]")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link2]]", "[[link1]]\n[[link3]]")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link3]]", "[[link1]]\n[[link2]]")
def cleanup_sectionlink(self, section_title):
    code = mwparser.parse(section_title)
    template = code.filter_templates()
    if len(template) == 1 and template[0].name.matches(('Erl', 'erl')):
        section_title = template[0].get(1)
    title = mwparser.parse(unicode(section_title))
    clean_title = title.strip_code(normalize=True, collapse=True).strip()
    return clean_title
def __init__(self, title=None, text=None):
    super(Article, self).__init__(text=text)
    self.title = title
    self.paragraphs = None
    self.readable_text = None
    self.lede_length = 1
    if title is not None:
        self.page = pwb.Page(site, title)
        self.text = mwp.parse(self.page.text)
        self.wikitext = mwp.parse(self.page.text)
def test_contains(self):
    """test Wikicode.contains()"""
    code = parse("Here is {{aaa|{{bbb|xyz{{ccc}}}}}} and a [[page|link]]")
    tmpl1, tmpl2, tmpl3 = code.filter_templates()
    tmpl4 = parse("{{ccc}}").filter_templates()[0]
    self.assertTrue(code.contains(tmpl1))
    self.assertTrue(code.contains(tmpl3))
    self.assertFalse(code.contains(tmpl4))
    self.assertTrue(code.contains(str(tmpl4)))
    self.assertTrue(code.contains(tmpl2.params[0].value))
def wikicode(self):
    """
    Return the parsed wikitext (mwparserfromhell.wikicode.Wikicode object)
    """
    if not self._wikicode:
        try:
            self._wikicode = mwparserfromhell.parse(self.wikitext)
        except SystemError:
            self._wikicode = mwparserfromhell.parse('')
    return self._wikicode
def _parse_revs_into_wcode(rev_text_dict):
    result = []
    for rev_id in rev_text_dict:
        try:
            result.append((rev_id, mwp.parse(rev_text_dict[rev_id])))
        except mwp.parser.ParserError as e:
            logger.warning(e)
            logger.warning('Error parsing {0}'.format(rev_id))
            result.append((rev_id, mwp.parse('')))
    return result
def page_f(pg):
    count = 0
    text = pg.get()
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates(recursive=True):
        if template.name.lower().strip() in CITE_TEMPLATES:
            url = template.get('url').value.strip()
            if 'msnbc.com' in url:
                continue
            isup = is_up(url)
            if isup:
                continue
            if template.has_param('archiveurl'):
                #if template.has_param('deadurl'):
                #    if template.get('deadurl').value.strip() == 'no':
                #        template.remove('deadurl')
                #        template.add('deadurl', 'yes')
                #        continue
                continue
            #find it on archive.org
            ai_url = archive_page(url)
            if not ai_url:
                print 'Not found. :('
                continue
            raw_date = ai_url[27:27+14]
            year = int(raw_date[:4])
            day = int(raw_date[6:8])
            month_num = int(raw_date[4:6])
            month = MONTH_NAMES[month_num-1]
            template.add('archiveurl', ai_url)
            template.add('deadurl', 'yes')
            template.add('archivedate', '%s %s %s' % (day, month, year))
            count += 1
    #lets remove all the {{dead link}} now
    code = unicode(code)
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', code):
        p = mwparserfromhell.parse(tag.group(2))
        # remember whether this ref's citation already carries an archiveurl
        set = False
        for template in p.filter_templates():
            if template.name.lower().strip() in CITE_TEMPLATES:
                if template.has_param('archiveurl'):
                    set = True
            elif template.name.lower().strip() in DEAD_LINK:
                if set:
                    del p.nodes[p.nodes.index(unicode(template))]
        code = code.replace(tag.group(2), unicode(p))
    if text == code:
        print 'No changes made on %s' % pg.title(asLink=True)
        return
    pywikibot.showDiff(text, unicode(code))
    if raw_input('Save?').lower() == 'y':
        pg.put(unicode(code), 'Manually-assisted archive url fetching.')
def test_matches(self):
    """test Wikicode.matches()"""
    code1 = parse("Cleanup")
    code2 = parse("\nstub<!-- TODO: make more specific -->")
    self.assertTrue(code1.matches("Cleanup"))
    self.assertTrue(code1.matches("cleanup"))
    self.assertTrue(code1.matches(" cleanup\n"))
    self.assertFalse(code1.matches("CLEANup"))
    self.assertFalse(code1.matches("Blah"))
    self.assertTrue(code2.matches("stub"))
    self.assertTrue(code2.matches("Stub<!-- no, it's fine! -->"))
    self.assertFalse(code2.matches("StuB"))
def process_page(page):
    text = page.get()
    text, blah = AWB.do_page(text, date=False)
    code = mwparserfromhell.parse(text)
    urls = []
    for m in urlregex.MATCH_URL.finditer(unicode(code)):
        u = m.group(0)
        if u.startswith(('http://ap.google', 'https://ap.google')):
            urls.append(u)
    """
    buffer = unicode(code)
    for template in code.filter_templates():
        for url in urls:
            if url in template:
                if template.has_param('archiveurl'):
                    urls.remove(url)
                else:
                    buffer = buffer.replace(unicode(template), unicode(template)+TAG)
                    urls.remove(url)
    code = buffer
    """
    #find ref tags
    loop1 = False
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', unicode(code)):
        for url in urls:
            if url in tag.group(2):
                for template in mwparserfromhell.parse(tag.group(2)).filter_templates():
                    if template.has_param('archiveurl'):
                        try:
                            urls.remove(url)
                        except ValueError:
                            pass
                        loop1 = True
                if loop1:
                    break
                if 'dead link' in tag.group(0).lower():
                    urls.remove(url)
                else:
                    code = unicode(code).replace(tag.group(0), '<ref' + tag.group(1) + '>' + tag.group(2) + TAG + '</ref>')
                    urls.remove(url)
        if loop1:
            loop1 = False
            break
    if urls:
        print 'STILL HAVE THESE LEFT: ' + ', '.join(urls)
    pywikibot.showDiff(text, unicode(code))
    if text != unicode(code):
        page.put(unicode(code), 'Bot: Tagging ap.google.* links with {{dead link}}')
        return True
    else:
        return None
def load_stub_templates(self):
    self.stub_templates = []
    st = pywikibot.Page(self.site, 'Wikipedia:WikiProject Stub sorting/Stub types')
    text = st.get()
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        if template.name.startswith('Wikipedia:WikiProject Stub sorting/Stub types/'):
            st_page = pywikibot.Page(self.site, unicode(template.name))
            text = st_page.get()
            code = mwparserfromhell.parse(text)
            for template in code.filter_templates():
                if template.name.lower() == 'tl':
                    self.stub_templates.append(unicode(template.get(1).value).lower())
def test_transform(self):
    wcode_list = [mwp.parse('{{Infobox something | thing}}'
                            '{{not-one else}}'
                            '{{infobox again}}'),
                  mwp.parse('{{Infobox num1 | thing}}'
                            '{{not-one else}}'
                            '{{infobox num2}}')]
    result = ifb._transform(wcode_list)
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0], 'infobox-something infobox-again')
    self.assertEqual(result[1], 'infobox-num1 infobox-num2')
def process_page(self, page):
    text = page.get()
    text, blah = self.AWB.do_page(text, date=False)
    code = mwparserfromhell.parse(text)
    urls = []
    for m in urlregex.MATCH_URL.finditer(unicode(code)):
        u = m.group(0)
        if self.matching.search(u):
            urls.append(u)
        else:
            pass
            #print 'Did not match: '+u
    #find ref tags
    loop1 = False
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', unicode(code)):
        for url in urls:
            if url in tag.group(2):
                for template in mwparserfromhell.parse(tag.group(2)).filter_templates():
                    if template.has_param('archiveurl'):
                        try:
                            urls.remove(url)
                        except ValueError:
                            pass
                        loop1 = True
                if loop1:
                    break
                if 'dead link' in tag.group(0).lower():
                    urls.remove(url)
                elif 'wayback' in tag.group(0).lower():
                    urls.remove(url)
                elif 'webcite' in tag.group(0).lower():
                    urls.remove(url)
                else:
                    code = unicode(code).replace(tag.group(0), '<ref' + tag.group(1) + '>' + tag.group(2) + self.tag + '</ref>')
                    urls.remove(url)
        if loop1:
            loop1 = False
            break
    if urls:
        print 'STILL HAVE THESE LEFT: ' + ', '.join(urls)
    pywikibot.showDiff(text, unicode(code))
    if text != unicode(code):
        if self.simulate:
            print 'Not editing, just simulating.'
            return None
        page.put(unicode(code), 'Bot: Tagging %s links with {{dead link}}' % self.domain)
        return True
    else:
        return None
def _test_search(self, meth, expected):
    """Base test for insert_before(), insert_after(), and replace()."""
    code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
    func = partial(meth, code)
    func("{{b}}", "x", recursive=True)
    func("{{d}}", "[[y]]", recursive=False)
    func(code.get(2), "z")
    self.assertEqual(expected[0], code)
    self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True)
    self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False)
    fake = parse("{{a}}").get(0)
    self.assertRaises(ValueError, func, fake, "n", recursive=True)
    self.assertRaises(ValueError, func, fake, "n", recursive=False)

    code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}")
    func = partial(meth, code2)
    func(code2.get(1), "c", recursive=False)
    func("{{a}}", "d", recursive=False)
    func(code2.get(-1), "e", recursive=True)
    func("{{b}}", "f", recursive=True)
    self.assertEqual(expected[1], code2)

    code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}")
    func = partial(meth, code3)
    obj = code3.get(0).params[0].value.get(0)
    self.assertRaises(ValueError, func, obj, "x", recursive=False)
    func(obj, "x", recursive=True)
    self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False)
    func("{{f}}", "y", recursive=True)
    self.assertEqual(expected[2], code3)

    code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}")
    func = partial(meth, code4)
    fake = parse("{{b}}{{c}}")
    self.assertRaises(ValueError, func, fake, "q", recursive=False)
    self.assertRaises(ValueError, func, fake, "q", recursive=True)
    func("{{b}}{{c}}", "w", recursive=False)
    func("{{d}}{{e}}", "x", recursive=True)
    func(wrap(code4.nodes[-2:]), "y", recursive=False)
    func(wrap(code4.nodes[-2:]), "z", recursive=True)
    self.assertEqual(expected[3], code4)
    self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False)
    self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True)

    code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}")
    func = partial(meth, code5)
    self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False)
    func("{{b}}{{c}}", "x", recursive=True)
    obj = code5.get(0).params[1].value.get(0).params[0].value
    self.assertRaises(ValueError, func, obj, "y", recursive=False)
    func(obj, "y", recursive=True)
    self.assertEqual(expected[4], code5)

    code6 = parse("here is {{some text and a {{template}}}}")
    func = partial(meth, code6)
    self.assertRaises(ValueError, func, "text and", "ab", recursive=False)
    func("text and", "ab", recursive=True)
    self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False)
    func("is {{some", "cd", recursive=True)
    self.assertEqual(expected[5], code6)
def test_get_ancestors_parent(self):
    """test Wikicode.get_ancestors() and Wikicode.get_parent()"""
    code = parse("{{a|{{b|{{d|{{e}}{{f}}}}{{g}}}}}}{{c}}")
    tmpl = code.filter_templates(matches=lambda n: n.name == "f")[0]
    parent1 = code.filter_templates(matches=lambda n: n.name == "d")[0]
    parent2 = code.filter_templates(matches=lambda n: n.name == "b")[0]
    parent3 = code.filter_templates(matches=lambda n: n.name == "a")[0]
    fake = parse("{{f}}").get(0)

    self.assertEqual([parent3, parent2, parent1], code.get_ancestors(tmpl))
    self.assertIs(parent1, code.get_parent(tmpl))
    self.assertEqual([], code.get_ancestors(parent3))
    self.assertIs(None, code.get_parent(parent3))
    self.assertRaises(ValueError, code.get_ancestors, fake)
    self.assertRaises(ValueError, code.get_parent, fake)
def section_to_request(enumerated_section_tuple):
    enum_number, section_tuple = enumerated_section_tuple
    section_header, section_wikitext = section_tuple
    section = mwparserfromhell.parse(section_wikitext)
    r = Request()
    r.row_number = enum_number + 1
    r.title = section_header
    r.replies = unicode(section).count(u"(UTC)") - 1
    signatures = []
    for index, each_node in enumerate(section.nodes):
        if type(each_node) == mwparserfromhell.nodes.text.Text and "(UTC)" in each_node:
            # Get the last timestamp-looking thing (trick from http://stackoverflow.com/a/2988680/1757964)
            for timestamp_match in TIMESTAMP.finditer(unicode(each_node)):
                pass
            try:
                timestamp = datetime.datetime.strptime(timestamp_match.group(0), SIGNATURE_TIME_FORMAT)
            except ValueError:
                timestamp = "{{unknown}}"
            # Use the last user talk page link before the timestamp
            for user_index in itertools.count(index - 1, -1):
                user = USER.search(unicode(section.get(user_index)))
                if user:
                    user = user.group(1)
                    break
            # Check for user renames/redirects
            user_page = pywikibot.Page(wiki, "User:"******":")[1]
            signatures.append((user, timestamp))
    # Process usernames by removing anchors
    signatures = [(x.partition('#')[0], y) for x, y in signatures]
    # Default values for everything
    r.last_editor, r.last_edit_time = r.last_botop_editor, r.last_botop_time = "{{no result|None}}", "{{n/a}}"
    if signatures:
        r.last_editor, r.last_edit_time = signatures[-1]
        for user, timestamp in reversed(signatures):
            if is_botop(wiki, user):
                r.last_botop_editor, r.last_botop_time = user, timestamp
                break
    return r
def mapper_final(self):
    for entry in self.d:
        links_list = []
        working_text = self.d[entry][1]
        try:
            wikicode = mw.parse(working_text)
            links_list = wikicode.filter_wikilinks()
        except:
            pass
        article_links = 0
        links_title = []
        if links_list != []:
            for link in links_list:
                links_title.append(link.title)
            for h in range(0, len(links_title)):
                links_title[h] = links_title[h].strip().lower()
            title_set = Set(links_title)
            article_links = len(title_set)
        yield "a", article_links
        yield "s", article_links*article_links
        yield "n", 1
    for k in range(0, self.e):
        yield "a", 1
        yield "s", 1
        yield "n", 1
def parse_infobox(self, title, page):
    '''Parse out the nice mediawiki markdown to get birth and death

    Input: mediawiki unicode page string
    Returns: a dictionary with name(string), birth_date:DateTime, death_date:DateTime
    '''
    code = mwparserfromhell.parse(page)
    for template in code.filter_templates():
        if 'Infobox' in template.name:
            # Found the right template -- attempting to extract data
            output = dict(title=title)
            for key in ['name', 'birth_name']:
                if template.has(key):
                    output['name'] = template.get(key).value.strip()
            for date in ['birth_date', 'death_date']:
                try:
                    item = self.parse_date(template.get(date))
                except ValueError as e:
                    item = None
                output[date] = item
            # ok we are done here
            return output
    raise InfoError()
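# --- Usage sketch (not from the source): illustrates the dict shape parse_infobox()
# returns. `parser` stands for an instance of the class defining parse_infobox() and
# parse_date(); the page text below is invented.
sample_page = u"""{{Infobox person
| name       = Ada Lovelace
| birth_date = {{birth date|1815|12|10}}
| death_date = {{death date|1852|11|27}}
}}"""
result = parser.parse_infobox('Ada Lovelace', sample_page)
# Expected shape (actual date values depend on parse_date()):
# {'title': 'Ada Lovelace', 'name': 'Ada Lovelace',
#  'birth_date': <DateTime>, 'death_date': <DateTime>}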
def get_template_info(template_checker, commonscat_mapper, text, monument_id=''):
    if not monument_id:
        return {
            "id_not_found": True,
            "category": get_most_specific_category(commonscat_mapper, text),
            "missing_monument_id": True
        }
    id_count = 0
    info = {}
    templates = mwparserfromhell.parse(text).filter_templates()
    for template in template_checker.filter_allowed_templates(templates):
        if template_checker.get_id(template) != monument_id:
            continue
        if id_count:
            id_count += 1
            continue
        id_count = 1
        info = {
            "template": unicode(template),
            "category": get_most_specific_category(commonscat_mapper, text, template),
            "valid_id": template_checker.has_valid_id(template),
            "image_exists": image_exists(template)
        }
    if info:
        info["duplicate_ids"] = id_count > 1
    else:
        info["id_not_found"] = True
        info["category"] = get_most_specific_category(commonscat_mapper, text)
    return info
def extract_templates_and_params(text):
    """Return a list of templates found in text.

    Return value is a list of tuples. There is one tuple for each use of a
    template in the page, with the template title as the first entry and a
    dict of parameters as the second entry. Parameters are indexed by
    strings; as in MediaWiki, an unnamed parameter is given a parameter name
    with an integer value corresponding to its position among the unnamed
    parameters, and if this results in multiple parameters with the same
    name only the last value provided will be returned.

    This uses a third party library (mwparserfromhell) if it is installed
    and enabled in the user-config.py. Otherwise it falls back on a regex
    based function defined below.

    @param text: The wikitext from which templates are extracted
    @type text: unicode or string
    """
    if not (config.use_mwparserfromhell and mwparserfromhell):
        return extract_templates_and_params_regex(text)
    code = mwparserfromhell.parse(text)
    result = []
    for template in code.filter_templates(recursive=True):
        params = {}
        for param in template.params:
            params[unicode(param.name)] = unicode(param.value)
        result.append((unicode(template.name.strip()), params))
    return result
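# --- Usage sketch (invented wikitext, assuming mwparserfromhell is enabled in
# user-config.py): shows the (title, params-dict) tuples described in the docstring.
sample = "{{Infobox person|name=Ada|1=first}}{{citation needed}}"
print(extract_templates_and_params(sample))
# Expected shape:
# [('Infobox person', {'name': 'Ada', '1': 'first'}),
#  ('citation needed', {})]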
def ensure_flagged_by_template(wikicode, node, template_name, *template_parameters, overwrite_parameters=True):
    """
    Makes sure that ``node`` in ``wikicode`` is immediately (except for
    whitespace) followed by a template with ``template_name`` and optional
    ``template_parameters``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag
    :param template_parameters: optional template parameters
    :returns: the template flag, as a
        :py:class:`mwparserfromhell.nodes.template.Template` object
    """
    parent = get_parent_wikicode(wikicode, node)
    adjacent = get_adjacent_node(parent, node, ignore_whitespace=True)

    if template_parameters:
        flag = "{{%s}}" % "|".join([template_name, *template_parameters])
    else:
        flag = "{{%s}}" % template_name
    flag = mwparserfromhell.parse(flag).nodes[0]
    assert(isinstance(flag, mwparserfromhell.nodes.Template))

    if isinstance(adjacent, mwparserfromhell.nodes.Template) and adjacent.name.matches(template_name):
        # in case of {{Dead link}} we want to preserve the original parameters
        if overwrite_parameters is True:
            wikicode.replace(adjacent, flag)
        else:
            flag = adjacent
    else:
        wikicode.insert_after(node, flag)

    assert(get_parent_wikicode(wikicode, flag) is parent)
    return flag
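# --- Usage sketch (parameter values invented): the intended effect is flagging a link
# with a template such as {{Dead link}}. Assumes ensure_flagged_by_template() and its
# get_parent_wikicode()/get_adjacent_node() helpers are importable from this module.
import mwparserfromhell

wikicode = mwparserfromhell.parse("* http://example.com/old-page\n")
link = wikicode.filter_external_links()[0]
flag = ensure_flagged_by_template(wikicode, link, "Dead link", "date=May 2024")
# wikicode now reads roughly "* http://example.com/old-page{{Dead link|date=May 2024}}\n";
# calling it again with the same template name either replaces or keeps the existing
# flag, depending on overwrite_parameters.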
def main():
    wikidata = pywikibot.getSite('wikidata', 'wikidata')
    page = pywikibot.Page(wikidata, 'User:Legobot/Dupes')
    text = page.get()
    code = mwparser.parse(text)
    templates = code.filter_templates()
    for template in templates:
        if template.name != 'rfd links':
            continue
        qid = str(template.get(1).value)
        reason = str(template.get(2).value)
        dupe = link.search(reason)
        if not dupe:
            print 'Error: Cannot parse the deletion reason, skipping.'
            continue
        other = pywikibot.Page(wikidata, dupe.group(1))
        if not other.exists():
            print 'Uhoh, the other copy doesn\'t exist. Won\'t delete.'
            continue
        print 'Will delete {0} because: {1}'.format(qid, reason)
        page = pywikibot.Page(wikidata, qid)
        if not page.exists():
            print 'Uhoh, someone already deleted it!'
            continue
        page.delete(reason, prompt=False)
        print 'Destroyed. Will sleep a bit.'
def section_processing(section, tpl_header, categories, articleTitle):
    # tpl_header = copy.deepcopy(tpl_header)
    tom_tpl = section[0]
    content = section[1]
    section_tpl_wikicode = mwp.parse(tom_tpl)
    for tpl in section_tpl_wikicode.filter_templates():
        if tpl.name.matches('tom'):
            num_izd = str(tpl.get(1).value).strip()

            # {{tom}} with no counterpart among the editions, missing the 2nd parameter, or with a 3rd one
            tpls_noTerm = []
            # for tpl in wikicode.filter_templates():
            #     if tpl.name.matches('tom'):
            if not tpl.has(2) or tpl.get(2).value == '' \
                    or (tpl.has(3) and tpl.get(3).value != ''):
                # tpls_noTerm.append(tpl)
                # tpl_header.add(tpls_noTerm)
                tpl_header.add('-ТСД%s' % num_izd, '%s<!-- временный шаблон для бота -->' % str(tpl))
                return None

            title_ed = '%s%s/%s' % (PAGENAME_PREFIX, num_izd, articleTitle)
            # articleName = articleTitle.partition('/')[0]
            tpl_header.add('ТСД%s' % num_izd, articleTitle.partition('/')[0])
            # page_new = makePage(num_izd, title_ed, tpl_header, content, categories)
            page_new = [num_izd, title_ed, tpl_header, content]
            return page_new
def extract_plain_text(wiki_body):
    wikicode = mwparserfromhell.parse(wiki_body)
    plain_text = ""
    for node in wikicode.nodes:
        type_of_node = type(node)
        if type_of_node == mwparserfromhell.nodes.template.Template:
            continue
        if type_of_node == mwparserfromhell.nodes.argument.Argument:
            continue
        if type_of_node == mwparserfromhell.nodes.comment.Comment:
            continue
        if type_of_node == mwparserfromhell.nodes.html_entity.HTMLEntity:
            continue
        if type_of_node != mwparserfromhell.nodes.text.Text:
            if type(node) == mwparserfromhell.nodes.tag.Tag:
                str_node = node.contents
            elif type(node) == mwparserfromhell.nodes.external_link.ExternalLink:
                str_node = node.title
            elif type(node) == mwparserfromhell.nodes.heading.Heading:
                str_node = node.title
            elif type(node) == mwparserfromhell.nodes.wikilink.Wikilink:
                str_node = node.title
            plain_text += extract_plain_text(str_node)
        else:
            plain_text += str(node)
    return re.sub(r'\([^)]*\)', '', plain_text)
def get_assessments(self, rev_content):
    '''
    For the given revision content, get all assessments.

    @param rev_content: wikitext content of the given talk page revision we're assessing
    @type rev_content: unicode
    '''
    parsed_code = mwp.parse(rev_content)
    templates = parsed_code.filter_templates()
    assessments = []
    for temp in templates:
        if re.match('wikiproject\s+', unicode(temp.name), re.I) \
                or unicode(temp.name) in self.translations \
                or temp.has_param('class'):
            project = unicode(temp.name).lower()
            try:
                rating = unicode(temp.get('class').value).strip().lower()
            except ValueError:
                continue  # no assessment class in template
            importance = None
            if temp.has_param('importance'):
                importance = unicode(temp.get('importance').value).strip().lower()
            assessments.append(Assessment(rating, importance, project))
    # return all assessments
    return assessments
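# --- Illustration only: the sort of talk-page banner wikitext get_assessments() digests.
# `assessor` stands for an instance of the class above with translations configured;
# the banner text is invented.
rev = ("{{WikiProject Military history|class=B|importance=high}}\n"
       "{{WikiProject Biography|class=B}}")
for a in assessor.get_assessments(rev):
    # each Assessment should carry (rating, importance, project),
    # e.g. ('b', 'high', 'wikiproject military history')
    print(a)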
def add_template(self):
    if not self.adtTitle:
        return  # silently fail
    adtPage = pywikibot.Page(self.site, self.adtTitle, ns=1)
    code = mwparserfromhell.parse(adtPage.text)
    war_adt_added = False
    for template in code.filter_templates(recursive=False):
        if template.name.matches("AdT-Vorschlag Hinweis"):
            code.remove(template)
            pywikibot.output(u'D:AdT: {{AdT-Vorschlag Hinweis}} gefunden,'
                             u'entfernt')
        if template.name.matches("War AdT"):
            if not any(self.snapDate in p for p in template.params):
                template.add(str(len(template.params)+1), self.snapDate)
                pywikibot.output(u'D:AdT: {{War AdT}} '
                                 u'gefunden, füge heute hinzu')
                war_adt_added = True
    text = unicode(code)
    if not war_adt_added:
        template = u'{{War AdT|1=' + self.snapDate + u'}}\n'
        text = self.__add_templ(text, template)
    if adtPage.text != text:
        pywikibot.showDiff(adtPage.text, text)  # debug
        adtPage.text = text
        if not self.dry:
            adtPage.save(comment=templateComment, botflag=True, minor=True)
def test_index(self):
    """test Wikicode.index()"""
    code = parse("Have a {{template}} and a [[page|link]]")
    self.assertEqual(0, code.index("Have a "))
    self.assertEqual(3, code.index("[[page|link]]"))
    self.assertEqual(1, code.index(code.get(1)))
    self.assertRaises(ValueError, code.index, "foo")

    code = parse("{{foo}}{{bar|{{baz}}}}")
    self.assertEqual(1, code.index("{{bar|{{baz}}}}"))
    self.assertEqual(1, code.index("{{baz}}", recursive=True))
    self.assertEqual(1, code.index(code.get(1).get(1).value, recursive=True))
    self.assertRaises(ValueError, code.index, "{{baz}}", recursive=False)
    self.assertRaises(ValueError, code.index, code.get(1).get(1).value, recursive=False)
def test_readme_4(self):
    """test a block of example code in the README"""
    text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}"
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        if template.name.matches("Cleanup") and not template.has("date"):
            template.add("date", "July 2012")
    res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}}"
    self.assertPrint(code, res)
    code.replace("{{uncategorized}}", "{{bar-stub}}")
    res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
    self.assertPrint(code, res)
    if py3k:
        res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']"
    else:
        res = "[u'{{cleanup|date=July 2012}}', u'{{bar-stub}}']"
    self.assertPrint(code.filter_templates(), res)
    text = str(code)
    res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
    self.assertPrint(text, res)
    self.assertEqual(text, code)
def process_wikipedia_article(title, text, template='Infobox film'):
    wikipedia_article_parser = mwparserfromhell.parse(text)
    # Determine whether the article is about a movie or not based on whether
    # it includes the film infobox.
    movie_infobox_matches = wikipedia_article_parser.filter_templates(matches=template)
    if len(movie_infobox_matches) >= 1:
        # Extract information from infobox.
        # Don't actually need, but keep in case it's useful later.
        # properties = {param.name.strip_code().strip(): param.value.strip_code().strip()
        #               for param in movie_infobox_matches[0].params
        #               if param.value.strip_code().strip()}

        # Extract internal wikilinks.
        internal_links = [
            link.title.strip_code().strip()
            for link in wikipedia_article_parser.filter_wikilinks()
        ]
        internal_links = filter_out_most_common_links(internal_links)
        return {'title': title, 'internal_links': internal_links}
def mapper_get_page(self, _, line):
    if '<page>' in line:
        self.page_single = []
        self.page_status = 1
    if self.page_status == 1:
        self.page_single.append(line)
    if '</page>' in line:
        #self.page_single.append(line)
        if self.page_status == 1:
            page = ''.join(self.page_single)
            root = etree.XML(page)
            content = root.xpath("//revision")[-1].xpath(".//text")[0].text
            self.page_status = 0
            if content:
                content = mwparserfromhell.parse(content).strip_code()
                yield None, content
        else:
            self.page_status = 0
            self.page_single = []
def keep_only_includes(
        wikicode: mwparserfromhell.wikicode.Wikicode
) -> mwparserfromhell.wikicode.Wikicode:
    """Keeps only the onlyinclude tags, if any"""
    only_include_present = False
    to_remove = list()
    for tag in wikicode.filter_tags(recursive=False):  # select only the most external one
        if tag.tag.matches('onlyinclude'):
            only_include_present = True
        else:
            to_remove.append(tag)
    if only_include_present:
        for tag in to_remove:
            try:
                wikicode.remove(tag)
            except ValueError:
                pass
        wikicode = mwparserfromhell.parse(
            re.sub(onlyinclude_tag, '', str(wikicode)))
    return wikicode
def process_text(text):
    parsed = mwparserfromhell.parse(text)
    titles = {}
    editions = []
    for tag in parsed.filter():
        if isinstance(tag, mwparserfromhell.nodes.heading.Heading):
            titles[tag.level] = tag.title
        elif isinstance(tag, mwparserfromhell.nodes.wikilink.Wikilink):
            if link_is_pdf(tag) and titles.get(2) == 'Music files':
                editions[-1].add_pdf(tag)
        elif isinstance(tag, mwparserfromhell.nodes.template.Template):
            if tag.name == 'CPDLno':
                editions.append(Edition())
                editions[-1].cpdlno_params = strparams(tag)
            elif tag.name == 'Editor':
                editions[-1].editor_params = strparams(tag)
            elif tag.name == 'ScoreInfo':
                editions[-1].score_info = strparams(tag)
    return editions
def __init__(self, element: Element):
    self.ignored = False
    for child in element.getchildren():
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}redirect":
            self.ignored = True
            return
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}ns" and child.text != "0":
            self.ignored = True
            return
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}title":
            self.title = child.text
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}id":
            self.id = child.text
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}revision":
            for revision in child.getchildren():
                if revision.tag == "{http://www.mediawiki.org/xml/export-0.10/}text":
                    self.body = mwparserfromhell.parse(revision.text).strip_code()
                    break
    element.clear()
def mapper_final(self):
    for entry in self.d:
        working_text = self.d[entry][1]
        wikicode = mw.parse(working_text)
        links_list = wikicode.filter_wikilinks()
        article_links = 0
        links_title = []
        if links_list != []:
            for link in links_list:
                links_title.append(link.title)
            for h in range(0, len(links_title)):
                links_title[h] = links_title[h].strip().lower()
            title_set = Set(links_title)
            article_links = len(title_set)
        yield "a", article_links
        yield "s", article_links * article_links
        yield "n", 1
    for k in range(0, self.e):
        yield "a", 1
        yield "s", 1
        yield "n", 1
def make_new_wikicode(text, form_data, page_name):
    wikicode = mwparserfromhell.parse(text)
    change_made = False
    for template in wikicode.filter_templates():
        edit = main.TemplateEdit(template, page_name)
        if edit.classification == 'ignored' or edit.classification == 'rejected':
            continue
        proposed_addition = form_data.get(edit.orig_hash)
        user_checked = form_data.get(edit.orig_hash + '-addlink')
        if proposed_addition and user_checked == 'checked':
            # Go through one or more suggestions separated by pipe
            for proposed_parameter in proposed_addition.split("|"):
                try:
                    # Get the new wikitext for the template with this parameter added
                    edit.update_template(proposed_parameter)
                    change_made = True
                except ValueError:
                    app.logger.exception('update_template failed on {}'.format(page_name))
                    pass  # TODO report to the user
    return unicode(wikicode), change_made
def get_bill_for_page(page):
    for template in mwparserfromhell.parse(page["text"]).filter_templates():
        if template.name.strip() == "Infobox U.S. legislation":
            #print page["title"].encode("utf8")
            billref = get_bill_from_infobox(template)
            if billref:
                try:
                    if billref[0] == "PL":
                        # Get by public law number.
                        return Bill.objects.get(congress=billref[1], sliplawpubpriv="PUB", sliplawnum=billref[2])
                    elif billref[0] == "BILL":
                        # It's a bill number.
                        return Bill.objects.get(congress=billref[1], bill_type=BillType.by_slug(billref[2]), number=billref[3])
                except Bill.DoesNotExist:
                    return None
    return None
def expand_templates(defn, keep_lb=True, rm_gram=False):
    from mwparserfromhell import parse
    from wikiparse.assoc.identispan import identispan_text_rm

    wikicode = parse(defn)
    for t in wikicode.filter_templates(recursive=False):
        if t.name in ("l", "link"):
            wikicode.replace(t, "[[{}]]".format(expand_templates(t.get(2))))
        elif t.name in ("lb",) and keep_lb:
            wikicode.replace(t, "({})".format(expand_templates(t.get(2))))
        elif t.name in ("gloss", "qualifier"):
            wikicode.replace(t, "({})".format(expand_templates(t.get(1))))
        else:
            wikicode.remove(t)
    defn = str(wikicode)
    if rm_gram:
        defn = identispan_text_rm(defn)
    return defn
def parse_proposal_page(self, page_name):
    """
    Parses a proposal page to extract metadata about the property to create.

    :param page_name: the name of the proposal page
    """
    self.page_name = page_name
    text = self.get_page_over_api(PROPERTY_PROPOSAL_PREFIX + page_name)
    wikicode = mwparserfromhell.parse(cleanup_text(text.encode('utf-8')))
    for node in wikicode.filter(forcetype=(Template, Heading)):
        if isinstance(node, Heading):
            self.latest_labels = self.parse_translatable(node.title)
        elif isinstance(node, Template):
            template = node
            if (unicode(template.name).strip() == 'Property proposal' and
                    template.get('status').value.strip() == 'ready'):
                self.parse_proposal_template(template)
                self.users = self.extract_users(wikicode)
                break
    self.orig_wikicode = wikicode
def location_from_grid(grid, e, n, digits, view_direction, use6fig, mapit=None):
    latstr, lonstr, prec = latlon_from_grid(grid, e, n, digits, use6fig)
    precstr = "{:g}".format(prec)
    paramstr = "source:" + source_from_grid(grid, e, n, digits)
    region = region_of(grid, e, n, latstr, lonstr, mapit)
    if region != None:
        paramstr += "_region:{}".format(region)
    if view_direction != None:
        paramstr += "_heading:{}".format(view_direction)
    t = Template(mwparserfromhell.parse('Location'))
    t.add(1, latstr)
    t.add(2, lonstr)
    t.add(3, paramstr)
    t.add('prec', precstr)
    return t
def update_and_save(self, page, lookup):
    text = page.text()
    wikitext = mwparserfromhell.parse(text)
    for template in wikitext.filter_templates():
        if template.name.matches(['Listplayer/Current']):
            player = template.get('1').value.strip()
            if player not in lookup:
                template.add('squad', '')
                continue
            template.add('squad', lookup[player])
    newtext = str(wikitext)
    if text != newtext:
        # print('Saving page %s...' % page.name)
        try:
            self.site.save(page, newtext, summary=self.SUMMARY)
        except EditError:
            self.site.log_error_content(page.name, 'Spam filter prohibited squad point update')
    else:
        pass
def handle_existing_page(player_page, player_name):
    """
    :type player_page: pywikibot.page.Page
    :type player_name: str
    """
    parsed_mw_text = mwparserfromhell.parse(player_page.text)
    football_player_template = parsed_mw_text.filter_templates(player_template_name)[0]
    arguments = __get_football_player_template(player_name)
    for argument_name, argument_value in arguments.items():
        if str(argument_value) != football_player_template.get(argument_name).value and SHOULD_SHOW_DIFF:
            logger.info("Found diff between arguments on this argument_name: {arg_name}\n"
                        "existing value: {existing_value}\nnew_value: {new_value}".
                        format(arg_name=argument_name,
                               existing_value=football_player_template.get(argument_name).value,
                               new_value=argument_value))
        football_player_template.add(argument_name, argument_value)
    player_page.text = parsed_mw_text
def process_article(self, title, text, template='Infobox person'):
    """Process a wikipedia article looking for template"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)
    raw_year_string = 'EMPTY'
    birth_year = 'EMPTY'
    infobox = ''
    if len(matches) >= 1:
        # Extract information from infobox
        for match in matches:
            infobox = str(match)
            for param in match.params:
                if param.name.strip_code().strip() == 'birth_date':
                    raw_year_string = str(param.value)
                    birth_year = self.get_birth_year(raw_year_string)
    summary = self.get_summary(wikicode.strip_code().strip())
    return (title, birth_year, summary, raw_year_string, infobox)
def mapper(self, _, line):
    try:
        self._chunk += line.strip()
        if re.search(r"</page>", line):
            text = ''
            self._slurping = False
            root = etree.fromstring(self._chunk, parser)
            texts = root and root.xpath('//text')
            if texts:
                text = texts[0].text
            if text:
                lset = set()
                mwp = mwparserfromhell.parse(text)
                links = mwp.filter_wikilinks()
                for link in links:
                    match = parselink.search(unicode(link))
                    lset.add(match.groups()[0])
                yield None, len(lset)
            self._chunk = ''
    except:
        self._chunk = ''
def test_readme_5(self):
    """test a block of example code in the README; includes a web call"""
    url1 = "https://en.wikipedia.org/w/api.php"
    url2 = "https://en.wikipedia.org/w/index.php?title={0}&action=raw"
    title = "Test"
    data = {"action": "query", "prop": "revisions", "rvprop": "content",
            "rvslots": "main", "rvlimit": 1, "titles": title,
            "format": "json", "formatversion": "2"}
    try:
        raw = urlopen(url1, urlencode(data).encode("utf8")).read()
    except IOError:
        self.skipTest("cannot continue because of unsuccessful web call")
    res = json.loads(raw.decode("utf8"))
    revision = res["query"]["pages"][0]["revisions"][0]
    text = revision["slots"]["main"]["content"]
    try:
        expected = urlopen(url2.format(title)).read().decode("utf8")
    except IOError:
        self.skipTest("cannot continue because of unsuccessful web call")
    actual = mwparserfromhell.parse(text)
    self.assertEqual(expected, actual)
def count_in_article(article_name):
    soup = bsoup(
        requests.get('https://en.wikipedia.org/wiki/Special:Export/' + article_name).text,
        'lxml')
    pagetext = soup.find('text').text
    wikilinks = mwph.parse(pagetext).filter_wikilinks()
    for wl in wikilinks:
        if wl[:7].lower() == '[[file:' or wl[:11].lower() == '[[category:':
            pagetext = pagetext.replace(str(wl), '')
    for wl in wikilinks:
        if wl[:7].lower() != '[[file:' and wl[:11].lower() != '[[category:':
            pagetext = pagetext.replace(str(wl), str(wl)[2:-2])
    wikitemplates = mwph.parse(pagetext).filter_templates()
    for wt in wikitemplates:
        pagetext = pagetext.replace(str(wt), '')
    comments = mwph.parse(pagetext).filter_comments()
    for comment in comments:
        pagetext = pagetext.replace(str(comment), '')
    external_links = mwph.parse(pagetext).filter_external_links()
    for ex_l in external_links:
        pagetext = pagetext.replace(str(ex_l), '')
    headings = mwph.parse(pagetext).filter_headings()
    for heading in headings:
        pagetext = pagetext.replace(str(heading), str(heading).strip('='))
    html_entities = mwph.parse(pagetext).filter_html_entities()
    for h_ent in html_entities:
        pagetext = pagetext.replace(str(h_ent), '')
    pagetext = filter_elements(pagetext, '{| class="wikitable', '|}')
    pagetext = filter_elements(pagetext, '{| class="infobox', '|}')
    pagetext = filter_elements(pagetext, '{| class="floatright', '|}')
    pagetext = filter_elements(pagetext, '{{cite', '}}')
    pagetext = filter_elements(pagetext, '<', '>')
    count_dict = {
        'wikilinks': len(wikilinks),
        'words': len(remove_punctuation(nltk.word_tokenize(pagetext))),
        'sentences': len(nltk.sent_tokenize(pagetext))
    }
    return count_dict
def run(self):
    while True:
        page, qid = self.queue.get()
        code = mwparser.parse(page.get())
        found = False
        template = ''  # to make pycharm shut up
        for template in code.filter_templates():
            if template.name.lower().strip() in redirects:
                found = True
                break
        if not found:
            print 'Could not find template on ' + page.title()
            return
        data = dictify(template)
        d = list()
        for nm in data:
            if not data[nm] or len(data[nm]) > 250:
                continue
            d.append((None, page.title(), deqid(qid), nm, data[nm]))
        parsed.put(d)
        self.queue.task_done()
def allow_bots(text, user):
    user = user.lower().strip()
    text = mwparserfromhell.parse(text)
    for tl in text.filter_templates():
        if tl.name in ('bots', 'nobots'):
            break
    else:
        return True
    for param in tl.params:
        bots = [x.lower().strip() for x in param.value.split(",")]
        if param.name == 'allow':
            if ''.join(bots) == 'none':
                return False
            for bot in bots:
                if bot in (user, 'all'):
                    return True
        elif param.name == 'deny':
            if ''.join(bots) == 'none':
                return True
            for bot in bots:
                if bot in (user, 'all'):
                    return False
    return True
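# --- Usage sketch for allow_bots(), which implements the standard {{bots}}/{{nobots}}
# exclusion-compliance check; the page text below is invented.
page_text = "{{bots|deny=SpamBot,OtherBot}}\nSome article text."
print(allow_bots(page_text, "SpamBot"))  # False: explicitly denied
print(allow_bots(page_text, "GoodBot"))  # True: not listed under deny=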
def fix_text(text):
    code = mwparserfromhell.parse(text)
    newtext = ''
    for index, x in enumerate(code.nodes):
        flag = False
        #print repr(x)
        if index != 0 and isinstance(x, mwparserfromhell.nodes.Text):
            if x.endswith('\n\n'):
                if isinstance(code.nodes[index - 1], mwparserfromhell.nodes.Tag) and str(code.nodes[index - 1]) == '*':
                    if len(code.nodes) >= index + 1:
                        if isinstance(code.nodes[index + 1], mwparserfromhell.nodes.Tag) and str(code.nodes[index + 1]) == '*':
                            #print 'trimming'
                            flag = True
                            newtext += unicode(x)[:-1]
        if not flag:
            newtext += unicode(x)
    return newtext
def WIR_member_parse_wikilinks(self, page):
    set_members = set()
    try:
        query = self.url_page + page
        response = requests.get(query).json()
        pages = response['query']['pages']
        for page in pages:
            page_text = pages[page]['revisions'][0]['*']
            wikicode = mwp.parse(page_text)
            for link in wikicode.filter_wikilinks():
                # the middle of this condition was redacted in the source;
                # reconstructed from the surrounding replace() calls
                if link.startswith("[[User:"):
                    user_text = link.replace("[[User:", "").replace("]]", "")
                    set_members.add(user_text)
    except Exception:
        print("Error when parsing WIR pages")
    print("Identified {} members from the page: {}.".format(len(set_members), page))
    return set_members
def process_template(self, template):
    for page in template.getReferences(onlyTemplateInclusion=True, namespaces=0):
        try:
            text = page.get()
        except pywikibot.Error:
            continue
        else:
            code = mwparserfromhell.parse(text)
            for t in code.ifilter_templates():
                if t.name.lower().strip() == self.singles_template.title(withNamespace=False).lower():
                    for p in t.params:
                        if "date" in p.name:
                            for t2 in p.value.ifilter_templates():
                                if t2.name.lower().strip() in self.start_date_template_titles:
                                    date = self._get_date(t2, False)
                                    if date is not None:
                                        p.value.replace(t2, date)
                elif t.name.lower().strip() == self.episode_list_template.title(withNamespace=False).lower():
                    if t.has_param("AltDate"):
                        for t2 in t.get("AltDate").value.ifilter_templates():
                            if t2.name.lower().strip() in self.start_date_template_titles:
                                date = self._get_date(t2)
                                if date is not None:
                                    t.get("AltDate").value.replace(t2, date)
            if text != code:
                try:
                    page.put(code, "[[Wikipedia:Bots|Bot]]: Replacing {{[[Template:Start date|start date]]}} with the actual date"
                                   " (it should only be used once in a template that emits microformats;"
                                   " see [[Template:Start date/doc]])")
                except pywikibot.Error:
                    continue
def parse_person(rec):
    wikitext = rec['wikitext']
    parsed = mwparserfromhell.parse(wikitext)
    words = [w.lower() for w in WORD_RE.findall(parsed.strip_code())]
    word_count = len(words)
    word_counts = Counter(words)
    gender_words = {w: word_counts[w] for w in ('him', 'his', 'he', 'her', 'she')}
    res = {}
    for template in parsed.filter_templates():
        if template.name.lower().startswith('infobox'):
            for param in template.params:
                res[param.name.strip().lower()] = param.value
    wikilinks = [str(x.title) for x in parsed.filter_wikilinks()]
    locations = []
    for k in 'birth_place', 'death_place':
        if k in res:
            locations += [str(x.title) for x in res[k].filter_wikilinks()]
    born = None
    died = None
    for wl in parsed.filter_wikilinks():
        title = str(wl.title)
        if title.startswith(CAT_PREFIX):
            if title.endswith(BIRTH_POSTFIX):
                born = tolerant_int(title[len(CAT_PREFIX): -len(BIRTH_POSTFIX)])
            if title.endswith(DIED_POSTFIX):
                died = tolerant_int(title[len(CAT_PREFIX): -len(DIED_POSTFIX)])
    return {'person_name': rec['title'],
            'wiki_id': rec['wiki_id'],
            'infobox': rec['infobox'],
            'locations': locations,
            'word_count': word_count,
            'gender_words': gender_words,
            'view_count': rec['viewcount'],
            'wikilinks': wikilinks,
            'born': born,
            'died': died}
def run(self):
    where_condition = ' OR '.join(['MSG.{} IS NOT NULL'.format(_) for _ in self.vod_params])
    vod_options = ['MSG.{}'.format(_) for _ in self.vod_params]
    fields = [
        'COALESCE({})=Vod'.format(', '.join(vod_options)),
        'MSG._pageName=MSGPage',
        'SG._pageName=SBPage',
        'SG.N_MatchInPage=N_MatchInPage',
        'SG.N_GameInMatch=N_GameInMatch'
    ]
    result = self.site.cargo_client.query(
        tables="MatchScheduleGame=MSG,ScoreboardGames=SG",
        join_on="MSG.GameId=SG.GameId",
        where=f"(SG.VOD IS NULL AND SG._pageName IS NOT NULL AND ({where_condition}))"
              f" OR (SG.VOD != COALESCE(MSG.Vod, MSG.VodPB, MSG.VodGameStart, MSG.VodPostgame))",
        fields=', '.join(fields),
        order_by='SG._pageName, SG.N_MatchInPage',  # this is just to group same pages consecutively
    )
    current_page = {
        'page': None,
        'wikitext': None,
        'page_name': None,
    }
    for item in result:
        if current_page['page_name'] != item['SBPage']:
            if current_page['page'] is not None:
                self.save_page(current_page)
            current_page['page_name'] = item['SBPage']
            current_page['page'] = self.site.client.pages[current_page['page_name']]
            current_page['wikitext'] = mwparserfromhell.parse(current_page['page'].text())
            # print('Discovered page {}'.format(current_page['page_name']))
        self.add_vod_to_page(item, current_page['wikitext'])
    # we need to catch the last iteration too (assuming we actually did anything)
    if current_page['page'] is not None:
        self.save_page(current_page)
def do_page(self, page):
    #print page.title(asLink=True).encode('utf-8')
    if page.namespace() != 6:
        return
    text = page.get()
    text, gen_fix_summary = self.AWBGenFixes.do_page(text)
    code = mwparserfromhell.parse(text)
    tag = False
    log = '* '
    summary = 'Bot: Updating license tag(s) with image has rationale=yes (errors? [[User:Legobot/Stop/22|stop me]])'
    for template in code.filter_templates(recursive=True):
        name = pywikibot.removeDisabledParts(template.name.lower()).strip()
        #print self.NFURs
        #time.sleep(5)
        if name in self.NFURs:
            tag = True
    if tag:
        for template in code.filter_templates(recursive=True):
            name = pywikibot.removeDisabledParts(template.name.lower()).strip()
            if name in self.licenses:
                template.add('image has rationale', 'yes')
                log += '[[:%s]]: Adding <code>|image has rationale=yes</code>' % page.title()
    else:
        #print 'Skipping '+page.title(asLink=True).encode('utf-8')
        return
    #if gen_fix_summary:
    #    summary += ', also dating ' + gen_fix_summary
    puttext = unicode(code).lstrip('\n')
    pywikibot.showDiff(text, puttext)
    self.output(log)
    self.check_page()
    try:
        page.text = puttext
        page.save(summary, async=True, nocreate=True)
    except pywikibot.exceptions.PageNotSaved:
        pass
    except pywikibot.exceptions.LockedPage:
        pass
class test_parented_ifilter:
    wikicode = mwparserfromhell.parse("""<span>
foo {{bar|some text and {{another|template}}}}
</span>
{{foo|bar}}
""")

    def test_recursive(self):
        nodes = []
        for parent, node in parented_ifilter(self.wikicode, recursive=True):
            nodes.append(node)
            assert parent.index(node) >= 0
        assert nodes == self.wikicode.filter(recursive=True)

    def test_nonrecursive(self):
        nodes = []
        for parent, node in parented_ifilter(self.wikicode, recursive=False):
            nodes.append(node)
            assert parent.index(node) >= 0
        assert nodes == self.wikicode.filter(recursive=False)

    def test_recursive_templates(self):
        templates = []
        for parent, template in parented_ifilter(self.wikicode,
                                                 forcetype=mwparserfromhell.nodes.template.Template,
                                                 recursive=True):
            templates.append(template)
            assert parent.index(template) >= 0
        assert templates == self.wikicode.filter_templates(recursive=True)

    def test_nonrecursive_templates(self):
        templates = []
        for parent, template in parented_ifilter(self.wikicode,
                                                 forcetype=mwparserfromhell.nodes.template.Template,
                                                 recursive=False):
            templates.append(template)
            assert parent.index(template) >= 0
        assert templates == self.wikicode.filter_templates(recursive=False)
def run(self):
    matches = self.parser.run()
    i = 0
    match = matches[i]
    match: Match
    cur_page = None  # trailing index for printing at the end
    for page in self.data_pages:
        cur_page = page
        text = page.text()
        wikitext = mwparserfromhell.parse(text)
        for template in wikitext.filter_templates():
            template: Template
            if template.name.matches('MatchSchedule'):
                # allow for the possibility of partially updating an event
                # that starts in the latter half of a toornament scrape, e.g. playoffs
                # n.b. we can only do this if we added correct page and n_in_page tagging
                # when we first created the event
                if template.has('page', ignore_empty=True) and \
                        template.has('n_in_page', ignore_empty=True):
                    while match.page < int(template.get('page').value.strip()) \
                            or match.index_in_page < int(template.get('n_in_page').value.strip()):
                        i += 1
                        if i >= len(matches):
                            break
                        match = matches[i]
                team1 = template.get('team1').value.strip()
                team2 = template.get('team2').value.strip()
                # TODO: some team validation? however remember there can be disambiguation
                # TODO: so parse out anything in () when doing validation
                if match.completed:
                    match.merge_into(template)
                # do a normal increment here
                # this is necessary for legacy behavior in case the indices in_page etc aren't defined
                i += 1
                if i >= len(matches):
                    break
                match = matches[i]
        self.site.save(page, str(wikitext), summary=self.summary)
    return 'https://lol.gamepedia.com/' + cur_page.name.replace(' ', '_')
def actuallista(pllista, diccipa, pagprova=False):
    resultat = u""
    origen = pllista.title()
    text = pllista.get()
    text0 = text
    code = mwparserfromhell.parse(text)
    t = code.filter_templates()
    #print(t)
    for template in t:
        #print (template.name)
        if template.name.matches(("Filera IPA")):
            if template.has("wikidata"):
                wd = template.get("wikidata").value.strip()
                wd = re.sub("<!-- no ?[Ww][Dd] ?auto -->", "", wd)
                #print(wd)
            else:
                wd = ""
            if wd == "" and template.has("nomcoor"):
                nombusca = template.get("nomcoor").value.strip()
                nombusca = nombusca.split("(")[0].strip()
                print("Per", template.get("nomcoor").value.strip(), "busquem nom:", nombusca)
                if nombusca in diccipa.keys():
                    print(diccipa[nombusca])
                    wdposar = diccipa[nombusca]
                    #print(wdposar)
                    template.add("wikidata", wdposar)
                else:
                    print("Inexistent")
    text = code
    if text != text0:
        print("Desant", pllista)
        pllista.put(text, u"Robot actualitza el paràmetre wikidata a partir dels noms dels monuments")
    else:
        print("Cap canvi")
    return ()
def process_article(title, text, timestamp, template='Infobox film'):
    """Process a wikipedia article looking for template"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)
    # Filter out errant matches
    matches = [
        x for x in matches
        if x.name.strip_code().strip().lower() == template.lower()
    ]
    if len(matches) >= 1:
        # template_name = matches[0].name.strip_code().strip()
        # Extract information from infobox
        properties = {
            param.name.strip_code().strip(): param.value.strip_code().strip()
            for param in matches[0].params
            if param.value.strip_code().strip()
        }
        # Extract internal wikilinks
        wikilinks = [
            x.title.strip_code().strip()
            for x in wikicode.filter_wikilinks()
        ]
        # Extract external links
        exlinks = [
            x.url.strip_code().strip()
            for x in wikicode.filter_external_links()
        ]
        # Find approximate length of article
        text_length = len(wikicode.strip_code().strip())
        return (title, properties, wikilinks, exlinks, timestamp, text_length)