def get_captions(title):
    params = {
        'action': 'query',
        'list': 'allpages',
        'apfrom': title.split(':', 1)[1],
        'aplimit': '100',
        'apnamespace': '10'
    }
    data = api(**params)
    langs = {}
    prefix = title + ' '
    for item in data['query']['allpages']:
        if item['title'].startswith(prefix):
            lang = item['title'].split('(')[1].split(')')[0]
            langs[lang] = item['title']
    text = ''
    for lang in sorted(langs):
        lang_name = get_language_name(lang)
        content = page_content(langs[lang])
        if content.strip().startswith('#REDIRECT'):
            # skip redirect pages
            continue
        code = mwparserfromhell.parse(content)
        try:
            temp = code.filter_templates()[0]
        except IndexError:
            continue
        caption_code = temp.get(1).value
        # We want templates like {{w|FooBar}} to render, so expand them
        expanded = expand_templates(unicode(caption_code))
        caption = unicode(mwparserfromhell.parse(expanded).strip_code())
        text += '%s: %s\n' % (lang_name, caption)
    return text
def parse():
    text = rfd.get()
    code = mwparserfromhell.parse(text)
    requests = []
    section = code.get_sections()[2]
    for section in code.get_sections()[1:]:
        #print section
        #print type(section)
        data = {'section': section}
        header = unicode(section.filter_headings()[0])
        data['header'] = header
        text = mwparserfromhell.parse(unicode(section).replace(header + '\n', ''))
        data['text'] = text
        #print text
        item = None
        for template in text.filter_templates():
            if unicode(template.name).startswith('Rfd group'):
                data['type'] = 'bulk'
                break
            elif template.name == 'rfd links':
                data['type'] = 'single'
                item = template.get(1).value
                break
        if item:
            item = pywikibot.ItemPage(repo, item)
            data['item'] = item
        requests.append(data)
    return requests
def update(self, push=True):
    self.fetch_info()
    self.parse_info()
    print self.LOCATION
    print self.CATEGORY
    print self.ABOUT
    print self.MOVEMENT
    print self.PRESSURE
    print self.WINDS
    #print self.UTC_TIMESTAMP
    #actually update crap
    #return
    text = self.wikipage.get()
    code = mwparserfromhell.parse(text)
    main = pywikibot.Page(self.wikipage.site, '2012 Atlantic hurricane season')
    main_text = main.get()
    main_code = mwparserfromhell.parse(main_text)
    for template in code.filter_templates():
        name = template.name.lower().strip()
        if name == 'Infobox hurricane current'.lower():
            if template.get('name').value.strip() == 'Hurricane Sandy':
                template.get('time').value = self.UTC_TIMESTAMP
                template.get('category').value = self.CATEGORY
                template.get('gusts').value = self.format_wind(self.WINDS)
                template.get('lat').value = self.LOCATION['latc']
                template.get(1).value = self.LOCATION['latd']
                template.get('lon').value = self.LOCATION['lonc']
                template.get(2).value = self.LOCATION['lond']
                template.get('movement').value = self.format_movement(self.MOVEMENT)
                template.get('pressure').value = self.format_pressure(self.PRESSURE)
    pywikibot.showDiff(text, unicode(code))
    if push:
        self.wikipage.put(unicode(code), 'Bot: Updating hurricane infobox. Errors? [[User talk:Legoktm|report them!]]')
def test_multiple_nodes_spaces(self):
    snippet = "foo [[link1]] [[link2]] [[link3]] bar"
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link1]]", "foo [[link2]] [[link3]] bar")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link2]]", "foo [[link1]] [[link3]] bar")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link3]]", "foo [[link1]] [[link2]] bar")
def test_multiple_nodes_newlines(self):
    snippet = "[[link1]]\n[[link2]]\n[[link3]]"
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link1]]", "[[link2]]\n[[link3]]")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link2]]", "[[link1]]\n[[link3]]")
    wikicode = mwparserfromhell.parse(snippet)
    self._do_test(wikicode, "[[link3]]", "[[link1]]\n[[link2]]")
def cleanup_sectionlink(self, section_title):
    code = mwparser.parse(section_title)
    template = code.filter_templates()
    if len(template) == 1 and template[0].name.matches(('Erl', 'erl')):
        section_title = template[0].get(1)
    title = mwparser.parse(unicode(section_title))
    clean_title = title.strip_code(normalize=True, collapse=True).strip()
    return clean_title
def __init__(self, title=None, text=None):
    super(Article, self).__init__(text=text)
    self.title = title
    self.paragraphs = None
    self.readable_text = None
    self.lede_length = 1
    if title is not None:
        self.page = pwb.Page(site, title)
        self.text = mwp.parse(self.page.text)
        self.wikitext = mwp.parse(self.page.text)
def test_contains(self):
    """test Wikicode.contains()"""
    code = parse("Here is {{aaa|{{bbb|xyz{{ccc}}}}}} and a [[page|link]]")
    tmpl1, tmpl2, tmpl3 = code.filter_templates()
    tmpl4 = parse("{{ccc}}").filter_templates()[0]
    self.assertTrue(code.contains(tmpl1))
    self.assertTrue(code.contains(tmpl3))
    self.assertFalse(code.contains(tmpl4))
    self.assertTrue(code.contains(str(tmpl4)))
    self.assertTrue(code.contains(tmpl2.params[0].value))
def wikicode(self):
    """
    Return the parsed wikitext (mwparserfromhell.wikicode.Wikicode object)
    """
    if not self._wikicode:
        try:
            self._wikicode = mwparserfromhell.parse(self.wikitext)
        except SystemError:
            self._wikicode = mwparserfromhell.parse('')
    return self._wikicode
def _parse_revs_into_wcode(rev_text_dict):
    result = []
    for rev_id in rev_text_dict:
        try:
            result.append((rev_id, mwp.parse(rev_text_dict[rev_id])))
        except mwp.parser.ParserError as e:
            logger.warning(e)
            logger.warning('Error parsing {0}'.format(rev_id))
            result.append((rev_id, mwp.parse('')))
    return result
def page_f(pg):
    count = 0
    text = pg.get()
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates(recursive=True):
        if template.name.lower().strip() in CITE_TEMPLATES:
            url = template.get('url').value.strip()
            if 'msnbc.com' in url:
                continue
            isup = is_up(url)
            if isup:
                continue
            if template.has_param('archiveurl'):
                #if template.has_param('deadurl'):
                #    if template.get('deadurl').value.strip() == 'no':
                #        template.remove('deadurl')
                #        template.add('deadurl', 'yes')
                #        continue
                continue
            #find it on archive.org
            ai_url = archive_page(url)
            if not ai_url:
                print 'Not found. :('
                continue
            raw_date = ai_url[27:27+14]
            year = int(raw_date[:4])
            day = int(raw_date[6:8])
            month_num = int(raw_date[4:6])
            month = MONTH_NAMES[month_num-1]
            template.add('archiveurl', ai_url)
            template.add('deadurl', 'yes')
            template.add('archivedate', '%s %s %s' % (day, month, year))
            count += 1
    #lets remove all the {{dead link}} now
    code = unicode(code)
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', code):
        p = mwparserfromhell.parse(tag.group(2))
        # remember whether this ref's citation already carries an archiveurl
        set = False
        for template in p.filter_templates():
            if template.name.lower().strip() in CITE_TEMPLATES:
                if template.has_param('archiveurl'):
                    set = True
            elif template.name.lower().strip() in DEAD_LINK:
                if set:
                    del p.nodes[p.nodes.index(unicode(template))]
        code = code.replace(tag.group(2), unicode(p))
    if text == code:
        print 'No changes made on %s' % pg.title(asLink=True)
        return
    pywikibot.showDiff(text, unicode(code))
    if raw_input('Save?').lower() == 'y':
        pg.put(unicode(code), 'Manually-assisted archive url fetching.')
def test_matches(self):
    """test Wikicode.matches()"""
    code1 = parse("Cleanup")
    code2 = parse("\nstub<!-- TODO: make more specific -->")
    self.assertTrue(code1.matches("Cleanup"))
    self.assertTrue(code1.matches("cleanup"))
    self.assertTrue(code1.matches(" cleanup\n"))
    self.assertFalse(code1.matches("CLEANup"))
    self.assertFalse(code1.matches("Blah"))
    self.assertTrue(code2.matches("stub"))
    self.assertTrue(code2.matches("Stub<!-- no, it's fine! -->"))
    self.assertFalse(code2.matches("StuB"))
def process_page(page):
    text = page.get()
    text, blah = AWB.do_page(text, date=False)
    code = mwparserfromhell.parse(text)
    urls = []
    for m in urlregex.MATCH_URL.finditer(unicode(code)):
        u = m.group(0)
        if u.startswith(('http://ap.google', 'https://ap.google')):
            urls.append(u)
    """
    buffer = unicode(code)
    for template in code.filter_templates():
        for url in urls:
            if url in template:
                if template.has_param('archiveurl'):
                    urls.remove(url)
                else:
                    buffer = buffer.replace(unicode(template), unicode(template)+TAG)
                    urls.remove(url)
    code = buffer
    """
    #find ref tags
    loop1 = False
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', unicode(code)):
        for url in urls:
            if url in tag.group(2):
                for template in mwparserfromhell.parse(tag.group(2)).filter_templates():
                    if template.has_param('archiveurl'):
                        try:
                            urls.remove(url)
                        except ValueError:
                            pass
                        loop1 = True
                if loop1:
                    break
                if 'dead link' in tag.group(0).lower():
                    urls.remove(url)
                else:
                    code = unicode(code).replace(tag.group(0), '<ref' + tag.group(1) + '>' + tag.group(2) + TAG + '</ref>')
                    urls.remove(url)
        if loop1:
            loop1 = False
            break
    if urls:
        print 'STILL HAVE THESE LEFT: ' + ', '.join(urls)
    pywikibot.showDiff(text, unicode(code))
    if text != unicode(code):
        page.put(unicode(code), 'Bot: Tagging ap.google.* links with {{dead link}}')
        return True
    else:
        return None
def load_stub_templates(self):
    self.stub_templates = []
    st = pywikibot.Page(self.site, 'Wikipedia:WikiProject Stub sorting/Stub types')
    text = st.get()
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        if template.name.startswith('Wikipedia:WikiProject Stub sorting/Stub types/'):
            st_page = pywikibot.Page(self.site, unicode(template.name))
            text = st_page.get()
            code = mwparserfromhell.parse(text)
            for template in code.filter_templates():
                if template.name.lower() == 'tl':
                    self.stub_templates.append(unicode(template.get(1).value).lower())
def test_transform(self):
    wcode_list = [mwp.parse('{{Infobox something | thing}}'
                            '{{not-one else}}'
                            '{{infobox again}}'),
                  mwp.parse('{{Infobox num1 | thing}}'
                            '{{not-one else}}'
                            '{{infobox num2}}')]
    result = ifb._transform(wcode_list)
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0], 'infobox-something infobox-again')
    self.assertEqual(result[1], 'infobox-num1 infobox-num2')
def process_page(self, page):
    text = page.get()
    text, blah = self.AWB.do_page(text, date=False)
    code = mwparserfromhell.parse(text)
    urls = []
    for m in urlregex.MATCH_URL.finditer(unicode(code)):
        u = m.group(0)
        if self.matching.search(u):
            urls.append(u)
        else:
            pass
            #print 'Did not match: '+u
    #find ref tags
    loop1 = False
    for tag in re.finditer(r'<ref(.*?)>(.*?)</ref>', unicode(code)):
        for url in urls:
            if url in tag.group(2):
                for template in mwparserfromhell.parse(tag.group(2)).filter_templates():
                    if template.has_param('archiveurl'):
                        try:
                            urls.remove(url)
                        except ValueError:
                            pass
                        loop1 = True
                if loop1:
                    break
                if 'dead link' in tag.group(0).lower():
                    urls.remove(url)
                elif 'wayback' in tag.group(0).lower():
                    urls.remove(url)
                elif 'webcite' in tag.group(0).lower():
                    urls.remove(url)
                else:
                    code = unicode(code).replace(tag.group(0), '<ref' + tag.group(1) + '>' + tag.group(2) + self.tag + '</ref>')
                    urls.remove(url)
        if loop1:
            loop1 = False
            break
    if urls:
        print 'STILL HAVE THESE LEFT: ' + ', '.join(urls)
    pywikibot.showDiff(text, unicode(code))
    if text != unicode(code):
        if self.simulate:
            print 'Not editing, just simulating.'
            return None
        page.put(unicode(code), 'Bot: Tagging %s links with {{dead link}}' % self.domain)
        return True
    else:
        return None
def _test_search(self, meth, expected):
    """Base test for insert_before(), insert_after(), and replace()."""
    code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}")
    func = partial(meth, code)
    func("{{b}}", "x", recursive=True)
    func("{{d}}", "[[y]]", recursive=False)
    func(code.get(2), "z")
    self.assertEqual(expected[0], code)
    self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True)
    self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False)
    fake = parse("{{a}}").get(0)
    self.assertRaises(ValueError, func, fake, "n", recursive=True)
    self.assertRaises(ValueError, func, fake, "n", recursive=False)

    code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}")
    func = partial(meth, code2)
    func(code2.get(1), "c", recursive=False)
    func("{{a}}", "d", recursive=False)
    func(code2.get(-1), "e", recursive=True)
    func("{{b}}", "f", recursive=True)
    self.assertEqual(expected[1], code2)

    code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}")
    func = partial(meth, code3)
    obj = code3.get(0).params[0].value.get(0)
    self.assertRaises(ValueError, func, obj, "x", recursive=False)
    func(obj, "x", recursive=True)
    self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False)
    func("{{f}}", "y", recursive=True)
    self.assertEqual(expected[2], code3)

    code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}")
    func = partial(meth, code4)
    fake = parse("{{b}}{{c}}")
    self.assertRaises(ValueError, func, fake, "q", recursive=False)
    self.assertRaises(ValueError, func, fake, "q", recursive=True)
    func("{{b}}{{c}}", "w", recursive=False)
    func("{{d}}{{e}}", "x", recursive=True)
    func(wrap(code4.nodes[-2:]), "y", recursive=False)
    func(wrap(code4.nodes[-2:]), "z", recursive=True)
    self.assertEqual(expected[3], code4)
    self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False)
    self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True)

    code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}")
    func = partial(meth, code5)
    self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False)
    func("{{b}}{{c}}", "x", recursive=True)
    obj = code5.get(0).params[1].value.get(0).params[0].value
    self.assertRaises(ValueError, func, obj, "y", recursive=False)
    func(obj, "y", recursive=True)
    self.assertEqual(expected[4], code5)

    code6 = parse("here is {{some text and a {{template}}}}")
    func = partial(meth, code6)
    self.assertRaises(ValueError, func, "text and", "ab", recursive=False)
    func("text and", "ab", recursive=True)
    self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False)
    func("is {{some", "cd", recursive=True)
    self.assertEqual(expected[5], code6)
def test_get_ancestors_parent(self):
    """test Wikicode.get_ancestors() and Wikicode.get_parent()"""
    code = parse("{{a|{{b|{{d|{{e}}{{f}}}}{{g}}}}}}{{c}}")
    tmpl = code.filter_templates(matches=lambda n: n.name == "f")[0]
    parent1 = code.filter_templates(matches=lambda n: n.name == "d")[0]
    parent2 = code.filter_templates(matches=lambda n: n.name == "b")[0]
    parent3 = code.filter_templates(matches=lambda n: n.name == "a")[0]
    fake = parse("{{f}}").get(0)

    self.assertEqual([parent3, parent2, parent1], code.get_ancestors(tmpl))
    self.assertIs(parent1, code.get_parent(tmpl))
    self.assertEqual([], code.get_ancestors(parent3))
    self.assertIs(None, code.get_parent(parent3))
    self.assertRaises(ValueError, code.get_ancestors, fake)
    self.assertRaises(ValueError, code.get_parent, fake)
def section_to_request(enumerated_section_tuple):
    enum_number, section_tuple = enumerated_section_tuple
    section_header, section_wikitext = section_tuple
    section = mwparserfromhell.parse(section_wikitext)
    r = Request()
    r.row_number = enum_number + 1
    r.title = section_header
    r.replies = unicode(section).count(u"(UTC)") - 1
    signatures = []
    for index, each_node in enumerate(section.nodes):
        if type(each_node) == mwparserfromhell.nodes.text.Text and "(UTC)" in each_node:
            # Get the last timestamp-looking thing (trick from http://stackoverflow.com/a/2988680/1757964)
            for timestamp_match in TIMESTAMP.finditer(unicode(each_node)):
                pass
            try:
                timestamp = datetime.datetime.strptime(timestamp_match.group(0), SIGNATURE_TIME_FORMAT)
            except ValueError:
                timestamp = "{{unknown}}"
            # Use the last user talk page link before the timestamp
            for user_index in itertools.count(index - 1, -1):
                user = USER.search(unicode(section.get(user_index)))
                if user:
                    user = user.group(1)
                    break
            # Check for user renames/redirects
            user_page = pywikibot.Page(wiki, "User:"******":")[1]
            signatures.append((user, timestamp))
    # Process usernames by removing anchors
    signatures = [(x.partition('#')[0], y) for x, y in signatures]
    # Default values for everything
    r.last_editor, r.last_edit_time = r.last_botop_editor, r.last_botop_time = "{{no result|None}}", "{{n/a}}"
    if signatures:
        r.last_editor, r.last_edit_time = signatures[-1]
        for user, timestamp in reversed(signatures):
            if is_botop(wiki, user):
                r.last_botop_editor, r.last_botop_time = user, timestamp
                break
    return r
def mapper_final(self):
    for entry in self.d:
        links_list = []
        working_text = self.d[entry][1]
        try:
            wikicode = mw.parse(working_text)
            links_list = wikicode.filter_wikilinks()
        except:
            pass
        article_links = 0
        links_title = []
        if links_list != []:
            for link in links_list:
                links_title.append(link.title)
            for h in range(0, len(links_title)):
                links_title[h] = links_title[h].strip().lower()
            title_set = Set(links_title)
            article_links = len(title_set)
        yield "a", article_links
        yield "s", article_links*article_links
        yield "n", 1
    for k in range(0, self.e):
        yield "a", 1
        yield "s", 1
        yield "n", 1
def parse_infobox(self, title, page):
    '''Parse out the nice mediawiki markdown to get birth and death

    Input: mediawiki unicode page string
    Returns: a dictionary with name(string), birth_date:DateTime, death_date:DateTime
    '''
    code = mwparserfromhell.parse(page)
    for template in code.filter_templates():
        if 'Infobox' in template.name:
            # Found the right template -- attempting to extract data
            output = dict(title=title)
            for key in ['name', 'birth_name']:
                if template.has(key):
                    output['name'] = template.get(key).value.strip()
            for date in ['birth_date', 'death_date']:
                try:
                    item = self.parse_date(template.get(date))
                except ValueError as e:
                    item = None
                output[date] = item
            # ok we are done here
            return output
    raise InfoError()
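# --- Usage sketch (not from the source): illustrates the dict shape parse_infobox()
# returns. `parser` stands for an instance of the class defining parse_infobox() and
# parse_date(); the page text below is invented.
sample_page = u"""{{Infobox person
| name       = Ada Lovelace
| birth_date = {{birth date|1815|12|10}}
| death_date = {{death date|1852|11|27}}
}}"""
result = parser.parse_infobox('Ada Lovelace', sample_page)
# Expected shape (actual date values depend on parse_date()):
# {'title': 'Ada Lovelace', 'name': 'Ada Lovelace',
#  'birth_date': <DateTime>, 'death_date': <DateTime>}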
def get_template_info(template_checker, commonscat_mapper, text, monument_id=''):
    if not monument_id:
        return {
            "id_not_found": True,
            "category": get_most_specific_category(commonscat_mapper, text),
            "missing_monument_id": True
        }
    id_count = 0
    info = {}
    templates = mwparserfromhell.parse(text).filter_templates()
    for template in template_checker.filter_allowed_templates(templates):
        if template_checker.get_id(template) != monument_id:
            continue
        if id_count:
            id_count += 1
            continue
        id_count = 1
        info = {
            "template": unicode(template),
            "category": get_most_specific_category(commonscat_mapper, text, template),
            "valid_id": template_checker.has_valid_id(template),
            "image_exists": image_exists(template)
        }
    if info:
        info["duplicate_ids"] = id_count > 1
    else:
        info["id_not_found"] = True
        info["category"] = get_most_specific_category(commonscat_mapper, text)
    return info
def extract_templates_and_params(text):
    """Return a list of templates found in text.

    Return value is a list of tuples. There is one tuple for each use of a
    template in the page, with the template title as the first entry and a
    dict of parameters as the second entry. Parameters are indexed by
    strings; as in MediaWiki, an unnamed parameter is given a parameter name
    with an integer value corresponding to its position among the unnamed
    parameters, and if this results in multiple parameters with the same
    name only the last value provided will be returned.

    This uses a third party library (mwparserfromhell) if it is installed
    and enabled in the user-config.py. Otherwise it falls back on a regex
    based function defined below.

    @param text: The wikitext from which templates are extracted
    @type text: unicode or string
    """
    if not (config.use_mwparserfromhell and mwparserfromhell):
        return extract_templates_and_params_regex(text)
    code = mwparserfromhell.parse(text)
    result = []
    for template in code.filter_templates(recursive=True):
        params = {}
        for param in template.params:
            params[unicode(param.name)] = unicode(param.value)
        result.append((unicode(template.name.strip()), params))
    return result
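# --- Usage sketch (invented wikitext, assuming mwparserfromhell is enabled in
# user-config.py): shows the (title, params-dict) tuples described in the docstring.
sample = "{{Infobox person|name=Ada|1=first}}{{citation needed}}"
print(extract_templates_and_params(sample))
# Expected shape:
# [('Infobox person', {'name': 'Ada', '1': 'first'}),
#  ('citation needed', {})]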
def ensure_flagged_by_template(wikicode, node, template_name, *template_parameters, overwrite_parameters=True):
    """
    Makes sure that ``node`` in ``wikicode`` is immediately (except for
    whitespace) followed by a template with ``template_name`` and optional
    ``template_parameters``.

    :param wikicode: a :py:class:`mwparserfromhell.wikicode.Wikicode` object
    :param node: a :py:class:`mwparserfromhell.nodes.Node` object
    :param str template_name: the name of the template flag
    :param template_parameters: optional template parameters
    :returns: the template flag, as a
        :py:class:`mwparserfromhell.nodes.template.Template` object
    """
    parent = get_parent_wikicode(wikicode, node)
    adjacent = get_adjacent_node(parent, node, ignore_whitespace=True)

    if template_parameters:
        flag = "{{%s}}" % "|".join([template_name, *template_parameters])
    else:
        flag = "{{%s}}" % template_name
    flag = mwparserfromhell.parse(flag).nodes[0]
    assert(isinstance(flag, mwparserfromhell.nodes.Template))

    if isinstance(adjacent, mwparserfromhell.nodes.Template) and adjacent.name.matches(template_name):
        # in case of {{Dead link}} we want to preserve the original parameters
        if overwrite_parameters is True:
            wikicode.replace(adjacent, flag)
        else:
            flag = adjacent
    else:
        wikicode.insert_after(node, flag)

    assert(get_parent_wikicode(wikicode, flag) is parent)
    return flag
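# --- Usage sketch (parameter values invented): the intended effect is flagging a link
# with a template such as {{Dead link}}. Assumes ensure_flagged_by_template() and its
# get_parent_wikicode()/get_adjacent_node() helpers are importable from this module.
import mwparserfromhell

wikicode = mwparserfromhell.parse("* http://example.com/old-page\n")
link = wikicode.filter_external_links()[0]
flag = ensure_flagged_by_template(wikicode, link, "Dead link", "date=May 2024")
# wikicode now reads roughly "* http://example.com/old-page{{Dead link|date=May 2024}}\n";
# calling it again with the same template name either replaces or keeps the existing
# flag, depending on overwrite_parameters.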
def main():
    wikidata = pywikibot.getSite('wikidata', 'wikidata')
    page = pywikibot.Page(wikidata, 'User:Legobot/Dupes')
    text = page.get()
    code = mwparser.parse(text)
    templates = code.filter_templates()
    for template in templates:
        if template.name != 'rfd links':
            continue
        qid = str(template.get(1).value)
        reason = str(template.get(2).value)
        dupe = link.search(reason)
        if not dupe:
            print 'Error: Cannot parse the deletion reason, skipping.'
            continue
        other = pywikibot.Page(wikidata, dupe.group(1))
        if not other.exists():
            print 'Uhoh, the other copy doesn\'t exist. Won\'t delete.'
            continue
        print 'Will delete {0} because: {1}'.format(qid, reason)
        page = pywikibot.Page(wikidata, qid)
        if not page.exists():
            print 'Uhoh, someone already deleted it!'
            continue
        page.delete(reason, prompt=False)
        print 'Destroyed. Will sleep a bit.'
def section_processing(section, tpl_header, categories, articleTitle):
    # tpl_header = copy.deepcopy(tpl_header)
    tom_tpl = section[0]
    content = section[1]
    section_tpl_wikicode = mwp.parse(tom_tpl)
    for tpl in section_tpl_wikicode.filter_templates():
        if tpl.name.matches('tom'):
            num_izd = str(tpl.get(1).value).strip()

            # {{tom}} with no counterpart among the editions, missing the 2nd parameter, or with a 3rd one
            tpls_noTerm = []
            # for tpl in wikicode.filter_templates():
            #     if tpl.name.matches('tom'):
            if not tpl.has(2) or tpl.get(2).value == '' \
                    or (tpl.has(3) and tpl.get(3).value != ''):
                # tpls_noTerm.append(tpl)
                # tpl_header.add(tpls_noTerm)
                tpl_header.add('-ТСД%s' % num_izd, '%s<!-- временный шаблон для бота -->' % str(tpl))
                return None

            title_ed = '%s%s/%s' % (PAGENAME_PREFIX, num_izd, articleTitle)
            # articleName = articleTitle.partition('/')[0]
            tpl_header.add('ТСД%s' % num_izd, articleTitle.partition('/')[0])
            # page_new = makePage(num_izd, title_ed, tpl_header, content, categories)
            page_new = [num_izd, title_ed, tpl_header, content]
            return page_new
def extract_plain_text(wiki_body):
    wikicode = mwparserfromhell.parse(wiki_body)
    plain_text = ""
    for node in wikicode.nodes:
        type_of_node = type(node)
        if type_of_node == mwparserfromhell.nodes.template.Template:
            continue
        if type_of_node == mwparserfromhell.nodes.argument.Argument:
            continue
        if type_of_node == mwparserfromhell.nodes.comment.Comment:
            continue
        if type_of_node == mwparserfromhell.nodes.html_entity.HTMLEntity:
            continue
        if type_of_node != mwparserfromhell.nodes.text.Text:
            if type(node) == mwparserfromhell.nodes.tag.Tag:
                str_node = node.contents
            elif type(node) == mwparserfromhell.nodes.external_link.ExternalLink:
                str_node = node.title
            elif type(node) == mwparserfromhell.nodes.heading.Heading:
                str_node = node.title
            elif type(node) == mwparserfromhell.nodes.wikilink.Wikilink:
                str_node = node.title
            plain_text += extract_plain_text(str_node)
        else:
            plain_text += str(node)
    return re.sub(r'\([^)]*\)', '', plain_text)
def get_assessments(self, rev_content):
    '''
    For the given revision content, get all assessments.

    @param rev_content: wikitext content of the given talk page revision we're assessing
    @type rev_content: unicode
    '''
    parsed_code = mwp.parse(rev_content)
    templates = parsed_code.filter_templates()
    assessments = []
    for temp in templates:
        if re.match('wikiproject\s+', unicode(temp.name), re.I) \
                or unicode(temp.name) in self.translations \
                or temp.has_param('class'):
            project = unicode(temp.name).lower()
            try:
                rating = unicode(temp.get('class').value).strip().lower()
            except ValueError:
                continue  # no assessment class in template
            importance = None
            if temp.has_param('importance'):
                importance = unicode(temp.get('importance').value).strip().lower()
            assessments.append(Assessment(rating, importance, project))
    # return all assessments
    return assessments
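# --- Illustration only: the sort of talk-page banner wikitext get_assessments() digests.
# `assessor` stands for an instance of the class above with translations configured;
# the banner text is invented.
rev = ("{{WikiProject Military history|class=B|importance=high}}\n"
       "{{WikiProject Biography|class=B}}")
for a in assessor.get_assessments(rev):
    # each Assessment should carry (rating, importance, project),
    # e.g. ('b', 'high', 'wikiproject military history')
    print(a)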
def add_template(self):
    if not self.adtTitle:
        return  # silently fail
    adtPage = pywikibot.Page(self.site, self.adtTitle, ns=1)
    code = mwparserfromhell.parse(adtPage.text)
    war_adt_added = False
    for template in code.filter_templates(recursive=False):
        if template.name.matches("AdT-Vorschlag Hinweis"):
            code.remove(template)
            pywikibot.output(u'D:AdT: {{AdT-Vorschlag Hinweis}} gefunden,'
                             u'entfernt')
        if template.name.matches("War AdT"):
            if not any(self.snapDate in p for p in template.params):
                template.add(str(len(template.params)+1), self.snapDate)
                pywikibot.output(u'D:AdT: {{War AdT}} '
                                 u'gefunden, füge heute hinzu')
                war_adt_added = True
    text = unicode(code)
    if not war_adt_added:
        template = u'{{War AdT|1=' + self.snapDate + u'}}\n'
        text = self.__add_templ(text, template)
    if adtPage.text != text:
        pywikibot.showDiff(adtPage.text, text)  # debug
        adtPage.text = text
        if not self.dry:
            adtPage.save(comment=templateComment, botflag=True, minor=True)
def test_index(self):
    """test Wikicode.index()"""
    code = parse("Have a {{template}} and a [[page|link]]")
    self.assertEqual(0, code.index("Have a "))
    self.assertEqual(3, code.index("[[page|link]]"))
    self.assertEqual(1, code.index(code.get(1)))
    self.assertRaises(ValueError, code.index, "foo")

    code = parse("{{foo}}{{bar|{{baz}}}}")
    self.assertEqual(1, code.index("{{bar|{{baz}}}}"))
    self.assertEqual(1, code.index("{{baz}}", recursive=True))
    self.assertEqual(1, code.index(code.get(1).get(1).value, recursive=True))
    self.assertRaises(ValueError, code.index, "{{baz}}", recursive=False)
    self.assertRaises(ValueError, code.index, code.get(1).get(1).value, recursive=False)
def test_readme_4(self):
    """test a block of example code in the README"""
    text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}"
    code = mwparserfromhell.parse(text)
    for template in code.filter_templates():
        if template.name.matches("Cleanup") and not template.has("date"):
            template.add("date", "July 2012")
    res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{uncategorized}}"
    self.assertPrint(code, res)
    code.replace("{{uncategorized}}", "{{bar-stub}}")
    res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
    self.assertPrint(code, res)
    if py3k:
        res = "['{{cleanup|date=July 2012}}', '{{bar-stub}}']"
    else:
        res = "[u'{{cleanup|date=July 2012}}', u'{{bar-stub}}']"
    self.assertPrint(code.filter_templates(), res)
    text = str(code)
    res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. {{bar-stub}}"
    self.assertPrint(text, res)
    self.assertEqual(text, code)
def process_wikipedia_article(title, text, template='Infobox film'):
    wikipedia_article_parser = mwparserfromhell.parse(text)
    # Determine whether the article is about a movie or not based on whether
    # it includes the film infobox.
    movie_infobox_matches = wikipedia_article_parser.filter_templates(matches=template)
    if len(movie_infobox_matches) >= 1:
        # Extract information from infobox.
        # Don't actually need, but keep in case it's useful later.
        # properties = {param.name.strip_code().strip(): param.value.strip_code().strip()
        #               for param in movie_infobox_matches[0].params
        #               if param.value.strip_code().strip()}

        # Extract internal wikilinks.
        internal_links = [
            link.title.strip_code().strip()
            for link in wikipedia_article_parser.filter_wikilinks()
        ]
        internal_links = filter_out_most_common_links(internal_links)
        return {'title': title, 'internal_links': internal_links}
def mapper_get_page(self, _, line):
    if '<page>' in line:
        self.page_single = []
        self.page_status = 1
    if self.page_status == 1:
        self.page_single.append(line)
    if '</page>' in line:
        #self.page_single.append(line)
        if self.page_status == 1:
            page = ''.join(self.page_single)
            root = etree.XML(page)
            content = root.xpath("//revision")[-1].xpath(".//text")[0].text
            self.page_status = 0
            if content:
                content = mwparserfromhell.parse(content).strip_code()
                yield None, content
        else:
            self.page_status = 0
            self.page_single = []
def keep_only_includes(
        wikicode: mwparserfromhell.wikicode.Wikicode
) -> mwparserfromhell.wikicode.Wikicode:
    """Keeps only the onlyinclude tags, if any"""
    only_include_present = False
    to_remove = list()
    for tag in wikicode.filter_tags(recursive=False):  # select only the most external one
        if tag.tag.matches('onlyinclude'):
            only_include_present = True
        else:
            to_remove.append(tag)
    if only_include_present:
        for tag in to_remove:
            try:
                wikicode.remove(tag)
            except ValueError:
                pass
        wikicode = mwparserfromhell.parse(
            re.sub(onlyinclude_tag, '', str(wikicode)))
    return wikicode
def process_text(text):
    parsed = mwparserfromhell.parse(text)
    titles = {}
    editions = []
    for tag in parsed.filter():
        if isinstance(tag, mwparserfromhell.nodes.heading.Heading):
            titles[tag.level] = tag.title
        elif isinstance(tag, mwparserfromhell.nodes.wikilink.Wikilink):
            if link_is_pdf(tag) and titles.get(2) == 'Music files':
                editions[-1].add_pdf(tag)
        elif isinstance(tag, mwparserfromhell.nodes.template.Template):
            if tag.name == 'CPDLno':
                editions.append(Edition())
                editions[-1].cpdlno_params = strparams(tag)
            elif tag.name == 'Editor':
                editions[-1].editor_params = strparams(tag)
            elif tag.name == 'ScoreInfo':
                editions[-1].score_info = strparams(tag)
    return editions
def __init__(self, element: Element):
    self.ignored = False
    for child in element.getchildren():
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}redirect":
            self.ignored = True
            return
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}ns" and child.text != "0":
            self.ignored = True
            return
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}title":
            self.title = child.text
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}id":
            self.id = child.text
        if child.tag == "{http://www.mediawiki.org/xml/export-0.10/}revision":
            for revision in child.getchildren():
                if revision.tag == "{http://www.mediawiki.org/xml/export-0.10/}text":
                    self.body = mwparserfromhell.parse(revision.text).strip_code()
                    break
    element.clear()
def mapper_final(self):
    for entry in self.d:
        working_text = self.d[entry][1]
        wikicode = mw.parse(working_text)
        links_list = wikicode.filter_wikilinks()
        article_links = 0
        links_title = []
        if links_list != []:
            for link in links_list:
                links_title.append(link.title)
            for h in range(0, len(links_title)):
                links_title[h] = links_title[h].strip().lower()
            title_set = Set(links_title)
            article_links = len(title_set)
        yield "a", article_links
        yield "s", article_links * article_links
        yield "n", 1
    for k in range(0, self.e):
        yield "a", 1
        yield "s", 1
        yield "n", 1
def make_new_wikicode(text, form_data, page_name):
    wikicode = mwparserfromhell.parse(text)
    change_made = False
    for template in wikicode.filter_templates():
        edit = main.TemplateEdit(template, page_name)
        if edit.classification == 'ignored' or edit.classification == 'rejected':
            continue
        proposed_addition = form_data.get(edit.orig_hash)
        user_checked = form_data.get(edit.orig_hash + '-addlink')
        if proposed_addition and user_checked == 'checked':
            # Go through one or more suggestions separated by pipe
            for proposed_parameter in proposed_addition.split("|"):
                try:
                    # Get the new wikitext for the template with this parameter added
                    edit.update_template(proposed_parameter)
                    change_made = True
                except ValueError:
                    app.logger.exception('update_template failed on {}'.format(page_name))
                    pass  # TODO report to the user
    return unicode(wikicode), change_made
def get_bill_for_page(page):
    for template in mwparserfromhell.parse(page["text"]).filter_templates():
        if template.name.strip() == "Infobox U.S. legislation":
            #print page["title"].encode("utf8")
            billref = get_bill_from_infobox(template)
            if billref:
                try:
                    if billref[0] == "PL":
                        # Get by public law number.
                        return Bill.objects.get(congress=billref[1], sliplawpubpriv="PUB", sliplawnum=billref[2])
                    elif billref[0] == "BILL":
                        # It's a bill number.
                        return Bill.objects.get(congress=billref[1], bill_type=BillType.by_slug(billref[2]), number=billref[3])
                except Bill.DoesNotExist:
                    return None
    return None
def expand_templates(defn, keep_lb=True, rm_gram=False):
    from mwparserfromhell import parse
    from wikiparse.assoc.identispan import identispan_text_rm

    wikicode = parse(defn)
    for t in wikicode.filter_templates(recursive=False):
        if t.name in ("l", "link"):
            wikicode.replace(t, "[[{}]]".format(expand_templates(t.get(2))))
        elif t.name in ("lb",) and keep_lb:
            wikicode.replace(t, "({})".format(expand_templates(t.get(2))))
        elif t.name in ("gloss", "qualifier"):
            wikicode.replace(t, "({})".format(expand_templates(t.get(1))))
        else:
            wikicode.remove(t)
    defn = str(wikicode)
    if rm_gram:
        defn = identispan_text_rm(defn)
    return defn
def parse_proposal_page(self, page_name):
    """
    Parses a proposal page to extract metadata about the property to create.

    :param page_name: the name of the proposal page
    """
    self.page_name = page_name
    text = self.get_page_over_api(PROPERTY_PROPOSAL_PREFIX + page_name)
    wikicode = mwparserfromhell.parse(cleanup_text(text.encode('utf-8')))
    for node in wikicode.filter(forcetype=(Template, Heading)):
        if isinstance(node, Heading):
            self.latest_labels = self.parse_translatable(node.title)
        elif isinstance(node, Template):
            template = node
            if (unicode(template.name).strip() == 'Property proposal' and
                    template.get('status').value.strip() == 'ready'):
                self.parse_proposal_template(template)
                self.users = self.extract_users(wikicode)
                break
    self.orig_wikicode = wikicode
def location_from_grid(grid, e, n, digits, view_direction, use6fig, mapit=None):
    latstr, lonstr, prec = latlon_from_grid(grid, e, n, digits, use6fig)
    precstr = "{:g}".format(prec)
    paramstr = "source:" + source_from_grid(grid, e, n, digits)
    region = region_of(grid, e, n, latstr, lonstr, mapit)
    if region != None:
        paramstr += "_region:{}".format(region)
    if view_direction != None:
        paramstr += "_heading:{}".format(view_direction)
    t = Template(mwparserfromhell.parse('Location'))
    t.add(1, latstr)
    t.add(2, lonstr)
    t.add(3, paramstr)
    t.add('prec', precstr)
    return t
def update_and_save(self, page, lookup):
    text = page.text()
    wikitext = mwparserfromhell.parse(text)
    for template in wikitext.filter_templates():
        if template.name.matches(['Listplayer/Current']):
            player = template.get('1').value.strip()
            if player not in lookup:
                template.add('squad', '')
                continue
            template.add('squad', lookup[player])
    newtext = str(wikitext)
    if text != newtext:
        # print('Saving page %s...' % page.name)
        try:
            self.site.save(page, newtext, summary=self.SUMMARY)
        except EditError:
            self.site.log_error_content(page.name, 'Spam filter prohibited squad point update')
    else:
        pass
def handle_existing_page(player_page, player_name):
    """
    :type player_page: pywikibot.page.Page
    :type player_name: str
    """
    parsed_mw_text = mwparserfromhell.parse(player_page.text)
    football_player_template = parsed_mw_text.filter_templates(player_template_name)[0]
    arguments = __get_football_player_template(player_name)
    for argument_name, argument_value in arguments.items():
        if str(argument_value) != football_player_template.get(argument_name).value and SHOULD_SHOW_DIFF:
            logger.info("Found diff between arguments on this argument_name: {arg_name}\n"
                        "existing value: {existing_value}\nnew_value: {new_value}".
                        format(arg_name=argument_name,
                               existing_value=football_player_template.get(argument_name).value,
                               new_value=argument_value))
        football_player_template.add(argument_name, argument_value)
    player_page.text = parsed_mw_text
def process_article(self, title, text, template='Infobox person'):
    """Process a wikipedia article looking for template"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)
    raw_year_string = 'EMPTY'
    birth_year = 'EMPTY'
    infobox = ''
    if len(matches) >= 1:
        # Extract information from infobox
        for match in matches:
            infobox = str(match)
            for param in match.params:
                if param.name.strip_code().strip() == 'birth_date':
                    raw_year_string = str(param.value)
                    birth_year = self.get_birth_year(raw_year_string)
    summary = self.get_summary(wikicode.strip_code().strip())
    return (title, birth_year, summary, raw_year_string, infobox)
def mapper(self, _, line):
    try:
        self._chunk += line.strip()
        if re.search(r"</page>", line):
            text = ''
            self._slurping = False
            root = etree.fromstring(self._chunk, parser)
            texts = root and root.xpath('//text')
            if texts:
                text = texts[0].text
            if text:
                lset = set()
                mwp = mwparserfromhell.parse(text)
                links = mwp.filter_wikilinks()
                for link in links:
                    match = parselink.search(unicode(link))
                    lset.add(match.groups()[0])
                yield None, len(lset)
            self._chunk = ''
    except:
        self._chunk = ''
def test_readme_5(self):
    """test a block of example code in the README; includes a web call"""
    url1 = "https://en.wikipedia.org/w/api.php"
    url2 = "https://en.wikipedia.org/w/index.php?title={0}&action=raw"
    title = "Test"
    data = {"action": "query", "prop": "revisions", "rvprop": "content",
            "rvslots": "main", "rvlimit": 1, "titles": title,
            "format": "json", "formatversion": "2"}
    try:
        raw = urlopen(url1, urlencode(data).encode("utf8")).read()
    except IOError:
        self.skipTest("cannot continue because of unsuccessful web call")
    res = json.loads(raw.decode("utf8"))
    revision = res["query"]["pages"][0]["revisions"][0]
    text = revision["slots"]["main"]["content"]
    try:
        expected = urlopen(url2.format(title)).read().decode("utf8")
    except IOError:
        self.skipTest("cannot continue because of unsuccessful web call")
    actual = mwparserfromhell.parse(text)
    self.assertEqual(expected, actual)
def count_in_article(article_name):
    soup = bsoup(
        requests.get('https://en.wikipedia.org/wiki/Special:Export/' + article_name).text,
        'lxml')
    pagetext = soup.find('text').text
    wikilinks = mwph.parse(pagetext).filter_wikilinks()
    for wl in wikilinks:
        if wl[:7].lower() == '[[file:' or wl[:11].lower() == '[[category:':
            pagetext = pagetext.replace(str(wl), '')
    for wl in wikilinks:
        if wl[:7].lower() != '[[file:' and wl[:11].lower() != '[[category:':
            pagetext = pagetext.replace(str(wl), str(wl)[2:-2])
    wikitemplates = mwph.parse(pagetext).filter_templates()
    for wt in wikitemplates:
        pagetext = pagetext.replace(str(wt), '')
    comments = mwph.parse(pagetext).filter_comments()
    for comment in comments:
        pagetext = pagetext.replace(str(comment), '')
    external_links = mwph.parse(pagetext).filter_external_links()
    for ex_l in external_links:
        pagetext = pagetext.replace(str(ex_l), '')
    headings = mwph.parse(pagetext).filter_headings()
    for heading in headings:
        pagetext = pagetext.replace(str(heading), str(heading).strip('='))
    html_entities = mwph.parse(pagetext).filter_html_entities()
    for h_ent in html_entities:
        pagetext = pagetext.replace(str(h_ent), '')
    pagetext = filter_elements(pagetext, '{| class="wikitable', '|}')
    pagetext = filter_elements(pagetext, '{| class="infobox', '|}')
    pagetext = filter_elements(pagetext, '{| class="floatright', '|}')
    pagetext = filter_elements(pagetext, '{{cite', '}}')
    pagetext = filter_elements(pagetext, '<', '>')
    count_dict = {
        'wikilinks': len(wikilinks),
        'words': len(remove_punctuation(nltk.word_tokenize(pagetext))),
        'sentences': len(nltk.sent_tokenize(pagetext))
    }
    return count_dict
def run(self):
    while True:
        page, qid = self.queue.get()
        code = mwparser.parse(page.get())
        found = False
        template = ''  # to make pycharm shut up
        for template in code.filter_templates():
            if template.name.lower().strip() in redirects:
                found = True
                break
        if not found:
            print 'Could not find template on ' + page.title()
            return
        data = dictify(template)
        d = list()
        for nm in data:
            if not data[nm] or len(data[nm]) > 250:
                continue
            d.append((None, page.title(), deqid(qid), nm, data[nm]))
        parsed.put(d)
        self.queue.task_done()
def allow_bots(text, user):
    user = user.lower().strip()
    text = mwparserfromhell.parse(text)
    for tl in text.filter_templates():
        if tl.name in ('bots', 'nobots'):
            break
    else:
        return True
    for param in tl.params:
        bots = [x.lower().strip() for x in param.value.split(",")]
        if param.name == 'allow':
            if ''.join(bots) == 'none':
                return False
            for bot in bots:
                if bot in (user, 'all'):
                    return True
        elif param.name == 'deny':
            if ''.join(bots) == 'none':
                return True
            for bot in bots:
                if bot in (user, 'all'):
                    return False
    return True
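# --- Usage sketch for allow_bots(), which implements the standard {{bots}}/{{nobots}}
# exclusion-compliance check; the page text below is invented.
page_text = "{{bots|deny=SpamBot,OtherBot}}\nSome article text."
print(allow_bots(page_text, "SpamBot"))  # False: explicitly denied
print(allow_bots(page_text, "GoodBot"))  # True: not listed under deny=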
def fix_text(text):
    code = mwparserfromhell.parse(text)
    newtext = ''
    for index, x in enumerate(code.nodes):
        flag = False
        #print repr(x)
        if index != 0 and isinstance(x, mwparserfromhell.nodes.Text):
            if x.endswith('\n\n'):
                if isinstance(code.nodes[index - 1], mwparserfromhell.nodes.Tag) and str(code.nodes[index - 1]) == '*':
                    if len(code.nodes) >= index + 1:
                        if isinstance(code.nodes[index + 1], mwparserfromhell.nodes.Tag) and str(code.nodes[index + 1]) == '*':
                            #print 'trimming'
                            flag = True
                            newtext += unicode(x)[:-1]
        if not flag:
            newtext += unicode(x)
    return newtext
def WIR_member_parse_wikilinks(self, page):
    set_members = set()
    try:
        query = self.url_page + page
        response = requests.get(query).json()
        pages = response['query']['pages']
        for page in pages:
            page_text = pages[page]['revisions'][0]['*']
            wikicode = mwp.parse(page_text)
            for link in wikicode.filter_wikilinks():
                # the middle of this condition was redacted in the source;
                # reconstructed from the surrounding replace() calls
                if link.startswith("[[User:"):
                    user_text = link.replace("[[User:", "").replace("]]", "")
                    set_members.add(user_text)
    except Exception:
        print("Error when parsing WIR pages")
    print("Identified {} members from the page: {}.".format(len(set_members), page))
    return set_members
def process_template(self, template):
    for page in template.getReferences(onlyTemplateInclusion=True, namespaces=0):
        try:
            text = page.get()
        except pywikibot.Error:
            continue
        else:
            code = mwparserfromhell.parse(text)
            for t in code.ifilter_templates():
                if t.name.lower().strip() == self.singles_template.title(withNamespace=False).lower():
                    for p in t.params:
                        if "date" in p.name:
                            for t2 in p.value.ifilter_templates():
                                if t2.name.lower().strip() in self.start_date_template_titles:
                                    date = self._get_date(t2, False)
                                    if date is not None:
                                        p.value.replace(t2, date)
                elif t.name.lower().strip() == self.episode_list_template.title(withNamespace=False).lower():
                    if t.has_param("AltDate"):
                        for t2 in t.get("AltDate").value.ifilter_templates():
                            if t2.name.lower().strip() in self.start_date_template_titles:
                                date = self._get_date(t2)
                                if date is not None:
                                    t.get("AltDate").value.replace(t2, date)
            if text != code:
                try:
                    page.put(code, "[[Wikipedia:Bots|Bot]]: Replacing {{[[Template:Start date|start date]]}} with the actual date"
                                   " (it should only be used once in a template that emits microformats;"
                                   " see [[Template:Start date/doc]])")
                except pywikibot.Error:
                    continue
def parse_person(rec):
    wikitext = rec['wikitext']
    parsed = mwparserfromhell.parse(wikitext)
    words = [w.lower() for w in WORD_RE.findall(parsed.strip_code())]
    word_count = len(words)
    word_counts = Counter(words)
    gender_words = {w: word_counts[w] for w in ('him', 'his', 'he', 'her', 'she')}
    res = {}
    for template in parsed.filter_templates():
        if template.name.lower().startswith('infobox'):
            for param in template.params:
                res[param.name.strip().lower()] = param.value
    wikilinks = [str(x.title) for x in parsed.filter_wikilinks()]
    locations = []
    for k in 'birth_place', 'death_place':
        if k in res:
            locations += [str(x.title) for x in res[k].filter_wikilinks()]
    born = None
    died = None
    for wl in parsed.filter_wikilinks():
        title = str(wl.title)
        if title.startswith(CAT_PREFIX):
            if title.endswith(BIRTH_POSTFIX):
                born = tolerant_int(title[len(CAT_PREFIX): -len(BIRTH_POSTFIX)])
            if title.endswith(DIED_POSTFIX):
                died = tolerant_int(title[len(CAT_PREFIX): -len(DIED_POSTFIX)])
    return {'person_name': rec['title'],
            'wiki_id': rec['wiki_id'],
            'infobox': rec['infobox'],
            'locations': locations,
            'word_count': word_count,
            'gender_words': gender_words,
            'view_count': rec['viewcount'],
            'wikilinks': wikilinks,
            'born': born,
            'died': died}
def run(self):
    where_condition = ' OR '.join(['MSG.{} IS NOT NULL'.format(_) for _ in self.vod_params])
    vod_options = ['MSG.{}'.format(_) for _ in self.vod_params]
    fields = [
        'COALESCE({})=Vod'.format(', '.join(vod_options)),
        'MSG._pageName=MSGPage',
        'SG._pageName=SBPage',
        'SG.N_MatchInPage=N_MatchInPage',
        'SG.N_GameInMatch=N_GameInMatch'
    ]
    result = self.site.cargo_client.query(
        tables="MatchScheduleGame=MSG,ScoreboardGames=SG",
        join_on="MSG.GameId=SG.GameId",
        where=f"(SG.VOD IS NULL AND SG._pageName IS NOT NULL AND ({where_condition}))"
              f" OR (SG.VOD != COALESCE(MSG.Vod, MSG.VodPB, MSG.VodGameStart, MSG.VodPostgame))",
        fields=', '.join(fields),
        order_by='SG._pageName, SG.N_MatchInPage',  # this is just to group same pages consecutively
    )
    current_page = {
        'page': None,
        'wikitext': None,
        'page_name': None,
    }
    for item in result:
        if current_page['page_name'] != item['SBPage']:
            if current_page['page'] is not None:
                self.save_page(current_page)
            current_page['page_name'] = item['SBPage']
            current_page['page'] = self.site.client.pages[current_page['page_name']]
            current_page['wikitext'] = mwparserfromhell.parse(current_page['page'].text())
            # print('Discovered page {}'.format(current_page['page_name']))
        self.add_vod_to_page(item, current_page['wikitext'])
    # we need to catch the last iteration too (assuming we actually did anything)
    if current_page['page'] is not None:
        self.save_page(current_page)
def do_page(self, page):
    #print page.title(asLink=True).encode('utf-8')
    if page.namespace() != 6:
        return
    text = page.get()
    text, gen_fix_summary = self.AWBGenFixes.do_page(text)
    code = mwparserfromhell.parse(text)
    tag = False
    log = '* '
    summary = 'Bot: Updating license tag(s) with image has rationale=yes (errors? [[User:Legobot/Stop/22|stop me]])'
    for template in code.filter_templates(recursive=True):
        name = pywikibot.removeDisabledParts(template.name.lower()).strip()
        #print self.NFURs
        #time.sleep(5)
        if name in self.NFURs:
            tag = True
    if tag:
        for template in code.filter_templates(recursive=True):
            name = pywikibot.removeDisabledParts(template.name.lower()).strip()
            if name in self.licenses:
                template.add('image has rationale', 'yes')
                log += '[[:%s]]: Adding <code>|image has rationale=yes</code>' % page.title()
    else:
        #print 'Skipping '+page.title(asLink=True).encode('utf-8')
        return
    #if gen_fix_summary:
    #    summary += ', also dating ' + gen_fix_summary
    puttext = unicode(code).lstrip('\n')
    pywikibot.showDiff(text, puttext)
    self.output(log)
    self.check_page()
    try:
        page.text = puttext
        page.save(summary, async=True, nocreate=True)
    except pywikibot.exceptions.PageNotSaved:
        pass
    except pywikibot.exceptions.LockedPage:
        pass
class test_parented_ifilter:
    wikicode = mwparserfromhell.parse("""<span>
foo {{bar|some text and {{another|template}}}}
</span>
{{foo|bar}}
""")

    def test_recursive(self):
        nodes = []
        for parent, node in parented_ifilter(self.wikicode, recursive=True):
            nodes.append(node)
            assert parent.index(node) >= 0
        assert nodes == self.wikicode.filter(recursive=True)

    def test_nonrecursive(self):
        nodes = []
        for parent, node in parented_ifilter(self.wikicode, recursive=False):
            nodes.append(node)
            assert parent.index(node) >= 0
        assert nodes == self.wikicode.filter(recursive=False)

    def test_recursive_templates(self):
        templates = []
        for parent, template in parented_ifilter(self.wikicode,
                                                 forcetype=mwparserfromhell.nodes.template.Template,
                                                 recursive=True):
            templates.append(template)
            assert parent.index(template) >= 0
        assert templates == self.wikicode.filter_templates(recursive=True)

    def test_nonrecursive_templates(self):
        templates = []
        for parent, template in parented_ifilter(self.wikicode,
                                                 forcetype=mwparserfromhell.nodes.template.Template,
                                                 recursive=False):
            templates.append(template)
            assert parent.index(template) >= 0
        assert templates == self.wikicode.filter_templates(recursive=False)
def run(self):
    matches = self.parser.run()
    i = 0
    match = matches[i]
    match: Match
    cur_page = None  # trailing index for printing at the end
    for page in self.data_pages:
        cur_page = page
        text = page.text()
        wikitext = mwparserfromhell.parse(text)
        for template in wikitext.filter_templates():
            template: Template
            if template.name.matches('MatchSchedule'):
                # allow for the possibility of partially updating an event
                # that starts in the latter half of a toornament scrape, e.g. playoffs
                # n.b. we can only do this if we added correct page and n_in_page tagging
                # when we first created the event
                if template.has('page', ignore_empty=True) and \
                        template.has('n_in_page', ignore_empty=True):
                    while match.page < int(template.get('page').value.strip()) \
                            or match.index_in_page < int(template.get('n_in_page').value.strip()):
                        i += 1
                        if i >= len(matches):
                            break
                        match = matches[i]
                team1 = template.get('team1').value.strip()
                team2 = template.get('team2').value.strip()
                # TODO: some team validation? however remember there can be disambiguation
                # TODO: so parse out anything in () when doing validation
                if match.completed:
                    match.merge_into(template)
                # do a normal increment here
                # this is necessary for legacy behavior in case the indices in_page etc aren't defined
                i += 1
                if i >= len(matches):
                    break
                match = matches[i]
        self.site.save(page, str(wikitext), summary=self.summary)
    return 'https://lol.gamepedia.com/' + cur_page.name.replace(' ', '_')
def actuallista(pllista, diccipa, pagprova=False):
    resultat = u""
    origen = pllista.title()
    text = pllista.get()
    text0 = text
    code = mwparserfromhell.parse(text)
    t = code.filter_templates()
    #print(t)
    for template in t:
        #print (template.name)
        if template.name.matches(("Filera IPA")):
            if template.has("wikidata"):
                wd = template.get("wikidata").value.strip()
                wd = re.sub("<!-- no ?[Ww][Dd] ?auto -->", "", wd)
                #print(wd)
            else:
                wd = ""
            if wd == "" and template.has("nomcoor"):
                nombusca = template.get("nomcoor").value.strip()
                nombusca = nombusca.split("(")[0].strip()
                print("Per", template.get("nomcoor").value.strip(), "busquem nom:", nombusca)
                if nombusca in diccipa.keys():
                    print(diccipa[nombusca])
                    wdposar = diccipa[nombusca]
                    #print(wdposar)
                    template.add("wikidata", wdposar)
                else:
                    print("Inexistent")
    text = code
    if text != text0:
        print("Desant", pllista)
        pllista.put(text, u"Robot actualitza el paràmetre wikidata a partir dels noms dels monuments")
    else:
        print("Cap canvi")
    return ()
def process_article(title, text, timestamp, template='Infobox film'):
    """Process a wikipedia article looking for template"""
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    # Search through templates for the template
    matches = wikicode.filter_templates(matches=template)
    # Filter out errant matches
    matches = [
        x for x in matches
        if x.name.strip_code().strip().lower() == template.lower()
    ]
    if len(matches) >= 1:
        # template_name = matches[0].name.strip_code().strip()
        # Extract information from infobox
        properties = {
            param.name.strip_code().strip(): param.value.strip_code().strip()
            for param in matches[0].params
            if param.value.strip_code().strip()
        }
        # Extract internal wikilinks
        wikilinks = [
            x.title.strip_code().strip()
            for x in wikicode.filter_wikilinks()
        ]
        # Extract external links
        exlinks = [
            x.url.strip_code().strip()
            for x in wikicode.filter_external_links()
        ]
        # Find approximate length of article
        text_length = len(wikicode.strip_code().strip())
        return (title, properties, wikilinks, exlinks, timestamp, text_length)