def testPullReferences(self):
    """Pulled references are returned and no longer show up in the rendering."""
    source = 'Something <ref name="foo">Something where</ref>over'
    parser = WikiMarkup(source)

    # pull=True is expected to hand back the <ref> elements found in the markup.
    pulled = parser.find_references(pull = True)
    self.assertEqual(pulled, ['<ref name="foo">Something where</ref>'])

    # After pulling, the rendered paragraph must not contain the reference text.
    self.assertEqual('<p>Something over</p>', parser.render())
def testFindReferences(self):
    """References found without ``pull`` stay in the rendered output."""
    markup = 'Something <ref name="foo">Something where</ref>over'
    p = WikiMarkup(markup)

    ref = p.find_references()
    self.assertEqual(ref, ['<ref name="foo">Something where</ref>'])

    got = p.render()
    # assertIn replaces the deprecated ``assert_`` alias (removed in
    # Python 3.12) and reports both operands when the check fails.
    self.assertIn("Something where", got)
def formatblob(text, filename=None, language=None):
    """Render ``text`` with a renderer chosen from ``filename``'s extension.

    Dispatch order:

    * no filename: ``pygmentize(text, filename, language)``
    * names ending in ``.rs`` or ``.txt``: ``restructure(text)``
    * names ending in ``.mw``: rendered through ``WikiMarkup`` with ``.mw``
      appended to links, and the result decoded from UTF-8
    * anything else: ``pygmentize(text, filename, language)``

    :param text: content of the blob to render
    :param filename: name of the blob, or ``None`` if unknown
    :param language: passed through unchanged to ``pygmentize``
    :returns: whatever the selected renderer returns
    """
    if filename is None:
        # The original passed an undefined name ``code`` here, which raised
        # NameError whenever no filename was supplied.
        return pygmentize(text, filename, language)

    # NOTE(review): '.rs' may have been meant as '.rst' -- confirm with the
    # callers before changing; the extension list is kept as it was.
    if filename.endswith(('.rs', '.txt')):
        return restructure(text)

    if filename.endswith('.mw'):
        wm = WikiMarkup(text)
        wm.set_link_postfix('.mw')
        rendered = wm.render()
        return rendered.decode('utf-8')

    return pygmentize(text, filename, language)
def main():
    """Analyse a MediaWiki dump read from stdin and write plots and diffs.

    For every page yielded by ``MediawikiDump(sys.stdin).iterpages()`` the
    revisions are grouped by calendar year, rendered with ``WikiMarkup``,
    cleaned with ``clean_html``, tokenised with ``WORD_RE`` and stemmed; the
    stems are collected per year in ``corpora``.

    Files written to ``./out`` (created if missing):

    * ``pages.png``  -- cumulative page count and a histogram of revisions
    * ``tokens.png`` -- number of distinct stems per year
    * ``YYYY-YYYY.diff`` -- stems removed (``-``) and added (``+``) between
      each pair of consecutive years

    Raises ``IndexError`` if the dump contains no revisions at all.
    """
    now = datetime.now()
    page_dates = []
    revision_dates = []
    corpora = defaultdict(set)
    stemmer = GermanStemmer()

    for _page, revisions in MediawikiDump(sys.stdin).iterpages():
        # NOTE(review): assumes revisions[0] is the oldest revision -- confirm
        # against MediawikiDump.
        first_timestamp = revisions[0]['timestamp']
        page_dates.append(first_timestamp)

        stems = set()
        for year in xrange(first_timestamp.year, now.year + 1):
            revisions_in_year = [r for r in revisions
                                 if r['timestamp'].year == year]
            revision_dates.extend(r['timestamp'] for r in revisions_in_year)
            if revisions_in_year:
                # The page changed this year: rebuild its stems from scratch.
                stems = set()
                for revision in revisions_in_year:
                    html = WikiMarkup(revision['text'].encode('utf-8')).render()
                    text = clean_html(html.decode('utf-8'))
                    # TODO: remove remaining markup
                    words = WORD_RE.findall(text)
                    stems.update(stemmer.stem(word) for word in words)
            # A year without revisions reuses the stems of the latest earlier
            # year in which the page was edited.
            corpora[year].update(stems)

    page_dates.sort()
    revision_dates.sort()

    delta = relativedelta(revision_dates[-1], revision_dates[0])
    # One histogram bin per month; clamp to 1 because a bin count of 0
    # (all revisions within a single month) is rejected by hist().
    months = max(delta.years * 12 + delta.months, 1)

    outdir = os.path.abspath('./out')
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Plot 1: page growth and revision activity.
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(page_dates, range(1, len(page_dates)+1), '-')
    ax.hist(date2num(revision_dates), months, histtype='step')
    ax.set_xlabel(u'Year')
    ax.legend([u'Total No. of Pages', u'New Revisions per month'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'pages.png'), format='png')

    # Plot 2: vocabulary size per year, plotted at each year's last day.
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    timestamps = [datetime(year, 12, 31) for year in sorted(corpora.keys())]
    counts = [len(corpora[t.year]) for t in timestamps]
    ax.plot_date(timestamps, counts, '-')
    ax.set_xlabel(u'Year')
    ax.legend([u'No. of distinct tokens'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'tokens.png'), format='png')

    # Vocabulary diffs between consecutive years.
    years = sorted(corpora.keys())
    # The upper bound is inclusive (+1); the original range stopped one year
    # early, so the diff ending in the final year was never written.
    years = range(years[0], years[-1] + 1)
    year_pairs = zip(years, years[1:])
    for year1, year2 in year_pairs:
        current = corpora.get(year1, set())
        next_ = corpora.get(year2, set())
        filename = '{:04d}-{:04d}.diff'.format(year1, year2)
        with open(os.path.join(outdir, filename), 'w') as f:
            for token in sorted(current - next_):
                f.write(u'-{}\n'.format(token).encode('utf-8'))
            for token in sorted(next_ - current):
                f.write(u'+{}\n'.format(token).encode('utf-8'))
def checkMarkup(self, markup, wanted):
    """Assert that rendering ``markup`` with WikiMarkup yields ``wanted``."""
    rendered = WikiMarkup(markup).render()
    self.assertEqual(rendered, wanted)
def testLinkPrefixRendering(self):
    """A configured link prefix is prepended to the href of wiki links."""
    parser = WikiMarkup('foobar [[Woo]]')
    parser.set_link_prefix('http://www.google.com/?q=')

    expected = '<p>foobar <a href="http://www.google.com/?q=Woo">Woo</a></p>'
    self.assertEqual(expected, parser.render())
def testLinkPostfixRendering(self):
    """A configured link postfix is appended to the href of wiki links."""
    parser = WikiMarkup('foobar [[Woo]]')
    parser.set_link_postfix('.mw')

    expected = '<p>foobar <a href="Woo.mw">Woo</a></p>'
    self.assertEqual(expected, parser.render())
def render_mediawiki(content):
    """Render MediaWiki ``content`` and return the result decoded from UTF-8.

    Links in the rendered output get ``.mw`` appended via
    ``set_link_postfix``.
    """
    parser = WikiMarkup(content)
    parser.set_link_postfix('.mw')
    # render() output is decoded here, so it is presumably a UTF-8 byte string.
    return parser.render().decode('utf-8')