def _collect_corpora(pages, last_year, stemmer):
    """Gather creation dates, revision dates and per-year stem sets.

    ``pages`` yields ``(page, revisions)`` pairs; each revision is a mapping
    with a ``'timestamp'`` (datetime) and a ``'text'`` (unicode) entry.
    NOTE(review): revisions are presumably ordered oldest first, since the
    first one is taken as the page creation date -- confirm against
    MediawikiDump.

    Returns a ``(page_dates, revision_dates, corpora)`` tuple where
    ``corpora`` maps a year to the set of stems seen in that year.
    """
    page_dates = []
    revision_dates = []
    corpora = defaultdict(set)

    for _page, revisions in pages:
        if not revisions:
            # Nothing to date or tokenize; indexing revisions[0] would fail.
            continue

        created = revisions[0]['timestamp']
        page_dates.append(created)

        # Group once instead of re-filtering the whole list for every year.
        by_year = defaultdict(list)
        for revision in revisions:
            by_year[revision['timestamp'].year].append(revision)

        stems = set()
        for year in xrange(created.year, last_year + 1):
            revisions_in_year = by_year.get(year)
            if revisions_in_year:
                revision_dates.extend(
                    r['timestamp'] for r in revisions_in_year)
                # Fresh set: this year's vocabulary replaces the carried one.
                stems = set()
                for revision in revisions_in_year:
                    html = WikiMarkup(revision['text'].encode('utf-8')).render()
                    text = clean_html(html.decode('utf-8'))
                    # TODO: remove remaining markup
                    words = WORD_RE.findall(text)
                    stems.update(stemmer.stem(word) for word in words)
            # Years without revisions reuse the stems of the most recent
            # year that had some, so the page keeps counting towards them.
            corpora[year].update(stems)

    return page_dates, revision_dates, corpora


def _plot_pages(page_dates, revision_dates, outdir):
    """Save ``pages.png``: cumulative page count and revision histogram.

    Both date lists must be sorted and ``revision_dates`` non-empty.
    """
    delta = relativedelta(revision_dates[-1], revision_dates[0])
    # One bin per month spanned; a span shorter than a month would give
    # zero bins, which the histogram cannot draw, so use at least one.
    months = max(delta.years * 12 + delta.months, 1)

    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(page_dates, range(1, len(page_dates) + 1), '-')
    ax.hist(date2num(revision_dates), months, histtype='step')
    ax.set_xlabel(u'Year')
    ax.legend([u'Total No. of Pages', u'New Revisions per month'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'pages.png'), format='png')


def _plot_tokens(corpora, outdir):
    """Save ``tokens.png``: number of distinct stems per year."""
    years = sorted(corpora)
    # Each year's count is plotted at that year's last day.
    timestamps = [datetime(year, 12, 31) for year in years]
    counts = [len(corpora[year]) for year in years]

    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(timestamps, counts, '-')
    ax.set_xlabel(u'Year')
    ax.legend([u'No. of distinct tokens'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'tokens.png'), format='png')


def _write_year_diffs(corpora, outdir):
    """Write one ``YYYY-YYYY.diff`` file per pair of consecutive years.

    Stems that disappear from one year to the next are written with a
    ``-`` prefix, newly appearing stems with a ``+`` prefix.
    """
    if not corpora:
        return

    years = sorted(corpora)
    # year1 runs up to the second-to-last year so that the final pair
    # (last - 1, last) is included as well.
    for year1 in xrange(years[0], years[-1]):
        year2 = year1 + 1
        current = corpora.get(year1, set())
        next_ = corpora.get(year2, set())
        filename = '{:04d}-{:04d}.diff'.format(year1, year2)
        with open(os.path.join(outdir, filename), 'w') as f:
            for token in sorted(current - next_):
                f.write(u'-{}\n'.format(token).encode('utf-8'))
            for token in sorted(next_ - current):
                f.write(u'+{}\n'.format(token).encode('utf-8'))


def main():
    """Analyse a MediaWiki dump read from stdin and write reports to ./out.

    Produces ``pages.png`` (cumulative pages and revisions per month),
    ``tokens.png`` (distinct stems per year) and one ``.diff`` file per pair
    of consecutive years listing the stems that vanished or appeared.

    Exits with an error message if the dump contains no revisions.
    """
    now = datetime.now()
    stemmer = GermanStemmer()
    page_dates, revision_dates, corpora = _collect_corpora(
        MediawikiDump(sys.stdin).iterpages(), now.year, stemmer)

    if not revision_dates:
        # Without any revision there is no date range to plot.
        sys.exit('No revisions found in the dump; nothing to do.')

    page_dates.sort()
    revision_dates.sort()

    outdir = os.path.abspath('./out')
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    _plot_pages(page_dates, revision_dates, outdir)
    _plot_tokens(corpora, outdir)
    _write_year_diffs(corpora, outdir)