Exemple #1
0
def main():
    """Analyse a MediaWiki dump read from stdin; write plots and token diffs.

    Every page yielded by ``MediawikiDump(sys.stdin).iterpages()`` has its
    revisions grouped by calendar year.  The revisions of a year are rendered
    with ``WikiMarkup``, stripped with ``clean_html``, tokenised with
    ``WORD_RE`` and stemmed; the resulting stems are merged into one set per
    year (``corpora``).

    Files written to ``./out`` (the directory is created if missing):

    * ``pages.png`` -- running page count (by the timestamp of each page's
      first listed revision) plus a histogram of revision timestamps with
      one bin per month of the covered time span.
    * ``tokens.png`` -- number of distinct stems per year.
    * ``YYYY-YYYY.diff`` -- for each pair of consecutive years, the stems
      that disappeared (prefixed ``-``) and those that appeared (``+``).

    If the dump contains no usable revisions, a note is written to stderr
    and no files are produced.
    """
    now = datetime.now()
    page_dates = []
    revision_dates = []
    corpora = defaultdict(set)
    stemmer = GermanStemmer()

    for page, revisions in MediawikiDump(sys.stdin).iterpages():
        if not revisions:
            # A page without revisions has neither a date nor any text.
            continue

        first = revisions[0]
        page_dates.append(first['timestamp'])

        stems = set()
        for year in xrange(first['timestamp'].year, now.year + 1):
            revisions_in_year = [r for r in revisions
                                 if r['timestamp'].year == year]
            if revisions_in_year:
                revision_dates.extend(r['timestamp'] for r in revisions_in_year)
                # Start a fresh vocabulary for this year.
                stems = set()
                for revision in revisions_in_year:
                    html = WikiMarkup(revision['text'].encode('utf-8')).render()
                    text = clean_html(html.decode('utf-8'))
                    # TODO: remove remaining markup
                    words = WORD_RE.findall(text)
                    stems.update(stemmer.stem(word) for word in words)
            # Deliberately outside the ``if``: a year without revisions
            # re-uses the stems of the most recent year that had some, so the
            # page keeps contributing to later years.
            corpora[year].update(stems)

    if not revision_dates:
        # Nothing to plot or diff; indexing the empty lists below would raise.
        sys.stderr.write('No revisions found in dump; nothing to do.\n')
        return

    page_dates.sort()
    revision_dates.sort()

    delta = relativedelta(revision_dates[-1], revision_dates[0])
    # One histogram bin per month; at least one bin is needed when all
    # revisions fall within the same month.
    months = max(delta.years * 12 + delta.months, 1)

    outdir = os.path.abspath('./out')
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # Plot 1: page count over time and revisions per month.
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot_date(page_dates, range(1, len(page_dates) + 1), '-')
    ax.hist(date2num(revision_dates), months, histtype='step')
    ax.set_xlabel(u'Year')
    ax.legend([u'Total No. of Pages', u'New Revisions per month'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'pages.png'), format='png')

    # Plot 2: vocabulary size per year, plotted at each year's last day.
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    timestamps = [datetime(year, 12, 31) for year in sorted(corpora)]
    counts = [len(corpora[t.year]) for t in timestamps]
    ax.plot_date(timestamps, counts, '-')
    ax.set_xlabel(u'Year')
    ax.legend([u'No. of distinct tokens'])
    fig.autofmt_xdate()
    fig.savefig(os.path.join(outdir, 'tokens.png'), format='png')

    # Year-over-year vocabulary diffs.  The range is inclusive of the last
    # year so that the final pair of consecutive years is written as well.
    known_years = sorted(corpora)
    years = range(known_years[0], known_years[-1] + 1)
    for year1, year2 in zip(years, years[1:]):
        # ``get`` avoids creating empty entries for years missing in corpora.
        current = corpora.get(year1, set())
        next_ = corpora.get(year2, set())
        filename = '{:04d}-{:04d}.diff'.format(year1, year2)
        with open(os.path.join(outdir, filename), 'w') as f:
            for token in sorted(current - next_):
                f.write(u'-{}\n'.format(token).encode('utf-8'))
            for token in sorted(next_ - current):
                f.write(u'+{}\n'.format(token).encode('utf-8'))