def read_topics(topic_file, topics): #print "read_topics: " + topic_file f = codecs.open(topic_file, "rt", "utf-8") for line in f: fields = line.split() title = fields[0] score = int(fields[1]) if len(fields) >= 3: thenid = fields[2] else: thenid = str(wikipydia.query_revid_by_date_fallback(page, lang, date)) if len(fields) >= 4: priorid = fields[3] else: priorid = str(wikipydia.query_revid_by_date_fallback(page, lang, date - datetime.timedelta(days=15))) topics[title] = {'score': score, 'thenid': thenid, 'priorid': priorid}
def convert_topics(filename, lang):
    """Render a topics file as an HTML table of Wikipedia article links.

    Emits (to stdout) one table row per topic line with "View" links, and —
    when the filename encodes a date (YYYY-MM-DD.topics) — "View Then/Prior/
    Diff" links against the revisions at that date and 15 days prior.  For
    non-English wikis, matching Google Translate links are added.

    Args:
        filename: topics file; basename may be 'YYYY-MM-DD.topics'.
        lang: Wikipedia language code (e.g. 'en', 'de').

    Lines may be "title pageviews", "title\tpageviews", or a bare title.
    Titles that do not exist on the wiki are skipped; redirects are resolved.
    """
    date = None
    topics_re = re.compile(r'^([0-9]{4})-([0-9]{2})-([0-9]{2})\.topics$')
    m = topics_re.match(os.path.basename(filename))
    if m:
        date = datetime.date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
    lineno = 0
    # BUG FIX: pre-initialize so the finally clause below cannot raise a
    # NameError when open() itself fails.
    f = None
    try:
        f = open(filename, 'r')
        topic_line_re1 = re.compile("^(.+) ([0-9]+)$")
        topic_line_re2 = re.compile("^([^\t]+)\t([0-9]+)$")
        print("<table>")
        print("<tr><th>Rank</th><th>Titles</th><th>Actions</th></tr>")
        for line in f:
            lineno += 1
            line = line.rstrip('\n')
            # Try "title<space>views", then "title<tab>views", else bare title.
            m = topic_line_re1.match(line)
            if m:
                title = m.group(1)
                pageviews = int(m.group(2))
            else:
                m = topic_line_re2.match(line)
                if m:
                    title = m.group(1)
                    pageviews = int(m.group(2))
                else:
                    title = line
                    pageviews = None
            title = title.decode('utf8')
            if not wikipydia.query_exists(title, lang):
                continue
            title = wikipydia.query_redirects(title, lang)
            title = title.encode('utf8')
            # force / to be quoted and % not to be quoted
            escaped_title = urllib.quote(title.replace(' ', '_'), safe="%")
            if pageviews:
                print('<tr><td>%d</td><td><a href="http://%s.wikipedia.org/wiki/%s" target="view">%s<span class="score">%d</span></a></td>' % (lineno, lang, escaped_title, title, pageviews))
            else:
                print('<tr><td>%d</td><td><a href="http://%s.wikipedia.org/wiki/%s" target="view">%s</a></td>' % (lineno, lang, escaped_title, title))
            print('<td><span class="more">more</span><ul class="subnav">')
            print('\t<li><a href="http://%s.wikipedia.org/wiki/%s" target="view">View Now</a></li>' % (lang, escaped_title))
            if date:
                thenid = str(wikipydia.query_revid_by_date_fallback(title, lang, date))
                priorid = str(wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15)))
                print('\t<li><a href="http://' + lang + '.wikipedia.org/w/index.php?oldid=' + thenid + '" target="viewthen">View Then</a></li>')
                # revid "0" means no revision existed 15 days before: no links.
                if priorid == "0":
                    print('\t<li>View Prior</li>')
                    print('\t<li>View Diff</li>')
                else:
                    print('\t<li><a href="http://' + lang + '.wikipedia.org/w/index.php?oldid=' + priorid + '" target="viewprior">View Prior</a></li>')
                    print('\t<li><a href="http://' + lang + '.wikipedia.org/w/index.php?diff=' + thenid + '&oldid=' + priorid + '" target="viewdiff">View Diff</a></li>')
            if lang != 'en':
                print('\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fwiki%2F' + escaped_title + '" target="translate">Translate Now</a></li>')
                if date:
                    print('\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fw%2Findex.php?oldid=' + thenid + '" target="translatethen">Translate Then</a></li>')
                    if priorid == "0":
                        print('\t<li>Translate Prior</li>')
                        print('\t<li>Translate Diff</li>')
                    else:
                        print('\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fw%2Findex.php?oldid=' + priorid + '" target="translateprior">Translate Prior</a></li>')
                        print('\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fw%2Findex.php?diff=' + thenid + '&oldid=' + priorid + '" target="translatediff">Translate Diff</a></li>')
            print("</ul></td></tr>")
        print("</table>")
    finally:
        if f:
            f.close()
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
    """Fetch, sentence-split, and save the article text for each topic.

    For every title in *topics* (after redirect resolution and de-dup),
    downloads the revision recorded in the topic's 'thenid' (falling back to
    the revision current at *date*), splits it into sentences with
    wpTextExtractor, and writes '<prefix>.sentences' and '<prefix>.tags'
    files under *output_dir*.  Finally writes a 'YYYY-MM-DD.articles.list'
    manifest of the successfully fetched articles.

    Args:
        topics: dict title -> {'score', 'thenid', 'priorid'}.
        date: datetime.date used for revision fallback and the manifest name.
        lang: Wikipedia language code.
        output_dir: directory for output files; created if absent.
        upperlimit: stop after this many successful articles.
        dryrun: if true, only print each article's file prefix.
        retry: download attempts per article before giving up.
        wait: seconds to sleep between retries.
    """
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            sys.stderr.write(output_dir + " is not a directory\n")
            sys.exit(1)
    else:
        os.makedirs(output_dir)
    success = 0
    articles = {}
    mark = {}  # titles already processed (original assigned this twice)
    for article, values in topics.items():
        if success >= upperlimit:
            break
        title = article
        # resolve redirects
        if not wikipydia.query_exists(title, lang):
            continue
        title = wikipydia.query_redirects(title, lang).replace(' ', '_')
        if title in mark:
            continue
        mark[title] = True
        # the file prefix for output files
        # force / to be quoted and % not to be quoted
        file_prefix = urllib.quote(title.replace(' ', '_').encode('utf8'), safe="%")
        # a leading dot would make the output file hidden; escape it
        if file_prefix.startswith('.'):
            file_prefix = "%2E" + file_prefix[1:]
        if dryrun:
            print(file_prefix)
            success += 1
            continue
        done = False
        no_retry = 0
        # BUG FIX: pre-initialize so the failure report below cannot raise a
        # NameError when every retry fails before these are assigned.
        revid = values['thenid']
        wikimarkup = None
        while not done and no_retry < retry:
            try:
                revid = values['thenid']
                if revid == 0:
                    revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
                wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
                done = True
            except Exception:  # narrowed from bare except: keep Ctrl-C working
                no_retry += 1
                time.sleep(wait)
        if not wikimarkup:
            print('Retrieving %s failed' % title)
            print('RevID: %s' % revid)
            print('Date: %s' % date.isoformat())
            continue
        try:
            sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
        except Exception:  # best-effort: log the traceback and move on
            sys.stdout.flush()
            sys.stdout.write('Failed retrieving the text from ' + title + '\n')
            traceback.print_exc()
            sys.stdout.flush()
            continue
        output_filename = os.path.join(output_dir, file_prefix + '.sentences')
        write_lines_to_file(output_filename, sentences)
        output_filename = os.path.join(output_dir, file_prefix + '.tags')
        write_lines_to_file(output_filename, tags)
        success += 1
        priorid = values['priorid']
        if priorid == 0:
            # prior revision: the page state 15 days before *date*
            priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
        articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
        sys.stderr.write('.')  # progress tick
    sys.stderr.write('\n')
    if not dryrun:
        # skip the manifest only for a single article written to the cwd
        if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
            write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))