Example #1
import os
import sys
import time
import urllib
import datetime
import traceback

import wikipydia
import wpTextExtractor

# determine_splitter, write_lines_to_file and write_articles are helper
# functions defined elsewhere in the same script
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
	if os.path.exists(output_dir):
		if not os.path.isdir(output_dir):
			sys.stderr.write(output_dir + " is not a directory\n")
			sys.exit(1)
	else:
		os.makedirs(output_dir)

	mark = {}      # canonical titles already processed
	success = 0    # number of articles successfully fetched
	articles = {}  # metadata for the articles.list written at the end
	for article, values in topics.items():
		if success >= upperlimit:
			break
		title = article

		# resolve redirects
		if not wikipydia.query_exists(title, lang):
			continue
		title = wikipydia.query_redirects(title, lang).replace(' ','_')

		if title in mark:
			continue
		mark[title] = True

		# the file prefix for output files
		file_prefix = urllib.quote(title.replace(' ','_').encode('utf8'), safe="%") # force / to be quoted and % not to be quoted
		if file_prefix.startswith('.'):
			file_prefix = "%2E" + file_prefix[1:]

		if dryrun:
			print file_prefix
			success += 1
			continue

		wikimarkup = None  # stays None if every retry fails
		revid = 0
		done = False
		no_retry = 0
		while not done and no_retry < retry:
			try:
				revid = values['thenid']
				if revid == 0:
					revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
				wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
				done = True
			except Exception:
				no_retry += 1
				time.sleep(wait)

		if not wikimarkup:
			print 'Retrieving', title, 'failed'
			print 'RevID:', revid
			print 'Date:', date.isoformat()
			continue
		try:
			sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
		except Exception:
			sys.stdout.flush()
			sys.stdout.write('Failed extracting sentences from ' + title + '\n')
			traceback.print_exc()
			sys.stdout.flush()
			continue

		# substitute angle brackets with html-like character encodings
		#sentences = [re.sub('<', '&lt;', re.sub('>', '&gt;', s)) for s in sentences]
		#sentences.insert(0, urllib.unquote(file_prefix.replace('_',' ')) + '.')
		output_filename = os.path.join(output_dir, file_prefix + '.sentences')
		write_lines_to_file(output_filename, sentences)
		output_filename = os.path.join(output_dir, file_prefix + '.tags')
		write_lines_to_file(output_filename, tags)
		success += 1

		priorid = values['priorid']
		if priorid == 0:
			priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
		articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
		sys.stderr.write('.')
	sys.stderr.write('\n')

	if not dryrun:
		if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
			write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))
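
A minimal invocation sketch, assuming a hypothetical topics dict shaped the way the function reads it ('score', 'thenid' and 'priorid' keys, where 0 means "look the revision id up by date"):

import datetime

# hypothetical topic; 'thenid'/'priorid' of 0 trigger the date fallback
topics = {'Haiti': {'score': 42, 'thenid': 0, 'priorid': 0}}
date = datetime.date(2010, 1, 15)

# dry run: only prints the output file prefixes, fetches nothing
fetch_articles_on_date(topics, date, 'en', 'out', upperlimit=10, dryrun=True)

# real run: writes out/Haiti.sentences and out/Haiti.tags,
# plus out/2010-01-15.articles.list
fetch_articles_on_date(topics, date, 'en', 'out', upperlimit=10, dryrun=False)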
Example #2
import os
import re
import urllib
import datetime

import wikipydia

def convert_topics(filename, lang):
	date = None
	topics_re = re.compile(r'^([0-9]{4})-([0-9]{2})-([0-9]{2})\.topics$')
	m = topics_re.match(os.path.basename(filename))
	if m:
		date = datetime.date(int(m.group(1)), int(m.group(2)), int(m.group(3)))

	lineno = 0
	f = None  # so the finally clause cannot fail if open() raises
	try:
		f = open(filename, 'r')
		# a title followed by a pageview count, separated by a space or a tab
		topic_line_re1 = re.compile(r'^(.+) ([0-9]+)$')
		topic_line_re2 = re.compile(r'^([^\t]+)\t([0-9]+)$')
		print "<table>"
		print "<tr><th>Rank</th><th>Titles</th><th>Actions</th></tr>"
		for line in f:
			lineno += 1
			line = line.rstrip('\n')
			m = topic_line_re1.match(line)
			if m:
				title = m.group(1)
				pageviews = int(m.group(2))
			else:
				m = topic_line_re2.match(line)
				if m:
					title = m.group(1)
					pageviews = int(m.group(2))
				else:
					title = line
					pageviews = None
			title = title.decode('utf8')
			if not wikipydia.query_exists(title, lang):
				continue
			title = wikipydia.query_redirects(title, lang)
			title = title.encode('utf8')
			escaped_title = urllib.quote(title.replace(' ','_'), safe="%") # force / to be quoted and % not to be quoted
			if pageviews is not None:  # 0 is a valid count; skip only when none was parsed
				print '<tr><td>%d</td><td><a href="http://%s.wikipedia.org/wiki/%s" target="view">%s<span class="score">%d</span></a></td>' % (lineno, lang, escaped_title, title, pageviews)
			else:
				print '<tr><td>%d</td><td><a href="http://%s.wikipedia.org/wiki/%s" target="view">%s</a></td>' % (lineno, lang, escaped_title, title)

			print '<td><span class="more">more</span><ul class="subnav">'
			print '\t<li><a href="http://%s.wikipedia.org/wiki/%s" target="view">View Now</a></li>' % (lang, escaped_title)
			if date:
				thenid = str(wikipydia.query_revid_by_date_fallback(title, lang, date))
				priorid = str(wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15)))
				print '\t<li><a href="http://' + lang + '.wikipedia.org/w/index.php?oldid=' + thenid + '" target="viewthen">View Then</a></li>'
				if priorid == "0":
					print '\t<li>View Prior</li>'
					print '\t<li>View Diff</li>'
				else:
					print '\t<li><a href="http://' + lang + '.wikipedia.org/w/index.php?oldid=' + priorid + '" target="viewprior">View Prior</a></li>'
					print '\t<li><a href="http://' + lang + '.wikipedia.org/w/index.php?diff=' + thenid + '&oldid=' + priorid + '" target="viewdiff">View Diff</a></li>'
			if lang != 'en':
				print '\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fwiki%2F' + escaped_title + '" target="translate">Translate Now</a></li>'
				if date:
					print '\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fw%2Findex.php?oldid=' + thenid + '" target="translatethen">Translate Then</a></li>'
					if priorid == "0":
						print '\t<li>Translate Prior</li>'
						print '\t<li>Translate Diff</li>'
					else:
						print '\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fw%2Findex.php?oldid=' + priorid + '" target="translateprior">Translate Prior</a></li>'
						print '\t<li><a href="http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2Fw%2Findex.php?diff=' + thenid + '&oldid=' + priorid + '" target="translatediff">Translate Diff</a></li>'
			print "</ul></td></tr>";
		print "</table>";
	finally:
		if f:
			f.close()
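
A usage sketch, assuming a hypothetical input file that follows the YYYY-MM-DD.topics naming pattern the date regex expects; the date embedded in the filename is what enables the "View Then"/"View Prior" revision links:

# hypothetical contents of 2010-01-15.topics (title, then pageview count):
#   Haiti	123456
#   Google	98765
convert_topics('2010-01-15.topics', 'en')  # emits the HTML <table> on stdout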
Example #3
	# fragment of a larger script: topics and topics_html are assumed to have
	# been populated earlier (presumably from sys.argv[1] and sys.argv[2])
	read_html(clusters_html, sys.argv[3])
	read_articles_list(articles, sys.argv[4])

	to_resolve = []
	to_resolve.extend([t for t in topics.keys() if t not in articles]) # from topics
	to_resolve.extend([t for t in topics_html.keys() if t not in articles]) # from topics_html
	to_resolve.extend([c for c in clusters_html.keys() if c not in articles]) # from clusters_html
	to_resolve = list(set(to_resolve)) # remove duplicates

	# filter pages that do not exist
	existing = set(t for t in to_resolve if wikipydia.query_exists(t))
	failed = [t for t in to_resolve if t not in existing] # non-existing pages
	to_resolve = [t for t in to_resolve if t in existing] # only existing pages
	
	# filter out pages whose redirect target already exists in articles:
	# those are resolved, so only the remainder still needs attention
	redirects = dict([(t, wikipydia.query_redirects(t).replace(' ','_')) for t in to_resolve])
	to_resolve = [t for t in to_resolve if redirects[t] not in articles]

	# at least one of the pages in the article list now redirects to a different page
	if to_resolve:
		a2r = [(a, wikipydia.query_redirects(a).replace(' ','_')) for a in articles]
		r2a = dict([(r, a) for a, r in a2r if a != r]) # pages that have a changed redirect page
		to_update = dict([(t, r2a[t]) for t in to_resolve if t in r2a])
		redirects.update(to_update)

		to_resolve = [t for t in to_resolve if t not in to_update]
		if to_resolve:
			print "Still pages that failed:", to_resolve
			print "(Should not exist at this stage)"
		failed.extend(to_resolve)
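
The set arithmetic above is easier to follow with toy data; a minimal sketch with hypothetical titles standing in for the live wikipydia lookups:

articles = {'Barack_Obama': 1}
topics = {'Obama': 1, 'No_such_page': 1}

to_resolve = [t for t in topics if t not in articles]  # both titles

# suppose the wiki reports that 'Obama' exists and redirects to 'Barack_Obama'
existing = set(['Obama'])
failed = [t for t in to_resolve if t not in existing]  # ['No_such_page']
to_resolve = [t for t in to_resolve if t in existing]  # ['Obama']

redirects = {'Obama': 'Barack_Obama'}
to_resolve = [t for t in to_resolve if redirects[t] not in articles]  # []: resolved

print redirects  # {'Obama': 'Barack_Obama'}: topic title -> article title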