import datetime
import os
import re
import sys
import time
import traceback
import urllib

import wikipydia
import wpTextExtractor

# determine_splitter, write_lines_to_file and write_articles are assumed to be
# defined elsewhere in this script.

# Debug snippet: sanity-check the sentence splitter on a comment-only page,
# then exit before the code below is reached.
lang = 'en'
sentences, tags = wpTextExtractor.wiki2sentences("<!-- See -->\n<!-- PLEASE DO NOT CHANGE OBAMA'S NAME -->", determine_splitter(lang), True)
for s in sentences:
    print s
sys.exit(0)

#topics = ['Inauguration_of_Barack_Obama', 'Bill_Clinton', 'Black_Saturday_bushfires', 'Estradiol', 'Emma_Frost', 'Influenza', 'James', 'Brett_Favre']
topics = ['Barack_Obama']
shown = {}
shown2 = {}
shown3 = {}
for article in topics:
    # NOTE: `date` is expected to be defined earlier in the full script.
    revid = wikipydia.query_revid_by_date(article, lang, date)
    print revid
    wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    wikimarkup = '\n'.join(sentences)
    print wikimarkup.encode('utf-8')
    # Strip {{...}} templates one at a time, optionally printing 100 characters
    # of context around each match.
    while True:
        m = re.search(r'{{[^{}]*}}', wikimarkup)
        if not m:
            break
        ss = m.start() - 100
        if ss < 0:
            ss = 0
        ee = m.end() + 100
        if ee > len(wikimarkup):
            ee = len(wikimarkup)
        #print wikimarkup[ss:m.start()], m.group(), wikimarkup[m.end():ee]
        # Drop the matched template; without this the loop never terminates.
        wikimarkup = wikimarkup[:m.start()] + wikimarkup[m.end():]
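
# A self-contained sketch of the template-stripping idea used in the loop
# above: repeatedly delete innermost {{...}} templates until a fixpoint, so
# nested templates collapse over successive passes. The helper name
# strip_templates is illustrative, not part of this script's API.
def strip_templates(text):
    """Remove all (possibly nested) {{...}} templates from wiki markup."""
    while True:
        stripped = re.sub(r'{{[^{}]*}}', '', text)
        if stripped == text:
            return stripped
        text = stripped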
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
    """
    Fetch the revision of each article closest to `date`, split it into
    sentences, and write per-article .sentences and .tags files to output_dir.
    """
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            sys.stderr.write(output_dir + " is not a directory\n")
            sys.exit(1)
    else:
        os.makedirs(output_dir)
    success = 0
    articles = {}
    mark = {}
    for article, values in topics.items():
        if success >= upperlimit:
            break
        title = article
        # resolve redirects
        if not wikipydia.query_exists(title, lang):
            continue
        title = wikipydia.query_redirects(title, lang).replace(' ', '_')
        if title in mark:
            continue
        mark[title] = True
        # the file prefix for output files
        file_prefix = urllib.quote(title.replace(' ', '_').encode('utf8'), safe="%")  # force / to be quoted and % not to be quoted
        if file_prefix.startswith('.'):
            file_prefix = "%2E" + file_prefix[1:]
        if dryrun:
            print file_prefix
            success += 1
            continue
        # fetch the wiki markup for the revision, retrying on transient errors;
        # initialize wikimarkup and revid so the failure branch below cannot
        # hit a NameError when every retry fails
        wikimarkup = None
        revid = 0
        done = False
        no_retry = 0
        while not done and no_retry < retry:
            try:
                revid = values['thenid']
                if revid == 0:
                    revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
                wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
                done = True
            except Exception:
                no_retry += 1
                time.sleep(wait)
        if not wikimarkup:
            print 'Retrieving', title, 'failed'
            print 'RevID:', revid
            print 'Date:', date.isoformat()
            continue
        try:
            sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
        except Exception:
            sys.stdout.flush()
            sys.stdout.write('Failed extracting the text from ' + title + '\n')
            traceback.print_exc()
            sys.stdout.flush()
            continue
        # substitute angle brackets with html-like character encodings
        #sentences = [re.sub('<', '&lt;', re.sub('>', '&gt;', s)) for s in sentences]
        #sentences.insert(0, urllib.unquote(file_prefix.replace('_', ' ')) + '.')
        output_filename = os.path.join(output_dir, file_prefix + '.sentences')
        write_lines_to_file(output_filename, sentences)
        output_filename = os.path.join(output_dir, file_prefix + '.tags')
        write_lines_to_file(output_filename, tags)
        success += 1
        # fall back to the revision from two weeks earlier for the prior id
        priorid = values['priorid']
        if priorid == 0:
            priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
        articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
        sys.stderr.write('.')
    sys.stderr.write('\n')
    if not dryrun:
        if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
            write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))
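
# A minimal usage sketch. The shape of the `topics` dict (per-article 'score',
# 'thenid' and 'priorid' fields, with 0 meaning "look the revision up by date")
# is inferred from the reads inside fetch_articles_on_date; the article, score,
# date and output directory below are illustrative only. dryrun=True just
# prints the file prefixes instead of fetching and writing article text.
if __name__ == '__main__':
    example_topics = {'Barack_Obama': {'score': 1.0, 'thenid': 0, 'priorid': 0}}
    fetch_articles_on_date(example_topics, datetime.date(2009, 1, 20), 'en',
                           'en.2009-01-20', upperlimit=10, dryrun=True)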