def initializer(backdir):
    self.backdir = backdir

    # Monkey-patch wikitools to always use our existing session
    opener = WikitoolsRequestsAdapter()
    APIRequest = wikitools.api.APIRequest
    class RequestsAPIRequest(wikitools.api.APIRequest):
        def __init__(self, *args, **kwds):
            APIRequest.__init__(self, *args, **kwds)
            self.opener = opener
    wikitools.APIRequest = RequestsAPIRequest
    wikitools.api.APIRequest = RequestsAPIRequest

    self.wiki = wikitools.wiki.Wiki(WIKIPEDIA_API_URL)
    self.wiki.setUserAgent(
        'citationhunt (https://tools.wmflabs.org/citationhunt)')
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.chdb = chdb.init_scratch_db()
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
def initializer(backdir):
    self.backdir = backdir

    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
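# The `finalizer` registered via multiprocessing.util.Finalize above is not
# part of this section. A minimal sketch, assuming `self` is a module-level
# namespace holding per-worker state (as the assignments above suggest) and
# that the finalizer's job is to flush each worker's cProfile data into
# `backdir` before the process exits:
import multiprocessing
import os

def finalizer():
    if cfg.profile:
        self.profiler.disable()
        self.profiler.dump_stats(
            os.path.join(self.backdir, 'profile-%d' % os.getpid()))

# Hypothetical wiring: each worker runs initializer(backdir) once at startup.
# pool = multiprocessing.Pool(initializer=initializer, initargs=(backdir,))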
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours = 3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    clicked_snippets.remove(clicked_snippet)
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    live_db.close()
    stats_db.close()
    return 0
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours = 3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % page_title_to_snippets.keys())

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        start_ts = min(snippet_to_ts.values())
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = dict(snippet_to_ts)
            # FIXME Duplicated logic with parse_live.py :(
            for sec, snips in snippets:
                for sni in snips:
                    id = mkid(d(page_title) + sni)
                    gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_ts in gone_in_this_revision.items():
                if clicked_ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    del snippet_to_ts[snippet_id]
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)',
                        clicked_ts, snippet_id, cfg.lang_code, rev['rev_id'])

    live_db.close()
    stats_db.close()
    return 0
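# Both revision-based versions of compute_fixed_snippets above call
# get_page_revisions(wiki, page_title, start_ts), which is not shown in this
# section. A minimal sketch of what it might look like, assuming the callers
# expect dicts with 'rev_id', 'timestamp' and 'contents' keys, oldest revision
# first. It queries the MediaWiki API directly with `requests` instead of
# going through the project's mwapi wrapper, whose revision API is not shown.
import datetime
import requests

def get_page_revisions(wiki, page_title, start_ts):
    # `wiki` is accepted for signature parity with the callers above but is
    # unused in this sketch.
    response = requests.get(
        'https://' + cfg.wikipedia_domain + '/w/api.php',
        params={
            'action': 'query',
            'prop': 'revisions',
            'titles': page_title,
            'rvprop': 'ids|timestamp|content',
            'rvslots': 'main',
            'rvdir': 'newer',  # oldest -> newest, starting at rvstart
            'rvstart': start_ts.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'rvlimit': 'max',
            'format': 'json',
            'formatversion': '2',
        },
        headers={'User-Agent': cfg.user_agent}).json()
    revisions = []
    for page in response['query']['pages']:
        for rev in page.get('revisions', []):
            revisions.append({
                'rev_id': rev['revid'],
                'timestamp': datetime.datetime.strptime(
                    rev['timestamp'], '%Y-%m-%dT%H:%M:%SZ'),
                'contents': rev['slots']['main']['content'],
            })
    return revisions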
def compute_fixed_snippets(cfg):
    log.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)
    if not page_title_to_snippets:
        log.info('No pages to process!')
        return
    log.info('Will reparse pages: %r' % page_title_to_snippets.keys())

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI('https://' + cfg.wikipedia_domain + '/w/api.php',
                              cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)
        # FIXME Duplicated logic with parse_live.py :(
        for sec, snips in snippets:
            for sni in snips:
                id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(id, None)

        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)',
                    clicked_ts, snippet_id, cfg.lang_code)

    live_db.close()
    stats_db.close()
    return 0
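# The snippet ids matched above come from mkid() and d(), which are defined
# elsewhere in the repository. A minimal sketch, assuming d() normalizes bytes
# to str and mkid() is a short, stable hash of the combined title + snippet
# text; the exact hash function and digest length are assumptions.
import hashlib

def d(s):
    # Decode UTF-8 bytes to str; pass str through unchanged.
    return s.decode('utf-8') if isinstance(s, bytes) else s

def mkid(s):
    # Stable, short identifier for a (page title + snippet) string.
    return hashlib.sha1(s.encode('utf-8')).hexdigest()[:8]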
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print('Failed to render HTML! Do you have lynx?', file=sys.stderr)
        return html
    return stdout.decode('utf-8').strip('\n')

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    try:
        int(arguments['<title_or_pageid>'])
        wikitext = wikipedia.get_page_contents(
            pageid = int(arguments['<title_or_pageid>']))
    except:
        wikitext = wikipedia.get_page_contents(
            title = arguments['<title_or_pageid>'])

    for snippet in parser.extract(wikitext):
        print('Section: %s' % snippet.section)
        if arguments['--output'] != 'raw':
            output = format_html(snippet.snippet)
        else:
            output = ' ' + '\n '.join(
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print >> sys.stderr, 'Failed to render HTML! Do you have lynx?'
        return html
    return stdout.decode('utf-8').strip('\n')

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    try:
        int(arguments['<title_or_pageid>'])
        wikitext = wikipedia.get_page_contents(
            pageid = int(arguments['<title_or_pageid>']))
    except:
        wikitext = wikipedia.get_page_contents(
            title = arguments['<title_or_pageid>'])

    for section, snippets in parser.extract(wikitext):
        if not snippets:
            continue
        _print('Section: %s' % section)
        for snippet in snippets:
            if arguments['--output'] != 'raw':
                output = format_html(snippet)
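# Both copies of format_html above begin after the lynx subprocess has already
# been created; the function header and Popen call are cut off. A minimal
# sketch of the missing setup, assuming lynx reads HTML from stdin and writes
# a plain-text dump to stdout (the exact flags are assumptions):
import subprocess

def format_html(html):
    lynx = subprocess.Popen(
        ['lynx', '-dump', '-stdin', '-assume_charset=utf-8',
         '-display_charset=utf-8'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        # Fall back to the raw HTML if lynx is missing or fails.
        return html
    return stdout.decode('utf-8').strip('\n')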