def load_hidden_categories(wpcursor):
    '''Look up the pages linked to the configured hidden category.

    Queries `categorylinks` through `wpcursor` for every `cl_from` whose
    `cl_to` equals `cfg.hidden_category`, and returns whatever
    category_ids_to_names() produces for the collected page ids.
    '''
    hidden_category = config.get_localized_config().hidden_category
    query = ''' SELECT cl_from FROM categorylinks WHERE cl_to = %s'''
    wpcursor.execute(query, (hidden_category,))

    page_ids = []
    for row in wpcursor:
        page_ids.append(row[0])
    return category_ids_to_names(wpcursor, page_ids)
def wrapper(lang_code = '', *args, **kwds):
    '''Validate `lang_code`, populate `flask.g`, then call `handler`.

    Redirects (via redirect_to_lang_code) when no language code was given,
    when the code is unknown, or when no strings could be loaded for it.
    Otherwise stores the language code, config, and strings on `flask.g`
    and returns `handler(lang_code, *args, **kwds)`.
    '''
    raw_header = flask.request.headers.get('Accept-Language', '')
    accept_language = parse_accept_language_header(raw_header)

    lang_code = lang_code.lower()
    if not lang_code:
        default_code = find_default_lang_code_for_request(accept_language)
        return redirect_to_lang_code(default_code)

    flask.g._lang_code = lang_code
    if lang_code not in config.LANG_CODES_TO_LANG_NAMES:
        return redirect_to_lang_code('en')

    cfg = config.get_localized_config(lang_code, api = False)
    flask.g._cfg = cfg

    # In debug mode a `locale` query argument overrides the strings
    # normally negotiated for the request.
    debug_locale_requested = (
        flask.current_app.debug and 'locale' in flask.request.args)
    if debug_locale_requested:
        flask.g._strings = chstrings.get_localized_strings(
            cfg, flask.request.args['locale'])
    else:
        flask.g._lang_tag, flask.g._strings = load_strings_for_request(
            lang_code, cfg, accept_language)

    if not flask.g._strings:
        # Not expected in practice: the language has a config entry but
        # no locales in the translation files.
        return redirect_to_lang_code('en')
    return handler(lang_code, *args, **kwds)
def update_intersections():
    '''Rebuild the intersection tables of the scratch database.

    Copies the unexpired rows of `intersections` and the still-valid rows
    of `articles_intersections` from the 'citationhunt' database, rebuilds
    the snippet links for the copied intersections, and finally deletes
    intersections that ended up with no articles.
    '''
    db = chdb.init_scratch_db()
    # NOTE(review): the returned config is never used here; the call is
    # kept in case loading it has side effects -- confirm and drop.
    cfg = config.get_localized_config(api=False)

    db.execute_with_retry_s('DELETE FROM intersections')
    source_intersections = chdb.get_table_name(
        db, 'citationhunt', 'intersections')
    copy_intersections = ''' INSERT INTO intersections SELECT * FROM %s WHERE expiration > NOW()'''
    db.execute_with_retry_s(copy_intersections % source_intersections)

    db.execute_with_retry_s('DELETE FROM articles_intersections')
    source_articles = chdb.get_table_name(
        db, 'citationhunt', 'articles_intersections')
    copy_articles = ''' INSERT INTO articles_intersections SELECT * FROM %s WHERE article_id IN (SELECT page_id FROM articles) AND inter_id IN (SELECT id FROM intersections)'''
    db.execute_with_retry_s(copy_articles % source_articles)

    def update_snippets_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        intersection_ids = []
        for row in cursor:
            intersection_ids.append(row[0])
        if not intersection_ids:
            return
        database.populate_snippets_links(
            cursor, intersection_ids=intersection_ids)
    db.execute_with_retry(update_snippets_links)

    # Drop intersections left without articles. Open question from the
    # original author: should this surface an error to the user instead?
    db.execute_with_retry_s('''DELETE FROM intersections WHERE id NOT IN ( SELECT inter_id FROM articles_intersections)''')
def page_not_found(e):
    '''Render the 404 page and return it with a 404 status code.

    Uses the config stored on `flask.g` when there is one, otherwise the
    'en' config. The error argument `e` is not used.
    '''
    cfg = (flask.g._cfg if hasattr(flask.g, '_cfg')
           else config.get_localized_config('en'))
    body = flask.render_template('404.html', config = cfg)
    return body, 404
def init_scratch_db():
    '''Return a RetryingConnection for the 'scratch' database.

    The connection factory handed to RetryingConnection connects with
    `ch_my_cnf` and calls _ensure_database() for the configured language.
    '''
    cfg = config.get_localized_config()

    def open_scratch_connection():
        connection = _connect(ch_my_cnf)
        _ensure_database(connection, 'scratch', cfg.lang_code)
        return connection

    return RetryingConnection(open_scratch_connection)
def init_scratch_db():
    '''Return a _RetryingConnection bound to the 'scratch' database.

    The connection factory connects through _connect_to_ch_mysql() and
    then calls _use() with a fresh cursor, 'scratch', and the configured
    language code.
    '''
    cfg = config.get_localized_config()

    def open_scratch_connection():
        connection = _connect_to_ch_mysql()
        scratch_cursor = connection.cursor()
        _use(scratch_cursor, 'scratch', cfg.lang_code)
        return connection

    return _RetryingConnection(open_scratch_connection)
def print_unsourced_ids_from_wikipedia():
    '''Print the page ids found under the "citation needed" category tree.

    Starts from `cfg.citation_needed_category` and walks `categorylinks`
    level by level: ids of 'page' links are printed, ids of 'subcat'
    links are resolved to page titles and queried on the next round.

    Each category title is queried at most once, so the walk terminates
    even if the category graph contains a cycle.
    '''
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    categories = {cfg.citation_needed_category}
    seen_categories = set(categories)
    while categories:
        cl_to_clause = ' OR '.join(['cl_to = %s'] * len(categories))
        # DB-API parameters must be a sequence or a mapping; a set is
        # neither, so hand the driver a tuple.
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            cl_to_clause + ')', tuple(categories))

        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == b'page':
                print(page_id)
            elif link_type == b'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        page_id_clause = ' OR '.join(['page_id = %s'] * len(subcategories))
        cursor.execute(
            'SELECT page_title FROM page WHERE (' + page_id_clause + ')',
            tuple(subcategories))
        # Only descend into titles not visited yet; an empty result ends
        # the loop instead of producing an invalid `WHERE ()` query.
        categories = {row[0] for row in cursor} - seen_categories
        seen_categories |= categories
def install_scratch_db():
    '''Swap the tables of 'scratch' into 'citationhunt', then drop 'scratch'.

    After making sure 'citationhunt' has its tables, asks MySQL to
    generate one RENAME TABLE statement that moves every existing
    'citationhunt' table to `old_<name>` inside the scratch schema and
    every scratch table into 'citationhunt'. That statement is executed
    and the scratch database is dropped.
    '''
    cfg = config.get_localized_config()
    lang_code = cfg.lang_code
    db = init_db(lang_code)

    # ensure citationhunt is populated with tables
    create_tables(db)

    chname = _make_tools_labs_dbname(db, 'citationhunt', lang_code)
    scname = _make_tools_labs_dbname(db, 'scratch', lang_code)

    # SQL that builds the swapping RENAME TABLE statement. Modified from:
    # http://blog.shlomoid.com/2010/02/emulating-missing-rename-database.html
    build_rename_template = ''' SELECT CONCAT('RENAME TABLE ', GROUP_CONCAT('%s.', table_name, ' TO ', table_schema, '.old_', table_name, ', ', table_schema, '.', table_name, ' TO ', '%s.', table_name),';') FROM information_schema.TABLES WHERE table_schema = '%s' GROUP BY table_schema; '''
    build_rename = build_rename_template % (chname, chname, scname)

    with db as cursor:
        cursor.execute('''SET group_concat_max_len = 2048;''')
        cursor.execute(build_rename)
        generated_row = cursor.fetchone()
        rename_stmt = generated_row[0]
        cursor.execute(rename_stmt)
        cursor.execute('DROP DATABASE ' + scname)
def update_intersections():
    '''Refresh `intersections` and `articles_intersections` in scratch.

    Both tables are emptied and refilled from their 'citationhunt'
    counterparts (unexpired intersections; article links whose article
    and intersection both exist in scratch). Snippet links are then
    repopulated and empty intersections removed.
    '''
    db = chdb.init_scratch_db()
    # NOTE(review): result unused; kept in case the config loader has
    # side effects -- confirm and drop.
    cfg = config.get_localized_config()

    def refill(table, insert_template):
        # Empty the scratch table, then copy rows over from citationhunt.
        db.execute_with_retry_s('DELETE FROM ' + table)
        source = chdb.get_table_name(db, 'citationhunt', table)
        db.execute_with_retry_s(insert_template % source)

    refill(
        'intersections',
        ''' INSERT INTO intersections SELECT * FROM %s WHERE expiration > NOW()''')
    refill(
        'articles_intersections',
        ''' INSERT INTO articles_intersections SELECT * FROM %s WHERE article_id IN (SELECT page_id FROM articles) AND inter_id IN (SELECT id FROM intersections)''')

    def update_snippets_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        intersection_ids = [found[0] for found in cursor]
        if intersection_ids:
            database.populate_snippets_links(
                cursor, intersection_ids = intersection_ids)
    db.execute_with_retry(update_snippets_links)

    # Remove intersections without any article. Open question from the
    # original author: should this surface an error to the user instead?
    delete_empty = '''DELETE FROM intersections WHERE id NOT IN ( SELECT inter_id FROM articles_intersections)'''
    db.execute_with_retry_s(delete_empty)
def citation_hunt(lang_code):
    '''Serve a snippet page, or redirect to a URL that can be served.

    Reads the `id` and `cat` query arguments. Redirects when the category
    is unknown, when no id was given (a random one is selected), or when
    the snippet has no next id within the category. Aborts with 404 when
    the snippet id is unknown. Otherwise renders 'index.html'.
    '''
    snippet_id = flask.request.args.get('id')
    requested_cat = flask.request.args.get('cat')
    cfg = config.get_localized_config(lang_code)

    lang_dir = cfg.lang_dir
    if flask.current_app.debug:
        # Debug builds may override the text direction via `dir`.
        lang_dir = flask.request.args.get('dir', lang_dir)

    if requested_cat is None:
        cat = CATEGORY_ALL
    else:
        cat = get_category_by_id(lang_code, requested_cat)
        if cat is None:
            # invalid category, normalize to "all" and try again by id
            cat = CATEGORY_ALL
            return flask.redirect(
                flask.url_for('citation_hunt',
                    lang_code=lang_code, id=snippet_id, cat=cat.id))

    if snippet_id is None:
        # No snippet requested: pick one at random and redirect to it.
        random_id = select_random_id(lang_code, cat)
        return flask.redirect(
            flask.url_for('citation_hunt',
                id=random_id, cat=cat.id, lang_code=lang_code))

    sinfo = Database.query_snippet_by_id(lang_code, snippet_id)
    if sinfo is None:
        # invalid id
        flask.request.cfg = cfg
        flask.abort(404)
    snippet, section, aurl, atitle = sinfo

    next_snippet_id = select_next_id(lang_code, snippet_id, cat)
    if next_snippet_id is None:
        # the snippet doesn't belong to the category!
        assert cat is not CATEGORY_ALL
        return flask.redirect(
            flask.url_for('citation_hunt',
                id=snippet_id, cat=CATEGORY_ALL.id, lang_code=lang_code))

    autofocus = should_autofocus_category_filter(cat, flask.request)
    article_path = urlparse.urlparse(aurl).path.lstrip('/')
    article_url_path = urllib.quote(e(article_path))
    template_args = dict(
        snippet=snippet,
        section=section,
        article_url=aurl,
        article_url_path=article_url_path,
        article_title=atitle,
        current_category=cat,
        next_snippet_id=next_snippet_id,
        cn_marker=CITATION_NEEDED_MARKER,
        cn_html=CITATION_NEEDED_MARKUP,
        ref_marker=REF_MARKER,
        ref_html=SUPERSCRIPT_MARKUP,
        config=cfg,
        lang_dir=lang_dir,
        category_filter_autofocus=autofocus)
    return flask.render_template('index.html', **template_args)
def print_unsourced_ids_from_wikipedia():
    '''Print the page ids found under the "citation needed" category tree.

    Starts from `cfg.citation_needed_category` and walks `categorylinks`
    one level at a time: ids of 'page' links are printed, ids of 'subcat'
    links are turned into page titles and queried on the next round.

    Each category title is queried at most once, so the walk terminates
    even if the category graph contains a cycle.
    '''
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db()
    cursor = db.cursor()

    categories = set([cfg.citation_needed_category])
    seen_categories = set(categories)
    while categories:
        cl_to_clause = ' OR '.join(['cl_to = %s'] * len(categories))
        # DB-API parameters must be a sequence or a mapping; a set is
        # neither, so hand the driver a tuple.
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            cl_to_clause + ')', tuple(categories))

        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == 'page':
                # Single parenthesised argument: prints the same text
                # under both the print statement and the print function.
                print(page_id)
            elif link_type == 'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        page_id_clause = ' OR '.join(['page_id = %s'] * len(subcategories))
        cursor.execute(
            'SELECT page_title FROM page WHERE (' + page_id_clause + ')',
            tuple(subcategories))
        # Only descend into titles not visited yet; an empty result ends
        # the loop instead of producing an invalid `WHERE ()` query.
        categories = set([row[0] for row in cursor]) - seen_categories
        seen_categories |= categories
def init_wp_replica_db():
    '''Return a RetryingConnection that selects the configured database.

    The connection factory connects with `wp_my_cnf` and issues a `USE`
    statement for `cfg.database`.
    '''
    cfg = config.get_localized_config()

    def open_replica_connection():
        connection = _connect(wp_my_cnf)
        use_statement = 'USE ' + cfg.database
        with connection as cursor:
            cursor.execute(use_statement)
        return connection

    return RetryingConnection(open_replica_connection)
def load_hidden_categories(wpcursor):
    '''Resolve the members of the configured hidden category.

    Selects `cl_from` from `categorylinks` where `cl_to` is
    `cfg.hidden_category`, then returns the result of
    category_ids_to_names() for those page ids.
    '''
    cfg = config.get_localized_config()
    params = (cfg.hidden_category, )
    wpcursor.execute(
        ''' SELECT cl_from FROM categorylinks WHERE cl_to = %s''', params)
    hidden_page_ids = []
    for found in wpcursor:
        hidden_page_ids.append(found[0])
    return category_ids_to_names(wpcursor, hidden_page_ids)
def init_wp_replica_db(lang_code):
    '''Return a _RetryingConnection for the replica of `lang_code`.

    The connection factory connects through _connect_to_wp_mysql(cfg)
    and issues a `USE` statement for `cfg.database`.
    '''
    cfg = config.get_localized_config(lang_code)

    def open_replica_connection():
        connection = _connect_to_wp_mysql(cfg)
        use_statement = 'USE ' + cfg.database
        with connection as cursor:
            cursor.execute(use_statement)
        return connection

    return _RetryingConnection(open_replica_connection)
def category_is_usable(catname, hidden_categories):
    '''Return False for hidden or blacklisted category names, else True.

    `catname` must be a CategoryName. It is rejected when it is in
    `hidden_categories` or when any regexp from
    `cfg.category_name_regexps_blacklist` matches it (re.search).
    '''
    assert isinstance(catname, CategoryName)
    if catname in hidden_categories:
        return False
    blacklist = config.get_localized_config().category_name_regexps_blacklist
    blacklisted = any(re.search(pattern, catname) for pattern in blacklist)
    return not blacklisted
def sanity_check():
    '''Assert that the scratch database holds enough snippets and articles.

    Raises AssertionError unless the row counts of `snippets` and
    `articles` exceed `cfg.min_snippets_sanity_check` and
    `cfg.min_articles_sanity_check` respectively.
    '''
    cfg = config.get_localized_config()
    sdb = chdb.init_scratch_db()

    snippet_rows = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM snippets''')
    snippet_count = snippet_rows[0][0]
    assert snippet_count > cfg.min_snippets_sanity_check

    article_rows = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM articles''')
    article_count = article_rows[0][0]
    assert article_count > cfg.min_articles_sanity_check
def print_pageids_from_wikipedia():
    '''Print a random sample of non-redirect page ids in namespace 0.

    Each candidate row is kept when `RAND()` falls below
    `cfg.articles_sampling_fraction`; one page id is printed per line.
    '''
    cfg = config.get_localized_config()
    db = cddb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    sampling_clause = ' AND RAND() < %s' % cfg.articles_sampling_fraction
    query = ('SELECT page_id FROM page where page_namespace = 0' +
             ' AND page_is_redirect = 0' + sampling_clause)
    cursor.execute(query)
    for row in cursor:
        print(row[0])
def init_cd_db():
    '''Return a _RetryingConnection using the Citation Detective database.

    The connection factory connects through _connect_to_ch_mysql() and
    issues `USE` for the hard-coded database name below.
    '''
    # NOTE(review): the config is loaded but never read in this function;
    # kept in case the loader has side effects -- confirm and drop.
    cfg = config.get_localized_config()

    def open_cd_connection():
        connection = _connect_to_ch_mysql()
        database_name = 's54245__citationdetective_p'
        with connection.cursor() as cursor:
            cursor.execute('USE ' + database_name)
        return connection

    return _RetryingConnection(open_cd_connection)
def citation_hunt(lang_code):
    '''Render the page for one snippet, redirecting when needed.

    Query arguments: `id` (snippet) and `cat` (category). An unknown
    category, a missing id, or a snippet with no next id in the category
    each cause a redirect back to this view with adjusted arguments. An
    unknown snippet id aborts with 404.
    '''
    request_args = flask.request.args
    snippet_id = request_args.get('id')
    cat_id = request_args.get('cat')
    cfg = config.get_localized_config(lang_code)

    lang_dir = cfg.lang_dir
    if flask.current_app.debug:
        # Debug builds may override the text direction via `dir`.
        lang_dir = flask.request.args.get('dir', lang_dir)

    def redirect_to_self(**url_args):
        # Redirect to this same view with the given URL arguments.
        return flask.redirect(flask.url_for('citation_hunt', **url_args))

    cat = CATEGORY_ALL
    if cat_id is not None:
        cat = get_category_by_id(lang_code, cat_id)
        if cat is None:
            # invalid category, normalize to "all" and try again by id
            return redirect_to_self(
                lang_code = lang_code, id = snippet_id,
                cat = CATEGORY_ALL.id)

    if snippet_id is None:
        # Nothing requested: choose a random snippet in the category.
        snippet_id = select_random_id(lang_code, cat)
        return redirect_to_self(
            id = snippet_id, cat = cat.id, lang_code = lang_code)

    sinfo = Database.query_snippet_by_id(lang_code, snippet_id)
    if sinfo is None:
        # invalid id
        flask.request.cfg = cfg
        flask.abort(404)
    snippet, section, aurl, atitle = sinfo

    next_snippet_id = select_next_id(lang_code, snippet_id, cat)
    if next_snippet_id is None:
        # the snippet doesn't belong to the category!
        assert cat is not CATEGORY_ALL
        return redirect_to_self(
            id = snippet_id, cat = CATEGORY_ALL.id, lang_code = lang_code)

    autofocus = should_autofocus_category_filter(cat, flask.request)
    stripped_path = urlparse.urlparse(aurl).path.lstrip('/')
    article_url_path = urllib.quote(e(stripped_path))
    return flask.render_template(
        'index.html',
        snippet = snippet,
        section = section,
        article_url = aurl,
        article_url_path = article_url_path,
        article_title = atitle,
        current_category = cat,
        next_snippet_id = next_snippet_id,
        cn_marker = CITATION_NEEDED_MARKER,
        cn_html = CITATION_NEEDED_MARKUP,
        ref_marker = REF_MARKER,
        ref_html = SUPERSCRIPT_MARKUP,
        config = cfg,
        lang_dir = lang_dir,
        category_filter_autofocus = autofocus)
def assign_categories():
    '''Assign unsourced pages to categories and store the result.

    Collects the usable categories of every unsourced page id, keeps the
    categories whose pages total at least two snippets, and passes them
    to update_citationhunt_db(). Profiles the run when `cfg.profile` is
    set. Always returns 0.
    '''
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    sample_hidden = next(iter(hidden_categories))
    logger.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), sample_hidden))

    # Collect all usable categories and their page ids, 10000 pages at
    # a time
    for chunk in ichunk(unsourced_pageids, 10000):
        category_page_pairs = wpdb.execute_with_retry(
            load_categories_for_pages, tuple(chunk))
        for category, page_id in category_page_pairs:
            if not category_is_usable(cfg, category, hidden_categories):
                continue
            category_to_page_ids.setdefault(category, []).append(page_id)

    # Count the snippets of each category...
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    category_to_snippet_count = {}
    for category, page_ids in category_to_page_ids.iteritems():
        total = sum(page_id_to_snippet_count.get(p, 0) for p in page_ids)
        category_to_snippet_count[category] = total

    # ...and keep only the categories with at least two.
    category_name_id_and_page_ids = []
    for category, page_ids in category_to_page_ids.iteritems():
        if category_to_snippet_count[category] < 2:
            continue
        category_name_id_and_page_ids.append(
            (unicode(category), category_name_to_id(category), page_ids))
    logger.info('finished with %d categories' %
        len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    logger.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        stats = pstats.Stats(profiler).sort_stats('cumulative')
        stats.print_stats(30, 'assign_categories.py')
    return 0
def is_citation_needed(self, template):
    '''Override to control which templates count as Citation needed.

    By default a template qualifies when its name matches any entry of
    config.citation_needed_templates.
    '''
    known_templates = config.get_localized_config().citation_needed_templates
    for candidate in known_templates:
        if template.name.matches(candidate):
            return True
    return False
def reset_scratch_db():
    '''Drop and recreate the scratch database, then create its tables.

    Returns the connection obtained from init_db(), left with the freshly
    created scratch database selected.
    '''
    lang_code = config.get_localized_config().lang_code
    db = init_db(lang_code)
    with db as cursor:
        dbname = _make_tools_labs_dbname(db, 'scratch', lang_code)
        # Only the DROP runs with warnings ignored.
        with ignore_warnings():
            cursor.execute('DROP DATABASE IF EXISTS ' + dbname)
        for statement in (
                'CREATE DATABASE %s CHARACTER SET utf8mb4' % dbname,
                'USE ' + dbname):
            cursor.execute(statement)
    create_tables(db)
    return db
def assign_categories():
    '''Compute the category -> pages mapping and write it to the database.

    Gathers usable categories for all unsourced page ids, drops the
    categories whose pages have fewer than two snippets in total, and
    hands the rest to update_citationhunt_db(). When `cfg.profile` is set
    the run is profiled and the top entries printed. Always returns 0.
    '''
    cfg = config.get_localized_config()
    profiling = cfg.profile
    profiler = cProfile.Profile()
    if profiling:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Seed with the {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    log.info('loaded %d hidden categories (%s...)' % (
        len(hidden_categories), next(iter(hidden_categories))))

    # Add every usable (category, page id) pair, one chunk of page ids
    # per query
    for page_id_chunk in ichunk(unsourced_pageids, 10000):
        pairs = wpdb.execute_with_retry(
            load_categories_for_pages, tuple(page_id_chunk))
        for catname, page_id in pairs:
            if category_is_usable(cfg, catname, hidden_categories):
                category_to_page_ids.setdefault(catname, []).append(page_id)

    # Snippet totals per category
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    category_to_snippet_count = {}
    for catname, page_ids in category_to_page_ids.iteritems():
        snippet_total = 0
        for page_id in page_ids:
            snippet_total += page_id_to_snippet_count.get(page_id, 0)
        category_to_snippet_count[catname] = snippet_total

    # Only categories with two or more snippets survive.
    category_name_id_and_page_ids = [
        (unicode(catname), category_name_to_id(catname), page_ids)
        for catname, page_ids in category_to_page_ids.iteritems()
        if not category_to_snippet_count[catname] < 2
    ]
    log.info('finished with %d categories' %
        len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    elapsed = time.time() - start
    log.info('all done in %d seconds.' % elapsed)

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
def wrapper(lang_code='', *args, **kwds):
    '''Call `handler` for a known language code, else redirect to 'en'.

    The language code is always stored on `flask.g`. For an unknown code
    the redirect targets the 'citation_hunt' URL for 'en', keeping the
    request arguments and appending the request path unless it is '/'.
    '''
    flask.g._lang_code = lang_code
    if lang_code in config.LANG_CODES_TO_LANG_NAMES:
        flask.g._cfg = config.get_localized_config(lang_code)
        return handler(lang_code, *args, **kwds)

    english_url = flask.url_for(
        'citation_hunt', lang_code='en', **flask.request.args)
    response = flask.redirect(english_url)
    path = flask.request.path
    if path != '/':
        response.headers['Location'] += path
    return response
def strip_wikilink(self, wikilink, normalize, collapse):
    '''Override to control how wikilinks are stripped in the wikicode.

    The return value replaces the link. By default a link whose title
    starts with any prefix in config.wikilink_prefix_blacklist is removed
    entirely (empty string); every other link is passed on to
    self.delegate_strip().
    '''
    blacklist = config.get_localized_config().wikilink_prefix_blacklist
    if any(wikilink.title.startswith(prefix) for prefix in blacklist):
        return ''
    return self.delegate_strip(wikilink, normalize, collapse)
def page_not_found(e):
    '''Render the 404 page and return it with a 404 status code.

    Uses the config and strings stored on `flask.g` when present, and
    falls back to the 'en' config / 'en' strings otherwise. The error
    argument `e` is not used.
    '''
    if hasattr(flask.g, '_cfg'):
        cfg = flask.g._cfg
    else:
        cfg = config.get_localized_config('en')

    if hasattr(flask.g, '_strings'):
        strings = flask.g._strings
        # The request wrapper can set `_strings` without `_lang_tag`
        # (debug mode with a `locale` argument); default to 'en' rather
        # than raising AttributeError inside the error handler.
        lang_tag = getattr(flask.g, '_lang_tag', 'en')
    else:
        lang_tag = 'en'
        strings = chstrings.get_localized_strings(cfg, 'en')

    return flask.render_template(
        '404.html',
        config = cfg,
        lang_tag = lang_tag,
        lang_dir = cfg.lang_dir,
        strings = strings), 404
def page_not_found(e):
    '''Render the 404 page and return it with a 404 status code.

    Config and strings come from `flask.g` when the request got far
    enough to set them; otherwise the 'en' config and 'en' strings are
    used. The error argument `e` is not used.
    '''
    if hasattr(flask.g, '_cfg'):
        cfg = flask.g._cfg
    else:
        cfg = config.get_localized_config('en')

    if hasattr(flask.g, '_strings'):
        strings = flask.g._strings
        # `_strings` may be set without `_lang_tag` (the request wrapper
        # does so in debug mode with a `locale` argument); default to
        # 'en' instead of raising AttributeError while handling a 404.
        lang_tag = getattr(flask.g, '_lang_tag', 'en')
    else:
        lang_tag = 'en'
        strings = chstrings.get_localized_strings(cfg, 'en')

    return flask.render_template('404.html',
                                 config=cfg,
                                 lang_tag=lang_tag,
                                 lang_dir=cfg.lang_dir,
                                 strings=strings), 404
def print_unsourced_ids_from_wikipedia():
    '''Print the ids of namespace-0 pages using a citation-needed template.

    Template names come from `cfg.citation_needed_templates` with spaces
    replaced by underscores. Prints one page id per `templatelinks` row
    matched; nothing is queried or printed when no templates are
    configured.
    '''
    cfg = config.get_localized_config()
    templates = [t.replace(' ', '_') for t in cfg.citation_needed_templates]
    if not templates:
        # An empty OR-clause would be the invalid SQL `()`.
        return

    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    # Join with ' OR ' (leading space) so each placeholder is separated
    # from the keyword that follows it.
    or_clause = (
        '(' + ' OR '.join(['tl_title = %s'] * len(templates)) + ')'
    )
    # https://www.mediawiki.org/wiki/Help:Namespaces
    cursor.execute(
        'SELECT tl_from FROM templatelinks WHERE ' +
        'tl_from_namespace = 0 AND tl_namespace = 10 AND ' + or_clause,
        templates)
    for (page_id,) in cursor:
        print(page_id)
def create_tables(db):
    '''Create the tables on `db` if they do not exist yet.

    Statements run in order, with warnings ignored, on the cursor
    obtained from `with db`. Tables: categories (plus an "unassigned"
    row), articles, articles_categories, category_article_count,
    snippets, and snippets_links. The snippet column size is
    `cfg.snippet_max_size * 2`.
    '''
    cfg = config.get_localized_config()

    categories_table = ''' CREATE TABLE IF NOT EXISTS categories (id VARCHAR(128) PRIMARY KEY, title VARCHAR(255)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    unassigned_category = ''' INSERT IGNORE INTO categories VALUES("unassigned", "unassigned") '''
    articles_table = ''' CREATE TABLE IF NOT EXISTS articles (page_id INT(8) UNSIGNED PRIMARY KEY, url VARCHAR(512), title VARCHAR(512)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    articles_categories_table = ''' CREATE TABLE IF NOT EXISTS articles_categories ( article_id INT(8) UNSIGNED, category_id VARCHAR(128), FOREIGN KEY(article_id) REFERENCES articles(page_id) ON DELETE CASCADE, FOREIGN KEY(category_id) REFERENCES categories(id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    category_article_count_table = ''' CREATE TABLE IF NOT EXISTS category_article_count ( category_id VARCHAR(128), article_count INT(8) UNSIGNED, FOREIGN KEY(category_id) REFERENCES categories(id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    snippets_table = ''' CREATE TABLE IF NOT EXISTS snippets (id VARCHAR(128) PRIMARY KEY, snippet VARCHAR(%s), section VARCHAR(768), article_id INT(8) UNSIGNED, FOREIGN KEY(article_id) REFERENCES articles(page_id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    snippets_links_table = ''' CREATE TABLE IF NOT EXISTS snippets_links (prev VARCHAR(128), next VARCHAR(128), cat_id VARCHAR(128), FOREIGN KEY(prev) REFERENCES snippets(id) ON DELETE CASCADE, FOREIGN KEY(next) REFERENCES snippets(id) ON DELETE CASCADE, FOREIGN KEY(cat_id) REFERENCES categories(id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''

    with db as cursor, ignore_warnings():
        # Referenced tables are created before the tables whose foreign
        # keys point at them.
        for statement in (categories_table, unassigned_category,
                          articles_table, articles_categories_table,
                          category_article_count_table):
            cursor.execute(statement)
        cursor.execute(snippets_table, (cfg.snippet_max_size * 2, ))
        cursor.execute(snippets_links_table)
def __init__(self):
    '''Route mwparserfromhell's strip calls to this object's strip_* methods.

    For each patched node class the original `__strip__` is saved in
    `self._original_strip_methods` and replaced by a function that
    dispatches on the node's type. Also loads the localized config into
    `self.cfg`.
    '''
    # Monkey-patch mwparserfromhell to use our own methods.
    nodes = mwparserfromhell.nodes
    strip_overrides = {
        nodes.Template: self.strip_template,
        nodes.Tag: self.strip_tag,
        nodes.Wikilink: self.strip_wikilink,
        nodes.Heading: self.strip_heading,
    }

    self._original_strip_methods = {}
    for node_class in strip_overrides:
        self._original_strip_methods[node_class] = node_class.__strip__

        # `node` is the mwparserfromhell node being stripped, not this
        # object; the override is looked up by the node's exact type.
        def unbind(node, *args):
            return strip_overrides[type(node)](node, *args)
        node_class.__strip__ = unbind

    self.cfg = config.get_localized_config()
def initialize_all_databases():
    '''Create the 'citationdetective' and 'scratch' databases and tables.

    Any existing scratch database is dropped first. Both databases are
    then created if missing, and the Citation Detective tables are
    created in 'scratch' and then in 'citationdetective'.
    '''
    def _do_create_database(cursor, database, lang_code):
        dbname = _make_tools_labs_dbname(cursor, database, lang_code)
        cursor.execute('SET SESSION sql_mode = ""')
        create_statement = ('CREATE DATABASE IF NOT EXISTS %s '
            'CHARACTER SET utf8mb4' % dbname)
        cursor.execute(create_statement)

    cfg = config.get_localized_config()
    db = _RetryingConnection(_connect_to_cd_mysql)
    with db.cursor() as cursor, ignore_warnings():
        scratch_name = _make_tools_labs_dbname(
            cursor, 'scratch', cfg.lang_code)
        cursor.execute('DROP DATABASE IF EXISTS ' + scratch_name)

        for database in ('citationdetective', 'scratch'):
            _do_create_database(cursor, database, cfg.lang_code)

        # Tables go into scratch first, then citationdetective.
        for database in ('scratch', 'citationdetective'):
            _use(cursor, database, cfg.lang_code)
            _create_citationdetective_tables(cfg, cursor)
def install_scratch_db():
    '''Swap the scratch tables into 'citationhunt', then drop 'scratch'.

    Asks MySQL to generate one RENAME TABLE statement that moves each
    'citationhunt' table to `old_<name>` inside the scratch schema and
    each scratch table into 'citationhunt', executes it, and drops the
    scratch database.
    '''
    cfg = config.get_localized_config()
    lang_code = cfg.lang_code

    # SQL that builds the swapping RENAME TABLE statement. Modified from:
    # http://blog.shlomoid.com/2010/02/emulating-missing-rename-database.html
    build_rename_template = ''' SELECT CONCAT('RENAME TABLE ', GROUP_CONCAT('%s.', table_name, ' TO ', table_schema, '.old_', table_name, ', ', table_schema, '.', table_name, ' TO ', '%s.', table_name),';') FROM information_schema.TABLES WHERE table_schema = '%s' GROUP BY table_schema; '''

    with init_db(lang_code) as cursor:
        chname = _make_tools_labs_dbname(cursor, 'citationhunt', lang_code)
        scname = _make_tools_labs_dbname(cursor, 'scratch', lang_code)

        cursor.execute('''SET group_concat_max_len = 2048;''')
        cursor.execute(build_rename_template % (chname, chname, scname))
        generated_row = cursor.fetchone()
        rename_stmt = generated_row[0]
        cursor.execute(rename_stmt)
        cursor.execute('DROP DATABASE ' + scname)
def initialize_all_databases():
    '''Create the 'citationhunt', 'scratch' and 'stats' databases and tables.

    Any existing scratch database is dropped first. 'citationhunt' and
    'scratch' are created for the configured language, 'stats' under the
    name 'global'. Citation Hunt tables are created in 'scratch' and then
    'citationhunt'; stats tables in 'stats'.
    '''
    def _do_create_database(cursor, database, lang_code):
        dbname = _make_tools_labs_dbname(cursor, database, lang_code)
        cursor.execute('SET SESSION sql_mode = ""')
        create_statement = (
            'CREATE DATABASE IF NOT EXISTS %s '
            'CHARACTER SET utf8mb4' % dbname)
        cursor.execute(create_statement)

    cfg = config.get_localized_config()
    db = _RetryingConnection(_connect_to_ch_mysql)
    with db as cursor, ignore_warnings():
        scratch_name = _make_tools_labs_dbname(
            cursor, 'scratch', cfg.lang_code)
        cursor.execute('DROP DATABASE IF EXISTS ' + scratch_name)

        for database in ['citationhunt', 'scratch', 'stats']:
            # The stats database is shared, hence 'global' instead of a
            # language code.
            if database != 'stats':
                scope = cfg.lang_code
            else:
                scope = 'global'
            _do_create_database(cursor, database, scope)

        for database in ('scratch', 'citationhunt'):
            _use(cursor, database, cfg.lang_code)
            _create_citationhunt_tables(cfg, cursor)
        _use(cursor, 'stats', 'global')
        _create_stats_tables(cfg, cursor)
def wrapper(lang_code = '', *args, **kwds):
    '''Validate `lang_code`, populate `flask.g`, then call `handler`.

    Redirects (via redirect_to_lang_code) when the language code is
    empty, unknown, or has no strings. Otherwise stores the code, config
    and strings on `flask.g` and returns
    `handler(lang_code, *args, **kwds)`.
    '''
    accept_language_hdr = flask.request.headers.get('Accept-Language', '')
    if not lang_code:
        default_code = find_default_lang_code_for_request(
            accept_language_hdr)
        return redirect_to_lang_code(default_code)

    flask.g._lang_code = lang_code
    if lang_code not in config.LANG_CODES_TO_LANG_NAMES:
        return redirect_to_lang_code('en')

    cfg = config.get_localized_config(lang_code)
    flask.g._cfg = cfg

    # In debug mode a `locale` query argument overrides the strings
    # normally negotiated for the request.
    debug_locale_requested = (
        flask.current_app.debug and 'locale' in flask.request.args)
    if debug_locale_requested:
        flask.g._strings = chstrings.get_localized_strings(
            cfg, flask.request.args['locale'])
    else:
        flask.g._lang_tag, flask.g._strings = load_strings_for_request(
            lang_code, cfg, accept_language_hdr)

    if not flask.g._strings:
        # Not expected in practice: the language has a config entry but
        # no locales in the translation files.
        return redirect_to_lang_code('en')
    return handler(lang_code, *args, **kwds)
def create_tables(db):
    '''Create the tables on `db` if they do not exist yet.

    Statements run in order, with warnings ignored, on the cursor
    obtained from `with db`. Tables: categories (plus an "unassigned"
    row), articles, articles_categories, snippets, and snippets_links.
    The snippet column size is `cfg.snippet_max_size * 2`.
    '''
    cfg = config.get_localized_config()

    categories_table = ''' CREATE TABLE IF NOT EXISTS categories (id VARCHAR(128) PRIMARY KEY, title VARCHAR(255)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    unassigned_category = ''' INSERT IGNORE INTO categories VALUES("unassigned", "unassigned") '''
    articles_table = ''' CREATE TABLE IF NOT EXISTS articles (page_id VARCHAR(128) PRIMARY KEY, url VARCHAR(512), title VARCHAR(512)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    articles_categories_table = ''' CREATE TABLE IF NOT EXISTS articles_categories ( article_id VARCHAR(128), category_id VARCHAR(128), FOREIGN KEY(article_id) REFERENCES articles(page_id) ON DELETE CASCADE, FOREIGN KEY(category_id) REFERENCES categories(id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    snippets_table = ''' CREATE TABLE IF NOT EXISTS snippets (id VARCHAR(128) PRIMARY KEY, snippet VARCHAR(%s), section VARCHAR(768), article_id VARCHAR(128), FOREIGN KEY(article_id) REFERENCES articles(page_id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''
    snippets_links_table = ''' CREATE TABLE IF NOT EXISTS snippets_links (prev VARCHAR(128), next VARCHAR(128), cat_id VARCHAR(128), FOREIGN KEY(prev) REFERENCES snippets(id) ON DELETE CASCADE, FOREIGN KEY(next) REFERENCES snippets(id) ON DELETE CASCADE, FOREIGN KEY(cat_id) REFERENCES categories(id) ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 '''

    with db as cursor, ignore_warnings():
        # Referenced tables are created before the tables whose foreign
        # keys point at them.
        for statement in (categories_table, unassigned_category,
                          articles_table, articles_categories_table):
            cursor.execute(statement)
        cursor.execute(snippets_table, (cfg.snippet_max_size * 2,))
        cursor.execute(snippets_links_table)
def test_fallback_lang_tag(self):
    '''Strings missing from a locale are filled in from the fallback.'''
    gcfg = config.get_global_config()
    fallback_tag = gcfg.fallback_lang_tag
    cfg = config.get_localized_config(fallback_tag, api=False)
    fallback_strings = chstrings.get_localized_strings(cfg, fallback_tag)
    # Loaded before patching so the fake loader can still serve them.
    original = chstrings._load_strings_for_lang_tag(fallback_tag)

    def _load_strings_side_effect(lang_tag):
        # Simulate an incomplete strings file for the 'fake' locale.
        if lang_tag == 'fake':
            return {'tooltitle': 'Test Citation Hunt'}
        if lang_tag == gcfg.fallback_lang_tag:
            return original
        raise ValueError

    with mock.patch('chstrings._load_strings_for_lang_tag') as m:
        m.side_effect = _load_strings_side_effect

        # The incomplete strings must have been merged with the
        # fallback ones.
        strings = chstrings.get_localized_strings(cfg, 'fake')
        self.assertEqual('Test Citation Hunt', strings['tooltitle'])
        self.assertEqual(
            fallback_strings['instructions_goal'],
            strings['instructions_goal'])
def compute_fixed_snippets():
    '''Record clicked snippets that are gone from the scratch database.

    A snippet is stored in the stats `fixed` table when it is present in
    the live database, absent from the scratch database, and was clicked
    between the creation dates of the two `snippets` tables. Always
    returns 0.
    '''
    start = time.time()

    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    lang_code = cfg.lang_code
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(lang_code)
    stats_db = chdb.init_stats_db()

    # Snippets that were "clicked" (redirected to article) between the
    # dates of the previous/live and next/scratch database
    from_ts = live_db.execute_with_retry(load_table_creation_date, 'snippets')
    to_ts = scratch_db.execute_with_retry(load_table_creation_date, 'snippets')
    clicked = stats_db.execute_with_retry(
        load_snippet_clicks_between, lang_code, from_ts, to_ts)

    # Snippets of both databases
    scratch_snippets = scratch_db.execute_with_retry(load_snippets)
    live_snippets = live_db.execute_with_retry(load_snippets)

    # Every snippet that disappeared across databases AND was clicked in
    # the meantime goes into the stats database.
    gone = live_snippets.difference(scratch_snippets)
    for snippet_id, clicked_ts in clicked.iteritems():
        if snippet_id not in gone:
            continue
        log.info(snippet_id)
        stats_db.execute_with_retry_s(
            'INSERT INTO fixed VALUES (%s, %s, %s)',
            clicked_ts, snippet_id, lang_code)

    log.info('all done in %d seconds.' % (time.time() - start))
    for connection in (scratch_db, live_db, stats_db):
        connection.close()
    return 0
try: _update_db_tools_labs(cfg) except Exception as e: traceback.print_exc(file = sys.stderr) email('Failed to build database for %s' % cfg.lang_code, logfiles) sys.exit(1) utils.mkdir_p(cfg.log_dir) for logfile in logfiles: os.rename(logfile, os.path.join(cfg.log_dir, logfile)) if __name__ == '__main__': parser = argparse.ArgumentParser( description='Update the CitationHunt databases.') parser.add_argument('lang_code', help='One of the language codes in ../config.py') args = parser.parse_args() if not utils.running_in_tools_labs(): print('Not running in Tools Labs!', file=sys.stderr) sys.exit(1) if args.lang_code not in config.LANG_CODES_TO_LANG_NAMES: print('Invalid lang code! Use one of: ', end=' ', file=sys.stderr) print(list(config.LANG_CODES_TO_LANG_NAMES.keys()), file=sys.stderr) parser.print_usage() sys.exit(1) cfg = config.get_localized_config(args.lang_code) update_db_tools_labs(cfg)
import os os.environ['DEBUG'] = '1' # disable https redirects # Disable stats since it requires a database, and we're not # testing it anyway import config config.get_localized_config('en').flagged_off.append('stats') import app import mock import unittest class CitationHuntTest(unittest.TestCase): def setUp(self): self.app = app.app.test_client() self.sid = '93b6f3cf' self.cat = 'b5e1a25d' self.fake_snippet_info = ( 'Some snippet', 'Some section', 'https://en.wikipedia.org/wiki/A', 'Some title') methods_and_return_values = [ ('query_categories', [(self.cat, 'Category')]), ('query_snippet_by_id', self.fake_snippet_info), ('query_snippet_by_category', (self.sid,)), ('query_random_snippet', (self.sid,)), ('query_next_id', (self.sid[::-1],)), ]
CACHE_DURATION_SEMI_STATIC = 3 * 60 * 60 app = flask.Flask(__name__) Compress(app) debug = 'DEBUG' in os.environ if not debug: flask_sslify.SSLify(app, permanent = True) Mobility(app) @app.route('/') @handlers.validate_lang_code def index(lang_code): pass # nothing to do but validate lang_code app.add_url_rule('/<lang_code>', view_func = handlers.citation_hunt) if 'stats' not in config.get_localized_config('en').flagged_off: app.add_url_rule('/<lang_code>/stats.html', view_func = handlers.stats) app.after_request(handlers.log_request) @app.route('/<lang_code>/redirect') @handlers.validate_lang_code def redirect(lang_code): to = urllib.unquote(flask.request.args.get('to', '')) cfg = config.get_localized_config(lang_code) return flask.redirect( urlparse.urljoin('https://' + cfg.wikipedia_domain, to)) @app.route('/<lang_code>/categories.html') @handlers.validate_lang_code def categories_html(lang_code): response = flask.make_response(
def redirect(lang_code):
    '''Redirect to the `to` request argument on this language's Wikipedia.

    `to` is unquoted and resolved against `https://<wikipedia_domain>`.
    The request is aborted with 400 when the resolved URL does not stay
    on that https host.
    '''
    to = urllib.parse.unquote(flask.request.args.get('to', ''))
    cfg = config.get_localized_config(lang_code)
    target = urllib.parse.urljoin('https://' + cfg.wikipedia_domain, to)

    # urljoin() lets an absolute or scheme-relative `to` (for example
    # '//example.org/') replace the base host, which would make this an
    # open redirect. Only follow targets that stay on the Wikipedia host.
    parsed = urllib.parse.urlparse(target)
    same_host = parsed.netloc.lower() == cfg.wikipedia_domain.lower()
    if parsed.scheme != 'https' or not same_host:
        flask.abort(400)
    return flask.redirect(target)
def redirect(lang_code):
    '''Redirect to the `to` request argument on this language's Wikipedia.

    `to` is unquoted and resolved against `https://<wikipedia_domain>`.
    The request is aborted with 400 when the resolved URL does not stay
    on that https host.
    '''
    to = urllib.unquote(flask.request.args.get('to', ''))
    cfg = config.get_localized_config(lang_code)
    target = urlparse.urljoin('https://' + cfg.wikipedia_domain, to)

    # urljoin() lets an absolute or scheme-relative `to` (for example
    # '//example.org/') replace the base host, which would make this an
    # open redirect. Only follow targets that stay on the Wikipedia host.
    parsed = urlparse.urlparse(target)
    same_host = parsed.netloc.lower() == cfg.wikipedia_domain.lower()
    if parsed.scheme != 'https' or not same_host:
        flask.abort(400)
    return flask.redirect(target)
import textwrap def format_html(html): lynx = subprocess.Popen( 'lynx -dump -stdin -assume_charset UTF-8 ' '-display_charset UTF-8 -width 80', shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE) stdout, _ = lynx.communicate(html.encode('utf-8')) if lynx.returncode: print('Failed to render HTML! Do you have lynx?', file=sys.stderr) return html return stdout.decode('utf-8').strip('\n') if __name__ == '__main__': arguments = docopt.docopt(__doc__) cfg = config.get_localized_config() WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/' WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php' wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent) parser = snippet_parser.create_snippet_parser(wikipedia, cfg) try: int(arguments['<title_or_pageid>']) wikitext = wikipedia.get_page_contents( pageid = int(arguments['<title_or_pageid>'])) except: wikitext = wikipedia.get_page_contents( title = arguments['<title_or_pageid>'])
import docopt import mwparserfromhell try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET import signal import bz2file import pickle import itertools import urllib cfg = config.get_localized_config() WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/' NAMESPACE_ARTICLE = '0' log = Logger() def section_name_to_anchor(section): # See Sanitizer::escapeId # https://doc.wikimedia.org/mediawiki-core/master/php/html/classSanitizer.html#ae091dfff62f13c9c1e0d2e503b0cab49 section = section.replace(' ', '_') # urllib.quote interacts really weirdly with unicode in Python2: # https://bugs.python.org/issue23885 section = urllib.quote(e(section), safe = e('')) section = section.replace('%3A', ':')
def page_not_found(e):
    """Render the 404 page for the current request.

    If the request object has no ``cfg`` attribute yet, the English
    localized config is attached to it before rendering.

    Args:
        e: the error passed to this handler (not used here).

    Returns:
        A ``(rendered_body, 404)`` tuple.
    """
    request = flask.request
    if not hasattr(request, 'cfg'):
        request.cfg = config.get_localized_config('en')
    body = flask.render_template('404.html', config = request.cfg)
    return body, 404
id = mkid(d(page_title) + sni) gone_in_this_revision.pop(id, None) for snippet_id, clicked_ts in gone_in_this_revision.items(): if clicked_ts < rev['timestamp']: logger.info('%s fixed at revision %s' % ( snippet_id, rev['rev_id'])) del snippet_to_ts[snippet_id] stats_db.execute_with_retry_s( 'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)', clicked_ts, snippet_id, cfg.lang_code, rev['rev_id']) live_db.close() stats_db.close() return 0 if __name__ == '__main__': while True: start = time.time() args = docopt.docopt(__doc__) lang_codes = ( config.LANG_CODES_TO_LANG_NAMES.keys() if args['<lang-code>'] == 'global' else [args['<lang-code>']]) for lang_code in lang_codes: cfg = config.get_localized_config(lang_code) if cfg.extract == 'snippet': compute_fixed_snippets(cfg) logger.info('all done in %d seconds.' % (time.time() - start)) time.sleep(5 * 60)
def get_table_name(db, database, table):
    """Return ``table`` qualified with its database name.

    The database name is built by ``_make_tools_labs_dbname`` from a cursor
    on ``db``, the ``database`` argument and the language code of the
    current localized config.

    Args:
        db: object providing ``cursor()``.
        database: database identifier forwarded to
            ``_make_tools_labs_dbname``.
        table: name of the table to qualify.

    Returns:
        A string of the form ``<dbname>.<table>``.
    """
    cfg = config.get_localized_config()
    dbname = _make_tools_labs_dbname(db.cursor(), database, cfg.lang_code)
    return '.'.join((dbname, table))
def _preview_items(items, count):
    """Return up to ``count`` leading items of ``items``, comma-separated.

    Works on any iterable (including dict views and sets) and yields an
    empty string for an empty one, so it is safe to use in log messages.
    """
    sample = []
    for item in items:
        if len(sample) >= count:
            break
        sample.append('%s' % (item,))
    return ', '.join(sample)


def assign_categories(max_categories, mysql_default_cnf):
    """Assign categories to the unsourced pages and store them in the
    scratch database.

    Resets the category tables, loads the regular (and, on the English
    Tools Labs deployment, the pinned project-index) categories of every
    unsourced page, drops pages that have no usable category, picks the
    categories to keep via ``choose_categories`` and writes the result
    with ``update_citationhunt_db``.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function.

    Returns:
        0 on completion.
    """
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        # Dict views are not indexable on Python 3 and the index may be
        # empty, so build the log preview without subscripting.
        log.info('loaded projects for %d talk pages (%s...)' % \
            (len(projectindex), _preview_items(projectindex.values(), 1)))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), _preview_items(hidden_categories, 1)))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: pages without usable categories are removed
    # from unsourced_pageids inside the loop.
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = (wpdb.execute_with_retry(
            load_pinned_categories_for_page, projectindex, pageid)
            if projectindex else set())

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)

    # The previews tolerate fewer than two categories instead of raising
    # while formatting a purely informational message.
    log.info('found %d usable categories (%s...)' % \
        (len(categories_to_ids), _preview_items(categories_to_ids, 2)))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s)' % \
            (len(pinned_categories_to_ids),
            _preview_items(pinned_categories_to_ids, 2)))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    # Pinned categories are always kept, in addition to the chosen ones.
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
try: _update_db_tools_labs(cfg) except Exception, e: traceback.print_exc(file = sys.stderr) email('Failed to build database for %s' % cfg.lang_code, logfiles) sys.exit(1) utils.mkdir_p(cfg.log_dir) for logfile in logfiles: os.rename(logfile, os.path.join(cfg.log_dir, logfile)) if __name__ == '__main__': parser = argparse.ArgumentParser( description='Update the CitationHunt databases.') parser.add_argument('lang_code', help='One of the language codes in ../config.py') args = parser.parse_args() if not (utils.running_in_tools_labs() and utils.running_in_virtualenv()): print >>sys.stderr, 'Not running in a virtualenv in Tools Labs!' sys.exit(1) if args.lang_code not in config.LANG_CODES_TO_LANG_NAMES: print >>sys.stderr, 'Invalid lang code! Use one of: ', print >>sys.stderr, config.LANG_CODES_TO_LANG_NAMES.keys() parser.print_usage() sys.exit(1) cfg = config.get_localized_config(args.lang_code) update_db_tools_labs(cfg)