def print_unsourced_ids_from_wikipedia():
    """Print the ids of all pages in the citation-needed category tree.

    Starts from ``cfg.citation_needed_category`` and repeatedly queries
    ``categorylinks``: rows of type ``page`` are printed (one id per line),
    rows of type ``subcat`` are resolved to their page titles and queried on
    the next pass. A page id is printed once per category it is found in.
    """
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    categories = set([cfg.citation_needed_category])
    # Titles that were already queried. Without this, a cycle in the
    # category graph would keep the loop running forever.
    visited = set(categories)
    while categories:
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            ' OR '.join(['cl_to = %s'] * len(categories)) + ')',
            # DB-API parameters must be a sequence (or mapping), not a set.
            tuple(categories))
        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == b'page':
                print(page_id)
            elif link_type == b'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        cursor.execute(
            'SELECT page_title FROM page WHERE (' +
            ' OR '.join(['page_id = %s'] * len(subcategories)) + ')',
            tuple(subcategories))
        categories = set(r[0] for r in cursor) - visited
        visited |= categories
def query_rev_users(lang_code, rev_ids):
    """Return the ``rev_user_text`` values for the given revision ids.

    Rows whose ``rev_user`` is 0 are excluded by the query.

    Args:
        lang_code: passed to ``chdb.init_wp_replica_db`` to pick the replica.
        rev_ids: iterable of revision ids; may be empty.

    Returns:
        A list of user names decoded from UTF-8, one per matching row.
    """
    rev_ids = tuple(rev_ids)
    if not rev_ids:
        # An empty tuple would be rendered as "rev_id IN ()", which is a
        # syntax error in MySQL; there is nothing to look up anyway.
        return []

    wpdb = chdb.init_wp_replica_db(lang_code)
    with wpdb as cursor:
        cursor.execute(
            'SELECT rev_user_text FROM revision_userindex '
            'WHERE rev_user != 0 AND rev_id IN %s', (rev_ids,))
        return [row[0].decode('utf-8') for row in cursor.fetchall()]
def print_unsourced_ids_from_wikipedia():
    """Print the ids of all pages in the citation-needed category tree.

    Starts from ``cfg.citation_needed_category`` and repeatedly queries
    ``categorylinks``: rows of type ``page`` are printed (one id per line),
    rows of type ``subcat`` are resolved to their page titles and queried on
    the next pass. A page id is printed once per category it is found in.
    """
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db()
    cursor = db.cursor()

    categories = set([cfg.citation_needed_category])
    # Titles that were already queried. Without this, a cycle in the
    # category graph would keep the loop running forever.
    visited = set(categories)
    while categories:
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            ' OR '.join(['cl_to = %s'] * len(categories)) + ')',
            # DB-API parameters must be a sequence (or mapping), not a set.
            tuple(categories))
        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == 'page':
                # Parenthesised single value: same output as the print
                # statement on Python 2, and valid syntax on Python 3.
                print(page_id)
            elif link_type == 'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        cursor.execute(
            'SELECT page_title FROM page WHERE (' +
            ' OR '.join(['page_id = %s'] * len(subcategories)) + ')',
            tuple(subcategories))
        categories = set(r[0] for r in cursor) - visited
        visited |= categories
def query_rev_users(lang_code, rev_ids):
    """Return the ``actor_name`` values for the given revision ids.

    Joins ``actor`` to ``revision_userindex`` on ``actor_id = rev_actor`` and
    keeps only actors whose ``actor_user`` is not NULL.

    Args:
        lang_code: passed to ``chdb.init_wp_replica_db`` to pick the replica.
        rev_ids: iterable of revision ids; may be empty.

    Returns:
        A list of actor names decoded from UTF-8, one per matching row.
    """
    rev_ids = tuple(rev_ids)
    if not rev_ids:
        # An empty tuple would be rendered as "rev_id IN ()", which is a
        # syntax error in MySQL; there is nothing to look up anyway.
        return []

    wpdb = chdb.init_wp_replica_db(lang_code)
    with wpdb.cursor() as cursor:
        cursor.execute(
            'SELECT actor_name FROM actor '
            'JOIN revision_userindex ON actor_id = rev_actor '
            'WHERE NOT ISNULL(actor_user) AND rev_id IN %s', (rev_ids,))
        return [row[0].decode('utf-8') for row in cursor.fetchall()]
def assign_categories():
    """Group unsourced pages by category and pass the result on for storage.

    Builds a {category -> [page ids]} mapping from the project index plus the
    usable categories of every unsourced page, keeps the categories whose
    pages add up to at least two snippets, and hands them to
    ``update_citationhunt_db``. When ``cfg.profile`` is set the run is
    profiled and the top entries are printed at the end.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    # next() is given a default so that an empty result is logged instead of
    # aborting the whole run with StopIteration.
    logger.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), next(iter(hidden_categories), '')))

    # Load all usable categories and page ids
    for chunk in ichunk(unsourced_pageids, 10000):
        for category, page_id in wpdb.execute_with_retry(
                load_categories_for_pages, tuple(chunk)):
            if category_is_usable(cfg, category, hidden_categories):
                category_to_page_ids.setdefault(category, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    logger.info('finished with %d categories' % len(
        category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    logger.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
def assign_categories():
    """Group unsourced pages by category and pass the result on for storage.

    Builds a {category -> [page ids]} mapping from the project index plus the
    usable categories of every unsourced page, keeps the categories whose
    pages add up to at least two snippets, and hands them to
    ``update_citationhunt_db``. When ``cfg.profile`` is set the run is
    profiled and the top entries are printed at the end.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    # next() is given a default so that an empty result is logged instead of
    # aborting the whole run with StopIteration.
    log.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), next(iter(hidden_categories), '')))

    # Load all usable categories and page ids
    for chunk in ichunk(unsourced_pageids, 10000):
        for category, page_id in wpdb.execute_with_retry(
                load_categories_for_pages, tuple(chunk)):
            if category_is_usable(cfg, category, hidden_categories):
                category_to_page_ids.setdefault(category, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' % len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    log.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
def print_unsourced_ids_from_wikipedia():
    """Print the id of every article that transcludes a configured template.

    The template names come from ``cfg.citation_needed_templates`` with
    spaces replaced by underscores. One page id is printed per matching
    ``templatelinks`` row. Nothing is queried or printed when no templates
    are configured.
    """
    cfg = config.get_localized_config()
    templates = [t.replace(' ', '_') for t in cfg.citation_needed_templates]
    if not templates:
        # An empty list would produce "AND ()", which is not valid SQL.
        return

    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    # The separator needs a leading space as well, otherwise the placeholder
    # is glued to the keyword ("tl_title = %sOR tl_title = %s").
    or_clause = (
        '(' + ' OR '.join(['tl_title = %s'] * len(templates)) + ')'
    )
    # Namespace numbers are listed at
    # https://www.mediawiki.org/wiki/Help:Namespaces
    cursor.execute(
        'SELECT tl_from FROM templatelinks WHERE ' +
        'tl_from_namespace = 0 AND tl_namespace = 10 AND ' + or_clause,
        templates)
    for (page_id,) in cursor:
        print(page_id)
def assign_categories(max_categories, mysql_default_cnf):
    """Pick categories for the unsourced pages and store them.

    Resets the scratch tables, collects the usable categories of every
    unsourced page, drops pages that have none, lets ``choose_categories``
    select from the result and passes the selection to
    ``update_citationhunt_db``.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function.

    Returns:
        0, always.
    """
    chdb = chdb_.init_scratch_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    wpdb = chdb_.init_wp_replica_db()
    wpcursor = wpdb.cursor()
    # Sanity checks: both tables must be readable and non-empty.
    assert wpcursor.execute('SELECT * FROM page LIMIT 1;') == 1
    assert wpcursor.execute('SELECT * FROM categorylinks LIMIT 1;') == 1

    hidden_categories = load_hidden_categories(wpcursor)
    # next() is given a default so that an empty set does not raise
    # StopIteration.
    log.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), next(iter(hidden_categories), '')))

    categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: pages without categories are removed on the way.
    for n, pageid in enumerate(list(unsourced_pageids)):
        page_has_at_least_one_category = False
        for catname in load_categories_for_page(wpcursor, pageid):
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    # Show up to two examples; indexing keys()[0] and keys()[1] directly
    # would raise IndexError with fewer than two categories.
    sample_categories = list(categories_to_ids)[:2]
    log.info('found %d usable categories (%s...)' %
        (len(categories_to_ids), ', '.join(sample_categories)))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
#!/usr/bin/env python
# Prints the id of every page linked directly to the configured
# citation-needed category, one id per line.

import sys
sys.path.append('../')

import chdb
import config

cfg = config.get_localized_config()
db = chdb.init_wp_replica_db()
cursor = db.cursor()

query = 'SELECT cl_from FROM categorylinks WHERE cl_to = %s'
cursor.execute(query, (cfg.citation_needed_category,))
# Each row holds the single selected column, cl_from.
for (page_id,) in cursor:
    print(page_id)
def assign_categories(mysql_default_cnf):
    """Group unsourced pages by category and pass the result on for storage.

    Resets the scratch tables, builds a {category -> [page ids]} mapping from
    the project index and the usable categories of every unsourced page,
    keeps the categories whose pages add up to at least two snippets, and
    hands them to ``update_citationhunt_db``. When ``cfg.profile`` is set the
    run is profiled and the top entries are printed at the end.

    Args:
        mysql_default_cnf: not used by this function.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load a list of (wikiproject, page ids), if applicable
    # FIXME: We load all category -> page id mappings for all projects, then
    # filter out the ones with no unsourced snippets. It's likely better to just
    # query the projects of the pages we know of instead.
    projectindex = load_projectindex(cfg)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(load_hidden_categories, cfg)
    # next() is given a default so that an empty result is logged instead of
    # aborting the whole run with StopIteration.
    log.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), next(iter(hidden_categories), '')))

    # Load all usable categories into a dict category -> [page ids]
    category_to_page_ids = {}
    for project, page_id in projectindex:
        if page_id in unsourced_pageids:
            category_to_page_ids.setdefault(project, []).append(page_id)
    for chunk in ichunk(unsourced_pageids, 10000):
        for category, page_id in wpdb.execute_with_retry(
                load_categories_for_pages, chunk):
            if category_is_usable(cfg, category, hidden_categories):
                category_to_page_ids.setdefault(category, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' % len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    log.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
def assign_categories(max_categories, mysql_default_cnf):
    """Pick regular and pinned categories for unsourced pages and store them.

    Resets the scratch tables and collects, for every unsourced page, its
    usable categories and (when a project index was loaded) its usable pinned
    categories. Pages with neither are dropped. ``choose_categories`` selects
    from the regular ones, every pinned category is added on top, and the
    union is passed to ``update_citationhunt_db``.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        # next() with a default instead of values()[0], which raises
        # IndexError when the index comes back empty.
        log.info('loaded projects for %d talk pages (%s...)' %
            (len(projectindex), next(iter(projectindex.values()), '')))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), next(iter(hidden_categories), '')))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: pages without categories are removed on the way.
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = (wpdb.execute_with_retry(
            load_pinned_categories_for_page, projectindex, pageid)
            if projectindex else set())

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    # Show up to two examples of each kind; indexing keys()[0] and keys()[1]
    # directly would raise IndexError with fewer than two entries.
    log.info('found %d usable categories (%s...)' %
        (len(categories_to_ids), ', '.join(list(categories_to_ids)[:2])))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s)' %
            (len(pinned_categories_to_ids),
            ', '.join(list(pinned_categories_to_ids)[:2])))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
def assign_categories(max_categories, mysql_default_cnf):
    """Pick regular and pinned categories for unsourced pages and store them.

    Resets the scratch tables and collects, for every unsourced page, its
    usable categories and the usable pinned categories listed for it in the
    project index. Pages with neither are dropped. ``choose_categories``
    selects from the regular ones, every pinned category is added on top, and
    the union is passed to ``update_citationhunt_db``.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        # next() with a default instead of values()[0], which raises
        # IndexError when the index comes back empty.
        log.info('loaded projects for %d pages (%s...)' %
            (len(projectindex), next(iter(projectindex.values()), '')))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' %
        (len(hidden_categories), next(iter(hidden_categories), '')))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: pages without categories are removed on the way.
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = set(projectindex.get(pageid, []))

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    # Show up to two examples of each kind; indexing keys()[0] and keys()[1]
    # directly would raise IndexError with fewer than two entries.
    log.info('found %d usable categories (%s...)' %
        (len(categories_to_ids), ', '.join(list(categories_to_ids)[:2])))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s)' %
            (len(pinned_categories_to_ids),
            ', '.join(list(pinned_categories_to_ids)[:2])))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
#!/usr/bin/env python import sys sys.path.append('../') import chdb import config cfg = config.get_localized_config() db = chdb.init_wp_replica_db() cursor = db.cursor() categories = set([cfg.citation_needed_category]) while True: cursor.execute( 'SELECT cl_from, cl_type FROM categorylinks WHERE (' + ' OR '.join(['cl_to = %s'] * len(categories)) + ')', categories) subcategories = set() for page_id, type in cursor: if type == 'page': print page_id elif type == 'subcat': subcategories.add(page_id) if not subcategories: break # need to convert the page ids of subcategories into page # titles so we can query recursively cursor.execute( 'SELECT page_title FROM page WHERE (' +