def update_intersections():
    db = chdb.init_scratch_db()
    cfg = config.get_localized_config()
    db.execute_with_retry_s('DELETE FROM intersections')
    db.execute_with_retry_s('''
        INSERT INTO intersections SELECT * FROM %s
        WHERE expiration > NOW()''' % chdb.get_table_name(
            db, 'citationhunt', 'intersections'))
    db.execute_with_retry_s('DELETE FROM articles_intersections')
    db.execute_with_retry_s('''
        INSERT INTO articles_intersections SELECT * FROM %s
        WHERE article_id IN (SELECT page_id FROM articles)
        AND inter_id IN (SELECT id FROM intersections)''' % chdb.get_table_name(
            db, 'citationhunt', 'articles_intersections'))

    def update_snippets_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        intersection_ids = [row[0] for row in cursor]
        if intersection_ids:
            database.populate_snippets_links(
                cursor, intersection_ids = intersection_ids)
    db.execute_with_retry(update_snippets_links)

    # delete empty intersections. should this surface an error to the user
    # instead?
    db.execute_with_retry_s(
        '''DELETE FROM intersections WHERE id NOT IN (
           SELECT inter_id FROM articles_intersections)''')

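# execute_with_retry / execute_with_retry_s belong to the project's chdb
# database wrapper and are not shown in these snippets. A rough sketch of
# plausible semantics (hypothetical names and signatures, commit/rollback
# handling omitted, not the actual chdb implementation): the former runs a
# callback against a cursor and retries on transient MySQL errors, the
# latter is the convenience form for a single SQL statement.
import MySQLdb

class Database(object):  # hypothetical stand-in for the chdb wrapper
    def __init__(self, **connect_kwargs):
        self.connect_kwargs = connect_kwargs
        self.conn = MySQLdb.connect(**connect_kwargs)

    def reconnect(self):
        self.conn = MySQLdb.connect(**self.connect_kwargs)

    def execute_with_retry(self, callback, *args):
        # Run the callback against a fresh cursor, retrying a few times on
        # transient errors (e.g. dropped connections).
        last_error = None
        for _ in range(3):
            try:
                cursor = self.conn.cursor()
                try:
                    return callback(cursor, *args)
                finally:
                    cursor.close()
            except MySQLdb.OperationalError as error:
                last_error = error
                self.reconnect()
        raise last_error

    def execute_with_retry_s(self, sql, *args):
        # One-off form for a single SQL statement; returns all result rows.
        def run(cursor):
            cursor.execute(sql, args)
            return cursor.fetchall()
        return self.execute_with_retry(run)
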
def initializer(backdir):
    self.backdir = backdir

    # Monkey-patch wikitools to always use our existing session
    opener = WikitoolsRequestsAdapter()
    APIRequest = wikitools.api.APIRequest
    class RequestsAPIRequest(wikitools.api.APIRequest):
        def __init__(self, *args, **kwds):
            APIRequest.__init__(self, *args, **kwds)
            self.opener = opener
    wikitools.APIRequest = RequestsAPIRequest
    wikitools.api.APIRequest = RequestsAPIRequest

    self.wiki = wikitools.wiki.Wiki(WIKIPEDIA_API_URL)
    self.wiki.setUserAgent(
        'citationhunt (https://tools.wmflabs.org/citationhunt)')
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.chdb = chdb.init_scratch_db()
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()

    # Undocumented :( https://stackoverflow.com/questions/24717468
    multiprocessing.util.Finalize(None, finalizer, exitpriority=16)

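# WikitoolsRequestsAdapter is defined elsewhere in the project. Wikitools'
# APIRequest normally builds its own urllib2 opener per request; the patch
# above swaps in one shared adapter instead. A very rough sketch of the
# shape such an adapter could take (all names and details here are
# assumptions, not the project's actual code): accept a urllib2.Request-like
# object and answer with a file-like response body, routing everything
# through a single requests.Session.
import io
import requests

class WikitoolsRequestsAdapter(object):
    def __init__(self):
        self.session = requests.Session()  # one session reused for all calls

    def open(self, request):
        # Mimic urllib2's opener interface: wikitools reads the response as
        # a file-like object.
        response = self.session.post(
            request.get_full_url(), data=request.get_data(),
            headers=dict(request.header_items()))
        return io.BytesIO(response.content)
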
def work(citation_detective, job):
    rows = []
    if not citation_detective:
        results = query_article_data(self.wiki, job, None)
    else:
        results = query_article_data(
            self.wiki, None, set([row[0] for row in job]))
    for pageid, revid, title, wikitext in results:
        url = WIKIPEDIA_WIKI_URL + title.replace(' ', '_')
        snippets_rows = []
        if not citation_detective:
            snippets = self.parser.extract(wikitext)
        else:
            sentences = [row[1] for row in job if row[0] == revid]
            snippets = self.parser.extract_from_sentences(
                wikitext, sentences)
        for sec, snips in snippets:
            sec = section_name_to_anchor(sec)
            for sni in snips:
                id = mkid(title + sni)
                row = (id, sni, sec, pageid)
                snippets_rows.append(row)
        if snippets_rows:
            article_row = (pageid, url, title)
            rows.append({'article': article_row, 'snippets': snippets_rows})

    # Open a short-lived connection to try to avoid the limit of 20 per user:
    # https://phabricator.wikimedia.org/T216170
    db = chdb.init_scratch_db()
    for r in rows:
        db.execute_with_retry(insert, r)

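# mkid is defined elsewhere in the project. A minimal sketch of what it
# plausibly does (an assumption, not the actual implementation): derive a
# short, stable snippet id by hashing title + snippet text, so the same
# snippet gets the same id across database rebuilds.
import hashlib

def mkid(s):
    return hashlib.sha1(s.encode('utf-8')).hexdigest()[:8]
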
def update_intersections():
    db = chdb.init_scratch_db()
    cfg = config.get_localized_config(api=False)
    db.execute_with_retry_s('DELETE FROM intersections')
    db.execute_with_retry_s('''
        INSERT INTO intersections SELECT * FROM %s
        WHERE expiration > NOW()''' % chdb.get_table_name(
            db, 'citationhunt', 'intersections'))
    db.execute_with_retry_s('DELETE FROM articles_intersections')
    db.execute_with_retry_s('''
        INSERT INTO articles_intersections SELECT * FROM %s
        WHERE article_id IN (SELECT page_id FROM articles)
        AND inter_id IN (SELECT id FROM intersections)''' % chdb.get_table_name(
            db, 'citationhunt', 'articles_intersections'))

    def update_snippets_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        intersection_ids = [row[0] for row in cursor]
        if intersection_ids:
            database.populate_snippets_links(
                cursor, intersection_ids=intersection_ids)
    db.execute_with_retry(update_snippets_links)

    # delete empty intersections. should this surface an error to the user
    # instead?
    db.execute_with_retry_s('''DELETE FROM intersections WHERE id NOT IN (
        SELECT inter_id FROM articles_intersections)''')

def sanity_check():
    sdb = chdb.init_scratch_db()
    snippet_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM snippets''')[0][0]
    assert snippet_count > 100
    article_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM articles''')[0][0]
    assert article_count > 100

def sanity_check():
    cfg = config.get_localized_config()
    sdb = chdb.init_scratch_db()
    snippet_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM snippets''')[0][0]
    assert snippet_count > cfg.min_snippets_sanity_check
    article_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM articles''')[0][0]
    assert article_count > cfg.min_articles_sanity_check

def work(pageids):
    rows = []
    results = query_pageids(self.wiki, pageids)
    for pageid, title, wikitext in results:
        url = WIKIPEDIA_WIKI_URL + title.replace(' ', '_')
        snippets_rows = []
        snippets = self.parser.extract(wikitext)
        for sec, snips in snippets:
            sec = section_name_to_anchor(sec)
            for sni in snips:
                id = mkid(title + sni)
                row = (id, sni, sec, pageid)
                snippets_rows.append(row)
        if snippets_rows:
            article_row = (pageid, url, title)
            rows.append({'article': article_row, 'snippets': snippets_rows})

    def insert(cursor, r):
        cursor.execute('''
            INSERT INTO articles VALUES(%s, %s, %s)''', r['article'])
        cursor.executemany('''
            INSERT IGNORE INTO snippets VALUES(%s, %s, %s, %s)''',
            r['snippets'])
        # We can't allow data to be truncated for HTML snippets, as that can
        # completely break the UI, so we detect truncation warnings and get rid
        # of the corresponding data.
        warnings = cursor.execute('SHOW WARNINGS')
        truncated_snippets = []
        for _, _, message in cursor.fetchall():
            m = DATA_TRUNCATED_WARNING_RE.match(message)
            if m is None:
                # Not a truncation, ignore (it's already logged)
                continue
            # MySQL warnings index rows starting at 1
            idx = int(m.groups()[0]) - 1
            truncated_snippets.append((r['snippets'][idx][0], ))
        if len(truncated_snippets) < len(r['snippets']):
            cursor.executemany('''
                DELETE FROM snippets WHERE id = %s''', truncated_snippets)
        else:
            # Every single snippet was truncated, remove the article itself
            cursor.execute('''DELETE FROM articles WHERE page_id = %s''',
                (r['article'][0], ))

    # Open a short-lived connection to try to avoid the limit of 20 per user:
    # https://phabricator.wikimedia.org/T216170
    db = chdb.init_scratch_db()
    for r in rows:
        db.execute_with_retry(insert, r)

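# DATA_TRUNCATED_WARNING_RE is not shown in these snippets. A plausible
# definition (an assumption, based on the text of MySQL's standard warning
# 1265, "Data truncated for column 'x' at row N") captures the 1-based row
# index that the loop above converts back to a list offset:
import re

DATA_TRUNCATED_WARNING_RE = re.compile(
    r"Data truncated for column '.*' at row (\d+)")
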
def assign_categories():
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()

    start = time.time()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    logger.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    # Load all usable categories and page ids
    for c in ichunk(unsourced_pageids, 10000):
        for c, p in wpdb.execute_with_retry(
                load_categories_for_pages, tuple(c)):
            if category_is_usable(cfg, c, hidden_categories):
                category_to_page_ids.setdefault(c, []).append(p)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    logger.info('finished with %d categories' % len(
        category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()

    logger.info('all done in %d seconds.' % (time.time() - start))
    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0

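# ichunk is a helper defined elsewhere. A minimal sketch of a compatible
# implementation (an assumption): lazily yield successive chunks of at most
# `size` items, so categories can be fetched for 10000 pages per query
# instead of one query per page.
import itertools

def ichunk(iterable, size):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            break
        yield chunk
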
def assign_categories():
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()

    start = time.time()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    # Load all usable categories and page ids
    for c in ichunk(unsourced_pageids, 10000):
        for c, p in wpdb.execute_with_retry(
                load_categories_for_pages, tuple(c)):
            if category_is_usable(cfg, c, hidden_categories):
                category_to_page_ids.setdefault(c, []).append(p)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' % len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()

    log.info('all done in %d seconds.' % (time.time() - start))
    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0

def initializer(backdir):
    self.backdir = backdir
    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.chdb = chdb.init_scratch_db()
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()

    # Undocumented :( https://stackoverflow.com/questions/24717468
    multiprocessing.util.Finalize(None, finalizer, exitpriority=16)

def work(pageids):
    rows = []
    results = query_pageids(self.wiki, pageids)
    for pageid, title, wikitext in results:
        url = WIKIPEDIA_WIKI_URL + title.replace(' ', '_')
        snippets_rows = []
        snippets = self.parser.extract(wikitext)
        for sec, snips in snippets:
            sec = section_name_to_anchor(sec)
            for sni in snips:
                id = mkid(title + sni)
                row = (id, sni, sec, pageid)
                snippets_rows.append(row)
        if snippets_rows:
            article_row = (pageid, url, title)
            rows.append({'article': article_row, 'snippets': snippets_rows})

    def insert(cursor, r):
        cursor.execute('''
            INSERT INTO articles VALUES(%s, %s, %s)''', r['article'])
        cursor.executemany('''
            INSERT IGNORE INTO snippets VALUES(%s, %s, %s, %s)''',
            r['snippets'])
        # We can't allow data to be truncated for HTML snippets, as that can
        # completely break the UI, so we detect truncation warnings and get rid
        # of the corresponding data.
        warnings = cursor.execute('SHOW WARNINGS')
        truncated_snippets = []
        for _, _, message in cursor.fetchall():
            m = DATA_TRUNCATED_WARNING_RE.match(message)
            if m is None:
                # Not a truncation, ignore (it's already logged)
                continue
            # MySQL warnings index rows starting at 1
            idx = int(m.groups()[0]) - 1
            truncated_snippets.append((r['snippets'][idx][0],))
        if len(truncated_snippets) < len(r['snippets']):
            cursor.executemany('''
                DELETE FROM snippets WHERE id = %s''', truncated_snippets)
        else:
            # Every single snippet was truncated, remove the article itself
            cursor.execute('''DELETE FROM articles WHERE page_id = %s''',
                (r['article'][0],))

    # Open a short-lived connection to try to avoid the limit of 20 per user:
    # https://phabricator.wikimedia.org/T216170
    db = chdb.init_scratch_db()
    for r in rows:
        db.execute_with_retry(insert, r)

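# section_name_to_anchor is defined elsewhere in the project. A hedged
# sketch of the idea (an assumption, not the actual implementation):
# MediaWiki turns a section heading into a URL fragment by replacing spaces
# with underscores (the real Sanitizer rules also dot-escape most other
# non-alphanumeric characters), so snippets must store the section in anchor
# form for "#section" links back to the article to work.
def section_name_to_anchor(section):
    return section.replace(' ', '_')
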
def assign_categories(max_categories, mysql_default_cnf):
    chdb = chdb_.init_scratch_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    wpdb = chdb_.init_wp_replica_db()
    wpcursor = wpdb.cursor()
    assert wpcursor.execute('SELECT * FROM page LIMIT 1;') == 1
    assert wpcursor.execute('SELECT * FROM categorylinks LIMIT 1;') == 1

    hidden_categories = load_hidden_categories(wpcursor)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    for n, pageid in enumerate(list(unsourced_pageids)):
        page_has_at_least_one_category = False
        for catname in load_categories_for_page(wpcursor, pageid):
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))
    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s, %s...)' % \
        (len(categories_to_ids), categories_to_ids.keys()[0],
        categories_to_ids.keys()[1]))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    update_citationhunt_db(chdb, categories)

    wpdb.close()
    chdb.close()
    return 0

def compute_fixed_snippets():
    start = time.time()

    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Find the set of snippets that were "clicked" (redirected to article)
    # between the creation dates of the previous/live and next/scratch
    # databases
    from_ts = live_db.execute_with_retry(load_table_creation_date, 'snippets')
    to_ts = scratch_db.execute_with_retry(load_table_creation_date, 'snippets')
    clicked = stats_db.execute_with_retry(
        load_snippet_clicks_between, cfg.lang_code, from_ts, to_ts)

    # Load the snippets from both databases
    scratch_snippets = scratch_db.execute_with_retry(load_snippets)
    live_snippets = live_db.execute_with_retry(load_snippets)

    # And for each snippet that disappeared across databases AND had been
    # clicked in the meantime, store its information in the stats database.
    gone = live_snippets.difference(scratch_snippets)
    for id, clicked_ts in clicked.iteritems():
        if id in gone:
            log.info(id)
            stats_db.execute_with_retry_s(
                'INSERT INTO fixed VALUES (%s, %s, %s)',
                clicked_ts, id, cfg.lang_code)

    log.info('all done in %d seconds.' % (time.time() - start))
    scratch_db.close()
    live_db.close()
    stats_db.close()
    return 0

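# load_snippets is a helper defined elsewhere. A minimal sketch of what the
# set difference above requires (an assumption): return the ids of all
# snippets as a set, so the live and scratch databases can be diffed with
# set operations.
def load_snippets(cursor):
    cursor.execute('SELECT id FROM snippets')
    return set(row[0] for row in cursor)
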
def assign_categories(mysql_default_cnf):
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()

    start = time.time()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load a list of (wikiproject, page ids), if applicable
    # FIXME: We load all category -> page id mappings for all projects, then
    # filter out the ones with no unsourced snippets. It's likely better to
    # just query the projects of the pages we know of instead.
    projectindex = load_projectindex(cfg)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(load_hidden_categories, cfg)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    # Load all usable categories into a dict category -> [page ids]
    category_to_page_ids = {}
    for c, p in projectindex:
        if p in unsourced_pageids:
            category_to_page_ids.setdefault(c, []).append(p)
    for c in ichunk(unsourced_pageids, 10000):
        for c, p in wpdb.execute_with_retry(load_categories_for_pages, c):
            if category_is_usable(cfg, c, hidden_categories):
                category_to_page_ids.setdefault(c, []).append(p)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' %
        len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()

    log.info('all done in %d seconds.' % (time.time() - start))
    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0

def assign_categories(max_categories, mysql_default_cnf):
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        log.info('loaded projects for %d talk pages (%s...)' % \
            (len(projectindex), projectindex.values()[0]))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = (wpdb.execute_with_retry(
            load_pinned_categories_for_page, projectindex, pageid)
            if projectindex else set())

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))
    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s, %s...)' % \
        (len(categories_to_ids), categories_to_ids.keys()[0],
        categories_to_ids.keys()[1]))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s, %s)' % \
            (len(pinned_categories_to_ids),
            pinned_categories_to_ids.keys()[0],
            pinned_categories_to_ids.keys()[1]))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())
    update_citationhunt_db(chdb, categories)

    wpdb.close()
    chdb.close()
    return 0

def assign_categories(max_categories, mysql_default_cnf):
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        log.info('loaded projects for %d pages (%s...)' % \
            (len(projectindex), projectindex.values()[0]))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = set(projectindex.get(pageid, []))

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))
    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s, %s...)' % \
        (len(categories_to_ids), categories_to_ids.keys()[0],
        categories_to_ids.keys()[1]))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s, %s)' % \
            (len(pinned_categories_to_ids),
            pinned_categories_to_ids.keys()[0],
            pinned_categories_to_ids.keys()[1]))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())
    update_citationhunt_db(chdb, categories)

    wpdb.close()
    chdb.close()
    return 0

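# choose_categories is defined elsewhere in the project. One plausible shape
# for it -- purely an illustrative sketch under assumed semantics, not the
# actual algorithm -- is a greedy cover: repeatedly pick the category
# containing the most not-yet-covered pages until max_categories are chosen
# or every page is covered. The return type matches the callers above: a set
# of (category name, frozenset of page ids) tuples.
def choose_categories(categories_to_ids, unsourced_pageids, max_categories):
    chosen = set()
    uncovered = set(unsourced_pageids)
    candidates = dict(categories_to_ids)
    while uncovered and candidates and len(chosen) < max_categories:
        # Greedily take the category that covers the most remaining pages.
        catname, page_ids = max(
            candidates.items(), key=lambda kv: len(kv[1] & uncovered))
        if not page_ids & uncovered:
            break
        chosen.add((catname, frozenset(page_ids)))
        uncovered -= page_ids
        del candidates[catname]
    return chosen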