def update_intersections():
    db = chdb.init_scratch_db()
    cfg = config.get_localized_config()
    db.execute_with_retry_s('DELETE FROM intersections')
    db.execute_with_retry_s('''
        INSERT INTO intersections SELECT * FROM %s
        WHERE expiration > NOW()''' % chdb.get_table_name(
            db, 'citationhunt', 'intersections'))
    db.execute_with_retry_s('DELETE FROM articles_intersections')
    db.execute_with_retry_s('''
        INSERT INTO articles_intersections SELECT * FROM %s
        WHERE article_id IN (SELECT page_id FROM articles)
        AND inter_id IN (SELECT id FROM intersections)''' % chdb.get_table_name(
            db, 'citationhunt', 'articles_intersections'))

    def update_snippets_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        intersection_ids = [row[0] for row in cursor]
        if intersection_ids:
            database.populate_snippets_links(
                cursor, intersection_ids = intersection_ids)
    db.execute_with_retry(update_snippets_links)

    # delete empty intersections. should this surface an error to the user
    # instead?
    db.execute_with_retry_s(
        '''DELETE FROM intersections WHERE id NOT IN (
           SELECT inter_id FROM articles_intersections)''')

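# execute_with_retry / execute_with_retry_s belong to the project's chdb
# database wrapper and are not shown in these snippets. A rough sketch of
# plausible semantics (hypothetical names and signatures, commit/rollback
# handling omitted, not the actual chdb implementation): the former runs a
# callback against a cursor and retries on transient MySQL errors, the
# latter is the convenience form for a single SQL statement.
import MySQLdb

class Database(object):  # hypothetical stand-in for the chdb wrapper
    def __init__(self, **connect_kwargs):
        self.connect_kwargs = connect_kwargs
        self.conn = MySQLdb.connect(**connect_kwargs)

    def reconnect(self):
        self.conn = MySQLdb.connect(**self.connect_kwargs)

    def execute_with_retry(self, callback, *args):
        # Run the callback against a fresh cursor, retrying a few times on
        # transient errors (e.g. dropped connections).
        last_error = None
        for _ in range(3):
            try:
                cursor = self.conn.cursor()
                try:
                    return callback(cursor, *args)
                finally:
                    cursor.close()
            except MySQLdb.OperationalError as error:
                last_error = error
                self.reconnect()
        raise last_error

    def execute_with_retry_s(self, sql, *args):
        # One-off form for a single SQL statement; returns all result rows.
        def run(cursor):
            cursor.execute(sql, args)
            return cursor.fetchall()
        return self.execute_with_retry(run)
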
def initializer(backdir):
    self.backdir = backdir

    # Monkey-patch wikitools to always use our existing session
    opener = WikitoolsRequestsAdapter()
    APIRequest = wikitools.api.APIRequest
    class RequestsAPIRequest(wikitools.api.APIRequest):
        def __init__(self, *args, **kwds):
            APIRequest.__init__(self, *args, **kwds)
            self.opener = opener
    wikitools.APIRequest = RequestsAPIRequest
    wikitools.api.APIRequest = RequestsAPIRequest

    self.wiki = wikitools.wiki.Wiki(WIKIPEDIA_API_URL)
    self.wiki.setUserAgent(
        'citationhunt (https://tools.wmflabs.org/citationhunt)')
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.chdb = chdb.init_scratch_db()
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()

    # Undocumented :( https://stackoverflow.com/questions/24717468
    multiprocessing.util.Finalize(None, finalizer, exitpriority=16)

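# WikitoolsRequestsAdapter is defined elsewhere in the project. Wikitools'
# APIRequest normally builds its own urllib2 opener per request; the patch
# above swaps in one shared adapter instead. A very rough sketch of the
# shape such an adapter could take (all names and details here are
# assumptions, not the project's actual code): accept a urllib2.Request-like
# object and answer with a file-like response body, routing everything
# through a single requests.Session.
import io
import requests

class WikitoolsRequestsAdapter(object):
    def __init__(self):
        self.session = requests.Session()  # one session reused for all calls

    def open(self, request):
        # Mimic urllib2's opener interface: wikitools reads the response as
        # a file-like object.
        response = self.session.post(
            request.get_full_url(), data=request.get_data(),
            headers=dict(request.header_items()))
        return io.BytesIO(response.content)
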
def work(citation_detective, job):
    rows = []
    if not citation_detective:
        results = query_article_data(self.wiki, job, None)
    else:
        results = query_article_data(
            self.wiki, None, set([row[0] for row in job]))
    for pageid, revid, title, wikitext in results:
        url = WIKIPEDIA_WIKI_URL + title.replace(' ', '_')
        snippets_rows = []
        if not citation_detective:
            snippets = self.parser.extract(wikitext)
        else:
            sentences = [row[1] for row in job if row[0] == revid]
            snippets = self.parser.extract_from_sentences(
                wikitext, sentences)
        for sec, snips in snippets:
            sec = section_name_to_anchor(sec)
            for sni in snips:
                id = mkid(title + sni)
                row = (id, sni, sec, pageid)
                snippets_rows.append(row)
        if snippets_rows:
            article_row = (pageid, url, title)
            rows.append({'article': article_row, 'snippets': snippets_rows})

    # Open a short-lived connection to try to avoid the limit of 20 per user:
    # https://phabricator.wikimedia.org/T216170
    db = chdb.init_scratch_db()
    for r in rows:
        db.execute_with_retry(insert, r)

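# mkid is defined elsewhere in the project. A minimal sketch of what it
# plausibly does (an assumption, not the actual implementation): derive a
# short, stable snippet id by hashing title + snippet text, so the same
# snippet gets the same id across database rebuilds.
import hashlib

def mkid(s):
    return hashlib.sha1(s.encode('utf-8')).hexdigest()[:8]
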
def update_intersections():
    db = chdb.init_scratch_db()
    cfg = config.get_localized_config(api=False)
    db.execute_with_retry_s('DELETE FROM intersections')
    db.execute_with_retry_s('''
        INSERT INTO intersections SELECT * FROM %s
        WHERE expiration > NOW()''' % chdb.get_table_name(
            db, 'citationhunt', 'intersections'))
    db.execute_with_retry_s('DELETE FROM articles_intersections')
    db.execute_with_retry_s('''
        INSERT INTO articles_intersections SELECT * FROM %s
        WHERE article_id IN (SELECT page_id FROM articles)
        AND inter_id IN (SELECT id FROM intersections)''' % chdb.get_table_name(
            db, 'citationhunt', 'articles_intersections'))

    def update_snippets_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        intersection_ids = [row[0] for row in cursor]
        if intersection_ids:
            database.populate_snippets_links(
                cursor, intersection_ids=intersection_ids)
    db.execute_with_retry(update_snippets_links)

    # delete empty intersections. should this surface an error to the user
    # instead?
    db.execute_with_retry_s('''DELETE FROM intersections WHERE id NOT IN (
        SELECT inter_id FROM articles_intersections)''')

def sanity_check():
    sdb = chdb.init_scratch_db()
    snippet_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM snippets''')[0][0]
    assert snippet_count > 100
    article_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM articles''')[0][0]
    assert article_count > 100

def sanity_check():
    cfg = config.get_localized_config()
    sdb = chdb.init_scratch_db()
    snippet_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM snippets''')[0][0]
    assert snippet_count > cfg.min_snippets_sanity_check
    article_count = sdb.execute_with_retry_s(
        '''SELECT COUNT(*) FROM articles''')[0][0]
    assert article_count > cfg.min_articles_sanity_check

def work(pageids):
    rows = []
    results = query_pageids(self.wiki, pageids)
    for pageid, title, wikitext in results:
        url = WIKIPEDIA_WIKI_URL + title.replace(' ', '_')
        snippets_rows = []
        snippets = self.parser.extract(wikitext)
        for sec, snips in snippets:
            sec = section_name_to_anchor(sec)
            for sni in snips:
                id = mkid(title + sni)
                row = (id, sni, sec, pageid)
                snippets_rows.append(row)
        if snippets_rows:
            article_row = (pageid, url, title)
            rows.append({'article': article_row, 'snippets': snippets_rows})

    def insert(cursor, r):
        cursor.execute('''
            INSERT INTO articles VALUES(%s, %s, %s)''', r['article'])
        cursor.executemany('''
            INSERT IGNORE INTO snippets VALUES(%s, %s, %s, %s)''',
            r['snippets'])
        # We can't allow data to be truncated for HTML snippets, as that can
        # completely break the UI, so we detect truncation warnings and get rid
        # of the corresponding data.
        warnings = cursor.execute('SHOW WARNINGS')
        truncated_snippets = []
        for _, _, message in cursor.fetchall():
            m = DATA_TRUNCATED_WARNING_RE.match(message)
            if m is None:
                # Not a truncation, ignore (it's already logged)
                continue
            # MySQL warnings index rows starting at 1
            idx = int(m.groups()[0]) - 1
            truncated_snippets.append((r['snippets'][idx][0], ))
        if len(truncated_snippets) < len(r['snippets']):
            cursor.executemany('''
                DELETE FROM snippets WHERE id = %s''', truncated_snippets)
        else:
            # Every single snippet was truncated, remove the article itself
            cursor.execute('''DELETE FROM articles WHERE page_id = %s''',
                (r['article'][0], ))

    # Open a short-lived connection to try to avoid the limit of 20 per user:
    # https://phabricator.wikimedia.org/T216170
    db = chdb.init_scratch_db()
    for r in rows:
        db.execute_with_retry(insert, r)

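# DATA_TRUNCATED_WARNING_RE is not shown in these snippets. A plausible
# definition (an assumption, based on the text of MySQL's standard warning
# 1265, "Data truncated for column 'x' at row N") captures the 1-based row
# index that the loop above converts back to a list offset:
import re

DATA_TRUNCATED_WARNING_RE = re.compile(
    r"Data truncated for column '.*' at row (\d+)")
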
def assign_categories():
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()

    start = time.time()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    logger.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    # Load all usable categories and page ids
    for c in ichunk(unsourced_pageids, 10000):
        for c, p in wpdb.execute_with_retry(
                load_categories_for_pages, tuple(c)):
            if category_is_usable(cfg, c, hidden_categories):
                category_to_page_ids.setdefault(c, []).append(p)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    logger.info('finished with %d categories' % len(
        category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()

    logger.info('all done in %d seconds.' % (time.time() - start))
    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0

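# ichunk is a helper defined elsewhere. A minimal sketch of a compatible
# implementation (an assumption): lazily yield successive chunks of at most
# `size` items, so categories can be fetched for 10000 pages per query
# instead of one query per page.
import itertools

def ichunk(iterable, size):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, size))
        if not chunk:
            break
        yield chunk
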
def assign_categories():
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()

    start = time.time()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    # Load all usable categories and page ids
    for c in ichunk(unsourced_pageids, 10000):
        for c, p in wpdb.execute_with_retry(
                load_categories_for_pages, tuple(c)):
            if category_is_usable(cfg, c, hidden_categories):
                category_to_page_ids.setdefault(c, []).append(p)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' % len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()

    log.info('all done in %d seconds.' % (time.time() - start))
    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0

def initializer(backdir):
    self.backdir = backdir
    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.chdb = chdb.init_scratch_db()
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()

    # Undocumented :( https://stackoverflow.com/questions/24717468
    multiprocessing.util.Finalize(None, finalizer, exitpriority=16)

def work(pageids):
    rows = []
    results = query_pageids(self.wiki, pageids)
    for pageid, title, wikitext in results:
        url = WIKIPEDIA_WIKI_URL + title.replace(' ', '_')
        snippets_rows = []
        snippets = self.parser.extract(wikitext)
        for sec, snips in snippets:
            sec = section_name_to_anchor(sec)
            for sni in snips:
                id = mkid(title + sni)
                row = (id, sni, sec, pageid)
                snippets_rows.append(row)
        if snippets_rows:
            article_row = (pageid, url, title)
            rows.append({'article': article_row, 'snippets': snippets_rows})

    def insert(cursor, r):
        cursor.execute('''
            INSERT INTO articles VALUES(%s, %s, %s)''', r['article'])
        cursor.executemany('''
            INSERT IGNORE INTO snippets VALUES(%s, %s, %s, %s)''',
            r['snippets'])
        # We can't allow data to be truncated for HTML snippets, as that can
        # completely break the UI, so we detect truncation warnings and get rid
        # of the corresponding data.
        warnings = cursor.execute('SHOW WARNINGS')
        truncated_snippets = []
        for _, _, message in cursor.fetchall():
            m = DATA_TRUNCATED_WARNING_RE.match(message)
            if m is None:
                # Not a truncation, ignore (it's already logged)
                continue
            # MySQL warnings index rows starting at 1
            idx = int(m.groups()[0]) - 1
            truncated_snippets.append((r['snippets'][idx][0],))
        if len(truncated_snippets) < len(r['snippets']):
            cursor.executemany('''
                DELETE FROM snippets WHERE id = %s''', truncated_snippets)
        else:
            # Every single snippet was truncated, remove the article itself
            cursor.execute('''DELETE FROM articles WHERE page_id = %s''',
                (r['article'][0],))

    # Open a short-lived connection to try to avoid the limit of 20 per user:
    # https://phabricator.wikimedia.org/T216170
    db = chdb.init_scratch_db()
    for r in rows:
        db.execute_with_retry(insert, r)

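# section_name_to_anchor is defined elsewhere in the project. A hedged
# sketch of the idea (an assumption, not the actual implementation):
# MediaWiki turns a section heading into a URL fragment by replacing spaces
# with underscores (the real Sanitizer rules also dot-escape most other
# non-alphanumeric characters), so snippets must store the section in anchor
# form for "#section" links back to the article to work.
def section_name_to_anchor(section):
    return section.replace(' ', '_')
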
def assign_categories(max_categories, mysql_default_cnf):
    chdb = chdb_.init_scratch_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    wpdb = chdb_.init_wp_replica_db()
    wpcursor = wpdb.cursor()
    assert wpcursor.execute('SELECT * FROM page LIMIT 1;') == 1
    assert wpcursor.execute('SELECT * FROM categorylinks LIMIT 1;') == 1

    hidden_categories = load_hidden_categories(wpcursor)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    for n, pageid in enumerate(list(unsourced_pageids)):
        page_has_at_least_one_category = False
        for catname in load_categories_for_page(wpcursor, pageid):
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))
    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s, %s...)' % \
        (len(categories_to_ids), categories_to_ids.keys()[0],
        categories_to_ids.keys()[1]))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    update_citationhunt_db(chdb, categories)

    wpdb.close()
    chdb.close()
    return 0

def compute_fixed_snippets():
    start = time.time()

    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Find the set of snippets that were "clicked" (redirected to article)
    # between the creation dates of the previous/live and next/scratch
    # databases
    from_ts = live_db.execute_with_retry(load_table_creation_date, 'snippets')
    to_ts = scratch_db.execute_with_retry(load_table_creation_date, 'snippets')
    clicked = stats_db.execute_with_retry(
        load_snippet_clicks_between, cfg.lang_code, from_ts, to_ts)

    # Load the snippets from both databases
    scratch_snippets = scratch_db.execute_with_retry(load_snippets)
    live_snippets = live_db.execute_with_retry(load_snippets)

    # And for each snippet that disappeared across databases AND had been
    # clicked in the meantime, store its information in the stats database.
    gone = live_snippets.difference(scratch_snippets)
    for id, clicked_ts in clicked.iteritems():
        if id in gone:
            log.info(id)
            stats_db.execute_with_retry_s(
                'INSERT INTO fixed VALUES (%s, %s, %s)',
                clicked_ts, id, cfg.lang_code)

    log.info('all done in %d seconds.' % (time.time() - start))
    scratch_db.close()
    live_db.close()
    stats_db.close()
    return 0

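# load_snippets is a helper defined elsewhere. A minimal sketch of what the
# set difference above requires (an assumption): return the ids of all
# snippets as a set, so the live and scratch databases can be diffed with
# set operations.
def load_snippets(cursor):
    cursor.execute('SELECT id FROM snippets')
    return set(row[0] for row in cursor)
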
def assign_categories(mysql_default_cnf):
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()

    start = time.time()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load a list of (wikiproject, page ids), if applicable
    # FIXME: We load all category -> page id mappings for all projects, then
    # filter out the ones with no unsourced snippets. It's likely better to
    # just query the projects of the pages we know of instead.
    projectindex = load_projectindex(cfg)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(load_hidden_categories, cfg)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    # Load all usable categories into a dict category -> [page ids]
    category_to_page_ids = {}
    for c, p in projectindex:
        if p in unsourced_pageids:
            category_to_page_ids.setdefault(c, []).append(p)
    for c in ichunk(unsourced_pageids, 10000):
        for c, p in wpdb.execute_with_retry(load_categories_for_pages, c):
            if category_is_usable(cfg, c, hidden_categories):
                category_to_page_ids.setdefault(c, []).append(p)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' %
        len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()

    log.info('all done in %d seconds.' % (time.time() - start))
    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0

def assign_categories(max_categories, mysql_default_cnf):
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        log.info('loaded projects for %d talk pages (%s...)' % \
            (len(projectindex), projectindex.values()[0]))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = (wpdb.execute_with_retry(
            load_pinned_categories_for_page, projectindex, pageid)
            if projectindex else set())

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))
    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s, %s...)' % \
        (len(categories_to_ids), categories_to_ids.keys()[0],
        categories_to_ids.keys()[1]))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s, %s)' % \
            (len(pinned_categories_to_ids),
            pinned_categories_to_ids.keys()[0],
            pinned_categories_to_ids.keys()[1]))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())
    update_citationhunt_db(chdb, categories)

    wpdb.close()
    chdb.close()
    return 0

def assign_categories(max_categories, mysql_default_cnf):
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()
        projectindex = load_projectindex(tlcursor)
        log.info('loaded projects for %d pages (%s...)' % \
            (len(projectindex), projectindex.values()[0]))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories))))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = set(projectindex.get(pageid, []))

        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))
    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s, %s...)' % \
        (len(categories_to_ids), categories_to_ids.keys()[0],
        categories_to_ids.keys()[1]))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s, %s)' % \
            (len(pinned_categories_to_ids),
            pinned_categories_to_ids.keys()[0],
            pinned_categories_to_ids.keys()[1]))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())
    update_citationhunt_db(chdb, categories)

    wpdb.close()
    chdb.close()
    return 0

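# choose_categories is defined elsewhere in the project. One plausible shape
# for it -- purely an illustrative sketch under assumed semantics, not the
# actual algorithm -- is a greedy cover: repeatedly pick the category
# containing the most not-yet-covered pages until max_categories are chosen
# or every page is covered. The return type matches the callers above: a set
# of (category name, frozenset of page ids) tuples.
def choose_categories(categories_to_ids, unsourced_pageids, max_categories):
    chosen = set()
    uncovered = set(unsourced_pageids)
    candidates = dict(categories_to_ids)
    while uncovered and candidates and len(chosen) < max_categories:
        # Greedily take the category that covers the most remaining pages.
        catname, page_ids = max(
            candidates.items(), key=lambda kv: len(kv[1] & uncovered))
        if not page_ids & uncovered:
            break
        chosen.add((catname, frozenset(page_ids)))
        uncovered -= page_ids
        del candidates[catname]
    return chosen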