Ejemplo n.º 1
0
def print_unsourced_ids_from_wikipedia():
    """Print the id of every page found under the citation-needed category.

    Starts from ``cfg.citation_needed_category`` and walks ``categorylinks``
    level by level: rows whose ``cl_type`` is ``b'page'`` are printed, rows
    whose ``cl_type`` is ``b'subcat'`` are resolved to their titles through
    the ``page`` table and queried on the next pass.

    A page reachable through several categories is still printed once per
    category it is listed in; only re-expansion of an already expanded
    category is suppressed.
    """
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()
    categories = set([cfg.citation_needed_category])
    # Titles already expanded. Without this, two categories that list each
    # other as subcategories would keep the loop running forever.
    visited = set()
    # An empty set would render as "WHERE ()", which is invalid SQL, so stop
    # as soon as there is nothing left to expand.
    while categories:
        visited |= categories
        # Parameters are passed as a list: DB-API drivers document a
        # sequence (or mapping) here, and not all of them accept a set.
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            ' OR '.join(['cl_to = %s'] * len(categories)) + ')',
            list(categories))
        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == b'page':
                print(page_id)
            elif link_type == b'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        cursor.execute(
            'SELECT page_title FROM page WHERE (' +
            ' OR '.join(['page_id = %s'] * len(subcategories)) + ')',
            list(subcategories))
        categories = set(r[0] for r in cursor) - visited
Ejemplo n.º 2
0
def query_rev_users(lang_code, rev_ids):
    """Return the user names recorded for the given revision ids.

    Args:
        lang_code: passed to ``chdb.init_wp_replica_db`` to pick the replica.
        rev_ids: iterable of revision ids to look up.

    Returns:
        A list with the UTF-8 decoded ``rev_user_text`` of every matching
        ``revision_userindex`` row whose ``rev_user`` is not 0. The list is
        empty when ``rev_ids`` is empty.
    """
    rev_ids = tuple(rev_ids)
    if not rev_ids:
        # An empty tuple would render as "IN ()", which is a SQL syntax
        # error; there is nothing to look up anyway.
        return []
    wpdb = chdb.init_wp_replica_db(lang_code)
    with wpdb as cursor:
        cursor.execute(
            'SELECT rev_user_text FROM revision_userindex '
            'WHERE rev_user != 0 AND rev_id IN %s', (rev_ids,))
    return [row[0].decode('utf-8') for row in cursor.fetchall()]
def print_unsourced_ids_from_wikipedia():
    """Print the id of every page found under the citation-needed category.

    Starts from ``cfg.citation_needed_category`` and walks ``categorylinks``
    level by level: rows whose ``cl_type`` is ``'page'`` are printed, rows
    whose ``cl_type`` is ``'subcat'`` are resolved to their titles through
    the ``page`` table and queried on the next pass.

    A page reachable through several categories is still printed once per
    category it is listed in; only re-expansion of an already expanded
    category is suppressed.
    """
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db()
    cursor = db.cursor()
    categories = set([cfg.citation_needed_category])
    # Titles already expanded. Without this, two categories that list each
    # other as subcategories would keep the loop running forever.
    visited = set()
    # An empty set would render as "WHERE ()", which is invalid SQL, so stop
    # as soon as there is nothing left to expand.
    while categories:
        visited |= categories
        # Parameters are passed as a list: DB-API drivers document a
        # sequence (or mapping) here, and not all of them accept a set.
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            ' OR '.join(['cl_to = %s'] * len(categories)) + ')',
            list(categories))
        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == 'page':
                # Parenthesised single-argument form prints the same text
                # under the Python 2 print statement and Python 3 print().
                print(page_id)
            elif link_type == 'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        cursor.execute(
            'SELECT page_title FROM page WHERE (' +
            ' OR '.join(['page_id = %s'] * len(subcategories)) + ')',
            list(subcategories))
        categories = set(r[0] for r in cursor) - visited
Ejemplo n.º 4
0
def query_rev_users(lang_code, rev_ids):
    """Return the actor names recorded for the given revision ids.

    Args:
        lang_code: passed to ``chdb.init_wp_replica_db`` to pick the replica.
        rev_ids: iterable of revision ids to look up.

    Returns:
        A list with the UTF-8 decoded ``actor_name`` of every ``actor`` row
        joined to a matching ``revision_userindex`` row and having a
        non-NULL ``actor_user``. The list is empty when ``rev_ids`` is empty.
    """
    rev_ids = tuple(rev_ids)
    if not rev_ids:
        # An empty tuple would render as "IN ()", which is a SQL syntax
        # error; there is nothing to look up anyway.
        return []
    wpdb = chdb.init_wp_replica_db(lang_code)
    with wpdb.cursor() as cursor:
        cursor.execute(
            'SELECT actor_name FROM actor '
            'JOIN revision_userindex ON actor_id = rev_actor '
            'WHERE NOT ISNULL(actor_user) AND rev_id IN %s',
            (rev_ids, ))
        return [row[0].decode('utf-8') for row in cursor.fetchall()]
Ejemplo n.º 5
0
def assign_categories():
    """Group the unsourced pages by category and store the result.

    The {category -> [page ids]} mapping is seeded by ``load_projectindex``
    and extended with every category of the unsourced pages that
    ``category_is_usable`` accepts. Categories whose pages total fewer than
    two snippets are dropped; the rest is handed to
    ``update_citationhunt_db``. When ``cfg.profile`` is set the whole run is
    profiled with cProfile and the top entries are printed.

    NOTE: this block relies on Python 2 APIs (``dict.iteritems``,
    ``unicode``).

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    # next() gets a default so that an empty collection logs 'None...'
    # instead of aborting the run with StopIteration.
    logger.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    # Load all usable categories and page ids, 10000 page ids per query
    for pageid_chunk in ichunk(unsourced_pageids, 10000):
        rows = wpdb.execute_with_retry(
            load_categories_for_pages, tuple(pageid_chunk))
        for category, page_id in rows:
            if category_is_usable(cfg, category, hidden_categories):
                category_to_page_ids.setdefault(category, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    logger.info('finished with %d categories' % len(
        category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    logger.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
Ejemplo n.º 6
0
def assign_categories():
    """Group the unsourced pages by category and store the result.

    The {category -> [page ids]} mapping is seeded by ``load_projectindex``
    and extended with every category of the unsourced pages that
    ``category_is_usable`` accepts. Categories whose pages total fewer than
    two snippets are dropped; the rest is handed to
    ``update_citationhunt_db``. When ``cfg.profile`` is set the whole run is
    profiled with cProfile and the top entries are printed.

    NOTE: this block relies on Python 2 APIs (``dict.iteritems``,
    ``unicode``).

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    # next() gets a default so that an empty collection logs 'None...'
    # instead of aborting the run with StopIteration.
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    # Load all usable categories and page ids, 10000 page ids per query
    for pageid_chunk in ichunk(unsourced_pageids, 10000):
        rows = wpdb.execute_with_retry(
            load_categories_for_pages, tuple(pageid_chunk))
        for category, page_id in rows:
            if category_is_usable(cfg, category, hidden_categories):
                category_to_page_ids.setdefault(category, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' % len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    log.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
Ejemplo n.º 7
0
def print_unsourced_ids_from_wikipedia():
    """Print the id of every page that links to a citation-needed template.

    Template names come from ``cfg.citation_needed_templates`` with spaces
    replaced by underscores, and are matched against ``tl_title`` in
    ``templatelinks``. Prints nothing when no templates are configured.
    """
    cfg = config.get_localized_config()
    templates = [t.replace(' ', '_') for t in cfg.citation_needed_templates]
    if not templates:
        # With no templates the OR clause would be "()", which is invalid
        # SQL; there is nothing to look for anyway.
        return

    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    # The separator needs a space on both sides: 'OR ' alone glues the
    # keyword to the preceding placeholder ("tl_title = %sOR tl_title ...").
    or_clause = (
        '(' + ' OR '.join(['tl_title = %s'] * len(templates)) + ')'
    )
    # https://www.mediawiki.org/wiki/Help:Namespaces
    cursor.execute(
        'SELECT tl_from FROM templatelinks WHERE ' +
        'tl_from_namespace = 0 AND tl_namespace = 10 AND ' +
        or_clause, templates)
    for (page_id,) in cursor:
        print(page_id)
Ejemplo n.º 8
0
def assign_categories(max_categories, mysql_default_cnf):
    """Pick categories for the unsourced pages and store them.

    Resets the scratch tables, loads the categories of every unsourced page
    one page at a time, keeps those accepted by ``category_is_usable``, and
    passes the {category -> page ids} mapping to ``choose_categories``. Pages
    without any usable category are removed from the unsourced set first.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function body.

    Returns:
        0, always.
    """
    chdb = chdb_.init_scratch_db()
    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    wpdb = chdb_.init_wp_replica_db()
    wpcursor = wpdb.cursor()
    # Sanity check that both tables are readable and non-empty
    assert wpcursor.execute('SELECT * FROM page LIMIT 1;') == 1
    assert wpcursor.execute('SELECT * FROM categorylinks LIMIT 1;') == 1

    hidden_categories = load_hidden_categories(wpcursor)
    # next() gets a default so that an empty collection logs 'None...'
    # instead of aborting the run with StopIteration.
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: the set itself is shrunk inside the loop
    for n, pageid in enumerate(list(unsourced_pageids)):
        page_has_at_least_one_category = False
        for catname in load_categories_for_page(wpcursor, pageid):
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    # Log up to two sample names. Indexing keys()[0] and keys()[1] raised
    # IndexError whenever fewer than two categories were found.
    sample_categories = list(categories_to_ids)[:2]
    log.info('found %d usable categories (%s...)' % \
        (len(categories_to_ids),
         ', '.join('%s' % c for c in sample_categories)))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
#!/usr/bin/env python

import sys
sys.path.append('../')

import chdb
import config

# Print, one per line, the cl_from of every categorylinks row whose cl_to is
# the configured citation-needed category.
cfg = config.get_localized_config()
db = chdb.init_wp_replica_db()
cursor = db.cursor()
cursor.execute('SELECT cl_from FROM categorylinks WHERE cl_to = %s',
    (cfg.citation_needed_category,))
for row in cursor:
    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and Python 3 print().
    print(row[0])
Ejemplo n.º 10
0
def assign_categories(mysql_default_cnf):
    """Group the unsourced pages by category and store the result.

    Resets the scratch tables, then builds a {category -> [page ids]} dict
    from the (category, page id) pairs of ``load_projectindex`` that refer
    to unsourced pages, plus every category of the unsourced pages that
    ``category_is_usable`` accepts. Categories whose pages total fewer than
    two snippets are dropped; the rest is handed to
    ``update_citationhunt_db``. When ``cfg.profile`` is set the whole run is
    profiled with cProfile and the top entries are printed.

    NOTE: this block relies on Python 2 APIs (``dict.iteritems``,
    ``unicode``).

    Args:
        mysql_default_cnf: not used by this function body.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load a list of (wikiproject, page ids), if applicable
    # FIXME: We load all category -> page id mappings for all projects, then
    # filter out the ones with no unsourced snippets. It's likely better to just
    # query the projects of the pages we know of instead.
    projectindex = load_projectindex(cfg)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(load_hidden_categories, cfg)
    # next() gets a default so that an empty collection logs 'None...'
    # instead of aborting the run with StopIteration.
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    # Load all usable categories into a dict category -> [page ids]
    category_to_page_ids = {}
    for project, page_id in projectindex:
        if page_id in unsourced_pageids:
            category_to_page_ids.setdefault(project, []).append(page_id)
    # 10000 page ids per query
    for pageid_chunk in ichunk(unsourced_pageids, 10000):
        rows = wpdb.execute_with_retry(load_categories_for_pages, pageid_chunk)
        for category, page_id in rows:
            if category_is_usable(cfg, category, hidden_categories):
                category_to_page_ids.setdefault(category, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(
        count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' %
             len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    log.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
def assign_categories(max_categories, mysql_default_cnf):
    """Pick categories for the unsourced pages and store them.

    Resets the scratch tables, then for every unsourced page loads its
    regular categories and, when a project index is available, its pinned
    categories. Both kinds are filtered with ``category_is_usable``. Pages
    with no usable category of either kind are removed from the unsourced
    set. Regular categories go through ``choose_categories``; pinned ones
    are added to its result unconditionally.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function body.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()

        projectindex = load_projectindex(tlcursor)
        # Sample one value without indexing, so an empty index logs
        # 'None...' instead of raising IndexError.
        log.info('loaded projects for %d talk pages (%s...)' % \
            (len(projectindex), next(iter(projectindex.values()), None)))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    # Same precaution for an empty collection of hidden categories
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: the set itself is shrunk inside the loop
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = (wpdb.execute_with_retry(
            load_pinned_categories_for_page, projectindex, pageid)
            if projectindex else set())
        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    # Log up to two sample names. Indexing keys()[0] and keys()[1] raised
    # IndexError whenever fewer than two categories were present.
    sample_categories = list(categories_to_ids)[:2]
    log.info('found %d usable categories (%s...)' % \
        (len(categories_to_ids),
         ', '.join('%s' % c for c in sample_categories)))
    if pinned_categories_to_ids:
        # The truthiness check only guarantees one entry, not two
        sample_pinned = list(pinned_categories_to_ids)[:2]
        log.info('%d pinned categories (%s)' % \
            (len(pinned_categories_to_ids),
             ', '.join('%s' % c for c in sample_pinned)))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
Ejemplo n.º 12
0
def assign_categories(max_categories, mysql_default_cnf):
    """Pick categories for the unsourced pages and store them.

    Resets the scratch tables, then for every unsourced page loads its
    regular categories from the replica and looks up its pinned categories
    in the project index. Both kinds are filtered with
    ``category_is_usable``. Pages with no usable category of either kind are
    removed from the unsourced set. Regular categories go through
    ``choose_categories``; pinned ones are added to its result
    unconditionally.

    Args:
        max_categories: forwarded to ``choose_categories``.
        mysql_default_cnf: not used by this function body.

    Returns:
        0, always.
    """
    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()

        projectindex = load_projectindex(tlcursor)
        # Sample one value without indexing, so an empty index logs
        # 'None...' instead of raising IndexError.
        log.info('loaded projects for %d pages (%s...)' % \
            (len(projectindex), next(iter(projectindex.values()), None)))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    # Same precaution for an empty collection of hidden categories
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: the set itself is shrunk inside the loop
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = set(projectindex.get(pageid, []))
        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    # Log up to two sample names. Indexing keys()[0] and keys()[1] raised
    # IndexError whenever fewer than two categories were present.
    sample_categories = list(categories_to_ids)[:2]
    log.info('found %d usable categories (%s...)' % \
        (len(categories_to_ids),
         ', '.join('%s' % c for c in sample_categories)))
    if pinned_categories_to_ids:
        # The truthiness check only guarantees one entry, not two
        sample_pinned = list(pinned_categories_to_ids)[:2]
        log.info('%d pinned categories (%s)' % \
            (len(pinned_categories_to_ids),
             ', '.join('%s' % c for c in sample_pinned)))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
                                   max_categories)
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
Ejemplo n.º 13
0
#!/usr/bin/env python

import sys

sys.path.append('../')

import chdb
import config

cfg = config.get_localized_config()
db = chdb.init_wp_replica_db()
cursor = db.cursor()

categories = set([cfg.citation_needed_category])
while True:
    cursor.execute(
        'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
        ' OR '.join(['cl_to = %s'] * len(categories)) + ')', categories)
    subcategories = set()
    for page_id, type in cursor:
        if type == 'page':
            print page_id
        elif type == 'subcat':
            subcategories.add(page_id)
    if not subcategories:
        break

    # need to convert the page ids of subcategories into page
    # titles so we can query recursively
    cursor.execute(
        'SELECT page_title FROM page WHERE (' +