def load_hidden_categories(wpcursor):
    '''Resolve the members of the configured hidden category to names.

    Selects the `cl_from` page id of every `categorylinks` row whose
    `cl_to` equals `cfg.hidden_category`, then hands those ids to
    `category_ids_to_names` and returns its result.
    '''
    hidden_marker = config.get_localized_config().hidden_category
    query = '''
        SELECT cl_from FROM categorylinks WHERE
        cl_to = %s'''
    wpcursor.execute(query, (hidden_marker,))
    member_ids = [record[0] for record in wpcursor]
    return category_ids_to_names(wpcursor, member_ids)
Ejemplo n.º 2
0
    def wrapper(lang_code = '', *args, **kwds):
        '''Set up per-request language state, then call the wrapped handler.

        Redirects instead of calling the handler when no language code was
        given, when the code is not in config.LANG_CODES_TO_LANG_NAMES, or
        when no strings could be loaded for the language.
        '''
        raw_header = flask.request.headers.get('Accept-Language', '')
        accept_language = parse_accept_language_header(raw_header)

        lang_code = lang_code.lower()
        if not lang_code:
            # No language in the URL: pick one based on the request.
            fallback_code = find_default_lang_code_for_request(accept_language)
            return redirect_to_lang_code(fallback_code)

        flask.g._lang_code = lang_code
        if lang_code not in config.LANG_CODES_TO_LANG_NAMES:
            return redirect_to_lang_code('en')

        cfg = config.get_localized_config(lang_code, api = False)
        flask.g._cfg = cfg
        locale_overridden = (
            flask.current_app.debug and 'locale' in flask.request.args)
        if locale_overridden:
            # Debug-only: load the strings of an explicitly requested locale.
            flask.g._strings = chstrings.get_localized_strings(
                cfg, flask.request.args['locale'])
        else:
            lang_tag, strings = load_strings_for_request(
                lang_code, cfg, accept_language)
            flask.g._lang_tag = lang_tag
            flask.g._strings = strings

        if flask.g._strings:
            return handler(lang_code, *args, **kwds)
        # Shouldn't really happen, this means we have a misconfigured
        # language that has a config entry but no locales in the translation
        # files.
        return redirect_to_lang_code('en')
Ejemplo n.º 3
0
def update_intersections():
    '''Rebuild the scratch database's intersection tables.

    Copies the unexpired rows of the 'citationhunt' intersections table and
    the matching articles_intersections rows into the scratch database,
    repopulates snippet links for them, and finally drops intersections
    that ended up with no articles.
    '''
    db = chdb.init_scratch_db()
    # Not used below; the call is kept exactly as the original had it.
    cfg = config.get_localized_config(api=False)

    db.execute_with_retry_s('DELETE FROM intersections')
    live_intersections = chdb.get_table_name(
        db, 'citationhunt', 'intersections')
    copy_intersections = '''
        INSERT INTO intersections SELECT * FROM %s
        WHERE expiration > NOW()''' % live_intersections
    db.execute_with_retry_s(copy_intersections)

    db.execute_with_retry_s('DELETE FROM articles_intersections')
    live_links = chdb.get_table_name(
        db, 'citationhunt', 'articles_intersections')
    copy_links = '''
        INSERT INTO articles_intersections SELECT * FROM %s
        WHERE article_id IN (SELECT page_id FROM articles)
        AND inter_id IN (SELECT id FROM intersections)''' % live_links
    db.execute_with_retry_s(copy_links)

    def relink_snippets(cursor):
        cursor.execute('SELECT id FROM intersections')
        ids = [record[0] for record in cursor]
        if not ids:
            return
        database.populate_snippets_links(cursor, intersection_ids=ids)

    db.execute_with_retry(relink_snippets)

    # delete empty intersections. should this surface an error to the user
    # instead?
    prune_empty = '''DELETE FROM intersections WHERE id NOT IN (
            SELECT inter_id FROM articles_intersections)'''
    db.execute_with_retry_s(prune_empty)
Ejemplo n.º 4
0
def page_not_found(e):
    '''Render 404.html with status 404.

    Uses the config stored on flask.g when there is one, and the 'en'
    config otherwise.
    '''
    cfg = (flask.g._cfg if hasattr(flask.g, '_cfg')
           else config.get_localized_config('en'))
    body = flask.render_template('404.html', config = cfg)
    return body, 404
Ejemplo n.º 5
0
def init_scratch_db():
    '''Return a RetryingConnection for the 'scratch' database.

    The connection factory connects with `ch_my_cnf` and calls
    `_ensure_database` for 'scratch' and the configured language code.
    '''
    cfg = config.get_localized_config()

    def open_scratch():
        conn = _connect(ch_my_cnf)
        _ensure_database(conn, 'scratch', cfg.lang_code)
        return conn

    return RetryingConnection(open_scratch)
Ejemplo n.º 6
0
def init_scratch_db():
    '''Return a _RetryingConnection whose connections use 'scratch'.

    Each connection is opened with `_connect_to_ch_mysql` and switched to
    the 'scratch' database of the configured language via `_use`.
    '''
    cfg = config.get_localized_config()

    def open_scratch():
        conn = _connect_to_ch_mysql()
        _use(conn.cursor(), 'scratch', cfg.lang_code)
        return conn

    return _RetryingConnection(open_scratch)
Ejemplo n.º 7
0
def print_unsourced_ids_from_wikipedia():
    '''Print the ids of pages under the "citation needed" category tree.

    Starts from cfg.citation_needed_category and walks `categorylinks`
    level by level: rows of type b'page' are printed, rows of type
    b'subcat' are resolved to titles and queried in the next round.
    '''
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    categories = {cfg.citation_needed_category}
    # Titles already queried, so a category cycle cannot loop forever.
    visited = set(categories)
    while categories:
        # Parameters are passed as a tuple: DB-API drivers document
        # tuple/list/dict arguments, and some collapse a set into one value.
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            ' OR '.join(['cl_to = %s'] * len(categories)) + ')',
            tuple(categories))
        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == b'page':
                print(page_id)
            elif link_type == b'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        cursor.execute(
            'SELECT page_title FROM page WHERE (' +
            ' OR '.join(['page_id = %s'] * len(subcategories)) + ')',
            tuple(subcategories))
        # An empty result ends the loop instead of building "WHERE ()".
        categories = {row[0] for row in cursor} - visited
        visited |= categories
Ejemplo n.º 8
0
def install_scratch_db():
    '''Swap the tables of the 'scratch' database into 'citationhunt'.

    Asks MySQL to generate one RENAME TABLE statement covering every table
    in the scratch database, runs it, then drops the scratch database.
    '''
    cfg = config.get_localized_config()
    db = init_db(cfg.lang_code)
    # ensure citationhunt is populated with tables
    create_tables(db)

    live_name = _make_tools_labs_dbname(db, 'citationhunt', cfg.lang_code)
    scratch_name = _make_tools_labs_dbname(db, 'scratch', cfg.lang_code)
    with db as cursor:
        # generate a sql query that will atomically swap tables in
        # 'citationhunt' and 'scratch'. Modified from:
        # http://blog.shlomoid.com/2010/02/emulating-missing-rename-database.html
        cursor.execute('''SET group_concat_max_len = 2048;''')
        build_rename = '''
            SELECT CONCAT('RENAME TABLE ',
            GROUP_CONCAT('%s.', table_name,
            ' TO ', table_schema, '.old_', table_name, ', ',
            table_schema, '.', table_name, ' TO ', '%s.', table_name),';')
            FROM information_schema.TABLES WHERE table_schema = '%s'
            GROUP BY table_schema;
        ''' % (live_name, live_name, scratch_name)
        cursor.execute(build_rename)

        generated_row = cursor.fetchone()
        cursor.execute(generated_row[0])
        cursor.execute('DROP DATABASE ' + scratch_name)
Ejemplo n.º 9
0
def update_intersections():
    '''Refresh the intersection tables of the scratch database.

    Unexpired intersections and their article links are copied over from
    the 'citationhunt' tables, snippet links are repopulated for them, and
    intersections left without any article are deleted.
    '''
    db = chdb.init_scratch_db()
    # Not used below; the call is kept exactly as the original had it.
    cfg = config.get_localized_config()

    db.execute_with_retry_s('DELETE FROM intersections')
    source_intersections = chdb.get_table_name(
        db, 'citationhunt', 'intersections')
    db.execute_with_retry_s('''
        INSERT INTO intersections SELECT * FROM %s
        WHERE expiration > NOW()''' % source_intersections)

    db.execute_with_retry_s('DELETE FROM articles_intersections')
    source_links = chdb.get_table_name(
        db, 'citationhunt', 'articles_intersections')
    db.execute_with_retry_s('''
        INSERT INTO articles_intersections SELECT * FROM %s
        WHERE article_id IN (SELECT page_id FROM articles)
        AND inter_id IN (SELECT id FROM intersections)''' % source_links)

    def repopulate_links(cursor):
        cursor.execute('SELECT id FROM intersections')
        known_ids = [record[0] for record in cursor]
        if not known_ids:
            return
        database.populate_snippets_links(
            cursor, intersection_ids = known_ids)

    db.execute_with_retry(repopulate_links)

    # delete empty intersections. should this surface an error to the user
    # instead?
    db.execute_with_retry_s(
        '''DELETE FROM intersections WHERE id NOT IN (
            SELECT inter_id FROM articles_intersections)''')
Ejemplo n.º 10
0
def init_scratch_db():
    '''Build a RetryingConnection bound to the 'scratch' database.

    Every (re)connection goes through `_connect(ch_my_cnf)` followed by
    `_ensure_database` for 'scratch' and the configured language code.
    '''
    cfg = config.get_localized_config()

    def make_connection():
        connection = _connect(ch_my_cnf)
        _ensure_database(connection, 'scratch', cfg.lang_code)
        return connection

    return RetryingConnection(make_connection)
Ejemplo n.º 11
0
def citation_hunt(lang_code):
    '''Serve a snippet page, or redirect to a URL that can be served.

    Reads the `id` and `cat` query arguments. Redirects when the category
    is unknown, when the snippet is not part of the category, or when no
    snippet id was given (a random one is chosen). Aborts with 404 when
    the snippet id is unknown. Otherwise renders index.html.
    '''
    snippet_id = flask.request.args.get('id')
    cat_id = flask.request.args.get('cat')
    cfg = config.get_localized_config(lang_code)

    lang_dir = cfg.lang_dir
    if flask.current_app.debug:
        lang_dir = flask.request.args.get('dir', lang_dir)

    if cat_id is None:
        cat = CATEGORY_ALL
    else:
        cat = get_category_by_id(lang_code, cat_id)
        if cat is None:
            # invalid category, normalize to "all" and try again by id
            retry_url = flask.url_for('citation_hunt',
                                      lang_code=lang_code,
                                      id=snippet_id,
                                      cat=CATEGORY_ALL.id)
            return flask.redirect(retry_url)

    if snippet_id is None:
        # No snippet requested: send the client to a random one.
        random_id = select_random_id(lang_code, cat)
        return flask.redirect(
            flask.url_for('citation_hunt',
                          id=random_id, cat=cat.id, lang_code=lang_code))

    sinfo = Database.query_snippet_by_id(lang_code, snippet_id)
    if sinfo is None:
        # invalid id
        flask.request.cfg = cfg
        flask.abort(404)
    snippet, section, aurl, atitle = sinfo

    next_snippet_id = select_next_id(lang_code, snippet_id, cat)
    if next_snippet_id is None:
        # the snippet doesn't belong to the category!
        assert cat is not CATEGORY_ALL
        all_url = flask.url_for('citation_hunt',
                                id=snippet_id,
                                cat=CATEGORY_ALL.id,
                                lang_code=lang_code)
        return flask.redirect(all_url)

    autofocus = should_autofocus_category_filter(cat, flask.request)
    relative_path = urlparse.urlparse(aurl).path.lstrip('/')
    article_url_path = urllib.quote(e(relative_path))
    return flask.render_template(
        'index.html',
        snippet=snippet,
        section=section,
        article_url=aurl,
        article_url_path=article_url_path,
        article_title=atitle,
        current_category=cat,
        next_snippet_id=next_snippet_id,
        cn_marker=CITATION_NEEDED_MARKER,
        cn_html=CITATION_NEEDED_MARKUP,
        ref_marker=REF_MARKER,
        ref_html=SUPERSCRIPT_MARKUP,
        config=cfg,
        lang_dir=lang_dir,
        category_filter_autofocus=autofocus)
def print_unsourced_ids_from_wikipedia():
    '''Print the ids of pages under the "citation needed" category tree.

    Starts from cfg.citation_needed_category and walks `categorylinks`
    level by level: rows of type 'page' are printed, rows of type 'subcat'
    are resolved to titles and queried in the next round.
    '''
    cfg = config.get_localized_config()
    db = chdb.init_wp_replica_db()
    cursor = db.cursor()

    categories = set([cfg.citation_needed_category])
    # Titles already queried, so a category cycle cannot loop forever.
    visited = set(categories)
    while categories:
        # Parameters are passed as a tuple: DB-API drivers document
        # tuple/list/dict arguments, and some collapse a set into one value.
        cursor.execute(
            'SELECT cl_from, cl_type FROM categorylinks WHERE (' +
            ' OR '.join(['cl_to = %s'] * len(categories)) + ')',
            tuple(categories))
        subcategories = set()
        for page_id, link_type in cursor:
            if link_type == 'page':
                # Single-argument call form: same output on Python 2 and 3.
                print(page_id)
            elif link_type == 'subcat':
                subcategories.add(page_id)
        if not subcategories:
            break

        # need to convert the page ids of subcategories into page
        # titles so we can query recursively
        cursor.execute(
            'SELECT page_title FROM page WHERE (' +
            ' OR '.join(['page_id = %s'] * len(subcategories)) + ')',
            tuple(subcategories))
        # An empty result ends the loop instead of building "WHERE ()".
        categories = set([row[0] for row in cursor]) - visited
        visited |= categories
Ejemplo n.º 13
0
def init_wp_replica_db():
    '''Return a RetryingConnection to the configured replica database.

    Each connection is opened with `_connect(wp_my_cnf)` and then issues
    `USE <cfg.database>`.
    '''
    cfg = config.get_localized_config()

    def open_replica():
        conn = _connect(wp_my_cnf)
        with conn as cursor:
            cursor.execute('USE ' + cfg.database)
        return conn

    return RetryingConnection(open_replica)
Ejemplo n.º 14
0
def load_hidden_categories(wpcursor):
    '''Return the names of the pages linked to cfg.hidden_category.

    Collects `cl_from` for every `categorylinks` row pointing at the
    configured hidden category and converts those ids with
    `category_ids_to_names`.
    '''
    cfg = config.get_localized_config()
    lookup = '''
        SELECT cl_from FROM categorylinks WHERE
        cl_to = %s'''
    params = (cfg.hidden_category, )
    wpcursor.execute(lookup, params)
    ids_of_hidden = [record[0] for record in wpcursor]
    return category_ids_to_names(wpcursor, ids_of_hidden)
Ejemplo n.º 15
0
def init_wp_replica_db():
    '''Build a RetryingConnection that selects cfg.database on connect.

    The factory connects with `wp_my_cnf` and runs a `USE` statement for
    the database named in the localized config.
    '''
    cfg = config.get_localized_config()

    def make_connection():
        connection = _connect(wp_my_cnf)
        with connection as cursor:
            cursor.execute('USE ' + cfg.database)
        return connection

    return RetryingConnection(make_connection)
Ejemplo n.º 16
0
def init_wp_replica_db(lang_code):
    '''Return a _RetryingConnection to the replica database of `lang_code`.

    Each connection is opened with `_connect_to_wp_mysql(cfg)` and then
    issues `USE <cfg.database>`.
    '''
    cfg = config.get_localized_config(lang_code)

    def open_replica():
        conn = _connect_to_wp_mysql(cfg)
        with conn as cursor:
            cursor.execute('USE ' + cfg.database)
        return conn

    return _RetryingConnection(open_replica)
Ejemplo n.º 17
0
def init_scratch_db():
    '''Build a _RetryingConnection bound to the 'scratch' database.

    The factory opens a connection with `_connect_to_ch_mysql` and calls
    `_use` with a fresh cursor, 'scratch' and the configured language code.
    '''
    cfg = config.get_localized_config()

    def make_connection():
        connection = _connect_to_ch_mysql()
        _use(connection.cursor(), 'scratch', cfg.lang_code)
        return connection

    return _RetryingConnection(make_connection)
Ejemplo n.º 18
0
def category_is_usable(catname, hidden_categories):
    '''Return False for hidden or blacklisted categories, True otherwise.

    A category is rejected when it is in `hidden_categories` or when any
    regexp in cfg.category_name_regexps_blacklist matches it.
    '''
    assert isinstance(catname, CategoryName)
    if catname in hidden_categories:
        return False
    blacklist = config.get_localized_config().category_name_regexps_blacklist
    return not any(re.search(pattern, catname) for pattern in blacklist)
def category_is_usable(catname, hidden_categories):
    '''Tell whether `catname` is neither hidden nor blacklisted.

    Returns False if `catname` is in `hidden_categories` or matches one of
    the regexps in cfg.category_name_regexps_blacklist; True otherwise.
    '''
    assert isinstance(catname, CategoryName)
    is_hidden = catname in hidden_categories
    if is_hidden:
        return False
    cfg = config.get_localized_config()
    is_blacklisted = any(
        re.search(expr, catname)
        for expr in cfg.category_name_regexps_blacklist)
    return not is_blacklisted
Ejemplo n.º 20
0
def init_wp_replica_db(lang_code):
    '''Build a _RetryingConnection for the replica database of `lang_code`.

    The factory connects with `_connect_to_wp_mysql(cfg)` and runs a `USE`
    statement for cfg.database.
    '''
    cfg = config.get_localized_config(lang_code)

    def make_connection():
        connection = _connect_to_wp_mysql(cfg)
        with connection as cursor:
            cursor.execute('USE ' + cfg.database)
        return connection

    return _RetryingConnection(make_connection)
Ejemplo n.º 21
0
def sanity_check():
    '''Assert that the scratch database holds enough snippets and articles.

    The row counts of `snippets` and `articles` must exceed
    cfg.min_snippets_sanity_check and cfg.min_articles_sanity_check.
    '''
    cfg = config.get_localized_config()
    sdb = chdb.init_scratch_db()

    def count_rows(query):
        # First column of the first row of the COUNT(*) result.
        return sdb.execute_with_retry_s(query)[0][0]

    snippet_count = count_rows('''SELECT COUNT(*) FROM snippets''')
    assert snippet_count > cfg.min_snippets_sanity_check

    article_count = count_rows('''SELECT COUNT(*) FROM articles''')
    assert article_count > cfg.min_articles_sanity_check
def print_pageids_from_wikipedia():
    '''Print a random sample of page ids from the replica `page` table.

    Only rows with page_namespace = 0 and page_is_redirect = 0 are
    considered; each is kept when RAND() falls below
    cfg.articles_sampling_fraction.
    '''
    cfg = config.get_localized_config()
    db = cddb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    sampling_clause = ' AND RAND() < %s' % cfg.articles_sampling_fraction
    query = ''.join([
        'SELECT page_id FROM page where page_namespace = 0',
        ' AND page_is_redirect = 0',
        sampling_clause,
    ])
    cursor.execute(query)
    for record in cursor:
        print(record[0])
Ejemplo n.º 23
0
def init_cd_db():
    '''Return a _RetryingConnection using s54245__citationdetective_p.

    Each connection is opened with `_connect_to_ch_mysql` and then issues
    a `USE` statement for that fixed database name.
    '''
    # Not used below; the call is kept exactly as the original had it.
    cfg = config.get_localized_config()

    def open_detective_db():
        conn = _connect_to_ch_mysql()
        database_name = 's54245__citationdetective_p'
        with conn.cursor() as cursor:
            cursor.execute('USE ' + database_name)
        return conn

    return _RetryingConnection(open_detective_db)
Ejemplo n.º 24
0
def citation_hunt(lang_code):
    '''Render the page for one snippet, redirecting when needed.

    The `id` and `cat` query arguments select the snippet and category.
    An unknown category, a snippet outside the category, or a missing
    snippet id each produce a redirect; an unknown snippet id aborts with
    404. Otherwise index.html is rendered.
    '''
    requested_id = flask.request.args.get('id')
    requested_cat = flask.request.args.get('cat')
    cfg = config.get_localized_config(lang_code)

    lang_dir = cfg.lang_dir
    if flask.current_app.debug:
        lang_dir = flask.request.args.get('dir', lang_dir)

    cat = CATEGORY_ALL
    if requested_cat is not None:
        cat = get_category_by_id(lang_code, requested_cat)
        if cat is None:
            # invalid category, normalize to "all" and try again by id
            return flask.redirect(
                flask.url_for('citation_hunt',
                    lang_code = lang_code, id = requested_id,
                    cat = CATEGORY_ALL.id))

    if requested_id is None:
        # Nothing specific requested: redirect to a random snippet.
        chosen_id = select_random_id(lang_code, cat)
        return flask.redirect(
            flask.url_for('citation_hunt',
                id = chosen_id, cat = cat.id, lang_code = lang_code))

    sinfo = Database.query_snippet_by_id(lang_code, requested_id)
    if sinfo is None:
        # invalid id
        flask.request.cfg = cfg
        flask.abort(404)
    snippet, section, aurl, atitle = sinfo

    next_snippet_id = select_next_id(lang_code, requested_id, cat)
    if next_snippet_id is None:
        # the snippet doesn't belong to the category!
        assert cat is not CATEGORY_ALL
        return flask.redirect(
            flask.url_for('citation_hunt',
                id = requested_id, cat = CATEGORY_ALL.id,
                lang_code = lang_code))

    autofocus = should_autofocus_category_filter(cat, flask.request)
    path_without_slash = urlparse.urlparse(aurl).path.lstrip('/')
    article_url_path = urllib.quote(e(path_without_slash))
    template_values = dict(
        snippet = snippet, section = section, article_url = aurl,
        article_url_path = article_url_path,
        article_title = atitle, current_category = cat,
        next_snippet_id = next_snippet_id,
        cn_marker = CITATION_NEEDED_MARKER,
        cn_html = CITATION_NEEDED_MARKUP,
        ref_marker = REF_MARKER,
        ref_html = SUPERSCRIPT_MARKUP,
        config = cfg,
        lang_dir = lang_dir,
        category_filter_autofocus = autofocus)
    return flask.render_template('index.html', **template_values)
Ejemplo n.º 25
0
def assign_categories():
    '''Assign unsourced pages to usable categories and store the result.

    Loads the unsourced page ids, collects the usable categories of each
    page (on top of whatever `load_projectindex` returns), keeps the
    categories with at least two snippets, and passes them to
    `update_citationhunt_db`. Profiles the run when cfg.profile is set.
    Written for Python 2 (`iteritems`, `unicode`). Returns 0.
    '''
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    # The default keeps this log line from raising StopIteration when
    # there are no hidden categories at all.
    logger.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    # Load all usable categories and page ids
    for pageid_chunk in ichunk(unsourced_pageids, 10000):
        for catname, page_id in wpdb.execute_with_retry(
            load_categories_for_pages, tuple(pageid_chunk)):
            if category_is_usable(cfg, catname, hidden_categories):
                category_to_page_ids.setdefault(catname, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    logger.info('finished with %d categories' % len(
        category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    logger.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
Ejemplo n.º 26
0
    def is_citation_needed(self, template):
        '''Override to decide which templates count as Citation needed.

        By default a template qualifies when its name matches any entry of
        config.citation_needed_templates.
        '''

        known_templates = config.get_localized_config().citation_needed_templates
        for candidate in known_templates:
            if template.name.matches(candidate):
                return True
        return False
Ejemplo n.º 27
0
def reset_scratch_db():
    '''Drop and recreate the 'scratch' database, then create its tables.

    Returns the connection obtained from `init_db`, left on the freshly
    created database.
    '''
    cfg = config.get_localized_config()
    db = init_db(cfg.lang_code)
    with db as cursor:
        scratch_name = _make_tools_labs_dbname(db, 'scratch', cfg.lang_code)
        # Only the DROP runs with warnings suppressed.
        with ignore_warnings():
            cursor.execute('DROP DATABASE IF EXISTS %s' % scratch_name)
        cursor.execute(
            'CREATE DATABASE ' + scratch_name + ' CHARACTER SET utf8mb4')
        cursor.execute('USE %s' % scratch_name)
    create_tables(db)
    return db
Ejemplo n.º 28
0
    def is_citation_needed(self, template):
        '''Override to change what is treated as a Citation needed template.

        The default checks the template's name against each entry of
        config.citation_needed_templates and accepts the first match.
        '''

        cfg = config.get_localized_config()
        for tpl_name in cfg.citation_needed_templates:
            if template.name.matches(tpl_name):
                return True
        return False
Ejemplo n.º 29
0
def reset_scratch_db():
    '''Recreate an empty 'scratch' database and populate its tables.

    The existing database is dropped if present, a new one is created with
    the utf8mb4 character set and selected, and `create_tables` is run on
    the connection, which is then returned.
    '''
    cfg = config.get_localized_config()
    db = init_db(cfg.lang_code)
    with db as cursor:
        target = _make_tools_labs_dbname(db, 'scratch', cfg.lang_code)
        drop_stmt = 'DROP DATABASE IF EXISTS ' + target
        # Only the DROP runs with warnings suppressed.
        with ignore_warnings():
            cursor.execute(drop_stmt)
        create_stmt = 'CREATE DATABASE %s CHARACTER SET utf8mb4' % target
        cursor.execute(create_stmt)
        cursor.execute('USE ' + target)
    create_tables(db)
    return db
Ejemplo n.º 30
0
def assign_categories():
    '''Assign unsourced pages to usable categories and store the result.

    Loads the unsourced page ids, collects the usable categories of each
    page (on top of whatever `load_projectindex` returns), keeps the
    categories with at least two snippets, and passes them to
    `update_citationhunt_db`. Profiles the run when cfg.profile is set.
    Written for Python 2 (`iteritems`, `unicode`). Returns 0.
    '''
    cfg = config.get_localized_config()
    profiler = cProfile.Profile()
    if cfg.profile:
        profiler.enable()
    start = time.time()

    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db(cfg.lang_code)

    unsourced_pageids = load_unsourced_pageids(chdb)

    # Load an initial {wikiproject -> [page ids]} dict, if applicable
    category_to_page_ids = load_projectindex(cfg, chdb)

    # Load a set() of hidden categories
    hidden_categories = wpdb.execute_with_retry(
        load_hidden_categories, cfg)
    # The default keeps this log line from raising StopIteration when
    # there are no hidden categories at all.
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    # Load all usable categories and page ids
    for pageid_chunk in ichunk(unsourced_pageids, 10000):
        for catname, page_id in wpdb.execute_with_retry(
            load_categories_for_pages, tuple(pageid_chunk)):
            if category_is_usable(cfg, catname, hidden_categories):
                category_to_page_ids.setdefault(catname, []).append(page_id)

    # Now find out how many snippets each category has
    category_to_snippet_count = {}
    page_id_to_snippet_count = chdb.execute_with_retry(count_snippets_for_pages)
    for category, page_ids in category_to_page_ids.iteritems():
        category_to_snippet_count[category] = sum(
            page_id_to_snippet_count.get(p, 0) for p in page_ids)

    # And keep only the ones with at least two.
    category_name_id_and_page_ids = [
        (unicode(category), category_name_to_id(category), page_ids)
        for category, page_ids in category_to_page_ids.iteritems()
        if category_to_snippet_count[category] >= 2
    ]
    log.info('finished with %d categories' % len(category_name_id_and_page_ids))

    update_citationhunt_db(chdb, category_name_id_and_page_ids)
    wpdb.close()
    chdb.close()
    log.info('all done in %d seconds.' % (time.time() - start))

    if cfg.profile:
        profiler.disable()
        pstats.Stats(profiler).sort_stats('cumulative').print_stats(
            30, 'assign_categories.py')
    return 0
Ejemplo n.º 31
0
 def wrapper(lang_code='', *args, **kwds):
     '''Call the handler for a known language, else redirect to 'en'.

     The redirect keeps the request's query arguments and, unless the
     request path is '/', appends that path to the Location header.
     '''
     flask.g._lang_code = lang_code
     if lang_code in config.LANG_CODES_TO_LANG_NAMES:
         flask.g._cfg = config.get_localized_config(lang_code)
         return handler(lang_code, *args, **kwds)

     english_url = flask.url_for('citation_hunt',
                                 lang_code='en',
                                 **flask.request.args)
     response = flask.redirect(english_url)
     requested_path = flask.request.path
     if requested_path != '/':
         response.headers['Location'] += requested_path
     return response
Ejemplo n.º 32
0
    def strip_wikilink(self, wikilink, normalize, collapse):
        '''Override to control how wikilinks are stripped in the wikicode.

        Whatever is returned replaces the link. By default the link is
        removed entirely (replaced by '') when its title starts with any
        prefix in config.wikilink_prefix_blacklist; every other link is
        handed to delegate_strip.
        '''

        blacklist = config.get_localized_config().wikilink_prefix_blacklist
        is_blacklisted = any(
            wikilink.title.startswith(prefix) for prefix in blacklist)
        if is_blacklisted:
            return ''
        return self.delegate_strip(wikilink, normalize, collapse)
Ejemplo n.º 33
0
    def strip_wikilink(self, wikilink, normalize, collapse):
        '''Override to customize the replacement text of a wikilink.

        The return value stands in for the link. The default drops links
        whose title begins with an entry of
        config.wikilink_prefix_blacklist (returning '') and passes all
        other links on to delegate_strip.
        '''

        cfg = config.get_localized_config()
        dropped = any(
            wikilink.title.startswith(banned)
            for banned in cfg.wikilink_prefix_blacklist)
        return '' if dropped else self.delegate_strip(
            wikilink, normalize, collapse)
Ejemplo n.º 34
0
def page_not_found(e):
    '''Render 404.html with status 404.

    Uses the config, strings and language tag stored on flask.g when they
    are present, and English defaults otherwise.
    '''
    if hasattr(flask.g, '_cfg'):
        cfg = flask.g._cfg
    else:
        cfg = config.get_localized_config('en')
    if hasattr(flask.g, '_strings'):
        strings = flask.g._strings
        # _strings can be set without _lang_tag (the debug `locale`
        # override does so); fall back rather than raise AttributeError
        # while handling a 404.
        lang_tag = getattr(flask.g, '_lang_tag', 'en')
    else:
        lang_tag = 'en'
        strings = chstrings.get_localized_strings(cfg, 'en')
    return flask.render_template(
        '404.html', config = cfg,
        lang_tag = lang_tag,
        lang_dir = cfg.lang_dir, strings = strings), 404
Ejemplo n.º 35
0
def page_not_found(e):
    '''Render 404.html with status 404.

    Uses the config, strings and language tag stored on flask.g when they
    are present, and English defaults otherwise.
    '''
    if hasattr(flask.g, '_cfg'):
        cfg = flask.g._cfg
    else:
        cfg = config.get_localized_config('en')
    if hasattr(flask.g, '_strings'):
        strings = flask.g._strings
        # _strings can be set without _lang_tag (the debug `locale`
        # override does so); fall back rather than raise AttributeError
        # while handling a 404.
        lang_tag = getattr(flask.g, '_lang_tag', 'en')
    else:
        lang_tag = 'en'
        strings = chstrings.get_localized_strings(cfg, 'en')
    return flask.render_template('404.html',
                                 config=cfg,
                                 lang_tag=lang_tag,
                                 lang_dir=cfg.lang_dir,
                                 strings=strings), 404
Ejemplo n.º 36
0
def print_unsourced_ids_from_wikipedia():
    '''Print the id of every page that uses a "citation needed" template.

    Template names come from cfg.citation_needed_templates with spaces
    replaced by underscores, and are matched against `tl_title` in the
    `templatelinks` table of the replica database.
    '''
    cfg = config.get_localized_config()
    templates = [t.replace(' ', '_') for t in cfg.citation_needed_templates]

    db = chdb.init_wp_replica_db(cfg.lang_code)
    cursor = db.cursor()

    # The separator carries spaces on both sides so a placeholder is never
    # glued to the OR keyword ("%sOR").
    or_clause = (
        '(' + ' OR '.join(['tl_title = %s'] * len(templates)) + ')'
    )
    # https://www.mediawiki.org/wiki/Help:Namespaces
    cursor.execute(
        'SELECT tl_from FROM templatelinks WHERE ' +
        'tl_from_namespace = 0 AND tl_namespace = 10 AND ' +
        or_clause, templates)
    for (page_id,) in cursor:
        print(page_id)
Ejemplo n.º 37
0
def create_tables(db):
    '''Create the tables used by this module if they do not exist yet.

    Runs every statement on a cursor obtained from `db`, inside
    `ignore_warnings()`. Referenced tables are created before the tables
    whose foreign keys point at them, so the statement order matters.
    '''
    cfg = config.get_localized_config()
    with db as cursor, ignore_warnings():
        # categories: id -> title.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS categories (id VARCHAR(128) PRIMARY KEY,
            title VARCHAR(255)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        # Seed the "unassigned" category; INSERT IGNORE keeps reruns quiet.
        cursor.execute('''
            INSERT IGNORE INTO categories VALUES("unassigned", "unassigned")
        ''')
        # articles: one row per page, keyed by page_id.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS articles (page_id INT(8) UNSIGNED
            PRIMARY KEY, url VARCHAR(512), title VARCHAR(512))
            ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        # articles_categories: article <-> category pairs; rows go away
        # when either side is deleted.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS articles_categories (
            article_id INT(8) UNSIGNED, category_id VARCHAR(128),
            FOREIGN KEY(article_id) REFERENCES articles(page_id)
            ON DELETE CASCADE,
            FOREIGN KEY(category_id) REFERENCES categories(id)
            ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        # category_article_count: an article count per category.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS category_article_count (
            category_id VARCHAR(128), article_count INT(8) UNSIGNED,
            FOREIGN KEY(category_id) REFERENCES categories(id)
            ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        # snippets: the snippet column is sized at twice
        # cfg.snippet_max_size, passed in as a query parameter.
        cursor.execute(
            '''
            CREATE TABLE IF NOT EXISTS snippets (id VARCHAR(128) PRIMARY KEY,
            snippet VARCHAR(%s), section VARCHAR(768), article_id INT(8)
            UNSIGNED, FOREIGN KEY(article_id) REFERENCES articles(page_id)
            ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''', (cfg.snippet_max_size * 2, ))
        # snippets_links: prev/next snippet ids for a given category id.
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS snippets_links (prev VARCHAR(128),
            next VARCHAR(128), cat_id VARCHAR(128),
            FOREIGN KEY(prev) REFERENCES snippets(id) ON DELETE CASCADE,
            FOREIGN KEY(next) REFERENCES snippets(id) ON DELETE CASCADE,
            FOREIGN KEY(cat_id) REFERENCES categories(id) ON DELETE CASCADE)
            ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
Ejemplo n.º 38
0
    def __init__(self):
        # Route mwparserfromhell's __strip__ for these node classes to our
        # own methods, remembering the originals per class.
        strip_overrides = {
            mwparserfromhell.nodes.Template: self.strip_template,
            mwparserfromhell.nodes.Tag: self.strip_tag,
            mwparserfromhell.nodes.Wikilink: self.strip_wikilink,
            mwparserfromhell.nodes.Heading: self.strip_heading,
        }

        def dispatch_strip(node, *args):
            # Installed on the node classes, so `node` is the wikicode
            # node being stripped; look up our override by its exact type.
            return strip_overrides[type(node)](node, *args)

        self._original_strip_methods = {}
        for klass in strip_overrides:
            self._original_strip_methods[klass] = klass.__strip__
            klass.__strip__ = dispatch_strip

        self.cfg = config.get_localized_config()
Ejemplo n.º 39
0
def initialize_all_databases():
    '''Create the 'citationdetective' and 'scratch' databases and tables.

    Any existing 'scratch' database is dropped first. The tables are then
    created in 'scratch' and in 'citationdetective', in that order.
    '''
    def create_if_missing(cursor, database, lang_code):
        full_name = _make_tools_labs_dbname(cursor, database, lang_code)
        cursor.execute('SET SESSION sql_mode = ""')
        cursor.execute(
            'CREATE DATABASE IF NOT EXISTS %s CHARACTER SET utf8mb4'
            % full_name)

    cfg = config.get_localized_config()
    db = _RetryingConnection(_connect_to_cd_mysql)
    with db.cursor() as cursor, ignore_warnings():
        stale_scratch = _make_tools_labs_dbname(
            cursor, 'scratch', cfg.lang_code)
        cursor.execute('DROP DATABASE IF EXISTS ' + stale_scratch)
        for database in ('citationdetective', 'scratch'):
            create_if_missing(cursor, database, cfg.lang_code)
        for database in ('scratch', 'citationdetective'):
            _use(cursor, database, cfg.lang_code)
            _create_citationdetective_tables(cfg, cursor)
Ejemplo n.º 40
0
def install_scratch_db():
    '''Move the tables of the 'scratch' database into 'citationhunt'.

    Has MySQL generate a single RENAME TABLE statement for every table in
    the scratch database, executes it, and drops the scratch database.
    '''
    cfg = config.get_localized_config()
    with init_db(cfg.lang_code) as cursor:
        live_name = _make_tools_labs_dbname(
            cursor, 'citationhunt', cfg.lang_code)
        scratch_name = _make_tools_labs_dbname(
            cursor, 'scratch', cfg.lang_code)
        # generate a sql query that will atomically swap tables in
        # 'citationhunt' and 'scratch'. Modified from:
        # http://blog.shlomoid.com/2010/02/emulating-missing-rename-database.html
        cursor.execute('''SET group_concat_max_len = 2048;''')
        build_rename = '''
            SELECT CONCAT('RENAME TABLE ',
            GROUP_CONCAT('%s.', table_name,
            ' TO ', table_schema, '.old_', table_name, ', ',
            table_schema, '.', table_name, ' TO ', '%s.', table_name),';')
            FROM information_schema.TABLES WHERE table_schema = '%s'
            GROUP BY table_schema;
        ''' % (live_name, live_name, scratch_name)
        cursor.execute(build_rename)

        generated_row = cursor.fetchone()
        cursor.execute(generated_row[0])
        cursor.execute('DROP DATABASE ' + scratch_name)
Ejemplo n.º 41
0
def initialize_all_databases():
    """Reset the scratch database and set up the CitationHunt schema.

    Drops the 'scratch' database of the configured language if it exists,
    makes sure the per-language 'citationhunt' and 'scratch' databases and
    the shared 'stats' database (registered under 'global' instead of a
    language code) exist, and creates the tables in each of them.
    """
    cfg = config.get_localized_config()
    lang = cfg.lang_code

    def _ensure_database(cur, name, scope):
        # Create the tools-labs database for (name, scope) if it is missing.
        full_name = _make_tools_labs_dbname(cur, name, scope)
        cur.execute('SET SESSION sql_mode = ""')
        create_stmt = 'CREATE DATABASE IF NOT EXISTS %s CHARACTER SET utf8mb4'
        cur.execute(create_stmt % full_name)

    connection = _RetryingConnection(_connect_to_ch_mysql)
    with connection as cursor, ignore_warnings():
        # Always start from a fresh scratch database.
        scratch_name = _make_tools_labs_dbname(cursor, 'scratch', lang)
        cursor.execute('DROP DATABASE IF EXISTS ' + scratch_name)

        targets = [
            ('citationhunt', lang),
            ('scratch', lang),
            ('stats', 'global'),
        ]
        for name, scope in targets:
            _ensure_database(cursor, name, scope)

        # Same schema in scratch and in the live database, scratch first.
        for name in ('scratch', 'citationhunt'):
            _use(cursor, name, lang)
            _create_citationhunt_tables(cfg, cursor)

        _use(cursor, 'stats', 'global')
        _create_stats_tables(cfg, cursor)
Ejemplo n.º 42
0
    def wrapper(lang_code = '', *args, **kwds):
        """Validate `lang_code` and prepare per-request state for `handler`.

        Redirects to a default language when no code is given, and to 'en'
        when the code is unknown or has no strings. Otherwise stores the
        language code, config and strings on `flask.g` and calls `handler`.
        """
        header = flask.request.headers.get('Accept-Language', '')
        if not lang_code:
            default_code = find_default_lang_code_for_request(header)
            return redirect_to_lang_code(default_code)

        flask.g._lang_code = lang_code
        if lang_code not in config.LANG_CODES_TO_LANG_NAMES:
            return redirect_to_lang_code('en')

        localized_cfg = config.get_localized_config(lang_code)
        flask.g._cfg = localized_cfg

        # In debug mode, a ?locale=... query argument picks the strings.
        use_debug_locale = (
            flask.current_app.debug and 'locale' in flask.request.args)
        if not use_debug_locale:
            flask.g._lang_tag, flask.g._strings = load_strings_for_request(
                lang_code, localized_cfg, header)
        else:
            flask.g._strings = chstrings.get_localized_strings(
                localized_cfg, flask.request.args['locale'])

        if flask.g._strings:
            return handler(lang_code, *args, **kwds)

        # Shouldn't really happen: it means the language is misconfigured,
        # with a config entry but no locales in the translation files.
        return redirect_to_lang_code('en')
Ejemplo n.º 43
0
def create_tables(db):
    """Create the CitationHunt tables in `db` if they do not exist yet.

    Creates `categories` (seeded with an "unassigned" row), `articles`,
    `articles_categories`, `snippets` and `snippets_links`. All tables are
    InnoDB/utf8mb4 and every foreign key cascades on delete. Warnings are
    suppressed while the statements run.
    """
    cfg = config.get_localized_config()
    with db as cursor, ignore_warnings():
        run = cursor.execute

        # Categories, plus the placeholder "unassigned" category.
        run('''
            CREATE TABLE IF NOT EXISTS categories (id VARCHAR(128) PRIMARY KEY,
            title VARCHAR(255)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        run('''
            INSERT IGNORE INTO categories VALUES("unassigned", "unassigned")
        ''')

        # Articles and the article <-> category association.
        run('''
            CREATE TABLE IF NOT EXISTS articles (page_id VARCHAR(128)
            PRIMARY KEY, url VARCHAR(512), title VARCHAR(512))
            ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
        run('''
            CREATE TABLE IF NOT EXISTS articles_categories (
            article_id VARCHAR(128), category_id VARCHAR(128),
            FOREIGN KEY(article_id) REFERENCES articles(page_id)
            ON DELETE CASCADE,
            FOREIGN KEY(category_id) REFERENCES categories(id)
            ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')

        # The snippet column is sized at twice the configured maximum
        # snippet size.
        snippet_column_size = cfg.snippet_max_size * 2
        run('''
            CREATE TABLE IF NOT EXISTS snippets (id VARCHAR(128) PRIMARY KEY,
            snippet VARCHAR(%s), section VARCHAR(768), article_id VARCHAR(128),
            FOREIGN KEY(article_id) REFERENCES articles(page_id)
            ON DELETE CASCADE) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''', (snippet_column_size,))

        # prev/next links between snippets, per category.
        run('''
            CREATE TABLE IF NOT EXISTS snippets_links (prev VARCHAR(128),
            next VARCHAR(128), cat_id VARCHAR(128),
            FOREIGN KEY(prev) REFERENCES snippets(id) ON DELETE CASCADE,
            FOREIGN KEY(next) REFERENCES snippets(id) ON DELETE CASCADE,
            FOREIGN KEY(cat_id) REFERENCES categories(id) ON DELETE CASCADE)
            ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        ''')
Ejemplo n.º 44
0
    def test_fallback_lang_tag(self):
        """Strings missing from a locale are filled in from the fallback."""
        gcfg = config.get_global_config()
        fallback_tag = gcfg.fallback_lang_tag
        cfg = config.get_localized_config(fallback_tag, api=False)
        expected = chstrings.get_localized_strings(cfg, fallback_tag)

        # Grab the real fallback strings before the loader gets patched.
        real_fallback = chstrings._load_strings_for_lang_tag(fallback_tag)

        def fake_loader(lang_tag):
            # Simulate an incomplete strings file for the 'fake' tag and
            # serve the real strings for the fallback tag only.
            if lang_tag == 'fake':
                return {'tooltitle': 'Test Citation Hunt'}
            if lang_tag != fallback_tag:
                raise ValueError
            return real_fallback

        with mock.patch('chstrings._load_strings_for_lang_tag',
                        side_effect=fake_loader):
            merged = chstrings.get_localized_strings(cfg, 'fake')

        # The incomplete strings must have been merged with the fallback
        # ones.
        self.assertEqual('Test Citation Hunt', merged['tooltitle'])
        self.assertEqual(expected['instructions_goal'],
                         merged['instructions_goal'])
Ejemplo n.º 45
0
def compute_fixed_snippets():
    """Record snippets that were clicked and then disappeared as "fixed".

    Compares the snippets in the live database with those in the scratch
    database. Every snippet that exists only in the live database AND was
    clicked between the creation dates of the two `snippets` tables gets a
    row in the `fixed` table of the stats database.

    Returns:
        0 on completion.
    """
    start = time.time()
    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    try:
        # Find the set of snippets that were "clicked" (redirected to
        # article) between the dates of the previous/live and next/scratch
        # database
        from_ts = live_db.execute_with_retry(
            load_table_creation_date, 'snippets')
        to_ts = scratch_db.execute_with_retry(
            load_table_creation_date, 'snippets')
        clicked = stats_db.execute_with_retry(
            load_snippet_clicks_between, cfg.lang_code, from_ts, to_ts)

        # Load the snippets from both databases
        scratch_snippets = scratch_db.execute_with_retry(load_snippets)
        live_snippets = live_db.execute_with_retry(load_snippets)

        # And for each snippet that disappeared across databases AND had
        # been clicked in the meantime, store its information in the stats
        # database.
        gone = live_snippets.difference(scratch_snippets)
        # .items() (rather than the Python 2-only .iteritems()) works on
        # both Python 2 and 3.
        for snippet_id, clicked_ts in clicked.items():
            if snippet_id in gone:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT INTO fixed VALUES (%s, %s, %s)', clicked_ts,
                    snippet_id, cfg.lang_code)

        log.info('all done in %d seconds.' % (time.time() - start))
    finally:
        # Release the connections even if one of the queries above raised.
        scratch_db.close()
        live_db.close()
        stats_db.close()
    return 0
Ejemplo n.º 46
0
    try:
        _update_db_tools_labs(cfg)
    except Exception as e:
        traceback.print_exc(file = sys.stderr)
        email('Failed to build database for %s' % cfg.lang_code, logfiles)
        sys.exit(1)
    utils.mkdir_p(cfg.log_dir)
    for logfile in logfiles:
        os.rename(logfile, os.path.join(cfg.log_dir, logfile))

# Command-line entry point: validate the environment and the requested
# language code, then rebuild the databases for that language.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Update the CitationHunt databases.')
    parser.add_argument('lang_code',
        help='One of the language codes in ../config.py')
    args = parser.parse_args()

    # Refuse to run anywhere but Tools Labs.
    if not utils.running_in_tools_labs():
        print('Not running in Tools Labs!', file=sys.stderr)
        sys.exit(1)

    # Unknown language: list the valid codes on stderr, show usage and exit
    # with a failure status.
    if args.lang_code not in config.LANG_CODES_TO_LANG_NAMES:
        print('Invalid lang code! Use one of: ', end=' ', file=sys.stderr)
        print(list(config.LANG_CODES_TO_LANG_NAMES.keys()), file=sys.stderr)
        parser.print_usage()
        sys.exit(1)

    cfg = config.get_localized_config(args.lang_code)
    update_db_tools_labs(cfg)
Ejemplo n.º 47
0
import os
os.environ['DEBUG'] = '1' # disable https redirects

# Disable stats since it requires a database, and we're not
# testing it anyway
import config
config.get_localized_config('en').flagged_off.append('stats')

import app
import mock

import unittest

class CitationHuntTest(unittest.TestCase):
    """Tests for the Flask app, driven through its test client."""

    def setUp(self):
        """Create the test client and the fixture values used by the tests."""
        self.app = app.app.test_client()

        # Fixed snippet id and category id used as fixtures.
        self.sid = '93b6f3cf'
        self.cat = 'b5e1a25d'
        # NOTE(review): presumably (snippet, section, url, title), judging by
        # the values — confirm against query_snippet_by_id.
        self.fake_snippet_info = (
            'Some snippet', 'Some section',
            'https://en.wikipedia.org/wiki/A', 'Some title')

        # Pairs of (method name, canned return value). The "next id" is the
        # snippet id reversed, so it differs from self.sid.
        # NOTE(review): presumably consumed by patching code that follows —
        # not visible here.
        methods_and_return_values = [
            ('query_categories', [(self.cat, 'Category')]),
            ('query_snippet_by_id', self.fake_snippet_info),
            ('query_snippet_by_category', (self.sid,)),
            ('query_random_snippet', (self.sid,)),
            ('query_next_id', (self.sid[::-1],)),
        ]
Ejemplo n.º 48
0
# Three hours, expressed in seconds.
CACHE_DURATION_SEMI_STATIC = 3 * 60 * 60

app = flask.Flask(__name__)
Compress(app)
# Debug mode is on whenever the DEBUG environment variable is set, whatever
# its value.
debug = 'DEBUG' in os.environ
if not debug:
    # NOTE(review): SSLify presumably redirects plain-HTTP requests to HTTPS
    # (permanently, given permanent = True) — confirm in flask_sslify docs.
    flask_sslify.SSLify(app, permanent = True)
Mobility(app)

@app.route('/')
@handlers.validate_lang_code
def index(lang_code):
    """Handle the root URL.

    There is nothing to do here beyond the lang_code validation performed
    by the decorator.
    """
    return None

app.add_url_rule('/<lang_code>', view_func = handlers.citation_hunt)
# The stats page and the after-request logging hook are only registered when
# 'stats' is not listed in the flagged_off features of the 'en' config.
if 'stats' not in config.get_localized_config('en').flagged_off:
    app.add_url_rule('/<lang_code>/stats.html', view_func = handlers.stats)
    app.after_request(handlers.log_request)

@app.route('/<lang_code>/redirect')
@handlers.validate_lang_code
def redirect(lang_code):
    """Redirect to a page on the Wikipedia of `lang_code`.

    The destination comes from the `to` query parameter and is resolved
    against the base URL of the localized Wikipedia. Destinations that
    would leave that Wikipedia are replaced by its base URL.
    """
    to = urllib.unquote(flask.request.args.get('to', ''))
    cfg = config.get_localized_config(lang_code)
    base_url = 'https://' + cfg.wikipedia_domain
    target = urlparse.urljoin(base_url, to)

    # `to` is user-controlled, and urljoin lets an absolute or
    # scheme-relative value ('https://evil.example/', '//evil.example/')
    # replace the base host entirely. Only follow targets that stay on the
    # configured Wikipedia, so this endpoint is not an open redirector.
    parts = urlparse.urlsplit(target)
    if (parts.scheme, parts.netloc) != ('https', cfg.wikipedia_domain):
        target = base_url
    return flask.redirect(target)

@app.route('/<lang_code>/categories.html')
@handlers.validate_lang_code
def categories_html(lang_code):
    response = flask.make_response(
Ejemplo n.º 49
0
def redirect(lang_code):
    """Redirect to a page on the Wikipedia of `lang_code`.

    The destination comes from the `to` query parameter and is resolved
    against the base URL of the localized Wikipedia. Destinations that
    would leave that Wikipedia are replaced by its base URL.
    """
    to = urllib.parse.unquote(flask.request.args.get('to', ''))
    cfg = config.get_localized_config(lang_code)
    base_url = 'https://' + cfg.wikipedia_domain
    target = urllib.parse.urljoin(base_url, to)

    # `to` is user-controlled, and urljoin lets an absolute or
    # scheme-relative value ('https://evil.example/', '//evil.example/')
    # replace the base host entirely. Only follow targets that stay on the
    # configured Wikipedia, so this endpoint is not an open redirector.
    parts = urllib.parse.urlsplit(target)
    if (parts.scheme, parts.netloc) != ('https', cfg.wikipedia_domain):
        target = base_url
    return flask.redirect(target)
Ejemplo n.º 50
0
def redirect(lang_code):
    """Redirect to a page on the Wikipedia of `lang_code`.

    The destination comes from the `to` query parameter and is resolved
    against the base URL of the localized Wikipedia. Destinations that
    would leave that Wikipedia are replaced by its base URL.
    """
    to = urllib.unquote(flask.request.args.get('to', ''))
    cfg = config.get_localized_config(lang_code)
    base_url = 'https://' + cfg.wikipedia_domain
    target = urlparse.urljoin(base_url, to)

    # `to` is user-controlled, and urljoin lets an absolute or
    # scheme-relative value ('https://evil.example/', '//evil.example/')
    # replace the base host entirely. Only follow targets that stay on the
    # configured Wikipedia, so this endpoint is not an open redirector.
    parts = urlparse.urlsplit(target)
    if (parts.scheme, parts.netloc) != ('https', cfg.wikipedia_domain):
        target = base_url
    return flask.redirect(target)
Ejemplo n.º 51
0
import textwrap

def format_html(html):
    """Render an HTML string as plain text by piping it through lynx.

    Returns the lynx output decoded as UTF-8 with leading and trailing
    newlines stripped. If the command exits with a non-zero status, a
    warning is printed to stderr and `html` is returned unchanged.
    """
    command = (
        'lynx -dump -stdin -assume_charset UTF-8 '
        '-display_charset UTF-8 -width 80')
    renderer = subprocess.Popen(
        command, shell = True,
        stdin = subprocess.PIPE, stdout = subprocess.PIPE)
    rendered, _ = renderer.communicate(html.encode('utf-8'))

    if not renderer.returncode:
        return rendered.decode('utf-8').strip('\n')

    print('Failed to render HTML! Do you have lynx?', file=sys.stderr)
    return html

# Command-line entry point: fetch the wikitext of the page named on the
# command line, either by numeric page id or by title.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    # Try the argument as a page id first; if it is not an integer, or the
    # lookup by id fails, fall back to treating it as a title.
    title_or_pageid = arguments['<title_or_pageid>']
    try:
        wikitext = wikipedia.get_page_contents(
            pageid = int(title_or_pageid))
    except Exception:
        # Deliberately broad to keep the best-effort fallback, but unlike a
        # bare `except:` it lets KeyboardInterrupt and SystemExit through.
        wikitext = wikipedia.get_page_contents(
            title = title_or_pageid)
Ejemplo n.º 52
0
import docopt
import mwparserfromhell

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import signal
import bz2file
import pickle
import itertools
import urllib

# Localized configuration, loaded without an explicit language code.
cfg = config.get_localized_config()
# Base URLs of the configured Wikipedia and of its /wiki/ page prefix.
WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'

# NOTE(review): presumably the MediaWiki namespace id of regular articles,
# kept as a string — confirm against where it is compared.
NAMESPACE_ARTICLE = '0'

log = Logger()

def section_name_to_anchor(section):
    # See Sanitizer::escapeId
    # https://doc.wikimedia.org/mediawiki-core/master/php/html/classSanitizer.html#ae091dfff62f13c9c1e0d2e503b0cab49
    section = section.replace(' ', '_')
    # urllib.quote interacts really weirdly with unicode in Python2:
    # https://bugs.python.org/issue23885
    section = urllib.quote(e(section), safe = e(''))
    section = section.replace('%3A', ':')
Ejemplo n.º 53
0
def page_not_found(e):
    """Render the 404 page, returning a (body, 404) pair.

    If no config was attached to the request, the 'en' config is attached
    and used to render the template.
    """
    request = flask.request
    if not hasattr(request, 'cfg'):
        request.cfg = config.get_localized_config('en')
    body = flask.render_template('404.html', config = request.cfg)
    return body, 404
Ejemplo n.º 54
0
                    id = mkid(d(page_title) + sni)
                    gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_ts in gone_in_this_revision.items():
                if clicked_ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    del snippet_to_ts[snippet_id]
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)',
                        clicked_ts, snippet_id, cfg.lang_code, rev['rev_id'])

    live_db.close()
    stats_db.close()
    return 0

# Command-line entry point: runs forever, doing one pass over the requested
# language(s) and then sleeping for five minutes.
if __name__ == '__main__':
    while True:
        start = time.time()
        args = docopt.docopt(__doc__)
        # The special value 'global' selects every configured language;
        # anything else is taken as a single language code.
        lang_codes = (
            config.LANG_CODES_TO_LANG_NAMES.keys()
            if args['<lang-code>'] == 'global'
            else [args['<lang-code>']])

        for lang_code in lang_codes:
            cfg = config.get_localized_config(lang_code)
            # Only languages whose config has extract == 'snippet' are
            # processed.
            if cfg.extract == 'snippet':
                compute_fixed_snippets(cfg)
        logger.info('all done in %d seconds.' % (time.time() - start))
        time.sleep(5 * 60)
Ejemplo n.º 55
0
def get_table_name(db, database, table):
    """Return `table` qualified with the name of `database`.

    The database name is the tools-labs name of `database` for the
    configured language; the result has the form '<dbname>.<table>'.
    """
    lang_code = config.get_localized_config().lang_code
    dbname = _make_tools_labs_dbname(db.cursor(), database, lang_code)
    return dbname + '.' + table
def assign_categories(max_categories, mysql_default_cnf):
    """Choose categories for the unsourced pages and store them.

    Resets the category tables of the scratch database, loads the usable
    (non-hidden) categories of every unsourced page from the Wikipedia
    replica, picks at most `max_categories` of them via `choose_categories`,
    adds every usable "pinned" category and writes the result to the
    scratch database. Pinned categories are only loaded when running in
    Tools Labs for the 'en' language. Pages with no usable category are
    removed from the set of unsourced pages.

    Args:
        max_categories: forwarded to `choose_categories`.
        mysql_default_cnf: not used by this function.

    Returns:
        0 on completion.
    """
    def _sample_keys(mapping):
        # Up to two keys of `mapping`, for log messages only. Safe on empty
        # or single-entry mappings and on Python 3 dict views, which cannot
        # be indexed.
        return ', '.join('%s' % key for key in list(mapping)[:2])

    cfg = config.get_localized_config()
    chdb = chdb_.init_scratch_db()
    wpdb = chdb_.init_wp_replica_db()

    chdb.execute_with_retry(reset_chdb_tables)
    unsourced_pageids = load_unsourced_pageids(chdb)

    projectindex = {}
    if running_in_tools_labs() and cfg.lang_code == 'en':
        tldb = chdb_.init_projectindex_db()
        tlcursor = tldb.cursor()

        projectindex = load_projectindex(tlcursor)
        log.info('loaded projects for %d talk pages (%s...)' % \
            (len(projectindex), next(iter(projectindex.values()), None)))

    hidden_categories = wpdb.execute_with_retry(load_hidden_categories)
    log.info('loaded %d hidden categories (%s...)' % \
        (len(hidden_categories), next(iter(hidden_categories), None)))

    categories_to_ids = collections.defaultdict(set)
    pinned_categories_to_ids = collections.defaultdict(set)
    page_ids_with_no_categories = 0
    # Iterate over a copy: pages without usable categories are removed from
    # unsourced_pageids inside the loop.
    for n, pageid in enumerate(list(unsourced_pageids)):
        categories = wpdb.execute_with_retry(load_categories_for_page, pageid)
        pinned_categories = (wpdb.execute_with_retry(
            load_pinned_categories_for_page, projectindex, pageid)
            if projectindex else set())
        # Filter both kinds of categories and build the category -> pageid
        # indexes
        page_has_at_least_one_category = False
        for catname in categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                categories_to_ids[catname].add(pageid)
        for catname in pinned_categories:
            if category_is_usable(catname, hidden_categories):
                page_has_at_least_one_category = True
                pinned_categories_to_ids[catname].add(pageid)
        if not page_has_at_least_one_category:
            unsourced_pageids.remove(pageid)
            page_ids_with_no_categories += 1
        log.progress('loaded categories for %d pageids' % (n + 1))

    log.info('%d pages lack usable categories!' % page_ids_with_no_categories)
    log.info('found %d usable categories (%s...)' % \
        (len(categories_to_ids), _sample_keys(categories_to_ids)))
    if pinned_categories_to_ids:
        log.info('%d pinned categories (%s)' % \
            (len(pinned_categories_to_ids),
             _sample_keys(pinned_categories_to_ids)))

    categories = choose_categories(categories_to_ids, unsourced_pageids,
        max_categories)
    # Pinned categories are always included, on top of the chosen ones.
    categories |= set(
        (k, frozenset(v)) for k, v in pinned_categories_to_ids.items())

    update_citationhunt_db(chdb, categories)
    wpdb.close()
    chdb.close()
    return 0
Ejemplo n.º 57
0
    try:
        _update_db_tools_labs(cfg)
    except Exception, e:
        traceback.print_exc(file = sys.stderr)
        email('Failed to build database for %s' % cfg.lang_code, logfiles)
        sys.exit(1)
    utils.mkdir_p(cfg.log_dir)
    for logfile in logfiles:
        os.rename(logfile, os.path.join(cfg.log_dir, logfile))

# Command-line entry point: validate the environment and the requested
# language code, then rebuild the databases for that language.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Update the CitationHunt databases.')
    parser.add_argument('lang_code',
        help='One of the language codes in ../config.py')
    args = parser.parse_args()

    # Refuse to run unless we are both in Tools Labs and inside a virtualenv.
    if not (utils.running_in_tools_labs() and utils.running_in_virtualenv()):
        print >>sys.stderr, 'Not running in a virtualenv in Tools Labs!'
        sys.exit(1)

    # Unknown language: list the valid codes on stderr, show usage and exit
    # with a failure status.
    if args.lang_code not in config.LANG_CODES_TO_LANG_NAMES:
        print >>sys.stderr, 'Invalid lang code! Use one of: ',
        print >>sys.stderr, config.LANG_CODES_TO_LANG_NAMES.keys()
        parser.print_usage()
        sys.exit(1)

    cfg = config.get_localized_config(args.lang_code)
    update_db_tools_labs(cfg)