def get_db_names_to_archive(lang_code):
    database_names = []
    for db in [chdb.init_db(lang_code), chdb.init_stats_db()]:
        with db as cursor:
            cursor.execute('SELECT DATABASE()')
            database_names.append(cursor.fetchone()[0])
    return database_names
def get_db_names_to_archive(lang_code):
    database_names = []
    for db in [chdb.init_db(lang_code), chdb.init_stats_db()]:
        with db.cursor() as cursor:
            cursor.execute('SELECT DATABASE()')
            database_names.append(cursor.fetchone()[0])
    return database_names
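# Usage sketch for get_db_names_to_archive: hand the returned names to
# mysqldump. The dump step and output path below are assumptions about how
# the archive might be produced, not part of the project shown here.
import subprocess

def archive_databases(lang_code, output_dir='/tmp'):
    for database_name in get_db_names_to_archive(lang_code):
        dump_path = '%s/%s.sql' % (output_dir, database_name)
        with open(dump_path, 'wb') as output:
            subprocess.check_call(['mysqldump', database_name], stdout=output)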
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    clicked_snippets.remove(clicked_snippet)
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    live_db.close()
    stats_db.close()
    return 0
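# A driver sketch for the per-language entry point above. It assumes
# config.get_localized_config() accepts a language code and that the
# languages to process come from the command line; both are assumptions
# about the surrounding project rather than guarantees.
import sys
import config

if __name__ == '__main__':
    for lang_code in sys.argv[1:]:
        compute_fixed_snippets(config.get_localized_config(lang_code))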
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % page_title_to_snippets.keys())

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        start_ts = min(snippet_to_ts.values())
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = dict(snippet_to_ts)
            # FIXME Duplicated logic with parse_live.py :(
            for sec, snips in snippets:
                for sni in snips:
                    id = mkid(d(page_title) + sni)
                    gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_ts in gone_in_this_revision.items():
                if clicked_ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    del snippet_to_ts[snippet_id]
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)',
                        clicked_ts, snippet_id, cfg.lang_code, rev['rev_id'])

    live_db.close()
    stats_db.close()
    return 0
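# The 4-column INSERT above implies a `fixed` table shaped roughly like the
# statement below. This is only a reconstruction from that INSERT; the real
# column names, types and keys may differ.
CREATE_FIXED_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS fixed (
        clicked_ts TIMESTAMP NOT NULL,
        snippet_id VARCHAR(128) NOT NULL,
        lang_code VARCHAR(8) NOT NULL,
        rev_id INT UNSIGNED,
        PRIMARY KEY (snippet_id, lang_code)
    )
'''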
def compute_fixed_snippets(cfg):
    log.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        log.info('No pages to process!')
        return
    log.info('Will reparse pages: %r' % page_title_to_snippets.keys())

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI('https://' + cfg.wikipedia_domain + '/w/api.php',
                              cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)
        # FIXME Duplicated logic with parse_live.py :(
        for sec, snips in snippets:
            for sni in snips:
                id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(id, None)

        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)', clicked_ts,
                    snippet_id, cfg.lang_code)

    live_db.close()
    stats_db.close()
    return 0
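# The snippet IDs compared above come from mkid(d(page_title) + sni). Below
# is a plausible sketch of those helpers (a short hash of the UTF-8 text);
# the project's actual utilities may differ.
import hashlib

def d(s):
    # Decode bytes to text; pass text through unchanged.
    return s.decode('utf-8') if isinstance(s, bytes) else s

def mkid(s):
    # Short, stable identifier derived from the page title + snippet text.
    return hashlib.sha1(s.encode('utf-8')).hexdigest()[:8]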
def compute_fixed_snippets():
    start = time.time()
    # FIXME This could probably just be one query on a single database
    # connection, instead of one connection per database and loading all
    # snippets in memory for comparison.
    cfg = config.get_localized_config()
    scratch_db = chdb.init_scratch_db()
    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Find the set of snippets that were "clicked" (redirected to article)
    # between the dates of the previous/live and next/scratch database
    from_ts = live_db.execute_with_retry(load_table_creation_date, 'snippets')
    to_ts = scratch_db.execute_with_retry(load_table_creation_date, 'snippets')
    clicked = stats_db.execute_with_retry(load_snippet_clicks_between,
                                          cfg.lang_code, from_ts, to_ts)

    # Load the snippets from both databases
    scratch_snippets = scratch_db.execute_with_retry(load_snippets)
    live_snippets = live_db.execute_with_retry(load_snippets)

    # And for each snippet that disappeared across databases AND had been
    # clicked in the meantime, store its information in the stats database.
    gone = live_snippets.difference(scratch_snippets)
    for id, clicked_ts in clicked.items():
        if id in gone:
            log.info(id)
            stats_db.execute_with_retry_s(
                'INSERT INTO fixed VALUES (%s, %s, %s)', clicked_ts, id,
                cfg.lang_code)

    log.info('all done in %d seconds.' % (time.time() - start))
    scratch_db.close()
    live_db.close()
    stats_db.close()
    return 0
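# Hypothetical sketches of the query helpers used above, assuming
# execute_with_retry passes a live cursor to each callback. The names match
# the calls above, but the bodies are assumptions, not the project's code.
def load_table_creation_date(cursor, table_name):
    cursor.execute(
        'SELECT CREATE_TIME FROM information_schema.TABLES '
        'WHERE TABLE_NAME = %s AND TABLE_SCHEMA = DATABASE()', (table_name,))
    return cursor.fetchone()[0]

def load_snippets(cursor):
    cursor.execute('SELECT id FROM snippets')
    return set(row[0] for row in cursor)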
def expire_stats(cfg):
    stats_db = chdb.init_stats_db()
    with stats_db.cursor() as cursor, chdb.ignore_warnings():
        cursor.execute('DELETE FROM requests WHERE DATEDIFF(NOW(), ts) > %s',
                (cfg.stats_max_age_days,))
def get_stats_db():
    db = getattr(flask.g, '_stats_db', None)
    if db is None:
        db = flask.g._stats_db = chdb.init_stats_db()
    return db
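# A companion teardown hook for the flask.g caching above (a sketch; it
# assumes a Flask application object named `app` and that the connection
# returned by chdb.init_stats_db() exposes close()).
@app.teardown_appcontext
def close_stats_db(exception):
    db = getattr(flask.g, '_stats_db', None)
    if db is not None:
        db.close()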
def expire_stats(cfg):
    stats_db = chdb.init_stats_db()
    with stats_db as cursor, chdb.ignore_warnings():
        cursor.execute('DELETE FROM requests WHERE DATEDIFF(NOW(), ts) > %s',
                (cfg.stats_max_age_days,))