Example #1
def initializer(backdir):
    self.backdir = backdir

    # Monkey-patch wikitools to always use our existing session
    opener = WikitoolsRequestsAdapter()
    APIRequest = wikitools.api.APIRequest

    class RequestsAPIRequest(wikitools.api.APIRequest):
        def __init__(self, *args, **kwds):
            APIRequest.__init__(self, *args, **kwds)
            self.opener = opener

    wikitools.APIRequest = RequestsAPIRequest
    wikitools.api.APIRequest = RequestsAPIRequest

    self.wiki = wikitools.wiki.Wiki(WIKIPEDIA_API_URL)
    self.wiki.setUserAgent(
        'citationhunt (https://tools.wmflabs.org/citationhunt)')
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.chdb = chdb.init_scratch_db()
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
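The snippet above is a per-worker initializer: multiprocessing.Pool calls it
once in every worker process before any task runs, which is why it builds its
own patched wikitools session, parser and scratch database handle. A minimal
sketch of how such an initializer could be wired into a pool follows; the call
site, the task function and the pool size are not shown in the excerpt and are
assumptions.

import multiprocessing

def run_workers(backdir, tasks):
    # Hypothetical call site: each worker runs initializer(backdir) exactly
    # once before it starts pulling tasks off the queue.
    pool = multiprocessing.Pool(
        processes=4,
        initializer=initializer,
        initargs=(backdir,))
    try:
        return pool.map(process_task, tasks)  # process_task is hypothetical
    finally:
        pool.close()
        pool.join()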
Example #2
def initializer(backdir):
    self.backdir = backdir

    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
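The cfg.profile branch pairs the profiler started here with the finalizer
registered through multiprocessing.util.Finalize. The finalizer itself is not
shown; a plausible counterpart simply stops the per-worker profiler and dumps
its stats into backdir (the file name and layout are assumptions, and self is
the same module-level worker state object these examples use).

import cProfile
import os

def finalizer():
    # Hypothetical counterpart to the Finalize() registration above: stop
    # profiling and write this worker's stats so they can be inspected later.
    if getattr(self, 'profiler', None) is not None:
        self.profiler.disable()
        self.profiler.dump_stats(
            os.path.join(self.backdir, 'worker-%d.prof' % os.getpid()))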
Example #3
def initializer(backdir):
    self.backdir = backdir

    self.wiki = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    self.parser = snippet_parser.create_snippet_parser(self.wiki, cfg)
    self.exception_count = 0

    if cfg.profile:
        self.profiler = cProfile.Profile()
        self.profiler.enable()
        # Undocumented :( https://stackoverflow.com/questions/24717468
        multiprocessing.util.Finalize(None, finalizer, exitpriority=16)
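The "undocumented" comment refers to the fact that
multiprocessing.util.Finalize(None, callback, exitpriority=...) registers a
callback that runs when a worker process shuts down normally, even though this
helper is not part of the documented multiprocessing interface. A
self-contained toy that demonstrates the mechanism (all names here are
illustrative):

import multiprocessing
import multiprocessing.util
import os

def _cleanup():
    print('worker %d exiting' % os.getpid())

def _init():
    # With an exitpriority set, the callback fires on normal worker shutdown.
    multiprocessing.util.Finalize(None, _cleanup, exitpriority=16)

def _square(x):
    return x * x

if __name__ == '__main__':
    pool = multiprocessing.Pool(2, initializer=_init)
    print(pool.map(_square, range(4)))
    pool.close()  # let the workers drain and exit normally...
    pool.join()   # ...so their Finalize callbacks actually run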
Example #4
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, clicked_snippets in page_title_to_snippets.items():
        start_ts = min(cs.ts for cs in clicked_snippets)
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = {
                cs.snippet_id: cs for cs in clicked_snippets}
            # FIXME Duplicated logic with parse_live.py :(
            for sni in snippets:
                id = mkid(d(page_title) + sni.snippet)
                gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_snippet in gone_in_this_revision.items():
                if clicked_snippet.ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    clicked_snippets.remove(clicked_snippet)
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s, %s)',
                        clicked_snippet.ts, clicked_snippet.snippet_id,
                        cfg.lang_code, rev['rev_id'], clicked_snippet.inter_id)

    live_db.close()
    stats_db.close()
    return 0
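The inner loop above boils down to a set difference: a clicked snippet counts
as fixed in a given revision if its id is no longer among the ids extracted
from that revision and the click happened before the revision's timestamp.
Isolated as a small pure function for clarity (the names are illustrative and
not part of the project):

def fixed_in_revision(clicked, ids_in_revision, rev_timestamp):
    """clicked maps snippet_id -> click timestamp."""
    return {
        snippet_id: ts
        for snippet_id, ts in clicked.items()
        if snippet_id not in ids_in_revision and ts < rev_timestamp
    }

# Snippet 'abc' was clicked at t=1 and is gone from a revision saved at t=5,
# so it is reported as fixed; 'def' is still present, so it is not.
assert fixed_in_revision(
    {'abc': 1, 'def': 2}, {'def'}, 5) == {'abc': 1}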
Example #5
def compute_fixed_snippets(cfg):
    logger.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        logger.info('No pages to process!')
        return
    logger.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI(
        'https://' + cfg.wikipedia_domain + '/w/api.php', cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        start_ts = min(snippet_to_ts.values())
        revisions = get_page_revisions(wiki, page_title, start_ts)
        for rev in revisions:
            snippets = parser.extract(rev['contents'])
            gone_in_this_revision = dict(snippet_to_ts)
            # FIXME Duplicated logic with parse_live.py :(
            for sec, snips in snippets:
                for sni in snips:
                    id = mkid(d(page_title) + sni)
                    gone_in_this_revision.pop(id, None)
            for snippet_id, clicked_ts in gone_in_this_revision.items():
                if clicked_ts < rev['timestamp']:
                    logger.info('%s fixed at revision %s' % (
                        snippet_id, rev['rev_id']))
                    del snippet_to_ts[snippet_id]
                    stats_db.execute_with_retry_s(
                        'INSERT IGNORE INTO fixed VALUES (%s, %s, %s, %s)',
                        clicked_ts, snippet_id, cfg.lang_code, rev['rev_id'])

    live_db.close()
    stats_db.close()
    return 0
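get_page_revisions is not part of the excerpt. A plausible sketch of what it
needs to return, written against the public MediaWiki API with plain requests
(the real code goes through mwapi.MediaWikiAPI, whose interface is not visible
here, so treat the function below as an assumption about its shape):

import requests

def get_page_revisions(api_url, user_agent, page_title, start_ts):
    # Fetch the revisions of page_title newer than start_ts, oldest first.
    resp = requests.get(api_url, headers={'User-Agent': user_agent}, params={
        'action': 'query',
        'prop': 'revisions',
        'titles': page_title,
        'rvprop': 'ids|timestamp|content',
        'rvslots': 'main',
        'rvdir': 'newer',
        'rvstart': start_ts.strftime('%Y-%m-%dT%H:%M:%SZ'),
        'rvlimit': 'max',
        'format': 'json',
        'formatversion': '2',
    })
    resp.raise_for_status()
    pages = resp.json()['query'].get('pages', [])
    revisions = pages[0].get('revisions', []) if pages else []
    # Timestamps come back as ISO 8601 strings; callers comparing them to
    # datetime objects need to parse them first.
    return [{
        'rev_id': rev['revid'],
        'timestamp': rev['timestamp'],
        'contents': rev['slots']['main']['content'],
    } for rev in revisions]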
Example #6
def compute_fixed_snippets(cfg):
    log.info('computing fixed snippets for %s' % cfg.lang_code)

    live_db = chdb.init_db(cfg.lang_code)
    stats_db = chdb.init_stats_db()

    # Load snippets that have been clicked in the past few hours
    to_ts = datetime.datetime.today()
    from_ts = to_ts - datetime.timedelta(hours=3)
    page_title_to_snippets = stats_db.execute_with_retry(
        load_pages_and_snippets_to_process, cfg.lang_code, from_ts, to_ts)

    if not page_title_to_snippets:
        log.info('No pages to process!')
        return
    log.info('Will reparse pages: %r' % list(page_title_to_snippets.keys()))

    # Now fetch and parse the pages and check which snippets are gone
    wiki = mwapi.MediaWikiAPI('https://' + cfg.wikipedia_domain + '/w/api.php',
                              cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wiki, cfg)

    for page_title, snippet_to_ts in page_title_to_snippets.items():
        contents, page_ts = get_page_contents_and_timestamp(wiki, page_title)
        snippets = parser.extract(contents)
        # FIXME Duplicated logic with parse_live.py :(
        for sec, snips in snippets:
            for sni in snips:
                id = mkid(d(page_title) + sni)
                snippet_to_ts.pop(id, None)

        for snippet_id, clicked_ts in snippet_to_ts.items():
            if clicked_ts < page_ts:
                log.info(snippet_id)
                stats_db.execute_with_retry_s(
                    'INSERT IGNORE INTO fixed VALUES (%s, %s, %s)', clicked_ts,
                    snippet_id, cfg.lang_code)

    live_db.close()
    stats_db.close()
    return 0
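All three versions identify a snippet the same way: they rebuild its id from
the page title plus the snippet text and drop matching ids from the clicked
set, so whatever remains was clicked but can no longer be found. The helpers
mkid and d are not shown; the sketch below assumes d normalizes the title to
unicode text and mkid derives a short, stable content hash.

import hashlib

def d(s):
    # Hypothetical: make sure we always hash unicode text.
    return s.decode('utf-8') if isinstance(s, bytes) else s

def mkid(text):
    # Hypothetical: a short, stable content hash used as the snippet id.
    return hashlib.sha1(text.encode('utf-8')).hexdigest()[:8]

print(mkid(d('Page title') + 'Some unsourced sentence.'))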
Example #7
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print('Failed to render HTML! Do you have lynx?', file=sys.stderr)
        return html
    return stdout.decode('utf-8').strip('\n')

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    try:
        # A numeric argument is treated as a page id; anything else (or a
        # failed page id lookup) falls back to a title lookup.
        int(arguments['<title_or_pageid>'])
        wikitext = wikipedia.get_page_contents(
            pageid=int(arguments['<title_or_pageid>']))
    except:
        wikitext = wikipedia.get_page_contents(
            title=arguments['<title_or_pageid>'])

    for snippet in parser.extract(wikitext):
        print('Section: %s' % snippet.section)
        if arguments['--output'] != 'raw':
            output = format_html(snippet.snippet)
        else:
            output = '   ' + '\n   '.join(
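The excerpt is cut off by the listing, but the lynx handle used at the top is
a subprocess created earlier inside format_html. A plausible way to set it up
so that lynx.communicate(html.encode('utf-8')) renders HTML from stdin into
plain text (the exact flags are assumptions about a typical lynx invocation):

import subprocess
import sys

def format_html(html):
    # Hypothetical setup for the lynx handle used above: read HTML on stdin,
    # dump the rendered page as plain UTF-8 text on stdout.
    lynx = subprocess.Popen(
        ['lynx', '-dump', '-stdin', '-assume_charset=utf-8',
         '-display_charset=utf-8'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print('Failed to render HTML! Do you have lynx?', file=sys.stderr)
        return html
    return stdout.decode('utf-8').strip('\n')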
Example #8
    stdout, _ = lynx.communicate(html.encode('utf-8'))
    if lynx.returncode:
        print >> sys.stderr, 'Failed to render HTML! Do you have lynx?'
        return html
    return stdout.decode('utf-8').strip('\n')

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    cfg = config.get_localized_config()

    WIKIPEDIA_BASE_URL = 'https://' + cfg.wikipedia_domain
    WIKIPEDIA_WIKI_URL = WIKIPEDIA_BASE_URL + '/wiki/'
    WIKIPEDIA_API_URL = WIKIPEDIA_BASE_URL + '/w/api.php'

    wikipedia = mwapi.MediaWikiAPI(WIKIPEDIA_API_URL, cfg.user_agent)
    parser = snippet_parser.create_snippet_parser(wikipedia, cfg)

    try:
        # A numeric argument is treated as a page id; anything else (or a
        # failed page id lookup) falls back to a title lookup.
        int(arguments['<title_or_pageid>'])
        wikitext = wikipedia.get_page_contents(
            pageid=int(arguments['<title_or_pageid>']))
    except:
        wikitext = wikipedia.get_page_contents(
            title=arguments['<title_or_pageid>'])

    for section, snippets in parser.extract(wikitext):
        if not snippets: continue
        _print('Section: %s' % section)
        for snippet in snippets:
            if arguments['--output'] != 'raw':
                output = format_html(snippet)
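Both entry points rely on docopt, which derives the command line interface
from the module docstring; the docstring itself is outside the excerpt, but a
usage block consistent with the <title_or_pageid> argument and the --output
option referenced above would look roughly like this (the script name and
wording are assumptions):

"""Print the unsourced snippets found in a single Wikipedia page.

Usage:
    print_snippets.py <title_or_pageid> [--output=<format>]

Options:
    --output=<format>  Either 'html' or 'raw' [default: html].
"""
import docopt

if __name__ == '__main__':
    arguments = docopt.docopt(__doc__)
    print(arguments['<title_or_pageid>'], arguments['--output'])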