Exemple #1
0
def topics(report=False, dryrun=False, force=False):
    index = set_hosts_index()

    logprint('debug', '------------------------------------------------------------------------')
    logprint('debug', 'indexing topics...')
    Elasticsearch.index_topics()
    logprint('debug', 'DONE')
Exemple #2
0
def authors(report=False, dryrun=False, force=False):
    index = set_hosts_index()

    logprint('debug', '------------------------------------------------------------------------')
    logprint('debug', 'getting mw_authors...')
    mw_author_titles = Proxy().authors(cached_ok=False)
    mw_articles = Proxy().articles_lastmod()
    logprint('debug', 'getting es_authors...')
    es_authors = Author.authors()
    if force:
        logprint('debug', 'forcibly update all authors')
        authors_new = [page['title'] for page in es_authors]
        authors_delete = []
    else:
        logprint('debug', 'determining new,delete...')
        authors_new,authors_delete = Elasticsearch.authors_to_update(
            mw_author_titles, mw_articles, es_authors)
    logprint('debug', 'mediawiki authors: %s' % len(mw_author_titles))
    logprint('debug', 'authors to add: %s' % len(authors_new))
    logprint('debug', 'authors to delete: %s' % len(authors_delete))
    if report:
        return
    
    logprint('debug', 'deleting...')
    for n,title in enumerate(authors_delete):
        logprint('debug', '--------------------')
        logprint('debug', '%s/%s %s' % (n, len(authors_delete), title))
        author = Author.get(title)
        if not dryrun:
            author.delete()
     
    logprint('debug', 'adding...')
    for n,title in enumerate(authors_new):
        logprint('debug', '--------------------')
        logprint('debug', '%s/%s %s' % (n, len(authors_new), title))
        logprint('debug', 'getting from mediawiki')
        mwauthor = Proxy().page(title)
        try:
            existing_author = Author.get(title)
            logprint('debug', 'exists in elasticsearch')
        except:
            existing_author = None
        logprint('debug', 'creating author')
        author = Author.from_mw(mwauthor, author=existing_author)
        if not dryrun:
            logprint('debug', 'saving')
            author.save()
            try:
                a = Author.get(title)
            except NotFoundError:
                logprint('error', 'ERROR: Author(%s) NOT SAVED!' % title)
    
    logprint('debug', 'DONE')
Exemple #3
0
def articles(report=False, dryrun=False, force=False):
    index = set_hosts_index()
    
    logprint('debug', '------------------------------------------------------------------------')
    # authors need to be refreshed
    logprint('debug', 'getting mw_authors,articles...')
    mw_author_titles = Proxy().authors(cached_ok=False)
    mw_articles = Proxy().articles_lastmod()
    logprint('debug', 'getting es_articles...')
    es_articles = Page.pages()
    if force:
        logprint('debug', 'forcibly update all articles')
        articles_update = [page['title'] for page in es_articles]
        articles_delete = []
    else:
        logprint('debug', 'determining new,delete...')
        articles_update,articles_delete = Elasticsearch.articles_to_update(
            mw_author_titles, mw_articles, es_articles)
    logprint('debug', 'mediawiki articles: %s' % len(mw_articles))
    logprint('debug', 'elasticsearch articles: %s' % len(es_articles))
    logprint('debug', 'articles to update: %s' % len(articles_update))
    logprint('debug', 'articles to delete: %s' % len(articles_delete))
    if report:
        return
    
    logprint('debug', 'adding articles...')
    posted = 0
    could_not_post = []
    for n,title in enumerate(articles_update):
        logprint('debug', '--------------------')
        logprint('debug', '%s/%s %s' % (n+1, len(articles_update), title))
        logprint('debug', 'getting from mediawiki')
        mwpage = Proxy().page(title)
        try:
            existing_page = Page.get(title)
            logprint('debug', 'exists in elasticsearch')
        except:
            existing_page = None
        if (mwpage.published or settings.MEDIAWIKI_SHOW_UNPUBLISHED):
            page_sources = [source['encyclopedia_id'] for source in mwpage.sources]
            for mwsource in mwpage.sources:
                logprint('debug', '- source %s' % mwsource['encyclopedia_id'])
                source = Source.from_mw(mwsource, title)
                if not dryrun:
                    source.save()
            logprint('debug', 'creating page')
            page = Page.from_mw(mwpage, page=existing_page)
            if not dryrun:
                logprint('debug', 'saving')
                page.save()
                try:
                    p = Page.get(title)
                except NotFoundError:
                    logprint('error', 'ERROR: Page(%s) NOT SAVED!' % title)
        else:
            logprint('debug', 'not publishable: %s' % mwpage)
            could_not_post.append(mwpage)
    
    if could_not_post:
        logprint('debug', '========================================================================')
        logprint('debug', 'Could not post these: %s' % could_not_post)
    logprint('debug', 'DONE')