Esempio n. 1
0
def dashboard_classifier():
    """Render the classifier dashboard with per-day candidate stats.

    The aggregation over every article is memoized in the module-level
    `cache` under the 'stats' key, so it runs only on the first request.
    """
    if 'stats' not in cache:
        # Only hit the database on a cache miss (the original queried
        # unconditionally, even when the cached result was used).
        docs = db_articles.find()

        # day timestamp (midnight of time_added) -> Counter of actions
        stats = defaultdict(Counter)

        for doc in docs:
            if 'machine' in doc.get('analysis', {}):
                # Bucket by calendar day of time_added.
                timestamp = time.mktime(doc['time_added'].date().timetuple())

                candidates = doc['analysis']['machine']['candidates']
                stats[timestamp]['remove'] += len(candidates['remove'])
                stats[timestamp]['confirm'] += len(candidates['confirm'])

        # Flatten to a day-ordered list of dicts, each carrying its own
        # 'day' key, for the template's chart.
        stats_json = []
        for day, day_stats in sorted(stats.items()):
            day_stats = dict(day_stats)
            day_stats['day'] = day
            stats_json.append(day_stats)

        cache['stats'] = stats_json

    # Bug fix: serialize the stats list itself, not the whole cache dict —
    # the template variable is the per-day stats array.
    return render_template('dashboard_classifier.html',
                           stats_json=json.dumps(cache['stats']))
Esempio n. 2
0
def dashboard_article(query_id):
    """Render all articles matching the stored dashboard query.

    Raises KeyError if `query_id` is not in `dashboard_query_index`.
    """
    query = dashboard_query_index[query_id]
    docs = list(db_articles.find(query['query']))

    for doc in docs:
        # Bug fix: guard against articles with no pages (consistent with
        # moderation_queue) instead of raising KeyError/IndexError.
        # NOTE(review): template presumably tolerates a missing 'page'
        # key the same way moderation_queue's template does — confirm.
        if 'pages' in doc and doc['pages']:
            doc['page'] = Page.get(doc['pages'][0])

    return render_template('dashboard_query.html', query=query, articles=docs)
Esempio n. 3
0
def moderation_queue():
    """Render the 50 most recently added articles in the 'moderated' state."""
    # Parentheses make the backslash continuation of the original
    # unnecessary; the cursor slice takes the newest 50 articles.
    cursor = (db_articles.find({'state': 'moderated'})
                         .sort([('time_added', -1)]))
    articles = list(cursor[:50])

    for article in articles:
        if 'pages' in article:
            # Attach the article's first page for display.
            article['page'] = Page.get(article['pages'][0])

    return render_template('moderation_queue.html', articles=articles)
Esempio n. 4
0
def moderation_queue():
    """Show the newest 50 articles awaiting moderation."""
    moderated = db_articles.find({'state': 'moderated'})
    newest_first = moderated.sort([('time_added', -1)])
    articles = list(newest_first[:50])

    for doc in articles:
        # Decorate each article with its first page, when it has any.
        if 'pages' in doc:
            doc['page'] = Page.get(doc['pages'][0])

    return render_template('moderation_queue.html',
                           articles=articles)
Esempio n. 5
0
def dashboard_article(query_id):
    """Render all articles matching the stored dashboard query.

    Raises KeyError if `query_id` is not in `dashboard_query_index`.
    """
    query = dashboard_query_index[query_id]
    docs = list(db_articles.find(query['query']))

    for doc in docs:
        # Bug fix: articles without a 'pages' list previously raised
        # KeyError here; skip them the way moderation_queue does.
        if 'pages' in doc and doc['pages']:
            doc['page'] = Page.get(doc['pages'][0])

    return render_template('dashboard_query.html',
                           query=query,
                           articles=docs)
Esempio n. 6
0
def dashboard_classifier():
    """Render the classifier dashboard with per-day candidate stats.

    Aggregated stats are computed once and memoized in the module-level
    `cache` under 'stats'.
    """
    if 'stats' not in cache:
        # Query the DB only when the cached aggregation is absent
        # (the original issued the find() on every request).
        docs = db_articles.find()

        # day timestamp -> Counter({'remove': n, 'confirm': n})
        stats = defaultdict(Counter)

        for doc in docs:
            if 'machine' in doc.get('analysis', {}):
                # Bucket by the calendar day the article was added.
                timestamp = time.mktime(doc['time_added'].date().timetuple())

                machine = doc['analysis']['machine']['candidates']
                stats[timestamp]['remove'] += len(machine['remove'])
                stats[timestamp]['confirm'] += len(machine['confirm'])

        # Day-ordered list of dicts for the chart in the template.
        stats_json = []
        for day, day_stats in sorted(stats.items()):
            day_stats = dict(day_stats)
            day_stats['day'] = day
            stats_json.append(day_stats)

        cache['stats'] = stats_json

    # Bug fix: dump the stats list, not the enclosing cache dict.
    return render_template('dashboard_classifier.html',
                           stats_json=json.dumps(cache['stats']))
Esempio n. 7
0
from ppsay.db import db_articles
from ppsay.article import Article

docs = db_articles.find()

for doc in docs:
    article = Article(doc)

    print article.id

    num_final_candidates =len([x for x in article.analysis['final']['candidates'] if x['state'] not in ['removed', 'removed_ml']])
    num_final_constituencies =len([x for x in article.analysis['final']['constituencies'] if x['state'] not in ['removed', 'removed_ml']])

    if num_final_candidates == 0 and num_final_constituencies == 0:
        article.update_stream()

Esempio n. 8
0
from ppsay.page import Page
from ppsay.db import db_articles
from urlparse import urlparse

# Backfill standalone Page documents from each article's embedded
# 'page' sub-document: one Page per (url, final_url) pair, reusing
# an existing Page when one already exists for the URL.
for doc in db_articles.find():
    # Articles with no embedded page data cannot be migrated.
    if 'page' not in doc or doc['page'] is None:
        print "Skipping"
        continue

    # NOTE(review): page_ids is collected but never written back to the
    # article — confirm whether doc['pages'] should be updated afterwards.
    page_ids = []

    for url, final_url in zip(doc['page']['urls'], doc['page']['final_urls']):
        print url, final_url

        # NOTE: d aliases doc['page']; the assignments below mutate the
        # shared sub-document and persist across loop iterations (each
        # pass overwrites the same keys).
        d = doc['page']

        d['url'] = url
        d['final_url'] = final_url
        d['source'] = doc['source']
        d['domain'] = urlparse(final_url).netloc

        # Reuse the existing Page for this URL if present.
        page = Page.get_url(d['url'])

        if page is None:
            print "Creating {}".format(d['url'])
            page = Page(d)
            page.save()
        else:
            print "Exists"

        page_ids.append(page._id)
Esempio n. 9
0
"""
    Check that every article's pages exist.
"""

from ppsay.db import db_articles, db_pages

for article_doc in db_articles.find():
    if 'pages' not in article_doc:
        print article_doc['_id'], "MISSING PAGES"
        continue

    for page_id in article_doc['pages']:
        page_doc = db_pages.find_one({'_id': page_id})

        if page_doc is None:
            print page_id, "PAGE MISSING"


Esempio n. 10
0
from ppsay.page import Page
from ppsay.db import db_articles
from urlparse import urlparse

# Create Page documents from the embedded 'page' sub-document carried by
# each article: one Page per (url, final_url) pair, skipping URLs that
# already have a Page.
for doc in db_articles.find():
    # Nothing to migrate when the embedded page data is absent.
    if 'page' not in doc or doc['page'] is None:
        print "Skipping"
        continue

    # NOTE(review): collected ids are never persisted back onto the
    # article — verify whether a doc['pages'] update was intended here.
    page_ids = []

    for url, final_url in zip(doc['page']['urls'], doc['page']['final_urls']):
        print url, final_url

        # d is an alias of doc['page']; mutations below carry over
        # between iterations and are overwritten on each pass.
        d = doc['page']

        d['url'] = url
        d['final_url'] = final_url
        d['source'] = doc['source']
        d['domain'] = urlparse(final_url).netloc

        # Look up an existing Page for this URL before creating one.
        page = Page.get_url(d['url'])

        if page is None:
            print "Creating {}".format(d['url'])
            page = Page(d)
            page.save()
        else:
            print "Exists"

        page_ids.append(page._id)
Esempio n. 11
0
from ppsay.domains import get_domain, new_domain
from ppsay.db import db_articles

for article in db_articles.find({'state': 'approved'}):
    article_domain = article.get('domain')

    if article_domain is not None:
        domain = get_domain(article_domain)

        if domain is None:
            new_domain = new_domain(article_domain)
            print new_domain

Esempio n. 12
0
from ppsay.db import db_articles
from ppsay.article import Article

docs = db_articles.find()

for doc in docs:
    if 'pages' not in doc:
        print "MISSING PAGES"
        continue

    article = Article(doc)
    print article.id
    article.update_stream()