def dashboard_classifier():
    # Build per-day counts of the machine classifier's decisions, caching the
    # result so the full collection scan only happens once.
    if 'stats' not in cache:
        docs = db_articles.find()
        stats = defaultdict(Counter)
        for doc in docs:
            if 'machine' in doc.get('analysis', {}):
                # Bucket by midnight of the day the article was added.
                timestamp = time.mktime(doc['time_added'].date().timetuple())
                stats[timestamp]['remove'] += len(
                    doc['analysis']['machine']['candidates']['remove'])
                stats[timestamp]['confirm'] += len(
                    doc['analysis']['machine']['candidates']['confirm'])

        stats_json = []
        for day, day_stats in sorted(stats.items()):
            day_stats = dict(day_stats)
            day_stats['day'] = day
            stats_json.append(day_stats)

        cache['stats'] = stats_json

    # Serialize only the cached stats list, not the whole cache dict.
    return render_template('dashboard_classifier.html',
                           stats_json=json.dumps(cache['stats']))
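
# For reference, the serialized payload the template receives is a list of
# per-day dicts; 'day' is the midnight epoch timestamp produced by
# time.mktime() above (the values below are illustrative, not real data):
#
#     [{"day": 1412035200.0, "remove": 12, "confirm": 3},
#      {"day": 1412121600.0, "remove": 7, "confirm": 5}]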

def dashboard_article(query_id):
    # Look up a saved dashboard query and fetch the matching articles.
    query = dashboard_query_index[query_id]
    docs = list(db_articles.find(query['query']))

    # Attach each article's first page for display.
    for doc in docs:
        doc['page'] = Page.get(doc['pages'][0])

    return render_template('dashboard_query.html',
                           query=query,
                           articles=docs)
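
# For orientation, dashboard_query_index is assumed (it is not shown here) to
# map a query id to a saved query definition whose 'query' key is a raw
# MongoDB filter passed straight to find(); the other fields are illustrative:
#
#     dashboard_query_index = {
#         'no_pages': {
#             'name': 'Articles without pages',        # illustrative field
#             'query': {'pages': {'$exists': False}},  # real filter shape
#         },
#     }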

def moderation_queue():
    # Show the 50 most recently added articles awaiting moderation.
    # limit(50) is the idiomatic equivalent of slicing the cursor with [:50].
    articles = list(db_articles.find({'state': 'moderated'})
                               .sort([('time_added', -1)])
                               .limit(50))

    for doc in articles:
        if 'pages' in doc:
            doc['page'] = Page.get(doc['pages'][0])

    return render_template('moderation_queue.html', articles=articles)

from ppsay.db import db_articles
from ppsay.article import Article

# Re-run stream updates for articles whose final analysis no longer has any
# live (non-removed) candidates or constituencies.
docs = db_articles.find()

for doc in docs:
    article = Article(doc)
    print article.id

    num_final_candidates = len(
        [x for x in article.analysis['final']['candidates']
         if x['state'] not in ['removed', 'removed_ml']])
    num_final_constituencies = len(
        [x for x in article.analysis['final']['constituencies']
         if x['state'] not in ['removed', 'removed_ml']])

    if num_final_candidates == 0 and num_final_constituencies == 0:
        article.update_stream()

from ppsay.page import Page
from ppsay.db import db_articles
from urlparse import urlparse

# Migration: split each article's embedded 'page' blob into standalone Page
# documents, one per (url, final_url) pair.
for doc in db_articles.find():
    if 'page' not in doc or doc['page'] is None:
        print "Skipping"
        continue

    page_ids = []
    for url, final_url in zip(doc['page']['urls'], doc['page']['final_urls']):
        print url, final_url

        # Note: d aliases the embedded page dict, so these keys are set on it
        # in place before the copy is handed to Page.
        d = doc['page']
        d['url'] = url
        d['final_url'] = final_url
        d['source'] = doc['source']
        d['domain'] = urlparse(final_url).netloc

        page = Page.get_url(d['url'])
        if page is None:
            print "Creating {}".format(d['url'])
            page = Page(d)
            page.save()
        else:
            print "Exists"

        page_ids.append(page._id)
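
# page_ids is collected above but never written back. A plausible follow-up
# (an assumption, not part of the original script) would point each article at
# its new Page documents, assuming db_articles is a standard pymongo
# collection; the integrity check below expects exactly this 'pages' field:
#
#     db_articles.update({'_id': doc['_id']},
#                        {'$set': {'pages': page_ids}})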
""" Check that every article's pages exist. """ from ppsay.db import db_articles, db_pages for article_doc in db_articles.find(): if 'pages' not in article_doc: print article_doc['_id'], "MISSING PAGES" continue for page_id in article_doc['pages']: page_doc = db_pages.find_one({'_id': page_id}) if page_doc is None: print page_id, "PAGE MISSING"

from ppsay.domains import get_domain, new_domain
from ppsay.db import db_articles

# Ensure every approved article's domain has a domain record, creating one
# where it is missing.
for article in db_articles.find({'state': 'approved'}):
    article_domain = article.get('domain')

    if article_domain is not None:
        domain = get_domain(article_domain)
        if domain is None:
            # Bind the result to a fresh name rather than shadowing the
            # imported new_domain() function, which would make the next
            # missing domain crash when it tried to call the return value.
            created = new_domain(article_domain)
            print created

from ppsay.db import db_articles
from ppsay.article import Article

# Rebuild the stream entry for every article that has pages.
docs = db_articles.find()

for doc in docs:
    if 'pages' not in doc:
        print "MISSING PAGES"
        continue

    article = Article(doc)
    print article.id
    article.update_stream()