def process_unseen_articles(wikidb, wp10db, project, old_ratings, seen):
    """Reset ratings of previously rated articles that were not seen this run.

    Each key of ``old_ratings`` that is missing from ``seen`` is treated as an
    article that is no longer tagged for the project. For such an article:

    * move data is looked up and, if found, the move is recorded via
      ``logic_page.update_page_moved``;
    * a fresh ``Rating`` with NOT_A_CLASS quality and/or importance is
      written through ``logic_rating.insert_or_update``;
    * a log entry is added for every assessment kind that was reset.

    Articles whose old quality *and* importance are both already unset are
    skipped. Work is committed in batches of ``MAX_ARTICLES_BEFORE_COMMIT``
    articles and once more at the end.

    Args:
        wikidb: Wiki database handle, passed to ``logic_page.get_move_data``.
        wp10db: Database handle that receives all writes; ``ping()`` and
            ``commit()`` are called on it.
        project: Project object; ``p_project`` and ``timestamp_dt`` are read.
        old_ratings: Mapping of article reference to the existing rating.
            References are UTF-8 bytes of the form ``b'<namespace>:<title>'``.
        seen: Container of article references that should be left alone.
    """
    total = len(old_ratings)
    ratio = len(seen) / total if total != 0 else 'NaN'
    logger.debug('Looking for unseen articles, ratio was: %s', ratio)

    not_a_class = NOT_A_CLASS.encode('utf-8')
    # Ratings are written as encoded bytes elsewhere in this module, and in
    # Python 3 a bytes value never compares equal to a str. Accept both forms
    # (and None) so that an already-reset rating is recognised as unset.
    unset_values = (None, NOT_A_CLASS, not_a_class)

    in_seen = 0
    skipped = 0
    processed = 0
    pending = 0  # Articles handled since the last commit.
    for ref, old_rating in old_ratings.items():
        if ref in seen:
            in_seen += 1
            continue

        # By default, we evaluate both assessment kinds.
        kind = AssessmentKind.BOTH
        if old_rating.r_quality in unset_values:
            # The quality rating is not set, so just evaluate importance.
            kind = AssessmentKind.IMPORTANCE
            if old_rating.r_importance in unset_values:
                # The importance rating is also not set: nothing to reset.
                skipped += 1
                continue

        ref_text = ref.decode('utf-8')
        logger.debug('Processing unseen article %s', ref_text)
        processed += 1
        ns_text, title_text = ref_text.split(':', 1)
        ns = int(ns_text)
        title = title_text.encode('utf-8')

        move_data = logic_page.get_move_data(
            wp10db, wikidb, ns, title, project.timestamp_dt)
        if move_data is not None:
            logic_page.update_page_moved(
                wp10db, project, ns, title,
                move_data['dest_ns'], move_data['dest_title'],
                move_data['timestamp_dt'])

        # Mark this article as having NOT_A_CLASS for its quality or
        # importance. This probably means the article was deleted, but could
        # in fact mean that we just failed to find its move data. Either way,
        # the new article would have already been picked up by the assessment
        # updater, assuming it was tagged correctly.
        rating = Rating(r_project=project.p_project,
                        r_namespace=ns,
                        r_article=title,
                        r_score=0)
        reset_quality = kind in (AssessmentKind.QUALITY, AssessmentKind.BOTH)
        reset_importance = kind in (AssessmentKind.IMPORTANCE,
                                    AssessmentKind.BOTH)

        if reset_quality:
            # NOTE(review): this used to assign ``rating.quality``; every
            # other access in this module uses the ``r_quality`` field, so
            # the value is stored there. Confirm against the Rating model.
            rating.r_quality = not_a_class
            if move_data:
                rating.set_quality_timestamp_dt(move_data['timestamp_dt'])
            else:
                rating.r_quality_timestamp = GLOBAL_TIMESTAMP_WIKI
        if reset_importance:
            # NOTE(review): same as above, previously ``rating.importance``.
            rating.r_importance = not_a_class
            if move_data:
                rating.set_importance_timestamp_dt(move_data['timestamp_dt'])
            else:
                rating.r_importance_timestamp = GLOBAL_TIMESTAMP_WIKI

        logic_rating.insert_or_update(wp10db, rating, kind)

        if reset_quality:
            logic_rating.add_log_for_rating(
                wp10db, rating, AssessmentKind.QUALITY, old_rating.r_quality)
        if reset_importance:
            logic_rating.add_log_for_rating(
                wp10db, rating, AssessmentKind.IMPORTANCE,
                old_rating.r_importance)

        pending += 1
        if pending >= MAX_ARTICLES_BEFORE_COMMIT:
            wp10db.ping()
            wp10db.commit()
            # Start a new batch; without the reset every later article would
            # trigger its own ping and commit.
            pending = 0

    logger.info('End, committing db')
    wp10db.ping()
    wp10db.commit()
    logger.debug('SEEN REPORT:\nin seen: %s\nskipped: %s\nprocessed: %s',
                 in_seen, skipped, processed)
def update_project_assessments_by_kind(wikidb, wp10db, project,
                                       extra_assessments, kind, old_ratings,
                                       seen):
    """Collect the current ratings of one kind for all pages of a project.

    The project's rating categories are refreshed through
    ``update_project_categories_by_kind``. For every page in each category, a
    ``Rating`` is built (copied from the old rating if one exists) with the
    quality or importance value and timestamp set from the category.

    Args:
        wikidb: Wiki database handle, passed to the category and page lookups.
        wp10db: Database handle; ``ping()`` and ``commit()`` are called on it.
        project: Project object; ``p_project`` is read.
        extra_assessments: Passed through to
            ``update_project_categories_by_kind``.
        kind: ``AssessmentKind.QUALITY`` or ``AssessmentKind.IMPORTANCE``.
        old_ratings: Mapping of article reference
            (``b'<namespace>:<title>'``) to the existing rating.
        seen: Set of article references; every accepted page is added to it.

    Returns:
        A ``(new_ratings, rating_to_category)`` tuple. ``new_ratings`` maps
        each article reference to a list of
        ``(rating, kind, old_rating_value)`` tuples. ``rating_to_category``
        is the mapping returned by ``update_project_categories_by_kind``.

    Raises:
        ValueError: If ``kind`` is neither QUALITY nor IMPORTANCE.
    """
    if kind not in (AssessmentKind.QUALITY, AssessmentKind.IMPORTANCE):
        raise ValueError(
            'Parameter "kind" was not one of QUALITY or IMPORTANCE')

    # Arguments follow the order of the placeholders: project, then kind.
    logger.info('Updating project %s assessments for %s',
                project.p_project, kind)
    rating_to_category = update_project_categories_by_kind(
        wikidb, wp10db, project, extra_assessments, kind)

    is_quality = kind == AssessmentKind.QUALITY
    pending = 0  # Pages handled since the last commit.
    new_ratings = defaultdict(list)
    for rating_name, (category, _ranking) in rating_to_category.items():
        logger.info('Fetching article list for %r', category.decode('utf-8'))
        current_rating = rating_name.encode('utf-8')

        for page in logic_page.get_pages_by_category(wikidb, category):
            # Talk pages are tagged, we want the NS of the article itself.
            namespace = page.page_namespace - 1
            if not logic_util.is_namespace_acceptable(namespace):
                logger.debug('Skipping %s with namespace=%s',
                             page.page_title, namespace)
                continue

            article_ref = (str(namespace).encode('utf-8') + b':' +
                           page.page_title)
            seen.add(article_ref)

            old_rating = old_ratings.get(article_ref)
            if old_rating:
                # Work on a copy so the entry in old_ratings keeps its
                # previous values.
                rating = Rating(**attr.asdict(old_rating))
                old_rating_value = (rating.r_quality if is_quality
                                    else rating.r_importance)
            else:
                rating = Rating(r_project=project.p_project,
                                r_namespace=namespace,
                                r_article=page.page_title,
                                r_score=0)
                old_rating_value = NOT_A_CLASS.encode('utf-8')

            # kind was validated above, so it is QUALITY or IMPORTANCE.
            if is_quality:
                rating.r_quality = current_rating
                rating.set_quality_timestamp_dt(page.cl_timestamp)
            else:
                rating.r_importance = current_rating
                rating.set_importance_timestamp_dt(page.cl_timestamp)

            new_ratings[article_ref].append((rating, kind, old_rating_value))

            pending += 1
            if pending >= MAX_ARTICLES_BEFORE_COMMIT:
                wp10db.ping()
                wp10db.commit()
                # Start a new batch; without the reset every later page
                # would trigger its own ping and commit.
                pending = 0

    logger.info('End, committing db')
    wp10db.ping()
    wp10db.commit()
    return (new_ratings, rating_to_category)