Ejemplo n.º 1
0
def process_unseen_articles(wikidb, wp10db, project, old_ratings, seen):
    """Reset the ratings of previously rated articles not seen in this run.

    Every entry of ``old_ratings`` whose reference is not in ``seen`` gets a
    fresh ``Rating`` with its quality and/or importance set to the encoded
    ``NOT_A_CLASS`` value, which is written through
    ``logic_rating.insert_or_update`` and logged with
    ``logic_rating.add_log_for_rating``. If move data is found for the
    article, the move is recorded via ``logic_page.update_page_moved`` and the
    move timestamp is used for the rating; otherwise ``GLOBAL_TIMESTAMP_WIKI``
    is used.

    Args:
        wikidb: Connection passed to ``logic_page.get_move_data``.
        wp10db: Connection used for all writes. It is pinged and committed
            after every ``MAX_ARTICLES_BEFORE_COMMIT`` processed articles and
            once more at the end.
        project: Project being updated; ``p_project`` and ``timestamp_dt``
            are read from it.
        old_ratings: Mapping of article reference to the previously stored
            rating. References are bytes of the form
            ``b'<namespace>:<title>'``.
        seen: Container of article references found during this update.
    """
    denom = len(old_ratings)
    ratio = len(seen) / denom if denom != 0 else 'NaN'

    logger.debug('Looking for unseen articles, ratio was: %s', ratio)

    # This function stores the UTF-8 encoded form of NOT_A_CLASS, so a rating
    # that was reset earlier holds bytes, which never compare equal to the str
    # constant. Accept both forms (and None) as "not set".
    not_a_class = NOT_A_CLASS.encode('utf-8')
    unset_values = (None, NOT_A_CLASS, not_a_class)

    in_seen = 0
    skipped = 0
    processed = 0
    n = 0
    for ref, old_rating in old_ratings.items():
        if ref in seen:
            in_seen += 1
            continue

        # By default, we evaluate both assessment kinds.
        kind = AssessmentKind.BOTH
        if old_rating.r_quality in unset_values:
            # The quality rating is not set, so just evaluate importance
            kind = AssessmentKind.IMPORTANCE
            if old_rating.r_importance in unset_values:
                # The importance rating is also not set, so don't do anything.
                skipped += 1
                continue

        ref_str = ref.decode('utf-8')
        logger.debug('Processing unseen article %s', ref_str)
        processed += 1
        # Split only on the first colon: titles may themselves contain colons.
        ns_str, title_str = ref_str.split(':', 1)
        ns = int(ns_str)
        title = title_str.encode('utf-8')

        move_data = logic_page.get_move_data(wp10db, wikidb, ns, title,
                                             project.timestamp_dt)
        was_moved = move_data is not None
        if was_moved:
            logic_page.update_page_moved(wp10db, project, ns, title,
                                         move_data['dest_ns'],
                                         move_data['dest_title'],
                                         move_data['timestamp_dt'])

        update_quality = kind in (AssessmentKind.QUALITY, AssessmentKind.BOTH)
        update_importance = kind in (AssessmentKind.IMPORTANCE,
                                     AssessmentKind.BOTH)

        # Mark this article as having NOT_A_CLASS for its quality or importance.
        # This probably means the article was deleted, but could in fact mean that
        # we just failed to find its move data. Either way, the new article would
        # have already been picked up by the assessment updater, assuming it was
        # tagged correctly.
        rating = Rating(r_project=project.p_project,
                        r_namespace=ns,
                        r_article=title,
                        r_score=0)
        if update_quality:
            # Must be the ``r_``-prefixed field; a bare ``quality`` attribute
            # is not the field the rest of the code reads.
            rating.r_quality = not_a_class
            if was_moved:
                rating.set_quality_timestamp_dt(move_data['timestamp_dt'])
            else:
                rating.r_quality_timestamp = GLOBAL_TIMESTAMP_WIKI
        if update_importance:
            rating.r_importance = not_a_class
            if was_moved:
                rating.set_importance_timestamp_dt(move_data['timestamp_dt'])
            else:
                rating.r_importance_timestamp = GLOBAL_TIMESTAMP_WIKI

        logic_rating.insert_or_update(wp10db, rating, kind)

        if update_quality:
            logic_rating.add_log_for_rating(wp10db, rating,
                                            AssessmentKind.QUALITY,
                                            old_rating.r_quality)
        if update_importance:
            logic_rating.add_log_for_rating(wp10db, rating,
                                            AssessmentKind.IMPORTANCE,
                                            old_rating.r_importance)

        n += 1
        if n >= MAX_ARTICLES_BEFORE_COMMIT:
            wp10db.ping()
            wp10db.commit()
            # Start a new batch; without the reset every later iteration
            # would commit again.
            n = 0
    logger.info('End, committing db')
    wp10db.ping()
    wp10db.commit()

    logger.debug('SEEN REPORT:\nin seen: %s\nskipped: %s\nprocessed: %s',
                 in_seen, skipped, processed)
Ejemplo n.º 2
0
def update_project_assessments_by_kind(wikidb, wp10db, project,
                                       extra_assessments, kind, old_ratings,
                                       seen):
    """Collect the current ratings of one kind for every article in a project.

    The rating categories for ``kind`` are obtained from
    ``update_project_categories_by_kind``; the pages of each category are then
    listed and a ``Rating`` is built (or copied from ``old_ratings``) with the
    quality or importance field set to the category's rating and the matching
    timestamp set from the page's ``cl_timestamp``. The ratings are only
    collected here, not written.

    Args:
        wikidb: Connection used to list the pages in each category.
        wp10db: Connection passed to ``update_project_categories_by_kind``;
            pinged and committed after every ``MAX_ARTICLES_BEFORE_COMMIT``
            collected pages and once more at the end.
        project: Project being updated; ``p_project`` is read from it.
        extra_assessments: Passed through unchanged to
            ``update_project_categories_by_kind``.
        kind: ``AssessmentKind.QUALITY`` or ``AssessmentKind.IMPORTANCE``.
        old_ratings: Mapping of article reference
            (``b'<namespace>:<title>'``) to the previously stored rating.
        seen: Set of article references; every accepted page's reference is
            added to it.

    Returns:
        A tuple ``(new_ratings, rating_to_category)`` where ``new_ratings``
        maps each article reference to a list of
        ``(rating, kind, old_rating_value)`` tuples.

    Raises:
        ValueError: If ``kind`` is neither QUALITY nor IMPORTANCE.
    """
    if kind not in (AssessmentKind.QUALITY, AssessmentKind.IMPORTANCE):
        raise ValueError(
            'Parameter "kind" was not one of QUALITY or IMPORTANCE')

    logger.info('Updating project %s assessments for %s', kind,
                project.p_project)
    rating_to_category = update_project_categories_by_kind(
        wikidb, wp10db, project, extra_assessments, kind)

    is_quality = kind == AssessmentKind.QUALITY

    n = 0
    new_ratings = defaultdict(list)
    for current_rating, (category, _ranking) in rating_to_category.items():
        # Pass the argument lazily so the message is only formatted when the
        # record is actually emitted.
        logger.info('Fetching article list for %r', category.decode('utf-8'))
        current_rating = current_rating.encode('utf-8')

        for page in logic_page.get_pages_by_category(wikidb, category):
            # Talk pages are tagged, we want the NS of the article itself.
            namespace = page.page_namespace - 1
            if not logic_util.is_namespace_acceptable(namespace):
                logger.debug('Skipping %s with namespace=%s', page.page_title,
                             namespace)
                continue

            article_ref = str(namespace).encode(
                'utf-8') + b':' + page.page_title
            seen.add(article_ref)

            old_rating = old_ratings.get(article_ref)

            if old_rating:
                # Work on a copy so the caller's old rating stays untouched.
                rating = Rating(**attr.asdict(old_rating))
                if is_quality:
                    old_rating_value = rating.r_quality
                else:
                    old_rating_value = rating.r_importance
            else:
                rating = Rating(r_project=project.p_project,
                                r_namespace=namespace,
                                r_article=page.page_title,
                                r_score=0)
                old_rating_value = NOT_A_CLASS.encode('utf-8')

            if is_quality:
                rating.r_quality = current_rating
                rating.set_quality_timestamp_dt(page.cl_timestamp)
            else:
                rating.r_importance = current_rating
                rating.set_importance_timestamp_dt(page.cl_timestamp)

            new_ratings[article_ref].append((rating, kind, old_rating_value))
            n += 1
            if n >= MAX_ARTICLES_BEFORE_COMMIT:
                wp10db.ping()
                wp10db.commit()
                # Start a new batch; without the reset every later page
                # would trigger another ping and commit.
                n = 0
    logger.info('End, committing db')
    wp10db.ping()
    wp10db.commit()

    return (new_ratings, rating_to_category)