Example #1
    def test_chunked(self):
        # chunking nothing yields nothing.
        eq_(list(chunked([], 1)), [])

        # chunking list where len(list) < n
        eq_(list(chunked([1], 10)), [[1]])

        # chunking a list where len(list) == n
        eq_(list(chunked([1, 2], 2)), [[1, 2]])

        # chunking list where len(list) > n
        eq_(list(chunked([1, 2, 3, 4, 5], 2)), [[1, 2], [3, 4], [5]])

        # passing in a length overrides the real len(list)
        eq_(list(chunked([1, 2, 3, 4, 5, 6, 7], 2, length=4)), [[1, 2], [3, 4]])
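The test above effectively documents the chunked helper that every example on this page uses: it yields successive lists of up to n items, and an optional length argument overrides len() on the input, which lets callers chunk querysets or other lazy iterables without evaluating them just to count them. Below is a minimal sketch that satisfies these tests; it is only an illustration, and the real kitsune.search.utils.chunked may be implemented differently.

from itertools import islice


def chunked(iterable, n, length=None):
    """Yield successive lists of up to n items from iterable.

    length, when given, overrides len(iterable). This is a sketch
    matching the tests above, not the actual kitsune implementation.
    """
    if length is None:
        length = len(iterable)

    items = iter(iterable)
    for _ in range(0, length, n):
        chunk = list(islice(items, n))
        if not chunk:
            break
        yield chunk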
Example #2
def auto_archive_old_questions():
    """Archive all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(Question.objects.filter(is_archived=False)
                                 .filter(created__lte=days_180)
                                 .values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_archived = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        if not transaction.get_connection().in_atomic_block:
            transaction.commit()

        if settings.ES_LIVE_INDEXING:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.

                from kitsune.search.utils import chunked
                for chunk in chunked(q_ids, 100):

                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_archived'] = True
                        doc[u'indexed_on'] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(QuestionMappingType, q_ids)
Example #3
def auto_archive_old_questions():
    """Archive all questions that were created over 180 days ago"""
    # Set up logging so it doesn't send Ricky email.
    logging.basicConfig(level=logging.ERROR)

    # Get a list of ids of questions we're going to go change. We need
    # a list of ids so that we can feed it to the update, but then
    # also know what we need to update in the index.
    days_180 = datetime.now() - timedelta(days=180)
    q_ids = list(
        Question.objects.filter(is_archived=False).filter(
            created__lte=days_180).values_list('id', flat=True))

    if q_ids:
        log.info('Updating %d questions', len(q_ids))

        sql = """
            UPDATE questions_question
            SET is_archived = 1
            WHERE id IN (%s)
            """ % ','.join(map(str, q_ids))

        cursor = connection.cursor()
        cursor.execute(sql)
        if not transaction.get_connection().in_atomic_block:
            transaction.commit()

        if settings.ES_LIVE_INDEXING:
            try:
                # So... the first time this runs, it'll handle 160K
                # questions or so which stresses everything. Thus we
                # do it in chunks because otherwise this won't work.
                #
                # After we've done this for the first time, we can nix
                # the chunking code.

                from kitsune.search.utils import chunked
                for chunk in chunked(q_ids, 100):

                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    log.info('Updating %d index documents', len(es_docs))

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_archived'] = True
                        doc[u'indexed_on'] = int(time.time())
                        documents.append(doc)

                    QuestionMappingType.bulk_index(documents)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(QuestionMappingType, q_ids)
Example #4
    def test_chunked(self):
        # chunking nothing yields nothing.
        eq_(list(chunked([], 1)), [])

        # chunking list where len(list) < n
        eq_(list(chunked([1], 10)), [[1]])

        # chunking a list where len(list) == n
        eq_(list(chunked([1, 2], 2)), [[1, 2]])

        # chunking list where len(list) > n
        eq_(list(chunked([1, 2, 3, 4, 5], 2)), [[1, 2], [3, 4], [5]])

        # passing in a length overrides the real len(list)
        eq_(list(chunked([1, 2, 3, 4, 5, 6, 7], 2, length=4)),
            [[1, 2], [3, 4]])
Example #5
def recalculate_karma_points():
    """Go through all karma action data and recalculate points."""
    if not waffle.switch_is_active('karma'):
        return

    for chunk in chunked(list(KarmaManager().user_ids()), 2500):
        _process_recalculate_chunk.apply_async(args=[chunk])
Example #7
def rebuild_kb():
    """Re-render all documents in the KB in chunks."""
    cache.delete(settings.WIKI_REBUILD_TOKEN)

    d = (Document.objects.using("default")
         .filter(current_revision__isnull=False)
         .values_list("id", flat=True))

    for chunk in chunked(d, 100):
        _rebuild_kb_chunk.apply_async(args=[chunk])
Example #8
def init_karma():
    """Flushes the karma redis backend and populates with fresh data.

    Goes through all questions/answers/votes and saves karma actions for them.
    """
    if not waffle.switch_is_active('karma'):
        return

    redis_client('karma').flushdb()

    questions = Question.objects.all()
    for chunk in chunked(questions.values_list('pk', flat=True), 200):
        _process_question_chunk.apply_async(args=[chunk])

    votes = AnswerVote.objects.all()
    for chunk in chunked(votes.values_list('pk', flat=True), 1000):
        _process_answer_vote_chunk.apply_async(args=[chunk])
Example #10
def rebuild_kb():
    """Re-render all documents in the KB in chunks."""
    cache.delete(settings.WIKI_REBUILD_TOKEN)

    d = (Document.objects.using('default').filter(
        current_revision__isnull=False).values_list('id', flat=True))

    for chunk in chunked(d, 100):
        _rebuild_kb_chunk.apply_async(args=[chunk])
Example #11
    def forwards(self, orm):
        """Copy all the question.{topics,products} to {topic,product} FK."""

        count = orm.Question.objects.count()
        for chunk in chunked(orm.Question.objects.all(), 2500, count):
            for question in chunk:
                question.product = question.products.first()
                question.topic = question.topics.first()
                if question.product or question.topic:
                    question.save()
Example #13
    def change_and_reindex(self, orm, is_archived, is_locked):
        """Locks all questions that were created over 180 days ago"""
        # Get a list of ids of questions we're going to go change. We need
        # a list of ids so that we can feed it to the update, but then
        # also know what we need to update in the index.
        days_180 = datetime.now() - timedelta(days=180)
        assert is_archived != is_locked
        f = Q(created__lte=days_180)
        if is_archived:
            f |= Q(is_locked=True)
        if is_locked:
            f |= Q(is_archived=True)

        # Update the DB
        (orm.Question.objects.filter(f).update(
            is_archived=is_archived, is_locked=is_locked))

        # Using the efficient .update() of query sets doesn't emit any
        # signals, so live indexing won't automatically happen. This
        # does it manually.
        if settings.ES_LIVE_INDEXING:
            q_ids = list(
                orm.Question.objects.filter(f).values_list('id', flat=True))
            try:
                # This is going to process about 200K questions in
                # production, so it will take a while and stress
                # everything. To alleviate this stress, it is
                # divided into chunks.

                for chunk in chunked(q_ids, 1000):
                    # Fetch all the documents we need to update.
                    es_docs = get_documents(QuestionMappingType, chunk)

                    documents = []

                    # For each document, update the data and stick it
                    # back in the index.
                    for doc in es_docs:
                        doc[u'question_is_locked'] = is_locked
                        doc[u'question_is_archived'] = is_archived
                        doc[u'indexed_on'] = int(time.time())
                        documents.append(doc)

                    if documents:
                        QuestionMappingType.bulk_index(documents)

            except ES_EXCEPTIONS:
                # Something happened with ES, so let's push index
                # updating into an index_task which retries when it
                # fails because of ES issues.
                index_task.delay(QuestionMappingType, q_ids)
Example #14
    def delete_expired_users(self):
        """
        Remove expired instances of this manager's object class.

        Accounts to be deleted are identified by searching for
        instances of this manager's object class with expired activation
        keys, and then checking to see if their associated ``User``
        instances have the field ``is_active`` set to ``False``; any
        ``User`` who is both inactive and has an expired activation
        key will be deleted.
        """
        days_valid = settings.ACCOUNT_ACTIVATION_DAYS
        expired = datetime.now() - timedelta(days=days_valid)
        prof_ids = self.filter(user__date_joined__lt=expired)
        prof_ids = prof_ids.values_list('id', flat=True)
        for chunk in chunked(prof_ids, 1000):
            _delete_registration_profiles_chunk.apply_async(args=[chunk])
Example #16
    def handle(self, **options):
        # Get all questions (id) with a vote in the last week.
        recent = datetime.now() - timedelta(days=7)
        q = QuestionVote.objects.filter(created__gte=recent)
        q = q.values_list('question_id', flat=True).order_by('question')
        q = q.distinct()
        q_with_recent_votes = list(q)

        # Get all questions with num_votes_past_week > 0
        q = Question.objects.filter(num_votes_past_week__gt=0)
        q = q.values_list('id', flat=True)
        q_with_nonzero_votes = list(q)

        # Union.
        qs_to_update = list(set(q_with_recent_votes + q_with_nonzero_votes))

        # Chunk them for tasks.
        for chunk in chunked(qs_to_update, 50):
            update_question_vote_chunk.apply_async(args=[chunk])
Example #17
def get_customercare_stats():
    """
    Generate customer care stats from the Replies table.

    This gets cached in Redis as a sorted list of contributors, stored as JSON.

    Example Top Contributor data:

    [
        {
            'twitter_username': '******',
            'avatar': 'http://twitter.com/path/to/the/avatar.png',
            'avatar_https': 'https://twitter.com/path/to/the/avatar.png',
            'all': 5211,
            '1m': 230,
            '1w': 33,
            '1d': 3,
        },
        { ... },
        { ... },
    ]
    """
    if settings.STAGE:
        return

    contributor_stats = {}

    now = datetime.now()
    one_month_ago = now - timedelta(days=30)
    one_week_ago = now - timedelta(days=7)
    yesterday = now - timedelta(days=1)

    for chunk in chunked(Reply.objects.all(), 2500, Reply.objects.count()):
        for reply in chunk:
            user = reply.twitter_username
            if user not in contributor_stats:
                raw = json.loads(reply.raw_json)
                if 'from_user' in raw:  # For tweets collected using v1 API
                    user_data = raw
                else:
                    user_data = raw['user']

                contributor_stats[user] = {
                    'twitter_username': user,
                    'avatar': user_data['profile_image_url'],
                    'avatar_https': user_data['profile_image_url_https'],
                    'all': 0,
                    '1m': 0,
                    '1w': 0,
                    '1d': 0,
                }
            contributor = contributor_stats[reply.twitter_username]

            contributor['all'] += 1
            if reply.created > one_month_ago:
                contributor['1m'] += 1
                if reply.created > one_week_ago:
                    contributor['1w'] += 1
                    if reply.created > yesterday:
                        contributor['1d'] += 1

    sort_key = settings.CC_TOP_CONTRIB_SORT
    limit = settings.CC_TOP_CONTRIB_LIMIT
    # Sort by whatever is in settings, break ties with 'all'
    contributor_stats = sorted(contributor_stats.values(),
                               key=lambda c: (c[sort_key], c['all']),
                               reverse=True)[:limit]

    try:
        redis = redis_client(name='default')
        key = settings.CC_TOP_CONTRIB_CACHE_KEY
        redis.set(key, json.dumps(contributor_stats))
    except RedisError as e:
        statsd.incr('redis.error')
        log.error('Redis error: %s' % e)

    return contributor_stats
Example #18
    def handle(self, **options):
        """
        This gets cached in Redis as a sorted list of contributors, stored as JSON.

        Example Top Contributor data:

        [
            {
                'twitter_username': '******',
                'avatar': 'http://twitter.com/path/to/the/avatar.png',
                'avatar_https': 'https://twitter.com/path/to/the/avatar.png',
                'all': 5211,
                '1m': 230,
                '1w': 33,
                '1d': 3,
            },
            { ... },
            { ... },
        ]
        """
        if settings.STAGE:
            return

        contributor_stats = {}

        now = datetime.now()
        one_month_ago = now - timedelta(days=30)
        one_week_ago = now - timedelta(days=7)
        yesterday = now - timedelta(days=1)

        for chunk in chunked(Reply.objects.all(), 2500, Reply.objects.count()):
            for reply in chunk:
                user = reply.twitter_username
                if user not in contributor_stats:
                    raw = json.loads(reply.raw_json)
                    if "from_user" in raw:  # For tweets collected using v1 API
                        user_data = raw
                    else:
                        user_data = raw["user"]

                    contributor_stats[user] = {
                        "twitter_username": user,
                        "avatar": user_data["profile_image_url"],
                        "avatar_https": user_data["profile_image_url_https"],
                        "all": 0,
                        "1m": 0,
                        "1w": 0,
                        "1d": 0,
                    }
                contributor = contributor_stats[reply.twitter_username]

                contributor["all"] += 1
                if reply.created > one_month_ago:
                    contributor["1m"] += 1
                    if reply.created > one_week_ago:
                        contributor["1w"] += 1
                        if reply.created > yesterday:
                            contributor["1d"] += 1

        sort_key = settings.CC_TOP_CONTRIB_SORT
        limit = settings.CC_TOP_CONTRIB_LIMIT
        # Sort by whatever is in settings, break ties with 'all'
        contributor_stats = sorted(
            list(contributor_stats.values()),
            key=lambda c: (c[sort_key], c["all"]),
            reverse=True,
        )[:limit]

        try:
            redis = redis_client(name="default")
            key = settings.CC_TOP_CONTRIB_CACHE_KEY
            redis.set(key, json.dumps(contributor_stats))
        except RedisError as e:
            log.error("Redis error: %s" % e)

        return contributor_stats
Example #19
def get_customercare_stats():
    """
    Generate customer care stats from the Replies table.

    This gets cached in Redis as a sorted list of contributors, stored as JSON.

    Example Top Contributor data:

    [
        {
            'twitter_username': '******',
            'avatar': 'http://twitter.com/path/to/the/avatar.png',
            'avatar_https': 'https://twitter.com/path/to/the/avatar.png',
            'all': 5211,
            '1m': 230,
            '1w': 33,
            '1d': 3,
        },
        { ... },
        { ... },
    ]
    """
    if settings.STAGE:
        print ('Skipped get_customercare_stats(). '
               'Set settings.STAGE to False to run it for real.')
        return

    contributor_stats = {}

    now = datetime.now()
    one_month_ago = now - timedelta(days=30)
    one_week_ago = now - timedelta(days=7)
    yesterday = now - timedelta(days=1)

    for chunk in chunked(Reply.objects.all(), 2500, Reply.objects.count()):
        for reply in chunk:
            user = reply.twitter_username
            if user not in contributor_stats:
                raw = json.loads(reply.raw_json)
                if 'from_user' in raw:  # For tweets collected using v1 API
                    user_data = raw
                else:
                    user_data = raw['user']

                contributor_stats[user] = {
                    'twitter_username': user,
                    'avatar': user_data['profile_image_url'],
                    'avatar_https': user_data['profile_image_url_https'],
                    'all': 0, '1m': 0, '1w': 0, '1d': 0,
                }
            contributor = contributor_stats[reply.twitter_username]

            contributor['all'] += 1
            if reply.created > one_month_ago:
                contributor['1m'] += 1
                if reply.created > one_week_ago:
                    contributor['1w'] += 1
                    if reply.created > yesterday:
                        contributor['1d'] += 1

    sort_key = settings.CC_TOP_CONTRIB_SORT
    limit = settings.CC_TOP_CONTRIB_LIMIT
    # Sort by whatever is in settings, break ties with 'all'
    contributor_stats = sorted(contributor_stats.values(),
                               key=lambda c: (c[sort_key], c['all']),
                               reverse=True)[:limit]

    try:
        redis = redis_client(name='default')
        key = settings.CC_TOP_CONTRIB_CACHE_KEY
        redis.set(key, json.dumps(contributor_stats))
    except RedisError as e:
        statsd.incr('redis.error')
        log.error('Redis error: %s' % e)

    return contributor_stats
Example #20
def add_short_links(data):
    """Create short_url's for a list of docs."""
    for chunk in chunked(data, 100):
        _add_short_links_chunked.apply_async(args=[chunk])
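Taken together, these examples share one pattern: collect a flat list of primary keys, split it with chunked, and dispatch a Celery task per chunk so that no single task has to load or reindex an entire table at once. The sketch below illustrates that pattern under stated assumptions: Article, its reindex() method, and the _reindex_chunk task are hypothetical placeholders, not kitsune code.

from celery import shared_task

from kitsune.search.utils import chunked
# Hypothetical model for illustration only.
from myapp.models import Article


@shared_task
def _reindex_chunk(ids):
    # Placeholder task body: process one chunk of primary keys.
    for obj in Article.objects.filter(id__in=ids):
        obj.reindex()


def reindex_all_articles():
    # Fetch only the ids so the dispatching task stays lightweight.
    ids = list(Article.objects.values_list('id', flat=True))

    # 500 ids per task is an arbitrary chunk size for this sketch.
    for chunk in chunked(ids, 500):
        _reindex_chunk.apply_async(args=[chunk])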