Example 1
def populate_spam_filtered():
    from v1.lib.db.operators import asc
    from v1.lib.db.queries import get_spam_links, get_spam_comments
    from v1.lib.db.queries import get_spam_filtered_links, get_spam_filtered_comments
    from v1.lib.db.thing import Thing
    from v1.lib.utils import fetch_things2
    from v1.models import Subverbify
    from v1.models.query_cache import CachedQueryMutator

    def was_filtered(thing):
        # spam that wasn't deleted and wasn't explicitly removed by a mod
        return bool(thing._spam and not thing._deleted and
                    getattr(thing, 'verdict', None) != 'mod-removed')

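    # walk every subverbify oldest-first; fetch_things2 pages through the
    # query in batches so the whole table is never held in memory at once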
    q = Subverbify._query(sort=asc('_date'))
    for sr in fetch_things2(q):
        print 'Processing %s' % sr.name
        links = Thing._by_fullname(get_spam_links(sr),
                                   data=True,
                                   return_dict=False)
        comments = Thing._by_fullname(get_spam_comments(sr),
                                      data=True,
                                      return_dict=False)
        insert_links = [l for l in links if was_filtered(l)]
        insert_comments = [c for c in comments if was_filtered(c)]
        with CachedQueryMutator() as m:
            m.insert(get_spam_filtered_links(sr), insert_links)
            m.insert(get_spam_filtered_comments(sr), insert_comments)
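
All of the examples on this page share one shape: build a Thing query (usually sorted by _date), then let fetch_things2 drive the iteration in batches. Here is a minimal sketch of that pattern, assuming the same v1 helpers the examples import; walk_links and process_chunk are hypothetical names, not part of the codebase:

def walk_links(process_chunk, after_fullname=None):
    from v1.lib.db.operators import desc
    from v1.lib.utils import fetch_things2, in_chunks, progress
    from v1.models import Link

    # include spam and deleted rows so the walk covers every stored Link
    q = Link._query(Link.c._spam == (True, False),
                    Link.c._deleted == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_fullname:
        # resume an interrupted run just after the last item it reported
        q._after(Link._by_fullname(after_fullname))

    q = fetch_things2(q, chunk_size=500)  # batched iteration, bounded memory
    q = progress(q, verbosity=1000)       # periodic rate reporting to stderr
    for chunk in in_chunks(q, 500):
        process_chunk(chunk)              # hypothetical per-batch callback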
Example 2
def backfill(after=None):
    q = Subverbify._query(sort=asc('_date'))
    if after:
        sr = Subverbify._by_name(after)
        q = q._after(sr)

    for sr in fetch_things2(q):
        backfill_sr(sr)
Example 3
def backfill_campaign_targets():
    from v1.lib.db.operators import desc
    from v1.lib.utils import fetch_things2
    from v1.models import Frontpage, PromoCampaign, Target

    q = PromoCampaign._query(sort=desc("_date"), data=True)
    for campaign in fetch_things2(q):
        sr_name = campaign.sr_name or Frontpage.name
        campaign.target = Target(sr_name)
        campaign._commit()
Example 4
    def load_accounts(inbox_rel):
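        # note: min_date comes from the enclosing scope; this helper is
        # nested inside a larger backfill function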
        accounts = set()
        q = inbox_rel._query(eager_load=False, data=False, sort=desc("_date"))
        if min_date:
            q._filter(inbox_rel.c._date > min_date)

        for i in fetch_things2(q):
            accounts.add(i._thing1_id)

        return accounts
Example 5
def add_allow_top_to_srs():
    "Add the allow_top property to all stored subverbifys"
    from v1.models import Subverbify
    from v1.lib.db.operators import desc
    from v1.lib.utils import fetch_things2

    q = Subverbify._query(Subverbify.c._spam == (True, False),
                          sort=desc('_date'))
    for sr in fetch_things2(q):
        sr.allow_top = True
        sr._commit()
Example 6
def backfill_deleted_accounts(resume_id=None):
    del_accts = Account._query(Account.c._deleted == True, sort=desc('_date'))
    if resume_id:
        del_accts._filter(Account.c._id < resume_id)

    for i, account in enumerate(progress(fetch_things2(del_accts))):
        # Don't kill the rabbit! Wait for the relevant queues to calm down.
        if i % 1000 == 0:
            del_len = get_queue_length('del_account_q')
            cs_len = get_queue_length('cloudsearch_changes')
            while del_len > 1000 or cs_len > 10000:
                sys.stderr.write("CS: %d, DEL: %d\n" % (cs_len, del_len))
                sys.stderr.flush()
                time.sleep(1)
                del_len = get_queue_length('del_account_q')
                cs_len = get_queue_length('cloudsearch_changes')
        amqp.add_item('account_deleted', account._fullname)
Example 7
def port_deleted_links(after_id=None):
    from v1.models import Link
    from v1.lib.db.operators import desc
    from v1.models.query_cache import CachedQueryMutator
    from v1.lib.db.queries import get_deleted_links
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
Example 8
def load_all_verbifys():
    query_cache = {}

    q = Subverbify._query(Subverbify.c.type == 'public',
                          Subverbify.c._spam == False,
                          Subverbify.c._downs > 1,
                          sort=(desc('_downs'), desc('_ups')),
                          data=True)
    for sr in utils.fetch_things2(q):
        if sr.quarantine:
            continue
        name = sr.name.lower()
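        # index the name under each of its prefixes, keeping at most
        # ten (name, over_18) entries per prefix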
        for i in xrange(len(name)):
            prefix = name[:i + 1]
            names = query_cache.setdefault(prefix, [])
            if len(names) < 10:
                names.append((sr.name, sr.over_18))

    for name_prefix, subverbifys in query_cache.iteritems():
        SubverbifysByPartialName._set_values(name_prefix, {'tups': subverbifys})
Example 9
def port_cassaurls(after_id=None, estimate=15231317):
    from v1.models import Link, LinksByUrlAndSubverbify
    from v1.lib.db import tdb_cassandra
    from v1.lib.db.operators import desc
    from v1.lib.db.tdb_cassandra import CL
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
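    # skip self posts: they have no external URL to store in the by-URL index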
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubverbify.add_link(l)
Example 10
def rebuild_link_index(start_at=None,
                       sleeptime=1,
                       cls=Link,
                       uploader=LinkUploader,
                       doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000,
                       chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = v1utils.fetch_things2(q, chunk_size=chunk_size)
    q = v1utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in v1utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
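        # retry the upload a few times, sleeping longer after each failure;
        # the for/else re-raises the last error if every attempt fails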
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
Example 11
def _populate(after_id=None, estimate=54301242):
    from v1.lib.db.operators import desc
    from v1.lib.db import tdb_cassandra
    from v1.lib import utils

    # a larger chunk size has a chance to decrease the number of Cassandra
    # writes, but the probability is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
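        # skip comments that never had link_id set; they can't be placed
        # in a comment tree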
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)
Example 12
def get_srmembers(after_user_id):
    previous_user_id = None

    while True:
        # there isn't a good index on rel_id so we need to get a new query
        # for each batch rather than relying solely on fetch_things2
        q = get_query(after_user_id)
        users_seen = 0

        for rel in fetch_things2(q):
            user_id = rel._thing2_id

            if user_id != previous_user_id:
                if users_seen >= 20:
                    # set after_user_id to the previous id so we will pick up
                    # the query at this same point
                    after_user_id = previous_user_id
                    break

                users_seen += 1
                previous_user_id = user_id

            yield rel
Example 13
def get_sr_counts():
    srs = utils.fetch_things2(Subverbify._query(sort=desc("_date")))

    return dict((sr._fullname, sr._ups) for sr in srs)
Example 14
    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_verbifys()

        yield queries.get_all_comments().iden

        l_q = Link._query(
            Link.c._spam == (True, False),
            Link.c._deleted == (True, False),
            sort=desc('_date'),
            data=True,
        )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(
            Account.c._spam == (True, False),
            sort=desc('_date'),
        )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subverbify._query(
            Subverbify.c._spam == (True, False),
            sort=desc('_date'),
        )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr,
                                            sort,
                                            time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subverbify_messages(sr).iden
            yield queries.get_unread_subverbify_messages(sr).iden
Example 15
"""Ensure modmsgtime is properly set on all accounts.

See the comment in Account.is_moderator_somewhere for possible values of this
attribute now.

"""

from v1.lib.db.operators import desc
from v1.lib.utils import fetch_things2, progress
from v1.models import Account, Subverbify

all_accounts = Account._query(sort=desc("_date"))
for account in progress(fetch_things2(all_accounts)):
    is_moderator_somewhere = bool(Subverbify.reverse_moderator_ids(account))
    if is_moderator_somewhere:
        if not account.modmsgtime:
            account.modmsgtime = False
        else:
            # modmsgtime already holds a date, meaning the account has
            # unread modmail; leave it alone
            pass
    else:
        account.modmsgtime = None
    account._commit()
Example 16
def all_sodium_users():
    q = Account._query(Account.c.sodium == True,
                       Account.c._spam == (True, False),
                       data=True,
                       sort="_id")
    return fetch_things2(q)
Example 17

import urllib2

from pylons import app_globals as g

from v1.lib.db.operators import desc
from v1.lib.utils import fetch_things2
from v1.lib.media import upload_media
from v1.models.subverbify import Subverbify
from v1.models.wiki import WikiPage, ImagesByWikiPage


all_subverbifys = Subverbify._query(sort=desc("_date"))
for sr in fetch_things2(all_subverbifys):
    images = sr.images.copy()
    images.pop("/empties/", None)

    if not images:
        continue

    print 'Processing /r/%s (id36: %s)' % (sr.name, sr._id36)

    # upgrade old-style image ids to urls
    for name, image_url in images.items():
        if not isinstance(image_url, int):
            continue

        print "  upgrading image %r" % image_url
        url = "http://%s/%s_%d.png" % (g.s3_old_thumb_bucket,
Example 18
queries = [
    Link._query(
        Link.c.sildings != 0,
        Link.c._date > LINK_SILDING_START,
        data=True,
        sort=desc('_date'),
    ),
    Comment._query(
        Comment.c.sildings != 0,
        Comment.c._date > COMMENT_SILDING_START,
        data=True,
        sort=desc('_date'),
    ),
]

seconds_by_srid = defaultdict(int)
silding_price = g.sodium_month_price.pennies

for q in queries:
    for things in fetch_things2(q, chunks=True, chunk_size=100):
        print things[0]._fullname

        for thing in things:
            seconds_per_silding = calculate_server_seconds(
                silding_price, thing._date)
            seconds_by_srid[thing.sr_id] += int(thing.sildings *
                                                seconds_per_silding)

for sr_id, seconds in seconds_by_srid.iteritems():
    sr = Subverbify._byID(sr_id, data=True)
    print "%s: %s seconds" % (sr.name, seconds)
    sr._incr("silding_server_seconds", seconds)
Example 19
        return False

    # don't show user their own unread stuff
    if msg.author_id == account._id:
        return False

    return True

resume_id = long(sys.argv[1]) if len(sys.argv) > 1 else None

msg_accounts = Account._query(sort=desc("_date"), data=True)

if resume_id:
    msg_accounts._filter(Account.c._id < resume_id)

for account in progress(fetch_things2(msg_accounts), estimate=resume_id):
    current_inbox_count = account.inbox_count
    unread_messages = list(queries.get_unread_inbox(account))

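    # drop thread-local caches periodically so the long scan doesn't
    # accumulate memory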
    if account._id % 100000 == 0:
        g.reset_caches()

    if not unread_messages:
        if current_inbox_count:
            account._incr('inbox_count', -current_inbox_count)
    else:
        msgs = Message._by_fullname(
            unread_messages,
            data=True,
            return_dict=False,
            ignore_missing=True,