Example #1
def subverbify_sitemaps(subverbifys):
    """Create an array of sitemaps.

    Each sitemap has up to 50000 links, being the maximum allowable number of
    links according to the sitemap standard.
    """
    for subverbify_chunks in in_chunks(subverbifys, LINKS_PER_SITEMAP):
        yield _subverbify_sitemap(subverbify_chunks)
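A note on in_chunks: every example on this page feeds an iterable through in_chunks to get fixed-size batches. The real helper lives in v1.lib.utils and is not shown here; the sketch below is only an illustration of the assumed behaviour (lists of at most `size` items, with a shorter final chunk), and its default size is a guess.

def in_chunks(iterable, size=100):
    """Yield lists of at most `size` items from `iterable`.

    Sketch of the assumed behaviour of v1.lib.utils.in_chunks; the real
    implementation (default size, chunk type) may differ.
    """
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        # the final chunk may hold fewer than `size` items
        yield chunk

With LINKS_PER_SITEMAP presumably set to 50000, subverbify_sitemaps above would then yield one sitemap per batch of at most 50000 subverbifys.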
Example #2
    def simple_get_multi(self, keys):
        results = {}
        category_bundles = {}
        for key in keys:
            category, ids = self._split_key(key)
            category_bundles.setdefault(category, []).append(ids)

        for category in category_bundles:
            idses = category_bundles[category]
            chunks = in_chunks(idses, size=50)
            for chunk in chunks:
                new_results = self.backend.get_multi(category, chunk)
                results.update(new_results)

        return results
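simple_get_multi above regroups mixed keys by category so that each backend.get_multi call requests at most 50 ids of a single category, then merges the partial results into one dict. The standalone sketch below shows the same group-then-chunk pattern against a stand-in fetch function; the "category-id" key format and the fetch signature are assumptions for illustration, not the real cache interface.

def get_multi_batched(keys, fetch, batch_size=50):
    """Group "<category>-<id>" keys by category, request each category's
    ids in batches of at most `batch_size`, and merge all results.

    fetch(category, ids) stands in for backend.get_multi and is assumed
    to return a dict of results for the requested ids.
    """
    by_category = {}
    for key in keys:
        category, _, item_id = key.partition('-')
        by_category.setdefault(category, []).append(item_id)

    results = {}
    for category, ids in by_category.items():
        for chunk in in_chunks(ids, size=batch_size):
            results.update(fetch(category, chunk))
    return results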
Example #3
def port_deleted_links(after_id=None):
    from v1.models import Link
    from v1.lib.db.operators import desc
    from v1.models.query_cache import CachedQueryMutator
    from v1.lib.db.queries import get_deleted_links
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._deleted == True,
                    Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, verbosity=1000)

    for chunk in in_chunks(q):
        with CachedQueryMutator() as m:
            for link in chunk:
                query = get_deleted_links(link.author_id)
                m.insert(query, [link])
Example #4
def port_cassaurls(after_id=None, estimate=15231317):
    from v1.models import Link, LinksByUrlAndSubverbify
    from v1.lib.db import tdb_cassandra
    from v1.lib.db.operators import desc
    from v1.lib.db.tdb_cassandra import CL
    from v1.lib.utils import fetch_things2, in_chunks, progress

    q = Link._query(Link.c._spam == (True, False),
                    sort=desc('_date'),
                    data=True)
    if after_id:
        q._after(Link._byID(after_id, data=True))
    q = fetch_things2(q, chunk_size=500)
    q = progress(q, estimate=estimate)
    q = (l for l in q if getattr(l, 'url', 'self') != 'self'
         and not getattr(l, 'is_self', False))
    chunks = in_chunks(q, 500)

    for chunk in chunks:
        for l in chunk:
            LinksByUrlAndSubverbify.add_link(l)
Example #5
def rebuild_link_index(start_at=None,
                       sleeptime=1,
                       cls=Link,
                       uploader=LinkUploader,
                       doc_api='CLOUDSEARCH_DOC_API',
                       estimate=50000000,
                       chunk_size=1000):
    doc_api = getattr(g, doc_api)
    uploader = uploader(doc_api)

    q = cls._query(cls.c._deleted == (True, False), sort=desc('_date'))

    if start_at:
        after = cls._by_fullname(start_at)
        assert isinstance(after, cls)
        q._after(after)

    q = v1utils.fetch_things2(q, chunk_size=chunk_size)
    q = v1utils.progress(q,
                         verbosity=1000,
                         estimate=estimate,
                         persec=True,
                         key=_progress_key)
    for chunk in v1utils.in_chunks(q, size=chunk_size):
        uploader.things = chunk
        for x in range(5):
            try:
                uploader.inject()
            except httplib.HTTPException as err:
                print "Got %s, sleeping %s secs" % (err, x)
                time.sleep(x)
                continue
            else:
                break
        else:
            raise err
        last_update = chunk[-1]
        print "last updated %s" % last_update._fullname
        time.sleep(sleeptime)
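The upload loop above uses Python's for/else idiom: the else clause of the for runs only when the loop finishes without hitting break, i.e. when all five attempts raised httplib.HTTPException, at which point the last exception is re-raised. The same retry-with-growing-sleep shape in isolation (the function name, attempt count, and exception tuple are placeholders, not part of the codebase):

import time

def retry_with_backoff(operation, attempts=5, retry_on=(Exception,)):
    """Call operation() up to `attempts` times, sleeping a little longer
    after each failure, and re-raise the last error if every attempt fails."""
    last_err = None
    for attempt in range(attempts):
        try:
            return operation()  # success: stop retrying immediately
        except retry_on as err:
            last_err = err
            time.sleep(attempt)  # sleeps 0s, 1s, 2s, ... like the loop above
    raise last_err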
Example #6
def _location_by_ips(ips):
    if not hasattr(g, 'geoip_location'):
        g.log.warning("g.geoip_location not set. skipping GeoIP lookup.")
        return {}

    ret = {}
    for batch in in_chunks(ips, MAX_IPS_PER_GROUP):
        ip_string = '+'.join(batch)
        url = os.path.join(g.geoip_location, 'geoip', ip_string)

        try:
            response = urllib2.urlopen(url=url, timeout=3)
            json_data = response.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as e:
            g.log.warning("Failed to fetch GeoIP information: %r" % e)
            continue

        try:
            ret.update(json.loads(json_data))
        except ValueError as e:
            g.log.warning("Invalid JSON response for GeoIP lookup: %r" % e)
            continue

    return ret
Example #7
def _populate(after_id=None, estimate=54301242):
    from v1.models import desc
    from v1.lib.db import tdb_cassandra
    from v1.lib import utils

    # a larger chunk size has a chance of reducing the number of Cassandra
    # writes, but the probability of that is low
    chunk_size = 5000

    q = Comment._query(Comment.c._spam == (True, False),
                       Comment.c._deleted == (True, False),
                       sort=desc('_date'))

    if after_id is not None:
        q._after(Comment._byID(after_id))

    q = utils.fetch_things2(q, chunk_size=chunk_size)
    q = utils.progress(q, verbosity=chunk_size, estimate=estimate)

    for chunk in utils.in_chunks(q, chunk_size):
        chunk = filter(lambda x: hasattr(x, 'link_id'), chunk)
        add_comments(chunk)
Example #8
def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(datetime.utcnow().replace(tzinfo=g.tz))
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = json.dumps(redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here: complaints about
            # negative TTLs even though they can't possibly be negative, and
            # errors from cass whose only explanation is "(why=')".
            # Just brute-force this through.  We might lose 100 here and there
            # but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)
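The TTL arithmetic in backfill_vote_details keeps each IP for 90 days counted from when the vote was cast, not from when the backfill runs, so older votes get proportionally shorter TTLs. A small worked example, assuming a vote cast 85 days ago:

from datetime import timedelta

ninety_days = timedelta(days=90).total_seconds()  # 7776000 seconds
vote_age = timedelta(days=85).total_seconds()     # assumed age of the vote
ttl = ninety_days - vote_age                      # 432000 seconds = 5 days left
assert ttl == timedelta(days=5).total_seconds()
# ttl is under a week here, so the loop above would tag it "(<= one week left)"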
Example #9
def pushup_permacache(verbosity=1000):
    """When putting cassandra into the permacache chain, we need to
       push everything up into the rest of the chain, so this is
       everything that uses the permacache, as of that check-in."""
    from pylons import app_globals as g
    from v1.models import Link, Subverbify, Account
    from v1.lib.db.operators import desc
    from v1.lib.comment_tree import comments_key, messages_key
    from v1.lib.utils import fetch_things2, in_chunks
    from v1.lib.utils import last_modified_key
    from v1.lib.promote import promoted_memo_key
    from v1.lib.subverbify_search import load_all_verbifys
    from v1.lib.db import queries
    from v1.lib.cache import CassandraCacheChain

    authority = g.permacache.caches[-1]
    nonauthority = CassandraCacheChain(g.permacache.caches[1:-1])

    def populate(keys):
        vals = authority.simple_get_multi(keys)
        if vals:
            nonauthority.set_multi(vals)

    def gen_keys():
        yield promoted_memo_key

        # just let this one do its own writing
        load_all_verbifys()

        yield queries.get_all_comments().iden

        l_q = Link._query(
            Link.c._spam == (True, False),
            Link.c._deleted == (True, False),
            sort=desc('_date'),
            data=True,
        )
        for link in fetch_things2(l_q, verbosity):
            yield comments_key(link._id)
            yield last_modified_key(link, 'comments')

        a_q = Account._query(
            Account.c._spam == (True, False),
            sort=desc('_date'),
        )
        for account in fetch_things2(a_q, verbosity):
            yield messages_key(account._id)
            yield last_modified_key(account, 'overview')
            yield last_modified_key(account, 'commented')
            yield last_modified_key(account, 'submitted')
            yield last_modified_key(account, 'liked')
            yield last_modified_key(account, 'disliked')
            yield queries.get_comments(account, 'new', 'all').iden
            yield queries.get_submitted(account, 'new', 'all').iden
            yield queries.get_liked(account).iden
            yield queries.get_disliked(account).iden
            yield queries.get_hidden(account).iden
            yield queries.get_saved(account).iden
            yield queries.get_inbox_messages(account).iden
            yield queries.get_unread_messages(account).iden
            yield queries.get_inbox_comments(account).iden
            yield queries.get_unread_comments(account).iden
            yield queries.get_inbox_selfreply(account).iden
            yield queries.get_unread_selfreply(account).iden
            yield queries.get_sent(account).iden

        sr_q = Subverbify._query(
            Subverbify.c._spam == (True, False),
            sort=desc('_date'),
        )
        for sr in fetch_things2(sr_q, verbosity):
            yield last_modified_key(sr, 'stylesheet_contents')
            yield queries.get_links(sr, 'hot', 'all').iden
            yield queries.get_links(sr, 'new', 'all').iden

            for sort in 'top', 'controversial':
                for time in 'hour', 'day', 'week', 'month', 'year', 'all':
                    yield queries.get_links(sr,
                                            sort,
                                            time,
                                            merge_batched=False).iden
            yield queries.get_spam_links(sr).iden
            yield queries.get_spam_comments(sr).iden
            yield queries.get_reported_links(sr).iden
            yield queries.get_reported_comments(sr).iden
            yield queries.get_subverbify_messages(sr).iden
            yield queries.get_unread_subverbify_messages(sr).iden

    done = 0
    for keys in in_chunks(gen_keys(), verbosity):
        g.reset_caches()
        done += len(keys)
        print 'Done %d: %r' % (done, keys[-1])
        populate(keys)