Exemple #1
0
    def process(link):
        assert link.thing_type == 'link'

        author_id = link.author_id
        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        yield 'user-submitted-%d' % author_id, timestamp, fname
        if not link.spam:
            sr_id = link.sr_id
            ups, downs = link.ups, link.downs

            yield ('sr-hot-all-%d' % sr_id, _hot(ups, downs,
                                                 timestamp), timestamp, fname)
            yield 'sr-new-all-%d' % sr_id, timestamp, fname
            yield 'sr-top-all-%d' % sr_id, score(ups, downs), timestamp, fname
            yield ('sr-controversial-all-%d' % sr_id, controversy(ups, downs),
                   timestamp, fname)
            for time in '1 year', '1 month', '1 week', '1 day', '1 hour':
                if timestamp > epoch_seconds(timeago(time)):
                    tkey = time.split(' ')[1]
                    yield ('sr-top-%s-%d' % (tkey, sr_id), score(ups, downs),
                           timestamp, fname)
                    yield ('sr-controversial-%s-%d' % (tkey, sr_id),
                           controversy(ups, downs), timestamp, fname)
Exemple #2
0
    def make_period_link(interval, date):
        date = date.replace(tzinfo=g.tz)  # won't be necessary after tz fixup
        if interval == "month":
            if date.month != 12:
                end = date.replace(month=date.month + 1)
            else:
                end = date.replace(month=1, year=date.year + 1)
        else:
            end = date + timedelta_by_name(interval)

        query = urllib.urlencode({
            "syntax": "cloudsearch",
            "restrict_sr": "on",
            "sort": "top",
            "q": "timestamp:{:d}..{:d}".format(int(epoch_seconds(date)),
                                               int(epoch_seconds(end))),
        })
        return "/r/%s/search?%s" % (c.site.name, query)
Exemple #3
0
def _get_cutoffs(intervals):
    cutoffs = {}
    for interval in intervals:
        if interval == "all":
            cutoffs["all"] = 0.0
        else:
            cutoffs[interval] = epoch_seconds(timeago("1 %s" % interval))

    return cutoffs
Exemple #4
0
def time_listings(times=('all', )):
    oldests = dict(
        (t, epoch_seconds(timeago('1 %s' % t))) for t in times if t != "all")
    oldests['all'] = epoch_seconds(timeago('10 years'))

    @mr_tools.dataspec_m_thing(
        ("url", str), )
    def process(link):
        assert link.thing_type == 'link'

        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        if not link.spam and not link.deleted:
            if link.url:
                domains = UrlParser(link.url).domain_permutations()
            else:
                domains = []
            ups, downs = link.ups, link.downs

            for tkey, oldest in oldests.iteritems():
                if timestamp > oldest:
                    sc = score(ups, downs)
                    contr = controversy(ups, downs)
                    h = _hot(ups, downs, timestamp)
                    for domain in domains:
                        yield ('domain/top/%s/%s' % (tkey, domain), sc,
                               timestamp, fname)
                        yield ('domain/controversial/%s/%s' % (tkey, domain),
                               contr, timestamp, fname)
                        if tkey == "all":
                            yield ('domain/hot/%s/%s' % (tkey, domain), h,
                                   timestamp, fname)
                            yield ('domain/new/%s/%s' % (tkey, domain),
                                   timestamp, timestamp, fname)

    mr_tools.mr_map(process)
Exemple #5
0
    def _make_item_tuple(self, item):
        """Return an item tuple from the result of a query.

        The item tuple is used to sort the items in a query without having to
        look them up.

        """
        filtered_item = self.filter(item)
        lst = [filtered_item._fullname]
        for col in self.sort_cols:
            # take the property of the original
            attr = getattr(item, col)
            # convert dates to epochs to take less space
            if isinstance(attr, datetime.datetime):
                attr = epoch_seconds(attr)
            lst.append(attr)
        return tuple(lst)
Exemple #6
0
def get_hot_tuples(sr_ids, ageweight=None):
    queries_by_sr_id = {sr_id: _get_links(sr_id, sort='hot', time='all')
                        for sr_id in sr_ids}
    CachedResults.fetch_multi(queries_by_sr_id.values(), stale=True)
    tuples_by_srid = {sr_id: [] for sr_id in sr_ids}

    now_seconds = epoch_seconds(datetime.now(g.tz))

    for sr_id, q in queries_by_sr_id.iteritems():
        if not q.data:
            continue

        hot_factor = get_hot_factor(q.data[0], now_seconds, ageweight)

        for link_name, hot, timestamp in q.data[:MAX_PER_SUBVERBIFY]:
            effective_hot = hot / hot_factor
            # heapq.merge sorts from smallest to largest so we need to flip
            # ehot and hot to get the hottest links first
            tuples_by_srid[sr_id].append(
                (-effective_hot, -hot, link_name, timestamp)
            )

    return tuples_by_srid
Exemple #7
0
def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(datetime.utcnow().replace(tzinfo=g.tz))
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = json.dumps(redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here; complaints about negative
            # TTLs even though they can't possibly be negative, errors from cass
            # that have an explanation of "(why=')"
            # Just going to brute-force this through.  We might lose 100 here and there
            # but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)
Exemple #8
0
def normalized_hot(sr_ids, obey_age_limit=True, ageweight=None):
    timer = g.stats.get_timer("normalized_hot")
    timer.start()

    if not sr_ids:
        return []

    if not feature.is_enabled("scaled_normalized_hot"):
        ageweight = None

    tuples_by_srid = get_hot_tuples(sr_ids, ageweight=ageweight)

    if obey_age_limit:
        cutoff = datetime.now(g.tz) - timedelta(days=g.HOT_PAGE_AGE)
        oldest = epoch_seconds(cutoff)
    else:
        oldest = 0.

    merged = heapq.merge(*tuples_by_srid.values())
    generator = (link_name for ehot, hot, link_name, timestamp in merged
                           if timestamp > oldest)
    ret = list(itertools.islice(generator, MAX_LINKS))
    timer.stop()
    return ret
Exemple #9
0
 def _restrict_recent(recent):
     now = datetime.now(g.tz)
     since = epoch_seconds(now - recent)
     return 'timestamp:%i..' % since