def process(link):
    assert link.thing_type == 'link'

    author_id = link.author_id
    timestamp = link.timestamp
    fname = make_fullname(Link, link.thing_id)

    # every submission goes into its author's profile listing
    yield 'user-submitted-%d' % author_id, timestamp, fname

    if not link.spam:
        sr_id = link.sr_id
        ups, downs = link.ups, link.downs

        yield ('sr-hot-all-%d' % sr_id, _hot(ups, downs, timestamp),
               timestamp, fname)
        yield 'sr-new-all-%d' % sr_id, timestamp, fname
        yield 'sr-top-all-%d' % sr_id, score(ups, downs), timestamp, fname
        yield ('sr-controversial-all-%d' % sr_id, controversy(ups, downs),
               timestamp, fname)

        # time-limited listings only include links newer than their cutoff
        for time in '1 year', '1 month', '1 week', '1 day', '1 hour':
            if timestamp > epoch_seconds(timeago(time)):
                tkey = time.split(' ')[1]
                yield ('sr-top-%s-%d' % (tkey, sr_id),
                       score(ups, downs), timestamp, fname)
                yield ('sr-controversial-%s-%d' % (tkey, sr_id),
                       controversy(ups, downs), timestamp, fname)
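# A hedged sketch of the score/controversy/_hot helpers referenced above,
# assuming they follow the open-source reddit sorts module; the real
# definitions in this codebase may differ (note _hot here is assumed to take
# ups/downs/timestamp rather than a precomputed score).
from math import log10

def score(ups, downs):
    return ups - downs

def controversy(ups, downs):
    # zero for one-sided links; grows with total votes and how evenly split they are
    if downs <= 0 or ups <= 0:
        return 0
    magnitude = ups + downs
    balance = float(downs) / ups if ups > downs else float(ups) / downs
    return magnitude ** balance

def _hot(ups, downs, timestamp):
    # log-scaled net score plus a time bonus, so newer links rank higher
    s = score(ups, downs)
    order = log10(max(abs(s), 1))
    sign = 1 if s > 0 else -1 if s < 0 else 0
    return round(sign * order + (timestamp - 1134028003) / 45000.0, 7)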
def make_period_link(interval, date):
    date = date.replace(tzinfo=g.tz)  # won't be necessary after tz fixup

    if interval == "month":
        # a month isn't a fixed-length timedelta, so advance the month
        # (and possibly the year) fields directly
        if date.month != 12:
            end = date.replace(month=date.month + 1)
        else:
            end = date.replace(month=1, year=date.year + 1)
    else:
        end = date + timedelta_by_name(interval)

    query = urllib.urlencode({
        "syntax": "cloudsearch",
        "restrict_sr": "on",
        "sort": "top",
        "q": "timestamp:{:d}..{:d}".format(int(epoch_seconds(date)),
                                           int(epoch_seconds(end))),
    })
    return "/r/%s/search?%s" % (c.site.name, query)
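# Hypothetical usage sketch (timestamps are elided and urlencode parameter
# order may vary; c.site.name is whatever subreddit the request is for):
#
#   >>> make_period_link("day", datetime(2014, 3, 1, tzinfo=g.tz))
#   '/r/<c.site.name>/search?syntax=cloudsearch&restrict_sr=on&sort=top&q=timestamp%3A<start>..<end>'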
def _get_cutoffs(intervals):
    # map each interval name to the oldest timestamp (in epoch seconds) it
    # should include; "all" has no lower bound
    cutoffs = {}
    for interval in intervals:
        if interval == "all":
            cutoffs["all"] = 0.0
        else:
            cutoffs[interval] = epoch_seconds(timeago("1 %s" % interval))

    return cutoffs
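# A minimal usage sketch, assuming timeago() returns timezone-aware datetimes
# as elsewhere in this code; "links" and its attributes are illustrative only:
#
#   cutoffs = _get_cutoffs(["hour", "day", "week", "month", "year", "all"])
#   for interval, cutoff in cutoffs.iteritems():
#       eligible = [link for link in links if link.timestamp > cutoff]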
def time_listings(times=('all',)):
    oldests = dict((t, epoch_seconds(timeago('1 %s' % t)))
                   for t in times if t != "all")
    oldests['all'] = epoch_seconds(timeago('10 years'))

    @mr_tools.dataspec_m_thing(
        ("url", str),
    )
    def process(link):
        assert link.thing_type == 'link'

        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        if not link.spam and not link.deleted:
            if link.url:
                domains = UrlParser(link.url).domain_permutations()
            else:
                domains = []
            ups, downs = link.ups, link.downs

            for tkey, oldest in oldests.iteritems():
                if timestamp > oldest:
                    sc = score(ups, downs)
                    contr = controversy(ups, downs)
                    h = _hot(ups, downs, timestamp)

                    for domain in domains:
                        yield ('domain/top/%s/%s' % (tkey, domain),
                               sc, timestamp, fname)
                        yield ('domain/controversial/%s/%s' % (tkey, domain),
                               contr, timestamp, fname)

                        if tkey == "all":
                            yield ('domain/hot/%s/%s' % (tkey, domain),
                                   h, timestamp, fname)
                            yield ('domain/new/%s/%s' % (tkey, domain),
                                   timestamp, timestamp, fname)

    mr_tools.mr_map(process)
def _make_item_tuple(self, item):
    """Return an item tuple from the result of a query.

    The item tuple is used to sort the items in a query without having
    to look them up.

    """
    filtered_item = self.filter(item)
    lst = [filtered_item._fullname]
    for col in self.sort_cols:
        # take the property of the original
        attr = getattr(item, col)
        # convert dates to epochs to take less space
        if isinstance(attr, datetime.datetime):
            attr = epoch_seconds(attr)
        lst.append(attr)
    return tuple(lst)
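# A hedged sketch of how these tuples might be consumed (not the actual
# cached-query code): sort on the trailing sort-column values while keeping
# the fullname in position 0 for cheap retrieval.
#
#   tuples = [self._make_item_tuple(item) for item in items]
#   tuples.sort(key=lambda t: t[1:], reverse=True)  # e.g. newest first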
def get_hot_tuples(sr_ids, ageweight=None):
    queries_by_sr_id = {sr_id: _get_links(sr_id, sort='hot', time='all')
                        for sr_id in sr_ids}
    CachedResults.fetch_multi(queries_by_sr_id.values(), stale=True)
    tuples_by_srid = {sr_id: [] for sr_id in sr_ids}
    now_seconds = epoch_seconds(datetime.now(g.tz))

    for sr_id, q in queries_by_sr_id.iteritems():
        if not q.data:
            continue

        hot_factor = get_hot_factor(q.data[0], now_seconds, ageweight)

        for link_name, hot, timestamp in q.data[:MAX_PER_SUBVERBIFY]:
            effective_hot = hot / hot_factor
            # heapq.merge sorts from smallest to largest so we need to flip
            # ehot and hot to get the hottest links first
            tuples_by_srid[sr_id].append(
                (-effective_hot, -hot, link_name, timestamp)
            )

    return tuples_by_srid
def backfill_vote_details(cls):
    ninety_days = timedelta(days=90).total_seconds()
    for chunk in in_chunks(cls._all(), size=100):
        detail_chunk = defaultdict(dict)
        try:
            with VoterIPByThing._cf.batch(
                    write_consistency_level=cls._write_consistency_level) as b:
                for vote_list in chunk:
                    thing_id36 = vote_list._id
                    thing_fullname = vote_list.votee_fullname
                    details = vote_list.decode_details()
                    for detail in details:
                        voter_id36 = detail["voter_id"]
                        if "ip" in detail and detail["ip"]:
                            ip = detail["ip"]
                            redacted = dict(detail)
                            del redacted["ip"]
                            cast = detail["date"]
                            now = epoch_seconds(
                                datetime.utcnow().replace(tzinfo=g.tz))
                            ttl = ninety_days - (now - cast)
                            oneweek = ""
                            if ttl < 3600 * 24 * 7:
                                oneweek = "(<= one week left)"
                            print "Inserting %s with IP ttl %d %s" % (
                                redacted, ttl, oneweek)
                            detail_chunk[thing_id36][voter_id36] = \
                                json.dumps(redacted)
                            if ttl <= 0:
                                print "Skipping bogus ttl for %s: %d" % (
                                    redacted, ttl)
                                continue
                            b.insert(thing_fullname, {voter_id36: ip}, ttl=ttl)
        except Exception:
            # Getting some really weird spurious errors here; complaints about
            # negative TTLs even though they can't possibly be negative, errors
            # from cass that have an explanation of "(why=')"
            # Just going to brute-force this through. We might lose 100 here
            # and there but mostly it'll be intact.
            pass
        for votee_id36, valuedict in detail_chunk.iteritems():
            cls._set_values(votee_id36, valuedict)
def normalized_hot(sr_ids, obey_age_limit=True, ageweight=None):
    timer = g.stats.get_timer("normalized_hot")
    timer.start()

    if not sr_ids:
        return []

    if not feature.is_enabled("scaled_normalized_hot"):
        ageweight = None

    tuples_by_srid = get_hot_tuples(sr_ids, ageweight=ageweight)

    if obey_age_limit:
        cutoff = datetime.now(g.tz) - timedelta(days=g.HOT_PAGE_AGE)
        oldest = epoch_seconds(cutoff)
    else:
        oldest = 0.

    merged = heapq.merge(*tuples_by_srid.values())
    generator = (link_name for ehot, hot, link_name, timestamp in merged
                 if timestamp > oldest)
    ret = list(itertools.islice(generator, MAX_LINKS))
    timer.stop()
    return ret
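# Self-contained illustration of the heapq.merge step above: each
# per-subreddit list is already sorted ascending on (-effective_hot, -hot),
# so merging the lists yields the hottest links first across all subreddits.
# The link names and numbers here are illustrative only.
import heapq

a = [(-3.0, -30, 'link_a1', 1400000000), (-1.0, -10, 'link_a2', 1400050000)]
b = [(-2.5, -25, 'link_b1', 1400010000), (-0.5, -5, 'link_b2', 1400060000)]
hottest_first = [name for _, _, name, _ in heapq.merge(a, b)]
# ['link_a1', 'link_b1', 'link_a2', 'link_b2']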
def _restrict_recent(recent):
    # open-ended cloudsearch range: everything newer than `now - recent`
    now = datetime.now(g.tz)
    since = epoch_seconds(now - recent)
    return 'timestamp:%i..' % since
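# Hypothetical usage sketch: restrict a cloudsearch query to the last week.
# The exact epoch value depends on when the query runs.
#
#   >>> _restrict_recent(timedelta(weeks=1))
#   'timestamp:<epoch seconds one week ago>..'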