def link_listings():
    """Emit listing rows for links: the per-author submitted listing plus
    per-subreddit hot/new/top/controversial listings, both all-time and
    for the recent time windows.

    Rows are (listing key, [sort value,] timestamp, fullname) tuples fed
    to mr_tools.mr_map.
    """
    # Compute the window cutoffs once, like the other time_listings jobs
    # in this file, instead of calling epoch_seconds(timeago(...)) for
    # every link processed.
    times = ('year', 'month', 'week', 'day', 'hour')
    oldests = dict((t, epoch_seconds(timeago('1 %s' % t))) for t in times)

    @dataspec_m_thing(('author_id', int), ('sr_id', int))
    def process(link):
        assert link.thing_type == 'link'

        author_id = link.author_id
        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        yield 'user-submitted-%d' % author_id, timestamp, fname

        if not link.spam:
            sr_id = link.sr_id
            ups, downs = link.ups, link.downs
            # score/controversy don't depend on the time window, so
            # compute them once instead of once per window.
            sc = score(ups, downs)
            contr = controversy(ups, downs)

            yield ('sr-hot-all-%d' % sr_id, _hot(ups, downs, timestamp),
                   timestamp, fname)
            yield 'sr-new-all-%d' % sr_id, timestamp, fname
            yield 'sr-top-all-%d' % sr_id, sc, timestamp, fname
            yield ('sr-controversial-all-%d' % sr_id, contr,
                   timestamp, fname)

            # Iterate in the original fixed order to keep emission order
            # stable.
            for tkey in times:
                if timestamp > oldests[tkey]:
                    yield ('sr-top-%s-%d' % (tkey, sr_id),
                           sc, timestamp, fname)
                    yield ('sr-controversial-%s-%d' % (tkey, sr_id),
                           contr, timestamp, fname)

    mr_tools.mr_map(process)
def time_listings(times = ('year','month','week','day','hour')):
    """Emit per-subreddit and per-domain top/controversial listing rows
    for each time window in `times`, skipping spam and deleted links.

    `times` are timeago-style unit names; a link contributes to a window
    only if it is newer than that window's cutoff.
    """
    oldests = dict((t, epoch_seconds(timeago('1 %s' % t))) for t in times)

    @mr_tools.dataspec_m_thing(("url", str),('sr_id', int),)
    def process(link):
        assert link.thing_type == 'link'

        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        if not link.spam and not link.deleted:
            sr_id = link.sr_id

            if link.url:
                domains = UrlParser(link.url).domain_permutations()
            else:
                domains = []

            ups, downs = link.ups, link.downs
            # The sort values are window-independent: compute them once
            # rather than once per window.
            sc = score(ups, downs)
            contr = controversy(ups, downs)

            for tkey, oldest in oldests.iteritems():
                if timestamp > oldest:
                    yield ('sr-top-%s-%d' % (tkey, sr_id),
                           sc, timestamp, fname)
                    yield ('sr-controversial-%s-%d' % (tkey, sr_id),
                           contr, timestamp, fname)
                    for domain in domains:
                        yield ('domain/top/%s/%s' % (tkey, domain),
                               sc, timestamp, fname)
                        yield ('domain/controversial/%s/%s'
                               % (tkey, domain),
                               contr, timestamp, fname)

    mr_tools.mr_map(process)
def year_listings():
    """With an 'all' dump, generate the top and controversial per user
    per year."""
    @mr_tools.dataspec_m_thing(('author_id', int),)
    def process(thing):
        # Deleted things contribute nothing.
        if thing.deleted:
            return
        sc = score(thing.ups, thing.downs)
        contr = controversy(thing.ups, thing.downs)
        cls = Link if thing.thing_type == 'link' else Comment
        fname = make_fullname(cls, thing.thing_id)
        ts = thing.timestamp
        # Bucket by the UTC year the thing was created in.
        year = datetime.datetime.utcfromtimestamp(ts).year
        yield ('user-top-%s-%d' % (year, thing.author_id),
               sc, ts, fname)
        yield ('user-controversial-%s-%d' % (year, thing.author_id),
               contr, ts, fname)
    mr_tools.mr_map(process)
def link_listings():
    """Emit listing rows for links: the per-author submitted listing plus
    per-subreddit hot/new/top/controversial listings, both all-time and
    for the recent time windows.
    """
    # Compute the window cutoffs once, like the other time_listings jobs
    # in this file, instead of calling epoch_seconds(timeago(...)) per
    # link.
    times = ("year", "month", "week", "day", "hour")
    oldests = dict((t, epoch_seconds(timeago("1 %s" % t))) for t in times)

    @dataspec_m_thing(("author_id", int), ("sr_id", int))
    def process(link):
        assert link.thing_type == "link"

        author_id = link.author_id
        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        yield "user-submitted-%d" % author_id, timestamp, fname

        if not link.spam:
            sr_id = link.sr_id
            ups, downs = link.ups, link.downs
            # score/controversy don't depend on the time window; compute
            # them once instead of once per window.
            sc = score(ups, downs)
            contr = controversy(ups, downs)

            yield ("sr-hot-all-%d" % sr_id, _hot(ups, downs, timestamp),
                   timestamp, fname)
            yield "sr-new-all-%d" % sr_id, timestamp, fname
            yield "sr-top-all-%d" % sr_id, sc, timestamp, fname
            yield ("sr-controversial-all-%d" % sr_id, contr,
                   timestamp, fname)

            # Keep the original fixed iteration order.
            for tkey in times:
                if timestamp > oldests[tkey]:
                    yield ("sr-top-%s-%d" % (tkey, sr_id),
                           sc, timestamp, fname)
                    yield ("sr-controversial-%s-%d" % (tkey, sr_id),
                           contr, timestamp, fname)

    mr_tools.mr_map(process)
def time_listings(times = ('year','month','week','day','hour', 'all')):
    """Generate per-author top/controversial listings for every window
    in `times`, plus new/hot listings for the 'all' window."""
    # 'all' has no age cutoff; everything else is cut off at "1 <unit>".
    cutoffs = {}
    for unit in times:
        cutoffs[unit] = 0 if unit == 'all' else \
            epoch_seconds(timeago('1 %s' % unit))

    @mr_tools.dataspec_m_thing(('author_id', int),)
    def process(link):
        assert link.thing_type == 'link'
        ts = link.timestamp
        fullname = make_fullname(Link, link.thing_id)
        if link.spam or link.deleted:
            return
        uid = link.author_id
        nups, ndowns = link.ups, link.downs
        top_val = score(nups, ndowns)
        contr_val = controversy(nups, ndowns)
        hot_val = _hot(nups, ndowns, ts)
        for tkey, cutoff in cutoffs.iteritems():
            if ts <= cutoff:
                continue
            yield ('user-top-%s-%d' % (tkey, uid), top_val, ts, fullname)
            yield ('user-controversial-%s-%d' % (tkey, uid),
                   contr_val, ts, fullname)
            if tkey == 'all':
                # new sorts by timestamp itself; hot gets its own value.
                yield ('user-new-%s-%d' % (tkey, uid), ts, ts, fullname)
                yield ('user-hot-%s-%d' % (tkey, uid),
                       hot_val, ts, fullname)
    mr_tools.mr_map(process)
def time_listings(times=('year', 'month', 'week', 'day', 'hour', 'all')):
    """Per-author listing rows: top/controversial for every window in
    `times`; additionally new/hot for the 'all' window."""
    window_cutoffs = dict((interval,
                           epoch_seconds(timeago('1 %s' % interval)))
                          for interval in times if interval != 'all')
    if 'all' in times:
        # 'all' admits every timestamp.
        window_cutoffs['all'] = 0

    @mr_tools.dataspec_m_thing(('author_id', int),)
    def process(thing):
        assert thing.thing_type == 'link'
        when = thing.timestamp
        fullname = make_fullname(Link, thing.thing_id)
        if not (thing.spam or thing.deleted):
            uid = thing.author_id
            nups, ndowns = thing.ups, thing.downs
            top_val = score(nups, ndowns)
            contr_val = controversy(nups, ndowns)
            hot_val = _hot(nups, ndowns, when)
            for interval, cutoff in window_cutoffs.iteritems():
                if when > cutoff:
                    yield ('user-top-%s-%d' % (interval, uid),
                           top_val, when, fullname)
                    yield ('user-controversial-%s-%d' % (interval, uid),
                           contr_val, when, fullname)
                    if interval == 'all':
                        yield ('user-new-%s-%d' % (interval, uid),
                               when, when, fullname)
                        yield ('user-hot-%s-%d' % (interval, uid),
                               hot_val, when, fullname)
    mr_tools.mr_map(process)
def comment_listings():
    """Emit the per-user commented listing row for every comment."""
    @dataspec_m_thing(("author_id", int))
    def process(comment):
        assert comment.thing_type == "comment"
        key = "user-commented-%d" % comment.author_id
        fullname = make_fullname(Comment, comment.thing_id)
        yield key, comment.timestamp, fullname
    mr_tools.mr_map(process)
def listings():
    """Map every link that carries a URL to its by-url listing key."""
    @mr_tools.dataspec_m_thing(("url", str),)
    def process(link):
        # Links without a URL (e.g. self posts) are skipped.
        if not link.url:
            return
        yield Link.by_url_key_new(link.url), link.timestamp, link.thing_id
    mr_tools.mr_map(process)
def time_listings(self, intervals):
    """Run the time-listing map job over this object's input stream,
    delegating per-thing row generation to self.time_listing_iter."""
    cutoffs = self._get_cutoffs(intervals)

    @mr_tools.dataspec_m_thing(*self.fields.items())
    def process(thing):
        return self.time_listing_iter(thing, cutoffs)

    mr_tools.mr_map(process, fd=self.fd, out=self.out)
def rel_listings(names, thing2_cls = Link):
    """Emit listing rows for rels whose name appears in `names`.

    `names` maps a rel name to a listing label, e.g.
    {'1': 'liked', '-1': 'disliked'}.
    """
    @dataspec_m_rel()
    def process(rel):
        if rel.name not in names:
            return
        label = names[rel.name]
        fullname = make_fullname(thing2_cls, rel.thing2_id)
        yield '%s-%s' % (label, rel.thing1_id), rel.timestamp, fullname
    mr_tools.mr_map(process)
def listings():
    """Generate (by-url key, timestamp, thing_id) rows for links that
    have a URL."""
    @mr_tools.dataspec_m_thing(
        ("url", str),
    )
    def process(link):
        url = link.url
        if url:
            key = Link.by_url_key_new(url)
            yield key, link.timestamp, link.thing_id
    mr_tools.mr_map(process)
def comment_listings():
    """Per-user 'commented' listing rows, one per comment."""
    @dataspec_m_thing(('author_id', int),)
    def process(comment):
        assert comment.thing_type == 'comment'
        fullname = make_fullname(Comment, comment.thing_id)
        yield ('user-commented-%d' % comment.author_id,
               comment.timestamp,
               fullname)
    mr_tools.mr_map(process)
def rel_listings(names, thing2_cls=Link):
    """Emit listing rows for relations whose name is present in `names`
    (e.g. {'1': 'liked', '-1': 'disliked'})."""
    @dataspec_m_rel()
    def process(rel):
        name = rel.name
        if name in names:
            fullname = make_fullname(thing2_cls, rel.thing2_id)
            yield ("%s-%s" % (names[name], rel.thing1_id),
                   rel.timestamp,
                   fullname)
    mr_tools.mr_map(process)
def time_listings(intervals):
    """Emit user, subreddit, and domain top/controversial listing rows
    for every interval whose cutoff the thing's timestamp passes."""
    cutoff_by_interval = _get_cutoffs(intervals)

    @mr_tools.dataspec_m_thing(
        ("url", str),
        ("sr_id", int),
        ("author_id", int),
    )
    def process(thing):
        if thing.deleted:
            return

        kind = thing.thing_type
        fname = make_fullname(thingcls_by_name[kind], thing.thing_id)
        ts = thing.timestamp
        sc = score(thing.ups, thing.downs)
        contr = controversy(thing.ups, thing.downs)

        for interval, cutoff in cutoff_by_interval.iteritems():
            if ts < cutoff:
                continue

            yield ("user/%s/top/%s/%d" % (kind, interval, thing.author_id),
                   sc, ts, fname)
            yield ("user/%s/controversial/%s/%d"
                   % (kind, interval, thing.author_id),
                   contr, ts, fname)

            # Spam still counts toward the author's own listings but is
            # excluded from the public sr/domain ones; only links get
            # sr/domain rows.
            if thing.spam or kind != "link":
                continue

            yield ("sr/link/top/%s/%d" % (interval, thing.sr_id),
                   sc, ts, fname)
            yield ("sr/link/controversial/%s/%d" % (interval, thing.sr_id),
                   contr, ts, fname)

            if not thing.url:
                continue
            try:
                parsed = UrlParser(thing.url)
            except ValueError:
                continue
            for domain in parsed.domain_permutations():
                yield ("domain/link/top/%s/%s" % (interval, domain),
                       sc, ts, fname)
                yield ("domain/link/controversial/%s/%s"
                       % (interval, domain),
                       contr, ts, fname)

    mr_tools.mr_map(process)
def time_listings(times=('all', )):
    """Emit per-domain listing rows for non-spam, non-deleted links:
    top/upvote/controversial for every window in `times`, plus hot/new
    for the 'all' window."""
    oldests = dict(
        (t, epoch_seconds(timeago('1 %s' % t)))
        for t in times
        if t != "all")
    # 'all' still gets a finite cutoff for the dump.
    oldests['all'] = epoch_seconds(timeago('10 years'))

    @mr_tools.dataspec_m_thing(
        ("url", str),
    )
    def process(link):
        assert link.thing_type == 'link'

        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        if not link.spam and not link.deleted:
            if link.url:
                domains = UrlParser(link.url).domain_permutations()
            else:
                domains = []

            ups, downs = link.ups, link.downs
            # Sort values are window-independent: compute them once,
            # outside the loop.
            # BUG FIX: the local was previously named `upvotes`
            # ("upvotes = upvotes(ups)"), which shadowed the upvotes()
            # helper and raised UnboundLocalError on first use; it needs
            # a distinct name.
            sc = score(ups, downs)
            contr = controversy(ups, downs)
            h = _hot(ups, downs, timestamp)
            upvote_count = upvotes(ups)

            for tkey, oldest in oldests.iteritems():
                if timestamp > oldest:
                    for domain in domains:
                        yield ('domain/top/%s/%s' % (tkey, domain),
                               sc, timestamp, fname)
                        yield ('domain/%s/%s/%s'
                               % (g.voting_upvote_path, tkey, domain),
                               upvote_count, timestamp, fname)
                        yield ('domain/%s/%s/%s'
                               % (g.voting_controversial_path, tkey,
                                  domain),
                               contr, timestamp, fname)
                        if tkey == "all":
                            yield ('domain/hot/%s/%s' % (tkey, domain),
                                   h, timestamp, fname)
                            yield ('domain/new/%s/%s' % (tkey, domain),
                                   timestamp, timestamp, fname)

    mr_tools.mr_map(process)
def time_listings(times = ('all',)):
    """Per-domain top/controversial rows for each window in `times`,
    plus hot/new rows for the 'all' window; spam and deleted links are
    skipped."""
    oldests = dict((t, epoch_seconds(timeago('1 %s' % t)))
                   for t in times if t != "all")
    # 'all' still gets a finite cutoff for the dump.
    oldests['all'] = epoch_seconds(timeago('10 years'))

    @mr_tools.dataspec_m_thing(("url", str),)
    def process(link):
        assert link.thing_type == 'link'
        ts = link.timestamp
        fullname = make_fullname(Link, link.thing_id)
        if link.spam or link.deleted:
            return
        domains = (UrlParser(link.url).domain_permutations()
                   if link.url else [])
        nups, ndowns = link.ups, link.downs
        for tkey, oldest in oldests.iteritems():
            if ts <= oldest:
                continue
            top_val = score(nups, ndowns)
            contr_val = controversy(nups, ndowns)
            hot_val = _hot(nups, ndowns, ts)
            for domain in domains:
                yield ('domain/top/%s/%s' % (tkey, domain),
                       top_val, ts, fullname)
                yield ('domain/controversial/%s/%s' % (tkey, domain),
                       contr_val, ts, fullname)
                if tkey == "all":
                    yield ('domain/hot/%s/%s' % (tkey, domain),
                           hot_val, ts, fullname)
                    yield ('domain/new/%s/%s' % (tkey, domain),
                           ts, ts, fullname)
    mr_tools.mr_map(process)
def year_listings():
    """With an 'all' dump, generate the top and controversial per user
    per year."""
    @mr_tools.dataspec_m_thing(('author_id', int),)
    def process(link):
        if link.deleted:
            return
        uid = link.author_id
        top_val = score(link.ups, link.downs)
        contr_val = controversy(link.ups, link.downs)
        # Links and comments live under different fullname prefixes.
        if link.thing_type == 'link':
            fullname = make_fullname(Link, link.thing_id)
        else:
            fullname = make_fullname(Comment, link.thing_id)
        ts = link.timestamp
        yr = datetime.datetime.utcfromtimestamp(ts).year
        yield ('user-top-%s-%d' % (yr, uid), top_val, ts, fullname)
        yield ('user-controversial-%s-%d' % (yr, uid),
               contr_val, ts, fullname)
    mr_tools.mr_map(process)
def time_listings(times = ('year','month','week','day','hour')):
    """Per-subreddit top/controversial listing rows for each time window
    in `times`; spam links are skipped."""
    cutoffs = dict((t, epoch_seconds(timeago('1 %s' % t))) for t in times)

    @mr_tools.dataspec_m_thing(('sr_id', int),)
    def process(link):
        assert link.thing_type == 'link'
        ts = link.timestamp
        fullname = make_fullname(Link, link.thing_id)
        if link.spam:
            return
        sr = link.sr_id
        nups, ndowns = link.ups, link.downs
        for tkey, cutoff in cutoffs.iteritems():
            if ts <= cutoff:
                continue
            yield ('sr-top-%s-%d' % (tkey, sr),
                   score(nups, ndowns), ts, fullname)
            yield ('sr-controversial-%s-%d' % (tkey, sr),
                   controversy(nups, ndowns), ts, fullname)
    mr_tools.mr_map(process)
def combine_links():
    """Emit every sort value for a thing, keyed by its parent link's
    id36 as '<link_id36>_<sort>' — one row per sort order."""
    @mr_tools.dataspec_m_thing(('link_id', int))
    def _process(t):
        id36 = to36(t.thing_id)
        prefix = to36(t.link_id)
        ups, downs, timestamp = t.ups, t.downs, t.timestamp
        # (sort name, sort value) in the original emission order.
        sort_values = (
            ('controversy', sorts.controversy(ups, downs)),
            ('hot', sorts._hot(ups, downs, timestamp)),
            ('confidence', sorts.confidence(ups, downs)),
            ('score', sorts.score(ups, downs)),
            ('date', timestamp),
        )
        for sort_name, value in sort_values:
            yield '%s_%s' % (prefix, sort_name), id36, value
    return mr_tools.mr_map(_process)