Exemple #1
0
def link_listings():
    """Pass a link mapper to ``mr_tools.mr_map``.

    For each link the mapper yields a ``user-submitted-<author_id>`` row.
    Unless the link is flagged as spam it also yields per-subreddit ``hot``,
    ``new``, ``top`` and ``controversial`` rows keyed ``all``, plus ``top``
    and ``controversial`` rows for every period (year, month, week, day,
    hour) whose cutoff the link's timestamp exceeds.
    """
    periods = '1 year', '1 month', '1 week', '1 day', '1 hour'

    @dataspec_m_thing(('author_id', int), ('sr_id', int))
    def process(link):
        assert link.thing_type == 'link'

        submitter = link.author_id
        created = link.timestamp
        fullname = make_fullname(Link, link.thing_id)

        yield 'user-submitted-%d' % submitter, created, fullname

        # Spam links only get the per-user row above.
        if link.spam:
            return

        subreddit = link.sr_id
        n_up, n_down = link.ups, link.downs

        hot_key = 'sr-hot-all-%d' % subreddit
        yield hot_key, _hot(n_up, n_down, created), created, fullname

        yield 'sr-new-all-%d' % subreddit, created, fullname

        top_key = 'sr-top-all-%d' % subreddit
        yield top_key, score(n_up, n_down), created, fullname

        contr_key = 'sr-controversial-all-%d' % subreddit
        yield contr_key, controversy(n_up, n_down), created, fullname

        for period in periods:
            # The cutoff is recomputed for every link and period.
            if not created > epoch_seconds(timeago(period)):
                continue

            # '1 year' -> 'year'
            unit = period.split(' ')[1]

            top_key = 'sr-top-%s-%d' % (unit, subreddit)
            yield top_key, score(n_up, n_down), created, fullname

            contr_key = 'sr-controversial-%s-%d' % (unit, subreddit)
            yield contr_key, controversy(n_up, n_down), created, fullname

    mr_tools.mr_map(process)
Exemple #2
0
def link_listings():
    """Map link records to user and subreddit listing rows.

    The inner generator is handed to ``mr_tools.mr_map``. Every link produces
    a ``user-submitted`` row; links not flagged as spam additionally produce
    subreddit ``hot``/``new``/``top``/``controversial`` rows for ``all`` and
    ``top``/``controversial`` rows for each recent period the link falls in.
    """
    @dataspec_m_thing(('author_id', int), ('sr_id', int))
    def process(link):
        assert link.thing_type == 'link'

        author = link.author_id
        ts = link.timestamp
        name = make_fullname(Link, link.thing_id)

        yield 'user-submitted-%d' % author, ts, name

        if link.spam:
            # Nothing beyond the per-user row is emitted for spam.
            return

        sr = link.sr_id
        up_count, down_count = link.ups, link.downs

        yield ('sr-hot-all-%d' % sr,
               _hot(up_count, down_count, ts),
               ts,
               name)
        yield 'sr-new-all-%d' % sr, ts, name
        yield ('sr-top-all-%d' % sr,
               score(up_count, down_count),
               ts,
               name)
        yield ('sr-controversial-all-%d' % sr,
               controversy(up_count, down_count),
               ts,
               name)

        for span in ('1 year', '1 month', '1 week', '1 day', '1 hour'):
            if ts > epoch_seconds(timeago(span)):
                # Second word of the span ('year', 'month', ...) keys the row.
                unit = span.split(' ')[1]
                yield ('sr-top-%s-%d' % (unit, sr),
                       score(up_count, down_count),
                       ts,
                       name)
                yield ('sr-controversial-%s-%d' % (unit, sr),
                       controversy(up_count, down_count),
                       ts,
                       name)

    mr_tools.mr_map(process)
Exemple #3
0
def time_listings(times=('year', 'month', 'week', 'day', 'hour')):
    """Emit time-bucketed top/controversial rows per subreddit and domain.

    ``times`` names the buckets; the cutoff for each is
    ``epoch_seconds(timeago('1 <name>'))``, computed once up front. Links
    flagged as spam or deleted produce no rows.
    """
    cutoffs = {}
    for unit in times:
        cutoffs[unit] = epoch_seconds(timeago('1 %s' % unit))

    @mr_tools.dataspec_m_thing(("url", str), ('sr_id', int),)
    def process(link):
        assert link.thing_type == 'link'

        created = link.timestamp
        fullname = make_fullname(Link, link.thing_id)

        if link.spam or link.deleted:
            return

        subreddit = link.sr_id
        # Links without a url contribute no domain rows.
        domains = (UrlParser(link.url).domain_permutations()
                   if link.url else [])
        n_up, n_down = link.ups, link.downs

        for unit, cutoff in cutoffs.iteritems():
            if not created > cutoff:
                continue

            top_value = score(n_up, n_down)
            contr_value = controversy(n_up, n_down)

            sr_top = 'sr-top-%s-%d' % (unit, subreddit)
            yield sr_top, top_value, created, fullname

            sr_contr = 'sr-controversial-%s-%d' % (unit, subreddit)
            yield sr_contr, contr_value, created, fullname

            for domain in domains:
                dom_top = 'domain/top/%s/%s' % (unit, domain)
                yield dom_top, top_value, created, fullname

                dom_contr = 'domain/controversial/%s/%s' % (unit, domain)
                yield dom_contr, contr_value, created, fullname

    mr_tools.mr_map(process)
Exemple #4
0
def time_listings(times=('year', 'month', 'week', 'day', 'hour')):
    """Run a map job yielding per-period subreddit and domain listing rows.

    One cutoff per entry of ``times`` is computed before mapping. For each
    non-spam, non-deleted link newer than a cutoff, the mapper yields
    ``sr-top``/``sr-controversial`` rows and, for every domain permutation
    of the link's url, ``domain/top``/``domain/controversial`` rows.
    """
    oldest_by_period = dict(
        (period, epoch_seconds(timeago('1 %s' % period)))
        for period in times
    )

    @mr_tools.dataspec_m_thing(("url", str), ('sr_id', int),)
    def process(link):
        assert link.thing_type == 'link'

        ts = link.timestamp
        name = make_fullname(Link, link.thing_id)

        skip = link.spam or link.deleted
        if skip:
            return

        sr = link.sr_id
        if not link.url:
            domain_list = []
        else:
            domain_list = UrlParser(link.url).domain_permutations()
        up_count, down_count = link.ups, link.downs

        for period, oldest in oldest_by_period.iteritems():
            if ts > oldest:
                points = score(up_count, down_count)
                dispute = controversy(up_count, down_count)

                yield ('sr-top-%s-%d' % (period, sr),
                       points, ts, name)
                yield ('sr-controversial-%s-%d' % (period, sr),
                       dispute, ts, name)

                for dom in domain_list:
                    yield ('domain/top/%s/%s' % (period, dom),
                           points, ts, name)
                    yield ('domain/controversial/%s/%s' % (period, dom),
                           dispute, ts, name)

    mr_tools.mr_map(process)
Exemple #5
0
def year_listings():
    """
    With an 'all' dump, generate the top and controversial per user per year

    Deleted things are skipped. The year is taken from the thing's timestamp
    interpreted as UTC.
    """
    @mr_tools.dataspec_m_thing(
        ('author_id', int), )
    def process(link):
        if link.deleted:
            return

        author = link.author_id
        n_up = link.ups
        n_down = link.downs
        top_value = score(n_up, n_down)
        contr_value = controversy(n_up, n_down)

        # Anything that is not a link is given a Comment fullname.
        thing_cls = Link if link.thing_type == 'link' else Comment
        fullname = make_fullname(thing_cls, link.thing_id)

        created = link.timestamp
        year = datetime.datetime.utcfromtimestamp(created).year

        top_key = 'user-top-%s-%d' % (year, author)
        yield top_key, top_value, created, fullname

        contr_key = 'user-controversial-%s-%d' % (year, author)
        yield contr_key, contr_value, created, fullname

    mr_tools.mr_map(process)
def link_listings():
    """Feed link records through ``mr_tools.mr_map`` to build listing rows.

    Each link yields a ``user-submitted`` row. Links not flagged as spam
    also yield subreddit ``hot``, ``new``, ``top`` and ``controversial``
    rows for ``all``, and ``top``/``controversial`` rows for each of year,
    month, week, day and hour that the link is recent enough for.
    """
    windows = ("1 year", "1 month", "1 week", "1 day", "1 hour")

    @dataspec_m_thing(("author_id", int), ("sr_id", int))
    def process(link):
        assert link.thing_type == "link"

        poster = link.author_id
        stamp = link.timestamp
        fullname = make_fullname(Link, link.thing_id)

        yield "user-submitted-%d" % poster, stamp, fullname

        if link.spam:
            # Spam links stop after the per-user row.
            return

        community = link.sr_id
        plus, minus = link.ups, link.downs

        hot_key = "sr-hot-all-%d" % community
        yield (hot_key, _hot(plus, minus, stamp), stamp, fullname)

        yield "sr-new-all-%d" % community, stamp, fullname

        top_key = "sr-top-all-%d" % community
        yield top_key, score(plus, minus), stamp, fullname

        contr_key = "sr-controversial-all-%d" % community
        yield (contr_key, controversy(plus, minus), stamp, fullname)

        for window in windows:
            if not stamp > epoch_seconds(timeago(window)):
                continue

            # "1 week" -> "week"
            tkey = window.split(" ")[1]

            top_key = "sr-top-%s-%d" % (tkey, community)
            yield (top_key, score(plus, minus), stamp, fullname)

            contr_key = "sr-controversial-%s-%d" % (tkey, community)
            yield (contr_key, controversy(plus, minus), stamp, fullname)

    mr_tools.mr_map(process)
Exemple #7
0
def time_listings(times=('year', 'month', 'week', 'day', 'hour', 'all')):
    """Emit per-user top/controversial rows for each requested period.

    Every entry of ``times`` other than ``'all'`` gets a cutoff of
    ``epoch_seconds(timeago('1 <name>'))``; ``'all'``, when requested, uses
    a cutoff of 0 and additionally produces ``user-new`` and ``user-hot``
    rows. Links flagged as spam or deleted produce nothing.
    """
    cutoffs = {}
    for unit in times:
        if unit != 'all':
            cutoffs[unit] = epoch_seconds(timeago('1 %s' % unit))
    if 'all' in times:
        cutoffs['all'] = 0

    @mr_tools.dataspec_m_thing(('author_id', int),)
    def process(link):
        assert link.thing_type == 'link'

        created = link.timestamp
        fullname = make_fullname(Link, link.thing_id)

        if link.spam or link.deleted:
            return

        author = link.author_id
        n_up, n_down = link.ups, link.downs

        top_value = score(n_up, n_down)
        contr_value = controversy(n_up, n_down)
        hot_value = _hot(n_up, n_down, created)

        for unit, cutoff in cutoffs.iteritems():
            if not created > cutoff:
                continue

            top_key = 'user-top-%s-%d' % (unit, author)
            yield top_key, top_value, created, fullname

            contr_key = 'user-controversial-%s-%d' % (unit, author)
            yield contr_key, contr_value, created, fullname

            # new/hot listings exist only for the 'all' bucket.
            if unit != 'all':
                continue

            new_key = 'user-new-%s-%d' % (unit, author)
            yield new_key, created, created, fullname

            hot_key = 'user-hot-%s-%d' % (unit, author)
            yield hot_key, hot_value, created, fullname

    mr_tools.mr_map(process)
Exemple #8
0
def time_listings(times=('year', 'month', 'week', 'day', 'hour', 'all')):
    """Run a map job producing per-user listing rows bucketed by period.

    Cutoffs are computed once: ``'1 <period>'`` ago for each named period
    and 0 for ``'all'`` if it is in ``times``. For every non-spam,
    non-deleted link newer than a cutoff the mapper yields ``user-top`` and
    ``user-controversial`` rows; the ``'all'`` bucket also yields
    ``user-new`` and ``user-hot`` rows.
    """
    named_periods = [period for period in times if period != 'all']
    oldest_by_period = dict(
        (period, epoch_seconds(timeago('1 %s' % period)))
        for period in named_periods
    )
    if 'all' in times:
        oldest_by_period['all'] = 0

    @mr_tools.dataspec_m_thing(
        ('author_id', int), )
    def process(link):
        assert link.thing_type == 'link'

        ts = link.timestamp
        name = make_fullname(Link, link.thing_id)

        if link.spam or link.deleted:
            return

        author = link.author_id
        up_count, down_count = link.ups, link.downs

        points = score(up_count, down_count)
        dispute = controversy(up_count, down_count)
        heat = _hot(up_count, down_count, ts)

        for period, oldest in oldest_by_period.iteritems():
            if ts > oldest:
                yield ('user-top-%s-%d' % (period, author),
                       points, ts, name)
                yield ('user-controversial-%s-%d' % (period, author),
                       dispute, ts, name)

                if period == 'all':
                    # The new listing is sorted by the timestamp itself.
                    yield ('user-new-%s-%d' % (period, author),
                           ts, ts, name)
                    yield ('user-hot-%s-%d' % (period, author),
                           heat, ts, name)

    mr_tools.mr_map(process)
def comment_listings():
    """Map each comment to a ``user-commented-<author_id>`` listing row."""
    @dataspec_m_thing(("author_id", int))
    def process(comment):
        assert comment.thing_type == "comment"

        # Row layout: (listing key, timestamp, comment fullname).
        key = "user-commented-%d" % comment.author_id
        created = comment.timestamp
        yield key, created, make_fullname(Comment, comment.thing_id)

    mr_tools.mr_map(process)
Exemple #10
0
def listings():
    """Emit one by-url row per link that has a url."""
    @mr_tools.dataspec_m_thing(("url", str),)
    def process(link):
        # Links with an empty url produce no row.
        if not link.url:
            return

        url_key = Link.by_url_key_new(link.url)
        yield url_key, link.timestamp, link.thing_id

    mr_tools.mr_map(process)
Exemple #11
0
    def time_listings(self, intervals):
        """Run the time-listing map job over ``self.fd``, writing to ``self.out``.

        Cutoffs for ``intervals`` come from ``self._get_cutoffs``; each
        thing is expanded by ``self.time_listing_iter``. The data spec for
        the mapper is built from ``self.fields``.
        """
        cutoffs = self._get_cutoffs(intervals)
        field_spec = self.fields.items()

        @mr_tools.dataspec_m_thing(*field_spec)
        def process(thing):
            # Delegate row generation to the instance.
            rows = self.time_listing_iter(thing, cutoffs)
            return rows

        mr_tools.mr_map(process, fd=self.fd, out=self.out)
Exemple #12
0
    def time_listings(self, intervals):
        """Map things to time-bucketed listing rows.

        Resolves per-interval cutoffs once via ``self._get_cutoffs`` and
        then hands a mapper to ``mr_tools.mr_map`` with this instance's
        ``fd`` and ``out``. The mapper returns whatever
        ``self.time_listing_iter`` produces for the thing.
        """
        interval_cutoffs = self._get_cutoffs(intervals)
        # (name, type) pairs taken from self.fields.
        pairs = self.fields.items()

        @mr_tools.dataspec_m_thing(*pairs)
        def process(thing):
            return self.time_listing_iter(thing, interval_cutoffs)

        mr_tools.mr_map(
            process,
            fd=self.fd,
            out=self.out,
        )
Exemple #13
0
def rel_listings(names, thing2_cls=Link):
    """Emit a listing row for every relation whose name appears in ``names``.

    ``names`` maps a relation name to the listing prefix to use, for example
    {'1': 'liked', '-1': 'disliked'}. ``thing2_cls`` is the class used to
    build the fullname of the relation's second thing.
    """
    @dataspec_m_rel()
    def process(rel):
        # Relations with names outside the mapping are ignored.
        if rel.name not in names:
            return

        key = '%s-%s' % (names[rel.name], rel.thing1_id)
        yield key, rel.timestamp, make_fullname(thing2_cls, rel.thing2_id)

    mr_tools.mr_map(process)
Exemple #14
0
def listings():
    """Run a map job keyed by ``Link.by_url_key_new`` for links with a url."""
    @mr_tools.dataspec_m_thing(("url", str),)
    def process(link):
        has_url = bool(link.url)
        if has_url:
            # Row layout: (by-url key, timestamp, thing id).
            yield (
                Link.by_url_key_new(link.url),
                link.timestamp,
                link.thing_id,
            )

    mr_tools.mr_map(process)
Exemple #15
0
def comment_listings():
    """Emit a ``user-commented`` row, keyed by author id, for each comment."""
    @dataspec_m_thing(('author_id', int),)
    def process(comment):
        assert comment.thing_type == 'comment'

        listing_key = 'user-commented-%d' % comment.author_id
        # Row layout: (listing key, timestamp, comment fullname).
        yield (listing_key,
               comment.timestamp,
               make_fullname(Comment, comment.thing_id))

    mr_tools.mr_map(process)
def rel_listings(names, thing2_cls=Link):
    """Map relations to listing rows keyed by ``<prefix>-<thing1_id>``.

    ``names`` supplies the prefix per relation name, for example
    {'1': 'liked', '-1': 'disliked'}; relations whose name is not a key of
    ``names`` yield nothing. The second thing's fullname is built with
    ``thing2_cls``.
    """
    @dataspec_m_rel()
    def process(rel):
        if rel.name in names:
            prefix = names[rel.name]
            listing_key = "%s-%s" % (prefix, rel.thing1_id)
            yield (
                listing_key,
                rel.timestamp,
                make_fullname(thing2_cls, rel.thing2_id),
            )

    mr_tools.mr_map(process)
Exemple #17
0
def comment_listings():
    """Run a map job that lists every comment under its author."""
    @dataspec_m_thing(('author_id', int),)
    def process(comment):
        assert comment.thing_type == 'comment'

        author_key = 'user-commented-%d' % comment.author_id
        posted = comment.timestamp
        fullname = make_fullname(Comment, comment.thing_id)
        yield author_key, posted, fullname

    mr_tools.mr_map(process)
Exemple #18
0
def time_listings(intervals):
    """Emit per-interval top/controversial rows for users, subreddits, domains.

    Cutoffs come from ``_get_cutoffs(intervals)``. Deleted things yield
    nothing. For every interval whose cutoff the thing's timestamp is not
    below, user rows are always yielded; subreddit and domain rows are
    yielded only for links not flagged as spam.
    """
    cutoff_by_interval = _get_cutoffs(intervals)

    @mr_tools.dataspec_m_thing(
        ("url", str),
        ("sr_id", int),
        ("author_id", int),
    )
    def process(thing):
        if thing.deleted:
            return

        cls = thingcls_by_name[thing.thing_type]
        fullname = make_fullname(cls, thing.thing_id)
        top_value = score(thing.ups, thing.downs)
        contr_value = controversy(thing.ups, thing.downs)

        for interval, cutoff in cutoff_by_interval.iteritems():
            if thing.timestamp < cutoff:
                continue

            user_top = "user/%s/top/%s/%d" % (
                thing.thing_type, interval, thing.author_id)
            yield user_top, top_value, thing.timestamp, fullname

            user_contr = "user/%s/controversial/%s/%d" % (
                thing.thing_type, interval, thing.author_id)
            yield user_contr, contr_value, thing.timestamp, fullname

            # Only non-spam links reach the subreddit and domain listings.
            if thing.spam or thing.thing_type != "link":
                continue

            sr_top = "sr/link/top/%s/%d" % (interval, thing.sr_id)
            yield sr_top, top_value, thing.timestamp, fullname

            sr_contr = "sr/link/controversial/%s/%d" % (
                interval, thing.sr_id)
            yield sr_contr, contr_value, thing.timestamp, fullname

            if not thing.url:
                continue

            try:
                parsed = UrlParser(thing.url)
            except ValueError:
                # Unparseable url: no domain rows for this interval.
                continue

            for domain in parsed.domain_permutations():
                dom_top = "domain/link/top/%s/%s" % (interval, domain)
                yield dom_top, top_value, thing.timestamp, fullname

                dom_contr = "domain/link/controversial/%s/%s" % (
                    interval, domain)
                yield dom_contr, contr_value, thing.timestamp, fullname

    mr_tools.mr_map(process)
Exemple #19
0
def time_listings(intervals):
    """Run a map job building interval-bucketed listing rows.

    For each non-deleted thing and each interval from
    ``_get_cutoffs(intervals)`` that the thing is recent enough for, the
    mapper yields ``user/...`` top and controversial rows. Links not flagged
    as spam additionally yield ``sr/link/...`` rows and, when the url can be
    parsed, ``domain/link/...`` rows for every domain permutation.
    """
    cutoff_by_interval = _get_cutoffs(intervals)

    @mr_tools.dataspec_m_thing(
        ("url", str),
        ("sr_id", int),
        ("author_id", int),
    )
    def process(thing):
        if thing.deleted:
            return

        kind_cls = thingcls_by_name[thing.thing_type]
        name = make_fullname(kind_cls, thing.thing_id)
        points = score(thing.ups, thing.downs)
        dispute = controversy(thing.ups, thing.downs)

        for interval, cutoff in cutoff_by_interval.iteritems():
            too_old = thing.timestamp < cutoff
            if too_old:
                continue

            yield (
                "user/%s/top/%s/%d" % (thing.thing_type, interval, thing.author_id),
                points,
                thing.timestamp,
                name,
            )
            yield (
                "user/%s/controversial/%s/%d" % (thing.thing_type, interval, thing.author_id),
                dispute,
                thing.timestamp,
                name,
            )

            if thing.spam:
                continue

            if thing.thing_type != "link":
                # Non-link things only get the user rows above.
                continue

            yield (
                "sr/link/top/%s/%d" % (interval, thing.sr_id),
                points,
                thing.timestamp,
                name,
            )
            yield (
                "sr/link/controversial/%s/%d" % (interval, thing.sr_id),
                dispute,
                thing.timestamp,
                name,
            )

            if thing.url:
                try:
                    parsed_url = UrlParser(thing.url)
                except ValueError:
                    # Skip domain rows when the url cannot be parsed.
                    continue

                for dom in parsed_url.domain_permutations():
                    yield (
                        "domain/link/top/%s/%s" % (interval, dom),
                        points,
                        thing.timestamp,
                        name,
                    )
                    yield (
                        "domain/link/controversial/%s/%s" % (interval, dom),
                        dispute,
                        thing.timestamp,
                        name,
                    )

    mr_tools.mr_map(process)
Exemple #20
0
def time_listings(times=('all', )):
    """Emit per-domain listing rows bucketed by time period.

    Each entry of ``times`` other than "all" gets a cutoff of
    ``epoch_seconds(timeago('1 <name>'))``. An "all" bucket is always
    present, with a cutoff of ten years ago.

    For every link that is neither spam nor deleted, and for every bucket
    whose cutoff the link's timestamp exceeds, the mapper yields one ``top``
    row, one row under ``g.voting_upvote_path`` and one row under
    ``g.voting_controversial_path`` per domain permutation of the link's url.
    The "all" bucket additionally yields ``hot`` and ``new`` rows. Links
    without a url yield nothing.
    """
    oldests = dict(
        (t, epoch_seconds(timeago('1 %s' % t))) for t in times if t != "all")
    oldests['all'] = epoch_seconds(timeago('10 years'))

    @mr_tools.dataspec_m_thing(
        ("url", str), )
    def process(link):
        assert link.thing_type == 'link'

        timestamp = link.timestamp
        fname = make_fullname(Link, link.thing_id)

        if not link.spam and not link.deleted:
            if link.url:
                domains = UrlParser(link.url).domain_permutations()
            else:
                domains = []
            ups, downs = link.ups, link.downs

            for tkey, oldest in oldests.iteritems():
                if timestamp > oldest:
                    sc = score(ups, downs)
                    contr = controversy(ups, downs)
                    h = _hot(ups, downs, timestamp)
                    # The result must not be bound to the name ``upvotes``:
                    # assigning to it would make ``upvotes`` local to
                    # ``process`` and the call itself would raise
                    # UnboundLocalError.
                    upvote_score = upvotes(ups)
                    for domain in domains:
                        yield ('domain/top/%s/%s' % (tkey, domain), sc,
                               timestamp, fname)
                        yield ('domain/%s/%s/%s' %
                               (g.voting_upvote_path, tkey, domain),
                               upvote_score, timestamp, fname)
                        yield ('domain/%s/%s/%s' %
                               (g.voting_controversial_path, tkey, domain),
                               contr, timestamp, fname)
                        # hot/new listings exist only for the "all" bucket.
                        if tkey == "all":
                            yield ('domain/hot/%s/%s' % (tkey, domain), h,
                                   timestamp, fname)
                            yield ('domain/new/%s/%s' % (tkey, domain),
                                   timestamp, timestamp, fname)

    mr_tools.mr_map(process)
Exemple #21
0
def time_listings(times=('all',)):
    """Emit per-domain top/controversial rows, plus hot/new for "all".

    Cutoffs are ``'1 <name>'`` ago for each entry of ``times`` other than
    "all"; the "all" bucket is always added with a cutoff of ten years ago.
    Links flagged as spam or deleted, and links without a url, yield
    nothing.
    """
    cutoffs = {}
    for unit in times:
        if unit != "all":
            cutoffs[unit] = epoch_seconds(timeago('1 %s' % unit))
    cutoffs['all'] = epoch_seconds(timeago('10 years'))

    @mr_tools.dataspec_m_thing(("url", str),)
    def process(link):
        assert link.thing_type == 'link'

        created = link.timestamp
        fullname = make_fullname(Link, link.thing_id)

        if link.spam or link.deleted:
            return

        domains = (UrlParser(link.url).domain_permutations()
                   if link.url else [])
        n_up, n_down = link.ups, link.downs

        for unit, cutoff in cutoffs.iteritems():
            if not created > cutoff:
                continue

            top_value = score(n_up, n_down)
            contr_value = controversy(n_up, n_down)
            hot_value = _hot(n_up, n_down, created)

            for domain in domains:
                top_key = 'domain/top/%s/%s' % (unit, domain)
                yield top_key, top_value, created, fullname

                contr_key = 'domain/controversial/%s/%s' % (unit, domain)
                yield contr_key, contr_value, created, fullname

                # hot/new listings exist only for the "all" bucket.
                if unit != "all":
                    continue

                hot_key = 'domain/hot/%s/%s' % (unit, domain)
                yield hot_key, hot_value, created, fullname

                new_key = 'domain/new/%s/%s' % (unit, domain)
                yield new_key, created, created, fullname

    mr_tools.mr_map(process)
def year_listings():
    """
    With an 'all' dump, generate the top and controversial per user per year

    Things marked deleted are skipped; the year comes from the UTC
    interpretation of the thing's timestamp.
    """
    @mr_tools.dataspec_m_thing(('author_id', int),)
    def process(link):
        if link.deleted:
            return

        author = link.author_id
        up_count = link.ups
        down_count = link.downs
        points = score(up_count, down_count)
        dispute = controversy(up_count, down_count)

        # Things that are not links are named as comments.
        is_link = link.thing_type == 'link'
        if is_link:
            name = make_fullname(Link, link.thing_id)
        else:
            name = make_fullname(Comment, link.thing_id)

        ts = link.timestamp
        when = datetime.datetime.utcfromtimestamp(ts)

        yield ('user-top-%s-%d' % (when.year, author),
               points, ts, name)
        yield ('user-controversial-%s-%d' % (when.year, author),
               dispute, ts, name)

    mr_tools.mr_map(process)
Exemple #23
0
def time_listings(times=('year', 'month', 'week', 'day', 'hour')):
    """Emit per-subreddit top/controversial rows for each period in ``times``.

    The cutoff for each period is ``epoch_seconds(timeago('1 <name>'))``,
    computed once before mapping. Links flagged as spam yield nothing.
    """
    cutoffs = {}
    for unit in times:
        cutoffs[unit] = epoch_seconds(timeago('1 %s' % unit))

    @mr_tools.dataspec_m_thing(('sr_id', int),)
    def process(link):
        assert link.thing_type == 'link'

        created = link.timestamp
        fullname = make_fullname(Link, link.thing_id)

        if link.spam:
            return

        subreddit = link.sr_id
        n_up, n_down = link.ups, link.downs

        for unit, cutoff in cutoffs.iteritems():
            if not created > cutoff:
                continue

            top_key = 'sr-top-%s-%d' % (unit, subreddit)
            yield top_key, score(n_up, n_down), created, fullname

            contr_key = 'sr-controversial-%s-%d' % (unit, subreddit)
            yield contr_key, controversy(n_up, n_down), created, fullname

    mr_tools.mr_map(process)
Exemple #24
0
def combine_links():
    """Map each thing to one row per sort, keyed by its link's base-36 id.

    Rows are ``(<link_id36> + suffix, <thing_id36>, value)`` for the
    ``_controversy``, ``_hot``, ``_confidence``, ``_score`` and ``_date``
    suffixes, in that order. Returns whatever ``mr_tools.mr_map`` returns.
    """
    @mr_tools.dataspec_m_thing(('link_id', int))
    def _process(t):
        child36 = to36(t.thing_id)
        parent36 = to36(t.link_id)

        n_up, n_down, created = t.ups, t.downs, t.timestamp

        yield (parent36 + '_controversy', child36,
               sorts.controversy(n_up, n_down))
        yield (parent36 + '_hot', child36,
               sorts._hot(n_up, n_down, created))
        yield (parent36 + '_confidence', child36,
               sorts.confidence(n_up, n_down))
        yield (parent36 + '_score', child36,
               sorts.score(n_up, n_down))
        # The date sort uses the raw timestamp.
        yield parent36 + '_date', child36, created

    return mr_tools.mr_map(_process)
Exemple #25
0
def combine_links():
    """Run a map job emitting every sort value for each thing under its link.

    For each record the mapper yields five rows keyed by the base-36 link id
    plus a sort suffix, each carrying the thing's base-36 id and the sort
    value. The result of ``mr_tools.mr_map`` is returned.
    """
    @mr_tools.dataspec_m_thing(('link_id', int))
    def _process(t):
        id36 = to36(t.thing_id)
        link_id36 = to36(t.link_id)

        ups, downs, timestamp = t.ups, t.downs, t.timestamp

        # Values are computed lazily, in row order, as each row is yielded.
        computations = (
            ('_controversy', lambda: sorts.controversy(ups, downs)),
            ('_hot', lambda: sorts._hot(ups, downs, timestamp)),
            ('_confidence', lambda: sorts.confidence(ups, downs)),
            ('_score', lambda: sorts.score(ups, downs)),
            ('_date', lambda: timestamp),
        )
        for suffix, compute in computations:
            yield link_id36 + suffix, id36, compute()

    return mr_tools.mr_map(_process)