Example #1
    def test_mr_reduce(self):
        stdin = StringIO("\n".join(["foo\tbar\tbar1", "baz\tbad\tbad1\tbad2"]))
        stdout = StringIO()

        def process(key, vals):
            return [[key, len(list(vals)[0])]]

        mr_reduce(process, fd=stdin, out=stdout)
        self.assertEqual(stdout.getvalue(), "foo\t2\nbaz\t3\n")
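The test above pins down mr_reduce's contract: read tab-delimited lines from fd, group consecutive values by the first field (the key), hand each group to process(key, vals), and write every row process returns to out, tab-joined with a trailing newline. The sketch below only illustrates that contract as inferred from the test; it is not the real r2 implementation of mr_reduce:

import itertools
import sys

def mr_reduce(process, fd=sys.stdin, out=sys.stdout):
    # Sketch only: group consecutive tab-delimited lines by their first
    # field (the key), as a streaming reducer would after a sorted shuffle.
    def parse(stream):
        for line in stream:
            line = line.rstrip("\n")
            if line:
                parts = line.split("\t")
                yield parts[0], tuple(parts[1:])

    for key, group in itertools.groupby(parse(fd), key=lambda kv: kv[0]):
        vals = (v for _, v in group)
        for row in process(key, vals) or []:
            out.write("\t".join(str(col) for col in row) + "\n")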
Example #2
def join_things(fields, deleted=False, spam=True):
    """A reducer that joins thing table dumps and data table dumps"""
    # Because of how Python handles scope, if we want to modify these outside
    # the closure function below, they need to be inside a mutable object.
    # http://stackoverflow.com/a/23558809/120999
    counters = {
        'processed': 0,
        'skipped': 0,
    }
    def process(thing_id, vals):
        data = {}
        thing = None

        for val in vals:
            if val[0] == 'thing':
                thing = format_dataspec(val,
                                        ['data_type', # e.g. 'thing'
                                         'thing_type', # e.g. 'link'
                                         'ups',
                                         'downs',
                                         'deleted',
                                         'spam',
                                         'timestamp'])
            elif val[0] == 'data':
                val = format_dataspec(val,
                                      ['data_type', # e.g. 'data'
                                       'thing_type', # e.g. 'link'
                                       'key', # e.g. 'sr_id'
                                       'value'])
                if val.key in fields:
                    data[val.key] = val.value

        if (
            # silently ignore if we didn't see the 'thing' row
            thing is not None

            # remove spam and deleted as appropriate
            and (deleted or thing.deleted == 'f')
            and (spam or thing.spam == 'f')

            # and silently ignore items that don't have all of the
            # data that we need
            and all(field in data for field in fields)):

            counters['processed'] += 1
            yield ((thing_id, thing.thing_type, thing.ups, thing.downs,
                    thing.deleted, thing.spam, thing.timestamp)
                   + tuple(data[field] for field in fields))
        else:
            counters['skipped'] += 1

    mr_reduce(process)
    # Print to stderr to avoid getting this caught up in the pipe of
    # compute_time_listings.
    print >> sys.stderr, '%s items processed, %s skipped' % (
                         counters['processed'], counters['skipped'])
Example #3
    def test_mr_reduce(self):
        stdin = StringIO("\n".join([
            "foo\tbar\tbar1",
            "baz\tbad\tbad1\tbad2",
        ]))
        stdout = StringIO()

        def process(key, vals):
            return [[key, len(list(vals)[0])]]

        mr_reduce(process, fd=stdin, out=stdout)
        self.assertEqual(stdout.getvalue(), "foo\t2\nbaz\t3\n")
Example #4
def join_things(fields, deleted=False, spam=True):
    """A reducer that joins thing table dumps and data table dumps"""
    def process(thing_id, vals):
        data = {}
        thing = None

        for val in vals:
            if val[0] == 'thing':
                thing = format_dataspec(
                    val,
                    [
                        'data_type',  # e.g. 'thing'
                        'thing_type',  # e.g. 'link'
                        'ups',
                        'downs',
                        'deleted',
                        'spam',
                        'timestamp'
                    ])
            elif val[0] == 'data':
                val = format_dataspec(
                    val,
                    [
                        'data_type',  # e.g. 'data'
                        'thing_type',  # e.g. 'link'
                        'key',  # e.g. 'sr_id'
                        'value'
                    ])
                if val.key in fields:
                    data[val.key] = val.value

        if (
                # silently ignore if we didn't see the 'thing' row
                thing is not None

                # remove spam and deleted as appropriate
                and (deleted or thing.deleted == 'f')
                and (spam or thing.spam == 'f')

                # and silently ignore items that don't have all of the
                # data that we need
                and all(field in data for field in fields)):

            yield ((thing_id, thing.thing_type, thing.ups, thing.downs,
                    thing.deleted, thing.spam, thing.timestamp) +
                   tuple(data[field] for field in fields))

    mr_reduce(process)
Example #5
def join_things(fields, deleted=False, spam=True):
    """A reducer that joins thing table dumps and data table dumps"""

    def process(thing_id, vals):
        data = {}
        thing = None

        for val in vals:
            if val[0] == "thing":
                thing = format_dataspec(
                    val,
                    [
                        "data_type",  # e.g. 'thing'
                        "thing_type",  # e.g. 'link'
                        "ups",
                        "downs",
                        "deleted",
                        "spam",
                        "timestamp",
                    ],
                )
            elif val[0] == "data":
                val = format_dataspec(
                    val,
                    [
                        "data_type",  # e.g. 'data'
                        "thing_type",  # e.g. 'link'
                        "key",  # e.g. 'sr_id'
                        "value",
                    ],
                )
                if val.key in fields:
                    data[val.key] = val.value

        if (
            # silently ignore if we didn't see the 'thing' row
            thing is not None
            # remove spam and deleted as appropriate
            and (deleted or thing.deleted == "f")
            and (spam or thing.spam == "f")
            # and silently ignore items that don't have all of the
            # data that we need
            and all(field in data for field in fields)
        ):

            yield (
                (thing_id, thing.thing_type, thing.ups, thing.downs, thing.deleted, thing.spam, thing.timestamp)
                + tuple(data[field] for field in fields)
            )

    mr_reduce(process)
Example #6
def join_things(fields, deleted=False, spam=True):
    """A reducer that joins thing table dumps and data table dumps"""
    def process(thing_id, vals):
        data = {}
        thing = None

        for val in vals:
            if val[0] == 'thing':
                thing = format_dataspec(val,
                                        ['data_type', # e.g. 'thing'
                                         'thing_type', # e.g. 'link'
                                         'ups',
                                         'downs',
                                         'deleted',
                                         'spam',
                                         'timestamp'])
            elif val[0] == 'data':
                val = format_dataspec(val,
                                      ['data_type', # e.g. 'data'
                                       'thing_type', # e.g. 'link'
                                       'key', # e.g. 'sr_id'
                                       'value'])
                if val.key in fields:
                    data[val.key] = val.value

        if (
            # silently ignore if we didn't see the 'thing' row
            thing is not None

            # remove spam and deleted as appropriate
            and (deleted or thing.deleted == 'f')
            and (spam or thing.spam == 'f')

            # and silently ignore items that don't have all of the
            # data that we need
            and all(field in data for field in fields)):

            yield ((thing_id, thing.thing_type, thing.ups, thing.downs,
                    thing.deleted, thing.spam, thing.timestamp)
                   + tuple(data[field] for field in fields))

    mr_reduce(process)
Example #7
def join_things(
    fields,
    deleted=False,
    spam=True,
    fd=STDIN,
    out=STDOUT,
    err=STDERR,
    defaults=None,
):
    """A reducer that joins thing table dumps and data table dumps

    :param list fields: list of data fields that the resulting thing must
        contain.  Any things missing any of these fields (unless provided
        in the dump or by :param:`defaults`) will be silently dropped.
    :param bool deleted: Allow deleted items.
    :param bool spam: Allow spam items.
    :param file fd: Input stream.
    :param file out: Output stream.
    :param file err: Error stream.
    :param defaults: mapping of fieldnames to default values if not provided
        in the input stream.
    :type defaults: dict or None
    """
    # Because of how Python handles scope, if we want to modify these outside
    # the closure function below, they need to be inside a mutable object.
    # http://stackoverflow.com/a/23558809/120999
    counters = {
        'processed': 0,
        'skipped': 0,
    }
    def process(thing_id, vals):
        data = {}
        if defaults:
            data.update(defaults)
        thing = None

        for val in vals:
            if val[0] == 'thing':
                thing = format_dataspec(val,
                                        ['data_type', # e.g. 'thing'
                                         'thing_type', # e.g. 'link'
                                         'ups',
                                         'downs',
                                         'deleted',
                                         'spam',
                                         'timestamp'])
            elif val[0] == 'data':
                val = format_dataspec(val,
                                      ['data_type', # e.g. 'data'
                                       'thing_type', # e.g. 'link'
                                       'key', # e.g. 'sr_id'
                                       'value'])
                if val.key in fields:
                    data[val.key] = val.value

        if (
            # silently ignore if we didn't see the 'thing' row
            thing is not None

            # remove spam and deleted as appropriate
            and (deleted or thing.deleted == 'f')
            and (spam or thing.spam == 'f')

            # and silently ignore items that don't have all of the
            # data that we need
            and all(field in data for field in fields)):

            counters['processed'] += 1
            yield ((thing_id, thing.thing_type, thing.ups, thing.downs,
                    thing.deleted, thing.spam, thing.timestamp)
                   + tuple(data[field] for field in fields))
        else:
            counters['skipped'] += 1

    mr_reduce(process, fd=fd, out=out)

    # Print to stderr to avoid getting this caught up in the pipe of
    # compute_time_listings.
    err.write(
        '%s items processed, %s skipped\n' % (
            counters['processed'], counters['skipped']
        )
    )
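Because this variant takes explicit fd/out/err streams, it can be driven in memory the same way the mr_reduce test does. The rows in the snippet below are made up, and their column layout is only inferred from the format_dataspec calls above (key first, then data_type and the remaining fields), so treat it as a hypothetical driver rather than a real dump:

from StringIO import StringIO  # the surrounding examples are Python 2 code

# Hypothetical, made-up input: one 'thing' row and one 'data' row for the
# same thing_id, tab-delimited in the column order the dataspecs above expect.
rows = "\n".join([
    "t3_1\tthing\tlink\t10\t2\tf\tf\t1400000000",
    "t3_1\tdata\tlink\tsr_id\t42",
])
out, err = StringIO(), StringIO()

join_things(["sr_id"], fd=StringIO(rows), out=out, err=err)

# out should now hold one joined row for t3_1 (it is not deleted and has an
# sr_id value); err holds the "1 items processed, 0 skipped" summary line.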
Example #8
def join_things(
    fields,
    deleted=False,
    spam=True,
    fd=STDIN,
    out=STDOUT,
    err=STDERR,
    defaults=None,
):
    """A reducer that joins thing table dumps and data table dumps

    :param list fields: list of data fields that the resulting thing must
        contain.  Any things missing any of these fields (unless provided
        in the dump or by :param:`defaults`) will be silently dropped.
    :param bool deleted: Allow deleted items.
    :param bool spam: Allow spam items.
    :param file fd: Input stream.
    :param file out: Output stream.
    :param file err: Error stream.
    :param defaults: mapping of fieldnames to default values if not provided
        in the input stream.
    :type defaults: dict or None
    """
    # Because of how Python handles scope, if we want to modify these outside
    # the closure function below, they need to be inside a mutable object.
    # http://stackoverflow.com/a/23558809/120999
    counters = {
        'processed': 0,
        'skipped': 0,
    }

    def process(thing_id, vals):
        data = {}
        if defaults:
            data.update(defaults)
        thing = None

        for val in vals:
            if val[0] == 'thing':
                thing = format_dataspec(
                    val,
                    [
                        'data_type',  # e.g. 'thing'
                        'thing_type',  # e.g. 'link'
                        'ups',
                        'downs',
                        'deleted',
                        'spam',
                        'timestamp'
                    ])
            elif val[0] == 'data':
                val = format_dataspec(
                    val,
                    [
                        'data_type',  # e.g. 'data'
                        'thing_type',  # e.g. 'link'
                        'key',  # e.g. 'sr_id'
                        'value'
                    ])
                if val.key in fields:
                    data[val.key] = val.value

        if (
                # silently ignore if we didn't see the 'thing' row
                thing is not None

                # remove spam and deleted as appropriate
                and (deleted or thing.deleted == 'f')
                and (spam or thing.spam == 'f')

                # and silently ignore items that don't have all of the
                # data that we need
                and all(field in data for field in fields)):

            counters['processed'] += 1
            yield ((thing_id, thing.thing_type, thing.ups, thing.downs,
                    thing.deleted, thing.spam, thing.timestamp) +
                   tuple(data[field] for field in fields))
        else:
            counters['skipped'] += 1

    mr_reduce(process, fd=fd, out=out)

    # Print to stderr to avoid getting this caught up in the pipe of
    # compute_time_listings.
    err.write('%s items processed, %s skipped\n' %
              (counters['processed'], counters['skipped']))