Example #1
 def __run_test(self, mode, mapper_class, context_class):
     cmd_file = self.__write_cmd_file(mode)
     pp.run_task(
         pp.Factory(mapper_class=mapper_class), private_encoding=False,
         context_class=context_class, cmd_file=cmd_file
     )
     out_fn = cmd_file + '.out'
     out_records = []
     with open(out_fn) as ostream:
         for cmd, args in BinaryDownStreamFilter(ostream):
             if cmd == 'output':
                 name, color = args
                 out_records.append({'name': name, 'favorite_color': color})
     self.assertEqual(len(out_records), len(self.records))
     for out_r, r in zip(out_records, self.records):
         for k, v in out_r.iteritems():
             self.assertEqual(v, r[k])
Example #2
def run_local_avro(logger, avro_in='v', avro_out=None):
    mapper, reducer = AVRO_MAPPERS[avro_in], AVRO_REDUCERS[avro_out]
    schema_k_out = STATS_SCHEMA_STR if avro_out in {'k', 'kv'} else None
    schema_v_out = STATS_SCHEMA_STR if avro_out in {'v', 'kv'} else None
    file_in = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
    factory = pp.Factory(mapper_class=mapper, reducer_class=reducer)
    simulator = HadoopSimulatorLocal(factory, logger, logging.INFO,
                                     AvroContext, avro_in, avro_out,
                                     schema_k_out, schema_v_out)
    with open(file_in, 'rb') as fin, open(DATA_OUT, 'wb') as fout:
        simulator.run(fin, fout, {}, num_reducers=1)
    dump_counters(simulator, logger)
    if avro_out:
        data_out_des = DATA_OUT + '-des'
        avro_container_dump_results.main(DATA_OUT, data_out_des, avro_out)
        avro_check_results.main(USERS_CSV_FN, data_out_des)
    else:
        avro_check_results.main(USERS_CSV_FN, DATA_OUT)
Example #3
 def __run_test(self, mode, mapper_class, context_class):
     cmd_file = self.__write_cmd_file(mode)
     pp.run_task(pp.Factory(mapper_class=mapper_class),
                 private_encoding=False,
                 context_class=context_class,
                 cmd_file=cmd_file)
     out_fn = cmd_file + '.out'
     out_records = []
     with open(out_fn, 'rb') as f:
         bf = BinaryDownStreamAdapter(f)
         for cmd, args in bf:
             if cmd == bf.OUTPUT:
                 name, color = args
                 out_records.append({'name': name, 'favorite_color': color})
     self.assertEqual(len(out_records), len(self.records))
     for out_r, r in zip(out_records, self.records):
         for k, v in iteritems(out_r):
             self.assertEqual(v.decode('UTF-8'), r[k])
Example #4
        bneck_map = self.bneck_store.get_bnecks(top_dir)
        self.bnecks, self.gtruths = BottleneckStore.bnecks_map_to_vectors(
            bneck_map, BottleneckStore.assign_labels(top_dir))

    def map(self, context):
        LOGGER.info("testing %s" % (context.value))
        with tf.Session(graph=tf.Graph()) as session:
            models.load_checkpoint(context.value)
            graph = session.graph
            eval_step, prediction, bneck_input, gtruth_input = (
                self.model.get_eval_step(graph),
                self.model.get_prediction(graph),
                self.model.get_bneck_input(graph),
                self.model.get_gtruth_input(graph),
            )
            test_accuracy, predictions = session.run([eval_step, prediction],
                                                     feed_dict={
                                                         bneck_input:
                                                         self.bnecks,
                                                         gtruth_input:
                                                         self.gtruths
                                                     })
        context.emit(context.value, str(test_accuracy))


factory = pp.Factory(mapper_class=Mapper, record_reader_class=PathNameReader)


def __main__():
    pp.run_task(factory)
Example #5
# END_COPYRIGHT
"""\
Includes only the bare minimum required to run wordcount. See
wordcount-full.py for an example that uses counters, RecordReader, etc.
"""
# DOCS_INCLUDE_START
import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pipes


class Mapper(api.Mapper):
    def map(self, context):
        for w in context.value.split():
            context.emit(w, 1)


class Reducer(api.Reducer):
    def reduce(self, context):
        context.emit(context.key, sum(context.values))


FACTORY = pipes.Factory(mapper_class=Mapper, reducer_class=Reducer)


def main():
    pipes.run_task(FACTORY)


if __name__ == "__main__":
    main()
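The docstring above mentions wordcount-full.py, which adds counters and a custom RecordReader. As an illustrative sketch only (it assumes the counter accessors get_counter / increment_counter exposed by the pydoop context; the counter names are hypothetical), the minimal Mapper could track input words like this:

import pydoop.mapreduce.api as api


class CountingMapper(api.Mapper):

    def __init__(self, context):
        super(CountingMapper, self).__init__(context)
        # hypothetical counter, named here for illustration only
        self.input_words = context.get_counter("WORDCOUNT", "INPUT_WORDS")

    def map(self, context):
        words = context.value.split()
        for w in words:
            context.emit(w, 1)
        context.increment_counter(self.input_words, len(words))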
Example #6
def __main__():
    pp.run_task(pp.Factory(mapper_class=Mapper), context_class=AvroContext)
Example #7
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import re

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class Mapper(api.Mapper):

    def map(self, context):
        words = re.sub('[^0-9a-zA-Z]+', ' ', context.getInputValue()).split()
        for w in words:
            context.emit(w, 1)


class Reducer(api.Reducer):

    def reduce(self, context):
        s = sum(context.values)
        context.emit(context.key, s)

factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)


def __main__():
    pp.run_task(factory)
Example #8
def run_task(mapper_class, reducer_class=NoAvroColorCount):
    pp.run_task(pp.Factory(mapper_class=mapper_class,
                           reducer_class=reducer_class),
                private_encoding=True,
                context_class=AvroContext)
Example #9

class StupidReducer(api.Reducer):

    def __init__(self, context):
        super(StupidReducer, self).__init__(context)
        self.logger = LOGGER.getChild("Reducer")

    def reduce(self, context):
        fname = context.key
        recs = sorted(context.values, key=lambda _: _[0].offset)
        offset, length = recs[0][0].offset, recs[0][0].length
        lbndry, rbndry = recs[0][1]
        for r in recs[1:]:
            assert r[0].offset == offset + length
            assert rbndry <= r[1][0]
            offset, length = r[0].offset, r[0].length
            rbndry = r[1][1]
        context.emit(fname, [lbndry, rbndry])


factory = pp.Factory(
    mapper_class=StupidMapper,
    reducer_class=StupidReducer,
    record_reader_class=Reader,
)


def __main__():
    pp.run_task(factory)
Example #10
def __main__():
    pp.run_task(pp.Factory(
        mapper_class=Mapper,
        reducer_class=Reducer,
        combiner_class=Reducer
    ))
Example #11

class Partitioner(api.Partitioner):
    def __init__(self, context):
        super(Partitioner, self).__init__(context)
        self.logger = LOGGER.getChild("Partitioner")

    def partition(self, key, num_reduces):
        reducer_id = (hash(key) & sys.maxsize) % num_reduces
        self.logger.debug("reducer_id: %r" % reducer_id)
        return reducer_id


FACTORY = pp.Factory(mapper_class=Mapper,
                     reducer_class=Reducer,
                     record_reader_class=Reader,
                     record_writer_class=Writer,
                     partitioner_class=Partitioner,
                     combiner_class=Reducer)


def main():
    pp.run_task(FACTORY)


if __name__ == "__main__":
    main()

# Local Variables:
# mode: python
# End:
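One caveat about the Partitioner above: Python 3 salts hash() for str keys, so the same key can land on different reducers in different task processes unless PYTHONHASHSEED is pinned. A deterministic variant, shown here purely as an illustrative sketch and not part of the original example, hashes the key bytes explicitly:

import zlib

import pydoop.mapreduce.api as api


class DeterministicPartitioner(api.Partitioner):

    def partition(self, key, num_reduces):
        # crc32 over the key bytes is stable across processes, unlike
        # Python 3's per-process salted hash() of str keys
        data = key if isinstance(key, bytes) else key.encode("utf-8")
        return zlib.crc32(data) % num_reduces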
Example #12
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory, context_class=Context)
Example #13
def __main__():
    pipes.run_task(pipes.Factory(mapper_class=Mapper))
Example #14
        super(AvroContext, self).set_job_conf(vals)
        schema = avro.schema.parse(self._job_conf[AVRO_SCHEMA_KEY])
        self.datum_reader = DatumReader(schema)

    def get_input_value(self):
        # FIXME reuse, reuse, reuse
        sys.stderr.write('value: %r\n' % self._value)
        f = StringIO(self._value)
        dec = BinaryDecoder(f)
        return self.datum_reader.read(dec)


class ColorPick(api.Mapper):
    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        sys.stderr.write('user: %r' % user)
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):
    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, "%r" % s)


pp.run_task(pp.Factory(mapper_class=ColorPick, reducer_class=ColorCount),
            private_encoding=True,
            context_class=AvroContext)
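The get_input_value override above relies on StringIO, which only accepts str; under Python 3, where the raw serialized value arrives as bytes, the same deserialization would go through io.BytesIO. A minimal sketch, assuming the standard avro package's DatumReader / BinaryDecoder API:

import io

from avro.io import BinaryDecoder, DatumReader


def deserialize_datum(raw_bytes, schema):
    # decode one Avro datum from its binary (non-container) encoding
    reader = DatumReader(schema)
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    return reader.read(decoder)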
Example #15
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# END_COPYRIGHT

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp
from pydoop.avrolib import AvroContext


class Mapper(api.Mapper):
    def map(self, context):
        context.emit('', context.value['population'])


class Reducer(api.Reducer):
    def reduce(self, context):
        context.emit('', sum(context.values))


FACTORY = pp.Factory(Mapper, Reducer)
CONTEXT = AvroContext


def __main__():
    pp.run_task(FACTORY, private_encoding=True, context_class=CONTEXT)
Example #16
    pass


class ColorWriter(AvroWriter):

    schema = parse(open("stats.avsc").read())

    def emit(self, key, value):
        self.writer.append({'office': key, 'counts': value})


class ColorPick(api.Mapper):
    def map(self, ctx):
        user = ctx.value
        color = user['favorite_color']
        if color is not None:
            ctx.emit(user['office'], Counter({color: 1}))


class ColorCount(api.Reducer):
    def reduce(self, ctx):
        s = sum(ctx.values, Counter())
        ctx.emit(ctx.key, s)


pp.run_task(pp.Factory(mapper_class=ColorPick,
                       reducer_class=ColorCount,
                       record_reader_class=UserReader,
                       record_writer_class=ColorWriter),
            private_encoding=True)
Example #17
def __main__():
    pipes.run_task(
        pipes.Factory(
            mapper_class=Mapper,
            record_writer_class=Writer,
        ))
Example #18
def __main__():
    factory = pp.Factory(mapper_class=Mapper)
    pp.run_task(factory, context_class=AvroContext)
Example #19
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory, private_encoding=True)
Example #20
def __main__():
    """Main function to be executed by pydoop framework"""
    factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer,
                         record_reader_class=Reader)
    pp.run_task(factory, private_encoding=True)
Example #21
        self.bytes_read = 0
        if self.isplit.offset > 0:
            discarded = self.file.readline()
            self.bytes_read += len(discarded)

    def close(self):
        self.file.close()
        self.file.fs.close()

    def next(self):
        if self.bytes_read > self.isplit.length:
            raise StopIteration
        key = serialize_to_string(self.isplit.offset + self.bytes_read)
        record = self.file.readline()
        if record == "":  # end of file
            raise StopIteration
        self.bytes_read += len(record)
        return (key, record)

    def get_progress(self):
        return min(float(self.bytes_read) / self.isplit.length, 1.0)


factory = pp.Factory(mapper_class=Mapper,
                     reducer_class=Reducer,
                     record_reader_class=Reader)


def __main__():
    pp.run_task(factory)
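The record reader above begins in the middle of its constructor. A plausible opening for that __init__, sketched here from pydoop's input-split attributes (filename, offset, length) rather than taken from the original file, would open the input and seek to the split offset before the partial-line discard shown above:

import pydoop.hdfs as hdfs
import pydoop.mapreduce.api as api


class Reader(api.RecordReader):

    def __init__(self, context):
        super(Reader, self).__init__(context)
        self.isplit = context.input_split
        # open the file backing this split and position at its start;
        # the fragment above then discards the first partial line when
        # offset > 0 and counts bytes read against isplit.length
        self.file = hdfs.open(self.isplit.filename)
        self.file.seek(self.isplit.offset)
        self.bytes_read = 0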
Example #22
        # TODO: look for a way to avoid the local write
        path, signal = context.key, context.value
        rr = utils.estimate_rainfall(signal)
        dt_string = os.path.splitext(hdfs.path.basename(path))[0]
        out_name = "%s.tif" % dt_string
        dt = datetime.strptime(dt_string, IN_FMT)
        metadata = {tiffio.DT_TAG: dt.strftime(tiffio.DT_FMT)}
        self.ga.save_as_gtiff(out_name, rr, metadata=metadata)
        with io.open(out_name, "rb") as f:
            value = f.read()
        context.emit(out_name, value)


class Writer(api.RecordWriter):
    def __init__(self, context):
        super().__init__(context)
        self.d = context.get_work_path()

    def emit(self, key, value):
        with hdfs.open(hdfs.path.join(self.d, key), "wb") as f:
            f.write(value)


factory = pp.Factory(mapper_class=Mapper,
                     record_reader_class=Reader,
                     record_writer_class=Writer)


def __main__():
    pp.run_task(factory)
Example #23
def __main__():
    pipes.run_task(
        pipes.Factory(Mapper,
                      record_writer_class=Writer,
                      record_reader_class=Reader))
Example #24
def __main__():
    factory = pp.Factory(mapper_class=Mapper, reducer_class=Reducer)
    pp.run_task(factory, private_encoding=True, context_class=AvroContext)
Example #25
def __main__():
    factory = pp.Factory(Mapper, Reducer)
    pp.run_task(factory)
Example #26
def __main__():
    pp.run_task(pp.Factory(Mapper, Reducer))
Example #27
    def map(self, context):
        i = context.key
        train_batch, val_batch = context.value
        train_bnecks, train_gtruths = self.__map_to_vectors(train_batch)
        val_bnecks, val_gtruths = self.__map_to_vectors(val_batch)
        self.retrainer.run_train_step(train_bnecks, train_gtruths)
        if (i % self.eval_step_interval == 0) or (i + 1 >= self.n_steps):
            train_accuracy, cross_entropy = self.retrainer.run_eval_step(
                train_bnecks, train_gtruths)
            LOGGER.info('step %d: train accuracy = %f%%, cross entropy = %f',
                        i, 100 * train_accuracy, cross_entropy)
            val_accuracy = self.retrainer.run_validation_step(
                val_bnecks, val_gtruths)
            LOGGER.info('step %d: validation accuracy = %f%%', i,
                        100 * val_accuracy)
            context.emit(
                i,
                "%s\t%s\t%s" % (cross_entropy, train_accuracy, val_accuracy))

    def __map_to_vectors(self, batch):
        return BottleneckStore.bnecks_map_to_vectors(batch, self.labels)


factory = pp.Factory(mapper_class=Mapper,
                     record_reader_class=BottleneckProjectionsReader)


def __main__():
    pp.run_task(factory)
Example #28

class StupidMapper(api.Mapper):
    def __init__(self, context):
        super(StupidMapper, self).__init__(context)
        self.logger = LOGGER.getChild("Mapper")

    def map(self, context):
        self.logger.debug('key: %s, val: %s', context.key, context.value)
        context.emit(context.key, context.value)


class StupidReducer(api.Reducer):
    def reduce(self, context):
        key = context.key
        for v in context.values:
            context.emit(key, v)


factory = pp.Factory(
    mapper_class=StupidMapper,
    reducer_class=StupidReducer,
    partitioner_class=Partitioner,
    record_reader_class=Reader,
    record_writer_class=Writer,
)


def __main__():
    pp.run_task(factory, private_encoding=False, auto_serialize=False)
Example #29
def __main__():
    pp.run_task(pp.Factory(Mapper, None))
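Passing None as the reducer class presumably leaves this as a map-only configuration, since the factory simply cannot create a reducer. A self-contained sketch of what such a script could look like, with an illustrative identity Mapper that is not part of the original example:

import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class Mapper(api.Mapper):

    def map(self, context):
        # pass each input record through unchanged
        context.emit(context.key, context.value)


def __main__():
    pp.run_task(pp.Factory(Mapper, None))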