def __iter__(self): """Iterate over records in input files. self._offsets is always correctly updated so that stopping iterations doesn't skip records and doesn't read the same record twice. """ ctx = context.get() mapper_spec = ctx.mapreduce_spec.mapper shard_number = ctx.shard_state.shard_number filenames = mapper_spec.params[self.FILES_PARAM][shard_number] if len(filenames) != len(self._offsets): raise Exception("Files list and offsets do not match.") readers = [] for (i, filename) in enumerate(filenames): offset = self._offsets[i] reader = records.RecordsReader(files.BufferedFile(filename)) reader.seek(offset) readers.append((None, None, i, reader)) current_result = None while readers: (key, value, index, reader) = readers[0] if key is not None: if current_result and key != current_result[0]: yield current_result if not current_result or key != current_result[0]: current_result = (key, []) current_result[1].append(value) try: self._offsets[index] = reader.tell() start_time = time.time() binary_record = reader.read() if context.get(): operation.counters.Increment( input_readers.COUNTER_IO_READ_BYTES, len(binary_record))(context.get()) operation.counters.Increment( input_readers.COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(context.get()) proto = file_service_pb.KeyValue() proto.ParseFromString(binary_record) heapq.heapreplace(readers, (proto.key(), proto.value(), index, reader)) except EOFError: heapq.heappop(readers) if current_result: yield current_result
def testProcessNamespace(self):
  """Test ProcessNamespace function."""
  namespace_manager.set_namespace("1")
  TestEntity().put()
  namespace_manager.set_namespace(None)

  namespaces_jobs = utils.RunMapForKinds(
      self.operation,
      [TestEntity.kind()],
      'Test job for %(kind)s%(namespace)s',
      '__main__.foo',
      self.reader_class_spec,
      {'test_param': 1})
  testutil.execute_all_tasks(self.taskqueue)

  m = mox.Mox()
  m.StubOutWithMock(context, "get", use_mock_anything=True)

  ctx = context.Context(
      model.MapreduceState.get_by_job_id(namespaces_jobs[0]).mapreduce_spec,
      None)
  context.get().AndReturn(ctx)
  context.get().AndReturn(ctx)

  m.ReplayAll()
  try:
    jobs = utils.ProcessNamespace('1')
    jobs.extend(utils.ProcessNamespace('1'))
    m.VerifyAll()
  finally:
    m.UnsetStubs()
  testutil.execute_all_tasks(self.taskqueue)

  self.assertEquals(1, len(jobs))
  job = jobs[0]
  state = model.MapreduceState.get_by_job_id(job)
  self.assertTrue(state)
  spec = state.mapreduce_spec
  self.assertTrue(spec)
  self.assertEquals("Test job for TestEntity in namespace 1", spec.name)
  mapper = spec.mapper
  self.assertTrue(mapper)
  self.assertEquals({'test_param': 1,
                     'entity_kind': TestEntity.kind(),
                     'namespaces': '1'},
                    mapper.params)
  self.assertEquals('__main__.foo', mapper.handler_spec)
  self.assertEquals(self.reader_class_spec, mapper.input_reader_spec)
def process(comment):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params

  program_key = params['program_key']
  program = GCIProgram.get_by_key_name(program_key)

  if comment.parent().program.key() != program.key():
    yield operation.counters.Increment("prev_program_comment_not_converted")
    return

  if comment.title not in ACTION_TITLES:
    yield operation.counters.Increment("user_comment_not_converted")
    return

  comment_title = ACTION_TITLES[comment.title]
  changes = ACTION_TITLES[comment_title]

  # Task reopening is a special case which could have been performed
  # either by a mentor or by the automated system after the passing of
  # the deadline. So additional inference of the user has to be made.
  if comment_title == 'Task Reopened':
    if comment.created_by:
      user_info = ugettext('User-Mentor')
    else:
      user_info = ugettext('MelangeAutomatic')
    changes = [user_info] + changes

  comment.changes = changes

  yield operation.db.Put(comment)
  yield operation.counters.Increment("action_comment_converted")
def write(self, data):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
  """
  ctx = context.get()
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)

  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  file_index = key.__hash__() % len(self._filehandles)

  pool = self._pools[file_index]
  if pool is None:
    filehandle = self._filehandles[file_index]
    pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
    self._pools[file_index] = pool

  proto = file_service_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  pool.append(proto.Encode())
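# Hedged sketch (not part of the writer above): the key-to-file mapping it
# relies on is plain hash partitioning, so every value for a given key always
# lands in the same output shard. The helper name below is illustrative only.
def _shard_for_key(key, num_shards):
  """Return the index of the output file that should receive this key."""
  return hash(str(key)) % num_shards

# Example: _shard_for_key("user-42", 4) always yields the same index, which is
# what lets a later sort/merge phase group all values for a key together.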
def process(task):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params

  program_key = params["program_key"]

  try:
    program = GCIProgram.get_by_key_name(program_key)
  except db.BadValueError:
    yield operation.counters.Increment("program_key_is_empty_or_invalid")
    return

  def subscribe_to_task_txn(task_key, subscribe):
    task = GCITask.get(task_key)
    task.subscribers = list(set(task.subscribers + subscribe))
    task.put()
    return task

  if task.program.key() != program.key():
    yield operation.counters.Increment("old_program_task_not_updated")
    return

  mentors = db.get(task.mentors)
  entities = mentors + [task.created_by, task.modified_by]

  subscribe = [ent.key() for ent in entities if ent.automatic_task_subscription]

  result = db.run_in_transaction(subscribe_to_task_txn, task.key(), subscribe)

  if result:
    yield operation.counters.Increment("task_updated")
  else:
    yield operation.counters.Increment("task_not_updated")
def write(self, data):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
  """
  ctx = context.get()
  if ctx.get_pool("file_pool") is None:
    ctx.register_pool("file_pool", _FilePool(ctx=ctx))
  ctx.get_pool("file_pool").append(self._filename, str(data))
def initialize(self):
  if self.initialized:
    return
  mapper_params = context.get().mapreduce_spec.mapper.params
  kind_filter = mapper_params.get('kind_filter')
  self.kind_filter = set(kind_filter) if kind_filter else None
  original_app = mapper_params.get('original_app')
  if original_app and os.getenv('APPLICATION_ID') != original_app:
    self.app_id = os.getenv('APPLICATION_ID')
  self.initialized = True
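# Illustrative only: a mapper params dict shaped the way initialize() above
# expects. The key names come from the code; the values are invented for this
# example and are not taken from any real job configuration.
EXAMPLE_MAPPER_PARAMS = {
    'kind_filter': ['Project', 'Issue'],  # only these kinds get processed
    'original_app': 'source-app-id',      # app id the entities came from
}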
def __iter__(self):
  ctx = context.get()
  combiner = None

  if ctx:
    combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
    if combiner_spec:
      combiner = util.handler_for_name(combiner_spec)

  self.current_key = None
  self.current_values = None

  for binary_record in super(_ReducerReader, self).__iter__():
    proto = file_service_pb.KeyValues()
    proto.ParseFromString(binary_record)

    if self.current_key is None:
      self.current_key = proto.key()
      self.current_values = []
    else:
      assert proto.key() == self.current_key, (
          "inconsistent key sequence. Expected %s but got %s" %
          (self.current_key, proto.key()))

    if combiner:
      combiner_result = combiner(
          self.current_key, proto.value_list(), self.current_values)

      if not util.is_generator(combiner_result):
        raise errors.BadCombinerOutputError(
            "Combiner %s should yield values instead of returning them (%s)" %
            (combiner, combiner_result))

      self.current_values = []
      for value in combiner_result:
        if isinstance(value, operation.Operation):
          value(ctx)
        else:
          self.current_values.append(value)
    else:
      self.current_values.extend(proto.value_list())

    if not proto.partial():
      key = self.current_key
      values = self.current_values
      self.current_key = None
      self.current_values = None
      yield (key, values)
    else:
      yield input_readers.ALLOW_CHECKPOINT
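# Hedged example of a combiner satisfying the contract checked above: the
# handler named by "combiner_spec" is called with (key, newly read values,
# values combined so far) and must be a generator yielding combined values
# (or Operation objects). This sketch sums numeric string values; the name
# sum_combiner is illustrative, not part of the library.
def sum_combiner(key, values, previously_combined_values):
  total = sum(int(value) for value in values)
  total += sum(int(value) for value in previously_combined_values)
  yield str(total)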
def write(self, data):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
  """
  ctx = context.get()
  if ctx.get_pool("records_pool") is None:
    ctx.register_pool("records_pool",
                      RecordsPool(self._filename, ctx=ctx, exclusive=True))
  ctx.get_pool("records_pool").append(str(data))
def write(self, data):
  """Write data to the GoogleCloudStorage file.

  Args:
    data: string containing the data to be written.
  """
  start_time = time.time()
  self._get_write_buffer().write(data)
  ctx = context.get()
  operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
  operation.counters.Increment(
      COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
def write(self, data):
  """Write data to the GoogleCloudStorage file.

  Args:
    data: string containing the data to be written.
  """
  start_time = time.time()
  self._streaming_buffer.write(data)
  ctx = context.get()
  operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
  operation.counters.Increment(
      COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
def __iter__(self):
  ctx = context.get()
  combiner = None

  if ctx:
    combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
    if combiner_spec:
      combiner = util.handler_for_name(combiner_spec)

  self.current_key = None
  self.current_values = None

  for binary_record in super(_ReducerReader, self).__iter__():
    proto = file_service_pb.KeyValues()
    proto.ParseFromString(binary_record)

    if self.current_key is None:
      self.current_key = proto.key()
      self.current_values = []
    else:
      assert proto.key() == self.current_key, (
          "inconsistent key sequence. Expected %s but got %s" %
          (self.current_key, proto.key()))

    if combiner:
      combiner_result = combiner(
          self.current_key, proto.value_list(), self.current_values)

      if not util.is_generator(combiner_result):
        raise errors.BadCombinerOutputError(
            "Combiner %s should yield values instead of returning them (%s)" %
            (combiner, combiner_result))

      self.current_values = []
      for value in combiner_result:
        if isinstance(value, operation.Operation):
          value(ctx)
        else:
          self.current_values.append(value)
    else:
      self.current_values.extend(proto.value_list())

    if not proto.partial():
      key = self.current_key
      values = self.current_values
      self.current_key = None
      self.current_values = None
      yield (key, values)
    else:
      yield input_readers.ALLOW_CHECKPOINT
def write(self, data):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
  """
  ctx = context.get()
  if ctx.get_pool("records_pool") is None:
    ctx.register_pool("records_pool",
                      RecordsPool(self._filename, ctx=ctx, exclusive=True))
  ctx.get_pool("records_pool").append(str(data))
def __iter__(self):
  ctx = context.get()

  while self._count:
    self._count -= 1
    start_time = time.time()
    content = "".join(random.choice(string.ascii_lowercase)
                      for _ in range(self._string_length))
    if ctx:
      operation.counters.Increment(
          COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
      operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx)
    yield content
def touch(key):
  # change entity
  app = key.namespace()
  kind = key.kind()
  id = key.id_or_name()
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  matching_app = params["app_to_process"]
  if matching_app and matching_app != app:
    return
  metadata_entity = store._GetMetadataEntity(app)
  store.update_entity(app, kind, id, {}, metadata_entity, None,
                      put_function=yield_put, rebuild_facets=True)
def process(task):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params

  program_key = params['program_key']
  program = GCIProgram.get_by_key_name(program_key)

  if (task.program.key() == program.key() and
      (task.status == 'Unapproved' or task.status == 'Unpublished')):
    task.status = 'Open'
    yield operation.db.Put(task)
    yield operation.counters.Increment("task_updated")
  else:
    yield operation.counters.Increment("task_not_updated")
def __iter__(self):
  ctx = context.get()

  while self._count:
    self._count -= 1
    start_time = time.time()
    content = "".join(random.choice(string.ascii_lowercase)
                      for _ in range(self._string_length))
    if ctx:
      operation.counters.Increment(
          COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
      operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx)
    yield content
def ProcessNamespace(namespace):
  """Handler function for mapper over all namespaces.

  Starts mapper jobs specified by parameters over all passed kinds.

  Args:
    namespace: namespace to process.

  Returns:
    Started mapper job ids. Mapper framework ignores function value.
    Returning these for testing purposes only.
  """
  ctx = context.get()
  mapreduce_spec = ctx.mapreduce_spec
  params = mapreduce_spec.params
  operation = DatastoreAdminOperation.get(
      params[DatastoreAdminOperation.PARAM_DATASTORE_ADMIN_OPERATION])
  mapper_params = params['mapper_params']

  jobs = []
  for kind in params['kinds']:
    job_key_name = kind + "@" + namespace
    mapper_params['entity_kind'] = kind
    mapper_params['namespaces'] = namespace
    job_name = params['job_name'] % {
        'kind': kind,
        'namespace': ' in namespace ' + namespace,
    }

    def tx():
      if db.get(db.Key.from_path(operation.kind(),
                                 operation.key().id_or_name(),
                                 DatastoreAdminOperationJob.kind(),
                                 job_key_name)):
        return None
      DatastoreAdminOperationJob(key_name=job_key_name, parent=operation).put()
      return StartMap(operation, job_name, params['handler_spec'],
                      params['reader_spec'], mapper_params,
                      start_transaction=False)

    job = db.run_in_transaction(tx)
    if job:
      jobs.append(job)
  return jobs
def _read(self, entry):
  """Read entry content.

  Args:
    entry: zip file entry as zipfile.ZipInfo.

  Returns:
    Entry content as string.
  """
  start_time = time.time()
  content = self._zip.read(entry.filename)
  ctx = context.get()
  if ctx:
    operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx)
    operation.counters.Increment(
        COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
  return content
def process(org_app):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  program_key = params['program_key']

  # TODO(SRabbelier): should have been a full url
  url = 'gci/profile/organization/%s' % program_key

  # TODO(SRabbelier): create a MapReduce/Task RequestData
  data = RequestData()
  data.program = GCIProgram.get_by_key_name(program_key)
  data.site = Site.get_by_key_name('site')

  if org_app.status == 'pre-accepted':
    org_app_logic.setStatus(data, org_app, 'accepted', url)
    yield operation.counters.Increment("proposals_accepted")
  elif org_app.status == 'pre-rejected':
    org_app_logic.setStatus(data, org_app, 'rejected', url)
    yield operation.counters.Increment("proposals_rejected")
  else:
    yield operation.counters.Increment("proposals_ignored")
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  logging.info("parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.info("sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.info("writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.info("finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.debug("Sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.debug("Finalizing")
  files.finalize(output_path)
  time.sleep(1)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
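# Hedged sketch of the _compare_keys helper used by the sort calls above; its
# definition is not part of this listing. For the proto-based variants it
# would plausibly be an old-style Python 2 cmp function ordering KeyValue
# protos by key (the tuple-based variant would compare key_record[0] instead).
def _compare_keys(proto1, proto2):
  return cmp(proto1.key(), proto2.key())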
def touch(key):
  # change entity
  app = key.namespace()
  kind = key.kind()
  id = key.id_or_name()
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  matching_app = params['app_to_process']
  if matching_app and matching_app != app:
    return
  metadata_entity = store._GetMetadataEntity(app)
  store.update_entity(app, kind, id, {}, metadata_entity, None,
                      put_function=yield_put, rebuild_facets=True)
def __iter__(self):
  """Iterate over records in file.

  Yields records as strings.
  """
  ctx = context.get()
  while self._reader:
    try:
      start_time = time.time()
      record = self._reader.read()
      if ctx:
        operation.counters.Increment(
            COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
        operation.counters.Increment(COUNTER_IO_READ_BYTES, len(record))(ctx)
      yield record
    except EOFError:
      self._filenames.pop(0)
      if not self._filenames:
        self._reader = None
      else:
        self._reader = records.RecordsReader(
            files.BufferedFile(self._filenames[0]))
def delete(entity):
  params = context.get().mapreduce_spec.mapper.params
  quiz_id = int(params['quiz_id'])
  if entity.quiz.key().id() == quiz_id:
    entity.is_archived = True
    yield operation.db.Delete(entity)
def yield_put(entity):
  f = op.db.Put(entity)
  f(context.get())
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iterations
  doesn't skip records and doesn't read the same record twice.

  Raises:
    Exception: when Files list and offsets do not match.

  Yields:
    The result.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx._shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap of (key, value, input index, reader) tuples.
  readers = []

  # Initialize the heap with one entry per GCS input file.
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(
        cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Merge-read records, accumulating values for the current key until either
  # the key changes or the configured count/size limits are reached.
  current_result = None
  current_count = 0
  current_size = 0
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      current_count += 1
      current_size += len(value)

      should_yield = False
      if current_result:
        if key != current_result[0]:
          should_yield = True
        elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
          # Mark the result as partial before yielding it.
          current_result[2] = True
          should_yield = True
        elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
          current_result[2] = True
          should_yield = True

      if should_yield:
        yield current_result
      if not current_result or should_yield:
        # Start accumulating a new [key, values, partial] result.
        current_result = [key, [], False]
        current_count = 0
        current_size = 0
      current_result[1].append(value)

    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      # Update IO counters.
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Put the freshly read record back into the heap.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      heapq.heappop(readers)

  if current_result:
    yield current_result
def get_mapper_params():
  """Return current mapreduce mapper params. Easily stubbed out for testing."""
  return context.get().mapreduce_spec.mapper.params
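# Minimal sketch of how get_mapper_params() might be stubbed out in a test,
# as its docstring suggests. handlers_module is a hypothetical name for
# whatever module defines the function; the stub avoids needing a live
# mapreduce context during tests.
def stub_mapper_params(handlers_module, params):
  original = handlers_module.get_mapper_params
  handlers_module.get_mapper_params = lambda: params
  return original  # caller restores this when the test finishes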
def __init__(self):
  mapper_params = context.get().mapreduce_spec.mapper.params
  kind_filter = mapper_params.get('kind_filter')
  self.kind_filter = set(kind_filter) if kind_filter else None