def validate(cls, mapper_spec):
    """Check that the pipeline has passed all parameters this OutputWriter needs.

    Args:
      mapper_spec: the MapperSpec carrying the output writer configuration.

    Raises:
      BadWriterParamsError: if the configured writer class is not this class,
        or a required parameter is missing or is not a string.
    """
    if mapper_spec.output_writer_class() != cls:
        raise BadWriterParamsError("Mapper output writer class mismatch")
    params = _get_params(mapper_spec)
    # Each required parameter must be present and already be a string.
    # BUG FIX: the original tested isinstance(str(value), str), which is
    # always True because str() always returns a str — the type check could
    # never fail. Validate the raw value instead.
    for name in (cls.HASHTAG, cls.SESSION_ID, cls.FIELD):
        if name not in params:
            raise BadWriterParamsError("Must specify %s" % name)
        if not isinstance(params[name], str):
            raise BadWriterParamsError("%s should be a string" % name)
def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
    """Build a writer instance from the DB connection params in the spec."""
    writer_params = _get_params(mr_spec.mapper)
    # Forward the five connection settings as keyword arguments; any that
    # are absent from the spec come through as None.
    connection_keys = ('host', 'port', 'database', 'user', 'password')
    kwargs = {key: writer_params.get(key) for key in connection_keys}
    return cls(**kwargs)
def validate(cls, mapper_spec):
    """Verify the writer class identity and its required connection params.

    Raises:
      errors.BadWriterParamsError: on class mismatch, missing parameters,
        or a non-integer port.
    """
    if mapper_spec.output_writer_class() != cls:
        raise errors.BadWriterParamsError("Output writer class mismatch")
    required_params = ["host", "port", "database", "user", "password"]
    params = _get_params(mapper_spec)
    missing = [name for name in required_params if name not in params]
    if missing:
        raise errors.BadWriterParamsError(
            "Output writer requires parameters [{}]".format(
                ', '.join(required_params)))
    if not isinstance(params.get("port"), int):
        raise errors.BadWriterParamsError("Parameter 'port' must be integer.")
def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
    """Class method to create a new OutputWriter from the mapper params."""
    writer_params = _get_params(mr_spec.mapper)
    # Pull the three writer settings in constructor order; missing keys
    # arrive as None, exactly as with individual .get() calls.
    args = [writer_params.get(key)
            for key in (cls.HASHTAG, cls.SESSION_ID, cls.FIELD)]
    return cls(*args)
def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
    """Inherit docs."""
    mapper_spec = mr_spec.mapper
    params = output_writers._get_params(mapper_spec)
    bucket = params.get(cls.BUCKET_NAME_PARAM)
    # One output file per shard, all written by this shard; the shared
    # prefix encodes job name, job id, and this shard's number.
    prefix = "%s/%s/shard-%s-bucket-" % (
        mr_spec.name, mr_spec.mapreduce_id, str(shard_number))
    handles = [
        cloudstorage.open("/%s/%s%d" % (bucket, prefix, i), mode="w")
        for i in range(mapper_spec.shard_count)
    ]
    return cls(handles)
def validate(cls, mapper_spec):
    """Validate the writer class and its required DB connection settings.

    Raises:
      errors.BadWriterParamsError: on class mismatch, a missing parameter,
        or a non-integer port.
    """
    if mapper_spec.output_writer_class() != cls:
        raise errors.BadWriterParamsError("Output writer class mismatch")
    params = _get_params(mapper_spec)
    required_params = ["host", "port", "database", "user", "password"]
    if any(name not in params for name in required_params):
        raise errors.BadWriterParamsError(
            "Output writer requires parameters [{}]".format(
                ', '.join(required_params)))
    if not isinstance(params.get("port"), int):
        raise errors.BadWriterParamsError(
            "Parameter 'port' must be integer.")
def _to_map_job_config(cls, mr_spec,
                       # TODO(user): Remove this parameter after it can be
                       # read from mr_spec.
                       queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during
    execution, despite that it is not saved into datastore.

    Args:
      mr_spec: model.MapreduceSpec.
      queue_name: queue name.

    Returns:
      The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    # 0 means all the old APIs before api_version is introduced.
    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0

    # We can not always convert MapreduceSpec generated by older API
    # to JobConfig. Thus, mr framework should use/expose the returned
    # JobConfig object with caution when a job is started with an old API.
    # In this case, this method only tries not to blow up and assemble a
    # JobConfig object as accurate as possible.
    return cls(
        # _lenient relaxes validation for specs created by older APIs.
        _lenient=old_api,
        job_name=mr_spec.name,
        job_id=mr_spec.mapreduce_id,
        # handler_spec from older API may not have map_job.Mapper type.
        mapper=util.for_name(mapper_spec.handler_spec),
        input_reader_cls=mapper_spec.input_reader_class(),
        input_reader_params=input_readers._get_params(mapper_spec),
        output_writer_cls=mapper_spec.output_writer_class(),
        output_writer_params=output_writers._get_params(mapper_spec),
        shard_count=mapper_spec.shard_count,
        queue_name=queue_name,
        user_params=mr_spec.params.get("user_params"),
        shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
        done_callback_url=mr_spec.params.get("done_callback"),
        _force_writes=mr_spec.params.get("force_writes"),
        # base_path is required; a KeyError here means a malformed spec.
        _base_path=mr_spec.params["base_path"],
        _task_max_attempts=mr_spec.params.get("task_max_attempts"),
        _task_max_data_processing_attempts=(
            mr_spec.params.get("task_max_data_processing_attempts")),
        _hooks_cls=util.for_name(mr_spec.hooks_class_name),
        _app=mr_spec.params.get("app_id"),
        _api_version=api_version)
def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      BadWriterParamsError: when Output writer class mismatch.
    """
    if mapper_spec.output_writer_class() != cls:
        raise errors.BadWriterParamsError("Output writer class mismatch")
    writer_params = output_writers._get_params(mapper_spec)
    # The destination bucket is the only writer-specific requirement.
    if cls.BUCKET_NAME_PARAM in writer_params:
        return
    raise errors.BadWriterParamsError(
        "%s is required for the _HashingGCSOutputWriter"
        % cls.BUCKET_NAME_PARAM)