Example #1
 def testGlobalName(self):
   """Tests when the name has no dots in it."""
   try:
     util.for_name("this_is_a_bad_module_name")
   except ImportError, e:
     self.assertTrue(str(e).startswith(
         "Could not find 'this_is_a_bad_module_name' on path "))
Example #2
    def run(self, job_name, sequence_num, namespace, output, complete_fn,
            mapreduce_pipeline_args):
        results = []
        try:
            iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
            for file_reader in iterator:
                for item in file_reader:
                    # Map/reduce puts reducer output into blobstore files as a
                    # string obtained via "str(result)".  Use AST as a safe
                    # alternative to eval() to get the Python object back.
                    results.append(ast.literal_eval(item))
            if complete_fn:
                util.for_name(complete_fn)(mapreduce_pipeline_args, results)
            with Namespace(namespace):
                db.run_in_transaction(
                    DurableJobEntity._complete_job, job_name, sequence_num,
                    MapReduceJob.build_output(self.root_pipeline_id, results))

        # Don't know what exceptions are currently, or will be in future,
        # thrown from Map/Reduce or Pipeline libraries; these are under
        # active development.
        #
        # pylint: disable=broad-except
        except Exception, ex:
            logging.critical('Failed running map/reduce job %s: %s', job_name,
                             str(ex))
            common_utils.log_exception_origin()
            time_completed = time.time()
            with Namespace(namespace):
                db.run_in_transaction(
                    DurableJobEntity._fail_job, job_name, sequence_num,
                    MapReduceJob.build_output(self.root_pipeline_id, results,
                                              str(ex)))
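The comment above relies on reducer output being written with str(result) and read back with ast.literal_eval. A minimal, self-contained illustration of that round trip (the sample value is made up):

import ast

# A reducer result serialized with str() can be recovered safely with
# ast.literal_eval(); unlike eval(), it only accepts Python literals.
result = {"group": "answers", "count": 3}
serialized = str(result)
assert ast.literal_eval(serialized) == result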
Example #3
 def testBadClass(self):
   """Tests when the class is found but the function name is missing."""
   try:
     util.for_name("__main__.TestHandlerWithArgs.missing")
   except ImportError, e:
     self.assertEquals(
         "Could not find 'missing' on path '__main__.TestHandlerWithArgs'",
         str(e))
Example #4
 def testBadModule(self):
   """Tests when the module name is bogus."""
   try:
     util.for_name("this_is_a_bad_module_name.stuff")
   except ImportError, e:
     self.assertEquals(
         "Could not find 'stuff' on path 'this_is_a_bad_module_name'",
         str(e))
Example #5
 def testBadFunction(self):
   """Tests when the module name is good but the function is missing."""
   try:
     util.for_name("__main__.does_not_exist")
   except ImportError, e:
     self.assertEquals(
         "Could not find 'does_not_exist' on path '__main__'",
         str(e))
Example #6
    def validate(cls, mapper_spec):
        super(DjangoModelInputReader, cls).validate(mapper_spec)

        params = _get_params(mapper_spec)

        if cls.NAMESPACE_PARAM in params:
            raise BadReaderParamsError("Namespaces are not supported.")

        entity_kind_name = params[cls.ENTITY_KIND_PARAM]
        try:
            util.for_name(entity_kind_name)
        except ImportError, e:
            raise BadReaderParamsError("Bad entity kind: %s" % e)
Example #7
  def _to_map_job_config(cls,
                         mr_spec,
                         # TODO(user): Remove this parameter after it can be
                         # read from mr_spec.
                         queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during execution,
    even though it is not saved to the datastore.

    Args:
      mr_spec: model.MapreduceSpec.
      queue_name: queue name.

    Returns:
      The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    # 0 means all the old APIs before api_version is introduced.
    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0
    # We can not always convert MapreduceSpec generated by older API
    # to JobConfig. Thus, mr framework should use/expose the returned JobConfig
    # object with caution when a job is started with an old API.
    # In this case, this method only tries not to blow up and assemble a
    # JobConfig object as accurate as possible.
    return cls(_lenient=old_api,
               job_name=mr_spec.name,
               job_id=mr_spec.mapreduce_id,
               # handler_spec from older API may not have map_job.Mapper type.
               mapper=util.for_name(mapper_spec.handler_spec),
               input_reader_cls=mapper_spec.input_reader_class(),
               input_reader_params=input_readers._get_params(mapper_spec),
               output_writer_cls=mapper_spec.output_writer_class(),
               output_writer_params=output_writers._get_params(mapper_spec),
               shard_count=mapper_spec.shard_count,
               queue_name=queue_name,
               user_params=mr_spec.params.get("user_params"),
               shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
               done_callback_url=mr_spec.params.get("done_callback"),
               _force_writes=mr_spec.params.get("force_writes"),
               _base_path=mr_spec.params["base_path"],
               _task_max_attempts=mr_spec.params.get("task_max_attempts"),
               _task_max_data_processing_attempts=(
                   mr_spec.params.get("task_max_data_processing_attempts")),
               _hooks_cls=util.for_name(mr_spec.hooks_class_name),
               _app=mr_spec.params.get("app_id"),
               _api_version=api_version)
Example #8
  def output_writer_class(self):
    """Get output writer class.

    Returns:
      output writer class object.
    """
    return self.output_writer_spec and util.for_name(self.output_writer_spec)
Example #9
  def _get_params(self, validator_parameter, name_prefix):
    """Retrieves additional user-supplied params for the job and validates them.

    Args:
      validator_parameter: name of the request parameter which supplies
        validator for this parameter set.
      name_prefix: common prefix for all parameter names in the request.

    Raises:
      Any exception raised by the 'params_validator' request parameter if
      the params fail to validate.
    """
    params_validator = self.request.get(validator_parameter)

    user_params = {}
    for key in self.request.arguments():
      if key.startswith(name_prefix):
        values = self.request.get_all(key)
        adjusted_key = key[len(name_prefix):]
        if len(values) == 1:
          user_params[adjusted_key] = values[0]
        else:
          user_params[adjusted_key] = values

    if params_validator:
      resolved_validator = util.for_name(params_validator)
      resolved_validator(user_params)

    return user_params
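The validator resolved through util.for_name above is just a callable that receives the collected user_params dict and raises if validation fails. An illustrative, made-up validator:

# Hypothetical function the 'params_validator' request parameter could
# name; it receives the user_params dict built above and raises on
# missing or malformed values.
def validate_export_params(user_params):
    if "output_bucket" not in user_params:
        raise ValueError("output_bucket is required")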
Example #10
  def input_reader_class(self):
    """Get input reader class.

    Returns:
      input reader class object.
    """
    return util.for_name(self.input_reader_spec)
Example #11
def handler_for_name(fq_name):
    """Resolves and instantiates handler by fully qualified name.

    NOTE: This is a clone of a function in the map/reduce module which has
    also been taught that map and reduce functions may be marked with
    @classmethod, as opposed to only member functions of default-constructable
    classes or @staticmethod.  It is applied as a monkey-patch to fix the base
    library.

    First resolves the name using for_name call. Then if it resolves to a
    class, instantiates a class, if it resolves to a method - instantiates the
    class and binds method to the instance.

    Args:
      fq_name: fully qualified name of something to find.

    Returns:
      handler instance which is ready to be called.

    """
    resolved_name = mapreduce_util.for_name(fq_name)
    if isinstance(resolved_name, (type, types.ClassType)):
        # create new instance if this is type
        return resolved_name()
    elif (isinstance(resolved_name, types.MethodType) and
          resolved_name.im_self is None):
        # bind the method
        return getattr(resolved_name.im_class(), resolved_name.__name__)
    else:
        # Already bound -- classmethod or staticmethod.
        return resolved_name
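A short usage sketch of the three resolution cases the docstring describes (class, unbound instance method, classmethod); the handler class and module path below are hypothetical:

# Hypothetical handler used only to illustrate handler_for_name():
class FrobHandler(object):
    def handle(self, item):           # plain instance method
        return item

    @classmethod
    def handle_cls(cls, item):        # classmethod, already bound to the class
        return item

# Assuming FrobHandler lives in a module named my_handlers:
# handler_for_name('my_handlers.FrobHandler')            -> new FrobHandler instance
# handler_for_name('my_handlers.FrobHandler.handle')     -> handle bound to a fresh instance
# handler_for_name('my_handlers.FrobHandler.handle_cls') -> the classmethod, returned as-is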
Example #12
 def run(self,
         job_name,
         mapper_spec,
         shuffler_spec,
         reducer_spec,
         input_reader_spec,
         output_writer_spec=None,
         mapper_params=None,
         shuffler_params=None,
         reducer_params=None,
         shards=None,
         combiner_spec=None):
   map_pipeline = yield MapPipeline(job_name,
                                    mapper_spec,
                                    input_reader_spec,
                                    params=mapper_params,
                                    shards=shards)
   shuffler_pipeline = yield util.for_name(shuffler_spec)(job_name, shuffler_params, map_pipeline)
   reducer_pipeline = yield mapreduce_pipeline.ReducePipeline(
       job_name,
       reducer_spec,
       output_writer_spec,
       reducer_params,
       shuffler_pipeline,
       combiner_spec=combiner_spec)
   with pipeline.After(reducer_pipeline):
     all_temp_files = yield pipeline_common.Extend(
         map_pipeline, shuffler_pipeline)
     yield mapper_pipeline._CleanupPipeline(all_temp_files)
   yield pipeline_common.Return(reducer_pipeline)
Example #13
  def __iter__(self):
    """Create a generator for model instances for entities.

    Iterating through entities moves query range past the consumed entities.

    Yields:
      next model instance.
    """
    while True:
      if self._current_key_range is None:
        break

      while True:
        query = self._current_key_range.make_ascending_query(
            util.for_name(self._entity_kind))
        results = query.fetch(limit=self._batch_size)

        if not results:
          self._advance_key_range()
          break

        for model_instance in results:
          key = model_instance.key()

          self._current_key_range.advance(key)
          yield model_instance
Example #14
    def __iter__(self):
        """Create a generator for entities or keys in the range.

    Iterating through entries moves query range past the consumed entries.

    Yields:
      next entry.
    """
        while True:
            entries_query = self._key_range.make_ascending_query(
                util.for_name(self._entity_kind), self._keys_only)
            entries_list = entries_query.fetch(limit=self.batch_size)

            if not entries_list:
                return

            for entry in entries_list:
                if hasattr(entry, 'key'):
                    key = entry.key()
                else:
                    key = entry

                self._key_range = key_range.KeyRange(
                    key, self._key_range.key_end, self._key_range.direction,
                    False, self._key_range.include_end)
                yield entry
Example #15
    def __iter__(self):
        k_range = self._key_range

        # Namespaces are not supported by djangoappengine
        if k_range.namespace:
            return

        model_class = util.for_name(self._query_spec.model_class_path)

        q = model_class.objects.all()

        if k_range.key_start:
            if k_range.include_start:
                q = q.filter(pk__gte=k_range.key_start.id_or_name())
            else:
                q = q.filter(pk__gt=k_range.key_start.id_or_name())

        if k_range.key_end:
            if k_range.include_end:
                q = q.filter(pk__lte=k_range.key_end.id_or_name())
            else:
                q = q.filter(pk__lt=k_range.key_end.id_or_name())

        q = q.order_by('pk')

        q = set_config(q, batch_size=self._query_spec.batch_size)

        if self._cursor:
            q = set_cursor(q, self._cursor)

        self._query = q

        for entity in self._query.iterator():
            yield entity
Example #16
  def input_reader_class(self):
    """Get input reader class.

    Returns:
      input reader class object.
    """
    return util.for_name(self.input_reader_spec)
Example #17
  def output_writer_class(self):
    """Get output writer class.

    Returns:
      output writer class object.
    """
    return self.output_writer_spec and util.for_name(self.output_writer_spec)
Example #18
    def __iter__(self):
        k_range = self._key_range

        # Namespaces are not supported by djangoappengine
        if k_range.namespace:
            return

        model_class = util.for_name(self._query_spec.model_class_path)

        q = model_class.objects.all()

        if k_range.key_start:
            if k_range.include_start:
                q = q.filter(pk__gte=k_range.key_start.id_or_name())
            else:
                q = q.filter(pk__gt=k_range.key_start.id_or_name())

        if k_range.key_end:
            if k_range.include_end:
                q = q.filter(pk__lte=k_range.key_end.id_or_name())
            else:
                q = q.filter(pk__lt=k_range.key_end.id_or_name())

        q = q.order_by('pk')

        q = set_config(q, batch_size=self._query_spec.batch_size)

        if self._cursor:
            q = set_cursor(q, self._cursor)

        self._query = q

        for entity in self._query.iterator():
            yield entity
Example #19
    def run(self, job_id, job_class_str, kwargs):
        # Disabling 4 space indentation checker for this docstring because this
        # "Yields:" section yields 2 objects and the Yields/Returns are
        # generally supposed to only yield 1 object which messes up the
        # indentation checking. This is the only case of this happening.
        """Returns a coroutine which runs the job pipeline and stores results.

        Args:
            job_id: str. The ID of the job to run.
            job_class_str: str. Should uniquely identify each type of job.
            kwargs: dict(str : object). Extra arguments used to build the
                MapreducePipeline.

        Yields:
            MapreducePipeline. Ready to start processing. Expects the output of
            that pipeline to be sent back.
            StoreMapReduceResults. Will be constructed with whatever output the
            caller sends back to the coroutine.
        """

        job_class = mapreduce_util.for_name(job_class_str)
        job_class.register_start(
            job_id,
            metadata={
                job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id  # pylint: disable=protected-access
            })

        # TODO(sll): Need try/except/mark-as-canceled here?
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_id, job_class_str, output)
Example #20
  def __iter__(self):
    """Create a generator for entities or keys in the range.

    Iterating through entries moves query range past the consumed entries.

    Yields:
      next entry.
    """
    while True:
      entries_query = self._key_range.make_ascending_query(
          util.for_name(self._entity_kind), self._keys_only)
      entries_list = entries_query.fetch(limit=self.batch_size)

      if not entries_list:
        return

      for entry in entries_list:
        if hasattr(entry, 'key'):
          key = entry.key()
        else:
          key = entry

        self._key_range = key_range.KeyRange(key,
                                             self._key_range.key_end,
                                             self._key_range.direction,
                                             False,
                                             self._key_range.include_end)
        yield entry
Example #21
  def _get_params(self, validator_parameter, name_prefix):
    """Retrieves additional user-supplied params for the job and validates them.

    Args:
      validator_parameter: name of the request parameter which supplies
        validator for this parameter set.
      name_prefix: common prefix for all parameter names in the request.

    Raises:
      Any exception raised by the 'params_validator' request parameter if
      the params fail to validate.
    """
    params_validator = self.request.get(validator_parameter)

    user_params = {}
    for key in self.request.arguments():
      if key.startswith(name_prefix):
        values = self.request.get_all(key)
        adjusted_key = key[len(name_prefix):]
        if len(values) == 1:
          user_params[adjusted_key] = values[0]
        else:
          user_params[adjusted_key] = values

    if params_validator:
      resolved_validator = util.for_name(params_validator)
      resolved_validator(user_params)

    return user_params
Example #22
  def split_input(cls, mapper_spec):
    """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less than requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It requires
    query not to specify any __key__-based ordering. If an index for
    query.order('-__key__') query is not present, an inaccurate guess at
    sharding will be made by splitting the full key range.

    Args:
      mapper_spec: MapperSpec with params containing 'entity_kind'.
        May also have 'batch_size' in the params to specify the number
        of entities to process in each batch.

    Returns:
      A list of InputReader objects of length <= number_of_shards. These
      may be DatastoreInputReader or DatastoreKeyInputReader objects.

    Raises:
      BadReaderParamsError: required parameters are missing or invalid.
    """
    if mapper_spec.input_reader_class() != cls:
      raise BadReaderParamsError("Input reader class mismatch")
    params = mapper_spec.params
    if cls.ENTITY_KIND_PARAM not in params:
      raise BadReaderParamsError("Missing mapper parameter 'entity_kind'")

    entity_kind_name = params[cls.ENTITY_KIND_PARAM]
    shard_count = mapper_spec.shard_count
    app = params.get(cls._APP_PARAM)
    # keys_only remains for backwards compatibility. It may go away.
    keys_only = util.parse_bool(params.get(cls.KEYS_ONLY_PARAM, False))

    if keys_only:
      raise BadReaderParamsError("The keys_only parameter is obsolete. "
                                 "Use DatastoreKeyInputReader instead.")

    # Fail fast if Model cannot be located.
    util.for_name(entity_kind_name)

    return cls._split_input_from_params(
        app, entity_kind_name, params, shard_count)
Example #23
 def _get_raw_entity_kind(cls, model_classpath):
   entity_type = util.for_name(model_classpath)
   if isinstance(entity_type, db.Model):
     return entity_type.kind()
   elif isinstance(entity_type, (ndb.Model, ndb.MetaModel)):
     # pylint: disable=protected-access
     return entity_type._get_kind()
   else:
     return util.get_short_name(model_classpath)
Example #24
 def _get_raw_entity_kind(cls, model_classpath):
     entity_type = util.for_name(model_classpath)
     if isinstance(entity_type, db.Model):
         return entity_type.kind()
     elif isinstance(entity_type, (ndb.Model, ndb.MetaModel)):
         # pylint: disable=protected-access
         return entity_type._get_kind()
     else:
         return util.get_short_name(model_classpath)
Example #25
    def run(self, job_id, job_class_str, kwargs):
        job_class = mapreduce_util.for_name(job_class_str)
        job_class.register_start(job_id, metadata={
            job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id
        })

        # TODO(sll): Need try/except/mark-as-canceled here?
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_id, job_class_str, output)
Example #26
File: jobs.py Project: oulan/oppia
    def run(self, job_id, job_class_str, kwargs):
        job_class = mapreduce_util.for_name(job_class_str)
        job_class.register_start(job_id, metadata={
            job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id
        })

        # TODO(sll): Need try/except/mark-as-canceled here?
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_id, job_class_str, output)
Example #27
    def split_input(cls, mapper_spec):
        """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less than requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It requires
    query not to specify any __key__-based ordering. If an index for
    query.order('-__key__') query is not present, an inaccurate guess at
    sharding will be made by splitting the full key range.

    Args:
      mapper_spec: MapperSpec with params containing 'entity_kind'.
        May also have 'batch_size' in the params to specify the number
        of entities to process in each batch.

    Returns:
      A list of DatastoreInputReader objects of length <= number_of_shards.

    Raises:
      BadReaderParamsError if required parameters are missing or invalid.
    """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Input reader class mismatch")
        params = mapper_spec.params
        if "entity_kind" not in params:
            raise BadReaderParamsError(
                "Missing mapper parameter 'entity_kind'")

        entity_kind_name = params["entity_kind"]
        entity_kind = util.for_name(entity_kind_name)
        shard_count = mapper_spec.shard_count
        batch_size = int(params.get("batch_size", cls._BATCH_SIZE))
        keys_only = int(params.get("keys_only", False))

        ds_query = entity_kind.all()._get_query()
        ds_query.Order("__key__")
        first_entity = ds_query.Get(1)
        if not first_entity:
            return []
        else:
            first_entity_key = first_entity[0].key()

        ds_query.Order(("__key__", datastore.Query.DESCENDING))
        try:
            last_entity = ds_query.Get(1)
            last_entity_key = last_entity[0].key()
        except db.NeedIndexError, e:
            logging.warning(
                "Cannot create accurate approximation of keyspace, "
                "guessing instead. Please address this problem: %s", e)
            last_entity_key = key_range.KeyRange.guess_end_key(
                entity_kind.kind(), first_entity_key)
Example #28
 def validate(cls, job_config):
   """Inherit docs."""
   super(ModelDatastoreInputReader, cls).validate(job_config)
   params = job_config.input_reader_params
   entity_kind = params[cls.ENTITY_KIND_PARAM]
   # Fail fast if Model cannot be located.
   try:
     model_class = util.for_name(entity_kind)
   except ImportError, e:
     raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
Example #29
 def validate(cls, job_config):
     """Inherit docs."""
     super(ModelDatastoreInputReader, cls).validate(job_config)
     params = job_config.input_reader_params
     entity_kind = params[cls.ENTITY_KIND_PARAM]
     # Fail fast if Model cannot be located.
     try:
         model_class = util.for_name(entity_kind)
     except ImportError, e:
         raise errors.BadReaderParamsError("Bad entity kind: %s" % e)
Example #30
  def split_input(cls, mapper_spec):
    """Splits query into shards without fetching query results.

    Tries as best as it can to split the whole query result set into equal
    shards. Due to difficulty of making the perfect split, resulting shards'
    sizes might differ significantly from each other. The actual number of
    shards might also be less than requested (even 1), though it is never
    greater.

    Current implementation does key-lexicographic order splitting. It requires
    query not to specify any __key__-based ordering. If an index for
    query.order('-__key__') query is not present, an inaccurate guess at
    sharding will be made by splitting the full key range.

    Args:
      mapper_spec: MapperSpec with params containing 'entity_kind'.
        May also have 'batch_size' in the params to specify the number
        of entities to process in each batch.

    Returns:
      A list of DatastoreInputReader objects of length <= number_of_shards.

    Raises:
      BadReaderParamsError if required parameters are missing or invalid.
    """
    if mapper_spec.input_reader_class() != cls:
      raise BadReaderParamsError("Input reader class mismatch")
    params = mapper_spec.params
    if "entity_kind" not in params:
      raise BadReaderParamsError("Missing mapper parameter 'entity_kind'")

    entity_kind_name = params["entity_kind"]
    entity_kind = util.for_name(entity_kind_name)
    shard_count = mapper_spec.shard_count
    batch_size = int(params.get("batch_size", cls._BATCH_SIZE))
    keys_only = int(params.get("keys_only", False))

    ds_query = entity_kind.all()._get_query()
    ds_query.Order("__key__")
    first_entity = ds_query.Get(1)
    if not first_entity:
      return []
    else:
      first_entity_key = first_entity[0].key()

    ds_query.Order(("__key__", datastore.Query.DESCENDING))
    try:
      last_entity = ds_query.Get(1)
      last_entity_key = last_entity[0].key()
    except db.NeedIndexError, e:
      logging.warning("Cannot create accurate approximation of keyspace, "
                      "guessing instead. Please address this problem: %s", e)
      last_entity_key = key_range.KeyRange.guess_end_key(
          entity_kind.kind(), first_entity_key)
Example #31
  def run(self, key, values):
    if not self._combiner:
      ctx = context.get()
      params = ctx.mapreduce_spec.mapper.params
      combine_spec = params.get(_CombinePipeline.COMBINE_SPEC_PARAM)
      self._combiner = util.for_name(combine_spec)

    for combined_value in self._combiner(key, values, []):
      proto = file_service_pb.KeyValue()
      proto.set_key(key)
      proto.set_value(combined_value)
      yield proto.Encode()
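The combiner resolved via util.for_name above is called as combiner(key, values, previously_combined_values) and is expected to yield combined values. A made-up combiner that sums numeric string values:

# Illustrative combiner matching the call above; 'previously_combined_values'
# holds values combined in earlier passes.
def sum_combiner(key, values, previously_combined_values):
    total = sum(int(v) for v in values)
    total += sum(int(v) for v in previously_combined_values)
    yield str(total)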
Example #32
    def validate(cls, mapper_spec):
        """Validates mapper spec and all mapper parameters.

    Args:
      mapper_spec: The MapperSpec for this InputReader.

    Raises:
      BadReaderParamsError: required parameters are missing or invalid.
    """
        cls._common_validate(mapper_spec)
        params = mapper_spec.params
        keys_only = util.parse_bool(params.get(cls.KEYS_ONLY_PARAM, False))
        if keys_only:
            raise BadReaderParamsError("The keys_only parameter is obsolete. "
                                       "Use DatastoreKeyInputReader instead.")

        entity_kind_name = params[cls.ENTITY_KIND_PARAM]
        # Fail fast if Model cannot be located.
        try:
            util.for_name(entity_kind_name)
        except ImportError, e:
            raise BadReaderParamsError("Bad entity kind: %s" % e)
Example #33
  def validate(cls, mapper_spec):
    """Validates mapper spec and all mapper parameters.

    Args:
      mapper_spec: The MapperSpec for this InputReader.

    Raises:
      BadReaderParamsError: required parameters are missing or invalid.
    """
    cls._common_validate(mapper_spec)
    params = mapper_spec.params
    keys_only = util.parse_bool(params.get(cls.KEYS_ONLY_PARAM, False))
    if keys_only:
      raise BadReaderParamsError("The keys_only parameter is obsolete. "
                                 "Use DatastoreKeyInputReader instead.")

    entity_kind_name = params[cls.ENTITY_KIND_PARAM]
    # Fail fast if Model cannot be located.
    try:
      util.for_name(entity_kind_name)
    except ImportError, e:
      raise BadReaderParamsError("Bad entity kind: %s" % e)
Example #34
  def get_hooks(self):
    """Returns a hooks.Hooks class or None if no hooks class has been set."""
    if self.__hooks is None and self.hooks_class_name is not None:
      hooks_class = util.for_name(self.hooks_class_name)
      if not isinstance(hooks_class, type):
        raise ValueError("hooks_class_name must refer to a class, got %s" %
                         type(hooks_class).__name__)
      if not issubclass(hooks_class, hooks.Hooks):
        raise ValueError(
            "hooks_class_name must refer to a hooks.Hooks subclass")
      self.__hooks = hooks_class(self)

    return self.__hooks
Example #35
  def get_hooks(self):
    """Returns a hooks.Hooks class or None if no hooks class has been set."""
    if self.__hooks is None and self.hooks_class_name is not None:
      hooks_class = util.for_name(self.hooks_class_name)
      if not isinstance(hooks_class, type):
        raise ValueError("hooks_class_name must refer to a class, got %s" %
                         type(hooks_class).__name__)
      if not issubclass(hooks_class, hooks.Hooks):
        raise ValueError(
            "hooks_class_name must refer to a hooks.Hooks subclass")
      self.__hooks = hooks_class(self)

    return self.__hooks
Example #36
    def __init__(self, filters, model_class_path):
        """Init.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type should satisfy the property's type.
      model_class_path: full path to the model class in str.
    """
        self.filters = filters
        self.model_class_path = model_class_path
        self.model_class = util.for_name(self.model_class_path)
        self.prop, self.start, self.end = self._get_range_from_filters(
            self.filters, self.model_class)
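The filters argument described in the docstring is a list of (property_name, operator, value) tuples. A hypothetical value (the model path, property name, and owning class name are assumptions, not taken from the source):

import datetime

# Hypothetical filters in the format the docstring describes.
filters = [("created", ">=", datetime.date(2020, 1, 1)),
           ("created", "<", datetime.date(2021, 1, 1))]
# These would be passed together with the dotted model path, e.g.
# SomeRangeClass(filters, "my_app.models.GreetingModel")  # names are made up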
Example #37
File: jobs.py Project: oppia/oppia
    def run(self, job_id, job_class_str, output):
        job_class = mapreduce_util.for_name(job_class_str)

        try:
            iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
            results_list = []
            for item_reader in iterator:
                for item in item_reader:
                    results_list.append(json.loads(item))
            job_class.register_completion(job_id, results_list)
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.error("Job %s failed at %s" % (job_id, utils.get_current_time_in_millisecs()))
            job_class.register_failure(job_id, "%s\n%s" % (unicode(e), traceback.format_exc()))
Example #38
 def run(self, job_name, sequence_num, time_started, namespace, output,
         complete_fn, kwargs):
     results = []
     # TODO(mgainer): Notice errors earlier in pipeline, and mark job
     # as failed in that case as well.
     try:
         iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
         for file_reader in iterator:
             for item in file_reader:
                 # Map/reduce puts reducer output into blobstore files as a
                 # string obtained via "str(result)".  Use AST as a safe
                 # alternative to eval() to get the Python object back.
                 results.append(ast.literal_eval(item))
         if complete_fn:
             util.for_name(complete_fn)(kwargs, results)
         time_completed = time.time()
         with Namespace(namespace):
             db.run_in_transaction(
                 DurableJobEntity._complete_job, job_name, sequence_num,
                 MapReduceJob.build_output(self.root_pipeline_id, results),
                 long(time_completed - time_started))
     # Don't know what exceptions are currently, or will be in future,
     # thrown from Map/Reduce or Pipeline libraries; these are under
     # active development.
     #
     # pylint: disable=broad-except
     except Exception, ex:
         logging.critical('Failed running map/reduce job %s: %s', job_name,
                          str(ex))
         common_utils.log_exception_origin()
         time_completed = time.time()
         with Namespace(namespace):
             db.run_in_transaction(
                 DurableJobEntity._fail_job, job_name, sequence_num,
                 MapReduceJob.build_output(self.root_pipeline_id, results,
                                           str(ex)),
                 long(time_completed - time_started))
Example #39
  def get_handler(self):
    """Get mapper handler instance.

    Returns:
      cached handler instance as callable.
    """
    if self.__handler is None:
      resolved_spec = util.for_name(self.handler_spec)
      if isinstance(resolved_spec, type):
        self.__handler = resolved_spec()
      elif isinstance(resolved_spec, types.MethodType):
        self.__handler = getattr(resolved_spec.im_class(),
                                 resolved_spec.__name__)
      else:
        self.__handler = resolved_spec
    return self.__handler
Example #40
File: jobs.py Project: yarinf/oppia
    def run(self, job_id, job_class_str, output):
        job_class = mapreduce_util.for_name(job_class_str)

        try:
            iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
            results_list = []
            for item_reader in iterator:
                for item in item_reader:
                    results_list.append(json.loads(item))
            job_class.register_completion(job_id, results_list)
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.error('Job %s failed at %s' %
                          (job_id, utils.get_current_time_in_millisecs()))
            job_class.register_failure(
                job_id, '%s\n%s' % (unicode(e), traceback.format_exc()))
Example #41
  def __init__(self,
               filters,
               model_class_path):
    """Init.

    Args:
      filters: user supplied filters. Each filter should be a list or tuple of
        format (<property_name_as_str>, <query_operator_as_str>,
        <value_of_certain_type>). Value type should satisfy the property's type.
      model_class_path: full path to the model class in str.
    """
    self.filters = filters
    self.model_class_path = model_class_path
    self.model_class = util.for_name(self.model_class_path)
    self.prop, self.start, self.end = self._get_range_from_filters(
        self.filters, self.model_class)
Example #42
  def _iter_key_range(self, k_range):
    cursor = None
    while True:
      query = k_range.make_ascending_query(
          util.for_name(self._entity_kind))
      if cursor:
        query.with_cursor(cursor)

      results = query.fetch(limit=self._batch_size)
      if not results:
        break

      for model_instance in results:
        key = model_instance.key()
        yield key, model_instance
      cursor = query.cursor()
Example #43
    def run(self, job_id, job_class_str, output):
        job_class = mapreduce_util.for_name(job_class_str)

        try:
            iterator = input_readers.RecordsReader(output, 0)
            results_list = []
            for item in iterator:
                # Map/reduce puts reducer output into blobstore files as a
                # string obtained via "str(result)".  Use AST as a safe
                # alternative to eval() to get the Python object back.
                results_list.append(ast.literal_eval(item))
            job_class.register_completion(job_id, results_list)
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.error('Job %s failed at %s' %
                          (job_id, utils.get_current_time_in_millisecs()))
            job_class.register_failure(
                job_id, '%s\n%s' % (unicode(e), traceback.format_exc()))
Example #44
    def _split_input_from_namespace(cls, app, namespace, entity_kind_name,
                                    shard_count):
        entity_kind = util.for_name(entity_kind_name)
        entity_kind_name = entity_kind.kind()

        hex_key_start = db.Key.from_path(entity_kind_name, 0)
        hex_key_end = db.Key.from_path(entity_kind_name, int('f' * 40,
                                                             base=16))
        hex_range = key_range.KeyRange(hex_key_start,
                                       hex_key_end,
                                       None,
                                       True,
                                       True,
                                       namespace=namespace,
                                       _app=app)

        key_range_list = [hex_range]
        number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
        for index in xrange(0, number_of_half_splits):
            new_ranges = []
            for current_range in key_range_list:
                new_ranges.extend(current_range.split_range(1))
            key_range_list = new_ranges

        adjusted_range_list = []
        for current_range in key_range_list:
            adjusted_range = key_range.KeyRange(
                key_start=db.Key.from_path(current_range.key_start.kind(),
                                           'hash_%040x' %
                                           (current_range.key_start.id() or 0),
                                           _app=current_range._app),
                key_end=db.Key.from_path(current_range.key_end.kind(),
                                         'hash_%040x' %
                                         (current_range.key_end.id() or 0),
                                         _app=current_range._app),
                direction=current_range.direction,
                include_start=current_range.include_start,
                include_end=current_range.include_end,
                namespace=current_range.namespace,
                _app=current_range._app)

            adjusted_range_list.append(adjusted_range)

        return adjusted_range_list
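Each pass of the loop above splits every current range once, so the number of ranges roughly doubles per pass and the pass count is floor(log2(shard_count)). A toy sketch of that growth, with integer pairs standing in for KeyRange objects:

# Toy demonstration of the halving loop's growth (not the real KeyRange API).
ranges = [(0, 256)]
number_of_half_splits = 3            # what the formula gives for shard_count = 8
for _ in range(number_of_half_splits):
    new_ranges = []
    for low, high in ranges:
        mid = (low + high) // 2
        new_ranges.extend([(low, mid), (mid, high)])
    ranges = new_ranges
assert len(ranges) == 8              # 2 ** 3 ranges, one per requested shard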
Example #45
  def get_handler(self):
    """Get mapper handler instance.

    Returns:
      cached handler instance as callable.
    """
    if self.__handler is None:
      logging.warn(self.handler_spec)
      resolved_spec = util.for_name(self.handler_spec)
      if isinstance(resolved_spec, type):
        # create new instance if this is type
        self.__handler = resolved_spec()
      elif isinstance(resolved_spec, types.MethodType):
        # bind the method
        self.__handler = getattr(resolved_spec.im_class(),
                                 resolved_spec.__name__)
      else:
        self.__handler = resolved_spec
    return self.__handler
Example #46
  def __iter__(self):
    self._query = self._key_range.make_ascending_query(
        util.for_name(self._query_spec.model_class_path),
        filters=self._query_spec.filters)

    if isinstance(self._query, db.Query):
      if self._cursor:
        self._query.with_cursor(self._cursor)
      for model_instance in self._query.run(
          batch_size=self._query_spec.batch_size,
          keys_only=self._query_spec.keys_only):
        yield model_instance
    else:
      self._query = self._query.iter(batch_size=self._query_spec.batch_size,
                                     keys_only=self._query_spec.keys_only,
                                     start_cursor=self._cursor,
                                     produce_cursors=True)
      for model_instance in self._query:
        yield model_instance
Example #47
    def get_handler(self):
        """Get mapper handler instance.

    Returns:
      cached handler instance as callable.
    """
        if self.__handler is None:
            logging.warn(self.handler_spec)
            resolved_spec = util.for_name(self.handler_spec)
            if isinstance(resolved_spec, type):
                # create new instance if this is type
                self.__handler = resolved_spec()
            elif isinstance(resolved_spec, types.MethodType):
                # bind the method
                self.__handler = getattr(resolved_spec.im_class(),
                                         resolved_spec.__name__)
            else:
                self.__handler = resolved_spec
        return self.__handler
Example #48
  def __iter__(self):
    self._query = self._key_range.make_ascending_query(
        util.for_name(self._query_spec.model_class_path),
        filters=self._query_spec.filters)

    if isinstance(self._query, db.Query):
      if self._cursor:
        self._query.with_cursor(self._cursor)
      for model_instance in self._query.run(
          batch_size=self._query_spec.batch_size,
          keys_only=self._query_spec.keys_only):
        yield model_instance
    else:
      self._query = self._query.iter(batch_size=self._query_spec.batch_size,
                                     keys_only=self._query_spec.keys_only,
                                     start_cursor=self._cursor,
                                     produce_cursors=True)
      for model_instance in self._query:
        yield model_instance
Example #49
  def _split_input_from_namespace(
      cls, app, namespace, entity_kind_name, shard_count):
    entity_kind = util.for_name(entity_kind_name)
    entity_kind_name = entity_kind.kind()

    hex_key_start = db.Key.from_path(
        entity_kind_name, 0)
    hex_key_end = db.Key.from_path(
        entity_kind_name, int('f' * 40, base=16))
    hex_range = key_range.KeyRange(
        hex_key_start, hex_key_end, None, True, True,
        namespace=namespace,
        _app=app)

    key_range_list = [hex_range]
    number_of_half_splits = int(math.floor(math.log(shard_count, 2)))
    for index in xrange(0, number_of_half_splits):
      new_ranges = []
      for current_range in key_range_list:
        new_ranges.extend(current_range.split_range(1))
      key_range_list = new_ranges

    adjusted_range_list = []
    for current_range in key_range_list:
      adjusted_range = key_range.KeyRange(
          key_start=db.Key.from_path(
              current_range.key_start.kind(),
              'hash_%040x' % (current_range.key_start.id() or 0),
              _app=current_range._app),
          key_end=db.Key.from_path(
              current_range.key_end.kind(),
              'hash_%040x' % (current_range.key_end.id() or 0),
              _app=current_range._app),
          direction=current_range.direction,
          include_start=current_range.include_start,
          include_end=current_range.include_end,
          namespace=current_range.namespace,
          _app=current_range._app)

      adjusted_range_list.append(adjusted_range)

    return adjusted_range_list
Example #50
    def __iter__(self):
        """Create a generator for model instances for entities.

    Iterating through entities moves query range past the consumed entities.

    Yields:
      next model instance.
    """
        while True:
            query = self._key_range.make_ascending_query(
                util.for_name(self._entity_kind))
            results = query.fetch(limit=self._batch_size)

            if not results:
                break

            for model_instance in results:
                key = model_instance.key()

                self._key_range.advance(key)
                yield model_instance
Example #51
    def run(self, job_id, job_class_str, output):
        """Extracts the results of a MR job and registers its completion.

        Args:
            job_id: str. The ID of the job to run.
            job_class_str: str. Should uniquely identify each type of job.
            output: str. The output produced by the job.
        """
        job_class = mapreduce_util.for_name(job_class_str)

        try:
            iterator = input_readers.GoogleCloudStorageInputReader(output, 0)
            results_list = []
            for item_reader in iterator:
                for item in item_reader:
                    results_list.append(json.loads(item))
            job_class.register_completion(job_id, results_list)
        except Exception as e:
            logging.exception('Job %s failed at %s' %
                              (job_id, utils.get_current_time_in_millisecs()))
            job_class.register_failure(
                job_id,
                '%s\n%s' % (python_utils.UNICODE(e), traceback.format_exc()))
Example #52
 def _get_raw_entity_kind(cls, entity_kind):
     """Returns an datastore entity kind from a Django model."""
     model_class = util.for_name(entity_kind)
     return model_class._meta.db_table
Example #53
  def _to_map_job_config(cls,
                         mr_spec,
                         # TODO(user): Remove this parameter after it can be
                         # read from mr_spec.
                         queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during execution,
    even though it is not saved to the datastore.

    Args:
      mr_spec: model.MapreduceSpec.
      queue_name: queue name.

    Returns:
      The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    # 0 means all the old APIs before api_version is introduced.
    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0

    # Deserialize params from json if input_reader/output_writer are new API.
    input_reader_cls = mapper_spec.input_reader_class()
    input_reader_params = input_readers._get_params(mapper_spec)
    if issubclass(input_reader_cls, input_reader.InputReader):
      input_reader_params = input_reader_cls.params_from_json(
          input_reader_params)

    output_writer_cls = mapper_spec.output_writer_class()
    output_writer_params = output_writers._get_params(mapper_spec)
    # TODO(user): Call json (de)serialization for writer.
    # if (output_writer_cls and
    #     issubclass(output_writer_cls, output_writer.OutputWriter)):
    #   output_writer_params = output_writer_cls.params_from_json(
    #       output_writer_params)

    # We can not always convert MapreduceSpec generated by older API
    # to JobConfig. Thus, mr framework should use/expose the returned JobConfig
    # object with caution when a job is started with an old API.
    # In this case, this method only tries not to blow up and assemble a
    # JobConfig object as accurate as possible.
    return cls(_lenient=old_api,
               job_name=mr_spec.name,
               job_id=mr_spec.mapreduce_id,
               # handler_spec from older API may not have map_job.Mapper type.
               mapper=util.for_name(mapper_spec.handler_spec),
               input_reader_cls=input_reader_cls,
               input_reader_params=input_reader_params,
               output_writer_cls=output_writer_cls,
               output_writer_params=output_writer_params,
               shard_count=mapper_spec.shard_count,
               queue_name=queue_name,
               user_params=mr_spec.params.get("user_params"),
               shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
               done_callback_url=mr_spec.params.get("done_callback"),
               _force_writes=mr_spec.params.get("force_writes"),
               _base_path=mr_spec.params["base_path"],
               _task_max_attempts=mr_spec.params.get("task_max_attempts"),
               _task_max_data_processing_attempts=(
                   mr_spec.params.get("task_max_data_processing_attempts")),
               _hooks_cls=util.for_name(mr_spec.hooks_class_name),
               _app=mr_spec.params.get("app_id"),
               _api_version=api_version)
Example #54
 def testClassName(self):
   """Test passing fq class name."""
   self.assertEquals(TestHandler, util.for_name("__main__.TestHandler"))
Example #55
 def testMethodName(self):
   """Test passing method name."""
   self.assertEquals(TestHandler.process,
                     util.for_name("__main__.TestHandler.process"))
Example #56
 def testClassWithArgs(self):
   """Test passing method name of class with constructor args."""
   self.assertEquals(TestHandlerWithArgs.process,
                     util.for_name("__main__.TestHandlerWithArgs.process"))
Example #57
 def testFunctionName(self):
   """Test passing function name."""
   self.assertEquals(test_handler_function,
                     util.for_name("__main__.test_handler_function"))
Example #58
from mapreduce import util
util.for_name('migrate.process')
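For orientation, a simplified sketch of the resolution logic the examples above rely on; this is not the library's actual implementation, which also handles nested classes and other corner cases:

import importlib

def for_name_sketch(fq_name):
    # Split "package.module.name" into a module part and a trailing name,
    # import the module, then look up the attribute; a miss raises
    # ImportError with the "Could not find ... on path ..." wording that
    # the tests above check for.
    module_name, _, short_name = fq_name.rpartition('.')
    module = None
    if module_name:
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            module = None
    if module is not None and hasattr(module, short_name):
        return getattr(module, short_name)
    raise ImportError("Could not find '%s' on path '%s'"
                      % (short_name, module_name))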