def split_input(cls, job_config):
        """Inherit doc.

        Splits the input described by ``job_config`` into readers, one per
        key range.

        Args:
            job_config: job configuration; ``shard_count`` and
                ``input_reader_params`` are read from it.

        Returns:
            A list of ``cls`` instances, each wrapping one key-ranges
            iterator, or None when no namespace was requested and no
            namespace keys were found.
        """
        num_shards = job_config.shard_count
        spec = cls._get_query_spec(job_config.input_reader_params)

        if spec.ns is not None:
            # An explicit namespace was requested: shard only that one by key.
            shard_ranges = cls._to_key_ranges_by_shard(
                spec.app, [spec.ns], num_shards, spec)
        else:
            # Ask for one key more than the threshold (presumably the second
            # argument is a fetch limit -- confirm) so that exceeding the
            # threshold can be detected below.
            found_keys = namespace_range.get_namespace_keys(
                spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)
            if not found_keys:
                # No namespace means the app may have some data that is not
                # visible yet. Nothing to split.
                return
            if len(found_keys) > cls.MAX_NAMESPACES_FOR_KEY_SHARD:
                # Too many namespaces: we can only split lexicographically
                # by namespace.
                shard_ranges = [
                    key_ranges.KeyRangesFactory.create_from_ns_range(piece)
                    for piece in namespace_range.NamespaceRange.split(
                        n=num_shards,
                        contiguous=False,
                        can_query=lambda: True,
                        _app=spec.app)
                ]
            else:
                # Few namespaces: shard each one by key and hand every shard
                # a piece of a namespace.
                ns_names = [key.name() or "" for key in found_keys]
                shard_ranges = cls._to_key_ranges_by_shard(
                    spec.app, ns_names, num_shards, spec)

        # Build every iterator first, then wrap each in a reader.
        range_iters = [
            db_iters.RangeIteratorFactory.create_key_ranges_iterator(
                one_range, spec, cls._KEY_RANGE_ITER_CLS)
            for one_range in shard_ranges
        ]
        return [cls(range_iter) for range_iter in range_iters]
    def split_input(cls, job_config):
        """Inherit docs.

        Splits the input into readers by property range when the query
        filters allow it; otherwise defers to the parent class's
        ``split_input``.

        Args:
            job_config: job configuration; ``input_reader_params`` and
                ``shard_count`` are read from it.

        Returns:
            A list of ``cls`` instances, one per (property range, namespace
            range) pair; the parent class's result when the filters are not
            sharded by property range; or None when no namespace was
            requested and no namespace keys were found.
        """
        params = job_config.input_reader_params
        shard_count = job_config.shard_count
        query_spec = cls._get_query_spec(params)

        if not property_range.should_shard_by_property_range(
                query_spec.filters):
            return super(ModelDatastoreInputReader,
                         cls).split_input(job_config)

        p_range = property_range.PropertyRange(query_spec.filters,
                                               query_spec.model_class_path)
        p_ranges = p_range.split(shard_count)

        # User specified a namespace. Test against None instead of relying on
        # truthiness: an empty-string namespace is an explicit choice and
        # must not be mistaken for "no namespace given", which would make the
        # job fan out over every namespace.
        if query_spec.ns is not None:
            ns_range = namespace_range.NamespaceRange(
                namespace_start=query_spec.ns,
                namespace_end=query_spec.ns,
                _app=query_spec.app)
            # Every property range is paired with the same single namespace.
            ns_ranges = [copy.copy(ns_range) for _ in p_ranges]
        else:
            ns_keys = namespace_range.get_namespace_keys(
                query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)
            if not ns_keys:
                return
            # User doesn't specify ns but the number of ns is small.
            # We still split by property range.
            if len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD:
                ns_ranges = [
                    namespace_range.NamespaceRange(_app=query_spec.app)
                    for _ in p_ranges
                ]
            # Lots of namespaces. Split by ns; every namespace shard then
            # gets a copy of the whole, unsplit property range.
            else:
                ns_ranges = namespace_range.NamespaceRange.split(
                    n=shard_count,
                    contiguous=False,
                    can_query=lambda: True,
                    _app=query_spec.app)
                p_ranges = [copy.copy(p_range) for _ in ns_ranges]

        # Internal invariant: each branch above builds the lists pairwise.
        assert len(p_ranges) == len(ns_ranges)

        iters = [
            db_iters.RangeIteratorFactory.create_property_range_iterator(
                p, ns, query_spec) for p, ns in zip(p_ranges, ns_ranges)
        ]
        return [cls(i) for i in iters]
  def split_input(cls, job_config):
    """Inherit docs.

    Splits the input into readers by property range when the query filters
    allow it; otherwise defers to the parent class's ``split_input``.

    Args:
      job_config: job configuration; ``input_reader_params`` and
        ``shard_count`` are read from it.

    Returns:
      A list of ``cls`` instances, one per (property range, namespace range)
      pair; the parent class's result when the filters are not sharded by
      property range; or None when no namespace was requested and no
      namespace keys were found.
    """
    params = job_config.input_reader_params
    shard_count = job_config.shard_count
    query_spec = cls._get_query_spec(params)

    if not property_range.should_shard_by_property_range(query_spec.filters):
      return super(ModelDatastoreInputReader, cls).split_input(job_config)

    p_range = property_range.PropertyRange(query_spec.filters,
                                           query_spec.model_class_path)
    p_ranges = p_range.split(shard_count)

    # User specified a namespace. Test against None instead of relying on
    # truthiness: an empty-string namespace is an explicit choice and must
    # not be mistaken for "no namespace given", which would make the job fan
    # out over every namespace.
    if query_spec.ns is not None:
      ns_range = namespace_range.NamespaceRange(
          namespace_start=query_spec.ns,
          namespace_end=query_spec.ns,
          _app=query_spec.app)
      # Every property range is paired with the same single namespace.
      ns_ranges = [copy.copy(ns_range) for _ in p_ranges]
    else:
      ns_keys = namespace_range.get_namespace_keys(
          query_spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD+1)
      if not ns_keys:
        return
      # User doesn't specify ns but the number of ns is small.
      # We still split by property range.
      if len(ns_keys) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD:
        ns_ranges = [namespace_range.NamespaceRange(_app=query_spec.app)
                     for _ in p_ranges]
      # Lots of namespaces. Split by ns; every namespace shard then gets a
      # copy of the whole, unsplit property range.
      else:
        ns_ranges = namespace_range.NamespaceRange.split(n=shard_count,
                                                         contiguous=False,
                                                         can_query=lambda: True,
                                                         _app=query_spec.app)
        p_ranges = [copy.copy(p_range) for _ in ns_ranges]

    # Internal invariant: each branch above builds the lists pairwise.
    assert len(p_ranges) == len(ns_ranges)

    iters = [
        db_iters.RangeIteratorFactory.create_property_range_iterator(
            p, ns, query_spec) for p, ns in zip(p_ranges, ns_ranges)]
    return [cls(i) for i in iters]
Example #4
0
    def split_input(cls, job_config):
        """Inherit doc.

        Produces one reader per key range for the query described by
        ``job_config``.

        Args:
            job_config: job configuration; ``shard_count`` and
                ``input_reader_params`` are read from it.

        Returns:
            A list of ``cls`` instances, each built from one key-ranges
            iterator, or None when no namespace was requested and no
            namespace keys were found.
        """
        shard_total = job_config.shard_count
        reader_params = job_config.input_reader_params
        spec = cls._get_query_spec(reader_params)

        if spec.ns is None:
            # No namespace requested: look at which namespaces exist.
            existing = namespace_range.get_namespace_keys(
                spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)
            if not existing:
                # No namespace means the app may have some data that is not
                # visible yet. Just return.
                return
            few_namespaces = len(existing) <= cls.MAX_NAMESPACES_FOR_KEY_SHARD
            if few_namespaces:
                # Shard each namespace by key and give each shard a piece of
                # a namespace.
                names = [entry.name() or "" for entry in existing]
                ranges = cls._to_key_ranges_by_shard(
                    spec.app, names, shard_total, spec)
            else:
                # With many namespaces we can only split lexicographically
                # by namespace.
                pieces = namespace_range.NamespaceRange.split(
                    n=shard_total,
                    contiguous=False,
                    can_query=lambda: True,
                    _app=spec.app)
                ranges = [
                    key_ranges.KeyRangesFactory.create_from_ns_range(piece)
                    for piece in pieces
                ]
        else:
            # A namespace was requested explicitly: shard just that one.
            ranges = cls._to_key_ranges_by_shard(
                spec.app, [spec.ns], shard_total, spec)

        # Create all iterators before wrapping any of them in a reader.
        make_iterator = db_iters.RangeIteratorFactory.create_key_ranges_iterator
        iterators = [
            make_iterator(rng, spec, cls._KEY_RANGE_ITER_CLS) for rng in ranges
        ]
        return [cls(iterator) for iterator in iterators]