Example #1
 def validate(cls, mapper_spec):
     if mapper_spec.input_reader_class() != cls:
         raise input_readers.BadReaderParamsError("Input reader class mismatch")
     params = input_readers._get_params(mapper_spec)
     for param in cls.REQUIRED_PARAMS:
         if not param in params:
             raise input_readers.BadReaderParamsError("Parameter missing: %s" % param)
Example #2
 def validate(cls, mapper_spec):
     if mapper_spec.input_reader_class() != cls:
         raise input_readers.BadReaderParamsError("Input reader class mismatch")
     params = input_readers._get_params(mapper_spec)
     for param in cls.REQUIRED_PARAMS:
         if not param in params:
             raise input_readers.BadReaderParamsError("Parameter missing: %s" % param)
Example #3
    def validate(cls, mapper_spec):
        """Validates mapper spec and all mapper parameters.

    Input reader parameters are expected to be passed as an "input_reader"
    subdictionary of mapper_spec.params.

    The pre-1.6.4 API mixes input reader parameters with all other parameters.
    Thus, to be compatible, the input reader checks mapper_spec.params as well
    and issues a warning if the "input_reader" subdictionary is not present.

    Args:
      mapper_spec: The MapperSpec for this InputReader.

    Raises:
      BadReaderParamsError: required parameters are missing or invalid.
    """
        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Input reader class mismatch")
        params = input_readers._get_params(mapper_spec)
        if cls.BLOB_KEY_PARAM not in params:
            raise BadReaderParamsError(
                "Must specify 'blob_key' for mapper input")
        blob_key = params[cls.BLOB_KEY_PARAM]
        blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
        if not blob_info:
            raise BadReaderParamsError("Could not find BlobInfo for key %s" %
                                       blob_key)
Example #4
    def split_input(cls, mapper_spec):
        ### init params
        params = _get_params(mapper_spec)
        entity_kind_name = params.pop(cls.ENTITY_KIND_PARAM)
        batch_size = int(params.get(cls.BATCH_SIZE_PARAM, cls._BATCH_SIZE))
        filters = params.get(cls.FILTERS_PARAM)
        shard_count = mapper_spec.shard_count

        ff_spec = params.get(cls.PRE_MAP_FF_SPEC_PARAM)
        ### init params

        entity_kind = mapreduce.util.for_name(entity_kind_name)
        splitter_query = KeyRange().make_ascending_query(
            entity_kind, keys_only=True, filters=filters)

        k_begin_iter, k_end_iter = itertools.tee(
            itertools.islice(splitter_query, None, None, batch_size))

        # Because ranges exclude their end key (include_end=False), the last
        # pair should be (key, None); for that reason drop one key from the
        # beginning of the end iterator and append None to the end.
        py5to7.next(k_end_iter, None)
        k_end_iter = itertools.chain(k_end_iter, [None])

        key_ranges = defaultdict(list)
        for (i, keys) in enumerate(itertools.izip(k_begin_iter, k_end_iter)):
            key_ranges[i % shard_count].append(
                KeyRange(keys[0], keys[1], include_end=False))

        return [cls(entity_kind_name,
                    key_ranges=k,
                    batch_size=batch_size,
                    filters=filters,
                    filter_factory_spec=ff_spec) for k in key_ranges.values()]
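The tee/islice/chain sequence above turns one keys-only query into consecutive (start, end) pairs with a trailing None, so the final KeyRange is open-ended. A small sketch of just that pairing step, with a plain list standing in for the datastore query (the key values are invented):

import itertools

# Hypothetical ordered keys standing in for the keys-only splitter query;
# every batch_size-th key becomes a range boundary, as in the islice() call above.
keys = list(range(0, 100, 7))
batch_size = 3

boundaries = itertools.islice(keys, None, None, batch_size)
k_begin_iter, k_end_iter = itertools.tee(boundaries)

# Drop the first element of the end iterator and append None, so the pairs
# come out as (k0, k1), (k1, k2), ..., (k_last, None); the final range is open.
next(k_end_iter, None)
k_end_iter = itertools.chain(k_end_iter, [None])

for begin, end in zip(k_begin_iter, k_end_iter):
    print("%s -> %s" % (begin, end))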
Example #5
  def validate(cls, mapper_spec):
    """Validates mapper spec and all mapper parameters.

    Args:
      mapper_spec: The MapperSpec for this InputReader.

    Raises:
      BadReaderParamsError: required parameters are missing or invalid.
    """
    if mapper_spec.input_reader_class() != cls:
      raise input_readers.BadReaderParamsError("__RobotsLineInputReader:Mapper input reader class mismatch")
    params = input_readers._get_params(mapper_spec)
    if cls.BLOB_KEYS_PARAM not in params:
      raise input_readers.BadReaderParamsError("_RobotsLineInputReader:Must specify 'blob_keys' for mapper input")
    file_names = params[cls.BLOB_KEYS_PARAM]
    if isinstance(file_names, basestring):
      # This is a mechanism to allow multiple blob keys (which do not contain
      # commas) in a single string. It may go away.
      file_names = file_names.split(",")
    if len(file_names) > cls._MAX_BLOB_KEYS_COUNT:
      raise input_readers.BadReaderParamsError("_RobotsLineInputReader:Too many 'blob_keys' for mapper input")
    if not file_names:
      raise input_readers.BadReaderParamsError("_RobotsLineInputReader:No 'blob_keys' specified for mapper input")
    for file_name in file_names:
      blob_key = files.blobstore.get_blob_key(file_name)
      blob_key_str = str(blob_key)
      blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key_str))
      if not blob_info:
        raise input_readers.BadReaderParamsError("_RobotsLineInputReader:Could not find blobinfo for key %s" %
                                   blob_key_str)
Example #6
    def split_input(cls, mapper_spec):
        shard_count = mapper_spec.shard_count

        # Grab the input parameters for the split
        params = input_readers._get_params(mapper_spec)
        logging.info("Params: %s" % params)
        # Unpickle the query
        app, model = params['model'].split('.')
        model = model_cache.get_model(app, model)

        # Grab the lowest pk
        query = model.objects.all()
        query = query.order_by('pk')

        try:
            first_id = query.values_list('pk', flat=True)[:1][0]

            query = query.order_by('-pk')
            last_id = query.values_list('pk', flat=True)[:1][0]
        except IndexError:
            return [DjangoInputReader(0, 0, params['model'])]

        pk_range = last_id - first_id

        logging.info("Query range: %s - %s = %s" %
                     (first_id, last_id, pk_range))

        if pk_range < shard_count or shard_count == 1:
            return [DjangoInputReader(first_id - 1, last_id, params['model'])]

        readers = []
        max_shard_size = int(float(pk_range) / float(shard_count))
        if pk_range % shard_count:
            max_shard_size += 1

        shard_id = 1
        # Splitting could be much smarter by taking a __scatter__ sample and
        # clustering, which is how the DatastoreInputWriter from the mapreduce
        # splits on pks
        for i in itertools.count(first_id - 1, max_shard_size):
            if i >= last_id:
                break

            shard_start_id = i
            shard_end_id = i + max_shard_size
            if shard_end_id > last_id:
                shard_end_id = last_id

            logging.info("Creating shard: %s - %s" %
                         (shard_start_id, shard_end_id))
            reader = DjangoInputReader(shard_start_id, shard_end_id,
                                       params['model'])
            reader.shard_id = shard_id
            readers.append(reader)
            shard_id += 1
        return readers
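Examples #6 and #7 shard by dividing the primary-key span [first_id, last_id] into contiguous chunks of roughly pk_range / shard_count, rounded up so no ids are left out. The boundary arithmetic in isolation, with made-up numbers:

import itertools

# Hypothetical pk span and shard count, mirroring the arithmetic above.
first_id, last_id, shard_count = 100, 1050, 4

pk_range = last_id - first_id
max_shard_size = pk_range // shard_count
if pk_range % shard_count:
    max_shard_size += 1

shards = []
for i in itertools.count(first_id - 1, max_shard_size):
    if i >= last_id:
        break
    shards.append((i, min(i + max_shard_size, last_id)))

print(shards)  # [(99, 337), (337, 575), (575, 813), (813, 1050)]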
Example #7
    def split_input(cls, mapper_spec):
        shard_count = mapper_spec.shard_count

        # Grab the input parameters for the split
        params = input_readers._get_params(mapper_spec)
        logging.info("Params: %s" % params)

        db = params['db']
        # Unpickle the query
        app, model = params['model'].split('.')
        model = apps.get_model(app, model)

        # Grab the lowest pk
        query = model.objects.using(db).all()
        query = query.order_by('pk')

        try:
            first_id = query.values_list('pk', flat=True)[:1][0]

            query = query.order_by('-pk')
            last_id = query.values_list('pk', flat=True)[:1][0]
        except IndexError:
            return [DjangoInputReader(0, 0, params['model'], db=db)]

        pk_range = last_id - first_id

        logging.info("Query range: %s - %s = %s" % (first_id, last_id, pk_range))

        if pk_range < shard_count or shard_count == 1:
            return [DjangoInputReader(first_id-1, last_id, params['model'], db=db)]

        readers = []
        max_shard_size = int(float(pk_range) / float(shard_count))
        if pk_range % shard_count:
            max_shard_size += 1

        shard_id = 1
        # Splitting could be much smarter by taking a __scatter__ sample and
        # clustering, which is how the DatastoreInputWriter from the mapreduce
        # splits on pks
        for i in itertools.count(first_id-1, max_shard_size):
            if i >= last_id:
                break

            shard_start_id = i
            shard_end_id = i + max_shard_size
            if shard_end_id > last_id:
                shard_end_id = last_id

            logging.info("Creating shard: %s - %s" % (shard_start_id, shard_end_id))
            reader = DjangoInputReader(shard_start_id, shard_end_id, params['model'], db=db)
            reader.shard_id = shard_id
            readers.append(reader)
            shard_id += 1
        return readers
Example #8
    def split_input(cls, mapper_spec):
        """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If there
    are fewer files than shards, fewer than the requested number of shards will
    be used. Input files are currently never split (although for some formats
    they could be, and may be split in a future implementation).

    Args:
      mapper_spec: an instance of model.MapperSpec.

    Returns:
      A list of InputReaders. None when no input data can be found.
    """
        reader_spec = input_readers._get_params(mapper_spec, allow_old=False)
        readsetId = reader_spec[cls.READSET_ID_PARAM]
        sequenceName = reader_spec[cls.SEQUENCE_NAME_PARAM]
        sequenceStart = reader_spec.get(cls.SEQUEQNCE_START_PARAM)
        sequenceEnd = reader_spec.get(cls.SEQUEQNCE_END_PARAM)
        useMockData = reader_spec.get(cls.USE_MOCK_DATA_PARAM)

        # TODO if you are doing all sequences then you need to take sequence name
        # into account as well.
        # For now assume we are only doing a single sequence name.

        # Divide the range by the shard count to get the step.
        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        range_length = ((sequenceEnd + 1) - sequenceStart) // shard_count
        if range_length == 0:
            range_length = 1
        logging.debug("GenomicsAPIInputReader split_input() "
                      "shards: %d range_length: %d" %
                      (mapper_spec.shard_count, range_length))

        # Split into shards
        readers = []
        for position in xrange(shard_count - 1):
            start = sequenceStart + (range_length * position)
            end = start + range_length - 1
            logging.debug(
                "GenomicsAPIInputReader split_input() start: %d end: %d." %
                (start, end))
            readers.append(
                cls(readsetId, sequenceName, start, end, useMockData))
        start = sequenceStart + (range_length * (shard_count - 1))
        end = sequenceEnd
        logging.debug(
            "GenomicsAPIInputReader split_input() start: %d end: %d." %
            (start, end))
        readers.append(cls(readsetId, sequenceName, start, end, useMockData))

        return readers
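Examples #8, #14, #19 and #28 split an inclusive genomic coordinate range into shard_count near-equal pieces: the first shard_count - 1 shards each cover range_length positions, and the final shard runs to sequenceEnd to absorb any remainder. The same arithmetic on its own, with invented coordinates:

# Hypothetical inclusive coordinate range, mirroring the shard maths above.
sequenceStart, sequenceEnd, shard_count = 1, 1000, 3

range_length = max(1, ((sequenceEnd + 1) - sequenceStart) // shard_count)

shards = []
for position in range(shard_count - 1):
    start = sequenceStart + range_length * position
    shards.append((start, start + range_length - 1))
# The final shard always runs to sequenceEnd, absorbing any remainder.
shards.append((sequenceStart + range_length * (shard_count - 1), sequenceEnd))

print(shards)  # [(1, 333), (334, 666), (667, 1000)]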
Example #9
    def split_input(cls, mapper_spec):
        """Returns a list of shard_count input_spec_shards for input_spec.

    Args:
      mapper_spec: The mapper specification to split from. Must contain
          'blob_keys' parameter with one or more blob keys.

    Returns:
      A list of BlobstoreInputReaders corresponding to the specified shards.
    """
        params = input_readers._get_params(mapper_spec)
        file_names = params[cls.BLOB_KEYS_PARAM]
        if isinstance(file_names, basestring):
            # This is a mechanism to allow multiple filenames (which do not contain
            # commas) in a single string. It may go away.
            file_names = file_names.split(",")

        blob_sizes = {}
        for file_name in file_names:
            blob_key = files.blobstore.get_blob_key(file_name)
            blob_key_str = str(blob_key)
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key_str))
            blob_sizes[blob_key_str] = blob_info.size

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        shards_per_blob = shard_count // len(file_names)
        if shards_per_blob == 0:
            shards_per_blob = 1

        chunks = []
        for blob_key, blob_size in blob_sizes.items():
            blob_chunk_size = blob_size // shards_per_blob
            for i in xrange(shards_per_blob - 1):
                chunks.append(
                    input_readers.BlobstoreLineInputReader.from_json({
                        cls.BLOB_KEY_PARAM:
                        blob_key,
                        cls.INITIAL_POSITION_PARAM:
                        blob_chunk_size * i,
                        cls.END_POSITION_PARAM:
                        blob_chunk_size * (i + 1)
                    }))
            chunks.append(
                input_readers.BlobstoreLineInputReader.from_json({
                    cls.BLOB_KEY_PARAM:
                    blob_key,
                    cls.INITIAL_POSITION_PARAM:
                    blob_chunk_size * (shards_per_blob - 1),
                    cls.END_POSITION_PARAM:
                    blob_size
                }))
        return chunks
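Examples #9, #15, #23 and #26 reuse one chunking scheme: every blob is cut into shards_per_blob byte ranges, each of size blob_size // shards_per_blob except the last, which runs to the end of the blob. A sketch of just the offset computation, with invented blob sizes:

# Hypothetical blob sizes; the tuples mirror the blob key, start offset and end
# offset passed to BlobstoreLineInputReader.from_json() above.
blob_sizes = {"blob-a": 1000, "blob-b": 2500}
shard_count = 6

shards_per_blob = max(1, shard_count // len(blob_sizes))

chunks = []
for blob_key, blob_size in sorted(blob_sizes.items()):
    blob_chunk_size = blob_size // shards_per_blob
    for i in range(shards_per_blob - 1):
        chunks.append((blob_key, blob_chunk_size * i, blob_chunk_size * (i + 1)))
    # The last chunk runs to the end of the blob so no bytes are dropped.
    chunks.append((blob_key, blob_chunk_size * (shards_per_blob - 1), blob_size))

for chunk in chunks:
    print(chunk)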
Example #10
    def validate(cls, mapper_spec):
        super(DjangoModelInputReader, cls).validate(mapper_spec)

        params = _get_params(mapper_spec)

        if cls.NAMESPACE_PARAM in params:
            raise BadReaderParamsError("Namespaces are not supported.")

        entity_kind_name = params[cls.ENTITY_KIND_PARAM]
        try:
            util.for_name(entity_kind_name)
        except ImportError, e:
            raise BadReaderParamsError("Bad entity kind: %s" % e)
Example #11
    def validate(cls, mapper_spec):
        super(DjangoModelInputReader, cls).validate(mapper_spec)

        params = _get_params(mapper_spec)

        if cls.NAMESPACE_PARAM in params:
            raise BadReaderParamsError("Namespaces are not supported.")

        entity_kind_name = params[cls.ENTITY_KIND_PARAM]
        try:
            util.for_name(entity_kind_name)
        except ImportError, e:
            raise BadReaderParamsError("Bad entity kind: %s" % e)
Example #12
    def split_input(cls, mapper_spec):
        params = input_readers._get_params(mapper_spec)
        urls = list(enumerate(params["urls"]))
        shard_count = mapper_spec.shard_count
        if len(urls) > shard_count:
            shard_len = len(urls) / shard_count
        else:
            shard_len = 1

        shards = []
        while urls:
            shards.append(cls(urls[0:shard_len]))
            urls = urls[shard_len:]
        return shards
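Examples #12 and #13 chop an enumerated URL list into fixed-size slices, one reader per slice. Note that slicing by len(urls) / shard_count can yield more readers than shard_count when the division is not exact, as this small sketch with made-up URLs shows:

# Hypothetical URL list; "//" is used so the sketch behaves the same on
# Python 2 and 3 (the reader above relies on Python 2's integer "/").
urls = list(enumerate("url-%d" % i for i in range(10)))
shard_count = 4

shard_len = len(urls) // shard_count if len(urls) > shard_count else 1

shards = []
while urls:
    shards.append(urls[:shard_len])
    urls = urls[shard_len:]

# Slicing by len(urls) // shard_count can produce more shards than requested:
print(len(shards))  # 5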
Example #13
    def split_input(cls, mapper_spec):
        params = input_readers._get_params(mapper_spec)
        urls = list(enumerate(params["urls"]))
        shard_count = mapper_spec.shard_count
        if len(urls) > shard_count:
            shard_len = len(urls) / shard_count
        else:
            shard_len = 1

        shards = []
        while urls:
            shards.append(cls(urls[0: shard_len]))
            urls = urls[shard_len:]
        return shards
Example #14
  def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    An equal number of input files are assigned to each shard (+/- 1). If there
    are fewer files than shards, fewer than the requested number of shards will
    be used. Input files are currently never split (although for some formats
    they could be, and may be split in a future implementation).

    Args:
      mapper_spec: an instance of model.MapperSpec.

    Returns:
      A list of InputReaders. None when no input data can be found.
    """
    reader_spec = input_readers._get_params(mapper_spec, allow_old=False)
    readsetId = reader_spec[cls.READSET_ID_PARAM]
    sequenceName = reader_spec[cls.SEQUENCE_NAME_PARAM]
    sequenceStart = reader_spec.get(cls.SEQUEQNCE_START_PARAM)
    sequenceEnd = reader_spec.get(cls.SEQUEQNCE_END_PARAM)
    useMockData = reader_spec.get(cls.USE_MOCK_DATA_PARAM)

    # TODO if you are doing all sequences then you need to take sequence name
    # into account as well.
    # For now assume we are only doing a single sequence name.

    # Divide the range by the shard count to get the step.
    shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
    range_length = ((sequenceEnd + 1) - sequenceStart) // shard_count
    if range_length == 0:
      range_length = 1
    logging.debug(
      "GenomicsAPIInputReader split_input() shards: %d range_length: %d",
      mapper_spec.shard_count, range_length)

    # Split into shards
    readers = []
    for position in xrange(shard_count - 1):
      start = sequenceStart + (range_length * position)
      end = start + range_length - 1
      logging.debug("GenomicsAPIInputReader split_input() start: %d end: %d.",
                    start, end)
      readers.append(cls(readsetId, sequenceName, start, end, useMockData))
    start = sequenceStart + (range_length * (shard_count - 1))
    end = sequenceEnd
    logging.debug("GenomicsAPIInputReader split_input() start: %d end: %d.",
                  start, end)
    readers.append(cls(readsetId, sequenceName, start, end, useMockData))

    return readers
Example #15
    def split_input(cls, mapper_spec):
        """Returns a list of shard_count input_spec_shards for input_spec.

        Args:
          mapper_spec: The mapper specification to split from. Must contain
              'blob_keys' parameter with one or more blob keys.

        Returns:
          A list of BlobstoreInputReaders corresponding to the specified shards.
        """
        params = input_readers._get_params(mapper_spec)
        blob_keys = params[cls.BLOB_KEYS_PARAM]
        if isinstance(blob_keys, basestring):
            blob_keys = blob_keys.split(",")

        blob_sizes = {}
        for blob_key in blob_keys:
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
            blob_sizes[blob_key] = blob_info.size

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        shards_per_blob = shard_count // len(blob_keys)
        if shards_per_blob == 0:
            shards_per_blob = 1

        chunks = []
        for blob_key, blob_size in blob_sizes.items():
            blob_chunk_size = blob_size // shards_per_blob

            for i in xrange(shards_per_blob - 1):
                chunks.append(
                    BlobstoreUniversalLineInputReader.from_json({
                        cls.BLOB_KEY_PARAM:
                        blob_key,
                        cls.INITIAL_POSITION_PARAM:
                        blob_chunk_size * i,
                        cls.END_POSITION_PARAM:
                        blob_chunk_size * (i + 1)
                    }))
            chunks.append(
                BlobstoreUniversalLineInputReader.from_json({
                    cls.BLOB_KEY_PARAM:
                    blob_key,
                    cls.INITIAL_POSITION_PARAM:
                    blob_chunk_size * (shards_per_blob - 1),
                    cls.END_POSITION_PARAM:
                    blob_size
                }))
        return chunks
Example #16
  def _to_map_job_config(cls,
                         mr_spec,
                         # TODO(user): Remove this parameter after it can be
                         # read from mr_spec.
                         queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during execution,
    despite that it is not saved into datastore.

    Args:
      mr_spec: model.MapreduceSpec.
      queue_name: queue name.

    Returns:
      The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    # 0 means all the old APIs before api_version is introduced.
    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0
    # We cannot always convert a MapreduceSpec generated by an older API
    # to JobConfig. Thus, the mr framework should use/expose the returned
    # JobConfig object with caution when a job is started with an old API.
    # In this case, this method only tries not to blow up and to assemble a
    # JobConfig object as accurately as possible.
    return cls(_lenient=old_api,
               job_name=mr_spec.name,
               job_id=mr_spec.mapreduce_id,
               # handler_spec from older API may not have map_job.Mapper type.
               mapper=util.for_name(mapper_spec.handler_spec),
               input_reader_cls=mapper_spec.input_reader_class(),
               input_reader_params=input_readers._get_params(mapper_spec),
               output_writer_cls=mapper_spec.output_writer_class(),
               output_writer_params=output_writers._get_params(mapper_spec),
               shard_count=mapper_spec.shard_count,
               queue_name=queue_name,
               user_params=mr_spec.params.get("user_params"),
               shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
               done_callback_url=mr_spec.params.get("done_callback"),
               _force_writes=mr_spec.params.get("force_writes"),
               _base_path=mr_spec.params["base_path"],
               _task_max_attempts=mr_spec.params.get("task_max_attempts"),
               _task_max_data_processing_attempts=(
                   mr_spec.params.get("task_max_data_processing_attempts")),
               _hooks_cls=util.for_name(mr_spec.hooks_class_name),
               _app=mr_spec.params.get("app_id"),
               _api_version=api_version)
Example #17
    def validate(cls, mapper_spec):
        """Validate mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec

    Raises:
      BadReaderParamsError if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
        reader_spec = input_readers._get_params(mapper_spec, allow_old=False)

        # Readset id is required.
        if cls.READSET_ID_PARAM not in reader_spec:
            raise errors.BadReaderParamsError(
                "%s is required for the Genomics API" % cls.READSET_ID_PARAM)
Example #18
  def validate(cls, mapper_spec):
    """Validate mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec

    Raises:
      BadReaderParamsError if the specification is invalid for any reason such
        as missing the bucket name or providing an invalid bucket name.
    """
    reader_spec = input_readers._get_params(mapper_spec, allow_old=False)

    # Readset id is required.
    if cls.READSET_ID_PARAM not in reader_spec:
      raise errors.BadReaderParamsError("%s is required for the Genomics API" %
                                        cls.READSET_ID_PARAM)
Example #19
    def split_input(cls, mapper_spec):
        """Returns a list of input readers.

    Args:
      mapper_spec: an instance of model.MapperSpec.

    Returns:
      A list of InputReaders.
    """
        reader_spec = input_readers._get_params(mapper_spec, allow_old=False)
        readsetId = reader_spec[cls.READSET_ID_PARAM]
        sequenceName = reader_spec[cls.SEQUENCE_NAME_PARAM]
        sequenceStart = reader_spec.get(cls.SEQUEQNCE_START_PARAM)
        sequenceEnd = reader_spec.get(cls.SEQUEQNCE_END_PARAM)

        # TODO if you are doing all sequences then you need to take sequence name
        # into account as well.
        # For now assume we are only doing a single sequence name.

        # Divide the range by the shard count to get the step.
        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        range_length = ((sequenceEnd + 1) - sequenceStart) // shard_count
        if range_length == 0:
            range_length = 1

        # Split into shards
        readers = []
        for position in xrange(shard_count - 1):
            start = sequenceStart + (range_length * position)
            end = start + range_length - 1
            logging.debug(
                "GenomicsAPIInputReader split_input() start: %d end: %d.",
                start, end)
            readers.append(cls(readsetId, sequenceName, start, end))
        start = sequenceStart + (range_length * (shard_count - 1))
        end = sequenceEnd

        logging.debug(
            "GenomicsAPIInputReader split_input() start: %d end: %d.", start,
            end)
        readers.append(cls(readsetId, sequenceName, start, end))

        return readers
Example #20
    def split_input(cls, mapper_spec):
        """Returns a list of input readers.

    This method creates a list of input readers, each for one shard.
    It attempts to split inputs among readers evenly.

    Args:
      mapper_spec: model.MapperSpec specifies the inputs and additional
        parameters to define the behavior of input readers.

    Returns:
      A list of InputReaders. None or [] when no input data can be found.
    """
        params = input_readers._get_params(mapper_spec)
        blob_key = params[cls.BLOB_KEY_PARAM]
        blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
        if not blob_info:
            return None
        return [cls(blob_key, 0, blob_info.size)]  # one shard per blob
Example #21
    def validate(cls, mapper_spec):
        """Check that the pipeline give all the parameters needed by the InputReader"""

        if mapper_spec.input_reader_class() != cls:
            raise BadReaderParamsError("Mapper input reader class mismatch")

        params = _get_params(mapper_spec)

        # Check hashtag parameter
        if cls.HASHTAG not in params:
            raise BadReaderParamsError("Must specify %s" % cls.HASHTAG)
        if not isinstance(params[cls.HASHTAG], basestring):
            raise BadReaderParamsError("%s should be a string" % cls.HASHTAG)

        # Check tweets parameter
        if cls.TWEETS not in params:
            raise BadReaderParamsError("Must specify %s" % cls.TWEETS)
        if not isinstance(params[cls.TWEETS], basestring):
            raise BadReaderParamsError("%s should be a string" % cls.TWEETS)
Example #22
    def split_input(cls, mapper_spec):
        """
        """
        params = input_readers._get_params(mapper_spec)
        db = params.get('db', None)
        try:
            app, model = params['model'].split('.')
        except ValueError:
            app, model = params['model'].split(',')
        model = apps.get_model(app, model)
        query = params.get('query', None)
        if query is not None:
            # FIXME: why do we have to cast to str here? It comes back as unicode.
            query = cPickle.loads(str(query))

        shard_count = mapper_spec.shard_count
        if db:
            scatter_query = model.objects.using(db)
        else:
            scatter_query = model.objects

        scatter_query = scatter_query.all()
        scatter_query = scatter_query.values_list('pk').order_by('__scatter__')
        oversampling_factor = 32
        # FIXME values
        keys = [x[0] for x in scatter_query[:shard_count * oversampling_factor]]
        keys.sort()

        if len(keys) > shard_count:
            keys = cls._choose_split_points(keys, shard_count)
        keyranges = []
        if len(keys) > 1:
            keyranges.append(DjangoInputReader(params['model'], pk__lte=keys[0], query=query, shard_id=0, db=db))
            for x in range((len(keys) - 1)):
                keyranges.append(DjangoInputReader(params['model'], pk__gt=keys[x], pk__lte=keys[x+1], query=query, shard_id=x+1, db=db))
            keyranges.append(DjangoInputReader(params['model'], pk__gt=keys[x+1], query=query, shard_id=x+2, db=db))
        elif len(keys) == 1:
            keyranges.append(DjangoInputReader(params['model'], pk__lte=keys[0], query=query, shard_id=0, db=db))
            keyranges.append(DjangoInputReader(params['model'], pk__gt=keys[0], query=query, shard_id=1, db=db))
        else:
            keyranges.append(DjangoInputReader(params['model'], query=query, shard_id=0, db=db))
        return keyranges
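Examples #22 and #24 oversample the __scatter__ property, sort the sampled keys, and let _choose_split_points reduce them to the split points that become the pk__lte / pk__gt filters. That helper is not shown; one common way to pick roughly evenly spaced points from a sorted oversample looks like the sketch below, which is an assumption about its behaviour rather than the project's actual implementation:

def choose_split_points(sorted_keys, shard_count):
    """Pick roughly evenly spaced keys from an oversampled, sorted scatter sample."""
    if len(sorted_keys) <= shard_count:
        return sorted_keys
    stride = float(len(sorted_keys)) / shard_count
    # One split point per internal shard boundary; the readers above turn these
    # into pk__lte / pk__gt filters.
    return [sorted_keys[int(stride * (i + 1)) - 1] for i in range(shard_count - 1)]


print(choose_split_points(list(range(0, 320, 10)), 4))  # [70, 150, 230]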
Example #23
  def split_input(cls, mapper_spec):
    """Returns a list of shard_count input_spec_shards for input_spec.

    Args:
      mapper_spec: The mapper specification to split from. Must contain
          'blob_keys' parameter with one or more blob keys.

    Returns:
      A list of BlobstoreInputReaders corresponding to the specified shards.
    """
    params = input_readers._get_params(mapper_spec)
    file_names = params[cls.BLOB_KEYS_PARAM]
    if isinstance(file_names, basestring):
      # This is a mechanism to allow multiple filenames (which do not contain
      # commas) in a single string. It may go away.
      file_names = file_names.split(",")

    blob_sizes = {}
    for file_name in file_names:
      blob_key = files.blobstore.get_blob_key(file_name)
      blob_key_str = str(blob_key)
      blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key_str))
      blob_sizes[blob_key_str] = blob_info.size

    shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
    shards_per_blob = shard_count // len(file_names)
    if shards_per_blob == 0:
      shards_per_blob = 1

    chunks = []
    for blob_key, blob_size in blob_sizes.items():
      blob_chunk_size = blob_size // shards_per_blob
      for i in xrange(shards_per_blob - 1):
        chunks.append(input_readers.BlobstoreLineInputReader.from_json(
            {cls.BLOB_KEY_PARAM: blob_key,
             cls.INITIAL_POSITION_PARAM: blob_chunk_size * i,
             cls.END_POSITION_PARAM: blob_chunk_size * (i + 1)}))
      chunks.append(input_readers.BlobstoreLineInputReader.from_json(
          {cls.BLOB_KEY_PARAM: blob_key,
           cls.INITIAL_POSITION_PARAM: blob_chunk_size * (shards_per_blob - 1),
           cls.END_POSITION_PARAM: blob_size}))
    return chunks
Example #24
    def split_input(cls, mapper_spec):
        """
        """
        params = input_readers._get_params(mapper_spec)
        db = params.get('db', None)
        try:
            app, model = params['model'].split('.')
        except ValueError:
            app, model = params['model'].split(',')
        model = apps.get_model(app, model)
        query = params.get('query', None)
        if query is not None:
            # FIXME: why do we have to cast to str here? It comes back as unicode.
            query = cPickle.loads(str(query))

        shard_count = mapper_spec.shard_count
        if db:
            scatter_query = model.objects.using(db)
        else:
            scatter_query = model.objects

        scatter_query = scatter_query.all()
        scatter_query = scatter_query.values_list('pk').order_by('__scatter__')
        oversampling_factor = 32
        # FIXME values
        keys = [x[0] for x in scatter_query[:shard_count * oversampling_factor]]
        keys.sort()

        if len(keys) > shard_count:
            keys = cls._choose_split_points(keys, shard_count)
        keyranges = []
        if len(keys) > 1:
            keyranges.append(DjangoInputReader(params['model'], pk__lte=keys[0], query=query, shard_id=0, db=db))
            for x in xrange((len(keys) - 1)):
                keyranges.append(DjangoInputReader(params['model'], pk__gt=keys[x], pk__lte=keys[x+1], query=query, shard_id=x+1, db=db))
            keyranges.append(DjangoInputReader(params['model'], pk__gt=keys[x+1], query=query, shard_id=x+2, db=db))
        elif len(keys) == 1:
            keyranges.append(DjangoInputReader(params['model'], pk__lte=keys[0], query=query, shard_id=0, db=db))
            keyranges.append(DjangoInputReader(params['model'], pk__gt=keys[0], query=query, shard_id=1, db=db))
        else:
            keyranges.append(DjangoInputReader(params['model'], query=query, shard_id=0, db=db))
        return keyranges
Example #25
def _sort_records_map(records):
    """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into a new GCS file. Creates an _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
    ctx = context.get()
    l = len(records)
    key_records = [None] * l

    logging.debug("Parsing")
    for i in range(l):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(records[i])
        key_records[i] = (proto.key(), records[i])

    logging.debug("Sorting")
    key_records.sort(cmp=_compare_keys)

    logging.debug("Writing")
    mapper_spec = ctx.mapreduce_spec.mapper
    params = input_readers._get_params(mapper_spec)
    bucket_name = params.get("bucket_name")
    filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
                ctx.shard_id + "-" + str(int(time.time())))
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for key_record in key_records:
            pool.append(key_record[1])

    logging.debug("Finalizing")
    filehandle.close()

    entity = _OutputFile(key_name=full_filename,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
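The map function above (also Example #27) decodes each record only far enough to get its key, sorts the (key, record) pairs, and writes the original serialized records back out in key order. The same decorate-sort-undecorate step without the proto and GCS dependencies, using fabricated records:

# Hypothetical serialized records; in the map function above each record is a
# KeyValue proto and the sort key comes from proto.key().
records = [b"c:third", b"a:first", b"b:second"]


def extract_key(record):
    return record.split(b":", 1)[0]


key_records = [(extract_key(r), r) for r in records]
key_records.sort(key=lambda kr: kr[0])          # sort on the key only
sorted_records = [r for _, r in key_records]

print(sorted_records)  # records now ordered a, b, c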
Example #26
    def split_input(cls, mapper_spec):
        """Returns a list of shard_count input_spec_shards for input_spec.

        Args:
          mapper_spec: The mapper specification to split from. Must contain
              'blob_keys' parameter with one or more blob keys.

        Returns:
          A list of BlobstoreInputReaders corresponding to the specified shards.
        """
        params = input_readers._get_params(mapper_spec)
        blob_keys = params[cls.BLOB_KEYS_PARAM]
        if isinstance(blob_keys, basestring):
            blob_keys = blob_keys.split(",")

        blob_sizes = {}
        for blob_key in blob_keys:
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key))
            blob_sizes[blob_key] = blob_info.size

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        shards_per_blob = shard_count // len(blob_keys)
        if shards_per_blob == 0:
            shards_per_blob = 1

        chunks = []
        for blob_key, blob_size in blob_sizes.items():
            blob_chunk_size = blob_size // shards_per_blob

            for i in xrange(shards_per_blob - 1):
                chunks.append(BlobstoreUniversalLineInputReader.from_json({
                    cls.BLOB_KEY_PARAM: blob_key,
                    cls.INITIAL_POSITION_PARAM: blob_chunk_size * i,
                    cls.END_POSITION_PARAM: blob_chunk_size * (i + 1)
                }))
            chunks.append(BlobstoreUniversalLineInputReader.from_json({
                cls.BLOB_KEY_PARAM: blob_key,
                cls.INITIAL_POSITION_PARAM: blob_chunk_size * (shards_per_blob - 1),
                cls.END_POSITION_PARAM: blob_size
            }))
        return chunks
Example #27
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into a new GCS file. Creates an _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = kv_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
Example #28
  def split_input(cls, mapper_spec):
    """Returns a list of input readers.

    Args:
      mapper_spec: an instance of model.MapperSpec.

    Returns:
      A list of InputReaders.
    """
    reader_spec = input_readers._get_params(mapper_spec, allow_old=False)
    readsetId = reader_spec[cls.READSET_ID_PARAM]
    sequenceName = reader_spec[cls.SEQUENCE_NAME_PARAM]
    sequenceStart = reader_spec.get(cls.SEQUEQNCE_START_PARAM)
    sequenceEnd = reader_spec.get(cls.SEQUEQNCE_END_PARAM)

    # TODO if you are doing all sequences then you need to take sequence name
    # into account as well.
    # For now assume we are only doing a single sequence name.

    # Divide the range by the shard count to get the step.
    shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
    range_length = ((sequenceEnd + 1) - sequenceStart) // shard_count
    if range_length == 0:
      range_length = 1

    # Split into shards
    readers = []
    for position in xrange(shard_count - 1):
      start = sequenceStart + (range_length * position)
      end = start + range_length - 1
      logging.debug("GenomicsAPIInputReader split_input() start: %d end: %d.",
                   start, end)
      readers.append(cls(readsetId, sequenceName, start, end))
    start = sequenceStart + (range_length * (shard_count - 1))
    end = sequenceEnd

    logging.debug("GenomicsAPIInputReader split_input() start: %d end: %d.",
                 start, end)
    readers.append(cls(readsetId, sequenceName, start, end))

    return readers
Example #29
    def split_input(cls, mapper_spec):
        """Split the data into different InputReader"""

        # Get Input Reader parameters
        params = _get_params(mapper_spec)
        hashtag = params[cls.HASHTAG]
        tweets = TweetManager.jsonToTweets(params[cls.TWEETS])

        # Get number of lines processed by each shard
        shard_count = mapper_spec.shard_count
        tweet_nbr = sum(1 for elem in tweets)
        tweet_per_shard = tweet_nbr // shard_count

        # Create the list of input readers
        mr_input_readers = [cls(tweet_per_shard, i*tweet_per_shard, hashtag, tweets) for i in range(shard_count)]

        # Check if there are lines not assigned to a shard, and create another input reader if so
        left = tweet_nbr - tweet_per_shard*shard_count
        if left > 0:
            mr_input_readers.append(cls(left, tweet_per_shard*shard_count, hashtag, tweets))

        return mr_input_readers
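Example #29 gives each of the shard_count readers tweet_nbr // shard_count tweets and, when the division is not exact, appends one extra reader for the leftover tail. The offset and length bookkeeping on its own, with made-up counts:

# Hypothetical counts mirroring the shard assignment above: each reader gets a
# (start offset, number of tweets) pair.
tweet_nbr, shard_count = 103, 4

tweet_per_shard = tweet_nbr // shard_count            # 25
shards = [(i * tweet_per_shard, tweet_per_shard) for i in range(shard_count)]

left = tweet_nbr - tweet_per_shard * shard_count      # 3 leftover tweets
if left > 0:
    shards.append((tweet_per_shard * shard_count, left))

print(shards)  # [(0, 25), (25, 25), (50, 25), (75, 25), (100, 3)]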
Example #30
    def validate(cls, mapper_spec):
        """Validates mapper spec and all mapper parameters.

    Args:
      mapper_spec: The MapperSpec for this InputReader.

    Raises:
      BadReaderParamsError: required parameters are missing or invalid.
    """
        if mapper_spec.input_reader_class() != cls:
            raise input_readers.BadReaderParamsError(
                "__RobotsLineInputReader:Mapper input reader class mismatch")
        params = input_readers._get_params(mapper_spec)
        if cls.BLOB_KEYS_PARAM not in params:
            raise input_readers.BadReaderParamsError(
                "_RobotsLineInputReader:Must specify 'blob_keys' for mapper input"
            )
        file_names = params[cls.BLOB_KEYS_PARAM]
        if isinstance(file_names, basestring):
            # This is a mechanism to allow multiple blob keys (which do not contain
            # commas) in a single string. It may go away.
            file_names = file_names.split(",")
        if len(file_names) > cls._MAX_BLOB_KEYS_COUNT:
            raise input_readers.BadReaderParamsError(
                "_RobotsLineInputReader:Too many 'blob_keys' for mapper input")
        if not file_names:
            raise input_readers.BadReaderParamsError(
                "_RobotsLineInputReader:No 'blob_keys' specified for mapper input"
            )
        for file_name in file_names:
            blob_key = files.blobstore.get_blob_key(file_name)
            blob_key_str = str(blob_key)
            blob_info = blobstore.BlobInfo.get(blobstore.BlobKey(blob_key_str))
            if not blob_info:
                raise input_readers.BadReaderParamsError(
                    "_RobotsLineInputReader:Could not find blobinfo for key %s"
                    % blob_key_str)