Beispiel #1
0
    def __init__(self, reader_options, aws_access_key_id,
                 aws_secret_access_key):
        """Set up the source bucket, key pattern, key prefixes and logger.

        Options read from ``reader_options``: 'bucket', 'pattern', 'prefix',
        'prefix_pointer' and 'prefix_format_using_date'.

        Raises ConfigurationError when resolving the date option raises
        ValueError, or when formatting the prefixes raises
        InvalidDateRangeError.
        """
        bucket_name = reader_options.get('bucket')
        self.source_bucket = get_bucket(bucket_name, aws_access_key_id,
                                        aws_secret_access_key)
        self.pattern = reader_options.get('pattern', None)

        raw_prefix = reader_options.get('prefix', '')
        pointer = reader_options.get('prefix_pointer', '')
        date_option = reader_options.get('prefix_format_using_date')

        raw_prefixes = self._get_prefixes(raw_prefix, pointer)

        # The unpacking stays inside the try block: a result that is not a
        # two-item sequence also surfaces as ValueError.
        try:
            date_from, date_to = self._get_prefix_formatting_dates(
                date_option)
        except ValueError:
            raise ConfigurationError('The option prefix_format_using_date '
                                     'should be either a date string or two '
                                     'date strings in a list/tuple')

        try:
            self.prefixes = format_prefixes(raw_prefixes, date_from, date_to)
        except InvalidDateRangeError:
            raise ConfigurationError('The end date should be greater or equal '
                                     'to the start date for the '
                                     'prefix_format_using_date option')

        self.logger = log = logging.getLogger('s3-reader')
        log.setLevel(logging.INFO)
Beispiel #2
0
    def _get_input_files(cls, input_specification):
        """Get list of input files according to input definition.

        Input definition can be:

        - str: specifying a filename

        - list of str: specifying list a of filenames

        - dict with "dir" and optional "pattern" parameters: specifying the
        toplevel directory under which input files will be sought and an optional
        filepath pattern

        """
        if isinstance(input_specification, (basestring, dict)):
            input_specification = [input_specification]
        elif not isinstance(input_specification, list):
            raise ConfigurationError(
                "Input specification must be string, list or dict.")

        out = []
        for input_unit in input_specification:
            if isinstance(input_unit, basestring):
                out.append(input_unit)
            elif isinstance(input_unit, dict):
                missing = object()
                directory = input_unit.get('dir', missing)
                dir_pointer = input_unit.get('dir_pointer', missing)
                if directory is missing and dir_pointer is missing:
                    raise ConfigurationError(
                        'Input directory dict must contain'
                        ' "dir" or "dir_pointer" element (but not both)')
                if directory is not missing and dir_pointer is not missing:
                    raise ConfigurationError(
                        'Input directory dict must not contain'
                        ' both "dir" and "dir_pointer" elements')
                if dir_pointer is not missing:
                    directory = cls._get_pointer(dir_pointer)

                out.extend(
                    cls._get_directory_files(directory=directory,
                                             pattern=input_unit.get('pattern'),
                                             include_dot_files=input_unit.get(
                                                 'include_dot_files', False)))
            else:
                raise ConfigurationError(
                    'Input must only contain strings or dicts')
        return out
Beispiel #3
0
 def _get_compression_format(self):
     """Return the configured 'compression' option after validating it.

     Raises ConfigurationError when the option value is not in
     FILE_COMPRESSION.
     """
     compression = self.read_option('compression')
     if compression not in FILE_COMPRESSION:
         # list() keeps the message readable on Python 3, where keys() is a
         # view whose repr is "dict_keys([...])"; on Python 2 the output is
         # unchanged.
         raise ConfigurationError('The compression format can only be '
                                  'one of the following:  "{}"'
                                  ''.format(list(FILE_COMPRESSION.keys())))
     return compression
Beispiel #4
0
 def _get_fields(self):
     if self.read_option('fields'):
         return self.read_option('fields')
     elif not self.read_option('schema'):
         raise ConfigurationError(
             'CSV formatter requires at least one of: fields or schema')
     return self._get_fields_from_schema()
Beispiel #5
0
 def _load_module(self, options, metadata, module_type, **kwargs):
     module_name = options['name']
     try:
         instance = self._instantiate_class(module_name, options, metadata, **kwargs)
     except ConfigurationError as e:
         raise ConfigurationError('Error in configuration for module %s: %s' % (module_name, e))
     if not isinstance(instance, module_type):
         raise TypeError('Module must inherit from ' + str(module_type))
     return instance
Beispiel #6
0
    def _get_prefixes(self, prefix, prefix_pointer):
        """Return the prefixes to use, from either option but never both.

        A truthy ``prefix_pointer`` is resolved through
        ``_fetch_prefixes_from_pointer``. Otherwise a string ``prefix`` is
        wrapped in a one-item list and any other value is returned as is.

        Raises ConfigurationError when both arguments are truthy.
        """
        if prefix and prefix_pointer:
            raise ConfigurationError('prefix and prefix_pointer options '
                                     'cannot be used together')

        if prefix_pointer:
            return self._fetch_prefixes_from_pointer(prefix_pointer)
        if isinstance(prefix, six.string_types):
            return [prefix]
        return prefix
Beispiel #7
0
def compile_reduce_function(reduce_code, source_path=None):
    """Compile ``reduce_code`` and return the ``reduce_function`` it defines.

    The code is executed in a single fresh namespace, so imports, constants
    and helper functions defined at the top level of ``reduce_code`` are
    visible from inside ``reduce_function`` when it is later called.

    ``source_path`` is the file name passed to ``compile`` for the code;
    it defaults to '<string>'.

    Raises ConfigurationError if the code does not define
    ``reduce_function``.
    """
    # XXX: potential security hole -- only use this in contained environments
    namespace = {}
    # Use one dict as both globals and locals: with separate dicts, top-level
    # names land in the locals dict while functions resolve free names
    # through the (empty) globals dict, so reduce_function could not see its
    # own module-level imports or helpers.
    exec(compile(reduce_code, source_path or '<string>', 'exec'), namespace)
    try:
        return namespace['reduce_function']
    except KeyError:
        raise ConfigurationError(
            "Missing definition of reduce_function(item, accumulator=None)")
Beispiel #8
0
    def _get_collection(self):
        """Build the collection store addressed by the 'collection_url' option.

        The URL is matched against COLLECTION_REGEX, whose two groups are
        used as the project and the collection name. The client is created
        with the 'apikey' option.

        Raises ConfigurationError when the URL does not match.
        """
        url = self.read_option('collection_url')
        parsed = re.match(COLLECTION_REGEX, url)
        if parsed is None:
            raise ConfigurationError("Invalid collection_url: %s" % url)
        project_id, store_name = parsed.groups()

        # Imported here rather than at module level, as in the original.
        import hubstorage
        hs_client = hubstorage.HubstorageClient(self.read_option('apikey'))
        project = hs_client.get_project(project_id)
        return project.collections.new_store(store_name)
Beispiel #9
0
 def check_options(self):
     """Validate configured options against ``self.supported_options``.

     For every supported option, the configured value (after
     ``maybe_cast_list``) must be an instance of the declared type when it
     is truthy. An option whose value is None is accepted only if its spec
     has a 'default', or has an 'env_fallback' naming a non-empty
     environment variable.

     Raises ConfigurationError on a type mismatch or a missing value.
     """
     # items() rather than iteritems(): the latter does not exist on
     # Python 3 dicts, while items() works on both major versions.
     for option_name, option_spec in self.supported_options.items():
         option_value = maybe_cast_list(self.read_option(option_name),
                                        option_spec['type'])
         if option_value and not isinstance(option_value,
                                            option_spec['type']):
             raise ConfigurationError(
                 'Value for option %s should be of type: %s' %
                 (option_name, option_spec['type']))
         # Options with a default never count as missing.
         if 'default' in option_spec:
             continue
         if 'env_fallback' in option_spec and option_value is None:
             if not os.environ.get(option_spec['env_fallback']):
                 raise ConfigurationError(
                     'Missing value for option {}. (tried also: {} from env)'
                     .format(option_name, option_spec['env_fallback']))
         elif option_value is None:
             raise ConfigurationError('Missing value for option %s' %
                                      option_name)