def __init__(self, reader_options, aws_access_key_id, aws_secret_access_key):
    """Configure the reader from ``reader_options`` and the AWS credentials.

    Options read from ``reader_options`` (all via ``.get``):

    - ``bucket``: passed to ``get_bucket`` together with the credentials.
    - ``pattern``: stored as ``self.pattern`` (``None`` when absent).
    - ``prefix`` / ``prefix_pointer``: resolved by ``self._get_prefixes``.
    - ``prefix_format_using_date``: turned into a ``(start, end)`` pair by
      ``self._get_prefix_formatting_dates`` and used to format the prefixes.

    Raises:
        ConfigurationError: if resolving the formatting dates raises
            ``ValueError``, or if ``format_prefixes`` raises
            ``InvalidDateRangeError``.
    """
    option = reader_options.get
    self.source_bucket = get_bucket(
        option('bucket'), aws_access_key_id, aws_secret_access_key)
    self.pattern = option('pattern', None)

    prefix = option('prefix', '')
    prefix_pointer = option('prefix_pointer', '')
    date_option = option('prefix_format_using_date')
    raw_prefixes = self._get_prefixes(prefix, prefix_pointer)

    try:
        # The tuple unpacking stays inside the ``try``: a result that does
        # not unpack into exactly two values also raises ValueError.
        first_date, last_date = self._get_prefix_formatting_dates(date_option)
    except ValueError:
        raise ConfigurationError('The option prefix_format_using_date '
                                 'should be either a date string or two '
                                 'date strings in a list/tuple')

    try:
        self.prefixes = format_prefixes(raw_prefixes, first_date, last_date)
    except InvalidDateRangeError:
        raise ConfigurationError('The end date should be greater or equal '
                                 'to the start date for the '
                                 'prefix_format_using_date option')

    s3_logger = logging.getLogger('s3-reader')
    s3_logger.setLevel(logging.INFO)
    self.logger = s3_logger
def _get_input_files(cls, input_specification):
    """Expand an input specification into a flat list of input files.

    ``input_specification`` may be:

    - a string: a single filename, returned as-is;
    - a dict: a directory description (see below);
    - a list whose items are strings and/or directory dicts.

    A directory dict must hold exactly one of ``dir`` (the directory itself)
    or ``dir_pointer`` (resolved through ``cls._get_pointer``). It may also
    hold ``pattern`` and ``include_dot_files`` (defaults to ``False``); both
    are forwarded to ``cls._get_directory_files``.

    Raises:
        ConfigurationError: if the specification is not a string, list or
            dict, if a list item is neither a string nor a dict, or if a
            directory dict holds neither or both of ``dir``/``dir_pointer``.
    """
    if isinstance(input_specification, (basestring, dict)):
        units = [input_specification]
    elif isinstance(input_specification, list):
        units = input_specification
    else:
        raise ConfigurationError(
            "Input specification must be string, list or dict.")

    files = []
    for unit in units:
        if isinstance(unit, basestring):
            files.append(unit)
            continue
        if not isinstance(unit, dict):
            raise ConfigurationError(
                'Input must only contain strings or dicts')

        has_dir = 'dir' in unit
        has_pointer = 'dir_pointer' in unit
        if not has_dir and not has_pointer:
            raise ConfigurationError(
                'Input directory dict must contain'
                ' "dir" or "dir_pointer" element (but not both)')
        if has_dir and has_pointer:
            raise ConfigurationError(
                'Input directory dict must not contain'
                ' both "dir" and "dir_pointer" elements')

        if has_pointer:
            directory = cls._get_pointer(unit['dir_pointer'])
        else:
            directory = unit['dir']

        files.extend(cls._get_directory_files(
            directory=directory,
            pattern=unit.get('pattern'),
            include_dot_files=unit.get('include_dot_files', False)))
    return files
def _get_compression_format(self):
    """Return the ``compression`` option after validating it.

    The value is accepted only when it is a member of ``FILE_COMPRESSION``.

    Raises:
        ConfigurationError: if the configured value is not in
            ``FILE_COMPRESSION``; the message lists its keys.
    """
    chosen = self.read_option('compression')
    if chosen in FILE_COMPRESSION:
        return chosen
    message = ('The compression format can only be '
               'one of the following: "{}"'
               ''.format(FILE_COMPRESSION.keys()))
    raise ConfigurationError(message)
def _get_fields(self):
    """Return the fields to use, preferring the explicit ``fields`` option.

    When ``fields`` is unset (or falsy), the result of
    ``self._get_fields_from_schema()`` is returned instead, which requires
    the ``schema`` option to be set.

    Raises:
        ConfigurationError: if neither ``fields`` nor ``schema`` is set.
    """
    explicit_fields = self.read_option('fields')
    if explicit_fields:
        return explicit_fields
    if self.read_option('schema'):
        return self._get_fields_from_schema()
    raise ConfigurationError(
        'CSV formatter requires at least one of: fields or schema')
def _load_module(self, options, metadata, module_type, **kwargs):
    """Instantiate the module named in ``options`` and check its type.

    Args:
        options: mapping with at least a ``name`` key; passed on to
            ``self._instantiate_class`` unchanged.
        metadata: passed on to ``self._instantiate_class``.
        module_type: class the created instance must be an instance of.
        **kwargs: extra keyword arguments for ``self._instantiate_class``.

    Returns:
        The instance created by ``self._instantiate_class``.

    Raises:
        ConfigurationError: re-raised with the module name prepended when
            instantiation raises ``ConfigurationError``.
        TypeError: if the instance is not an instance of ``module_type``.
    """
    name = options['name']
    try:
        module = self._instantiate_class(name, options, metadata, **kwargs)
    except ConfigurationError as error:
        raise ConfigurationError(
            'Error in configuration for module %s: %s' % (name, error))
    if isinstance(module, module_type):
        return module
    raise TypeError('Module must inherit from ' + str(module_type))
def _get_prefixes(self, prefix, prefix_pointer):
    """Resolve the prefixes from either ``prefix`` or ``prefix_pointer``.

    - With ``prefix_pointer`` set, the result of
      ``self._fetch_prefixes_from_pointer(prefix_pointer)`` is returned.
    - Otherwise a string ``prefix`` is wrapped in a one-element list, and
      any other value is returned unchanged.

    Raises:
        ConfigurationError: if both ``prefix`` and ``prefix_pointer`` are
            truthy.
    """
    if prefix and prefix_pointer:
        raise ConfigurationError('prefix and prefix_pointer options '
                                 'cannot be used together')
    if prefix_pointer:
        return self._fetch_prefixes_from_pointer(prefix_pointer)
    if isinstance(prefix, six.string_types):
        return [prefix]
    return prefix
def compile_reduce_function(reduce_code, source_path=None):
    """Compile ``reduce_code`` and return the ``reduce_function`` it defines.

    Args:
        reduce_code: Python source that must define, at top level,
            ``reduce_function(item, accumulator=None)``.
        source_path: filename recorded in the compiled code (shown in
            tracebacks); ``'<string>'`` is used when not given.

    Returns:
        The ``reduce_function`` object defined by ``reduce_code``.

    Raises:
        ConfigurationError: if ``reduce_code`` does not define
            ``reduce_function``.
    """
    # XXX: potential security hole -- only use this in contained environments
    namespace = {}
    code = compile(reduce_code, source_path or '<string>', 'exec')
    # Execute with a single dict acting as both globals and locals. With
    # separate dicts, top-level imports/constants/helpers of the user code
    # are stored in the locals dict while reduce_function resolves its
    # globals in the other (empty) dict, so calling it raises NameError as
    # soon as it references any of them.
    exec(code, namespace)
    try:
        return namespace['reduce_function']
    except KeyError:
        raise ConfigurationError(
            "Missing definition of reduce_function(item, accumulator=None)")
def _get_collection(self):
    """Return a new store for the collection named by ``collection_url``.

    The ``collection_url`` option is matched against ``COLLECTION_REGEX``,
    whose two groups are taken as the project and the collection name. A
    ``hubstorage`` client is then created with the ``apikey`` option.

    Raises:
        ConfigurationError: if ``collection_url`` does not match
            ``COLLECTION_REGEX``.
    """
    collection_url = self.read_option('collection_url')
    parsed = re.match(COLLECTION_REGEX, collection_url)
    if parsed is None:
        raise ConfigurationError("Invalid collection_url: %s" % collection_url)
    project, collection_name = parsed.groups()

    # Imported only once the URL has been validated, so hubstorage is
    # needed only when a collection is actually opened.
    import hubstorage
    apikey = self.read_option('apikey')
    client = hubstorage.HubstorageClient(apikey)
    collections = client.get_project(project).collections
    return collections.new_store(collection_name)
def check_options(self):
    """Validate the configured value of every supported option.

    For each ``(name, spec)`` pair in ``self.supported_options``:

    - the value read with ``self.read_option`` is passed through
      ``maybe_cast_list`` together with ``spec['type']``;
    - a truthy value must be an instance of ``spec['type']``;
    - an option whose spec has a ``default`` needs no further check;
    - otherwise a ``None`` value is an error, unless the spec names an
      ``env_fallback`` environment variable that is set and non-empty.

    Raises:
        ConfigurationError: on a wrongly typed value or a missing value.
    """
    # ``items()`` rather than ``iteritems()``: the latter does not exist on
    # Python 3 dicts, while ``items()`` works on both Python 2 and 3.
    for option_name, option_spec in self.supported_options.items():
        expected_type = option_spec['type']
        option_value = maybe_cast_list(self.read_option(option_name),
                                       expected_type)
        # Falsy values (None, '', 0, ...) are deliberately not type-checked,
        # exactly as before.
        if option_value and not isinstance(option_value, expected_type):
            raise ConfigurationError(
                'Value for option %s should be of type: %s' %
                (option_name, expected_type))
        if 'default' in option_spec or option_value is not None:
            continue
        # From here on the option has no default and no configured value.
        if 'env_fallback' not in option_spec:
            raise ConfigurationError('Missing value for option %s' %
                                     option_name)
        if not os.environ.get(option_spec['env_fallback']):
            raise ConfigurationError(
                'Missing value for option {}. (tried also: {} from env)'
                .format(option_name, option_spec['env_fallback']))