def __init__(self, config):
    super(CsvDataSource, self).__init__(config)

    attrs = self.config.attrib
    self.headers = []
    for t in self.config.xpath('header'):
        self.headers.append((t.attrib['name'], int(t.attrib['index'])))

    if 'src' not in attrs:
        raise ImportHandlerException('No source given')
    self.src = attrs['src']

    if 'delimiter' in attrs:
        self.delimiter = attrs['delimiter']
        if self.delimiter == '\\t':
            self.delimiter = '\t'
    else:
        self.delimiter = ','

    try:
        self.offset = int(attrs.get('offset', 0))
        self.count = int(attrs.get('count', sys.maxint))
    except (ValueError, TypeError) as e:
        raise ImportHandlerException(
            'offset and count should be integers', e)

    logging.info('Csv datasource {0}: offset is {1} and count is {2}'.format(
        self.name, self.offset, self.count))
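# Usage sketch: the constructor above expects XML configuration shaped
# roughly like the fragment below (only 'src', 'delimiter', 'offset',
# 'count' and the nested <header> elements are read here; the names and
# values shown are hypothetical):
#
#   <csv name="products" src="/data/products.csv" delimiter=";"
#        offset="10" count="1000">
#       <header name="id" index="0" />
#       <header name="title" index="1" />
#   </csv>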
def _get_iter(self, query=None, query_target=None, params=None):
    """
    Returns datasource iterator.

    query: string
        query string of the url.
    query_target: string
        it isn't used for this datasource.
    """
    if query_target is not None:
        raise ImportHandlerException(
            "Http datasource doesn't support query_target")

    if query:
        query = query.strip()
        url = '{0}/{1}'.format(self.url.rstrip('/'), str(query).lstrip('/'))
    else:
        url = self.url
    # TODO: params?
    logging.info('Getting data from: %s' % url)

    try:
        resp = requests.request(self.method, url, stream=True)
    except ConnectionError as exc:
        raise ImportHandlerException(
            'Cannot reach url: {}'.format(str(exc)), exc)

    try:
        result = resp.json()
    except Exception as exc:
        raise ImportHandlerException(
            'Cannot parse json: {}'.format(str(exc)), exc)

    # Reuse the already-parsed result instead of re-parsing the response.
    if isinstance(result, dict):
        return iter([result])
    return iter(result)
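# Usage sketch (hedged): 'ds' is a hypothetical HttpDataSource built from
# <http name="api" url="http://example.com/api/" method="GET" />.
#
#   for row in ds._get_iter(query='items?page=1'):
#       print row  # each row is a dict decoded from the JSON response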
def _raise():
    # `key` comes from the enclosing scope (this is a closure helper).
    if key:
        raise ImportHandlerException(
            "Field name couldn't be "
            "empty. Key is '{}'.".format(key))
    else:
        raise ImportHandlerException(
            "Variable name couldn't be empty.")
def __init__(self, config):
    super(HttpDataSource, self).__init__(config)

    attrs = self.config.attrib
    if 'url' not in attrs:
        raise ImportHandlerException('No url given')
    self.url = attrs['url']
    if not self.url:
        raise ImportHandlerException('No url given')
    self.method = attrs.get('method', 'GET')
def process_field(self, field, row, row_data=None):
    row_data = row_data or {}
    if field.column:
        item_value = row.get(field.column, None)
    else:
        item_value = row

    result = {}
    kwargs = {
        'row': row,
        'row_data': row_data,
        'datasource_type': self.datasource.type,
        'script_manager': self.import_handler.plan.script_manager,
        'params': self.params
    }
    if field.is_datasource_field:
        nested_entity = self._get_entity_for_datasource_field(field)
        if nested_entity is None:
            # No entity uses this field as a datasource.
            return result

        if field.transform == 'json':
            data = load_json(item_value)
            for sub_field in nested_entity.fields.values():
                result[sub_field.name] = sub_field.process_value(
                    data, **kwargs)
            kwargs['row_data'].update(result)
        elif field.transform == 'csv':
            # TODO: Implement Fields.transform=csv
            raise ImportHandlerException(
                'Fields with transform=csv are not implemented yet')
    else:
        result[field.name] = field.process_value(item_value, **kwargs)
    return result
def execute_function(self, script, value, row_data=None, local_vars={}):
    """
    Executes the function and returns its result.

    script: string
        python code to execute.
    value: any
        passed as the #{value} parameter to the script.
    row_data: dict
        script template parameters.
    local_vars: dict
        execution context.
    """
    def update_strings(val):
        if isinstance(val, basestring):
            return "'%s'" % val
        return val

    row_data = row_data or {}
    params = {'value': update_strings(value)}
    params.update(row_data)
    params.update(local_vars)
    text = ParametrizedTemplate(script).safe_substitute(params)
    try:
        return self._exec(text, local_vars)
    except Exception as exc:
        raise ImportHandlerException(
            "Exception occurred while executing script: {0}. {1}".format(
                text[:100], exc), exc)
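# Worked example of the substitution step (assumption: ParametrizedTemplate
# is a Template subclass whose placeholder syntax is #{...}, as the
# safe_substitute() call above implies):
#
#   execute_function("#{value}.upper()", "price")
#   # update_strings quotes the value, safe_substitute produces the
#   # script text "'price'.upper()", and _exec evaluates it to 'PRICE'.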
def __init__(self, config):
    self.label = config.get('label')
    self.script = config.get('script')
    try:
        self.value = float(config.get('value', 1))
    except Exception as exc:
        raise ImportHandlerException(
            'Invalid predict model weight: {0}. '
            'Should be a float value.'.format(config.get('value')), exc)
def __init__(self, config, is_file=True):
    if is_file:
        if os.path.isfile(config):
            with open(config, 'r') as fp:
                config = fp.read()
        else:
            raise ImportHandlerException(
                "import handler file '%s' not found" % config)

    if not config:
        raise ImportHandlerException('import handler file is empty')

    try:
        self.data = objectify.fromstring(config)
    except etree.XMLSyntaxError as e:
        raise ImportHandlerException(
            "Valid XML is expected for import handler. "
            "Parse error: {0}".format(e), e)

    if not self.is_valid():
        raise ImportHandlerException(
            "There is an error in the import handler's XML, "
            "line {0}. {1}".format(self.error.line, self.error.message))

    self.inputs = {}
    self.load_inputs(self.data)

    self.datasources = {}
    self.load_datasources(self.data)

    self.scripts = []
    self.script_manager = ScriptManager()
    self.load_scripts(self.data)

    # Loading import section
    self.entity = Entity(self.data['import'].entity)

    # Predict section
    self.predict = Predict(self.data.predict) if \
        hasattr(self.data, 'predict') else None
def get_script_str(self):
    if self.src:
        try:
            self._process_local_file()
        except LocalScriptNotFoundException as e:
            try:
                self._process_amazon_file()
            except Exception as exc:
                raise ImportHandlerException(
                    "{0}. Searching on Amazon: {1} ".format(
                        e.message, exc.message), exc)
        except Exception as ex:
            raise ImportHandlerException(
                "Error while accessing script '{0}': {1}".format(
                    self.src, ex.message), ex)
    elif self.text:
        self.out_string = self.text
    return self.out_string
def __get_obj(row):
    if len(self.headers) == 0:
        return {str(i): row[i] for i in range(0, len(row))}

    obj = {}
    for name, idx in self.headers:
        if len(row) <= idx:
            raise ImportHandlerException(
                "csv file {0} doesn't contain column "
                "{1} (named '{2}')".format(self.src, idx, name))
        obj[name] = row[idx]
    return obj
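# Worked example: with self.headers == [('id', 0), ('name', 2)], the row
# ['42', 'skipped', 'Bob'] maps to {'id': '42', 'name': 'Bob'}; with no
# headers configured it maps to {'0': '42', '1': 'skipped', '2': 'Bob'}.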
def _get_iter(self, query=None, query_target=None, params=None):
    if query == 'any':
        return iter([params])
    try:
        result = json.loads(query)
    except Exception as exc:
        raise ImportHandlerException(
            'Cannot parse json: {}'.format(str(exc)), exc)
    if isinstance(result, dict):
        return iter([result])
    return iter(result)
def _run(self, query, query_target=None, run=False):
    queries = self._get_queries_list(query, query_target)

    method = 1 if run else 0
    vendor = self.config.attrib['vendor']
    db_iter = self.DB.get(vendor)[method] if vendor in self.DB else None
    if db_iter is None:
        raise ImportHandlerException(
            'Database type %s not supported' % vendor)

    if 'host' not in self.config.attrib:
        raise ImportHandlerException(
            'No database connection details defined')

    from copy import deepcopy
    conn_params = deepcopy(self.config.attrib)
    conn_params.pop('name')
    conn_params.pop('vendor')
    conn_string = ' '.join(
        ['%s=%s' % (k, v) for k, v in conn_params.iteritems()])
    return db_iter(queries, conn_string)
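# Worked example: config attributes {'name': 'db', 'vendor': 'postgres',
# 'host': 'localhost', 'dbname': 'mydb'} yield the connection string
# "host=localhost dbname=mydb" (dict iteration order is not guaranteed),
# which is passed to the vendor-specific iterator looked up in self.DB.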
def key_exists(s3, bucket, key_name):
    """ Checks if the key exists in the bucket. """
    try:
        s3.Object(bucket, key_name).load()
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == "404":
            return False
        else:
            raise ImportHandlerException(e.message, e)
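# Usage sketch (bucket and key names are hypothetical):
#
#   import boto3
#   s3 = boto3.resource('s3')
#   if key_exists(s3, 'my-bucket', 'path/to/file.csv'):
#       pass  # safe to fetch the object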
def __init__(self, config):
    self.name = config.get('name')
    self.value = config.get('value')
    self.script = config.get('script')
    if not (self.value or self.script):
        raise ImportHandlerException(
            'Either the value or the script attribute needs to be defined'
            ' for predict model {0}'.format(self.name))
    self.weights = []
    for weight in config.xpath('weight'):
        self.weights.append(Weight(weight))
def add_python(self, script):
    """ Adds python methods to the script manager. """
    try:
        if script:
            eval(compile(script, "<str>", "exec"),
                 self.context, self.context)
    except Exception as exc:
        raise ImportHandlerException(
            "Exception occurred while adding python script: {0}. {1}".format(
                script[:250], exc), exc)
def _get_queries_list(self, query, query_target=None):
    if query is None:
        raise ImportHandlerException(
            "Query is required in the DB datasource")
    query = query.strip(' \t\n\r')
    if not query:
        raise ImportHandlerException(
            "Query is required in the DB datasource")
    if not query.endswith(';'):
        query += ';'
    queries = query.split(';')[:-1]
    queries = [q + ';' for q in queries]
    if query_target:
        check_table_name(query_target)
        queries.append("SELECT * FROM %s;" % query_target)
    return queries
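# Worked example:
#
#   _get_queries_list("SELECT 1; SELECT 2", query_target="results")
#   # -> ['SELECT 1;', ' SELECT 2;', 'SELECT * FROM results;']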
def load_datasources(self, config):
    """ Loads global datasources from configuration. """
    for ds_config in iterchildren(config.datasources):
        ds = DataSource.factory(ds_config)
        if ds.name in self.datasources:
            raise ImportHandlerException(
                'There are multiple datasources with the name {0}'.format(
                    ds.name))
        self.datasources[ds.name] = ds

    # The built-in 'input' datasource is always registered.
    ds = DataSource.DATASOURCE_DICT['input']()
    self.datasources[ds.name] = ds
def _exec(self, text, row_data=None):
    row_data = row_data or {}
    context = globals().copy()
    context.update(locals())
    context.update(prepare_context(row_data))
    context.update(self.context)
    try:
        return eval(text, context, context)
    except Exception as exc:
        raise ImportHandlerException(
            "Exception occurred while executing script: {0}. {1}".format(
                text[:100], exc), exc)
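# Worked example (prepare_context is the helper defined below):
#
#   _exec("data.x1 + 1", {'data.x1': 10})
#   # prepare_context({'data.x1': 10}) puts an object 'data' with
#   # x1 == 10 into the eval context, so the expression returns 11.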
def _process_amazon_file(self):
    AMAZON_ACCESS_TOKEN, AMAZON_TOKEN_SECRET, \
        BUCKET_NAME = self.amazon_settings
    try:
        s3 = boto3.resource(
            's3',
            aws_access_key_id=AMAZON_ACCESS_TOKEN,
            aws_secret_access_key=AMAZON_TOKEN_SECRET)
        res = s3.Object(BUCKET_NAME, self.src).get()
        self.out_string = res["Body"].read(res["ContentLength"])
    except Exception as exc:
        raise ImportHandlerException(
            "Error accessing file '{0}' on Amazon: {1}".format(
                self.src, exc.message), exc)
def validate_attributes(self):  # TODO:
    """ Validates field configuration. """
    if self.type not in PROCESS_STRATEGIES:
        types = ", ".join(PROCESS_STRATEGIES.keys())
        raise ImportHandlerException(
            'Type of the field %s is invalid: %s. Choose one of %s' %
            (self.name, self.type, types))

    if self.type != 'string':
        def _check_for_string(attr_name):
            if getattr(self, attr_name):
                raise ImportHandlerException(
                    'Field %s declaration is invalid: use %s '
                    'only for string fields' % (self.name, attr_name))
        _check_for_string('dateFormat')
def process_input_params(self, params):
    """
    Validates that all required input params of the current
    extraction plan exist in the given param dictionary.

    Keyword arguments:
    params -- the parameters to check
    """
    logging.info('Validate input parameters.')
    param_set = set(params.keys() if params else ())
    required_set = set(self.plan.inputs.keys())
    missing = required_set.difference(param_set)
    if len(missing) > 0:
        raise ImportHandlerException(
            'Missing input parameters: %s' % ', '.join(missing))

    for name, inp in self.plan.inputs.iteritems():
        self.params[name] = inp.process_value(params[name])
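# Example: with plan inputs {'start', 'end'}, the call
#
#   process_input_params({'start': '2014-01-01'})
#
# raises ImportHandlerException('Missing input parameters: end').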
def run_sqoop_imports(self, sqoop_imports=[]):
    """
    Runs sqoop imports and saves results to Amazon S3
    on `self.sqoop_result_uri`.
    """
    for sqoop_import in sqoop_imports:
        db_param = sqoop_import.datasource.config[0].attrib
        connect = "jdbc:postgresql://%s:%s/%s" % (
            db_param['host'],
            db_param.get('port', '5432'),
            db_param['dbname'])
        sqoop_script = self.SQOOP_COMMAND % {
            'table': sqoop_import.table,
            'connect': connect,
            'password': db_param['password'],
            'user': db_param['user'],
            'mappers': sqoop_import.mappers,
            'options': sqoop_import.options
        }
        if sqoop_import.where:
            sqoop_script += " --where %s" % sqoop_import.where
        if sqoop_import.direct:
            sqoop_script += " --direct"

        sqoop_result_uri = "%s%s/" % (
            self.sqoop_result_uri, sqoop_import.target)
        self.sqoop_results_uries[sqoop_import.target] = sqoop_result_uri
        sqoop_script += " --target-dir %s" % sqoop_result_uri

        logging.info('Sqoop command: %s' % sqoop_script)
        import subprocess
        p = subprocess.Popen(sqoop_script, shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        for line in p.stdout.readlines():
            logging.info(line)
        retval = p.wait()
        if retval != 0:
            raise ImportHandlerException('Sqoop import failed')
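# Sketch of a generated command line (hedged: SQOOP_COMMAND itself is
# defined elsewhere; the --where, --direct and --target-dir flags are the
# ones appended above, everything else is illustrative):
#
#   sqoop import --connect jdbc:postgresql://host:5432/db ... \
#       --where "id > 100" --direct --target-dir s3://bucket/target/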
def _get_result_job(self, response, j_id):
    cluster = response.get('Cluster', None)
    if not cluster:
        raise ImportHandlerException(
            "Unexpected EMR result: {}".format(response))
    return cluster
@classmethod
def factory(cls, config):
    if config.tag not in cls.DATASOURCE_DICT:
        raise ImportHandlerException(
            '{0} datasource type is not supported'.format(config.tag))
    return cls.DATASOURCE_DICT[config.tag](config)
def prepare_context(data):
    """
    Prepares the context dictionary.

    Converts definitions like
        data['el.field1'] = val1
        data['el.field2'] = val2
    to an object `el` in the context, where field1 = val1 and
    field2 = val2.

    data: dict
        dictionary of the context data

    >>> data = {'data.x1': 10, 'data.result.sum': 21}
    >>> data['data.result.metrics.extra'] = [1, 2, 3]
    >>> res = prepare_context(data)
    >>> el = res['data']
    >>> el.x1
    10
    >>> el.result.sum
    21
    >>> el.result.metrics.extra[0]
    1
    >>> prepare_context({'data': 10, 'data.x': 3})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Can't set variable 'data' in the \
context twice. Keys are: data.x, data.
    >>> prepare_context({'data.x': 10, 'data.x.y': 3})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Can't create the x variable for \
data.x: element x already exists and equals The item (<class \
'cloudml.importhandler.scripts.ContextItem'>). Keys are: data.x.y, data.x.
    >>> prepare_context({'data.x': 10, 'data': 3})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Can't create the 'data' variable \
in the context: element 'data' already exists and equals 3 \
(<type 'int'>). Keys are: data, data.x.
    >>> prepare_context({'data.x.y.a': 10, 'data.x.y': 3})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Can't create the y variable for \
data.x.y.a: element y already exists and equals 3 (<type 'int'>). \
Keys are: data.x.y, data.x.y.a.
    >>> prepare_context({'': 10})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Variable name couldn't be empty.
    >>> prepare_context({None: 10})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Variable name couldn't be empty.
    >>> prepare_context({'x. ': 10})
    Traceback (most recent call last):
    ...
    ImportHandlerException: Field name couldn't be empty. Key is 'x. '.
    """
    class ContextItem(object):
        def __str__(self):
            return "The item"

    context = {}
    for key, val in data.iteritems():
        _check_name(key)
        splited = key.split('.')
        splitted_count = len(splited)
        if splitted_count == 1:
            # simple key-value pair
            if key in context:
                raise ImportHandlerException(
                    "Can't set variable '{0}' in the context"
                    " twice. Keys are: {1}.".format(
                        key, ', '.join(data.keys())))
            context[key] = val
        else:
            # build the object attribute by attribute, using recursion
            for i in xrange(0, splitted_count):
                name = splited[i]
                _check_name(name, key)
                if i == 0:
                    if name in context:
                        obj = context[name]
                        if not isinstance(obj, ContextItem):
                            raise ImportHandlerException(
                                "Can't create the '{0}' variable in the "
                                "context: element '{0}' already exists and "
                                "equals {1} ({2}). Keys "
                                "are: {3}.".format(
                                    name, str(obj)[:20], type(obj),
                                    ', '.join(data.keys())))
                    else:
                        context[name] = ContextItem()
                        obj = context[name]
                elif i == splitted_count - 1:
                    # creating the class field
                    if hasattr(obj, name):
                        raise ImportHandlerException(
                            "Can't create the {0} variable for {3}:"
                            " element {0} already exists and equals "
                            "{1} ({2}). Keys are: {4}.".format(
                                name, str(obj)[:20], type(obj), key,
                                ', '.join(data.keys())))
                    setattr(obj, name, val)
                else:
                    if hasattr(obj, name):
                        obj = getattr(obj, name)
                        if not isinstance(obj, ContextItem):
                            raise ImportHandlerException(
                                "Can't create the {0} variable for {3}:"
                                " element {0} already exists and equals "
                                "{1} ({2}). Keys are: {4}.".format(
                                    name, str(obj)[:20], type(obj), key,
                                    ', '.join(data.keys())))
                    else:
                        item = ContextItem()
                        setattr(obj, name, item)
                        obj = item
    return context
def __init__(self, config):
    self.config = config
    self.name = config.get('name')  # unique
    if not self.name:
        raise ImportHandlerException('name is required')
    self.type = config.tag
def _fail_jobflow(self, step_number):
    logging.error('Jobflow failed, shutting down.')
    self._print_logs(self.log_path, step_number)
    raise ImportHandlerException('Emr jobflow %s failed' % self.jobid)
class EntityProcessor(object):
    """
    Helper class that encapsulates the logic of processing entity fields.
    """
    def __init__(self, entity, import_handler, extra_params={}):
        self.import_handler = import_handler
        self.entity = entity

        params = {}
        params.update(import_handler.params)
        params.update(extra_params)
        self.params = params

        # Building the iterator for the entity
        query = entity.build_query(params)
        self.datasource = import_handler.plan.datasources.get(
            entity.datasource_name)
        if self.datasource is None:
            raise ImportHandlerException(
                "Datasource or transformed field {0} not found, "
                "but it is used in the entity {1}".format(
                    entity.datasource_name, entity.name))

        # Process sqoop imports
        for sqoop_import in self.entity.sqoop_imports:
            sqoop_import.datasource = import_handler.plan.datasources.get(
                sqoop_import.datasource_name)
            if sqoop_import.query:
                sqoop_query = sqoop_import.build_query(params)
                logging.info('Run query %s' % sqoop_query)
                # We run the db datasource query to create a table
                sqoop_import.datasource.run_queries(sqoop_query)

            if self.entity.autoload_sqoop_dataset:
                from utils import SCHEMA_INFO_FIELDS, PIG_TEMPLATE, \
                    construct_pig_fields
                sql = """select * from {0} limit 1;
select {1} from INFORMATION_SCHEMA.COLUMNS
where table_name = '{0}'
order by ordinal_position;""".format(
                    sqoop_import.table, ','.join(SCHEMA_INFO_FIELDS))
                try:
                    iterator = sqoop_import.datasource._get_iter(sql)
                    fields_data = [{
                        key: opt[i]
                        for i, key in enumerate(SCHEMA_INFO_FIELDS)
                    } for opt in iterator]
                except Exception as exc:
                    raise ValueError(
                        "Can't execute the query: {0}."
                        " Error: {1}".format(sql, exc))
                fields_str = construct_pig_fields(fields_data)
                load_dataset_script = PIG_TEMPLATE.format(
                    self.entity.sqoop_dataset_name,
                    sqoop_import.target,
                    fields_str,
                    self.datasource.bucket_name)
                query = "{0}\n{1}".format(load_dataset_script, query)

        if self.datasource.type == 'pig':
            self.datasource.run_sqoop_imports(self.entity.sqoop_imports)
            self.datasource.set_import_handler(import_handler)

        if self.datasource.type == 'input' and query != 'any':
            query = import_handler.params[query]

        if self.datasource.type in DATASOURCES_REQUIRE_QUERY and \
                (query is None or not query.strip(' \t\n\r')):
            raise ImportHandlerException(
                "Query not specified in the entity {0}, but the {1}"
                " datasource {2} requires it".format(
                    self.entity.name, self.datasource.type,
                    self.datasource.name))

        self.iterator = self.datasource._get_iter(
            query, self.entity.query_target, import_handler.params)
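# Usage sketch (entity and import_handler are assumed to be already
# configured objects from the extraction plan):
#
#   processor = EntityProcessor(entity, import_handler)
#   for row in processor.iterator:
#       pass  # each row comes from the entity's datasource query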