Example 1
    def __init__(self, config):
        super(CsvDataSource, self).__init__(config)

        attrs = self.config.attrib
        self.headers = []
        for t in self.config.xpath('header'):
            self.headers.append((t.attrib['name'], int(t.attrib['index'])))

        if 'src' not in attrs:
            raise ImportHandlerException('No source given')
        self.src = attrs['src']
        if 'delimiter' in attrs:
            self.delimiter = attrs['delimiter']
            if self.delimiter == '\\t':
                self.delimiter = '\t'
        else:
            self.delimiter = ','
        try:
            self.offset = int(attrs.get('offset', 0))
            self.count = int(attrs.get('count', sys.maxint))
        except (ValueError, TypeError) as e:
            raise ImportHandlerException('offset and count should be integers',
                                         e)
        logging.info('Csv datasource {0}: offset={1}, count={2}'.format(
            self.name, self.offset, self.count))
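
For reference, a minimal sketch of the XML element this constructor appears to expect; the element and attribute names are taken from the code above, while the datasource name, file path and values are made up. It shows how src, delimiter, offset, count and the header children map onto the parsed attributes, assuming lxml is installed:

from lxml import etree

config = etree.fromstring(
    '<csv name="sample" src="/tmp/data.csv" delimiter="\\t" '
    'offset="10" count="100">'
    '<header name="id" index="0"/>'
    '<header name="title" index="1"/>'
    '</csv>')

attrs = config.attrib
headers = [(h.attrib['name'], int(h.attrib['index']))
           for h in config.xpath('header')]
delimiter = attrs.get('delimiter', ',')
if delimiter == '\\t':          # the literal two-character sequence "\t"
    delimiter = '\t'
print(attrs['src'], repr(delimiter), headers,
      int(attrs.get('offset', 0)), int(attrs.get('count', 0)))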
Example 2
    def _get_iter(self, query=None, query_target=None, params=None):
        """
        Returns datasource iterator.

        query: string
            query string of the url.
        query_target: string
            it isn't used for this datasource.
        """
        if query_target is not None:
            raise ImportHandlerException(
                "Http datasource doesn't support query_target")
        if query:
            query = query.strip()
            url = '{0}/{1}'.format(self.url.rstrip('/'),
                                   str(query).lstrip('/'))
        else:
            url = self.url

        # TODO: params?
        logging.info('Getting data from: %s' % url)
        try:
            resp = requests.request(self.method, url, stream=True)
        except ConnectionError as exc:
            raise ImportHandlerException(
                'Cannot reach url: {}'.format(str(exc)), exc)
        try:
            result = resp.json()
        except Exception as exc:
            raise ImportHandlerException(
                'Cannot parse json: {}'.format(str(exc)), exc)
        if isinstance(result, dict):
            return iter([result])
        return iter(result)
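
Whatever the endpoint returns, the iterator ends up yielding dict rows: a single JSON object becomes a one-element iterator, a JSON array is iterated as-is. A local illustration of that normalization, with no network call involved:

import json

def as_row_iter(payload):
    # Mirrors the dict/list handling at the end of _get_iter above.
    result = json.loads(payload)
    if isinstance(result, dict):
        return iter([result])
    return iter(result)

print(list(as_row_iter('{"id": 1}')))               # [{'id': 1}]
print(list(as_row_iter('[{"id": 1}, {"id": 2}]')))  # [{'id': 1}, {'id': 2}]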
Example 3
 def _raise():
     if key:
         raise ImportHandlerException(
             "Field name couldn't be "
             "empty. Key is '{}'.".format(key))
     else:
         raise ImportHandlerException(
             "Variable name couldn't be empty.")
Example 4
    def __init__(self, config):
        super(HttpDataSource, self).__init__(config)

        attrs = self.config.attrib
        if 'url' not in attrs:
            raise ImportHandlerException('No url given')

        self.url = attrs['url']
        if not self.url:
            raise ImportHandlerException('No url given')

        self.method = attrs.get('method', 'GET')
Example 5
    def process_field(self, field, row, row_data=None):
        row_data = row_data or {}
        if field.column:
            item_value = row.get(field.column, None)
        else:
            item_value = row
        result = {}
        kwargs = {
            'row': row,
            'row_data': row_data,
            'datasource_type': self.datasource.type,
            'script_manager': self.import_handler.plan.script_manager,
            'params': self.params
        }
        if field.is_datasource_field:
            nested_entity = self._get_entity_for_datasource_field(field)
            if nested_entity is None:
                # No entity uses this field's datasource
                return result

            if field.transform == 'json':
                data = load_json(item_value)
                for sub_field in nested_entity.fields.values():
                    result[sub_field.name] = sub_field.process_value(
                        data, **kwargs)
                    kwargs['row_data'].update(result)
            elif field.transform == 'csv':
                # TODO: Implement Fields.transform=csv
                raise ImportHandlerException(
                    'Fields with transform=csv are not implemented yet')
        else:
            result[field.name] = field.process_value(item_value, **kwargs)
        return result
Example 6
    def execute_function(self, script, value,
                         row_data=None, local_vars={}):
        """
        Executes the function and returns its result.

        script: string
            python code to execute.
        value: any
            passed to the script as the #{value} parameter.
        row_data: dict
            script template parameters
        local_vars: dict
            execution context
        """
        def update_strings(val):
            if isinstance(val, basestring):
                return "'%s'" % val
            return val

        row_data = row_data or {}
        params = {'value': update_strings(value)}
        params.update(row_data)
        params.update(local_vars)
        text = ParametrizedTemplate(script).safe_substitute(params)
        try:
            return self._exec(text, local_vars)
        except Exception as exc:
            raise ImportHandlerException(
                "Exception occurred while executing script: {0}. {1}".format(
                    text[:100], exc), exc)
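
ParametrizedTemplate itself is not shown in these snippets. A plausible way to get the #{value}-style substitution described in the docstring is a string.Template subclass with a custom delimiter; the sketch below is an assumption about its behaviour, not the project's actual implementation:

from string import Template

class ParametrizedTemplate(Template):
    # Assumption: '#' as the delimiter, so "#{value}" is filled from params.
    delimiter = '#'

params = {'value': "'2016-01-01'"}   # note the quoting added by update_strings()
print(ParametrizedTemplate(
    "dateutil.parser.parse(#{value})").safe_substitute(params))
# -> dateutil.parser.parse('2016-01-01')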
Example 7
 def __init__(self, config):
     self.label = config.get('label')
     self.script = config.get('script')
     try:
         self.value = float(config.get('value', 1))
     except Exception as exc:
         raise ImportHandlerException(
             'Invalid predict model weight: {0}. '
             'Should be a float value.'.format(config.get('value')), exc)
Example 8
    def __init__(self, entity, import_handler, extra_params={}):
        self.import_handler = import_handler
        self.entity = entity

        params = {}
        params.update(import_handler.params)
        params.update(extra_params)
        self.params = params

        # Building the iterator for the entity
        query = entity.build_query(params)

        self.datasource = import_handler.plan.datasources.get(
            entity.datasource_name)

        if self.datasource is None:
            raise ImportHandlerException(
                "Datasource or transformed field {0} not found, "
                "but it used in the entity {1}".format(entity.datasource_name,
                                                       entity.name))

        # Process sqoop imports
        for sqoop_import in self.entity.sqoop_imports:

            sqoop_import.datasource = import_handler.plan.datasources.get(
                sqoop_import.datasource_name)
            if sqoop_import.query:

                sqoop_query = sqoop_import.build_query(params)
                logging.info('Run query %s' % sqoop_query)
                # Run the db datasource query to create a table
                sqoop_import.datasource.run_queries(sqoop_query)
            if self.entity.autoload_sqoop_dataset:
                from utils import SCHEMA_INFO_FIELDS, PIG_TEMPLATE, \
                    construct_pig_fields
                sql = """select * from {0} limit 1;
select {1} from INFORMATION_SCHEMA.COLUMNS where table_name = '{0}'
order by ordinal_position;""".format(sqoop_import.table,
                                     ','.join(SCHEMA_INFO_FIELDS))

                try:
                    iterator = sqoop_import.datasource._get_iter(sql)
                    fields_data = [{
                        key: opt[i]
                        for i, key in enumerate(SCHEMA_INFO_FIELDS)
                    } for opt in iterator]
                except Exception as exc:
                    raise ValueError("Can't execute the query: {0}. "
                                     "Error: {1}".format(sql, exc))

                fields_str = construct_pig_fields(fields_data)
                load_dataset_script = PIG_TEMPLATE.format(
                    self.entity.sqoop_dataset_name, sqoop_import.target,
                    fields_str, self.datasource.bucket_name)
                query = "{0}\n{1}".format(load_dataset_script, query)
Example 9
    def __init__(self, config, is_file=True):
        if is_file:
            if os.path.isfile(config):
                with open(config, 'r') as fp:
                    config = fp.read()
            else:
                raise ImportHandlerException("import handler file '%s' not "
                                             "found" % config)

        if not config:
            raise ImportHandlerException('import handler file is empty')

        try:
            self.data = objectify.fromstring(config)
        except etree.XMLSyntaxError as e:
            raise ImportHandlerException(
                "Valid XML is expected for import handler. "
                "Parse error: {0}".format(e),
                e
            )

        if not self.is_valid():
            raise ImportHandlerException(
                "There is an error in the import handler's XML, "
                "line {0}. {1}".format(self.error.line, self.error.message))

        self.inputs = {}
        self.load_inputs(self.data)

        self.datasources = {}
        self.load_datasources(self.data)

        self.scripts = []
        self.script_manager = ScriptManager()
        self.load_scripts(self.data)

        # Loading import section
        self.entity = Entity(self.data['import'].entity)

        # Predict section
        self.predict = Predict(self.data.predict) if \
            hasattr(self.data, 'predict') else None
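
A small standalone illustration of the parsing step above; it only assumes lxml is installed, and the XML content is made up:

from lxml import etree, objectify

data = objectify.fromstring('<plan><datasources/><import/></plan>')
print(data.tag)                        # 'plan'

try:
    objectify.fromstring('<plan>')     # malformed on purpose
except etree.XMLSyntaxError as e:
    print('Parse error: {0}'.format(e))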
Example 10
    def get_script_str(self):
        if self.src:
            try:
                self._process_local_file()
            except LocalScriptNotFoundException as e:
                try:
                    self._process_amazon_file()
                except Exception as exc:
                    raise ImportHandlerException(
                        "{0}. Searching on Amazon: {1} ".format(
                            e.message, exc.message), exc)
            except Exception as ex:
                raise ImportHandlerException("Error while accessing script "
                                             "'{0}': {1}".format(self.src,
                                                                 ex.message),
                                             ex)
        elif self.text:
            self.out_string = self.text

        return self.out_string
Example 11
 def __get_obj(row):
     if len(self.headers) == 0:
         return {str(i): row[i] for i in range(0, len(row))}
     obj = {}
     for name, idx in self.headers:
         if len(row) <= idx:
             raise ImportHandlerException(
                 "csv file {0} doesn't contains column "
                 "{1}, named {2}".format(self.src, idx, name))
         obj[name] = row[idx]
     return obj
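
The same column-to-name mapping written as a standalone function, for illustration only; the header definition and sample row below are made up:

def row_to_obj(row, headers):
    # headers is a list of (name, column_index) pairs, as built in Example 1.
    if not headers:
        return {str(i): value for i, value in enumerate(row)}
    for name, idx in headers:
        if len(row) <= idx:
            raise ValueError('row has no column %s (%s)' % (idx, name))
    return {name: row[idx] for name, idx in headers}

print(row_to_obj(['42', 'Alice', 'UK'], [('id', 0), ('country', 2)]))
# -> {'id': '42', 'country': 'UK'}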
Example 12
 def _get_iter(self, query=None, query_target=None, params=None):
     if query == 'any':
         return iter([params])
     try:
         result = json.loads(query)
     except Exception as exc:
         raise ImportHandlerException(
             'Cannot parse json: {}'.format(str(exc)), exc)
     if isinstance(result, dict):
         return iter([result])
     return iter(result)
Example 13
    def _run(self, query, query_target=None, run=False):
        queries = self._get_queries_list(query, query_target)
        method = 1 if run else 0
        vendor = self.config.attrib['vendor']
        db_iter = self.DB.get(vendor)[method] if vendor in self.DB else None
        if db_iter is None:
            raise ImportHandlerException('Database type %s not supported' %
                                         vendor)

        if 'host' not in self.config.attrib:
            raise ImportHandlerException(
                'No database connection details defined')

        from copy import deepcopy
        conn_params = deepcopy(self.config.attrib)
        conn_params.pop('name')
        conn_params.pop('vendor')
        conn_string = ' '.join(
            ['%s=%s' % (k, v) for k, v in conn_params.iteritems()])
        return db_iter(queries, conn_string)
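
The connection string is simply the remaining XML attributes joined as libpq-style key=value pairs. A quick illustration with made-up attribute values:

conn_params = {'name': 'mydb', 'vendor': 'postgres', 'host': 'localhost',
               'dbname': 'prod', 'user': 'cloudml', 'password': 'secret'}
conn_params.pop('name')      # not part of the connection string
conn_params.pop('vendor')    # only used to pick the driver
print(' '.join('%s=%s' % (k, v) for k, v in conn_params.items()))
# e.g. host=localhost dbname=prod user=cloudml password=secret (order may vary)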
Example 14
 def key_exists(s3, bucket, key_name):
     """
     Checks if key exists in the bucket
     """
     try:
         s3.Object(bucket, key_name).load()
         return True
     except ClientError as e:
         if e.response['Error']['Code'] == "404":
             return False
         else:
             raise ImportHandlerException(e.message, e)
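
Typical usage of the helper above. This is only a sketch: it needs real AWS credentials in the environment, and the bucket and key names are placeholders:

import boto3
from botocore.exceptions import ClientError   # key_exists() catches this

s3 = boto3.resource('s3')                     # credentials from env/config
if key_exists(s3, 'my-bucket', 'path/to/file.csv'):
    print('key found')
else:
    print('key is missing')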
Example 15
    def __init__(self, config):
        self.name = config.get('name')
        self.value = config.get('value')
        self.script = config.get('script')
        if not (self.value or self.script):
            raise ImportHandlerException(
                'Either the value or the script attribute needs to be defined'
                ' for predict model {0}'.format(self.name))

        self.weights = []
        for weight in config.xpath('weight'):
            self.weights.append(Weight(weight))
Example 16
 def add_python(self, script):
     """
     Adds python methods to the script manager.
     """
     try:
         if script:
             eval(compile(script, "<str>", "exec"), self.context,
                  self.context)
     except Exception as exc:
         raise ImportHandlerException(
             "Exception occurred while adding python script: {0}. {1}".format(
                 script[:250], exc), exc)
Example 17
    def _get_queries_list(self, query, query_target=None):
        if query is None:
            raise ImportHandlerException(
                "Query is required in the DB datasource")

        query = query.strip(' \t\n\r')
        if not query:
            raise ImportHandlerException(
                "Query is required in the DB datasource")

        if not query.endswith(';'):
            query += ';'

        queries = query.split(';')[:-1]
        queries = [q + ';' for q in queries]

        if query_target:
            check_table_name(query_target)
            queries.append("SELECT * FROM %s;" % query_target)

        return queries
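
The splitting logic above, demonstrated on a throwaway query string (check_table_name is part of the project and is not reproduced here):

query = "  SELECT 1; SELECT 2 "
query = query.strip(' \t\n\r')
if not query.endswith(';'):
    query += ';'
queries = [q + ';' for q in query.split(';')[:-1]]
print(queries)
# -> ['SELECT 1;', ' SELECT 2;']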
Example 18
    def load_datasources(self, config):
        """
        Loads global datasources from configuration.
        """
        for ds_config in iterchildren(config.datasources):
            ds = DataSource.factory(ds_config)
            if ds.name in self.datasources:
                raise ImportHandlerException(
                    'Multiple datasources with the name {0}'.format(ds.name))
            self.datasources[ds.name] = ds

        ds = DataSource.DATASOURCE_DICT['input']()
        self.datasources[ds.name] = ds
Example 19
    def _exec(self, text, row_data=None):
        row_data = row_data or {}
        context = globals().copy()
        context.update(locals())
        context.update(prepare_context(row_data))
        context.update(self.context)

        try:
            return eval(text, context, context)
        except Exception as exc:
            raise ImportHandlerException(
                "Exception occurred while executing script: {0}. {1}".format(
                    text[:100], exc), exc)
Example 20
    def _process_amazon_file(self):
        AMAZON_ACCESS_TOKEN, AMAZON_TOKEN_SECRET, \
            BUCKET_NAME = self.amazon_settings
        try:
            s3 = boto3.resource(
                's3',
                aws_access_key_id=AMAZON_ACCESS_TOKEN,
                aws_secret_access_key=AMAZON_TOKEN_SECRET)
            res = s3.Object(BUCKET_NAME, self.src).get()
            self.out_string = res["Body"].read(res["ContentLength"])

        except Exception as exc:
            raise ImportHandlerException("Error accessing file '{0}' on Amazon"
                                         ": {1}".format(self.src, exc.message),
                                         exc)
Example 21
    def validate_attributes(self):  # TODO:
        """
        Validates field configuration.
        """
        if self.type not in PROCESS_STRATEGIES:
            types = ", ".join(PROCESS_STRATEGIES.keys())
            raise ImportHandlerException(
                'Type of the field %s is invalid: %s. Choose one of %s' %
                (self.name, self.type, types))

        if self.type != 'string':

            def _check_for_string(attr_name):
                if getattr(self, attr_name):
                    raise ImportHandlerException(
                        'Field %s declaration is invalid: use %s only '
                        'for string fields' % (self.name, attr_name))

            _check_for_string('dateFormat')
Example 22
    def process_input_params(self, params):
        """
        Validates that all required input params of the current extraction plan
        exist in the given param dictionary.

        Keyword arguments
        params -- the parameters to check
        """
        logging.info('Validate input parameters.')
        param_set = set(params.keys() if params else ())
        required_set = set(self.plan.inputs.keys())
        missing = required_set.difference(param_set)
        if len(missing) > 0:
            raise ImportHandlerException('Missing input parameters: %s'
                                         % ', '.join(missing))

        for name, inp in self.plan.inputs.iteritems():
            self.params[name] = inp.process_value(params[name])
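
The check boils down to a set difference between the plan's declared inputs and the supplied parameters. A minimal standalone illustration with made-up input names:

required = {'start', 'end'}          # names declared in the plan's inputs
given = {'start': '2016-01-01'}      # what the caller actually passed
missing = required.difference(given)
if missing:
    print('Missing input parameters: %s' % ', '.join(sorted(missing)))
# -> Missing input parameters: end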
Example 23
    def run_sqoop_imports(self, sqoop_imports=[]):
        """
        Runs sqoop imports and saves results to Amazon S3 on
        `self.sqoop_result_uri`.
        """
        for sqoop_import in sqoop_imports:
            db_param = sqoop_import.datasource.config[0].attrib
            connect = "jdbc:postgresql://%s:%s/%s" % (
                db_param['host'], db_param.get('port',
                                               '5432'), db_param['dbname'])
            sqoop_script = self.SQOOP_COMMAND % {
                'table': sqoop_import.table,
                'connect': connect,
                'password': db_param['password'],
                'user': db_param['user'],
                'mappers': sqoop_import.mappers,
                'options': sqoop_import.options
            }
            if sqoop_import.where:
                sqoop_script += " --where %s" % sqoop_import.where
            if sqoop_import.direct:
                sqoop_script += " --direct"
            sqoop_result_uri = "%s%s/" % (self.sqoop_result_uri,
                                          sqoop_import.target)
            self.sqoop_results_uries[sqoop_import.target] = sqoop_result_uri
            sqoop_script += " --target-dir %s" % sqoop_result_uri

            logging.info('Sqoop command: %s' % sqoop_script)
            import subprocess

            p = subprocess.Popen(sqoop_script,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            for line in p.stdout.readlines():
                logging.info(line)
            retval = p.wait()
            if retval != 0:
                raise ImportHandlerException('Sqoop import failed')
Example 24
 def _get_result_job(self, response, j_id):
     cluster = response.get('Cluster', None)
     if not cluster:
         raise ImportHandlerException(
             "Unexpected EMR result: {}".format(response))
     return cluster
Example 25
 def factory(cls, config):
     if config.tag not in cls.DATASOURCE_DICT:
         raise ImportHandlerException(
             '{0} datasource type is not supported'.format(config.tag))
     return cls.DATASOURCE_DICT[config.tag](config)
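
The factory relies on a tag-to-class registry (DATASOURCE_DICT, built elsewhere in the package). A stripped-down sketch of that pattern with made-up class names:

from collections import namedtuple

class BaseDataSource(object):
    DATASOURCE_DICT = {}    # maps an XML tag to a datasource class

    @classmethod
    def factory(cls, config):
        if config.tag not in cls.DATASOURCE_DICT:
            raise ValueError(
                '{0} datasource type is not supported'.format(config.tag))
        return cls.DATASOURCE_DICT[config.tag](config)

class CsvSource(BaseDataSource):
    def __init__(self, config):
        self.config = config

BaseDataSource.DATASOURCE_DICT['csv'] = CsvSource

Config = namedtuple('Config', 'tag')
print(type(BaseDataSource.factory(Config('csv'))).__name__)   # CsvSource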
Example 26
def prepare_context(data):
    """
    Prepares context dictionary.
    Converts definitions like
        data['el.field1'] = val1
        data['el.field2'] = val2
    to an object el in the context, where field1 = val1 and field2 = val2

    data: dict
        dictionary of the context data

    >>> data = {'data.x1': 10, 'data.result.sum': 21}
    >>> data['data.result.metrics.extra'] = [1, 2, 3]
    >>> res = prepare_context(data)
    >>> el = res['data']
    >>> el.x1
    10
    >>> el.result.sum
    21
    >>> el.result.metrics.extra[0]
    1

    >>> data = {'data': 10, 'data.x': 3}
    >>> prepare_context(data)
    Traceback (most recent call last):
        ...
    ImportHandlerException: Can't set variable 'data' in \
the context twice. Keys are: data.x, data.

    >>> prepare_context({'data': 10, 'data.x': 3})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Can't set variable 'data' in the \
context twice. Keys are: data.x, data.

    >>> prepare_context({'data.x': 10, 'data.x.y': 3})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Can't create the x variable for \
data.x: element x already exist and equals The item (<class \
'cloudml.importhandler.scripts.ContextItem'>). Keys are: data.x.y, data.x.

    >>> prepare_context({'data.x': 10, 'data': 3})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Can't create the 'data' variable \
in the context: element 'data' already exist and equals 3 \
(<type 'int'>). Keys are: data, data.x.

    >>> prepare_context({'data.x.y.a': 10, 'data.x.y': 3})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Can't create the y variable for \
data.x.y.a: element y already exist and equals 3 (<type 'int'>). \
Keys are: data.x.y, data.x.y.a.

    >>> prepare_context({'': 10})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Variable name couldn't be empty.

    >>> prepare_context({None: 10})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Variable name couldn't be empty.

    >>> prepare_context({'x. ': 10})
    Traceback (most recent call last):
        ...
    ImportHandlerException: Field name couldn't be empty. Key is 'x. '.
    """
    class ContextItem(object):
        def __str__(self):
            return "The item"

    context = {}
    for key, val in data.iteritems():
        _check_name(key)

        parts = key.split('.')
        parts_count = len(parts)
        if parts_count == 1:  # a simple key-value pair
            if key in context:
                raise ImportHandlerException(
                    "Can't set variable '{0}' in the context"
                    " twice. Keys are: {1}.".format(
                        key, ', '.join(data.keys())))
            context[key] = val
        else:
            # build obj using recursion
            for i in xrange(parts_count):
                name = parts[i]
                _check_name(name, key)
                if i == 0:
                    if name in context:
                        obj = context[name]
                        if not isinstance(obj, ContextItem):
                            raise ImportHandlerException(
                                "Can't create the '{0}' variable in the "
                                "context: element '{0}' already exist and "
                                "equals {1} ({2}). Keys "
                                "are: {3}.".format(
                                    name, str(obj)[:20], type(obj),
                                    ', '.join(data.keys())))
                    else:
                        context[name] = ContextItem()
                        obj = context[name]
                elif i == parts_count - 1:
                    # creating the class field
                    if hasattr(obj, name):
                        raise ImportHandlerException(
                            "Can't create the {0} variable for {3}:"
                            " element {0} already exist and equals "
                            "{1} ({2}). Keys are: {4}.".format(
                                name, str(obj)[:20], type(obj), key,
                                ', '.join(data.keys())))
                    setattr(obj, name, val)
                else:
                    if hasattr(obj, name):
                        obj = getattr(obj, name)
                        if not isinstance(obj, ContextItem):
                            raise ImportHandlerException(
                                "Can't create the {0} variable for {3}:"
                                " element {0} already exist and equals "
                                "{1} ({2}). Keys are: {4}.".format(
                                    name, str(obj)[:20], type(obj), key,
                                    ', '.join(data.keys())))
                    else:
                        item = ContextItem()
                        setattr(obj, name, item)
                        obj = item
    return context
Example 27
 def __init__(self, config):
     self.config = config
     self.name = config.get('name')  # unique
     if not self.name:
         raise ImportHandlerException('name is required')
     self.type = config.tag
Example 28
 def _fail_jobflow(self, step_number):
     logging.error('Jobflow failed, shutting down.')
     self._print_logs(self.log_path, step_number)
     raise ImportHandlerException('Emr jobflow %s failed' % self.jobid)
Example 29
class EntityProcessor(object):
    """
    Helper class that encapsulates the logic of processing entity fields.
    """
    def __init__(self, entity, import_handler, extra_params={}):
        self.import_handler = import_handler
        self.entity = entity

        params = {}
        params.update(import_handler.params)
        params.update(extra_params)
        self.params = params

        # Building the iterator for the entity
        query = entity.build_query(params)

        self.datasource = import_handler.plan.datasources.get(
            entity.datasource_name)

        if self.datasource is None:
            raise ImportHandlerException(
                "Datasource or transformed field {0} not found, "
                "but it used in the entity {1}".format(entity.datasource_name,
                                                       entity.name))

        # Process sqoop imports
        for sqoop_import in self.entity.sqoop_imports:

            sqoop_import.datasource = import_handler.plan.datasources.get(
                sqoop_import.datasource_name)
            if sqoop_import.query:

                sqoop_query = sqoop_import.build_query(params)
                logging.info('Run query %s' % sqoop_query)
                # Run the db datasource query to create a table
                sqoop_import.datasource.run_queries(sqoop_query)
            if self.entity.autoload_sqoop_dataset:
                from utils import SCHEMA_INFO_FIELDS, PIG_TEMPLATE, \
                    construct_pig_fields
                sql = """select * from {0} limit 1;
select {1} from INFORMATION_SCHEMA.COLUMNS where table_name = '{0}'
order by ordinal_position;""".format(sqoop_import.table,
                                     ','.join(SCHEMA_INFO_FIELDS))

                try:
                    iterator = sqoop_import.datasource._get_iter(sql)
                    fields_data = [{
                        key: opt[i]
                        for i, key in enumerate(SCHEMA_INFO_FIELDS)
                    } for opt in iterator]
                except Exception as exc:
                    raise ValueError("Can't execute the query: {0}. "
                                     "Error: {1}".format(sql, exc))

                fields_str = construct_pig_fields(fields_data)
                load_dataset_script = PIG_TEMPLATE.format(
                    self.entity.sqoop_dataset_name, sqoop_import.target,
                    fields_str, self.datasource.bucket_name)
                query = "{0}\n{1}".format(load_dataset_script, query)

        if self.datasource.type == 'pig':
            self.datasource.run_sqoop_imports(self.entity.sqoop_imports)
            self.datasource.set_import_handler(import_handler)

        if self.datasource.type == 'input' and query != 'any':
            query = import_handler.params[query]

        if self.datasource.type in DATASOURCES_REQUIRE_QUERY and \
                (query is None or not query.strip(' \t\n\r')):
            raise ImportHandlerException(
                "Query not specified in the entity {0}, but {1}"
                " datasource {2} require it".format(self.entity.name,
                                                    self.datasource.type,
                                                    self.datasource.name))
        self.iterator = self.datasource._get_iter(query,
                                                  self.entity.query_target,
                                                  import_handler.params)
Example 30
            def _check_for_string(attr_name):
                if getattr(self, attr_name):
                    raise ImportHandlerException(
                        'Field %s declaration is invalid: use %s only '
                        'for string fields' % (self.name, attr_name))