Ejemplo n.º 1
0
    def clean(self, value):
        """Validate an XML import handler plan supplied as form data.

        Returns the UTF-8 encoded plan on success, or ``None`` when no
        value was supplied.  Side effects: stores the plan's input
        parameter names in ``self.import_params`` and marks the handler
        type as 'xml'.

        Raises ValidationError when the plan cannot be parsed.
        """
        if value is None:
            return

        value = value.encode('utf-8')
        from cloudml.importhandler.importhandler import ExtractionPlan
        from api.amazon_utils import amazon_config
        try:
            plan = ExtractionPlan(value, is_file=False)
            plan.amazon_settings = amazon_config()
            self.import_params = plan.inputs.keys()
            self.import_handler_type = 'xml'
        except Exception as exc:
            # Use str(exc) instead of exc.message: not every exception
            # class defines .message, and the attribute was removed in
            # Python 3 — accessing it could mask the real parse error
            # with an AttributeError.
            raise ValidationError(str(exc), exc)
        return value
Ejemplo n.º 2
0
def main(argv=None):
    """CLI entry point: train a transformer from a local data file or
    from an import-handler extraction plan.

    Returns a non-zero status code on configuration errors.
    """
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        transformer = Transformer(args.path)
    except (TransformerSchemaException, IOError) as e:
        logging.warn('Invalid feature model: %s' % e.message)
        print_exception(e)
        return INVALID_TRANSFORMER_CONFIG

    try:
        if args.input is not None:
            # Train from a local data file; the format is taken from the
            # file extension (e.g. 'json', 'csv').
            file_format = os.path.splitext(args.input)[1][1:]
            with open(args.input, 'r') as train_fp:
                transformer.train(
                    streamingiterload(train_fp, source_format=file_format))
        elif args.extraction is not None:
            # Train by pulling data through an import handler.
            train_context = list_to_dict(args.train_params)

            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException, e:  # NOTE: Python 2 except syntax
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))
            transformer.train(train_handler)
        else:
Ejemplo n.º 3
0
def main(argv=None):
    """CLI entry point: run an import handler and optionally store the
    extracted data to an output file.

    Returns a non-zero status code when the extraction plan is invalid.
    """
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        if args.user_params is not None:
            # 'key=value' strings -> dict; split on the first '=' only,
            # so values may themselves contain '='.
            param_list = [x.split('=', 1) for x in args.user_params]
            context = dict((key, value) for (key, value) in param_list)
        else:
            context = {}

        logging.info('User-defined parameters:')
        for key, value in context.items():
            logging.info('%s --> %s' % (key, value))

        try:
            plan = ExtractionPlan(args.path)
            extractor = ImportHandler(plan, context)

        except ImportHandlerException, e:  # NOTE: Python 2 except syntax
            logging.warn('Invalid extraction plan: {}'.format(e.message))
            print_exception(e)
            return INVALID_EXTRACTION_PLAN

        if args.output is not None:
            logging.info('Storing data to %s...' % args.output)
            # Dispatch to store_data_<format> when such a method exists,
            # falling back to JSON output otherwise.
            getattr(extractor, 'store_data_{}'.format(args.format),
                    extractor.store_data_json)(args.output)

            logging.info('Total %s lines' % (extractor.count, ))
            logging.info('Ignored %s lines' % (extractor.ignored, ))
Ejemplo n.º 4
0
    def test_pig_datasource(self, sleep_mock, sqoop_mock):
        """Exercise the pig datasource end-to-end using recorded AWS
        responses (placebo) and a mocked sqoop subprocess.

        ``sleep_mock`` and ``sqoop_mock`` are injected by patch
        decorators on the enclosing class (not visible here).
        """
        # Amazon mock: replay canned AWS responses from the placebo dir.
        self.pill.attach(
            self.session,
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             'placebo_responses/importhandler/pigxml')))
        self.pill.playback()

        self._plan = ExtractionPlan(
            os.path.join(BASEDIR, 'extractorxml',
                         'pig-train-import-handler.xml'))

        # Sqoop import subprocess mock: pretend the import exited cleanly
        # with no output.
        process_mock = Mock()
        attrs = {'wait.return_value': 0, 'stdout.readlines.return_value': []}
        process_mock.configure_mock(**attrs)
        sqoop_mock.return_value = process_mock

        # Database access is stubbed out entirely.
        with patch('psycopg2.extras.DictCursor.execute'):
            with patch('psycopg2.connect'):
                self._extractor = ImportHandler(self._plan, PARAMS)

        pig_ds = self._extractor.plan.datasources['pig']
        # Checking iterator (Python 2 .next() protocol).
        row = self._extractor.next()
        self.assertEquals(row['opening_id'], 57)
Ejemplo n.º 5
0
    def get_fields(self):
        """
        Returns list of the field names defined by this handler's
        extraction plan (excluding datasource fields), walking nested
        entities recursively.  Returns [] when no data is set.

        Raises ImportHandlerError when the plan cannot be parsed.
        """
        if self.data is None:
            return []

        def get_entity_fields(entity):
            # Collect non-datasource field names from this entity and,
            # recursively, from every nested entity.
            fields = []
            # .items() instead of .iteritems(): identical iteration in
            # Python 2, and forward-compatible with Python 3.
            for name, field in entity.fields.items():
                if not field.is_datasource_field:
                    fields.append(field.name)
            for sub_entity in entity.nested_entities_field_ds.values():
                fields += get_entity_fields(sub_entity)
            for sub_entity in entity.nested_entities_global_ds:
                fields += get_entity_fields(sub_entity)
            return fields

        # TODO: try .. except after check this with real import handlers
        try:
            plan = ExtractionPlan(self.data, is_file=False)
            return get_entity_fields(plan.entity)
        except Exception as exc:  # 'as' syntax: valid in Py2.6+ and Py3
            logging.error(exc)
            # str(exc) instead of exc.message: .message is not defined on
            # all exception classes and was removed in Python 3.
            raise ImportHandlerError(str(exc), exc)
Ejemplo n.º 6
0
    def clean_data(self, value, field):
        """Validate that *value* parses as an import handler plan.

        Returns the UTF-8 encoded value on success, or ``None`` when no
        value was supplied.

        Raises ValidationError when the plan cannot be parsed.
        """
        if value is None:
            return

        value = value.encode('utf-8')
        try:
            ExtractionPlan(value, is_file=False)
            return value
        except Exception as exc:
            # str(exc) rather than exc.message: .message is not defined
            # on all exception classes and was removed in Python 3.
            raise ValidationError(str(exc), exc)
Ejemplo n.º 7
0
    def _get_models_action(self, **kwargs):
        """Collect, for each import handler deployed on a server, the
        predict model it references plus the server-side metadata of
        both.

        Handlers whose XML fails to parse are logged and skipped.
        """
        parser_params = (('server', str), )
        params = self._parse_parameters(parser_params)
        server_id = params.get('server')
        server = Server.query.get(server_id)
        results = []
        # Models deployed on the server, keyed by name.
        models = server.list_keys(folder=FOLDER_MODELS)
        models_map = {item.get('name'): item for item in models}
        # Import handlers deployed on the server, keyed by object id.
        import_handlers = server.list_keys(folder=FOLDER_IMPORT_HANDLERS)
        handler_map = {
            int(item['object_id']): item
            for item in import_handlers if item['object_id']
        }
        ids = handler_map.keys()
        import_handlers_obj = XmlImportHandler.query.filter(
            XmlImportHandler.id.in_(ids)).all()
        from cloudml.importhandler.importhandler import ExtractionPlan
        for h in import_handlers_obj:
            try:
                plan = ExtractionPlan(h.get_plan_config(), is_file=False)
            except Exception as exc:  # 'as' syntax: Py2.6+/Py3 compatible
                # Include the parse error so corrupted handlers can be
                # diagnosed from the log alone.
                logging.error('Corrupted import handler: {0} ({1})'.format(
                    h.id, exc))
                continue

            if plan.predict and plan.predict.models:
                # Only the first predict model is reported.
                pmodel = plan.predict.models[0]
                handler_key = handler_map[h.id]
                if pmodel.value:
                    # Model referenced by name: look it up on the server.
                    model_key = models_map.get(pmodel.value)
                    if model_key:
                        model_obj = Model.query.get(model_key.get('object_id'))
                        results.append({
                            'model_name': pmodel.value,
                            'model_metadata': model_key,
                            'model': model_obj,
                            'import_handler_name': handler_key.get('name'),
                            'import_handler': h,
                            'import_handler_metadata': handler_key
                        })
                else:  # model is defined in the script
                    results.append({
                        'import_handler_name': handler_key.get('name'),
                        'import_handler': h,
                        'import_handler_metadata': handler_key
                    })
Ejemplo n.º 8
0
    def test_load_plan_with_schema_error(self):
        """Invalid handler XML files must raise ImportHandlerException
        with a message matching the expected pattern."""
        def _check(name, err):
            # Loading the named invalid fixture must raise with a
            # message matching the `err` regex.
            file_name = os.path.join(BASEDIR, 'extractorxml', 'invalid', name)
            with self.assertRaisesRegexp(ImportHandlerException, err):
                ExtractionPlan(file_name)

        _check(
            "no-entity.xml", "There is an error in the import handler's "
            # Raw string: \W and \w are regex escapes, not string escapes
            # (avoids an invalid-escape DeprecationWarning; value is
            # unchanged).
            r"XML, line 15.\W+\w+")
        _check("datasource_name.xml",
               "There are few datasources with name odw")

        with self.assertRaisesRegexp(ImportHandlerException,
                                     "import handler file is empty"):
            ExtractionPlan(None, is_file=False)
Ejemplo n.º 9
0
def main(argv=None):
    """CLI entry point: evaluate a trained model against data from a
    local file or from an import-handler extraction plan.

    Returns a non-zero status code on invalid trainer files or plans.
    """
    parser = create_parser()
    args = parser.parse_args(argv)
    init_logging(args.debug)

    try:
        with open(args.path, 'r') as fp:
            trainer = load_trainer(fp)
    except (IOError, InvalidTrainerFile) as exc:
        logging.warn('Invalid trainer file: {0!s}'.format(exc))
        print_exception(exc)
        return INVALID_TRAINER

    try:
        iterator = None
        if args.input is not None:
            # Read evaluation data from file.
            eval_fp = open(args.input, 'r')
            file_format = determine_data_format(args.input)
            iterator = streamingiterload(eval_fp, source_format=file_format)
        elif args.extraction is not None:
            # Use import handler
            try:
                eval_context = list_to_dict(args.eval_params)
                plan = ExtractionPlan(args.extraction)
                eval_handler = ImportHandler(plan, eval_context)
            except ImportHandlerException, e:  # NOTE: Python 2 except syntax
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in eval_context.items():
                logging.info('%s --> %s' % (key, value))

            iterator = eval_handler
        else:
Ejemplo n.º 10
0
class XmlDataSourceForm(ParametersConvertorMixin, BaseForm):
    """Form for creating/editing a datasource of an XML import handler."""
    XML_PARAMETERS = True
    # Per-datasource-type parameter schema, resolved once at class
    # creation time.
    PARAMETERS_CONFIGURATION = ExtractionPlan.get_datasources_config()

    # Required on create; with NO_REQUIRED_FOR_EDIT they may be omitted
    # when editing an existing object.
    required_fields = ('name', 'type', 'import_handler_id')
    NO_REQUIRED_FOR_EDIT = True

    name = CharField()
    type_field = ChoiceField(choices=_get_ds_types(), name='type')
    params = JsonField()
    import_handler_id = DocumentField(doc=XmlImportHandler,
                                      by_name=False,
                                      return_doc=False)

    def clean_name(self, value, field):
        """Require a name on create and enforce per-handler uniqueness."""
        # Name may be omitted only when editing an existing object.
        if not ((self.NO_REQUIRED_FOR_EDIT and self.obj.id) or value):
            raise ValidationError('name is required field')

        import_handler_id = self.obj.import_handler_id if \
            self.obj.id else self.data['import_handler_id']

        # Datasource names must be unique within one import handler;
        # when editing, exclude the object itself from the check.
        query = XmlDataSource.query.filter_by(
            name=value, import_handler_id=import_handler_id)
        if self.obj.id:
            query = query.filter(XmlDataSource.id != self.obj.id)
        count = query.count()
        if count:
            raise ValidationError('Data Source with name "%s" already \
exist. Please choose another one.' % value)
        return value

    def validate_data(self):
        # Convert the raw 'params' JSON according to the configuration
        # declared for the chosen datasource type.
        type_ = self.cleaned_data.get('type')
        self.convert_params(type_,
                            self.cleaned_data.get('params'),
                            configuration=self.PARAMETERS_CONFIGURATION)
Ejemplo n.º 11
0
 def setUp(self):
     """Load the HTTP train import-handler fixture as the test plan."""
     plan_path = os.path.join(BASEDIR, 'extractorxml',
                              'http-train-import-handler.xml')
     self._plan = ExtractionPlan(plan_path)
Ejemplo n.º 12
0
 def get_iterator(self, params, callback=None):
     """Build a CoreImportHandler over this handler's extraction plan.

     *params* supplies the handler's input parameters; *callback* is
     forwarded to CoreImportHandler unchanged.
     """
     extraction_plan = ExtractionPlan(self.get_plan_config(), is_file=False)
     return CoreImportHandler(extraction_plan, params, callback=callback)
Ejemplo n.º 13
0
 def setUp(self):
     """Load the input-datasource handler fixture as the test plan."""
     from cloudml.importhandler.importhandler import ExtractionPlan
     fixture = os.path.join(BASEDIR, 'extractorxml',
                            'input-datasource-handler.xml')
     self._plan = ExtractionPlan(fixture)
Ejemplo n.º 14
0
 def test_load_valid_plan(self):
     """A well-formed import handler file must load without raising."""
     ExtractionPlan(self.importhandler_file)
Ejemplo n.º 15
0
 def test_load_plan_with_syntax_error(self):
     """Prepending a stray quote must make the handler XML unparseable."""
     with open(self.importhandler_file, 'r') as fp:
         corrupted = '"' + fp.read()
     with self.assertRaises(ImportHandlerException):
         ExtractionPlan(corrupted, is_file=False)
Ejemplo n.º 16
0
def fill_import_handler(import_handler, xml_data=None):
    """Populate the database models of *import_handler* from handler XML.

    When *xml_data* is None, a single empty root entity is created.
    Otherwise the XML is parsed into an ExtractionPlan and its
    datasources, input parameters, scripts, entity tree and predict
    section are mirrored into the corresponding Xml* database objects.

    Raises ValueError when a nested entity references a datasource or
    transformed field that does not exist.
    """
    plan = None
    if xml_data:
        plan = ExtractionPlan(xml_data, is_file=False)

    if plan is None:
        # No XML supplied: create an empty root entity only.
        ent = XmlEntity(
            name=import_handler.name,
            import_handler=import_handler)
        ent.save()
    else:  # Loading import handler from XML file
        # Datasources, keyed by name for entity lookups further down.
        ds_dict = {}
        for datasource in plan.datasources.values():
            ds = XmlDataSource(
                name=datasource.name,
                type=datasource.type,
                import_handler=import_handler,
                params=datasource.get_params())
            ds_dict[datasource.name] = ds
            db.session.add(ds)

        # Input parameters.
        import_handler.import_params = []
        for inp in plan.inputs.values():
            param = XmlInputParameter(
                name=inp.name,
                type=inp.type,
                regex=inp.regex,
                format=inp.format,
                import_handler=import_handler)
            db.session.add(param)
            import_handler.import_params.append(inp.name)

        # Scripts: either a reference to a python file (src) or inline
        # python code (text).
        for scr in plan.scripts:
            script = XmlScript(
                data=scr.src or scr.text,
                type=XmlScript.TYPE_PYTHON_FILE if scr.src else
                XmlScript.TYPE_PYTHON_CODE,
                import_handler=import_handler)
            db.session.add(script)

        def get_datasource(entity):
            # Resolve an entity's datasource by name; None when unknown.
            if entity.datasource_name and \
                    entity.datasource_name in ds_dict:
                return ds_dict[entity.datasource_name]
            return None

        def load_query(entity, db_entity):
            # Attach the entity's query (if any) to its DB counterpart.
            if entity.query:
                qr = XmlQuery(
                    text=entity.query,
                    target=entity.query_target)
                db.session.add(qr)
                db_entity.query_obj = qr
            return None

        # Fields with transform=... can act as datasources for nested
        # entities; those links are resolved after the whole tree is
        # loaded (see the ENTITIES_WITHOUT_DS pass below).
        TRANSFORMED_FIELDS = {}
        ENTITIES_WITHOUT_DS = []

        def load_entity_items(entity, db_entity):
            # Mirror fields, sqoop imports and nested entities of
            # *entity* into *db_entity*, recursively.
            for field in entity.fields.values():
                # Field types expose either 'delimiter' or 'join'.
                if hasattr(field, 'delimiter'):
                    delimiter = field.delimiter
                else:
                    delimiter = field.join
                fld = XmlField(
                    name=field.name,
                    type=field.type,
                    column=field.column,
                    jsonpath=field.jsonpath,
                    delimiter=delimiter,
                    regex=field.regex,
                    split=field.split,
                    dateFormat=field.dateFormat,
                    template=field.template,
                    script=field.script,
                    transform=field.transform,
                    headers=field.headers,
                    required=field.required,
                    multipart=field.multipart,
                    key_path=field.key_path,
                    value_path=field.value_path)
                db_entity.fields.append(fld)
                if field.transform:
                    TRANSFORMED_FIELDS[field.name] = fld
                db.session.add(fld)

            for sqoop in entity.sqoop_imports:
                sqoop_obj = XmlSqoop(
                    target=sqoop.target,
                    table=sqoop.table,
                    where=sqoop.where,
                    direct=sqoop.direct,
                    mappers=sqoop.mappers,
                    options=sqoop.options,
                    text=sqoop.query,
                    datasource=ds_dict.get(sqoop.datasource_name)
                )
                db_entity.sqoop_imports.append(sqoop_obj)
                db.session.add(sqoop_obj)

            sub_entities = entity.nested_entities_field_ds.values() + \
                entity.nested_entities_global_ds
            for sub_entity in sub_entities:
                sub_ent = XmlEntity(
                    name=sub_entity.name,
                    import_handler=import_handler)
                sub_ent.entity = db_entity
                sub_ent.datasource = get_datasource(sub_entity)
                if not sub_ent.datasource:
                    # Datasource name may refer to a transformed field;
                    # resolve after the full tree is built.
                    ENTITIES_WITHOUT_DS.append(
                        [sub_ent, sub_entity.datasource_name])
                db.session.add(sub_ent)
                load_query(sub_entity, db_entity=sub_ent)
                load_entity_items(sub_entity, db_entity=sub_ent)

        # Root entity.
        ent = XmlEntity(
            name=plan.entity.name,
            import_handler=import_handler,
            datasource=get_datasource(plan.entity))
        if not ent.datasource:
            ENTITIES_WITHOUT_DS.append(
                [ent, plan.entity.datasource_name])
        db.session.add(ent)
        load_query(plan.entity, db_entity=ent)
        load_entity_items(plan.entity, db_entity=ent)
        # Second pass: entities without a named datasource must point at
        # a transformed field instead.
        for ent, field_name in ENTITIES_WITHOUT_DS:
            if field_name not in TRANSFORMED_FIELDS:
                raise ValueError(
                    'Transformed field or datasource "{0}" '
                    'not found in the entity "{1}"'.format(
                        field_name, ent.name))
            ent.transformed_field = TRANSFORMED_FIELDS[field_name]

        # Fill predict section
        if plan.predict is not None:
            models_dict = {}
            predict = Predict()
            for model in plan.predict.models:
                predict_model = PredictModel(
                    name=model.name,
                    value=model.value,
                    script=model.script)
                db.session.add(predict_model)

                for weight in model.weights:
                    model_weight = PredictModelWeight(
                        label=weight.label,
                        script=weight.script,
                        value=str(weight.value or ''),
                        predict_model=predict_model)
                    db.session.add(model_weight)

                predict.models.append(predict_model)
                models_dict[model.name] = predict_model

            config_label = plan.predict.result.label
            predict.label = PredictResultLabel(
                script=config_label.script,
                predict_model=models_dict.get(config_label.model, None))

            config_probability = plan.predict.result.probability
            predict.probability = PredictResultProbability(
                script=config_probability.script,
                label=config_probability.label,
                predict_model=models_dict.get(config_probability.model, None))

            db.session.add(predict)
            import_handler.predict = predict
Ejemplo n.º 17
0
 def setUp(self):
     """Load the composite-type import-handler fixture as the test plan."""
     xml_file = os.path.join(BASEDIR, 'extractorxml',
                             'composite-type-import-handler.xml')
     self._plan = ExtractionPlan(xml_file)
Ejemplo n.º 18
0
 def setUp(self):
     """Load the train import-handler fixtures (inline and script-file)."""
     fixtures_dir = os.path.join(BASEDIR, 'extractorxml')
     self._plan = ExtractionPlan(
         os.path.join(fixtures_dir, 'train-import-handler.xml'))
     self._plan_for_script = ExtractionPlan(
         os.path.join(fixtures_dir, 'train-import-handler-script-file.xml'))
Ejemplo n.º 19
0
 def test_get_ds_config(self):
     """All built-in datasource types must appear in the config."""
     conf = ExtractionPlan.get_datasources_config()
     expected = set(['db', 'http', 'pig', 'csv'])
     self.assertEqual(expected, set(conf.keys()))
Ejemplo n.º 20
0
 def _check(name, err):
     # The invalid fixture `name` must fail to load with a message
     # matching the regex `err` (`self` is taken from the enclosing
     # scope).
     path = os.path.join(BASEDIR, 'extractorxml', 'invalid', name)
     with self.assertRaisesRegexp(ImportHandlerException, err):
         ExtractionPlan(path)
Ejemplo n.º 21
0
def _get_ds_types():
    """Return the datasource type names known to ExtractionPlan."""
    from cloudml.importhandler.importhandler import ExtractionPlan
    config = ExtractionPlan.get_datasources_config()
    return config.keys()
Ejemplo n.º 22
0
                    with open(args.input, 'r') as test_fp:
                        trainer.test(
                            streamingiterload(test_fp,
                                              source_format=file_format),
                            test_percent)

            if args.test is not None and args.skip_tests is False:
                file_format = os.path.splitext(args.test)[1][1:]
                with open(args.test, 'r') as test_fp:
                    trainer.test(
                        streamingiterload(test_fp, source_format=file_format))

        elif args.extraction is not None:
            train_context = list_to_dict(args.train_params)
            try:
                plan = ExtractionPlan(args.extraction)
                train_handler = ImportHandler(plan, train_context)
            except ImportHandlerException, e:
                logging.warn('Invalid extraction plan: %s' % e.message)
                print_exception(e)
                return INVALID_EXTRACTION_PLAN

            logging.info('Starting training with params:')
            for key, value in train_context.items():
                logging.info('%s --> %s' % (key, value))

            trainer.train(train_handler, test_percent)

            if args.skip_tests is False:
                if test_percent != 0:
                    if args.test_params is None:
Ejemplo n.º 23
0
 def _get_configuration_action(self, **kwargs):
     """Render the datasource configuration known to ExtractionPlan."""
     from cloudml.importhandler.importhandler import ExtractionPlan
     configuration = ExtractionPlan.get_datasources_config()
     return self._render({'configuration': configuration})
Ejemplo n.º 24
0
 def test_load_valid_generic_plan(self):
     """A valid generic import handler file must load without raising."""
     ExtractionPlan(self.generic_importhandler_file)