def clean(self, value):
    """Validate an uploaded XML import-handler definition.

    Parses *value* as a cloudml extraction plan and, on success, records
    the plan's input-parameter names and the handler type on this object.

    :param value: XML document text (unicode) or None.
    :return: the UTF-8 encoded XML, or None when no value was given.
    :raises ValidationError: when the plan cannot be parsed.
    """
    if value is None:
        return
    # ExtractionPlan works on a byte string, not unicode text.
    value = value.encode('utf-8')
    # Imported lazily — presumably to avoid circular/heavy imports at
    # module load time; TODO confirm.
    from cloudml.importhandler.importhandler import ExtractionPlan
    from api.amazon_utils import amazon_config
    try:
        plan = ExtractionPlan(value, is_file=False)
        plan.amazon_settings = amazon_config()
        self.import_params = plan.inputs.keys()
        self.import_handler_type = 'xml'
    except Exception as exc:
        raise ValidationError(exc.message, exc)
    return value
def main(argv=None): parser = create_parser() args = parser.parse_args(argv) init_logging(args.debug) try: transformer = Transformer(args.path) except (TransformerSchemaException, IOError) as e: logging.warn('Invalid feature model: %s' % e.message) print_exception(e) return INVALID_TRANSFORMER_CONFIG try: if args.input is not None: file_format = os.path.splitext(args.input)[1][1:] with open(args.input, 'r') as train_fp: transformer.train( streamingiterload(train_fp, source_format=file_format)) elif args.extraction is not None: train_context = list_to_dict(args.train_params) try: plan = ExtractionPlan(args.extraction) train_handler = ImportHandler(plan, train_context) except ImportHandlerException, e: logging.warn('Invalid extraction plan: %s' % e.message) print_exception(e) return INVALID_EXTRACTION_PLAN logging.info('Starting training with params:') for key, value in train_context.items(): logging.info('%s --> %s' % (key, value)) transformer.train(train_handler) else:
def main(argv=None): parser = create_parser() args = parser.parse_args(argv) init_logging(args.debug) try: if args.user_params is not None: param_list = [x.split('=', 1) for x in args.user_params] context = dict((key, value) for (key, value) in param_list) else: context = {} logging.info('User-defined parameters:') for key, value in context.items(): logging.info('%s --> %s' % (key, value)) try: plan = ExtractionPlan(args.path) extractor = ImportHandler(plan, context) except ImportHandlerException, e: logging.warn('Invalid extraction plan: {}'.format(e.message)) print_exception(e) return INVALID_EXTRACTION_PLAN if args.output is not None: logging.info('Storing data to %s...' % args.output) getattr(extractor, 'store_data_{}'.format(args.format), extractor.store_data_json)(args.output) logging.info('Total %s lines' % (extractor.count, )) logging.info('Ignored %s lines' % (extractor.ignored, ))
def test_pig_datasource(self, sleep_mock, sqoop_mock):
    """Check that the pig datasource yields the expected first row.

    AWS calls are replayed from recorded placebo responses, the sqoop
    import subprocess is mocked to succeed, and psycopg2 is patched so
    no real database is touched.
    """
    # Amazon mock: replay canned AWS responses instead of real calls.
    self.pill.attach(
        self.session,
        os.path.abspath(
            os.path.join(os.path.dirname(__file__),
                         'placebo_responses/importhandler/pigxml')))
    self.pill.playback()
    self._plan = ExtractionPlan(
        os.path.join(BASEDIR, 'extractorxml',
                     'pig-train-import-handler.xml'))
    # Sqoop import subprocess mock: pretend the import exited cleanly
    # with no output.
    process_mock = Mock()
    attrs = {'wait.return_value': 0,
             'stdout.readlines.return_value': []}
    process_mock.configure_mock(**attrs)
    sqoop_mock.return_value = process_mock
    with patch('psycopg2.extras.DictCursor.execute'):
        with patch('psycopg2.connect'):
            self._extractor = ImportHandler(self._plan, PARAMS)
            # NOTE(review): pig_ds is fetched but not asserted on here.
            pig_ds = self._extractor.plan.datasources['pig']
            # Checking iterator
            row = self._extractor.next()
            self.assertEquals(row['opening_id'], 57)
def get_fields(self):
    """Return the flat list of field names defined by this handler.

    Parses ``self.data`` as an extraction plan and walks the whole
    entity tree, collecting the name of every non-datasource field.

    :return: list of field names; empty list when ``self.data`` is None.
    :raises ImportHandlerError: when the plan cannot be parsed.
    """
    if self.data is None:
        return []

    def get_entity_fields(entity):
        # Recursively collect field names from an entity and all of its
        # nested entities (both field-datasource and global-datasource).
        fields = []
        for name, field in entity.fields.iteritems():
            if not field.is_datasource_field:
                fields.append(field.name)
        for sub_entity in entity.nested_entities_field_ds.values():
            fields += get_entity_fields(sub_entity)
        for sub_entity in entity.nested_entities_global_ds:
            fields += get_entity_fields(sub_entity)
        return fields

    try:
        plan = ExtractionPlan(self.data, is_file=False)
        return get_entity_fields(plan.entity)
    except Exception as exc:
        # Fixed: was the py2-only `except Exception, exc` form — the
        # rest of this file uses the portable `as exc` syntax.
        logging.error(exc)
        raise ImportHandlerError(exc.message, exc)
def clean_data(self, value, field):
    """Check that *value* parses as an extraction-plan XML document.

    Returns the UTF-8 encoded XML (or None when no value was given);
    raises ValidationError when the document is invalid.
    """
    if value is None:
        return
    encoded = value.encode('utf-8')
    try:
        ExtractionPlan(encoded, is_file=False)
    except Exception as exc:
        raise ValidationError(exc.message, exc)
    return encoded
def _get_models_action(self, **kwargs):
    """Collect models deployed to a server together with the XML
    import handlers that reference them.

    For every handler deployed to the requested server the plan is
    parsed and the first model of its predict section is matched
    against the server's model keys.  Handlers whose plan names no
    concrete model (model chosen by script) are reported without
    model details.
    """
    parser_params = (('server', str), )
    params = self._parse_parameters(parser_params)
    server_id = params.get('server')
    server = Server.query.get(server_id)
    results = []
    models = server.list_keys(folder=FOLDER_MODELS)
    models_map = {item.get('name'): item for item in models}
    import_handlers = server.list_keys(folder=FOLDER_IMPORT_HANDLERS)
    # Only keys carrying an object id can be matched back to DB rows.
    handler_map = {
        int(item['object_id']): item
        for item in import_handlers if item['object_id']}
    ids = handler_map.keys()
    import_handlers_obj = XmlImportHandler.query.filter(
        XmlImportHandler.id.in_(ids)).all()
    from cloudml.importhandler.importhandler import ExtractionPlan
    for h in import_handlers_obj:
        try:
            plan = ExtractionPlan(h.get_plan_config(), is_file=False)
        except Exception as exc:
            # Fixed: was py2-only `except Exception, exc`; also include
            # the parse error itself, which was caught but never logged.
            logging.error('Corrupted import handler: {0} ({1})'.format(
                h.id, exc))
            continue
        if plan.predict and plan.predict.models:
            # Only the first predict model is considered.
            pmodel = plan.predict.models[0]
            handler_key = handler_map[h.id]
            if pmodel.value:
                model_key = models_map.get(pmodel.value)
                if model_key:
                    model_obj = Model.query.get(model_key.get('object_id'))
                    results.append({
                        'model_name': pmodel.value,
                        'model_metadata': model_key,
                        'model': model_obj,
                        'import_handler_name': handler_key.get('name'),
                        'import_handler': h,
                        'import_handler_metadata': handler_key})
            else:
                # model is defined in the script
                results.append({
                    'import_handler_name': handler_key.get('name'),
                    'import_handler': h,
                    'import_handler_metadata': handler_key})
def test_load_plan_with_schema_error(self):
    """Plans with schema problems must raise ImportHandlerException."""
    invalid_dir = os.path.join(BASEDIR, 'extractorxml', 'invalid')
    cases = [
        ("no-entity.xml",
         "There is an error in the import handler's "
         "XML, line 15.\W+\w+"),
        ("datasource_name.xml",
         "There are few datasources with name odw"),
    ]
    for fname, pattern in cases:
        with self.assertRaisesRegexp(ImportHandlerException, pattern):
            ExtractionPlan(os.path.join(invalid_dir, fname))
    # An empty (None) document must be rejected as well.
    with self.assertRaisesRegexp(ImportHandlerException,
                                 "import handler file is empty"):
        ExtractionPlan(None, is_file=False)
def main(argv=None): parser = create_parser() args = parser.parse_args(argv) init_logging(args.debug) try: with open(args.path, 'r') as fp: trainer = load_trainer(fp) except (IOError, InvalidTrainerFile) as exc: logging.warn('Invalid trainer file: {0!s}'.format(exc)) print_exception(exc) return INVALID_TRAINER try: iterator = None if args.input is not None: # Read evaluation data from file. eval_fp = open(args.input, 'r') file_format = determine_data_format(args.input) iterator = streamingiterload(eval_fp, source_format=file_format) elif args.extraction is not None: # Use import handler try: eval_context = list_to_dict(args.eval_params) plan = ExtractionPlan(args.extraction) eval_handler = ImportHandler(plan, eval_context) except ImportHandlerException, e: logging.warn('Invalid extraction plan: %s' % e.message) print_exception(e) return INVALID_EXTRACTION_PLAN logging.info('Starting training with params:') for key, value in eval_context.items(): logging.info('%s --> %s' % (key, value)) iterator = eval_handler else:
class XmlDataSourceForm(ParametersConvertorMixin, BaseForm):
    """Form for creating/editing an XML import handler datasource."""

    XML_PARAMETERS = True
    PARAMETERS_CONFIGURATION = ExtractionPlan.get_datasources_config()
    required_fields = ('name', 'type', 'import_handler_id')
    NO_REQUIRED_FOR_EDIT = True
    name = CharField()
    type_field = ChoiceField(choices=_get_ds_types(), name='type')
    params = JsonField()
    import_handler_id = DocumentField(doc=XmlImportHandler, by_name=False,
                                      return_doc=False)

    def clean_name(self, value, field):
        """Require a name on create and keep it unique per handler."""
        if not ((self.NO_REQUIRED_FOR_EDIT and self.obj.id) or value):
            raise ValidationError('name is required field')
        import_handler_id = self.obj.import_handler_id if \
            self.obj.id else self.data['import_handler_id']
        query = XmlDataSource.query.filter_by(
            name=value, import_handler_id=import_handler_id)
        if self.obj.id:
            # Editing: the datasource may keep its own name.
            query = query.filter(XmlDataSource.id != self.obj.id)
        if query.count():
            # Fixed: the message was a backslash-continued string
            # literal that embedded a long run of spaces mid-sentence
            # ("already \    exist"); also "exist" -> "exists".
            raise ValidationError(
                'Data Source with name "%s" already exists. '
                'Please choose another one.' % value)
        return value

    def validate_data(self):
        """Normalize the params JSON for the selected datasource type."""
        type_ = self.cleaned_data.get('type')
        self.convert_params(type_, self.cleaned_data.get('params'),
                            configuration=self.PARAMETERS_CONFIGURATION)
def setUp(self):
    """Parse the HTTP train import-handler fixture."""
    plan_path = os.path.join(
        BASEDIR, 'extractorxml', 'http-train-import-handler.xml')
    self._plan = ExtractionPlan(plan_path)
def get_iterator(self, params, callback=None):
    """Build a CoreImportHandler that iterates this handler's data.

    :param params: import parameter values for the extraction plan.
    :param callback: optional progress callback forwarded to the handler.
    """
    plan_config = self.get_plan_config()
    plan = ExtractionPlan(plan_config, is_file=False)
    return CoreImportHandler(plan, params, callback=callback)
def setUp(self):
    """Parse the input-datasource import-handler fixture."""
    from cloudml.importhandler.importhandler import ExtractionPlan
    handler_file = os.path.join(
        BASEDIR, 'extractorxml', 'input-datasource-handler.xml')
    self._plan = ExtractionPlan(handler_file)
def test_load_valid_plan(self):
    """A well-formed plan file must parse without raising."""
    ExtractionPlan(self.importhandler_file)
def test_load_plan_with_syntax_error(self):
    """Malformed XML text must raise ImportHandlerException."""
    with open(self.importhandler_file, 'r') as handler_fp:
        xml_text = handler_fp.read()
    # Prepend a stray quote so the document is no longer well-formed.
    broken = '"' + xml_text
    with self.assertRaises(ImportHandlerException):
        ExtractionPlan(broken, is_file=False)
def fill_import_handler(import_handler, xml_data=None):
    """Populate *import_handler*'s DB objects from extraction-plan XML.

    When *xml_data* is empty, only a bare root XmlEntity named after the
    handler is created.  Otherwise the XML is parsed and its
    datasources, input parameters, scripts, entity tree (fields, sqoop
    imports, queries) and predict section are mirrored into the
    corresponding Xml* database models.

    :raises ValueError: when an entity has neither a known datasource
        nor a transformed field to attach to.
    """
    plan = None
    if xml_data:
        plan = ExtractionPlan(xml_data, is_file=False)
    if plan is None:
        # No XML supplied: start with an empty root entity.
        ent = XmlEntity(
            name=import_handler.name,
            import_handler=import_handler)
        ent.save()
    else:
        # Loading import handler from XML file
        ds_dict = {}
        for datasource in plan.datasources.values():
            # if datasource.name == 'input':
            #     continue
            # NOTE(review): POSSIBLE_PARAMS is never used below
            # (and 'vender' looks like a typo for 'vendor').
            POSSIBLE_PARAMS = ['host', 'dbname', 'port',
                               'user', 'password', 'vender']
            ds = XmlDataSource(
                name=datasource.name,
                type=datasource.type,
                import_handler=import_handler,
                params=datasource.get_params())
            ds_dict[datasource.name] = ds
            db.session.add(ds)
        import_handler.import_params = []
        for inp in plan.inputs.values():
            param = XmlInputParameter(
                name=inp.name,
                type=inp.type,
                regex=inp.regex,
                format=inp.format,
                import_handler=import_handler)
            db.session.add(param)
            import_handler.import_params.append(inp.name)
        for scr in plan.scripts:
            # A script is stored either as a file reference (src) or as
            # inline code (text).
            script = XmlScript(
                data=scr.src or scr.text,
                type=XmlScript.TYPE_PYTHON_FILE if scr.src
                else XmlScript.TYPE_PYTHON_CODE,
                import_handler=import_handler)
            db.session.add(script)

        def get_datasource(entity):
            # Resolve the entity's datasource by name, if declared.
            if entity.datasource_name and \
                    entity.datasource_name in ds_dict:
                return ds_dict[entity.datasource_name]
            return None

        def load_query(entity, db_entity):
            # Attach the entity's query (if any) as an XmlQuery row.
            if entity.query:
                qr = XmlQuery(
                    text=entity.query,
                    target=entity.query_target)
                db.session.add(qr)
                db_entity.query_obj = qr
            return None

        TRANSFORMED_FIELDS = {}
        ENTITIES_WITHOUT_DS = []

        def load_entity_items(entity, db_entity):
            # Mirror fields, sqoop imports and nested entities of
            # *entity* onto *db_entity*, recursing into sub-entities.
            for field in entity.fields.values():
                # Some field types expose 'delimiter', others 'join'.
                if hasattr(field, 'delimiter'):
                    delimiter = field.delimiter
                else:
                    delimiter = field.join
                fld = XmlField(
                    name=field.name,
                    type=field.type,
                    column=field.column,
                    jsonpath=field.jsonpath,
                    delimiter=delimiter,
                    regex=field.regex,
                    split=field.split,
                    dateFormat=field.dateFormat,
                    template=field.template,
                    script=field.script,
                    transform=field.transform,
                    headers=field.headers,
                    required=field.required,
                    multipart=field.multipart,
                    key_path=field.key_path,
                    value_path=field.value_path)
                db_entity.fields.append(fld)
                if field.transform:
                    # Remembered so entities without a datasource can be
                    # linked to a transformed field afterwards.
                    TRANSFORMED_FIELDS[field.name] = fld
                db.session.add(fld)
            for sqoop in entity.sqoop_imports:
                sqoop_obj = XmlSqoop(
                    target=sqoop.target,
                    table=sqoop.table,
                    where=sqoop.where,
                    direct=sqoop.direct,
                    mappers=sqoop.mappers,
                    options=sqoop.options,
                    text=sqoop.query,
                    datasource=ds_dict.get(sqoop.datasource_name)
                )
                db_entity.sqoop_imports.append(sqoop_obj)
                db.session.add(sqoop_obj)
            sub_entities = entity.nested_entities_field_ds.values() + \
                entity.nested_entities_global_ds
            for sub_entity in sub_entities:
                sub_ent = XmlEntity(
                    name=sub_entity.name,
                    import_handler=import_handler)
                sub_ent.entity = db_entity
                sub_ent.datasource = get_datasource(sub_entity)
                if not sub_ent.datasource:
                    # Deferred: resolved via transformed fields below.
                    ENTITIES_WITHOUT_DS.append(
                        [sub_ent, sub_entity.datasource_name])
                db.session.add(sub_ent)
                load_query(sub_entity, db_entity=sub_ent)
                load_entity_items(sub_entity, db_entity=sub_ent)

        ent = XmlEntity(
            name=plan.entity.name,
            import_handler=import_handler,
            datasource=get_datasource(plan.entity))
        if not ent.datasource:
            ENTITIES_WITHOUT_DS.append(
                [ent, plan.entity.datasource_name])
        db.session.add(ent)
        load_query(plan.entity, db_entity=ent)
        load_entity_items(plan.entity, db_entity=ent)
        # Entities whose "datasource" name actually refers to a
        # transformed field are wired up here.
        for ent, field_name in ENTITIES_WITHOUT_DS:
            if field_name not in TRANSFORMED_FIELDS:
                raise ValueError(
                    'Transformed field or datasource "{0}" '
                    'not found in the entity "{1}"'.format(
                        field_name, ent.name))
            ent.transformed_field = TRANSFORMED_FIELDS[field_name]

        # Fill predict section
        if plan.predict is not None:
            models_dict = {}
            predict = Predict()
            for model in plan.predict.models:
                predict_model = PredictModel(
                    name=model.name,
                    value=model.value,
                    script=model.script)
                db.session.add(predict_model)
                for weight in model.weights:
                    model_weight = PredictModelWeight(
                        label=weight.label,
                        script=weight.script,
                        value=str(weight.value or ''),
                        predict_model=predict_model)
                    db.session.add(model_weight)
                predict.models.append(predict_model)
                models_dict[model.name] = predict_model
            config_label = plan.predict.result.label
            predict.label = PredictResultLabel(
                script=config_label.script,
                predict_model=models_dict.get(config_label.model, None))
            config_probability = plan.predict.result.probability
            predict.probability = PredictResultProbability(
                script=config_probability.script,
                label=config_probability.label,
                predict_model=models_dict.get(config_probability.model, None))
            db.session.add(predict)
            import_handler.predict = predict
def setUp(self):
    """Parse the composite-type import-handler fixture."""
    fixture = os.path.join(
        BASEDIR, 'extractorxml', 'composite-type-import-handler.xml')
    self._plan = ExtractionPlan(fixture)
def setUp(self):
    """Parse the two training import-handler fixtures."""
    fixtures_dir = os.path.join(BASEDIR, 'extractorxml')
    self._plan = ExtractionPlan(
        os.path.join(fixtures_dir, 'train-import-handler.xml'))
    self._plan_for_script = ExtractionPlan(
        os.path.join(fixtures_dir,
                     'train-import-handler-script-file.xml'))
def test_get_ds_config(self):
    """The datasource config must cover exactly the known types."""
    config = ExtractionPlan.get_datasources_config()
    expected = set(['db', 'http', 'pig', 'csv'])
    self.assertEqual(expected, set(config.keys()))
def _check(name, err):
    """Assert parsing the named invalid fixture raises matching *err*."""
    path = os.path.join(BASEDIR, 'extractorxml', 'invalid', name)
    with self.assertRaisesRegexp(ImportHandlerException, err):
        ExtractionPlan(path)
def _get_ds_types():
    """Return the datasource type names known to cloudml."""
    from cloudml.importhandler.importhandler import ExtractionPlan
    config = ExtractionPlan.get_datasources_config()
    return config.keys()
with open(args.input, 'r') as test_fp: trainer.test( streamingiterload(test_fp, source_format=file_format), test_percent) if args.test is not None and args.skip_tests is False: file_format = os.path.splitext(args.test)[1][1:] with open(args.test, 'r') as test_fp: trainer.test( streamingiterload(test_fp, source_format=file_format)) elif args.extraction is not None: train_context = list_to_dict(args.train_params) try: plan = ExtractionPlan(args.extraction) train_handler = ImportHandler(plan, train_context) except ImportHandlerException, e: logging.warn('Invalid extraction plan: %s' % e.message) print_exception(e) return INVALID_EXTRACTION_PLAN logging.info('Starting training with params:') for key, value in train_context.items(): logging.info('%s --> %s' % (key, value)) trainer.train(train_handler, test_percent) if args.skip_tests is False: if test_percent != 0: if args.test_params is None:
def _get_configuration_action(self, **kwargs):
    """Render the datasource configuration known to cloudml."""
    from cloudml.importhandler.importhandler import ExtractionPlan
    configuration = ExtractionPlan.get_datasources_config()
    return self._render({'configuration': configuration})
def test_load_valid_generic_plan(self):
    """A well-formed generic plan file must parse without raising."""
    ExtractionPlan(self.generic_importhandler_file)