class Weight(db.Model, BaseMixin):
    """ Represents Model Parameter Weight """
    name = db.Column(db.String(200))
    short_name = db.Column(db.String(200))
    model_name = db.Column(db.String(200))
    value = db.Column(db.Float)
    value2 = db.Column(db.Float)
    is_positive = db.Column(db.Boolean)
    css_class = db.Column(db.String)
    class_label = db.Column(db.String(100), nullable=True)

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('weights'))

    segment_id = db.Column(db.Integer, db.ForeignKey('segment.id'))
    segment = relationship(Segment, backref=backref('weights'))

    parent = db.Column(db.String(200))
    test_weights = db.Column(JSONType)

    @hybrid_method
    def test_weight(self, test_id):
        return TestWeightColumn(test_id)
class WeightsCategory(db.Model, BaseMixin):
    """
    Represents Model Parameter Weights Category.

    NOTE: used for constructing trees of weights.
    """
    __tablename__ = 'weights_category'

    name = db.Column(db.String(200))
    short_name = db.Column(db.String(200))
    # TODO: remove it
    model_name = db.Column(db.String(200))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('weight_categories'))

    segment_id = db.Column(db.Integer, db.ForeignKey('segment.id'))
    segment = relationship(Segment, backref=backref('weight_categories'))

    normalized_weight = db.Column(db.Float)
    class_label = db.Column(db.String(100), nullable=True)
    parent = db.Column(db.String(200))
    # TODO: Maybe have FK Weight to WeightsCategory?

    # @aggregated('normalized_weight', sa.Column(sa.Float))
    # def normalized_weight(self):
    #     return sa.func.sum(Weight.value2)

    def __repr__(self):
        return '<Category {0}>'.format(self.name)
class VerificationExample(BaseMixin, db.Model):
    verification_id = db.Column(
        db.Integer, db.ForeignKey('server_model_verification.id'))
    verification = relationship(
        'ServerModelVerification',
        backref=backref('verification_examples', cascade='all,delete'))

    example_id = db.Column(db.Integer, db.ForeignKey('test_example.id'))
    example = relationship(
        'TestExample',
        backref=backref('verification_examples', cascade='all,delete'))

    result = db.Column(JSONType)
class Predict(db.Model, BaseMixin):
    models = relationship(
        'PredictModel',
        secondary=lambda: predict_models_table,
        backref='predict_section')

    # Results
    label_id = db.Column(db.ForeignKey('predict_result_label.id'))
    label = relationship('PredictResultLabel',
                         foreign_keys=[label_id],
                         cascade='all,delete',
                         backref='results')

    probability_id = db.Column(db.ForeignKey('predict_result_probability.id'))
    probability = relationship(
        'PredictResultProbability',
        foreign_keys=[probability_id],
        cascade='all,delete',
        backref='probabilities')
class Segment(db.Model, BaseMixin):
    __tablename__ = 'segment'

    name = db.Column(db.String(200))
    records = db.Column(db.Integer)

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('segments'))
class ClassifierGridParams(db.Model, BaseModel):
    STATUS_LIST = ('New', 'Queued', 'Calculating', 'Completed', 'Error')

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('classifier_grid_params'))
    scoring = db.Column(db.String(100), default='accuracy')
    status = db.Column(
        db.Enum(*STATUS_LIST, name='classifier_grid_params_statuses'),
        nullable=False, default='New')
    train_data_set_id = db.Column(
        db.Integer, db.ForeignKey('data_set.id', ondelete='SET NULL'))
    train_dataset = relationship('DataSet', foreign_keys=[train_data_set_id])
    test_data_set_id = db.Column(
        db.Integer, db.ForeignKey('data_set.id', ondelete='SET NULL'))
    test_dataset = relationship('DataSet', foreign_keys=[test_data_set_id])
    parameters = db.Column(JSONType)
    parameters_grid = db.Column(JSONType)
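# Illustrative sketch only (not part of the original module): how a grid
# search request might be created for a model. `model`, `train_ds` and
# `test_ds` are assumed to be existing Model / DataSet instances; the
# scoring metric and parameter grid below are made-up values.
def _example_create_grid_params(model, train_ds, test_ds):
    grid = ClassifierGridParams(
        model=model,
        scoring='roc_auc',
        train_dataset=train_ds,
        test_dataset=test_ds,
        parameters={'penalty': 'l2'},
        parameters_grid={'C': [0.1, 1.0, 10.0]})
    grid.save()
    return grid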
class ServerModelVerification(BaseModel, db.Model, RefXmlImportHandlerMixin):
    """ Represents a verification of a model deployed to the server. """
    STATUS_NEW = 'New'
    STATUS_QUEUED = 'Queued'
    STATUS_IN_PROGRESS = 'In Progress'
    STATUS_ERROR = 'Error'
    STATUS_DONE = 'Done'
    STATUSES = [
        STATUS_NEW, STATUS_QUEUED, STATUS_IN_PROGRESS,
        STATUS_ERROR, STATUS_DONE
    ]

    status = db.Column(
        db.Enum(*STATUSES, name='model_verification_statuses'),
        nullable=False, default=STATUS_NEW)
    error = db.Column(db.Text)

    server_id = db.Column(db.Integer, db.ForeignKey('server.id'))
    server = relationship(
        Server,
        backref=backref('model_verifications', cascade='all,delete'))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(
        Model,
        backref=backref('model_verifications', cascade='all,delete'))

    test_result_id = db.Column(db.Integer, db.ForeignKey('test_result.id'))
    test_result = relationship(
        'TestResult',
        backref=backref('model_verifications', cascade='all,delete'))

    description = db.Column(JSONType)
    result = db.Column(JSONType)
    params_map = db.Column(JSONType)
    clazz = db.Column(db.String(200))

    def __repr__(self):
        return '<ServerModelVerification {0}>'.format(self.model.name)
class XmlEntity(db.Model, BaseMixin, RefXmlImportHandlerMixin):
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(200), nullable=False)
    autoload_fields = db.Column(db.Boolean, default=False)

    # JSON or CSV field as datasource
    transformed_field_id = db.Column(db.ForeignKey(
        'xml_field.id', use_alter=True,
        name="fk_transformed_field", ondelete='SET NULL'))
    transformed_field = relationship(
        'XmlField', post_update=True,
        foreign_keys=[transformed_field_id],
        backref='entities_for_field_ds')

    # Sub entity
    entity_id = db.Column(db.ForeignKey('xml_entity.id'))
    entity = relationship(
        'XmlEntity', remote_side=[id],
        backref=backref('entities', cascade='all,delete'))

    # Global datasource
    datasource_id = db.Column(
        db.ForeignKey('xml_data_source.id', ondelete='CASCADE'))
    datasource = relationship('XmlDataSource', foreign_keys=[datasource_id])

    query_id = db.Column(db.ForeignKey('xml_query.id'))
    query_obj = relationship(
        'XmlQuery', foreign_keys=[query_id],
        cascade='all,delete', backref='parent_entity')

    def __repr__(self):
        return "<Entity %s>" % self.name

    def to_dict(self):
        ent = {'name': self.name}
        if self.transformed_field:
            ent['datasource'] = self.transformed_field.name
        if self.datasource:
            ent['datasource'] = self.datasource.name
        if self.autoload_fields:
            ent['autoload_fields'] = str(self.autoload_fields).lower()
        return ent
class XmlSqoop(db.Model, BaseMixin):
    target = db.Column(db.String(200), nullable=False)
    table = db.Column(db.String(200), nullable=False)
    where = db.Column(db.String(200), nullable=True)
    direct = db.Column(db.String(200), nullable=True)
    mappers = db.Column(db.String(200), nullable=True)
    options = db.Column(db.String(200), nullable=True)
    text = db.Column(db.Text, nullable=True)

    FIELDS_TO_SERIALIZE = ['target', 'table', 'where', 'direct',
                           'mappers', 'options']

    # Global datasource
    datasource_id = db.Column(
        db.ForeignKey('xml_data_source.id', ondelete='SET NULL'))
    datasource = relationship('XmlDataSource', foreign_keys=[datasource_id])

    entity_id = db.Column(db.ForeignKey('xml_entity.id'))
    entity = relationship(
        'XmlEntity', foreign_keys=[entity_id],
        backref=backref('sqoop_imports', cascade='all,delete',
                        order_by='XmlSqoop.id'))

    @property
    def pig_fields(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self, 'api.import_handlers.tasks.load_pig_fields')

    def to_dict(self):
        sqoop = super(XmlSqoop, self).to_dict()
        if self.datasource:
            sqoop['datasource'] = self.datasource.name
        return sqoop
class XmlField(db.Model, BaseMixin):
    TYPES = PROCESS_STRATEGIES.keys()
    TRANSFORM_TYPES = ['json', 'csv']
    FIELDS_TO_SERIALIZE = ['name', 'type', 'column', 'jsonpath', 'delimiter',
                           'regex', 'split', 'dateFormat', 'template',
                           'transform', 'headers', 'script', 'required',
                           'multipart', 'key_path', 'value_path']

    def to_dict(self):
        fieldDict = super(XmlField, self).to_dict()
        if 'multipart' in fieldDict and fieldDict['multipart'] == 'false':
            fieldDict.pop('multipart')
        if 'required' in fieldDict and fieldDict['required'] == 'false':
            fieldDict.pop('required')
        return fieldDict

    name = db.Column(db.String(200), nullable=False)
    type = db.Column(db.Enum(*TYPES, name='xml_field_types'))
    column = db.Column(db.String(200))
    jsonpath = db.Column(db.String(200))
    delimiter = db.Column(db.String(200))
    regex = db.Column(db.String(200))
    split = db.Column(db.String(200))
    dateFormat = db.Column(db.String(200))
    template = db.Column(db.String(200))
    transform = db.Column(
        db.Enum(*TRANSFORM_TYPES, name='xml_transform_types'))
    headers = db.Column(db.String(200))
    script = db.Column(db.Text)
    required = db.Column(db.Boolean, default=False)
    multipart = db.Column(db.Boolean, default=False)
    key_path = db.Column(db.String(200))
    value_path = db.Column(db.String(200))

    entity_id = db.Column(db.ForeignKey('xml_entity.id'))
    entity = relationship(
        'XmlEntity', foreign_keys=[entity_id],
        backref=backref('fields', cascade='all,delete',
                        order_by='XmlField.id'))
def import_handler_id(cls):
    return db.Column('import_handler_id',
                     db.ForeignKey('xml_import_handler.id'))
class XmlImportHandler(db.Model, ImportHandlerMixin, BaseDeployedEntity):
    TYPE = 'xml'
    DATASOURCES_ORDER = ['db', 'csv', 'http', 'pig', 'input']

    predict_id = db.Column(db.ForeignKey('predict.id', ondelete='CASCADE'))
    predict = relationship(
        'Predict', foreign_keys=[predict_id], backref="import_handler")

    locked = db.Column(db.Boolean, default=False)

    @property
    def data(self):
        return self.get_plan_config()

    @property
    def crc32(self):
        import zlib
        return '0x%08X' % (zlib.crc32(self.data) & 0xffffffff)

    @data.setter
    def data(self, val):
        has_root_ent = XmlEntity.query.filter_by(
            import_handler=self, entity=None).count()
        if has_root_ent:
            raise ValueError("Import Handler isn't empty")
        fill_import_handler(self, val)

    def _get_in_order(self, items, field, order):
        from collections import OrderedDict
        data = OrderedDict([(key, []) for key in order])
        for item in items:
            data[getattr(item, field)].append(item)
        for key in data:
            for item in data[key]:
                yield item

    def get_plan_config(self, pretty_print=True, secure=True):
        plan = etree.Element("plan")

        inputs = etree.SubElement(plan, "inputs")
        for param in self.xml_input_parameters:
            etree.SubElement(inputs, "param", **param.to_dict())

        for scr in self.xml_scripts:
            if scr.data and scr.data.strip():  # script isn't empty
                if scr.type == XmlScript.TYPE_PYTHON_FILE:
                    scr_tag = etree.SubElement(plan, 'script', src=scr.data)
                if scr.type == XmlScript.TYPE_PYTHON_CODE:
                    scr_tag = etree.SubElement(plan, 'script')
                    scr_tag.text = etree.CDATA(scr.data)

        datasources = etree.SubElement(plan, "datasources")
        for ds in self._get_in_order(self.xml_data_sources,
                                     'type', self.DATASOURCES_ORDER):
            if ds.name != "input":
                extra = ds.params if secure else {}
                etree.SubElement(
                    datasources, ds.type, name=ds.name, **extra)

        import_ = etree.SubElement(plan, "import")
        tree = get_entity_tree(self)

        def build_tree(entity, parent):
            ent = etree.SubElement(parent, "entity", **entity.to_dict())

            for sqoop in entity.sqoop_imports:
                sqoop_el = etree.SubElement(ent, "sqoop", **sqoop.to_dict())
                if sqoop.text:
                    sqoop_el.text = etree.CDATA(sqoop.text)

            if entity.query_obj:
                query = etree.SubElement(
                    ent, "query", **entity.query_obj.to_dict())
                query.text = etree.CDATA(entity.query_obj.text or '')

            for field in entity.fields:
                field_dict = field.to_dict()
                script = field_dict.get('script')
                script_text = None
                if script and (len(script.splitlines()) > 1 or
                               len(script) > 50):
                    del field_dict['script']
                    script_text = script
                field_el = etree.SubElement(ent, "field", **field_dict)
                if script_text:
                    script_tag = etree.SubElement(field_el, "script")
                    script_tag.text = etree.CDATA(script_text)

            for subentity in entity.entities:
                build_tree(subentity, parent=ent)

        build_tree(tree, import_)

        if self.predict is not None:
            predict = etree.SubElement(plan, "predict")
            for model in self.predict.models:
                predict_model = etree.SubElement(
                    predict, "model", **model.to_dict())
                for weight in model.predict_model_weights:
                    etree.SubElement(
                        predict_model, "weight", **weight.to_dict())

            if self.predict.label or self.predict.probability:
                result = etree.SubElement(predict, "result")
                etree.SubElement(
                    result, "label", **self.predict.label.to_dict())
                etree.SubElement(
                    result, "probability",
                    **self.predict.probability.to_dict())

        return etree.tostring(plan, pretty_print=pretty_print)

    def get_iterator(self, params, callback=None):
        plan = ExtractionPlan(self.get_plan_config(), is_file=False)
        return CoreImportHandler(plan, params, callback=callback)

    def get_fields(self):
        """ Returns the list of field names. """
        if self.data is None:
            return []

        def get_entity_fields(entity):
            fields = []
            for name, field in entity.fields.iteritems():
                if not field.is_datasource_field:
                    fields.append(field.name)
            for sub_entity in entity.nested_entities_field_ds.values():
                fields += get_entity_fields(sub_entity)
            for sub_entity in entity.nested_entities_global_ds:
                fields += get_entity_fields(sub_entity)
            return fields

        # TODO: revisit this try/except after checking it against real
        # import handlers
        try:
            plan = ExtractionPlan(self.data, is_file=False)
            return get_entity_fields(plan.entity)
        except Exception, exc:
            logging.error(exc)
            raise ImportHandlerError(exc.message, exc)
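# Illustrative sketch only (not part of the original module): rendering a
# handler's extraction plan to XML and pulling rows through it. The input
# parameter names ('start', 'end') are hypothetical and depend on the
# <param> elements defined for the concrete handler; the returned core
# handler is assumed to be iterable row by row.
def _example_run_import(handler):
    xml_plan = handler.get_plan_config(pretty_print=True)
    logging.debug('plan:\n%s', xml_plan)
    iterator = handler.get_iterator({'start': '2014-01-01',
                                     'end': '2014-02-01'})
    for row in iterator:
        logging.debug('imported row: %s', row)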
class PredictModel(db.Model, BaseMixin):
    FIELDS_TO_SERIALIZE = ('name', 'value', 'script')

    name = db.Column(db.String(200), nullable=False, name='name')
    value = db.Column(db.String(200), name='value')
    script = db.Column(db.Text, name='script')


predict_models_table = db.Table(
    'predict_models_table', db.Model.metadata,
    db.Column('predict_model_id', db.Integer,
              db.ForeignKey('predict_model.id',
                            ondelete='CASCADE', onupdate='CASCADE')),
    db.Column('predict_id', db.Integer,
              db.ForeignKey('predict.id',
                            ondelete='CASCADE', onupdate='CASCADE'))
)


class RefPredictModelMixin(BaseMixin):
    @declared_attr
    def predict_model_id(cls):
        return db.Column(
            'predict_model_id', db.ForeignKey('predict_model.id'))

    @declared_attr
    def predict_model(cls):
        from api.base.utils import convert_name, pluralize
        backref_name = pluralize(convert_name(cls.__name__))
        self.features_dict['schema-name'] = self.schema_name
        BaseModel.save(self, commit=commit)

    def delete(self):
        features = Feature.query.filter(
            Feature.feature_set_id == self.id).all()
        for feature in features:
            feature.delete()
        super(FeatureSet, self).delete()


group_by_table = db.Table(
    'group_by_table', db.Model.metadata,
    db.Column(
        'feature_set_id', db.Integer,
        db.ForeignKey('feature_set.id',
                      ondelete='CASCADE', onupdate='CASCADE')),
    db.Column(
        'feature_id', db.Integer,
        db.ForeignKey('feature.id',
                      ondelete='CASCADE', onupdate='CASCADE')))


@event.listens_for(Feature, "after_insert")
def after_insert_feature(mapper, connection, target):
    if target.feature_set is None and target.feature_set_id is not None:
        from sqlalchemy.orm import joinedload
        target = target.__class__.query.options(
            joinedload('feature_set')).get(target.id)
    if target.feature_set is not None:
        update_feature_set_on_change_features(
            connection, target.feature_set, target)
class Model(db.Model, BaseModel, BaseTrainedEntity, BaseDeployedEntity):
    """ Represents Model details. """
    LOG_TYPE = LogMessage.TRAIN_MODEL

    comparable = db.Column(db.Boolean, default=False)
    weights_synchronized = db.Column(db.Boolean, default=False)
    labels = db.Column(postgresql.ARRAY(db.String), default=[])
    example_label = db.Column(db.String(100))
    example_id = db.Column(db.String(100))
    train_records_count = db.Column(db.Integer)
    tags = relationship('Tag', secondary=lambda: tags_table,
                        backref='models')
    target_variable = db.Column(db.Unicode)
    feature_count = db.Column(db.Integer, default=0)

    features_set_id = db.Column(db.Integer, db.ForeignKey('feature_set.id'))
    features_set = relationship('FeatureSet', uselist=False, backref='model')

    test_import_handler_id = db.Column(db.Integer, nullable=True)
    test_import_handler_type = db.Column(db.String(200), default='json')

    datasets = relationship('DataSet', secondary=lambda: data_sets_table)
    classifier = deferred(db.Column(JSONType))
    # Note: may contain different keys depending on the classifier used
    visualization_data = deferred(db.Column(JSONType))
    locked = db.Column(db.Boolean, default=False)
    model_parts_size = deferred(db.Column(JSONType))

    def __init__(self, *args, **kwargs):
        super(Model, self).__init__(*args, **kwargs)
        self.visualization_data = {}

    def visualize_model(self, data=None, status=None, commit=True,
                        segment=None):
        """
        Saves visualization data to the db.

        Note: visualization_data is a dict like:
        {
            segment_name1: {parameters: {status: new, ...}, ...},
            segment_name2: {parameters: {status: new, ...}, ...},
            ...
        }
        """
        def set_status(item, status):
            if 'parameters' not in item:
                item['parameters'] = {}
            item['parameters']['status'] = status

        from copy import deepcopy
        visualization_data = deepcopy(self.visualization_data or {})
        if segment is None:
            if data:
                visualization_data = data
            if status:
                set_status(visualization_data, status)
        else:
            if data is None:
                raise ValueError(
                    "data is required when segment is specified")
            else:
                # updating the visualization data of a specific segment
                visualization_data[segment] = data or {}
                if status:
                    set_status(visualization_data[segment], status)
        self.visualization_data = visualization_data
        if commit:
            self.save()

    @property
    def test_import_handler(self):
        try:
            return getattr(
                self,
                "rel_test_import_handler_%s" % self.test_import_handler_type)
        except AttributeError:
            return None

    @test_import_handler.setter
    def test_import_handler(self, handler):
        if handler is not None:
            self.test_import_handler_id = handler.id
            self.test_import_handler_type = handler.TYPE

    def create_segments(self, segments):
        """
        Creates Segment models from the segments dict.

        segments: dict
            Dictionary where keys are segment names and values are the
            count of records in each segment.
""" for name, records in segments.iteritems(): segment = Segment() segment.name = name segment.records = records segment.model = self segment.save() def __repr__(self): return "<Model {0}>".format(self.name) def save(self, commit=True): if self.features_set is None: from api.features.models import FeatureSet self.features_set = FeatureSet() db.session.add(self.features_set) if self.classifier is None: self.classifier = {} super(Model, self).save(commit) def delete(self): # delete features and feature set as they are used by this model only self.features_set.delete() # prepare dataset list to unlock and tags to decrease ds_to_unlock = self.datasets tags = self.tags super(Model, self).delete() # unlock datasets after model deletion for ds in ds_to_unlock: ds.unlock() # decrease corresponding tags counters for tag in tags: tag.update_counter() @property def dataset(self): return self.datasets[0] if len(self.datasets) else None @property def data_fields(self): ds = self.dataset return ds.data_fields if ds else [] @property def test_handler_fields(self): handler = self.test_import_handler if handler: try: return handler.get_fields() except: pass return [] def run_test(self, dataset, callback=None): trainer = self.get_trainer() fp = dataset.get_data_stream() try: metrics = trainer.test(dataset.get_iterator(fp), callback=callback, save_raw=True) finally: fp.close() raw_data = trainer._raw_data trainer.clear_temp_data() self.set_trainer(trainer) self.save() return metrics, raw_data def transform_dataset(self, dataset): trainer = self.get_trainer() fp = dataset.get_data_stream() try: return trainer.transform(dataset.get_iterator(fp)) finally: fp.close() def set_trainer(self, trainer): super(Model, self).set_trainer(trainer) self.target_variable = trainer._feature_model.target_variable self.feature_count = len(trainer._feature_model.features.keys()) if self.status == self.STATUS_TRAINED and \ trainer.model_type == TYPE_CLASSIFICATION: self.labels = trainer._get_labels() def get_features_json(self): data = self.features_set.features if data is None: self.features_set.modified = True data = self.features_set.features data['features'] = [ f for f in data['features'] if f.get('disabled', False) is False ] data['classifier'] = self.classifier return json.dumps(data, indent=4) @property def features(self): return self.get_features_json() def prepare_fields_for_train(self, user, datasets=[], delete_metadata=True): """ Flushes model fields while re-training. Removes related models, when `delete_metadata` setted. 
""" if delete_metadata: from api.model_tests.models import TestResult, TestExample from api.base.models import db from api.servers.models import ServerModelVerification, \ VerificationExample LogMessage.delete_related_logs(self.id) def _del(Cls, related_name): count = Cls.query.filter(Cls.model_id == self.id).delete( synchronize_session=False) logging.info('%s %s to delete' % (count, related_name)) # delete server model verification examples # (as they don't have reference to model) smv = ServerModelVerification.query.filter( ServerModelVerification.model_id == self.id).all() if len(smv): smv_ids = [s.id for s in smv] count = VerificationExample.query.filter( VerificationExample.verification_id.in_(smv_ids)).delete( synchronize_session=False) logging.info('%s model verification examples to delete' % count) _del(ServerModelVerification, 'server model verifications') _del(TestExample, 'test examples') _del(TestResult, 'tests') _del(Weight, 'weights') _del(WeightsCategory, 'weights categories') _del(Segment, 'segments') self.datasets = datasets # model is trained, lock datasets used for training for dataset in datasets: dataset.locked = True dataset.save() self.status = self.STATUS_TRAINING self.visualization_data = {} self.error = "" self.trained_by = user self.comparable = False db.session.add(self) db.session.commit() def _check_deployed(self): if not app.config['MODIFY_DEPLOYED_MODEL'] and self.locked: self.reason_msg = 'Model {0} has been deployed and blocked ' \ 'for modifications. '.format(self.name) return False return True @property def can_edit(self): return self._check_deployed() and super(Model, self).can_edit @property def can_delete(self): if self.training_in_progress: self.reason_msg = "The model cannot be deleted while training is" \ " still in progress." return self._check_deployed() and not self.training_in_progress and \ super(Model, self).can_delete
tags_table = db.Table(
    'model_tag', db.Model.metadata,
    db.Column(
        'model_id', db.Integer,
        db.ForeignKey('model.id', ondelete='CASCADE', onupdate='CASCADE')),
    db.Column(
        'tag_id', db.Integer,
        db.ForeignKey('tag.id', ondelete='CASCADE', onupdate='CASCADE')))

data_sets_table = db.Table(
    'model_dataset', db.Model.metadata,
    db.Column(
        'model_id', db.Integer,
        db.ForeignKey('model.id', ondelete='CASCADE', onupdate='CASCADE')),
    db.Column(
        'data_set_id', db.Integer,
        db.ForeignKey('data_set.id', ondelete='CASCADE',
                      onupdate='CASCADE')))

transformer_data_sets_table = db.Table(
    'transformer_dataset', db.Model.metadata,
    db.Column(
def trained_by_id(cls):
    return db.Column(db.ForeignKey('user.id', ondelete='SET NULL'))
class TestExample(db.Model, BaseModel):
    __tablename__ = 'test_example'

    NONAME = 'noname'
    NOT_FILED_ID = '-1'

    example_id = db.Column(db.String(100))
    name = db.Column(db.String(100))
    label = db.Column(db.String(100))
    pred_label = db.Column(db.String(100))
    num = db.Column(db.Integer)
    prob = db.Column(postgresql.ARRAY(db.Float))
    data_input = db.Column(JSONType)
    weighted_data_input = db.Column(JSONType)

    test_result_id = db.Column(db.Integer, db.ForeignKey('test_result.id'))
    test_result = relationship(
        'TestResult', backref=backref('examples', cascade='all,delete'))
    test_name = db.Column(db.String(200))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship('Model')
    model_name = db.Column(db.String(200))

    def __repr__(self):
        return '<TestExample {0}>'.format(self.name)

    @property
    def parameters_weights(self):
        res = []

        def sort_by_weight(val):
            return -val['weight']

        def go_tree(params, prefix=''):
            for name, val in params.iteritems():
                if 'weight' in val and val['weight'] != 0:
                    if prefix:
                        val['name'] = '{0}->{1}'.format(prefix, name)
                    else:
                        val['name'] = name
                    res.append(val)
                if 'weights' in val:
                    go_tree(val['weights'], prefix=name)
            return res

        go_tree(self.weighted_data_input)
        res.sort(key=sort_by_weight)
        return res

    @property
    def is_weights_calculated(self):
        return self.weighted_data_input and self.weighted_data_input != {}

    def calc_weighted_data(self):
        if not self.data_input:
            return None

        from api.ml_models.helpers.features import get_features_vect_data
        model = self.model
        trainer = model.get_trainer()
        feature_model = trainer._feature_model
        segment = 'default'
        if len(trainer.with_segmentation) > 0:
            ndata = dict([(key.replace('->', '.'), val)
                          for key, val in self.data_input.iteritems()])
            data = trainer._apply_feature_types(ndata)
            segment = "_".join([
                str(data[feature_name])
                for feature_name in trainer._feature_model.group_by
            ])
            features = trainer.features[segment]
            for feature_name in trainer._feature_model.group_by:
                features.pop(feature_name)
        else:
            try:
                features = trainer.features[segment]
            except:
                features = feature_model.features

        ndata = dict([(key.replace('->', '.'), val)
                      for key, val in self.data_input.iteritems()])
        trainer._prepare_data(iter([ndata, ]), callback=None,
                              save_raw=False, is_predict=True)
        vect_data1 = trainer._get_vectorized_data(
            segment, trainer._test_prepare_feature)
        vect = scipy.sparse.hstack(vect_data1)
        vect_data = vect.todense().tolist()[0]
        data = get_features_vect_data(vect_data, features.items(),
                                      feature_model.target_variable)

        from api.ml_models.helpers.weights import get_example_params
        segment = Segment.query.filter(Segment.name == segment,
                                       Segment.model == model)[0]
        model_weights = Weight.query.with_entities(
            Weight.name, Weight.value).filter(
            Weight.segment_id == segment.id)
        weighted_data = dict(
            get_example_params(model_weights, self.data_input, data))
        self.weighted_data_input = weighted_data
        self.save()

        del trainer
        gc.collect()

    @classmethod
    def get_grouped(cls, field, model_id, test_result_id):
        cursor = cls.query.filter_by(
            model_id=model_id,
            test_result_id=test_result_id
        ).with_entities(
            cls.pred_label,
            cls.label,
            cls.prob,
            # Selecting field from json object isn't supported by alchemy,
            # using literal column instead
            expression.literal_column(
                "data_input->>'{!s}'".format(field)).label('group'))

        groups = defaultdict(list)
        for row in cursor.all():
            # row order follows with_entities: pred_label, label, prob, group
            groups[row[3]].append({
                'label': row[1],
                'pred': row[0],
                'prob': row[2],
            })
        return [{
            field: key,
            'list': value
        } for key, value in groups.iteritems()]

    @classmethod
    def get_data(cls, test_result_id, fields):
        db_fields = []
        for field in fields:
            if field == 'id':
                field = 'example_id'
            db_field = getattr(cls, field, None)
            if db_field:
                db_fields.append(db_field)
            else:
                # Selecting field from json object isn't supported by
                # alchemy, using literal column instead
                db_fields.append(
                    expression.literal_column(
                        "data_input->>'{!s}'".format(
                            field.replace('data_input.', ''))).label(field))

        cursor = cls.query.filter_by(
            test_result_id=test_result_id).with_entities(*db_fields)
        for row in cursor.all():
            yield dict(zip(row.keys(), row))
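# Illustrative sketch only (not part of the original module): grouping test
# examples by one of the imported data_input fields. The field name
# 'country' and the ids below are made-up values.
def _example_group_examples():
    grouped = TestExample.get_grouped(
        field='country', model_id=1, test_result_id=1)
    for group in grouped:
        logging.info('%s: %d examples',
                     group['country'], len(group['list']))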
class TestResult(db.Model, BaseModel):
    LOG_TYPE = LogMessage.RUN_TEST

    STATUS_QUEUED = 'Queued'
    STATUS_IMPORTING = 'Importing'
    STATUS_IMPORTED = 'Imported'
    STATUS_IN_PROGRESS = 'In Progress'
    STATUS_STORING = 'Storing'
    STATUS_COMPLETED = 'Completed'
    STATUS_ERROR = 'Error'
    STATUSES = [
        STATUS_QUEUED, STATUS_IMPORTING, STATUS_IMPORTED,
        STATUS_IN_PROGRESS, STATUS_STORING, STATUS_COMPLETED, STATUS_ERROR
    ]
    TEST_STATUSES = [
        STATUS_QUEUED, STATUS_IMPORTING, STATUS_IMPORTED,
        STATUS_IN_PROGRESS, STATUS_STORING
    ]

    __tablename__ = 'test_result'

    name = db.Column(db.String(200), nullable=False)
    status = db.Column(db.Enum(*STATUSES, name='test_statuses'))
    error = db.Column(db.String(300))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model,
                         backref=backref('tests', cascade='all,delete'))
    model_name = db.Column(db.String(200))

    data_set_id = db.Column(
        db.Integer, db.ForeignKey('data_set.id', ondelete='SET NULL'))
    dataset = relationship(DataSet, foreign_keys=[data_set_id])

    examples_count = db.Column(db.Integer)
    examples_fields = db.Column(postgresql.ARRAY(db.String))
    examples_size = db.Column(db.Float)

    parameters = db.Column(JSONType)
    classes_set = db.Column(postgresql.ARRAY(db.String))
    accuracy = db.Column(db.Float)
    roc_auc = db.Column(JSONType)
    metrics = db.Column(JSONType)
    memory_usage = db.Column(db.Integer)
    vect_data = deferred(db.Column(S3File))
    fill_weights = db.Column(db.Boolean, default=False)

    def __repr__(self):
        return '<TestResult {0}>'.format(self.name)

    def get_vect_data(self, num, segment):
        from pickle import loads
        data = loads(self.vect_data)
        # rows of all segments are stored consecutively, so skip the rows
        # of the segments preceding the requested one
        offset = 0
        for k, v in data.items():
            if k == segment:
                break
            offset += v.shape[0]
        import numpy
        if isinstance(data[segment], numpy.ndarray):
            return data[segment][num - offset]
        return data[segment].getrow(num - offset).todense().tolist()[0]

    def set_error(self, error, commit=True):
        self.error = str(error)[:299]
        self.status = TestResult.STATUS_ERROR
        if commit:
            self.save()

    @property
    def exports(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self, 'api.model_tests.tasks.get_csv_results')

    @property
    def db_exports(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self, 'api.model_tests.tasks.export_results_to_db')

    @property
    def confusion_matrix_calculations(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self, 'api.model_tests.tasks.calculate_confusion_matrix',
            statuses=AsyncTask.STATUSES)

    @property
    def can_edit(self):
        if not self.model.can_edit:
            self.reason_msg = self.model.reason_msg
            return False
        return super(TestResult, self).can_edit

    @property
    def can_delete(self):
        if not self.model.can_delete:
            self.reason_msg = self.model.reason_msg
            return False
        return super(TestResult, self).can_delete

    def delete(self):
        ds = self.dataset
        super(TestResult, self).delete()
        ds.unlock()

    @property
    def test_in_progress(self):
        return self.status in self.TEST_STATUSES
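# Illustrative sketch only (not part of the original module): reading back
# one vectorized row of a completed test. The example number and segment
# name are made-up values.
def _example_read_vect_row(test_result):
    if test_result.status == TestResult.STATUS_COMPLETED:
        row = test_result.get_vect_data(num=10, segment='default')
        logging.info('vectorized row has %d values', len(row))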
def feature_set_id(cls):
    return db.Column('feature_set_id', db.ForeignKey('feature_set.id'))
class DataSet(db.Model, BaseModel):
    """ Set of the imported data. """
    LOG_TYPE = LogMessage.IMPORT_DATA

    STATUS_NEW = 'New'
    STATUS_IMPORTING = 'Importing'
    STATUS_UPLOADING = 'Uploading'
    STATUS_IMPORTED = 'Imported'
    STATUS_ERROR = 'Error'
    STATUSES = [
        STATUS_IMPORTING, STATUS_UPLOADING, STATUS_IMPORTED,
        STATUS_ERROR, STATUS_NEW
    ]

    FORMAT_JSON = 'json'
    FORMAT_CSV = 'csv'
    FORMATS = [FORMAT_JSON, FORMAT_CSV]

    name = db.Column(db.String(200))
    status = db.Column(db.Enum(*STATUSES, name='dataset_statuses'),
                       default=STATUS_NEW)
    error = db.Column(db.String(300))  # TODO: truncate error to 300 symbols
    data = db.Column(db.String(200))
    import_params = db.Column(JSONType)

    # Generic relation to the import handler
    import_handler_id = db.Column(db.Integer, nullable=False)
    import_handler_type = db.Column(db.String(200), default='xml')
    import_handler_xml = db.Column(db.Text)

    cluster_id = db.Column(
        db.Integer, db.ForeignKey('cluster.id', ondelete='SET NULL'))
    cluster = relationship('Cluster', backref=backref('datasets'))
    pig_step = db.Column(db.Integer, nullable=True)
    pig_row = db.Column(JSONType)

    on_s3 = db.Column(db.Boolean)
    compress = db.Column(db.Boolean)
    filename = db.Column(db.String(200))
    filesize = db.Column(db.BigInteger)
    records_count = db.Column(db.Integer)
    time = db.Column(db.Integer)
    data_fields = db.Column(postgresql.ARRAY(db.String))
    format = db.Column(db.String(10))
    uid = db.Column(db.String(200))
    locked = db.Column(db.Boolean, default=False)

    @property
    def import_handler(self):
        """Provides in-Python access to the "parent" by choosing
        the appropriate relationship.
        """
        return ImportHandler.query.get(self.import_handler_id)

    @import_handler.setter
    def import_handler(self, handler):
        self.import_handler_id = handler.id
        self.import_handler_type = handler.TYPE
        self.import_handler_xml = handler.data

    def set_uid(self):
        if not self.uid:
            self.uid = uuid.uuid1().hex

    def get_s3_download_url(self, expires_in=3600):
        helper = AmazonS3Helper()
        return helper.get_download_url(self.uid, expires_in)

    def set_file_path(self):
        self.set_uid()
        data = '%s.%s' % (self.uid, 'gz' if self.compress else 'json')
        self.data = data
        from api.base.io_utils import get_or_create_data_folder
        path = get_or_create_data_folder()
        self.filename = join(path, data)
        self.save()

    @property
    def loaded_data(self):
        if not self.on_s3:
            raise Exception('Dataset is not stored on Amazon S3')
        if not hasattr(self, '_data'):
            self._data = self.load_from_s3()
        return self._data

    def get_data_stream(self):
        import gzip
        if not self.on_s3 or exists(self.filename):
            logging.info('Loading data from local file')
            open_meth = gzip.open if self.compress else open
            return open_meth(self.filename, 'r')
        else:
            logging.info('Loading data from Amazon S3')
            stream = StringIO.StringIO(self.loaded_data)
            if self.compress:
                logging.info('Decompress data')
                return gzip.GzipFile(fileobj=stream, mode='r')
            return stream

    def get_iterator(self, stream):
        from cloudml.trainer.streamutils import streamingiterload
        return streamingiterload(stream, source_format=self.format)

    def load_from_s3(self):
        helper = AmazonS3Helper()
        return helper.load_key(self.uid)

    def save_to_s3(self):
        meta = {
            'handler': self.import_handler_id,
            'dataset': self.name,
            'params': str(self.import_params)
        }
        self.set_uid()
        helper = AmazonS3Helper()
        helper.save_gz_file(self.uid, self.filename, meta)
        helper.close()
        self.on_s3 = True
        self.save()

    def set_error(self, error, commit=True):
        self.error = str(error)[:299]
        self.status = self.STATUS_ERROR
        if commit:
            self.save()

    def delete(self):
        # Stop task
        # self.terminate_task()  # TODO
        filename = self.filename
        on_s3 = self.on_s3
        uid = self.uid

        super(DataSet, self).delete()
        LogMessage.delete_related_logs(self.id,
                                       type_=LogMessage.IMPORT_DATA)

        # TODO: check import handler type
        try:
            os.remove(filename)
        except OSError:
            pass
        if on_s3:
            from botocore.exceptions import ClientError
            helper = AmazonS3Helper()
            try:
                helper.delete_key(uid)
            except ClientError as e:
                logging.exception(str(e))

    def save(self, *args, **kwargs):
        if self.status != self.STATUS_ERROR:
            self.error = ''
        super(DataSet, self).save(*args, **kwargs)

    def __repr__(self):
        return '<Dataset %r>' % self.name

    def _check_locked(self):
        if self.locked:
            self.reason_msg = 'Some existing models were trained/tested ' \
                              'using this dataset. '
            return False
        return True

    @property
    def can_edit(self):
        return self._check_locked() and super(DataSet, self).can_edit

    @property
    def can_delete(self):
        return self._check_locked() and super(DataSet, self).can_delete

    def unlock(self):
        from api.ml_models.models import data_sets_table
        from api.model_tests.models import TestResult
        if db.session.query(data_sets_table).filter(
                data_sets_table.c.data_set_id == self.id).count() == 0 and \
                TestResult.query.filter(
                    TestResult.data_set_id == self.id).count() == 0:
            self.locked = False
            self.save()
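# Illustrative sketch only (not part of the original module): streaming the
# rows of an imported dataset, whether it is stored locally or on S3.
def _example_iterate_dataset(dataset):
    stream = dataset.get_data_stream()
    try:
        for row in dataset.get_iterator(stream):
            logging.debug('row: %s', row)
    finally:
        stream.close()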