Example #1
class Weight(db.Model, BaseMixin):
    """
    Represents Model Parameter Weight
    """
    name = db.Column(db.String(200))
    short_name = db.Column(db.String(200))
    model_name = db.Column(db.String(200))
    value = db.Column(db.Float)
    value2 = db.Column(db.Float)
    is_positive = db.Column(db.Boolean)
    css_class = db.Column(db.String)
    class_label = db.Column(db.String(100), nullable=True)

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('weights'))

    segment_id = db.Column(db.Integer, db.ForeignKey('segment.id'))
    segment = relationship(Segment, backref=backref('weights'))

    parent = db.Column(db.String(200))

    test_weights = db.Column(JSONType)

    @hybrid_method
    def test_weight(self, test_id):
        return TestWeightColumn(test_id)
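
A minimal query sketch against this model, mirroring how `calc_weighted_data` (Example #18) reads weights per segment (assumes an application context; the segment name is illustrative):

segment = Segment.query.filter(Segment.name == 'default').first()
model_weights = Weight.query.with_entities(
    Weight.name, Weight.value).filter(Weight.segment_id == segment.id)
for name, value in model_weights:
    print name, value
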
Example #2
class WeightsCategory(db.Model, BaseMixin):
    """
    Represents Model Parameter Weights Category.

    NOTE: used for constructing trees of weights.
    """
    __tablename__ = 'weights_category'

    name = db.Column(db.String(200))
    short_name = db.Column(db.String(200))
    # TODO: remove it
    model_name = db.Column(db.String(200))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('weight_categories'))

    segment_id = db.Column(db.Integer, db.ForeignKey('segment.id'))
    segment = relationship(Segment, backref=backref('weight_categories'))

    normalized_weight = db.Column(db.Float)
    class_label = db.Column(db.String(100), nullable=True)

    parent = db.Column(db.String(200))

    # TODO: Maybe have FK Weight to WeightsCategory?
    # @aggregated('normalized_weight', sa.Column(sa.Float))
    # def normalized_weight(self):
    #     return sa.func.sum(Weight.value2)

    def __repr__(self):
        return '<Category {0}>'.format(self.name)
Example #3
class VerificationExample(BaseMixin, db.Model):
    verification_id = db.Column(db.Integer,
                                db.ForeignKey('server_model_verification.id'))
    verification = relationship('ServerModelVerification',
                                backref=backref('verification_examples',
                                                cascade='all,delete'))

    example_id = db.Column(db.Integer, db.ForeignKey('test_example.id'))
    example = relationship('TestExample',
                           backref=backref('verification_examples',
                                           cascade='all,delete'))

    result = db.Column(JSONType)
Example #4
class Predict(db.Model, BaseMixin):
    models = relationship(
        'PredictModel',
        secondary=lambda: predict_models_table, backref='predict_section')

    # Results
    label_id = db.Column(db.ForeignKey('predict_result_label.id'))
    label = relationship('PredictResultLabel', foreign_keys=[label_id],
                         cascade='all,delete', backref='results')

    probability_id = db.Column(db.ForeignKey('predict_result_probability.id'))
    probability = relationship(
        'PredictResultProbability', foreign_keys=[probability_id],
        cascade='all,delete', backref='probabilities')
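
A sketch of wiring models into the predict section through the `predict_models_table` secondary table defined in Example #13 (assumes an application context; names are illustrative):

predict = Predict()
predict.models.append(PredictModel(name='rank', value='model-name'))
db.session.add(predict)
db.session.commit()  # association row lands in predict_models_table
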
Example #5
class Segment(db.Model, BaseMixin):
    __tablename__ = 'segment'

    name = db.Column(db.String(200))
    records = db.Column(db.Integer)

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('segments'))
Example #6
class ClassifierGridParams(db.Model, BaseModel):
    STATUS_LIST = ('New', 'Queued', 'Calculating', 'Completed', 'Error')
    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('classifier_grid_params'))
    scoring = db.Column(db.String(100), default='accuracy')
    status = db.Column(db.Enum(*STATUS_LIST,
                               name='classifier_grid_params_statuses'),
                       nullable=False,
                       default='New')

    train_data_set_id = db.Column(
        db.Integer, db.ForeignKey('data_set.id', ondelete='SET NULL'))
    train_dataset = relationship('DataSet', foreign_keys=[train_data_set_id])

    test_data_set_id = db.Column(
        db.Integer, db.ForeignKey('data_set.id', ondelete='SET NULL'))
    test_dataset = relationship('DataSet', foreign_keys=[test_data_set_id])

    parameters = db.Column(JSONType)
    parameters_grid = db.Column(JSONType)
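
The two JSON columns are free-form; a plausible (hypothetical) payload for a parameter grid, following the scikit-learn `param_grid` convention, and assuming the `save()` helper from the base model:

grid = ClassifierGridParams(
    model=model,
    scoring='accuracy',
    parameters={'penalty': 'l2'},              # fixed classifier parameters
    parameters_grid={'C': [0.01, 0.1, 1.0]})   # values to search over
grid.save()
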
Example #7
class ServerModelVerification(BaseModel, db.Model, RefXmlImportHandlerMixin):
    """
    Represents verification of the model,
    that deployed to the server
    """
    STATUS_NEW = 'New'
    STATUS_QUEUED = 'Queued'
    STATUS_IN_PROGRESS = 'In Progress'
    STATUS_ERROR = 'Error'
    STATUS_DONE = 'Done'

    STATUSES = [
        STATUS_NEW, STATUS_QUEUED, STATUS_IN_PROGRESS, STATUS_ERROR,
        STATUS_DONE
    ]

    status = db.Column(db.Enum(*STATUSES, name='model_verification_statuses'),
                       nullable=False,
                       default=STATUS_NEW)
    error = db.Column(db.Text)
    server_id = db.Column(db.Integer, db.ForeignKey('server.id'))
    server = relationship(Server,
                          backref=backref('model_verifications',
                                          cascade='all,delete'))
    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model,
                         backref=backref('model_verifications',
                                         cascade='all,delete'))
    test_result_id = db.Column(db.Integer, db.ForeignKey('test_result.id'))
    test_result = relationship('TestResult',
                               backref=backref('model_verifications',
                                               cascade='all,delete'))
    description = db.Column(JSONType)
    result = db.Column(JSONType)
    params_map = db.Column(JSONType)
    clazz = db.Column(db.String(200))

    def __repr__(self):
        return '<ServerModelVerification {0}>'.format(self.model.name)
Example #8
class XmlEntity(db.Model, BaseMixin, RefXmlImportHandlerMixin):

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(200), nullable=False)
    autoload_fields = db.Column(db.Boolean, default=False)

    # JSON or CSV field as datasource
    transformed_field_id = db.Column(db.ForeignKey(
        'xml_field.id', use_alter=True,
        name="fk_transformed_field", ondelete='SET NULL'))
    transformed_field = relationship('XmlField', post_update=True,
                                     foreign_keys=[transformed_field_id],
                                     backref='entities_for_field_ds')
    # Sub entity
    entity_id = db.Column(db.ForeignKey('xml_entity.id'))
    entity = relationship('XmlEntity', remote_side=[id],
                          backref=backref('entities', cascade='all,delete'))

    # Global datasource
    datasource_id = db.Column(db.ForeignKey('xml_data_source.id',
                                            ondelete='CASCADE'))
    datasource = relationship('XmlDataSource',
                              foreign_keys=[datasource_id])
    query_id = db.Column(db.ForeignKey('xml_query.id'))
    query_obj = relationship('XmlQuery', foreign_keys=[query_id],
                             cascade='all,delete', backref='parent_entity')

    def __repr__(self):
        return "<Entity %s>" % self.name

    def to_dict(self):
        ent = {'name': self.name}
        if self.transformed_field:
            ent['datasource'] = self.transformed_field.name
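        # a global datasource, when also present, overwrites the
        # transformed field's name set above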
        if self.datasource:
            ent['datasource'] = self.datasource.name
        if self.autoload_fields:
            ent['autoload_fields'] = str(self.autoload_fields).lower()
        return ent
Example #9
class XmlSqoop(db.Model, BaseMixin):
    target = db.Column(db.String(200), nullable=False)
    table = db.Column(db.String(200), nullable=False)
    where = db.Column(db.String(200), nullable=True)
    direct = db.Column(db.String(200), nullable=True)
    mappers = db.Column(db.String(200), nullable=True)
    options = db.Column(db.String(200), nullable=True)
    text = db.Column(db.Text, nullable=True)

    FIELDS_TO_SERIALIZE = ['target', 'table', 'where', 'direct',
                           'mappers', 'options']

    # Global datasource
    datasource_id = db.Column(db.ForeignKey('xml_data_source.id',
                                            ondelete='SET NULL'))
    datasource = relationship('XmlDataSource',
                              foreign_keys=[datasource_id])

    entity_id = db.Column(db.ForeignKey('xml_entity.id'))
    entity = relationship(
        'XmlEntity', foreign_keys=[entity_id], backref=backref(
            'sqoop_imports', cascade='all,delete', order_by='XmlSqoop.id'))

    @property
    def pig_fields(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self,
            'api.import_handlers.tasks.load_pig_fields',
        )

    def to_dict(self):
        sqoop = super(XmlSqoop, self).to_dict()
        if self.datasource:
            sqoop['datasource'] = self.datasource.name
        return sqoop
Example #10
class XmlField(db.Model, BaseMixin):
    TYPES = PROCESS_STRATEGIES.keys()
    TRANSFORM_TYPES = ['json', 'csv']
    FIELDS_TO_SERIALIZE = ['name', 'type', 'column', 'jsonpath', 'delimiter',
                           'regex', 'split', 'dateFormat', 'template',
                           'transform', 'headers', 'script', 'required',
                           'multipart', 'key_path', 'value_path']

    def to_dict(self):
        fieldDict = super(XmlField, self).to_dict()
        if 'multipart' in fieldDict and fieldDict['multipart'] == 'false':
            fieldDict.pop('multipart')
        if 'required' in fieldDict and fieldDict['required'] == 'false':
            fieldDict.pop('required')
        return fieldDict

    name = db.Column(db.String(200), nullable=False)
    type = db.Column(db.Enum(*TYPES, name='xml_field_types'))
    column = db.Column(db.String(200))
    jsonpath = db.Column(db.String(200))
    delimiter = db.Column(db.String(200))
    regex = db.Column(db.String(200))
    split = db.Column(db.String(200))
    dateFormat = db.Column(db.String(200))
    template = db.Column(db.String(200))
    transform = db.Column(
        db.Enum(*TRANSFORM_TYPES, name='xml_transform_types'))
    headers = db.Column(db.String(200))
    script = db.Column(db.Text)
    required = db.Column(db.Boolean, default=False)
    multipart = db.Column(db.Boolean, default=False)
    key_path = db.Column(db.String(200))
    value_path = db.Column(db.String(200))

    entity_id = db.Column(db.ForeignKey('xml_entity.id'))
    entity = relationship(
        'XmlEntity', foreign_keys=[entity_id], backref=backref(
            'fields', cascade='all,delete', order_by='XmlField.id'))
Example #11
    @declared_attr
    def import_handler_id(cls):
        return db.Column('import_handler_id',
                         db.ForeignKey('xml_import_handler.id'))
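
These one-off column snippets are declarative-mixin bodies; wrapped in full they look roughly like this (the class name is taken from its usage in Examples #7 and #8, its `BaseMixin` base is an assumption, by analogy with `RefPredictModelMixin` in Example #13):

class RefXmlImportHandlerMixin(BaseMixin):
    @declared_attr
    def import_handler_id(cls):
        return db.Column('import_handler_id',
                         db.ForeignKey('xml_import_handler.id'))
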
Example #12
class XmlImportHandler(db.Model, ImportHandlerMixin, BaseDeployedEntity):
    TYPE = 'xml'

    DATASOURCES_ORDER = ['db', 'csv', 'http', 'pig', 'input']

    predict_id = db.Column(db.ForeignKey('predict.id', ondelete='CASCADE'))
    predict = relationship(
        'Predict', foreign_keys=[predict_id], backref="import_handler")
    locked = db.Column(db.Boolean, default=False)

    @property
    def data(self):
        return self.get_plan_config()

    @property
    def crc32(self):
        import zlib
        return '0x%08X' % (zlib.crc32(self.data) & 0xffffffff)

    @data.setter
    def data(self, val):
        has_root_ent = XmlEntity.query.filter_by(
            import_handler=self,
            entity=None).count()
        if has_root_ent:
            raise ValueError("Import Handler isn't empty")

        fill_import_handler(self, val)

    def _get_in_order(self, items, field, order):
        from collections import OrderedDict
        data = OrderedDict([(key, []) for key in order])
        for item in items:
            data[getattr(item, field)].append(item)
        for key in data:
            for item in data[key]:
                yield item

    def get_plan_config(self, pretty_print=True, secure=True):
        plan = etree.Element("plan")

        inputs = etree.SubElement(plan, "inputs")
        for param in self.xml_input_parameters:
            etree.SubElement(inputs, "param", **param.to_dict())

        for scr in self.xml_scripts:
            if scr.data and scr.data.strip():  # script isn't empty
                if scr.type == XmlScript.TYPE_PYTHON_FILE:
                    scr_tag = etree.SubElement(plan, 'script', src=scr.data)
                elif scr.type == XmlScript.TYPE_PYTHON_CODE:
                    scr_tag = etree.SubElement(plan, 'script')
                    scr_tag.text = etree.CDATA(scr.data)

        datasources = etree.SubElement(plan, "datasources")
        for ds in self._get_in_order(self.xml_data_sources, 'type',
                                     self.DATASOURCES_ORDER):
            if ds.name != "input":
                extra = ds.params if secure else {}
                etree.SubElement(
                    datasources, ds.type, name=ds.name, **extra)

        import_ = etree.SubElement(plan, "import")
        tree = get_entity_tree(self)

        def build_tree(entity, parent):
            ent = etree.SubElement(parent, "entity", **entity.to_dict())

            for sqoop in entity.sqoop_imports:
                sqoop_el = etree.SubElement(ent, "sqoop", **sqoop.to_dict())
                if sqoop.text:
                    sqoop_el.text = etree.CDATA(sqoop.text)

            if entity.query_obj:
                query = etree.SubElement(
                    ent, "query", **entity.query_obj.to_dict())
                query.text = etree.CDATA(entity.query_obj.text or '')

            for field in entity.fields:
                field_dict = field.to_dict()
                script = field_dict.get('script')
                script_text = None
                if script and (len(script.splitlines()) > 1 or
                               len(script) > 50):
                    del field_dict['script']
                    script_text = script
                field_el = etree.SubElement(ent, "field", **field_dict)
                if script_text:
                    script_tag = etree.SubElement(field_el, "script")
                    script_tag.text = etree.CDATA(script_text)

            for subentity in entity.entities:
                build_tree(subentity, parent=ent)

        build_tree(tree, import_)

        if self.predict is not None:
            predict = etree.SubElement(plan, "predict")
            for model in self.predict.models:
                predict_model = etree.SubElement(
                    predict, "model", **model.to_dict())
                for weight in model.predict_model_weights:
                    etree.SubElement(
                        predict_model, "weight", **weight.to_dict())

            if self.predict.label or self.predict.probability:
                result = etree.SubElement(predict, "result")
                if self.predict.label:
                    etree.SubElement(
                        result, "label", **self.predict.label.to_dict())
                if self.predict.probability:
                    etree.SubElement(
                        result, "probability",
                        **self.predict.probability.to_dict())

        return etree.tostring(plan, pretty_print=pretty_print)

    def get_iterator(self, params, callback=None):
        plan = ExtractionPlan(self.get_plan_config(), is_file=False)
        return CoreImportHandler(plan, params, callback=callback)

    def get_fields(self):
        """
        Returns list of the field names
        """
        if self.data is None:
            return []

        def get_entity_fields(entity):
            fields = []
            for name, field in entity.fields.iteritems():
                if not field.is_datasource_field:
                    fields.append(field.name)
            for sub_entity in entity.nested_entities_field_ds.values():
                fields += get_entity_fields(sub_entity)
            for sub_entity in entity.nested_entities_global_ds:
                fields += get_entity_fields(sub_entity)
            return fields

        # TODO: revisit this try..except after checking with real
        # import handlers
        try:
            plan = ExtractionPlan(self.data, is_file=False)
            return get_entity_fields(plan.entity)
        except Exception, exc:
            logging.error(exc)
            raise ImportHandlerError(exc.message, exc)
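
`_get_in_order` above yields datasources grouped by type in the fixed DATASOURCES_ORDER; a self-contained illustration of the same logic:

from collections import OrderedDict, namedtuple

def get_in_order(items, field, order):
    # mirror of XmlImportHandler._get_in_order; items whose `field`
    # value is missing from `order` would raise a KeyError
    data = OrderedDict([(key, []) for key in order])
    for item in items:
        data[getattr(item, field)].append(item)
    for key in data:
        for item in data[key]:
            yield item

DS = namedtuple('DS', ['name', 'type'])
items = [DS('queries', 'pig'), DS('conn', 'db'), DS('feed', 'http')]
print [ds.name for ds in
       get_in_order(items, 'type', ['db', 'csv', 'http', 'pig', 'input'])]
# -> ['conn', 'feed', 'queries']
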
Example #13
class PredictModel(db.Model, BaseMixin):
    FIELDS_TO_SERIALIZE = ('name', 'value', 'script')

    name = db.Column(db.String(200), nullable=False, name='name')
    value = db.Column(db.String(200), name='value')
    script = db.Column(db.Text, name='script')

predict_models_table = db.Table(
    'predict_models_table', db.Model.metadata,
    db.Column('predict_model_id', db.Integer, db.ForeignKey(
        'predict_model.id', ondelete='CASCADE', onupdate='CASCADE')),
    db.Column('predict_id', db.Integer, db.ForeignKey(
        'predict.id', ondelete='CASCADE', onupdate='CASCADE'))
)


class RefPredictModelMixin(BaseMixin):
    @declared_attr
    def predict_model_id(cls):
        return db.Column(
            'predict_model_id', db.ForeignKey('predict_model.id'))

    @declared_attr
    def predict_model(cls):
        from api.base.utils import convert_name, pluralize
        backref_name = pluralize(convert_name(cls.__name__))
Example #14
        self.features_dict['schema-name'] = self.schema_name
        BaseModel.save(self, commit=commit)

    def delete(self):
        features = Feature.query.filter(
            Feature.feature_set_id == self.id).all()
        for feature in features:
            feature.delete()
        super(FeatureSet, self).delete()


group_by_table = db.Table(
    'group_by_table', db.Model.metadata,
    db.Column(
        'feature_set_id', db.Integer,
        db.ForeignKey('feature_set.id', ondelete='CASCADE',
                      onupdate='CASCADE')),
    db.Column(
        'feature_id', db.Integer,
        db.ForeignKey('feature.id', ondelete='CASCADE', onupdate='CASCADE')))


@event.listens_for(Feature, "after_insert")
def after_insert_feature(mapper, connection, target):
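    # right after the INSERT the relationship attribute may not be populated
    # even though the FK is set; re-fetch with feature_set eagerly loaded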
    if target.feature_set is None and target.feature_set_id is not None:
        from sqlalchemy.orm import joinedload
        target = target.__class__.query.options(joinedload('feature_set')).get(
            target.id)
    if target.feature_set is not None:
        update_feature_set_on_change_features(connection, target.feature_set,
                                              target)
Example #15
class Model(db.Model, BaseModel, BaseTrainedEntity, BaseDeployedEntity):
    """
    Represents Model details.
    """
    LOG_TYPE = LogMessage.TRAIN_MODEL

    comparable = db.Column(db.Boolean, default=False)
    weights_synchronized = db.Column(db.Boolean, default=False)

    labels = db.Column(postgresql.ARRAY(db.String), default=[])
    example_label = db.Column(db.String(100))
    example_id = db.Column(db.String(100))

    train_records_count = db.Column(db.Integer)

    tags = relationship('Tag', secondary=lambda: tags_table, backref='models')

    target_variable = db.Column(db.Unicode)
    feature_count = db.Column(db.Integer, default=0)

    features_set_id = db.Column(db.Integer, db.ForeignKey('feature_set.id'))
    features_set = relationship('FeatureSet', uselist=False, backref='model')

    test_import_handler_id = db.Column(db.Integer, nullable=True)
    test_import_handler_type = db.Column(db.String(200), default='json')

    datasets = relationship('DataSet', secondary=lambda: data_sets_table)

    classifier = deferred(db.Column(JSONType))
    # Note: it can contain different keys depending on the classifier used
    visualization_data = deferred(db.Column(JSONType))
    locked = db.Column(db.Boolean, default=False)
    model_parts_size = deferred(db.Column(JSONType))

    def __init__(self, *args, **kwargs):
        super(Model, self).__init__(*args, **kwargs)
        self.visualization_data = {}

    def visualize_model(self,
                        data=None,
                        status=None,
                        commit=True,
                        segment=None):
        """
        Saves visualization data to the db.

        Note:
            visualization_data is a dict like:
            {
                segment_name1: {parameters: {status: new, ...}, ...},
                segment_name2: {parameters: {status: new, ...}, ...},
                ...
            }
        """
        def set_status(item, status):
            if 'parameters' not in item:
                item['parameters'] = {}
            item['parameters']['status'] = status

        from copy import deepcopy
        visualization_data = deepcopy(self.visualization_data or {})

        if segment is None:
            if data:
                visualization_data = data
            if status:
                set_status(visualization_data, status)
        else:
            if data is None:
                raise ValueError("data is required when segment is specified")
            # updating the visualization data of the specific segment
            visualization_data[segment] = data
            if status:
                set_status(visualization_data[segment], status)

        self.visualization_data = visualization_data
        if commit:
            self.save()

    @property
    def test_import_handler(self):
        try:
            return getattr(
                self,
                "rel_test_import_handler_%s" % self.test_import_handler_type)
        except AttributeError:
            return None

    @test_import_handler.setter
    def test_import_handler(self, handler):
        if handler is not None:
            self.test_import_handler_id = handler.id
            self.test_import_handler_type = handler.TYPE

    def create_segments(self, segments):
        """
        Creates Segment models by segments dict.

        segments: dict
            Dictionary where keys are segment names and values - count records
            in this segment.
        """
        for name, records in segments.iteritems():
            segment = Segment()
            segment.name = name
            segment.records = records
            segment.model = self
            segment.save()

    def __repr__(self):
        return "<Model {0}>".format(self.name)

    def save(self, commit=True):
        if self.features_set is None:
            from api.features.models import FeatureSet
            self.features_set = FeatureSet()
            db.session.add(self.features_set)
        if self.classifier is None:
            self.classifier = {}
        super(Model, self).save(commit)

    def delete(self):
        # delete features and feature set as they are used by this model only
        self.features_set.delete()

        # prepare dataset list to unlock and tags to decrease
        ds_to_unlock = self.datasets
        tags = self.tags
        super(Model, self).delete()
        # unlock datasets after model deletion
        for ds in ds_to_unlock:
            ds.unlock()
        # decrease corresponding tags counters
        for tag in tags:
            tag.update_counter()

    @property
    def dataset(self):
        return self.datasets[0] if len(self.datasets) else None

    @property
    def data_fields(self):
        ds = self.dataset
        return ds.data_fields if ds else []

    @property
    def test_handler_fields(self):
        handler = self.test_import_handler
        if handler:
            try:
                return handler.get_fields()
            except Exception:
                pass
        return []

    def run_test(self, dataset, callback=None):
        trainer = self.get_trainer()
        fp = dataset.get_data_stream()
        try:
            metrics = trainer.test(dataset.get_iterator(fp),
                                   callback=callback,
                                   save_raw=True)
        finally:
            fp.close()
        raw_data = trainer._raw_data
        trainer.clear_temp_data()
        self.set_trainer(trainer)
        self.save()
        return metrics, raw_data

    def transform_dataset(self, dataset):
        trainer = self.get_trainer()
        fp = dataset.get_data_stream()
        try:
            return trainer.transform(dataset.get_iterator(fp))
        finally:
            fp.close()

    def set_trainer(self, trainer):
        super(Model, self).set_trainer(trainer)
        self.target_variable = trainer._feature_model.target_variable
        self.feature_count = len(trainer._feature_model.features.keys())
        if self.status == self.STATUS_TRAINED and \
                trainer.model_type == TYPE_CLASSIFICATION:
            self.labels = trainer._get_labels()

    def get_features_json(self):
        data = self.features_set.features
        if data is None:
            self.features_set.modified = True
            data = self.features_set.features
        data['features'] = [
            f for f in data['features'] if f.get('disabled', False) is False
        ]
        data['classifier'] = self.classifier
        return json.dumps(data, indent=4)

    @property
    def features(self):
        return self.get_features_json()

    def prepare_fields_for_train(self,
                                 user,
                                 datasets=None,
                                 delete_metadata=True):
        """
        Flushes model fields while re-training.
        Removes related records when `delete_metadata` is set.
        """
        datasets = datasets or []
        if delete_metadata:
            from api.model_tests.models import TestResult, TestExample
            from api.base.models import db
            from api.servers.models import ServerModelVerification, \
                VerificationExample
            LogMessage.delete_related_logs(self.id)

            def _del(Cls, related_name):
                count = Cls.query.filter(Cls.model_id == self.id).delete(
                    synchronize_session=False)
                logging.info('%s %s to delete' % (count, related_name))

            # delete server model verification examples
            # (as they don't have a reference to the model)
            smv = ServerModelVerification.query.filter(
                ServerModelVerification.model_id == self.id).all()
            if len(smv):
                smv_ids = [s.id for s in smv]
                count = VerificationExample.query.filter(
                    VerificationExample.verification_id.in_(smv_ids)).delete(
                        synchronize_session=False)
                logging.info('%s model verification examples to delete' %
                             count)
            _del(ServerModelVerification, 'server model verifications')
            _del(TestExample, 'test examples')
            _del(TestResult, 'tests')
            _del(Weight, 'weights')
            _del(WeightsCategory, 'weights categories')
            _del(Segment, 'segments')

        self.datasets = datasets
        # model is trained, lock datasets used for training
        for dataset in datasets:
            dataset.locked = True
            dataset.save()

        self.status = self.STATUS_TRAINING
        self.visualization_data = {}
        self.error = ""
        self.trained_by = user
        self.comparable = False
        db.session.add(self)
        db.session.commit()

    def _check_deployed(self):
        if not app.config['MODIFY_DEPLOYED_MODEL'] and self.locked:
            self.reason_msg = 'Model {0} has been deployed and blocked ' \
                              'for modifications. '.format(self.name)
            return False
        return True

    @property
    def can_edit(self):
        return self._check_deployed() and super(Model, self).can_edit

    @property
    def can_delete(self):
        if self.training_in_progress:
            self.reason_msg = "The model cannot be deleted while training is" \
                              " still in progress."
        return self._check_deployed() and not self.training_in_progress and \
            super(Model, self).can_delete
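
A usage sketch for `visualize_model` (the payload is hypothetical; assumes a saved model within an application context):

model.visualize_model(
    data={'parameters': {}},  # hypothetical per-segment payload
    status='done',
    segment='default')
# stored as visualization_data['default'] and committed (commit=True)
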
Example #16
tags_table = db.Table(
    'model_tag', db.Model.metadata,
    db.Column(
        'model_id', db.Integer,
        db.ForeignKey('model.id', ondelete='CASCADE', onupdate='CASCADE')),
    db.Column('tag_id', db.Integer,
              db.ForeignKey('tag.id', ondelete='CASCADE', onupdate='CASCADE')))

data_sets_table = db.Table(
    'model_dataset', db.Model.metadata,
    db.Column(
        'model_id', db.Integer,
        db.ForeignKey('model.id', ondelete='CASCADE', onupdate='CASCADE')),
    db.Column(
        'data_set_id', db.Integer,
        db.ForeignKey('data_set.id', ondelete='CASCADE', onupdate='CASCADE')))

transformer_data_sets_table = db.Table(
    'transformer_dataset', db.Model.metadata,
    db.Column(
Example #17
    @declared_attr
    def trained_by_id(cls):
        return db.Column(db.ForeignKey('user.id', ondelete='SET NULL'))
Example #18
class TestExample(db.Model, BaseModel):
    __tablename__ = 'test_example'

    NONAME = 'noname'
    NOT_FILED_ID = '-1'

    example_id = db.Column(db.String(100))
    name = db.Column(db.String(100))
    label = db.Column(db.String(100))
    pred_label = db.Column(db.String(100))
    num = db.Column(db.Integer)
    prob = db.Column(postgresql.ARRAY(db.Float))

    data_input = db.Column(JSONType)
    weighted_data_input = db.Column(JSONType)

    test_result_id = db.Column(db.Integer, db.ForeignKey('test_result.id'))
    test_result = relationship('TestResult',
                               backref=backref('examples',
                                               cascade='all,delete'))
    test_name = db.Column(db.String(200))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship('Model')
    model_name = db.Column(db.String(200))

    def __repr__(self):
        return '<TestExample {0}>'.format(self.name)

    @property
    def parameters_weights(self):
        res = []

        def sort_by_weight(val):
            return -val['weight']

        def go_tree(params, prefix=''):
            for name, val in params.iteritems():
                if 'weight' in val and val['weight'] != 0:
                    if prefix:
                        val['name'] = '{0}->{1}'.format(prefix, name)
                    else:
                        val['name'] = name
                    res.append(val)
                if 'weights' in val:
                    go_tree(val['weights'], prefix=name)
            return res

        go_tree(self.weighted_data_input)

        res.sort(key=sort_by_weight)
        return res

    @property
    def is_weights_calculated(self):
        return self.weighted_data_input and self.weighted_data_input != {}

    def calc_weighted_data(self):
        if not self.data_input:
            return None

        from api.ml_models.helpers.features import get_features_vect_data
        model = self.model
        trainer = model.get_trainer()
        feature_model = trainer._feature_model
        segment = 'default'
        if len(trainer.with_segmentation) > 0:
            ndata = dict([(key.replace('->', '.'), val)
                          for key, val in self.data_input.iteritems()])
            data = trainer._apply_feature_types(ndata)
            segment = "_".join([
                str(data[feature_name])
                for feature_name in trainer._feature_model.group_by
            ])
            features = trainer.features[segment]
            for feature_name in trainer._feature_model.group_by:
                features.pop(feature_name)
        else:
            try:
                features = trainer.features[segment]
            except Exception:
                features = feature_model.features

        ndata = dict([(key.replace('->', '.'), val)
                      for key, val in self.data_input.iteritems()])
        trainer._prepare_data(iter([ndata]),
                              callback=None,
                              save_raw=False,
                              is_predict=True)
        vect_data1 = trainer._get_vectorized_data(
            segment, trainer._test_prepare_feature)

        vect = scipy.sparse.hstack(vect_data1)
        vect_data = vect.todense().tolist()[0]

        data = get_features_vect_data(vect_data, features.items(),
                                      feature_model.target_variable)

        from api.ml_models.helpers.weights import get_example_params
        segment = Segment.query.filter(Segment.name == segment,
                                       Segment.model == model)[0]
        model_weights = Weight.query.with_entities(
            Weight.name, Weight.value).filter(Weight.segment_id == segment.id)
        weighted_data = dict(
            get_example_params(model_weights, self.data_input, data))
        self.weighted_data_input = weighted_data
        self.save()
        del trainer
        gc.collect()

    @classmethod
    def get_grouped(cls, field, model_id, test_result_id):
        cursor = cls.query.filter_by(
            model_id=model_id, test_result_id=test_result_id
        ).with_entities(
            cls.pred_label,
            cls.label,
            cls.prob,
            # Selecting a field from a JSON object isn't supported by
            # SQLAlchemy; using a literal column instead
            expression.literal_column("data_input->>'{!s}'".format(field)
                                      ).label('group'))

        groups = defaultdict(list)
        for row in cursor.all():
            # entities above are (pred_label, label, prob, group)
            groups[row[3]].append({
                'label': row[1],
                'pred': row[0],
                'prob': row[2],
            })

        return [{
            field: key,
            'list': value
        } for key, value in groups.iteritems()]

    @classmethod
    def get_data(cls, test_result_id, fields):
        db_fields = []
        for field in fields:
            if field == 'id':
                field = 'example_id'
            db_field = getattr(cls, field, None)
            if db_field:
                db_fields.append(db_field)
            else:
                # Selecting a field from a JSON object isn't supported by
                # SQLAlchemy; using a literal column instead
                db_fields.append(
                    expression.literal_column("data_input->>'{!s}'".format(
                        field.replace('data_input.', ''))).label(field))

        cursor = cls.query.filter_by(
            test_result_id=test_result_id).with_entities(*db_fields)

        for row in cursor.all():
            yield dict(zip(row.keys(), row))
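
The `data_input->>'...'` strings above lean on the PostgreSQL JSON text-extraction operator; the same trick in isolation (the 'age' key is hypothetical):

from sqlalchemy.sql import expression

group_col = expression.literal_column("data_input->>'age'").label('group')
rows = TestExample.query.with_entities(TestExample.label, group_col).all()
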
Example #19
    @declared_attr
    def predict_model_id(cls):
        return db.Column(
            'predict_model_id', db.ForeignKey('predict_model.id'))
Example #20
class TestResult(db.Model, BaseModel):
    LOG_TYPE = LogMessage.RUN_TEST

    STATUS_QUEUED = 'Queued'
    STATUS_IMPORTING = 'Importing'
    STATUS_IMPORTED = 'Imported'
    STATUS_IN_PROGRESS = 'In Progress'
    STATUS_STORING = 'Storing'
    STATUS_COMPLETED = 'Completed'
    STATUS_ERROR = 'Error'

    STATUSES = [
        STATUS_QUEUED, STATUS_IMPORTING, STATUS_IMPORTED, STATUS_IN_PROGRESS,
        STATUS_STORING, STATUS_COMPLETED, STATUS_ERROR
    ]

    TEST_STATUSES = [
        STATUS_QUEUED, STATUS_IMPORTING, STATUS_IMPORTED, STATUS_IN_PROGRESS,
        STATUS_STORING
    ]

    __tablename__ = 'test_result'

    name = db.Column(db.String(200), nullable=False)
    status = db.Column(db.Enum(*STATUSES, name='test_statuses'))
    error = db.Column(db.String(300))

    model_id = db.Column(db.Integer, db.ForeignKey('model.id'))
    model = relationship(Model, backref=backref('tests', cascade='all,delete'))
    model_name = db.Column(db.String(200))

    data_set_id = db.Column(db.Integer,
                            db.ForeignKey('data_set.id', ondelete='SET NULL'))
    dataset = relationship(DataSet, foreign_keys=[data_set_id])

    examples_count = db.Column(db.Integer)
    examples_fields = db.Column(postgresql.ARRAY(db.String))
    examples_size = db.Column(db.Float)

    parameters = db.Column(JSONType)
    classes_set = db.Column(postgresql.ARRAY(db.String))
    accuracy = db.Column(db.Float)
    roc_auc = db.Column(JSONType)
    metrics = db.Column(JSONType)
    memory_usage = db.Column(db.Integer)

    vect_data = deferred(db.Column(S3File))
    fill_weights = db.Column(db.Boolean, default=False)

    def __repr__(self):
        return '<TestResult {0}>'.format(self.name)

    def get_vect_data(self, num, segment):
        from pickle import loads
        import numpy
        data = loads(self.vect_data)
        offset = 0
        for k, v in data.items():
            offset += v.shape[0]
            if k == segment:
                break
        if isinstance(data[segment], numpy.ndarray):
            return data[segment][num - offset]
        return data[segment].getrow(num - offset).todense().tolist()[0]

    def set_error(self, error, commit=True):
        self.error = str(error)[:299]
        self.status = TestResult.STATUS_ERROR
        if commit:
            self.save()

    @property
    def exports(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self,
            'api.model_tests.tasks.get_csv_results',
        )

    @property
    def db_exports(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self,
            'api.model_tests.tasks.export_results_to_db',
        )

    @property
    def confusion_matrix_calculations(self):
        from api.async_tasks.models import AsyncTask
        return AsyncTask.get_current_by_object(
            self,
            'api.model_tests.tasks.calculate_confusion_matrix',
            statuses=AsyncTask.STATUSES)

    @property
    def can_edit(self):
        if not self.model.can_edit:
            self.reason_msg = self.model.reason_msg
            return False
        return super(TestResult, self).can_edit

    @property
    def can_delete(self):
        if not self.model.can_delete:
            self.reason_msg = self.model.reason_msg
            return False
        return super(TestResult, self).can_delete

    def delete(self):
        ds = self.dataset
        super(TestResult, self).delete()
        ds.unlock()

    @property
    def test_in_progress(self):
        return self.status in self.TEST_STATUSES
Example #21
    @declared_attr
    def feature_set_id(cls):
        return db.Column('feature_set_id', db.ForeignKey('feature_set.id'))
Example #22
class DataSet(db.Model, BaseModel):
    """
    Set of the imported data.
    """
    LOG_TYPE = LogMessage.IMPORT_DATA

    STATUS_NEW = 'New'
    STATUS_IMPORTING = 'Importing'
    STATUS_UPLOADING = 'Uploading'
    STATUS_IMPORTED = 'Imported'
    STATUS_ERROR = 'Error'
    STATUSES = [
        STATUS_IMPORTING, STATUS_UPLOADING, STATUS_IMPORTED, STATUS_ERROR,
        STATUS_NEW
    ]

    FORMAT_JSON = 'json'
    FORMAT_CSV = 'csv'
    FORMATS = [FORMAT_JSON, FORMAT_CSV]

    name = db.Column(db.String(200))
    status = db.Column(db.Enum(*STATUSES, name='dataset_statuses'),
                       default=STATUS_NEW)
    error = db.Column(db.String(300))  # TODO: truncate error to 300 characters
    data = db.Column(db.String(200))
    import_params = db.Column(JSONType)

    # Generic relation to import handler
    import_handler_id = db.Column(db.Integer, nullable=False)
    import_handler_type = db.Column(db.String(200), default='xml')
    import_handler_xml = db.Column(db.Text)

    cluster_id = db.Column(db.Integer,
                           db.ForeignKey('cluster.id', ondelete='SET NULL'))
    cluster = relationship('Cluster', backref=backref('datasets'))
    pig_step = db.Column(db.Integer, nullable=True)
    pig_row = db.Column(JSONType)

    on_s3 = db.Column(db.Boolean)
    compress = db.Column(db.Boolean)
    filename = db.Column(db.String(200))
    filesize = db.Column(db.BigInteger)
    records_count = db.Column(db.Integer)
    time = db.Column(db.Integer)
    data_fields = db.Column(postgresql.ARRAY(db.String))
    format = db.Column(db.String(10))
    uid = db.Column(db.String(200))
    locked = db.Column(db.Boolean, default=False)

    @property
    def import_handler(self):
        """Provides in-Python access to the "parent" by choosing
        the appropriate relationship.
        """
        return ImportHandler.query.get(self.import_handler_id)

    @import_handler.setter
    def import_handler(self, handler):
        self.import_handler_id = handler.id
        self.import_handler_type = handler.TYPE
        self.import_handler_xml = handler.data

    def set_uid(self):
        if not self.uid:
            self.uid = uuid.uuid1().hex

    def get_s3_download_url(self, expires_in=3600):
        helper = AmazonS3Helper()
        return helper.get_download_url(self.uid, expires_in)

    def set_file_path(self):
        self.set_uid()
        data = '%s.%s' % (self.uid, 'gz' if self.compress else 'json')
        self.data = data
        from api.base.io_utils import get_or_create_data_folder
        path = get_or_create_data_folder()
        self.filename = join(path, data)
        self.save()

    @property
    def loaded_data(self):
        if not self.on_s3:
            raise Exception('Dataset is not stored on S3')

        if not hasattr(self, '_data'):
            self._data = self.load_from_s3()
        return self._data

    def get_data_stream(self):
        import gzip
        if not self.on_s3 or exists(self.filename):
            logging.info('Loading data from local file')
            open_meth = gzip.open if self.compress else open
            return open_meth(self.filename, 'r')
        else:
            logging.info('Loading data from Amazon S3')
            stream = StringIO.StringIO(self.loaded_data)
            if self.compress:
                logging.info('Decompress data')
                return gzip.GzipFile(fileobj=stream, mode='r')
            return stream

    def get_iterator(self, stream):
        from cloudml.trainer.streamutils import streamingiterload
        return streamingiterload(stream, source_format=self.format)

    def load_from_s3(self):
        helper = AmazonS3Helper()
        return helper.load_key(self.uid)

    def save_to_s3(self):
        meta = {
            'handler': self.import_handler_id,
            'dataset': self.name,
            'params': str(self.import_params)
        }
        self.set_uid()
        helper = AmazonS3Helper()
        helper.save_gz_file(self.uid, self.filename, meta)
        helper.close()
        self.on_s3 = True
        self.save()

    def set_error(self, error, commit=True):
        self.error = str(error)[:299]
        self.status = self.STATUS_ERROR
        if commit:
            self.save()

    def delete(self):
        # Stop task
        # self.terminate_task()  # TODO
        filename = self.filename
        on_s3 = self.on_s3
        uid = self.uid

        super(DataSet, self).delete()
        LogMessage.delete_related_logs(self.id, type_=LogMessage.IMPORT_DATA)

        # TODO: check import handler type
        try:
            os.remove(filename)
        except OSError:
            pass
        if on_s3:
            from botocore.exceptions import ClientError
            helper = AmazonS3Helper()
            try:
                helper.delete_key(uid)
            except ClientError as e:
                logging.exception(str(e))

    def save(self, *args, **kwargs):
        if self.status != self.STATUS_ERROR:
            self.error = ''
        super(DataSet, self).save(*args, **kwargs)

    def __repr__(self):
        return '<Dataset %r>' % self.name

    def _check_locked(self):
        if self.locked:
            self.reason_msg = 'Some existing models were trained/tested ' \
                              'using this dataset. '
            return False
        return True

    @property
    def can_edit(self):
        return self._check_locked() and super(DataSet, self).can_edit

    @property
    def can_delete(self):
        return self._check_locked() and super(DataSet, self).can_delete

    def unlock(self):
        from api.ml_models.models import data_sets_table
        from api.model_tests.models import TestResult
        if db.session.query(data_sets_table).filter(
                data_sets_table.c.data_set_id == self.id).count() == 0 and \
           TestResult.query.filter(
                TestResult.data_set_id == self.id).count() == 0:
            self.locked = False
            self.save()
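
Reading a dataset back mirrors what `Model.run_test` does in Example #15: open the stream, wrap it in the format-aware iterator, and close the file handle in a finally block (`process` is a placeholder):

fp = dataset.get_data_stream()
try:
    for row in dataset.get_iterator(fp):
        process(row)
finally:
    fp.close()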