Example #1
    def __init__(self,
                 data,
                 model_name,
                 ml_model_name='pytorch.models.column_based_fcnn',
                 config=None):
        """

        :param data:
        :type data: ModelData
        :param model_name:
        :param ml_model_name:
        :param config:
        """

        self.data = data
        self.model_name = model_name
        self.ml_model_name = ml_model_name
        self.config = config if config is not None else {}
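        # serialize and hash the config so runs with identical settings
        # share the same fingerprint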
        self.config_serialized = json.dumps(self.config)
        self.config_hash = hashtext(self.config_serialized)

        # load any previously persisted metadata for this model

        self.persistent_model_metadata = PersistentModelMetadata().find_one(
            {'model_name': self.model_name})

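        # record this training run's ML-model info so it can be looked up later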
        self.ml_model_info = PersistentMlModelInfo()
        self.ml_model_info.model_name = self.model_name
        self.ml_model_info.ml_model_name = self.ml_model_name
        self.ml_model_info.config_serialized = self.config_serialized
        self.ml_model_info.insert()

        self.framework, self.dummy, self.data_model_name = \
            self.ml_model_name.split('.')
        self.ml_model_module_path = ('mindsdb.libs.ml_models.'
                                     + self.ml_model_name + '.'
                                     + self.data_model_name)
        self.ml_model_class_name = convert_snake_to_cammelcase_string(
            self.data_model_name)

        self.ml_model_module = importlib.import_module(
            self.ml_model_module_path)
        self.ml_model_class = getattr(self.ml_model_module,
                                      self.ml_model_class_name)

        self.train_sampler = Sampler(
            self.data.train_set,
            metadata_as_stored=self.persistent_model_metadata,
            ignore_types=self.ml_model_class.ignore_types,
            sampler_mode=SAMPLER_MODES.LEARN)
        self.test_sampler = Sampler(
            self.data.test_set,
            metadata_as_stored=self.persistent_model_metadata,
            ignore_types=self.ml_model_class.ignore_types,
            sampler_mode=SAMPLER_MODES.LEARN)

        self.train_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
        self.test_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
        self.sample_batch = self.train_sampler.getSampleBatch()

        # the last time the model was saved into GridFS; assume it was just now
        self.gfs_save_head_time = time.time()

        logging.info('Starting model...')
        self.data_model_object = self.ml_model_class(self.sample_batch)
        logging.info('Training model...')
        self.train()
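
For context, here is a minimal, self-contained sketch of the dynamic class-loading pattern used above: split the dotted model name, build a module path, import the module, and fetch the CamelCase class from it. The function name load_model_class and the base_package parameter are illustrative, not part of the source, and actually calling the function requires the mindsdb package to be importable.

import importlib

def load_model_class(ml_model_name, base_package='mindsdb.libs.ml_models'):
    # 'pytorch.models.column_based_fcnn' -> framework, submodule, data model name
    framework, _, data_model_name = ml_model_name.split('.')
    # e.g. 'mindsdb.libs.ml_models.pytorch.models.column_based_fcnn.column_based_fcnn'
    module_path = '{}.{}.{}'.format(base_package, ml_model_name, data_model_name)
    # 'column_based_fcnn' -> 'ColumnBasedFcnn'
    class_name = ''.join(part.capitalize() for part in data_model_name.split('_'))
    return getattr(importlib.import_module(module_path), class_name)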
Example #2
    def run(self):

        self.train_meta_data = TransactionMetadata()
        self.train_meta_data.setFromDict(
            self.transaction.persistent_model_metadata.train_metadata)

        group_by = self.train_meta_data.model_group_by

        group_by_index = None
        if group_by:
            # TODO: Consider supporting more than one index column
            group_by_index = self.transaction.input_data.columns.index(group_by)

        # template of how we store columns: one empty list per column
        column_packs_template = OrderedDict()
        for column_name in self.transaction.input_data.columns:
            column_packs_template[column_name] = []

        if self.transaction.metadata.type == TRANSACTION_LEARN:
            groups = [{
                'name': 'test',
                'target_set': self.transaction.model_data.test_set,
                'map': self.transaction.model_data.test_set_map,
                'indexes': self.transaction.input_data.test_indexes
            }, {
                'name': 'train',
                'target_set': self.transaction.model_data.train_set,
                'map': self.transaction.model_data.train_set_map,
                'indexes': self.transaction.input_data.train_indexes
            }, {
                'name': 'validation',
                'target_set': self.transaction.model_data.validation_set,
                'map': self.transaction.model_data.validation_set_map,
                'indexes': self.transaction.input_data.validation_indexes
            }]
        else:
            groups = [{
                'name': 'predict',
                'target_set': self.transaction.model_data.predict_set,
                'map': self.transaction.model_data.predict_set_map,
                # TODO: measure impact of this
                'indexes': range(len(self.transaction.input_data.data_array))
            }]

        # iterate over all groups and populate tensors by columns

        for group in groups:

            target_set = group['target_set']  # local reference for convenience

            # iterate over all indexes that belong to this group
            for input_row_index in group['indexes']:

                # extract the row from the input data
                row = self.transaction.input_data.data_array[input_row_index]
                index_map = group['map']

                if group_by is not None:
                    group_by_hash = hashtext(row[group_by_index])

                else:
                    group_by_hash = KEY_NO_GROUP_BY

                # if this group's set has not been initialized yet, add a new one
                if group_by_hash not in target_set:
                    target_set[group_by_hash] = copy.deepcopy(column_packs_template)
                    index_map[group_by_hash] = {}

                # Now populate into the group_hash column pile
                for column_index, cell_value in enumerate(row):

                    column_name = self.transaction.input_data.columns[column_index]

                    value = self.cast(cell_value)
                    stats = (self.transaction.persistent_model_metadata
                             .column_stats[column_name])

                    # TODO: Provide a framework for custom norm functions
                    # TODO: FIX norm always adds a column for is-null
                    # norm() should return an already-normalized vector representation
                    normalized = norm(value=value, cell_stats=stats)

                    # keep track of where it came from in the input data,
                    # in case we need to go back
                    position = len(target_set[group_by_hash][column_name])
                    index_map[group_by_hash][position] = input_row_index

                    # append normalized vector to column tensor
                    target_set[group_by_hash][column_name] += [normalized]

                    # for predicted numeric/date columns, also keep a bucketed
                    # representation in an expanded companion column
                    if (stats[KEYS.DATA_TYPE] in [DATA_TYPES.NUMERIC, DATA_TYPES.DATE]
                            and column_name in self.train_meta_data.model_predict_columns):
                        column_name_expanded = EXTENSION_COLUMNS_TEMPLATE.format(
                            column_name=column_name)
                        if column_name_expanded not in target_set[group_by_hash]:
                            target_set[group_by_hash][column_name_expanded] = []

                        normalized_buckets = norm_buckets(value=value, cell_stats=stats)
                        target_set[group_by_hash][column_name_expanded] += [
                            normalized_buckets]

            # post-process each group: compute order-by distances, append
            # history vectors, and convert each column into a numpy array
            for group_by_hash in target_set:

                distances = None

                # if we have a group by and an order by, calculate a distance
                # between each pair of consecutive data points in this batch
                if (self.train_meta_data.model_group_by is not None
                        and self.train_meta_data.model_order_by is not None):
                    distances = []
                    batch_height = len(
                        target_set[group_by_hash][self.train_meta_data.model_group_by])

                    for j in range(batch_height):
                        # vector of order-by values for the current row
                        order_by_bottom_vector = np.array(list(itertools.chain.from_iterable(
                            [target_set[group_by_hash][order_by_col][j]
                             for order_by_col in self.train_meta_data.model_order_by])))
                        # vector of order-by values for the previous row
                        # (the first row is compared with itself, giving distance 0)
                        if j == 0:
                            order_by_top_vector = order_by_bottom_vector
                        else:
                            order_by_top_vector = np.array(list(itertools.chain.from_iterable(
                                [target_set[group_by_hash][order_by_col][j - 1]
                                 for order_by_col in self.train_meta_data.model_order_by])))

                        # Euclidean distance between the consecutive rows
                        distance = float(np.linalg.norm(
                            order_by_top_vector - order_by_bottom_vector))
                        distances.append(distance)

                # Append the time series data to each column
                # NOTE: make sure self.train_meta_data.model_predict_columns are the
                #       first to be converted into vectors; if there is a time series
                #       query we will want to add the history of the target value
                #       (see self._getRowExtraVector)
                columns_in_order = self.train_meta_data.model_predict_columns + [
                    column_name for column_name in target_set[group_by_hash]
                    if column_name not in self.train_meta_data.model_predict_columns
                ]

                for column_name in columns_in_order:

                    # if there is a group by and order by and this is not a predicted,
                    # grouped, or ordering column, append a history vector
                    # TODO: Encode the history vector if possible
                    non_groupable_columns = (
                        self.train_meta_data.model_predict_columns
                        + [self.train_meta_data.model_group_by]
                        + self.train_meta_data.model_order_by)
                    # NOTE: distances is only not None when there is a group by,
                    #       so this branch only runs for group-by queries
                    if distances is not None and column_name not in non_groupable_columns:
                        # for each row, build a history vector and append it
                        for col_row_index in range(len(target_set[group_by_hash][column_name])):
                            row_extra_vector = self._getRowExtraVector(
                                target_set[group_by_hash], column_name,
                                col_row_index, distances)
                            target_set[group_by_hash][column_name][col_row_index] = (
                                target_set[group_by_hash][column_name][col_row_index]
                                + row_extra_vector)

                    target_set[group_by_hash][column_name] = np.array(
                        target_set[group_by_hash][column_name])

        return []
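
As a rough illustration of what run() builds, here is a minimal, self-contained sketch of the column-pack idea: rows are regrouped column-major, keyed by an optional group-by value, so each column becomes one list (later a numpy array) per group. The name build_column_packs and the '__all__' fallback key are illustrative, not from the source, and the per-cell normalization step is omitted.

from collections import OrderedDict
import numpy as np

def build_column_packs(rows, columns, group_by_index=None):
    packs = {}
    for row in rows:
        key = row[group_by_index] if group_by_index is not None else '__all__'
        if key not in packs:
            # one empty list per column, mirroring column_packs_template above
            packs[key] = OrderedDict((c, []) for c in columns)
        for column_name, cell in zip(columns, row):
            packs[key][column_name].append(cell)
    # turn each column into a numpy array, as run() does at the end
    return {key: OrderedDict((c, np.array(v)) for c, v in cols.items())
            for key, cols in packs.items()}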