Example #1
0
    def getPreparedInputDF(self, train_metadata):
        """
        Build the input DataFrame for this transaction.

        For PREDICT transactions the frame is aligned to the columns the
        model was trained with: columns unseen at train time are dropped,
        missing ones are added as None, and column order is normalized.
        For any other transaction type the raw training data is used.

        :param train_metadata: metadata describing the training data
        :type train_metadata: TransactionMetadata
        :return: a pandas DataFrame, optionally sorted by (group_by, order_by)
        """
        if self.transaction.metadata.type == TRANSACTION_PREDICT:

            # these are the columns in the model, pulled from persistent_data
            columns = self.transaction.persistent_model_metadata.columns  # type: list

            # if the predict statement comes with some data as from_data use it
            if self.transaction.metadata.from_data is not None:

                # make sure we build a dataframe that has all the columns we need
                df = self.transaction.metadata.from_data
                df = df.where((pandas.notnull(df)), None)

                from_data_columns = df.columns

                # remove the ones that dont exist in the train data.
                # NOTE: DataFrame.drop returns a new frame; the original code
                # discarded the result, so the drop was silently a no-op.
                for col in from_data_columns:
                    if col not in columns:
                        logging.debug(
                            'Removing column "{col}" from data as it did not exist in training'
                            .format(col=col))
                        df = df.drop(columns=[col])

                # add the ones that dont exist in the incoming data
                for col in columns:
                    if col not in from_data_columns:
                        df[col] = None

                # make sure it has the same order as the training columns
                result = df[columns]

            else:

                # normalize a single when-condition dict into a list of dicts
                if not isinstance(
                        self.transaction.metadata.model_when_conditions, list):
                    when_conditions = [
                        self.transaction.metadata.model_when_conditions
                    ]
                else:
                    when_conditions = self.transaction.metadata.model_when_conditions

                when_conditions_list = []
                # here we want to make a list of the type  ( ValueForField1, ValueForField2,..., ValueForFieldN ), ...
                for when_condition in when_conditions:
                    cond_list = [None] * len(
                        columns)  # empty list with blanks for values

                    for condition_col in when_condition:
                        col_index = columns.index(condition_col)
                        cond_list[col_index] = when_condition[condition_col]

                    when_conditions_list.append(cond_list)

                result = pandas.DataFrame(when_conditions_list,
                                          columns=columns)
        else:

            df = train_metadata.from_data
            result = df.where((pandas.notnull(df)), None)

        # apply order by (group_by, order_by)
        if train_metadata.model_order_by:
            if train_metadata.model_group_by:
                sort_by = [train_metadata.model_group_by
                           ] + train_metadata.model_order_by
                # one ascending flag per sort key (the original hard-coded
                # two flags, which breaks for != 2 sort columns)
                result = result.sort_values(sort_by,
                                            ascending=[True] * len(sort_by))
            else:
                # model_order_by may be a single column name or a list
                if isinstance(train_metadata.model_order_by, list):
                    sort_by = train_metadata.model_order_by
                else:
                    sort_by = [train_metadata.model_order_by]
                result = result.sort_values(sort_by,
                                            ascending=[True] * len(sort_by))

        return result
Example #2
0
    def testModel(self, test_sampler):
        """
        Evaluate the model over a test sampler and compute error/accuracy.

        Runs every batch through the model under each column-blanking
        permutation; the "no columns blanked" run (empty permutation) is
        additionally kept aside to compute the reported accuracy.

        :param test_sampler: iterable yielding test batches
        :return: TesterResponse with error (loss) and accuracy (r value),
                 or None if the sampler yielded no batches
        """

        real_target_all = []
        predicted_target_all = []

        real_target_all_ret = []
        predicted_target_all_ret = []

        # BUGFIX: if the sampler is empty, `batch` was never bound and the
        # `batch is None` guard below raised NameError instead of firing.
        batch = None

        self.eval()  # toggle eval
        for batch_number, batch in enumerate(test_sampler):
            for permutation in self.col_permutations:
                batch.blank_columns = permutation
                logging.debug(
                    '[EPOCH-BATCH] testing batch: {batch_number}'.format(
                        batch_number=batch_number))
                # get real and predicted values by running the model with the input of this batch
                predicted_target = self.forward(
                    batch.getInput(flatten=self.flatInput))
                real_target = batch.getTarget(flatten=self.flatTarget)
                # append to all targets and all real values
                real_target_all += real_target.data.tolist()
                predicted_target_all += predicted_target.data.tolist()

                if len(permutation) == 0:
                    # the unblanked run is what accuracy is reported on
                    real_target_all_ret += real_target.data.tolist()
                    predicted_target_all_ret += predicted_target.data.tolist()

        if batch is None:
            logging.error('there is no data in test, we should not be here')
            return

        # calculate the error for all values (all permutations)
        predicted_targets = batch.deflatTarget(np.array(predicted_target_all))
        real_targets = batch.deflatTarget(np.array(real_target_all))
        # calculate the error for the unblanked run only
        predicted_targets_ret = batch.deflatTarget(
            np.array(predicted_target_all_ret))
        real_targets_ret = batch.deflatTarget(np.array(real_target_all_ret))

        r_values = {}
        # calculate r and other statistical properties of error
        for target_key in real_targets_ret:

            r_values[target_key] = explained_variance_score(
                real_targets_ret[target_key],
                predicted_targets_ret[target_key],
                multioutput='variance_weighted')

        # calculate error using the model's error function, per target column
        errors = {
            target_key: float(
                self.errorFunction(
                    Variable(torch.FloatTensor(predicted_targets[target_key])),
                    Variable(torch.FloatTensor(
                        real_targets[target_key]))).item())
            for target_key in real_targets
        }
        error = np.average([errors[key] for key in errors])
        r_value = np.average([r_values[key] for key in r_values])

        resp = TesterResponse(error=error,
                              accuracy=r_value,
                              predicted_targets=predicted_targets_ret,
                              real_targets=real_targets_ret)

        self.current_accuracy = r_value

        return resp
Example #3
0
    def train(self):
        """
        Train the data model across every configured learning rate.

        For each learning rate index: iterates the model's training
        generator, tests at every new epoch, persists the model to disk
        whenever a new lowest test error is reached, and finally reloads
        the best saved model from storage before moving to the next
        learning rate.

        :return: None (returns early if shouldContinue() says stop or no
                 stored model is found for the next learning rate)
        """

        last_epoch = 0
        lowest_error = None  # best (lowest) test error seen so far
        highest_accuracy = 0  # accuracy at the epoch with the lowest error
        local_files = None  # handle(s) to the last on-disk model save

        for i in range(len(self.data_model_object.learning_rates)):

            self.data_model_object.setLearningRateIndex(i)

            # trainModel is a generator that yields once per batch
            for train_ret in self.data_model_object.trainModel(
                    self.train_sampler):

                logging.debug(
                    'Training State epoch:{epoch}, batch:{batch}, loss:{loss}'.
                    format(epoch=train_ret.epoch,
                           batch=train_ret.batch,
                           loss=train_ret.loss))

                # save model every new epoch
                if last_epoch != train_ret.epoch:
                    last_epoch = train_ret.epoch
                    logging.debug(
                        'New epoch:{epoch}, testing and calculating error'.
                        format(epoch=last_epoch))
                    test_ret = self.data_model_object.testModel(
                        self.test_sampler)
                    logging.info(
                        'Test Error:{error}, Accuracy:{accuracy} | Best Accuracy so far: {best_accuracy}'
                        .format(error=test_ret.error,
                                accuracy=test_ret.accuracy,
                                best_accuracy=highest_accuracy))
                    is_it_lowest_error_epoch = False
                    # if lowest error save model
                    # (first test seeds lowest_error so the comparison below works)
                    if lowest_error in [None]:
                        lowest_error = test_ret.error
                    if lowest_error > test_ret.error:
                        is_it_lowest_error_epoch = True
                        lowest_error = test_ret.error
                        highest_accuracy = test_ret.accuracy
                        logging.info(
                            '[SAVING MODEL] Lowest ERROR so far! - Test Error: {error}, Accuracy: {accuracy}'
                            .format(error=test_ret.error,
                                    accuracy=test_ret.accuracy))
                        logging.debug(
                            'Lowest ERROR so far! Saving: model {model_name}, {data_model} config:{config}'
                            .format(
                                model_name=self.model_name,
                                data_model=self.ml_model_name,
                                config=self.ml_model_info.config_serialized))

                        # save model local file
                        local_files = self.saveToDisk(local_files)
                        # throttle model saving into GridFS to 10 minutes
                        # self.saveToGridFs(local_files, throttle=True)

                        # save model predicted - real vectors
                        logging.debug(
                            'Saved: model {model_name}:{ml_model_name} state vars into db [OK]'
                            .format(model_name=self.model_name,
                                    ml_model_name=self.ml_model_name))

                    # check if continue training
                    if self.shouldContinue() == False:
                        return
                    # save/update model loss, error, confusion_matrix
                    self.registerModelData(train_ret, test_ret,
                                           is_it_lowest_error_epoch)

            logging.info(
                'Loading model from store for retrain on new learning rate {lr}'
                .format(lr=self.data_model_object.learning_rates[i]
                        [LEARNING_RATE_INDEX]))
            # after its done with the first batch group, get the one with the lowest error and keep training

            ml_model_info = self.ml_model_info.find_one({
                'model_name':
                self.model_name,
                'ml_model_name':
                self.ml_model_name,
                'config_serialized':
                json.dumps(self.config)
            })

            if ml_model_info is None:
                # TODO: Make sure we have a model for this
                logging.info('No model found in storage')
                return

            fs_file_ids = ml_model_info.fs_file_ids

            # resume from the best persisted weights for the next learning rate
            self.data_model_object = self.ml_model_class.loadFromDisk(
                file_ids=fs_file_ids)
Example #4
0
    def trainModel(self, train_sampler, learning_rate_index=None):
        """
        Generator that trains the model over the given sampler.

        Each epoch iterates the sampler; each batch is trained under one
        rotating column-blanking permutation plus a run with no blanked
        columns, so the model also learns to cope with missing inputs.

        :param train_sampler: the sampler to iterate over and train on
        :param learning_rate_index: optional index into the configured
            learning rates; when given, the rate is switched before training
        :yield: TrainerResponse with epoch, batch and running average loss
        """

        model_object = self
        response = TrainerResponse(model_object)

        if learning_rate_index is not None:
            self.setLearningRateIndex(learning_rate_index)

        # force the optimizer to be (re)created by the model on first use
        model_object.optimizer = None

        for epoch in range(self.total_epochs):

            full_set_loss = 0
            total_samples = 0
            response.epoch = epoch
            perm_index = 0
            # train epoch
            for batch_number, batch in enumerate(train_sampler):
                # TODO: Build mechanics for model to learn about missing data
                # How? Here build permutation list of all possible combinations of blank columns
                # Iterate over permutations on train loop (which is what is inside this for statement)
                # Interface: Batch.setNullColumns(cols=<type: list>)

                # do only one permutation at a time, if we have 2 or more columns;
                # every batch trains both with one blanked column and with none
                if len(self.input_column_names) > 1:
                    perms = [self.col_permutations[perm_index], []]
                    perm_index = perm_index + 1 if perm_index + 1 < len(
                        self.col_permutations) else 0
                else:
                    perms = [[]]

                for permutation in perms:
                    batch.blank_columns = permutation
                    response.batch = batch_number
                    logging.debug(
                        '[EPOCH-BATCH] Training on epoch: {epoch}/{num_epochs}, batch: {batch_number}'
                        .format(epoch=epoch + 1,
                                num_epochs=self.total_epochs,
                                batch_number=batch_number))
                    model_object.train()  # toggle to train
                    model_object.zeroGradOptimizer()
                    loss, batch_size = model_object.calculateBatchLoss(batch)
                    if batch_size <= 0:
                        break
                    total_samples += batch_size
                    # BUGFIX: the loss was previously truncated with int()
                    # before weighting, which zeroes any loss < 1 and corrupts
                    # the running average; accumulate the float value instead.
                    # We weight the error by the number of samples in the batch.
                    full_set_loss += loss.item() * batch_size
                    average_loss = full_set_loss / total_samples
                    loss.backward()  #retain_graph=True)
                    model_object.optimize()
                    response.loss = average_loss

                    yield response
Example #5
0
    def run(self):
        """
        Pull the transaction's input data via SQL and split it into
        train / test / validation index sets on the transaction object.

        LEARN transactions query the source database directly; PREDICT
        transactions rebuild the train metadata from the persisted model.
        Any other transaction type is rejected with an error.
        """

        # Handle transactions differently depending on the type of query
        # For now we only support LEARN and PREDICT

        train_metadata = self.transaction.metadata

        if self.transaction.metadata.type == TRANSACTION_PREDICT:

            self.populatePredictQuery()

            # for predictions, use the metadata the model was trained with
            train_metadata = TransactionMetadata()
            train_metadata.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        elif self.transaction.metadata.type not in [TRANSACTION_PREDICT, TRANSACTION_LEARN]:

            self.session.logging.error('Do not support transaction {type}'.format(type=self.transaction.metadata.type))
            self.transaction.error = True
            # NOTE(review): traceback.print_exc() prints and returns None,
            # so errorMsg ends up as None here — likely format_exc was meant
            self.transaction.errorMsg = traceback.print_exc(1)
            return



        query = self.prepareFullQuery(train_metadata)

        try:
            self.transaction.session.logging.info('About to pull query {query}'.format(query=query))
            conn = sqlite3.connect(self.transaction.metadata.storage_file)
            self.logging.info(self.transaction.metadata.model_query)
            df = pandas.read_sql_query(query, conn)
            # normalize pandas NaN to None
            result = df.where((pandas.notnull(df)), None)
            df = None # clean memory

        except Exception:

            self.session.logging.error(traceback.print_exc())
            self.transaction.error =True
            self.transaction.errorMsg = traceback.print_exc(1)
            return

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns

        # make sure every column we are asked to predict exists in the pulled data
        if len(data_array[0])>0 and  self.transaction.metadata.model_predict_columns:
            for col_target in self.transaction.metadata.model_predict_columns:
                if col_target not in self.transaction.input_data.columns:
                    err = 'Trying to predict column {column} but column not in source data'.format(column=col_target)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

        self.transaction.input_data.data_array = data_array

        # extract test data if this is a learn transaction and there is a test query
        if self.transaction.metadata.type == TRANSACTION_LEARN:

            if self.transaction.metadata.model_test_query:
                try:
                    # NOTE(review): query_wrapper, order_by_string and
                    # where_not_null_string are not defined in this method —
                    # this branch raises NameError (caught below) as written
                    test_query = query_wrapper.format(orig_query = self.transaction.metadata.model_test_query, order_by_string= order_by_string, where_not_null_string=where_not_null_string)
                    self.transaction.session.logging.info('About to pull TEST query {query}'.format(query=test_query))
                    #drill = self.session.drill.query(test_query, timeout=CONFIG.DRILL_TIMEOUT)
                    df = pandas.read_sql_query(test_query, conn)
                    result = df.where((pandas.notnull(df)), None)
                    df = None

                    #result = vars(drill)['data']
                except Exception:

                    # If testing offline, get results from a .cache file
                    self.session.logging.error(traceback.print_exc())
                    self.transaction.error = True
                    self.transaction.errorMsg = traceback.print_exc(1)
                    return

                columns = list(result.columns.values)
                data_array = result.values.tolist()

                # Make sure that test adn train sets match column wise
                if columns != self.transaction.input_data.columns:
                    err = 'Trying to get data for test but columns in train set and test set dont match'
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return
                # the test rows are appended after the train rows, so their
                # indexes start where the current data_array ends
                total_data_array = len(self.transaction.input_data.data_array)
                total_test_array =  len(data_array)
                test_indexes = [i for i in range(total_data_array, total_data_array+total_test_array)]

                self.transaction.input_data.test_indexes = test_indexes
                # make the input data relevant
                self.transaction.input_data.data_array += data_array

                # we later use this to either regenerate or not
                test_prob = 0

            else:
                test_prob = CONFIG.TEST_TRAIN_RATIO

            validation_prob = CONFIG.TEST_TRAIN_RATIO / (1-test_prob)

            group_by = self.transaction.metadata.model_group_by

            if group_by:
                try:
                    group_by_index = self.transaction.input_data.columns.index(group_by)
                except:
                    group_by_index = None
                    err = 'Trying to group by, {column} but column not in source data'.format(column=group_by)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

                # get unique group by values
                all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
                self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))
                df = pandas.read_sql_query(all_group_by_items_query, conn)
                result = df.where((pandas.notnull(df)), None)
                # create a list of values in group by, this is because result is array of array we want just array

                all_group_by_counts = {i[0]:i[1] for i in result.values.tolist()}
                all_group_by_values = all_group_by_counts.keys()

                max_group_by = max(list(all_group_by_counts.values()))

                self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

                # we will fill these depending on the test_prob and validation_prob
                test_group_by_values = []
                # NOTE(review): validation_group_by_values is never appended to
                # (the branch is commented out), so the group-by path never
                # assigns validation indexes
                validation_group_by_values = []
                train_group_by_values = []

                # split the data into test, validation, train by group by data
                for group_by_value in all_group_by_values:

                    # depending on a random number if less than x_prob belongs to such group
                    # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                    if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                        test_group_by_values += [group_by_value]
                    # elif float(random.random()) < validation_prob:
                    #     validation_group_by_values += [group_by_value]
                    else:
                        train_group_by_values += [group_by_value]

            # assign each row to a split (rows already marked as test are skipped)
            for i, row in enumerate(self.transaction.input_data.data_array):

                in_test = True if i in self.transaction.input_data.test_indexes else False
                if not in_test:
                    if group_by:

                        group_by_value = row[group_by_index]
                        if group_by_value in test_group_by_values :
                            self.transaction.input_data.test_indexes += [i]
                        elif group_by_value in train_group_by_values :
                            self.transaction.input_data.train_indexes += [i]
                        elif group_by_value in validation_group_by_values :
                            self.transaction.input_data.validation_indexes += [i]

                    else:
                        # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                        if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                            self.transaction.input_data.test_indexes += [i]
                        elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes)==0:
                            self.transaction.input_data.validation_indexes += [i]
                        else:
                            self.transaction.input_data.train_indexes += [i]

            # fallback: carve a test set off the tail of the train set
            if len(self.transaction.input_data.test_indexes) == 0:
                logging.debug('Size of test set is zero, last split')
                ratio = CONFIG.TEST_TRAIN_RATIO
                if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                    # it seems to be a good practice to not overfit, to double the ratio, as time series data tends to be abundant
                    ratio = ratio*2
                test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
                self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
                self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]

        logging.info('- Test: {size} rows'.format(size=len(self.transaction.input_data.test_indexes)))
        logging.info('- Train: {size} rows'.format(size=len(self.transaction.input_data.train_indexes)))
    def run(self):
        """
        Prepare the transaction's input data (in memory, via
        getPreparedInputDF) and split it into train / test / validation
        index sets on the transaction object.

        PREDICT transactions rebuild train metadata from the persisted
        model; LEARN transactions use the current metadata; any other
        type is rejected with an error.
        """

        # Handle transactions differently depending on the type of query
        # For now we only support LEARN and PREDICT

        # Train metadata is the metadata that was used when training the model,
        # note: that we need this train metadata even if we are predicting, so we can understand about the model
        train_metadata = None

        if self.transaction.metadata.type == TRANSACTION_PREDICT:
            # extract this from the persistent_model_metadata
            train_metadata = TransactionMetadata()
            train_metadata.setFromDict(
                self.transaction.persistent_model_metadata.train_metadata)

        elif self.transaction.metadata.type == TRANSACTION_LEARN:
            # Pull this straight from the the current transaction
            train_metadata = self.transaction.metadata

        else:
            # We cannot proceed without train metadata
            self.session.logging.error(
                'Do not support transaction {type}'.format(
                    type=self.transaction.metadata.type))
            self.transaction.error = True
            # NOTE(review): traceback.print_exc() prints and returns None,
            # so errorMsg ends up as None here — likely format_exc was meant
            self.transaction.errorMsg = traceback.print_exc(1)
            return

        result = self.getPreparedInputDF(train_metadata)

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns

        # make sure that the column we are trying to predict is on the input_data
        # else fail, because we cannot predict data we dont have
        # TODO: Revise this, I may pass a source data that doesnt have the column I want to predict and that may still be ok if we are making a prediction that is not time series
        if len(data_array[0]
               ) > 0 and self.transaction.metadata.model_predict_columns:
            for col_target in self.transaction.metadata.model_predict_columns:
                if col_target not in self.transaction.input_data.columns:
                    err = 'Trying to predict column {column} but column not in source data'.format(
                        column=col_target)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

        self.transaction.input_data.data_array = data_array

        # extract test data if this is a learn transaction and there is a test query
        if self.transaction.metadata.type == TRANSACTION_LEARN:

            # if a test_data set was given use it
            if self.transaction.metadata.test_from_data:
                df = self.transaction.metadata.test_from_data.df
                # normalize pandas NaN to None
                test_result = df.where((pandas.notnull(df)), None)

                columns = list(test_result.columns.values)
                data_array = test_result.values.tolist()

                # Make sure that test adn train sets match column wise
                if columns != self.transaction.input_data.columns:
                    err = 'Trying to get data for test but columns in train set and test set dont match'
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return
                # the test rows are appended after the train rows, so their
                # indexes start where the current data_array ends
                total_data_array = len(self.transaction.input_data.data_array)
                total_test_array = len(data_array)
                test_indexes = [
                    i for i in range(total_data_array, total_data_array +
                                     total_test_array)
                ]

                self.transaction.input_data.test_indexes = test_indexes
                # make the input data relevant
                self.transaction.input_data.data_array += data_array

                # we later use this to either regenerate or not
                test_prob = 0

            else:
                test_prob = CONFIG.TEST_TRAIN_RATIO

            validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

            group_by = self.transaction.metadata.model_group_by

            if group_by:
                try:
                    group_by_index = self.transaction.input_data.columns.index(
                        group_by)
                except:
                    group_by_index = None
                    err = 'Trying to group by, {column} but column not in source data'.format(
                        column=group_by)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

                # get unique group by values
                #all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
                #self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))

                # count rows per group-by value straight off the DataFrame
                uniques = result.groupby([group_by]).size()
                all_group_by_values = uniques.index.tolist()
                uniques_counts = uniques.values.tolist()

                # create a list of values in group by, this is because result is array of array we want just array

                all_group_by_counts = {
                    value: uniques_counts[i]
                    for i, value in enumerate(all_group_by_values)
                }

                max_group_by = max(list(all_group_by_counts.values()))

                self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

                # we will fill these depending on the test_prob and validation_prob
                test_group_by_values = []
                # NOTE(review): validation_group_by_values is never appended to
                # (the branch is commented out), so the group-by path never
                # assigns validation indexes
                validation_group_by_values = []
                train_group_by_values = []

                # split the data into test, validation, train by group by data
                for group_by_value in all_group_by_values:

                    # depending on a random number if less than x_prob belongs to such group
                    # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                    if float(random.random()) < test_prob and len(
                            train_group_by_values) > 0:
                        test_group_by_values += [group_by_value]
                    # elif float(random.random()) < validation_prob:
                    #     validation_group_by_values += [group_by_value]
                    else:
                        train_group_by_values += [group_by_value]

            # assign each row to a split (rows already marked as test are skipped)
            for i, row in enumerate(self.transaction.input_data.data_array):

                in_test = True if i in self.transaction.input_data.test_indexes else False
                if not in_test:
                    if group_by:

                        group_by_value = row[group_by_index]
                        if group_by_value in test_group_by_values:
                            self.transaction.input_data.test_indexes += [i]
                        elif group_by_value in train_group_by_values:
                            self.transaction.input_data.train_indexes += [i]
                        elif group_by_value in validation_group_by_values:
                            self.transaction.input_data.validation_indexes += [
                                i
                            ]

                    else:
                        # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                        if float(random.random()) <= test_prob or len(
                                self.transaction.input_data.test_indexes) == 0:
                            self.transaction.input_data.test_indexes += [i]
                        elif float(random.random()) <= validation_prob or len(
                                self.transaction.input_data.validation_indexes
                        ) == 0:
                            self.transaction.input_data.validation_indexes += [
                                i
                            ]
                        else:
                            self.transaction.input_data.train_indexes += [i]

            # fallback: carve a test set off the tail of the train set
            if len(self.transaction.input_data.test_indexes) == 0:
                logging.debug('Size of test set is zero, last split')
                ratio = CONFIG.TEST_TRAIN_RATIO
                if group_by and len(
                        self.transaction.input_data.train_indexes) > 2000:
                    # it seems to be a good practice to not overfit, to double the ratio, as time series data tends to be abundant
                    ratio = ratio * 2
                test_size = int(
                    len(self.transaction.input_data.train_indexes) * ratio)
                self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[
                    -test_size:]
                self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:
                                                                                                      -test_size]

            # final split summary
            test_len = len(self.transaction.input_data.test_indexes)
            train_len = len(self.transaction.input_data.train_indexes)
            validation_len = len(
                self.transaction.input_data.validation_indexes)
            total_len = test_len + train_len + validation_len
            logging.info('- Train: {size} rows'.format(size=train_len))
            logging.info('- Test: {size} rows'.format(size=test_len))
            logging.info(
                '- Validation: {size} rows'.format(size=validation_len))
            logging.info('-- Total: {size} rows'.format(size=total_len))