Ejemplo n.º 1
0
def train_and_test(model_definition, dataset_file, target_folder):
    """Train a Ludwig model on a CSV file, save it and print its accuracy.

    Args:
        model_definition: Model definition handed to ``LudwigModel``.
        dataset_file: CSV path used for both ``train`` and ``test``.
        target_folder: Destination passed to ``model.save``.
    """
    model = LudwigModel(model_definition, logging_level=DEBUG)
    # Always release the model, even when training, saving or testing raises.
    try:
        model.train(data_csv=dataset_file)
        model.save(target_folder)
        # optionally a separate test file can be supplied OR
        # Ludwig built-in "split" column mechanism can be used
        _, test_stats = model.test(data_csv=dataset_file)
        print(test_stats['combined']['accuracy'])
    finally:
        model.close()
Ejemplo n.º 2
0
def main():
    """Train and test a binary wine-quality classifier with Ludwig.

    Reads ``./winequality-red.csv``, relabels the ``quality`` column as 0
    (values below 6.5) or 1 (everything else), trains on the whole frame,
    tests on that same frame and saves the model under ``wine``.
    """
    data = pd.read_csv('./winequality-red.csv')
    # Vectorised relabelling. `~(x < 6.5)` rather than `x >= 6.5` so that
    # values failing the `<` comparison (e.g. NaN) still map to 1, exactly
    # like the "else" branch of a per-row check would.
    data["quality"] = (~(data["quality"] < 6.5)).astype(int)

    numerical_columns = (
        'fixed_acidity',
        'volatile_acidity',
        'citric_acid',
        'residual_sugar',
        'chlorides',
        'free_sulfur_dioxide',
        'total_sulfur_dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    )
    model_definition = {
        "input_features": [
            {'name': column, 'type': 'numerical'}
            for column in numerical_columns
        ],
        "output_features": [{
            'name': 'quality',
            'type': 'binary'
        }]
    }

    model = LudwigModel(model_definition)
    # Always release the model, even when a step below raises.
    try:
        model.train(data)
        model.test(data_df=data)
        model.save("wine")
    finally:
        model.close()
Ejemplo n.º 3
0
    def train(self):
        """Train the Ludwig model on the 'train' dataframe and save it.

        Builds the training dataframe and model definition, converts them to
        the timeseries format when ``lmd['model_order_by']`` lists columns,
        trains either a fresh model (``lmd['rebuild_model']`` is True) or the
        one stored at ``lmd['ludwig_data']['ludwig_save_path']``, records the
        'combined' accuracy history in ``lmd['model_accuracy']``, saves the
        model and stores the save path / model definition in the transaction
        metadata.
        """
        training_dataframe, model_definition = self._create_ludwig_dataframe('train')

        order_by = self.transaction.lmd['model_order_by']
        # Each order-by entry is indexable; its first element is the column.
        timeseries_cols = [] if order_by is None else [entry[0] for entry in order_by]

        if timeseries_cols:
            training_dataframe, model_definition = self._translate_df_to_timeseries_format(
                training_dataframe, model_definition, timeseries_cols, 'train')

        with disable_ludwig_output(True):
            # Online training (initialize_model + train_online) is not used:
            # the model can't be initialized without train_set_metadata and
            # that can't be obtained without running train, see
            # https://github.com/uber/ludwig/issues/295

            # Only build a fresh model when it is actually going to be used;
            # otherwise continue from the previously saved one.
            if self.transaction.lmd['rebuild_model'] is True:
                model = LudwigModel(model_definition)
            else:
                model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path'])

            train_stats = model.train(
                data_df=training_dataframe,
                model_name=self.transaction.lmd['name'],
                skip_save_model=True)

            model_accuracy = self.transaction.lmd['model_accuracy']
            for k in train_stats['train']:
                if k not in model_accuracy['train']:
                    # NOTE(review): the first time a key is seen only the
                    # empty lists are created; this run's accuracy values
                    # are not appended. Confirm that this is intended.
                    model_accuracy['train'][k] = []
                    model_accuracy['test'][k] = []
                elif k == 'combined':
                    # Accuracy is only available for 'combined'; other keys
                    # are skipped, which only matters for multi-output
                    # scenarios. Compare by value (==), not identity.
                    model_accuracy['train'][k].extend(train_stats['train'][k]['accuracy'])
                    model_accuracy['test'][k].extend(train_stats['test'][k]['accuracy'])

            ludwig_model_savepath = os.path.join(CONFIG.MINDSDB_STORAGE_PATH, self.transaction.lmd['name'] + '_ludwig_data')

        # Release the model even if saving fails.
        try:
            model.save(ludwig_model_savepath)
        finally:
            model.close()

        self.transaction.lmd['ludwig_data'] = {'ludwig_save_path': ludwig_model_savepath}
        self.transaction.hmd['ludwig_data'] = {'model_definition': model_definition}
Ejemplo n.º 4
0
def main():
    """Train a binary image classifier with Ludwig and print predictions.

    Reads ``testeste/train.csv`` for training and ``testeste/predict.csv``
    for prediction, trains a ``stacked_cnn`` image encoder with a binary
    ``class`` output, and prints the prediction frame.
    """
    folderName = "testeste"

    train_df = pd.read_csv(folderName + '/train.csv')
    print(train_df)
    predict_df = pd.read_csv(folderName + '/predict.csv')

    model_definition = {
        'input_features': [
            {'name': 'image_path', 'type': 'image', 'encoder': 'stacked_cnn'}
        ],
        'output_features': [
            {'name': 'class', 'type': 'binary'}
        ]
    }

    model = LudwigModel(model_definition)
    # Always release the model, even when training or prediction raises.
    try:
        model.train(data_df=train_df)

        # A previously saved model could be loaded instead of training:
        # LudwigModel.load("trainedModel")
        predictions = model.predict(data_df=predict_df)

        # NOTE: per the original author's note, the first column of
        # predictions.to_numpy() holds the predicted bool values.
        print("=========================PREDICTION 1=========================")
        print(predictions.to_string())
    finally:
        model.close()
Ejemplo n.º 5
0
    def train(self):
        """Train the Ludwig model on the 'train' dataframe and store it.

        Builds the training dataframe and model definition, converts them to
        the timeseries format when timeseries columns are reported, trains a
        freshly initialized model (``lmd['rebuild_model']`` is True) or the
        one loaded from ``self.get_model_dir()``, records the 'combined'
        accuracy history in ``lmd['model_accuracy']`` and moves the training
        output from ``results`` to the model's storage path.
        """
        training_dataframe, model_definition, timeseries_cols = self._create_ludwig_dataframe(
            'train')

        if len(timeseries_cols) > 0:
            training_dataframe, model_definition = self._translate_df_to_timeseries_format(
                training_dataframe, model_definition, timeseries_cols, 'train')

        with disable_console_output(True):
            # Online training (train_online) is not used, see
            # https://github.com/uber/ludwig/issues/295

            # While False, the model is not saved through `model.save`;
            # instead Ludwig's own output under ./results is moved into place.
            ludwig_save_is_working = False

            if not ludwig_save_is_working:
                # Start from a clean output directory so that the folder
                # picked up after training belongs to this run.
                shutil.rmtree('results', ignore_errors=True)

            if self.transaction.lmd['rebuild_model'] is True:
                model = LudwigModel(model_definition)
                merged_model_definition = model.model_definition
                train_set_metadata = build_metadata(
                    training_dataframe,
                    (merged_model_definition['input_features'] +
                     merged_model_definition['output_features']),
                    merged_model_definition['preprocessing'])
                model.initialize_model(train_set_metadata=train_set_metadata,
                                       gpus=self.get_useable_gpus())
            else:
                model = LudwigModel.load(model_dir=self.get_model_dir())

            # Both branches train with identical arguments.
            train_stats = model.train(
                data_df=training_dataframe,
                model_name=self.transaction.lmd['name'],
                skip_save_model=ludwig_save_is_working,
                skip_save_progress=True,
                gpus=self.get_useable_gpus())

            model_accuracy = self.transaction.lmd['model_accuracy']
            for k in train_stats['train']:
                if k not in model_accuracy['train']:
                    # NOTE(review): the first time a key is seen only the
                    # empty lists are created; this run's accuracy values
                    # are not appended. Confirm that this is intended.
                    model_accuracy['train'][k] = []
                    model_accuracy['test'][k] = []
                elif k == 'combined':
                    # Accuracy is only available for 'combined'; other keys
                    # are skipped, which only matters for multi-output
                    # scenarios. Compare by value (==), not identity.
                    model_accuracy['train'][k].extend(
                        train_stats['train'][k]['accuracy'])
                    model_accuracy['test'][k].extend(
                        train_stats['test'][k]['accuracy'])

        ludwig_model_savepath = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH,
            self.transaction.lmd['name'] + '_ludwig_data')
        if ludwig_save_is_working:
            model.save(ludwig_model_savepath)
            model.close()
        else:
            # NOTE(review): the model is not closed on this path; confirm
            # whether `model.close()` should be called here as well.
            shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
            shutil.move(os.path.join('results',
                                     os.listdir('results')[0]),
                        ludwig_model_savepath)
        self.transaction.lmd['ludwig_data'] = {
            'ludwig_save_path': ludwig_model_savepath
        }
        self.transaction.hmd['ludwig_data'] = {
            'model_definition': model_definition
        }
Ejemplo n.º 6
0
#!/usr/bin/env python
# coding: utf-8

# # Simple Model Training Example
#
# This example is the API example for this Ludwig command line example
# (https://uber.github.io/ludwig/examples/#kaggles-titanic-predicting-survivors).

# Import required libraries

import logging
import shutil

from ludwig.api import LudwigModel

# clean out prior results; a missing or partially removable directory is
# not an error for this example
shutil.rmtree('./results', ignore_errors=True)

# Define Ludwig model object that drive model training
model = LudwigModel(model_definition_file='./model1_definition.yaml',
                    logging_level=logging.INFO)

# initiate model training; the model is released even if training fails
try:
    train_stats = model.train(data_csv='./data/train.csv',
                              experiment_name='simple_experiment',
                              model_name='simple_model')
finally:
    model.close()
Ejemplo n.º 7
0
def test_savedmodel(csv_filename):
    """Check that a trained model survives both save formats unchanged.

    Trains a small model (one sequence input with the ``parallel_cnn``
    encoder, one category output, 2 epochs), saves it both with
    ``LudwigModel.save`` and as a TensorFlow SavedModel, then verifies that
    the reloaded Ludwig model and the restored SavedModel have weights close
    to the original ones and produce the same predictions.

    Args:
        csv_filename: Path handed to ``generate_data`` for the generated
            dataset; its directory also receives the temporary
            ``ludwigmodel`` and ``savedmodel`` folders.
    """
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]

    output_features = [category_feature(vocab_size=2)]

    # Name of the predictions column in the frames returned by predict()
    predictions_column_name = '{}_predictions'.format(
        output_features[0]['name'])

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    # All skip_save_* flags are set; the model is saved explicitly below
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    # Remove leftovers from a previous run before saving
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # save savedmodel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ##########################################
    # collect original predictions and weights
    ##########################################
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)
    # Copy the weights before the original model is closed
    original_weights = deepcopy(ludwig_model.model.model.trainable_variables)
    ludwig_model.close()

    ###################################################
    # load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    loaded_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    loaded_weights = deepcopy(ludwig_model.model.model.trainable_variables)

    ####################################################
    # restore savedmodel, obtain predictions and weights
    ####################################################
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # Preprocess the CSV with the metadata stored next to the saved model
    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    restored_model = tf.saved_model.load(savedmodel_path)

    # The model has exactly one input and one output feature (see Setup)
    if_name = list(ludwig_model.model.model.input_features.keys())[0]
    of_name = list(ludwig_model.model.model.output_features.keys())[0]

    data_to_predict = {
        if_name: tf.convert_to_tensor(dataset.dataset[if_name], dtype=tf.int32)
    }

    logits = restored_model(data_to_predict, False, None)

    # Pick the highest-scoring class index per row ...
    restored_predictions = tf.argmax(logits[of_name]['logits'],
                                     -1,
                                     name='predictions_{}'.format(of_name))
    # ... and map each index back to its label via the training metadata
    restored_predictions = tf.map_fn(
        lambda idx: train_set_metadata[of_name]['idx2str'][idx],
        restored_predictions,
        dtype=tf.string)

    restored_weights = deepcopy(restored_model.trainable_variables)

    #########
    # Cleanup
    #########
    # Done before the assertions so the folders are removed even when a
    # check below fails
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    # check for same number of weights as original model
    assert len(original_weights) == len(loaded_weights)
    assert len(original_weights) == len(restored_weights)

    # check to ensure weight values match the original model
    # (compared position by position, within np.isclose tolerances)
    loaded_weights_match = np.all([
        np.all(
            np.isclose(original_weights[i].numpy(), loaded_weights[i].numpy()))
        for i in range(len(original_weights))
    ])
    restored_weights_match = np.all([
        np.all(
            np.isclose(original_weights[i].numpy(),
                       restored_weights[i].numpy()))
        for i in range(len(original_weights))
    ])

    assert loaded_weights_match and restored_weights_match

    #  Are predictions identical to original ones?
    loaded_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        loaded_prediction_df[predictions_column_name])

    # SavedModel predictions are converted to 'str' before comparison
    restored_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        restored_predictions.numpy().astype('str'))

    assert loaded_predictions_match and restored_predictions_match
Ejemplo n.º 8
0
def test_savedmodel(csv_filename):
    """Check that Ludwig and SavedModel exports reproduce the trained model.

    Trains a small model (one sequence input with the ``parallel_cnn``
    encoder, one category output, 2 epochs), saves it both with
    ``LudwigModel.save`` and as a TensorFlow SavedModel, then verifies that
    the reloaded Ludwig model predicts like the original and that the
    SavedModel has identical weights and predictions to the reloaded model.

    Args:
        csv_filename: Path handed to ``generate_data`` for the generated
            dataset; its directory also receives the temporary
            ``ludwigmodel`` and ``savedmodel`` folders.
    """
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)

    # Single sequence input, single category output
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]
    input_feature_name = input_features[0]['name']
    # Graph tensor names used to feed / fetch through the raw TF session
    input_feature_tensor_name = '{}/{}_placeholder:0'.format(
        input_feature_name, input_feature_name)
    output_features = [category_feature(vocab_size=2)]
    output_feature_name = output_features[0]['name']
    output_feature_tensor_name = '{}/predictions_{}/predictions_{}:0'.format(
        output_feature_name, output_feature_name, output_feature_name)
    predictions_column_name = '{}_predictions'.format(output_feature_name)

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    # All skip_save_* flags are set; the model is saved explicitly below
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    #################
    # save savedmodel
    #################
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    ##############################
    # collect weight tensors names
    ##############################
    with ludwig_model.model.session:
        all_variables = tf.compat.v1.trainable_variables()
        all_variables_names = [v.name for v in all_variables]
    ludwig_model.close()

    ###################################################
    # load Ludwig model, obtain predictions and weights
    ###################################################
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    ludwig_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    ludwig_weights = ludwig_model.model.collect_weights(all_variables_names)
    ludwig_model.close()

    #################################################
    # load savedmodel, obtain predictions and weights
    #################################################
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)

    # Preprocess the CSV with the metadata stored next to the saved model
    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    with tf.compat.v1.Session() as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.SERVING],
                                   savedmodel_path)

        predictions = sess.run(output_feature_tensor_name,
                               feed_dict={
                                   input_feature_tensor_name:
                                   dataset.get(input_feature_name),
                               })

        # Map each predicted index back to its label
        savedmodel_prediction_df = pd.DataFrame(
            data=[
                train_set_metadata[output_feature_name]["idx2str"][p]
                for p in predictions
            ],
            columns=[predictions_column_name])

        # Fetch every trainable variable by name, keyed by that name
        savedmodel_weights = sess.run({n: n for n in all_variables_names})

    #########
    # Cleanup
    #########
    # Done before the assertions so the folders are removed even when a
    # check below fails
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    ###############################################
    # Check if weights and predictions are the same
    ###############################################

    # Evaluate every comparison once, report all of them, then assert.
    weights_match = [
        (var, np.all(ludwig_weights[var] == savedmodel_weights[var]))
        for var in all_variables_names
    ]
    loaded_predictions_match = np.all(
        original_predictions_df[predictions_column_name] ==
        ludwig_prediction_df[predictions_column_name])
    savedmodel_predictions_match = np.all(
        ludwig_prediction_df[predictions_column_name] ==
        savedmodel_prediction_df[predictions_column_name])

    for var, match in weights_match:
        print("Are the weights in {} identical?".format(var), match)
    print("Are loaded model predictions identical to original ones?",
          loaded_predictions_match)
    print("Are savedmodel predictions identical to loaded model?",
          savedmodel_predictions_match)

    for var, match in weights_match:
        assert match
    assert loaded_predictions_match
    assert savedmodel_predictions_match
Ejemplo n.º 9
0
    def train(self):
        """Train the Ludwig model on the 'train' dataframe and store it.

        Builds the training dataframe and model definition, converts them to
        the timeseries format when timeseries columns are reported, trains a
        freshly initialized model (``lmd['rebuild_model']`` is True) or the
        one loaded from ``self._get_model_dir()`` - in consecutive row
        batches when the frame is longer than the batch limit - records the
        'combined' accuracy history in ``lmd['model_accuracy']`` and moves
        the training output from ``results`` to the model's storage path.
        """
        training_dataframe, model_definition, timeseries_cols, has_heavy_data, self.transaction.lmd[
            'ludwig_tf_self_col_map'] = self._create_ludwig_dataframe('train')

        if len(timeseries_cols) > 0:
            training_dataframe, model_definition = self._translate_df_to_timeseries_format(
                training_dataframe, model_definition, timeseries_cols, 'train')

        with disable_console_output(True):
            # Online training (train_online) is not used, see
            # https://github.com/uber/ludwig/issues/295

            # While False, the model is not saved through `model.save`;
            # instead Ludwig's own output under ./results is moved into place.
            ludwig_save_is_working = False

            if not ludwig_save_is_working:
                # Start from a clean output directory so that only folders
                # written by this call are present afterwards.
                shutil.rmtree('results', ignore_errors=True)

            if self.transaction.lmd['rebuild_model'] is True:
                model = LudwigModel(model_definition)
                merged_model_definition = model.model_definition
                train_set_metadata = build_metadata(
                    training_dataframe,
                    (merged_model_definition['input_features'] +
                     merged_model_definition['output_features']),
                    merged_model_definition['preprocessing'])
                model.initialize_model(train_set_metadata=train_set_metadata,
                                       gpus=self._get_useable_gpus())
            else:
                model = LudwigModel.load(model_dir=self._get_model_dir())

            def fit(frame):
                # Single place for the training arguments shared by the
                # batched and the un-batched path.
                return model.train(
                    data_df=frame,
                    model_name=self.transaction.lmd['name'],
                    skip_save_model=ludwig_save_is_working,
                    skip_save_progress=True,
                    gpus=self._get_useable_gpus())

            # Maximum rows per train call: 20 million, or 40 when
            # `has_heavy_data` is set.
            split_by = 40 if has_heavy_data else int(20 * pow(10, 6))
            df_len = len(training_dataframe[training_dataframe.columns[0]])
            if df_len > split_by:
                # range() always advances, so the loop cannot spin on an
                # empty slice; every slice here holds at least one row.
                for i in range(0, df_len, split_by):
                    end = i + split_by
                    self.transaction.log.info(
                        f'Training with batch from index {i} to index {end}')
                    training_sample = training_dataframe.iloc[i:end].reset_index()
                    # train_stats ends up holding the last batch's statistics
                    train_stats = fit(training_sample)
            else:
                train_stats = fit(training_dataframe)

            model_accuracy = self.transaction.lmd['model_accuracy']
            for k in train_stats['train']:
                if k not in model_accuracy['train']:
                    # NOTE(review): the first time a key is seen only the
                    # empty lists are created; this run's accuracy values
                    # are not appended. Confirm that this is intended.
                    model_accuracy['train'][k] = []
                    model_accuracy['test'][k] = []
                elif k == 'combined':
                    # Accuracy is only available for 'combined'; other keys
                    # are skipped, which only matters for multi-output
                    # scenarios.
                    model_accuracy['train'][k].extend(
                        train_stats['train'][k]['accuracy'])
                    model_accuracy['test'][k].extend(
                        train_stats['test'][k]['accuracy'])

        ludwig_model_savepath = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                             self.transaction.lmd['name'],
                                             'ludwig_data')
        Path(CONFIG.MINDSDB_STORAGE_PATH).joinpath(
            self.transaction.lmd['name']).mkdir(mode=0o777,
                                                exist_ok=True,
                                                parents=True)
        if ludwig_save_is_working:
            model.save(ludwig_model_savepath)
            model.close()
        else:
            # NOTE(review): the model is not closed on this path; confirm
            # whether `model.close()` should be called here as well.
            shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
            # os.listdir returns entries in arbitrary order and batched
            # training may leave several run folders, so pick the most
            # recently modified one instead of whichever is listed first.
            # NOTE(review): assumes each train call writes its own folder
            # under ./results - confirm against the Ludwig version in use.
            run_dirs = [
                os.path.join('results', name)
                for name in os.listdir('results')
            ]
            shutil.move(max(run_dirs, key=os.path.getmtime),
                        ludwig_model_savepath)
        self.transaction.lmd['ludwig_data'] = {
            'ludwig_save_path': ludwig_model_savepath
        }
        self.transaction.hmd['ludwig_data'] = {
            'model_definition': model_definition
        }