def test_learn_and_predict_nnmixer(self):
    """Smoke-test learning and predicting with ``NnMixer`` on home rentals.

    The test makes no assertions; it passes when ``learn`` and ``predict``
    both complete without raising.
    """
    input_features = [
        {'name': 'sqft', 'type': 'numeric'},
        {'name': 'days_on_market', 'type': 'numeric'},
        {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
    ]
    output_features = [
        {
            'name': 'number_of_rooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 0.5, '3': 0.7, '4': 1},
        },
        {
            'name': 'number_of_bathrooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 4},
        },
        {'name': 'rental_price', 'type': 'numeric'},
        {'name': 'location', 'type': 'categorical'},
    ]
    mixer = {
        'class': NnMixer,
        'kwargs': {
            'eval_every_x_epochs': 4,
            'stop_training_after_seconds': 10,
        },
    }
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'mixer': mixer,
    }

    rentals = pd.read_csv(
        'https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv'
    )

    predictor = Predictor(config)
    predictor.learn(from_data=rentals)

    # Predict from the input columns only: strip every target column first.
    target_columns = [feature['name'] for feature in config['output_features']]
    inputs_only = rentals.drop(target_columns, axis=1)
    predictor.predict(when_data=inputs_only)
def test_multiple_categories_as_output(self):
    """Learn a multi-category ``tags`` output from two categorical index inputs.

    ``x1`` and ``x2`` each hold the vocabulary index of one tag, or ``-1`` when
    that tag is absent, so ``tags`` is fully determined by the inputs.

    NOTE: the original author annotated this test with
    "fails: AssertionError: 0.0 not greater than or equal to 0.15".
    """
    vocab = self.get_vocab(10)
    n_points = 10000

    def draw_index_column():
        # One entry per row: a random vocab index, or -1 for "tag missing".
        column = []
        for _ in range(n_points):
            if random.random() > 0.2:
                column.append(random.randint(0, len(vocab) - 1))
            else:
                column.append(-1)
        return column

    x1 = draw_index_column()
    x2 = draw_index_column()

    tags = []
    for first_index, second_index in zip(x1, x2):
        # vocab.get() yields None for the -1 placeholder; drop those.
        present = set([vocab.get(first_index), vocab.get(second_index)])
        tags.append([tag for tag in present if tag is not None])

    df = pd.DataFrame({'x1': x1, 'x2': x2, 'tags': tags})

    config = {
        'input_features': [
            {'name': 'x1', 'type': ColumnDataTypes.CATEGORICAL},
            {'name': 'x2', 'type': ColumnDataTypes.CATEGORICAL},
        ],
        'output_features': [
            {'name': 'tags', 'type': ColumnDataTypes.MULTIPLE_CATEGORICAL},
        ],
        'mixer': {
            'class': NnMixer,
            'kwargs': {'stop_training_after_seconds': 25},
        },
    }

    # 90% of the rows train the model, the remaining 10% are held out.
    split_at = round(n_points * 0.9)
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]

    predictor = Predictor(config)
    predictor.learn(from_data=df_train)

    # Score the training split first, then the held-out split.
    for label, frame in (('Train f1 score', df_train), ('Test f1 score', df_test)):
        predictions = predictor.predict(when_data=frame)
        encoder = predictor._mixer.encoders['tags']
        actual_encoded = encoder.encode(frame.tags)
        predicted_encoded = encoder.encode(predictions['tags']['predictions'])
        score = f1_score(actual_encoded, predicted_encoded, average='weighted')
        print(label, score)
        self.assertGreaterEqual(score, 0.15)
def test_multiple_categories_as_input(self):
    """Learn a numeric target from a multi-category ``tags`` input.

    Each row carries up to two distinct tags drawn from the vocabulary and
    ``y`` is the sum of the vocabulary indices of those tags.
    """
    vocab = self.get_vocab(10)
    n_points = 10000

    tags, y = [], []
    for _ in range(n_points):
        chosen_tags = []
        index_sum = 0
        for _slot in range(2):
            # Each of the two slots is left empty 20% of the time.
            if random.random() <= 0.2:
                continue
            selected_index = random.randint(0, len(vocab) - 1)
            tag = vocab[selected_index]
            # A tag drawn twice counts only once.
            if tag in chosen_tags:
                continue
            chosen_tags.append(tag)
            index_sum += selected_index
        tags.append(chosen_tags)
        y.append(index_sum)

    df = pd.DataFrame({'tags': tags, 'y': y})

    config = {
        'input_features': [
            {'name': 'tags', 'type': ColumnDataTypes.MULTIPLE_CATEGORICAL},
        ],
        'output_features': [
            {'name': 'y', 'type': ColumnDataTypes.NUMERIC},
        ],
        'mixer': {
            'class': NnMixer,
            'kwargs': {'stop_training_after_seconds': 10},
        },
    }

    split_at = round(n_points * 0.9)
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]

    predictor = Predictor(config)
    predictor.learn(from_data=df_train)

    predicted_y = predictor.predict(when_data=df_test)['y']['predictions']
    score = r2_score(df_test.y, predicted_y)
    print('Test R2 score', score)

    # The threshold is deliberately low: training is capped at a few seconds,
    # so this only checks that the model learns something, not how well.
    self.assertGreaterEqual(score, 0.15)
def test_home_rentals(self):
    """Train on home rentals, predict, round-trip through ``test.pkl``, predict again.

    ``USE_CUDA``, ``PLINEAR``, ``CACHE_ENCODED_DATA`` and ``SELFAWARE`` are not
    defined in this method; they are resolved from an enclosing scope.
    """
    lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
    lightwood.config.config.CONFIG.PLINEAR = PLINEAR

    input_features = [
        {'name': 'sqft', 'type': 'numeric'},
        {'name': 'days_on_market', 'type': 'numeric'},
        {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
    ]
    output_features = [
        {
            'name': 'number_of_rooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 0.5, '3': 0.7, '4': 1},
        },
        {
            'name': 'number_of_bathrooms',
            'type': 'categorical',
            'weights': {'0': 0.8, '1': 0.6, '2': 4},
        },
        {'name': 'rental_price', 'type': 'numeric'},
        {'name': 'location', 'type': 'categorical'},
    ]
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'data_source': {'cache_transformed_data': CACHE_ENCODED_DATA},
        'mixer': {
            'class': NnMixer,
            'kwargs': {
                'selfaware': SELFAWARE,
                'eval_every_x_epochs': 4,
                'stop_training_after_seconds': 80,
            },
        },
    }

    rentals = pd.read_csv(
        'https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv'
    )

    predictor = Predictor(config)
    predictor.learn(from_data=rentals)

    # Bulk prediction from the input columns only.
    target_columns = [feature['name'] for feature in config['output_features']]
    predictor.predict(when_data=rentals.drop(target_columns, axis=1))

    # Save and reload, then make sure the reloaded predictor still answers.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    for step in range(100):
        result = predictor.predict(when={'sqft': round(step * 10)})
        pred = result['number_of_rooms']['predictions'][0]
        assert isinstance(pred, (str, int))
'name': 'next', 'type': 'numeric' }] } def iter_function(epoch, error, test_error, test_error_gradient): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}' .format(iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy)) data = pandas.DataFrame(ts_data, columns=['time', 'ts', 'next']) predictor = Predictor(config) predictor.learn(from_data=data, callback_on_iter=iter_function, eval_every_x_epochs=10) ret = predictor.predict(when={ 'ts': " ".join([str(math.sin(i / max)) for i in range(10 + 1, 10 + ts_len)]) }) print(" ".join( [str(math.sin(i / max)) for i in range(10 + 1, 10 + ts_len + 1)])) print(ret)
def run_full_test(USE_CUDA, CACHE_ENCODED_DATA, SELFAWARE, PLINEAR):
    """Run the full home_rentals example: train, predict, save, reload, predict.

    Args:
        USE_CUDA: Value assigned to ``lightwood.config.config.CONFIG.USE_CUDA``.
        CACHE_ENCODED_DATA: Value used for ``cache_transformed_data`` in the
            ``data_source`` section of the predictor config.
        SELFAWARE: Value used for ``selfaware`` in the ``mixer`` section of
            the predictor config.
        PLINEAR: Value assigned to ``lightwood.config.config.CONFIG.PLINEAR``.
    """
    lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
    lightwood.config.config.CONFIG.PLINEAR = PLINEAR

    config = {
        'input_features': [
            {'name': 'number_of_bathrooms', 'type': 'numeric'},
            {'name': 'sqft', 'type': 'numeric'},
            {'name': 'location', 'type': 'categorical'},
            {'name': 'days_on_market', 'type': 'numeric'},
            {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
            {'name': 'rental_price', 'type': 'numeric'},
        ],
        'output_features': [{
            'name': 'number_of_rooms',
            'type': 'categorical',
            # 'weights':{
            #     '0': 0.8,
            #     '1': 0.6,
            #     '2': 0.5,
            #     '3': 0.7,
            #     '4': 1,
            # }
        }],
        'data_source': {
            'cache_transformed_data': CACHE_ENCODED_DATA,
        },
        'mixer': {
            'class': lightwood.BUILTIN_MIXERS.NnMixer,
            'selfaware': SELFAWARE,
        },
    }

    df = pd.read_csv(
        "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
    )

    def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy):
        """Print the metrics handed to the training callback."""
        # The previous .format() call also passed accuracy=predictor.train_accuracy,
        # which the template never referenced; that unused lookup is gone.
        print(
            f'epoch: {epoch}, error: {error}, test_error: {test_error}, '
            f'test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
        )

    predictor = Predictor(config)
    # stop_training_after_seconds given in order to not get timeouts in travis
    predictor.learn(
        from_data=df,
        callback_on_iter=iter_function,
        eval_every_x_epochs=4,
        stop_training_after_seconds=40,
    )

    # Predict from the input columns only.
    df = df.drop([x['name'] for x in config['output_features']], axis=1)
    predictor.predict(when_data=df)

    # Round-trip through disk and check the reloaded predictor still predicts.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Tally how often each predicted value comes back.
    preds = {}
    for j in range(100):
        pred = predictor.predict(
            when={'sqft': round(j * 10)})['number_of_rooms']['predictions'][0]
        preds[pred] = preds.get(pred, 0) + 1
}], 'output_features': [{ 'name': 'superclass', 'type': 'categorical', 'encoder_attrs': {} }, { 'name': 'class', 'type': 'categorical', 'encoder_attrs': {} }] } predictor = Predictor(config) def iter_function(epoch, error, test_error, test_error_gradient): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, accuracy: {accuracy}' .format(iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy)) predictor.learn(from_data=pd.read_csv('train_sample.csv'), callback_on_iter=iter_function, eval_every_x_epochs=1) results = predictor.predict(when_data=pd.read_csv('test_sample.csv')) print(results)
'mixer': { 'class': lightwood.BUILTIN_MIXERS.BayesianNnMixer } } #'mixer':{'class': lightwood.BUILTIN_MIXERS.NnMixer}} df = pd.read_csv( "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv") predictor = Predictor(config) def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy): print( 'epoch: {iter}, error: {error}, test_error: {test_error}, test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}' .format(iter=epoch, error=error, test_error=test_error, test_error_gradient=test_error_gradient, accuracy=predictor.train_accuracy, test_accuracy=test_accuracy)) predictor.learn(from_data=df, callback_on_iter=iter_function, eval_every_x_epochs=2, stop_training_after_seconds=20) print(predictor.predict(when_data=df.iloc[0:20])) #print(predictor.predict(when={'number_of_rooms':3, 'number_of_bathrooms':2, 'sqft':700, 'location':'great'}))
# `data` is not defined in this chunk; presumably it is built earlier in the script.
data_frame = pandas.DataFrame(data)
print(data_frame)

predictor = Predictor(output=['z'])


def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy):
    """Print the metrics handed to the training callback."""
    # The previous .format() call also passed accuracy=predictor.train_accuracy,
    # which the template never referenced; that unused lookup is gone.
    print(
        f'epoch: {epoch}, error: {error}, test_error: {test_error}, '
        f'test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
    )


predictor.learn(from_data=data_frame, callback_on_iter=iter_function)

print('accuracy')
print(predictor.train_accuracy)
print('accuracy over all dataset')
print(predictor.calculate_accuracy(from_data=data_frame))

# Single-row prediction.
when = {'x': [1], 'y': [0]}
print('- multiply when. {when}'.format(when=when))
print(predictor.predict(when=when))

# saving the predictor
predictor.save('ok.pkl')

# loading the predictor
predictor2 = Predictor(load_from_path='ok.pkl')
when = {'x': [0, 0, 1, -1, 1], 'y': [0, 1, -1, -1, 1]}
# One time-series input column ('ts') and one numeric output column ('next').
config = {
    'input_features': [
        {'name': 'ts', 'type': COLUMN_DATA_TYPES.TIME_SERIES},
    ],
    'output_features': [
        {'name': 'next', 'type': 'numeric'},
    ],
}


def iter_function(epoch, training_error, test_error, test_error_gradient,
                  test_accuracy):
    """Print the training metrics for one evaluation step.

    Note: within this chunk the function is defined but not passed to
    ``predictor.learn``.
    """
    print(
        f'Epoch: {epoch}, Train Error: {training_error}, '
        f'Test Error: {test_error}, '
        f'Test Error Gradient: {test_error_gradient}, '
        f'Test Accuracy: {test_accuracy}'
    )


# `ts_data`, `ts_len` and `max` are not defined in this chunk; they come from
# elsewhere in the script.
data = pandas.DataFrame(ts_data, columns=['time', 'ts', 'next'])

predictor = Predictor(config)
predictor.learn(from_data=data)

print('\n\n')

# Query with the sine samples for i in [11, 10 + ts_len).
query_series = [math.sin(i / max) for i in range(10 + 1, 10 + ts_len)]
ret = predictor.predict(when={'ts': query_series})

# Print the same series extended by one more sample, for comparison.
print([math.sin(i / max) for i in range(10 + 1, 10 + ts_len + 1)])

print('Got predictions: ')
print(ret)
# `data` is not defined in this chunk; presumably it is built earlier in the script.
data_frame = pandas.DataFrame(data)
print(data_frame)

predictor = Predictor(output=['z'])


def feedback(iter, error, test_error, test_error_gradient):
    """Print the metrics for one training iteration plus the train accuracy."""
    # predictor.stop_training()
    accuracy = predictor.train_accuracy
    print(
        f'iteration: {iter}, error: {error}, test_error: {test_error}, '
        f'test_error_gradient: {test_error_gradient}, accuracy: {accuracy}'
    )


predictor.learn(from_data=data_frame, callback_on_iter=feedback)

print('accuracy')
print(predictor.train_accuracy)

print('accuracy over all dataset')
overall_accuracy = predictor.calculate_accuracy(from_data=data_frame)
print(overall_accuracy)

# Single-row prediction.
when = {'x': [1], 'y': [0]}
print(f'- multiply when. {when}')
single_result = predictor.predict(when=when)
print(single_result)

# saving the predictor
predictor.save('/tmp/ok.pkl')

# loading the predictor
predictor2 = Predictor(load_from_path='/tmp/ok.pkl')
when = {'x': [0, 0, 1, -1, 1], 'y': [0, 1, -1, -1, 1]}
df_train = pandas.DataFrame(data_train) df_test = pandas.DataFrame(data_test) predictor = Predictor(output=['z']) def iter_function(epoch, training_error, test_error, test_error_gradient, test_accuracy): print( f'Epoch: {epoch}, Train Error: {training_error}, Test Error: {test_error}, Test Error Gradient: {test_error_gradient}, Test Accuracy: {test_accuracy}' ) predictor.learn(from_data=df_train, callback_on_iter=iter_function, eval_every_x_epochs=200) predictor.save('ok.pkl') predictor = Predictor(load_from_path='ok.pkl') print('Train accuracy: ', predictor.train_accuracy) print('Test accuracy: ', predictor.calculate_accuracy(from_data=df_test)) predictions = predictor.predict(when_data=df_test) print('Confidence mean for both x and y present: ', np.mean(predictions['z']['selfaware_confidences'])) print(list(df_test['z'])[30:60]) print(predictions['z']['predictions'][30:60]) predictions = predictor.predict(when_data=df_test.drop(columns=['x'])) print('Confidence mean for x missing: ',
def run_test(USE_CUDA, CACHE_ENCODED_DATA, SELFAWARE, PLINEAR):
    """Run a short home_rentals train / save / reload / predict cycle.

    Args:
        USE_CUDA: Value assigned to ``lightwood.config.config.CONFIG.USE_CUDA``.
        CACHE_ENCODED_DATA: Value assigned to
            ``lightwood.config.config.CONFIG.CACHE_ENCODED_DATA``.
        SELFAWARE: Value assigned to ``lightwood.config.config.CONFIG.SELFAWARE``.
        PLINEAR: Value assigned to ``lightwood.config.config.CONFIG.PLINEAR``.
    """
    lightwood.config.config.CONFIG.USE_CUDA = USE_CUDA
    lightwood.config.config.CONFIG.CACHE_ENCODED_DATA = CACHE_ENCODED_DATA
    lightwood.config.config.CONFIG.SELFAWARE = SELFAWARE
    lightwood.config.config.CONFIG.PLINEAR = PLINEAR
    ####################

    config = {
        'input_features': [
            {'name': 'number_of_bathrooms', 'type': 'numeric'},
            {'name': 'sqft', 'type': 'numeric'},
            {'name': 'location', 'type': 'categorical'},
            {'name': 'days_on_market', 'type': 'numeric'},
            {'name': 'neighborhood', 'type': 'categorical', 'dropout': 0.4},
            {'name': 'rental_price', 'type': 'numeric'},
        ],
        'output_features': [{
            'name': 'number_of_rooms',
            'type': 'categorical',
            # 'weights':{
            #     '0': 0.8,
            #     '1': 0.6,
            #     '2': 0.5,
            #     '3': 0.7,
            #     '4': 1,
            # }
        }],
        'mixer': {
            'class': lightwood.BUILTIN_MIXERS.NnMixer,
        },
    }

    # AX doesn't seem to work on the travis version of windows, so don't test it there as of now
    if sys.platform not in ['win32', 'cygwin', 'windows']:
        pass
        #config['optimizer'] = lightwood.model_building.BasicAxOptimizer

    df = pd.read_csv(
        "https://mindsdb-example-data.s3.eu-west-2.amazonaws.com/home_rentals.csv"
    )

    def iter_function(epoch, error, test_error, test_error_gradient, test_accuracy):
        """Print the metrics handed to the training callback."""
        # The previous .format() call also passed accuracy=predictor.train_accuracy,
        # which the template never referenced; that unused lookup is gone.
        print(
            f'epoch: {epoch}, error: {error}, test_error: {test_error}, '
            f'test_error_gradient: {test_error_gradient}, test_accuracy: {test_accuracy}'
        )

    predictor = Predictor(config)
    # stop_training_after_seconds given in order to not get timeouts in travis
    predictor.learn(
        from_data=df,
        callback_on_iter=iter_function,
        eval_every_x_epochs=1,
        stop_training_after_seconds=1,
    )

    # First round-trip: save right after training and continue with the reloaded copy.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Predict from the input columns only.
    df = df.drop([x['name'] for x in config['output_features']], axis=1)
    predictor.predict(when_data=df)

    # Second round-trip, after a bulk prediction has been made.
    predictor.save('test.pkl')
    predictor = Predictor(load_from_path='test.pkl')

    # Tally how often each predicted value comes back.
    preds = {}
    for j in range(100):
        pred = predictor.predict(
            when={'sqft': round(j * 10)})['number_of_rooms']['predictions'][0]
        preds[pred] = preds.get(pred, 0) + 1