class NativeDataFrame(BasePredictor):
    def __init__(self, dataset):
        super().__init__(dataset)
        self.df = pd.read_csv(f"{self.dataset.name}_test.csv")
        self.predictor = Predictor(name=self.dataset.name)

    def predict(self, row_number=1):
        return self.predictor.predict(self.df[:row_number])
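A minimal usage sketch for the wrapper above; the `Dataset` stand-in and the file/predictor names are assumptions (any object with a `.name` attribute matching a trained predictor, plus a `<name>_test.csv` file on disk, would do):

# Hypothetical stand-in for the benchmark's dataset descriptor.
class Dataset:
    name = 'home_rentals'

# Assumes a predictor named 'home_rentals' has already been trained and
# that 'home_rentals_test.csv' exists on disk.
bench = NativeDataFrame(Dataset())
print(bench.predict(row_number=10))  # predictions for the first 10 test rows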
def test_regressor():
    """
    Sanity check: MindsDB point predictions should fall within the confidence
    bounds produced by the inductive conformal predictor.
    """
    def _df_from_x(x, columns=None):
        x = pd.DataFrame(x)
        if columns is None:
            x.columns = 'c' + pd.Series(range(len(x.columns))).astype(str)
        else:
            x.columns = columns
        return x

    def _df_from_xy(x, y, target):
        x = _df_from_x(x)
        x[target] = pd.DataFrame(y)
        return x

    X, y = load_boston(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=0.1, random_state=5)

    target = 'medv'

    x_tr = _df_from_xy(X_train, Y_train, target)
    p = Predictor("ConformalTest")
    p.learn(from_data=x_tr, to_predict=target)

    x_te = _df_from_xy(X_test, Y_test, target)
    r = p.predict(when_data=x_te)
    r = [x.explanation[target] for x in r]

    for x in r:
        assert x['confidence_interval'][0] <= x['predicted_value'] <= x['confidence_interval'][1]
class NativeClickhouse(BasePredictor):
    host = CONFIG['database']['host']
    port = CONFIG['database']['port']
    user = CONFIG['database']['user']
    password = CONFIG['database']['password']

    def __init__(self, dataset):
        super().__init__(dataset)
        self.predictor = Predictor(name=self.dataset.name)
        self.query_template = f"SELECT * FROM test_data.{self.dataset.name} LIMIT %s"

    def predict(self, row_number=1):
        _query = self.query_template % row_number
        return self.predictor.predict(when_data=ClickhouseDS(
            _query, host=self.host, user=self.user, password=self.password))
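Note the design difference from `NativeDataFrame`: the `%s` placeholder in `query_template` is filled in locally before the query string is handed to `ClickhouseDS`, so the `LIMIT` is applied by ClickHouse itself and only the requested rows are transferred, rather than the whole table being loaded and sliced client-side.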
def test_postgres_ds():
    import pg8000
    from mindsdb_native.libs.data_sources.postgres_ds import PostgresDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DBNAME = 'postgres'
    PORT = 5432

    con = pg8000.connect(database=DBNAME, user=USER, password=PASSWORD,
                         host=HOST, port=PORT)
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute(
        'CREATE TABLE test_mindsdb(col_1 Text, col_2 Int, col_3 Boolean, col_4 Date, col_5 Int [])'
    )
    for i in range(0, 200):
        dt = datetime.datetime.now() - datetime.timedelta(days=i)
        dt_str = dt.strftime('%Y-%m-%d')
        cur.execute(
            f'INSERT INTO test_mindsdb VALUES (\'String {i}\', {i}, {i % 2 == 0}, \'{dt_str}\', ARRAY [1, 2, {i}])'
        )
    con.commit()
    con.close()

    postgres_ds = PostgresDS(table='test_mindsdb', host=HOST, user=USER,
                             password=PASSWORD, database=DBNAME, port=PORT)

    assert postgres_ds.name() == 'PostgresDS: postgres/test_mindsdb'
    assert len(postgres_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=postgres_ds)
def test_mongodb_ds():
    from pymongo import MongoClient
    from mindsdb_native.libs.data_sources.mongodb_ds import MongoDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'database'
    COLLECTION_NAME = 'test_mindsdb'
    PORT = 27017

    con = MongoClient(host=HOST, port=PORT, username=USER, password=PASSWORD)

    db = con[DATABASE]
    if COLLECTION_NAME in db.list_collection_names():
        db[COLLECTION_NAME].drop()

    collection = db[COLLECTION_NAME]
    for i in range(0, 200):
        collection.insert_one({
            'col_1': "This is string number {}".format(i),
            'col_2': i,
            'col_3': (i % 2) == 0
        })

    mongodb_ds = MongoDS(collection=COLLECTION_NAME, query={}, host=HOST,
                         port=PORT, user=USER, password=PASSWORD,
                         database=DATABASE)

    assert mongodb_ds.name() == 'MongoDS: database/test_mindsdb'
    assert len(mongodb_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=mongodb_ds)
def test_mssql_ds():
    import pytds
    from mindsdb_native.libs.data_sources.ms_sql_ds import MSSQLDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'master'
    PORT = 1433

    with pytds.connect(dsn=HOST, user=USER, password=PASSWORD,
                       database=DATABASE) as con:
        with con.cursor() as cur:
            cur.execute(
                "IF OBJECT_ID('dbo.test_mindsdb') IS NOT NULL DROP TABLE dbo.test_mindsdb"
            )
            cur.execute(
                'CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BIT)'
            )
            for i in range(0, 200):
                cur.execute(
                    f"INSERT INTO test_mindsdb ([col_1], [col_2], [col_3]) VALUES ('This is string number {i}', {i}, {i % 2})"
                )
        con.commit()

    mssql_ds = MSSQLDS(table='test_mindsdb', host=HOST, user=USER,
                       password=PASSWORD, database=DATABASE, port=PORT)

    assert mssql_ds.name() == 'MSSQLDS: master/test_mindsdb'
    assert len(mssql_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=mssql_ds)
def test_mysql_ds():
    import mysql.connector
    from mindsdb_native.libs.data_sources.mysql_ds import MySqlDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DATABASE = 'mysql'
    PORT = 3306

    con = mysql.connector.connect(host=HOST, port=PORT, user=USER,
                                  password=PASSWORD, database=DATABASE)
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute('CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BOOL)')
    for i in range(0, 200):
        cur.execute(
            f'INSERT INTO test_mindsdb VALUES ("This is string number {i}", {i}, {i % 2 == 0})'
        )
    con.commit()
    con.close()

    mysql_ds = MySqlDS(table='test_mindsdb', host=HOST, user=USER,
                       password=PASSWORD, database=DATABASE, port=PORT)

    assert mysql_ds.name() == 'MySqlDS: mysql/test_mindsdb'
    assert len(mysql_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=mysql_ds)
def test_clickhouse_ds():
    from mindsdb_native.libs.data_sources.clickhouse_ds import ClickhouseDS

    HOST = 'localhost'
    PORT = 8123

    clickhouse_url = f'http://{HOST}:{PORT}'
    queries = [
        'CREATE DATABASE IF NOT EXISTS test',
        'DROP TABLE IF EXISTS test.mock',
        '''
            CREATE TABLE test.mock(
                col1 String
                ,col2 Int64
                ,col3 Array(UInt8)
            ) ENGINE = MergeTree()
                ORDER BY col2
                PARTITION BY col1
        ''',
        "INSERT INTO test.mock VALUES ('a',1,[1,2,3])",
        "INSERT INTO test.mock VALUES ('b',2,[2,3,1])",
        "INSERT INTO test.mock VALUES ('c',3,[3,1,2])"
    ]
    for q in queries:
        r = requests.post(clickhouse_url, data=q)
        assert r.status_code == 200

    clickhouse_ds = ClickhouseDS(
        'SELECT * FROM test.mock ORDER BY col2 DESC LIMIT 2',
        host=HOST, port=PORT)

    assert len(clickhouse_ds.df) == 2
    assert sum(map(int, clickhouse_ds.df['col2'])) == 5
    assert len(list(clickhouse_ds.df['col3'][1])) == 3
    assert set(clickhouse_ds.df.columns) == set(['col1', 'col2', 'col3'])

    mdb = Predictor(name='analyse_dataset_test_predictor')
    F.analyse_dataset(from_data=clickhouse_ds)
def train(self, train_df):
    mdb = Predictor(name=self.name)
    mdb.learn(to_predict=self.target_column, from_data=train_df,
              backend='lightwood')
    return mdb
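For context, a hedged usage sketch of the `train` helper above; the `Benchmark` carrier class and the dataset/column names are assumptions inferred from the method body, not library API:

import pandas as pd

# Hypothetical carrier for the attributes train() expects.
class Benchmark:
    def __init__(self, name, target_column):
        self.name = name
        self.target_column = target_column

Benchmark.train = train  # attach the train() function defined above as a method

bench = Benchmark(name='home_rentals', target_column='rental_price')
mdb = bench.train(pd.read_csv('home_rentals.csv'))  # returns the trained Predictor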
from mindsdb_native import Predictor

Predictor(name='fuel').learn(
    to_predict='Main_Engine_Fuel_Consumption_MT_day',
    from_data='fuel.csv',

    # Time series arguments:
    order_by='Time',
    group_by='id',
    window_size=24  # just 24 hours
)
from mindsdb_native import Predictor

mdb = Predictor(name='coffee_predictor')
mdb.learn(
    from_data='data.tsv',
    to_predict=[
        'Coffe_Malt', 'Chocolat', 'Gold', 'Medium_Barley',
        'Dark_Barley', 'Dandelion', 'Beets', 'Chicory_Roots', 'Figs'
    ]
)
""" This example we will walk you over the basics of MindsDB The example code objective here is to predict the best retail price for a given property. """ from mindsdb_native import Predictor # use the model to make predictions result = Predictor(name='home_rentals_price').predict(when_data={ 'number_of_rooms': 2, 'number_of_bathrooms': 1, 'sqft': 1190 }) # you can now print the results print('The predicted price is ${price} with {conf} confidence'.format( price=result[0]['rental_price'], conf=result[0]['rental_price_confidence']))
""" """ from mindsdb_native import Predictor # Here we use the model to make predictions (NOTE: You need to run train.py first) result = Predictor(name='fuel').predict(when_data='fuel_predict.csv') # you can now print the results print('The predicted main engine fuel consumption') for row in result: print(row)
from mindsdb_native import Predictor

# Should take about 13 minutes
Predictor(name='fuel').learn(
    to_predict='Main_Engine_Fuel_Consumption_MT_day',
    from_data='fuel.csv',
    stop_training_in_x_seconds=60,

    # Time series arguments:
    timeseries_settings={
        'order_by': ['Time'],
        'group_by': ['id'],
        'window': 24  # just 24 hours
    }
)
""" This example we will walk you over the basics of MindsDB The example code objective here is to: - learn a model to predict the best retal price for a given property. In order to to this we have a dataset "data_sources/home_rentals.csv" (or download from https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv) """ from mindsdb_native import Predictor # We tell mindsDB what we want to learn and from what data Predictor(name='home_rentals').learn( to_predict= 'days_on_market', # the column we want to learn to predict given all the data in the file from_data= "home_rentals.csv" # the path to the file where we can learn from, (note: can be url) , stop_training_in_x_seconds=10)
from mindsdb_native import Predictor

Predictor(name='fuel').learn(
    to_predict='Main_Engine_Fuel_Consumption_MT_day',
    from_data='fuel.csv',

    # Time series arguments:
    timeseries_settings={
        'order_by': ['Time'],
        'group_by': ['id'],
        'window': 24  # just 24 hours
    }
)
def test_maria_ds():
    import mysql.connector
    from mindsdb_native import MariaDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DATABASE = 'mysql'
    PORT = 4306

    con = mysql.connector.connect(host=HOST, port=PORT, user=USER,
                                  password=PASSWORD, database=DATABASE)
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute("""CREATE TABLE test_mindsdb (
        col_int BIGINT,
        col_float FLOAT,
        col_categorical Text,
        col_bool BOOL,
        col_text Text,
        col_date DATE,
        col_datetime DATETIME,
        col_timestamp TIMESTAMP,
        col_time TIME
    )""")
    for i in range(0, 200):
        dt = datetime.datetime.now() - datetime.timedelta(days=i)
        query = """INSERT INTO test_mindsdb (col_int, col_float, col_categorical,
            col_bool, col_text, col_date, col_datetime, col_timestamp, col_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        ci = i % 5
        values = (i, i + 0.01, f"Cat {ci}", i % 2 == 0,
                  f"long long long text {i}", dt.date(), dt,
                  dt.strftime('%Y-%m-%d %H:%M:%S.%f'),
                  dt.strftime('%H:%M:%S.%f'))
        cur.execute(query, values)
    con.commit()
    con.close()

    maria_ds = MariaDS(table='test_mindsdb', host=HOST, user=USER,
                       password=PASSWORD, database=DATABASE, port=PORT)

    assert maria_ds.name() == 'MariaDS: mysql/test_mindsdb'
    assert len(maria_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    model_data = F.analyse_dataset(from_data=maria_ds)
    analysis = model_data['data_analysis_v2']

    assert model_data
    assert analysis

    def assert_expected_type(column_typing, expected_type, expected_subtype):
        assert column_typing['data_type'] == expected_type
        assert column_typing['data_subtype'] == expected_subtype
        assert column_typing['data_type_dist'][expected_type] == 200
        assert column_typing['data_subtype_dist'][expected_subtype] == 200

    assert_expected_type(analysis['col_categorical']['typing'], DATA_TYPES.CATEGORICAL, DATA_SUBTYPES.MULTIPLE)
    assert_expected_type(analysis['col_bool']['typing'], DATA_TYPES.CATEGORICAL, DATA_SUBTYPES.SINGLE)
    assert_expected_type(analysis['col_int']['typing'], DATA_TYPES.NUMERIC, DATA_SUBTYPES.INT)
    assert_expected_type(analysis['col_float']['typing'], DATA_TYPES.NUMERIC, DATA_SUBTYPES.FLOAT)
    assert_expected_type(analysis['col_date']['typing'], DATA_TYPES.DATE, DATA_SUBTYPES.DATE)
    assert_expected_type(analysis['col_datetime']['typing'], DATA_TYPES.DATE, DATA_SUBTYPES.TIMESTAMP)
    assert_expected_type(analysis['col_timestamp']['typing'], DATA_TYPES.DATE, DATA_SUBTYPES.TIMESTAMP)

    # Subtype is expected to be either .SHORT or .RICH
    try:
        assert_expected_type(analysis['col_text']['typing'], DATA_TYPES.TEXT, DATA_SUBTYPES.SHORT)
    except AssertionError:
        assert_expected_type(analysis['col_text']['typing'], DATA_TYPES.TEXT, DATA_SUBTYPES.RICH)
import pandas as pd
from mindsdb_native import Predictor

mdb = Predictor(name='description_predictor')
mdb.learn(from_data=pd.read_csv('processed_data/train.csv'),
          to_predict='description')

predictions = mdb.predict('processed_data/test.csv')
for pred in predictions:
    print(pred['description'])
def test_database_history(self):
    # Test is currently disabled: the early return below skips everything else.
    return

    from mindsdb_datasources import ClickhouseDS

    TEMP_DB = 'test_database_history_' + random_string()
    TEMP_TABLE = 'tmp_test_database_history_' + random_string()

    params = {'user': self.USER, 'password': self.PASSWORD}
    clickhouse_url = f'http://{self.HOST}:{self.PORT}'

    values = []
    for i in range(200):
        values.append([str(i % 4), i, i * 2])

    queries = [
        f'CREATE DATABASE IF NOT EXISTS {TEMP_DB}',
        f'DROP TABLE IF EXISTS {TEMP_DB}.{TEMP_TABLE}',
        f'''
            CREATE TABLE {TEMP_DB}.{TEMP_TABLE}(
                col1 String
                ,col2 Int64
                ,col3 Int64
            ) ENGINE = MergeTree()
                ORDER BY col2
                PARTITION BY col1
        ''',
    ]

    gc.collect()

    for value in values:
        value_ins_str = str(value).replace('[', '').replace(']', '')
        queries.append(f"INSERT INTO {TEMP_DB}.{TEMP_TABLE} VALUES ({value_ins_str})")

    for q in queries:
        r = requests.post(clickhouse_url, data=q, params=params)
        assert r.status_code == 200, r.text

    clickhouse_ds = ClickhouseDS(
        f'SELECT * FROM {TEMP_DB}.{TEMP_TABLE}',
        host=self.HOST,
        port=self.PORT,
        user=self.USER,
        password=self.PASSWORD
    )

    temp_predictor = Predictor(name='query_history_based_ts_predictor')
    temp_predictor.learn(
        to_predict='col3',
        from_data=clickhouse_ds,
        stop_training_in_x_seconds=5,
        timeseries_settings={
            'order_by': ['col2'],
            'window': 6,
            'group_by': ['col1']
        }
    )
    del temp_predictor

    ts_predictor = Predictor(name='query_history_based_ts_predictor')
    predictions = ts_predictor.predict(
        when_data={
            'col2': 800,
            'col1': '2'
        },
        advanced_args={
            'use_database_history': True
        }
    )
    assert predictions[0]['col3'] is not None

    r = requests.post(
        clickhouse_url,
        data=f'DROP DATABASE {TEMP_DB}',
        params=params
    )
    assert r.status_code == 200, 'failed to drop temporary database "{}"'.format(TEMP_DB)