Exemple #1
0
class NativeDataFrame(BasePredictor):
    """Benchmark predictor that serves rows of a local test CSV to a MindsDB predictor."""

    def __init__(self, dataset):
        """Load ``<dataset name>_test.csv`` and bind a predictor named after the dataset.

        ``self.dataset`` is read right after ``super().__init__`` — presumably
        the base class stores it; confirm against ``BasePredictor``.
        """
        super().__init__(dataset)
        csv_path = f"{self.dataset.name}_test.csv"
        self.df = pd.read_csv(csv_path)
        self.predictor = Predictor(name=self.dataset.name)

    def predict(self, row_number=1):
        """Return predictions for the first ``row_number`` rows of the loaded frame."""
        leading_rows = self.df[:row_number]
        return self.predictor.predict(leading_rows)
def test_regressor():
    """
    Sanity check. Every MindsDB point prediction should fall inside the
    confidence interval reported alongside it by the inductive conformal
    predictor.
    """
    target = 'medv'

    def _as_frame(features, columns=None):
        # Wrap raw features in a DataFrame; unnamed columns become c0, c1, ...
        frame = pd.DataFrame(features)
        if columns is not None:
            frame.columns = columns
        else:
            positions = pd.Series(list(range(len(frame.columns))))
            frame.columns = 'c' + positions.astype(str)
        return frame

    def _as_labelled_frame(features, labels, target_name):
        # Feature frame with the labels attached as the target column.
        frame = _as_frame(features)
        frame[target_name] = pd.DataFrame(labels)
        return frame

    X, y = load_boston(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=0.1, random_state=5)

    # Train on the 90% split.
    train_df = _as_labelled_frame(X_train, Y_train, target)
    predictor = Predictor("ConformalTest")
    predictor.learn(from_data=train_df, to_predict=target)

    # Predict on the held-out 10% and keep only the target's explanation.
    test_df = _as_labelled_frame(X_test, Y_test, target)
    predictions = predictor.predict(when_data=test_df)
    explanations = [prediction.explanation[target] for prediction in predictions]

    for explanation in explanations:
        lower_bound = explanation['confidence_interval'][0]
        point_value = explanation['predicted_value']
        upper_bound = explanation['confidence_interval'][1]
        assert lower_bound <= point_value <= upper_bound
Exemple #3
0
class NativeClickhouse(BasePredictor):
    """Benchmark predictor that feeds rows selected from ClickHouse to a MindsDB predictor."""

    # Connection settings taken from the shared CONFIG mapping at class creation.
    host = CONFIG['database']['host']
    # NOTE(review): ``port`` is defined here but not forwarded to ClickhouseDS
    # in ``predict`` — confirm whether that is intentional.
    port = CONFIG['database']['port']
    user = CONFIG['database']['user']
    password = CONFIG['database']['password']

    def __init__(self, dataset):
        """Bind a predictor named after the dataset and prepare the row-limited query."""
        super().__init__(dataset)
        self.predictor = Predictor(name=self.dataset.name)
        # ``%s`` is left in the template so ``predict`` can fill in the LIMIT.
        self.query_template = f"SELECT * FROM test_data.{self.dataset.name} LIMIT %s"

    def predict(self, row_number=1):
        """Return predictions for the first ``row_number`` rows selected from ClickHouse."""
        limited_query = self.query_template % row_number
        source = ClickhouseDS(limited_query,
                              host=self.host,
                              user=self.user,
                              password=self.password)
        return self.predictor.predict(when_data=source)
def test_postgres_ds():
    """Fill a scratch Postgres table and check that PostgresDS loads and analyses it.

    Needs a Postgres server reachable with the credentials defined below.
    The table ``test_mindsdb`` is dropped and recreated with 200 rows.
    """
    import pg8000
    from mindsdb_native.libs.data_sources.postgres_ds import PostgresDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DBNAME = 'postgres'
    PORT = 5432

    con = pg8000.connect(database=DBNAME,
                         user=USER,
                         password=PASSWORD,
                         host=HOST,
                         port=PORT)
    # Close the connection even when a setup statement fails, so a broken run
    # does not leak it.
    try:
        cur = con.cursor()

        cur.execute('DROP TABLE IF EXISTS test_mindsdb')
        cur.execute(
            'CREATE TABLE test_mindsdb(col_1 Text, col_2 Int,  col_3 Boolean, col_4 Date, col_5 Int [])'
        )
        # Only the date part is stored, so one reference time for all rows is enough.
        now = datetime.datetime.now()
        for i in range(200):
            dt = now - datetime.timedelta(days=i)
            dt_str = dt.strftime('%Y-%m-%d')
            cur.execute(
                f'INSERT INTO test_mindsdb VALUES (\'String {i}\', {i}, {i % 2 == 0}, \'{dt_str}\', ARRAY [1, 2, {i}])'
            )
        con.commit()
    finally:
        con.close()

    postgres_ds = PostgresDS(table='test_mindsdb',
                             host=HOST,
                             user=USER,
                             password=PASSWORD,
                             database=DBNAME,
                             port=PORT)

    assert postgres_ds.name() == 'PostgresDS: postgres/test_mindsdb'

    assert (len(postgres_ds._df) == 200)

    # NOTE(review): ``mdb`` is never used afterwards; presumably the Predictor
    # is built for its side effects (e.g. log level) — confirm before removing.
    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=postgres_ds)
def test_mongodb_ds():
    """Fill a scratch MongoDB collection and check that MongoDS loads and analyses it.

    Needs a MongoDB server reachable with the credentials defined below.
    The collection ``test_mindsdb`` is dropped if present and refilled with
    200 documents.
    """
    from pymongo import MongoClient
    from mindsdb_native.libs.data_sources.mongodb_ds import MongoDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'database'
    COLLECTION_NAME = 'test_mindsdb'
    PORT = 27017

    con = MongoClient(host=HOST, port=PORT, username=USER, password=PASSWORD)
    # The original never closed the client; release it once the fixture data
    # is written, including when setup fails.
    try:
        db = con[DATABASE]

        if COLLECTION_NAME in db.list_collection_names():
            db[COLLECTION_NAME].drop()

        collection = db[COLLECTION_NAME]

        for i in range(200):
            collection.insert_one({
                'col_1': "This is string number {}".format(i),
                'col_2': i,
                'col_3': (i % 2) == 0
            })
    finally:
        con.close()

    mongodb_ds = MongoDS(collection=COLLECTION_NAME,
                         query={},
                         host=HOST,
                         port=PORT,
                         user=USER,
                         password=PASSWORD,
                         database=DATABASE)

    assert mongodb_ds.name() == 'MongoDS: database/test_mindsdb'

    assert (len(mongodb_ds._df) == 200)

    # NOTE(review): ``mdb`` is never used afterwards; presumably the Predictor
    # is built for its side effects (e.g. log level) — confirm before removing.
    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=mongodb_ds)
Exemple #6
0
def test_mssql_ds():
    """Fill a scratch SQL Server table and check that MSSQLDS loads and analyses it.

    Needs a SQL Server instance reachable with the credentials defined below.
    The table ``test_mindsdb`` is dropped if present and recreated with 200 rows.
    """
    import pytds
    from mindsdb_native.libs.data_sources.ms_sql_ds import MSSQLDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'master'
    PORT = 1433

    drop_statement = "IF OBJECT_ID('dbo.test_mindsdb') IS NOT NULL DROP TABLE dbo.test_mindsdb"
    create_statement = 'CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BIT)'

    with pytds.connect(dsn=HOST, user=USER, password=PASSWORD,
                       database=DATABASE) as con:
        with con.cursor() as cur:
            cur.execute(drop_statement)
            cur.execute(create_statement)
            for i in range(200):
                insert_statement = f"INSERT INTO test_mindsdb ([col_1], [col_2], [col_3]) VALUES ('This is string number {i}', {i}, {i % 2})"
                cur.execute(insert_statement)
        # Commit after the cursor is closed but while the connection is still open.
        con.commit()

    mssql_ds = MSSQLDS(table='test_mindsdb',
                       host=HOST,
                       user=USER,
                       password=PASSWORD,
                       database=DATABASE,
                       port=PORT)

    expected_name = 'MSSQLDS: master/test_mindsdb'
    assert mssql_ds.name() == expected_name

    assert len(mssql_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=mssql_ds)
def test_mysql_ds():
    """Fill a scratch MySQL table and check that MySqlDS loads and analyses it.

    Needs a MySQL server reachable with the credentials defined below.
    The table ``test_mindsdb`` is dropped and recreated with 200 rows.
    """
    import mysql.connector
    from mindsdb_native.libs.data_sources.mysql_ds import MySqlDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DATABASE = 'mysql'
    PORT = 3306

    con = mysql.connector.connect(host=HOST,
                                  port=PORT,
                                  user=USER,
                                  password=PASSWORD,
                                  database=DATABASE)
    # Close the connection even when a setup statement fails, so a broken run
    # does not leak it.
    try:
        cur = con.cursor()

        cur.execute('DROP TABLE IF EXISTS test_mindsdb')
        cur.execute(
            'CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BOOL)')
        for i in range(200):
            cur.execute(
                f'INSERT INTO test_mindsdb VALUES ("This is string number {i}", {i}, {i % 2 == 0})'
            )
        con.commit()
    finally:
        con.close()

    mysql_ds = MySqlDS(table='test_mindsdb',
                       host=HOST,
                       user=USER,
                       password=PASSWORD,
                       database=DATABASE,
                       port=PORT)

    assert mysql_ds.name() == 'MySqlDS: mysql/test_mindsdb'

    assert (len(mysql_ds._df) == 200)

    # NOTE(review): ``mdb`` is never used afterwards; presumably the Predictor
    # is built for its side effects (e.g. log level) — confirm before removing.
    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    F.analyse_dataset(from_data=mysql_ds)
def test_clickhouse_ds():
    """Create a three-row ClickHouse table over HTTP and check what ClickhouseDS reads back.

    Needs a ClickHouse server listening on the HOST/PORT defined below.
    """
    from mindsdb_native.libs.data_sources.clickhouse_ds import ClickhouseDS

    HOST = 'localhost'
    PORT = 8123

    clickhouse_url = f'http://{HOST}:{PORT}'

    create_table_query = '''
            CREATE TABLE test.mock(
                col1 String
                ,col2 Int64
                ,col3 Array(UInt8)
            ) ENGINE = MergeTree()
                ORDER BY col2
                PARTITION BY col1
        '''
    setup_queries = [
        'CREATE DATABASE IF NOT EXISTS test',
        'DROP TABLE IF EXISTS test.mock',
        create_table_query,
        "INSERT INTO test.mock VALUES ('a',1,[1,2,3])",
        "INSERT INTO test.mock VALUES ('b',2,[2,3,1])",
        "INSERT INTO test.mock VALUES ('c',3,[3,1,2])",
    ]
    # Each statement is posted on its own and must succeed before the next one.
    for statement in setup_queries:
        response = requests.post(clickhouse_url, data=statement)
        assert response.status_code == 200

    clickhouse_ds = ClickhouseDS(
        'SELECT * FROM test.mock ORDER BY col2 DESC LIMIT 2',
        host=HOST,
        port=PORT)

    # The two rows with the largest col2 (3 and 2) are expected back.
    assert len(clickhouse_ds.df) == 2
    assert sum(int(value) for value in clickhouse_ds.df['col2']) == 5
    assert len(list(clickhouse_ds.df['col3'][1])) == 3
    assert set(clickhouse_ds.df.columns) == {'col1', 'col2', 'col3'}

    mdb = Predictor(name='analyse_dataset_test_predictor')
    F.analyse_dataset(from_data=clickhouse_ds)
 def train(self, train_df):
     """Train a predictor named ``self.name`` on ``train_df`` and return it.

     The target is ``self.target_column`` and training uses the
     'lightwood' backend.
     """
     predictor = Predictor(name=self.name)
     predictor.learn(
         from_data=train_df,
         to_predict=self.target_column,
         backend='lightwood',
     )
     return predictor
Exemple #10
0
from mindsdb_native import Predictor

# Train the 'fuel' predictor on fuel.csv as a time series:
# rows are ordered by 'Time' and grouped by 'id'.
Predictor(name='fuel').learn(
    from_data='fuel.csv',
    to_predict='Main_Engine_Fuel_Consumption_MT_day',
    # Time series arguments:
    group_by='id',
    order_by='Time',
    window_size=24,  # just 24 hours
)
Exemple #11
0
from mindsdb_native import Predictor

# Train one predictor on data.tsv with several target columns at once.
mdb = Predictor(name='coffee_predictor')
mdb.learn(
    to_predict=[
        'Coffe_Malt',
        'Chocolat',
        'Gold',
        'Medium_Barley',
        'Dark_Barley',
        'Dandelion',
        'Beets',
        'Chicory_Roots',
        'Figs',
    ],
    from_data='data.tsv',
)
Exemple #12
0
"""

This example walks you through the basics of MindsDB.

The example code objective here is to predict the best retail price for a given property.

"""

from mindsdb_native import Predictor

# Ask the already-trained 'home_rentals_price' model for a prediction
# for a single property described by these features.
result = Predictor(name='home_rentals_price').predict(
    when_data={
        'number_of_rooms': 2,
        'number_of_bathrooms': 1,
        'sqft': 1190
    })

# Report the predicted price together with its confidence.
message_template = 'The predicted price is ${price} with {conf} confidence'
print(
    message_template.format(price=result[0]['rental_price'],
                            conf=result[0]['rental_price_confidence']))
Exemple #13
0
"""

"""

from mindsdb_native import Predictor

# Use the trained 'fuel' model to predict for every row of fuel_predict.csv
# (NOTE: You need to run train.py first)
result = Predictor(name='fuel').predict(
    when_data='fuel_predict.csv',
)

# Print a heading, then one prediction per line.
print('The predicted main engine fuel consumption')
for row in result:
    print(row)
Exemple #14
0
from mindsdb_native import Predictor


# Should take about 13 minutes
Predictor(name='fuel').learn(
    from_data='fuel.csv',
    to_predict='Main_Engine_Fuel_Consumption_MT_day',
    stop_training_in_x_seconds=60,
    # Time series arguments:
    timeseries_settings={
        'group_by': ['id'],
        'order_by': ['Time'],
        'window': 24,  # just 24 hours
    },
)
Exemple #15
0
"""

This example walks you through the basics of MindsDB.

The example code objective here is to:

- learn a model to predict the best rental price for a given property.

In order to do this we have a dataset "data_sources/home_rentals.csv" (or download it from https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv)

"""

from mindsdb_native import Predictor

# We tell mindsDB what we want to learn and from what data
Predictor(name='home_rentals').learn(
    # the column we want to learn to predict given all the data in the file
    to_predict='days_on_market',
    # the path to the file where we can learn from, (note: can be url)
    from_data="home_rentals.csv",
    stop_training_in_x_seconds=10,
)
Exemple #16
0
from mindsdb_native import Predictor


# Train the 'fuel' predictor on fuel.csv as a time series.
Predictor(name='fuel').learn(
    to_predict='Main_Engine_Fuel_Consumption_MT_day',
    from_data='fuel.csv',

    # Time series arguments:
    # The original wrote ``key=value`` pairs inside the braces, which is a
    # SyntaxError. This is a real dict with string keys and list-valued
    # columns, the same form the other timeseries_settings examples in this
    # file use.
    timeseries_settings={
        'order_by': ['Time'],
        'group_by': ['id'],
        'window': 24,  # just 24 hours
    },
)
Exemple #17
0
 def __init__(self, dataset):
     """Load ``<dataset name>_test.csv`` and bind a predictor named after the dataset.

     ``self.dataset`` is read right after ``super().__init__`` — presumably
     the base class stores it; confirm against the parent class.
     """
     super().__init__(dataset)
     csv_path = f"{self.dataset.name}_test.csv"
     self.df = pd.read_csv(csv_path)
     self.predictor = Predictor(name=self.dataset.name)
def test_maria_ds():
    """Fill a scratch MariaDB table and check MariaDS loading plus column type analysis.

    Needs a MariaDB server reachable with the credentials defined below.
    The table ``test_mindsdb`` is dropped and recreated with 200 rows, one
    column per SQL type of interest. The dataset analysis is then expected
    to classify every asserted column with the matching type and subtype
    for all 200 rows. ``col_time`` is inserted but not asserted on.
    """
    import mysql.connector
    from mindsdb_native import MariaDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DATABASE = 'mysql'
    PORT = 4306

    # Parameterised insert, built once: it has no interpolated fields, so it
    # is the same for every row.
    insert_query = """INSERT INTO test_mindsdb (col_int,
                                col_float,
                                col_categorical,
                                col_bool,
                                col_text,
                                col_date,
                                col_datetime,
                                col_timestamp,
                                col_time)
                                VALUES (%s, %s,  %s,  %s,  %s, %s, %s, %s, %s)
                                """

    con = mysql.connector.connect(host=HOST,
                                  port=PORT,
                                  user=USER,
                                  password=PASSWORD,
                                  database=DATABASE)
    # Close the connection even when a setup statement fails, so a broken run
    # does not leak it.
    try:
        cur = con.cursor()

        cur.execute('DROP TABLE IF EXISTS test_mindsdb')
        cur.execute("""CREATE TABLE test_mindsdb (
                                col_int BIGINT,
                                col_float FLOAT,
                                col_categorical Text,
                                col_bool BOOL,
                                col_text Text,
                                col_date DATE,
                                col_datetime DATETIME,
                                col_timestamp TIMESTAMP,
                                col_time TIME
                                )
                                """)
        for i in range(200):
            dt = datetime.datetime.now() - datetime.timedelta(days=i)
            ci = i % 5  # five distinct category labels: "Cat 0" .. "Cat 4"
            values = (i, i + 0.01, f"Cat {ci}", i % 2 == 0,
                      f"long long long text {i}", dt.date(), dt,
                      dt.strftime('%Y-%m-%d %H:%M:%S.%f'),
                      dt.strftime('%H:%M:%S.%f'))
            cur.execute(insert_query, values)
        con.commit()
    finally:
        con.close()

    maria_ds = MariaDS(table='test_mindsdb',
                       host=HOST,
                       user=USER,
                       password=PASSWORD,
                       database=DATABASE,
                       port=PORT)

    assert maria_ds.name() == 'MariaDS: mysql/test_mindsdb'

    assert (len(maria_ds._df) == 200)

    # NOTE(review): ``mdb`` is never used afterwards; presumably the Predictor
    # is built for its side effects (e.g. log level) — confirm before removing.
    mdb = Predictor(name='analyse_dataset_test_predictor',
                    log_level=logging.ERROR)
    model_data = F.analyse_dataset(from_data=maria_ds)
    # Check the result is non-empty before indexing into it, so an empty
    # result fails on the assertion rather than on the lookup.
    assert model_data
    analysis = model_data['data_analysis_v2']
    assert analysis

    def assert_expected_type(column_typing, expected_type, expected_subtype):
        # The column must be typed as expected, with all 200 rows counted
        # under that type and subtype.
        assert column_typing['data_type'] == expected_type
        assert column_typing['data_subtype'] == expected_subtype
        assert column_typing['data_type_dist'][expected_type] == 200
        assert column_typing['data_subtype_dist'][expected_subtype] == 200

    assert_expected_type(analysis['col_categorical']['typing'],
                         DATA_TYPES.CATEGORICAL, DATA_SUBTYPES.MULTIPLE)
    assert_expected_type(analysis['col_bool']['typing'],
                         DATA_TYPES.CATEGORICAL, DATA_SUBTYPES.SINGLE)
    assert_expected_type(analysis['col_int']['typing'], DATA_TYPES.NUMERIC,
                         DATA_SUBTYPES.INT)
    assert_expected_type(analysis['col_float']['typing'], DATA_TYPES.NUMERIC,
                         DATA_SUBTYPES.FLOAT)
    assert_expected_type(analysis['col_date']['typing'], DATA_TYPES.DATE,
                         DATA_SUBTYPES.DATE)
    assert_expected_type(analysis['col_datetime']['typing'], DATA_TYPES.DATE,
                         DATA_SUBTYPES.TIMESTAMP)
    assert_expected_type(analysis['col_timestamp']['typing'], DATA_TYPES.DATE,
                         DATA_SUBTYPES.TIMESTAMP)

    # Subtype is expected to be either .SHORT or .RICH
    try:
        assert_expected_type(analysis['col_text']['typing'], DATA_TYPES.TEXT,
                             DATA_SUBTYPES.SHORT)
    except AssertionError:
        assert_expected_type(analysis['col_text']['typing'], DATA_TYPES.TEXT,
                             DATA_SUBTYPES.RICH)
Exemple #19
0
 def __init__(self, dataset):
     """Bind a predictor named after the dataset and prepare the row-limited query.

     ``self.dataset`` is read right after ``super().__init__`` — presumably
     the base class stores it; confirm against the parent class.
     """
     super().__init__(dataset)
     # ``%s`` is left in the template; it is not filled in here.
     self.query_template = f"SELECT * FROM test_data.{self.dataset.name} LIMIT %s"
     self.predictor = Predictor(name=self.dataset.name)
Exemple #20
0
import pandas as pd
from mindsdb_native import Predictor

mdb = Predictor(name='description_predictor')

# Train on the pre-processed training split, loaded into a DataFrame first.
training_frame = pd.read_csv('processed_data/train.csv')
mdb.learn(to_predict='description', from_data=training_frame)

# Predict straight from the test CSV path and print each predicted description.
predictions = mdb.predict('processed_data/test.csv')

for pred in predictions:
    predicted_description = pred['description']
    print(predicted_description)
Exemple #21
0
    def test_database_history(self):
        return
        from mindsdb_datasources import ClickhouseDS

        TEMP_DB = 'test_database_history_' + random_string()
        TEMP_TABLE = 'tmp_test_database_history_' + random_string()

        params = {'user': self.USER, 'password': self.PASSWORD}

        clickhouse_url = f'http://{self.HOST}:{self.PORT}'

        values = []
        for i in range(200):
            values.append([str(i % 4), i, i * 2])

        queries = [
            f'CREATE DATABASE IF NOT EXISTS {TEMP_DB}',
            f'DROP TABLE IF EXISTS {TEMP_DB}.{TEMP_TABLE}',
            f'''
                CREATE TABLE {TEMP_DB}.{TEMP_TABLE}(
                    col1 String
                    ,col2 Int64
                    ,col3 Int64
                ) ENGINE = MergeTree()
                    ORDER BY col2
                    PARTITION BY col1
            ''',
        ]
        gc.collect()

        for value in values:
            value_ins_str = str(value).replace('[','').replace(']','')
            queries.append(f"INSERT INTO {TEMP_DB}.{TEMP_TABLE} VALUES ({value_ins_str})")

        for q in queries:
            r = requests.post(clickhouse_url, data=q, params=params)
            assert r.status_code == 200, r.text

        clickhouse_ds = ClickhouseDS(
            f'SELECT * FROM {TEMP_DB}.{TEMP_TABLE}',
            host=self.HOST,
            port=self.PORT,
            user=self.USER,
            password=self.PASSWORD
        )

        temp_predictor = Predictor(name='query_history_based_ts_predictor')
        temp_predictor.learn(
            to_predict='col3',
            from_data=clickhouse_ds,
            stop_training_in_x_seconds=5,
            timeseries_settings={
                'order_by': ['col2']
                ,'window': 6
                ,'group_by': ['col1']
            }
        )
        del temp_predictor

        ts_predictor = Predictor(name='query_history_based_ts_predictor')
        predictions = ts_predictor.predict(
            when_data={
                'col2': 800
                ,'col1': '2'
            },
            advanced_args={
                'use_database_history': True
            }
        )

        assert predictions[0]['col3'] is not None

        r = requests.post(
            clickhouse_url,
            data=f'DROP DATABASE {TEMP_DB}',
            params=params
        )
        assert r.status_code == 200, 'failed to drop temporary database "{}"'.format(TEMP_DB)