def test_clickhouse_ds(self):
    from mindsdb_datasources import ClickhouseDS

    LIMIT = 100

    clickhouse_ds = ClickhouseDS(
        host=self.HOST,
        port=self.PORT,
        user=self.USER,
        password=self.PASSWORD,
        query='SELECT * FROM {}.{} LIMIT {}'.format(
            self.DATABASE, 'home_rentals', LIMIT
        )
    )

    # Test filtering
    for val in clickhouse_ds.filter([['location', 'like', 'ood']])['location']:
        assert val == 'good'

    assert len(clickhouse_ds.filter([['rental_price', '>', 2500]], 3)) == 3
    assert len(clickhouse_ds.filter([['initial_price', '<', 0]], 3)) == 0

    # Mess with the values inside, then try to analyse it
    clickhouse_ds.df = break_dataset(clickhouse_ds.df)
    assert len(clickhouse_ds) <= LIMIT
    F.analyse_dataset(from_data=clickhouse_ds)

def test_predictor_deduplicate_data(self):
    n_points = 100
    input_dataframe = pd.DataFrame({
        'numeric_int': [x % 44 for x in list(range(n_points))],
        'numeric_int_2': [x % 20 for x in list(range(n_points))],
    }, index=list(range(n_points)))
    input_dataframe['y'] = input_dataframe['numeric_int'] % 10

    # Add a duplicate row
    input_dataframe = input_dataframe.append(input_dataframe.iloc[99], ignore_index=True)

    mdb = Predictor(name='test_drop_duplicates')
    mdb.learn(
        from_data=input_dataframe,
        to_predict='y',
        stop_training_in_x_seconds=1,
        use_gpu=False
    )
    model_data = F.get_model_data('test_drop_duplicates')

    # Ensure the duplicate row was not used for training or analysis
    assert model_data['data_preparation']['total_row_count'] == n_points
    assert model_data['data_preparation']['used_row_count'] <= n_points
    assert sum([
        model_data['data_preparation']['train_row_count'],
        model_data['data_preparation']['validation_row_count'],
        model_data['data_preparation']['test_row_count']
    ]) == n_points
    assert sum([
        mdb.transaction.input_data.train_df.shape[0],
        mdb.transaction.input_data.test_df.shape[0],
        mdb.transaction.input_data.validation_df.shape[0]
    ]) == n_points

    # Disable deduplication and ensure the duplicate row is used
    mdb = Predictor(name='test_drop_duplicates')
    mdb.learn(
        from_data=input_dataframe,
        to_predict='y',
        stop_training_in_x_seconds=1,
        use_gpu=False,
        advanced_args={'deduplicate_data': False}
    )
    model_data = F.get_model_data('test_drop_duplicates')

    # The duplicate row was used for analysis and training
    assert model_data['data_preparation']['total_row_count'] == n_points + 1
    assert model_data['data_preparation']['used_row_count'] <= n_points + 1
    assert sum([
        model_data['data_preparation']['train_row_count'],
        model_data['data_preparation']['validation_row_count'],
        model_data['data_preparation']['test_row_count']
    ]) == n_points + 1
    assert sum([
        mdb.transaction.input_data.train_df.shape[0],
        mdb.transaction.input_data.test_df.shape[0],
        mdb.transaction.input_data.validation_df.shape[0]
    ]) == n_points + 1

def test_mysql_ds(self):
    from mindsdb_datasources import MySqlDS

    LIMIT = 400

    mysql_ds = MySqlDS(
        host=self.HOST,
        user=self.USER,
        password=self.PASSWORD,
        database=self.DATABASE,
        port=self.PORT,
        query=' (SELECT * FROM (SELECT * FROM {table} LIMIT {limit}) as t1) UNION ALL (SELECT * FROM (SELECT * FROM {table} LIMIT {limit}) as t1)'.format(
            table=self.TABLE, limit=int(LIMIT / 2)
        )
    )
    mysql_ds.df = break_dataset(mysql_ds.df)
    assert len(mysql_ds) <= LIMIT
    F.analyse_dataset(mysql_ds)

    # Our SQL parsing fails here; test whether we can still filter via the dataframe fallback
    for val in mysql_ds.filter([['sex', 'like', 'fem']])['sex']:
        assert val == 'female'

    assert len(mysql_ds.filter([['age', '>', 20]], 12)) == 12
    assert len(mysql_ds.filter([['age', '=', 60]], 1)) == 1
    assert len(mysql_ds.filter([['age', '>', 150]], 11)) == 0

def delete_model(self, name):
    predictor_record = Predictor.query.filter_by(
        company_id=self.company_id, name=name, is_custom=False
    ).first()
    id = predictor_record.id
    session.delete(predictor_record)
    session.commit()
    F.delete_model(name)
    self.dbw.unregister_predictor(name)
    self.fs_store.delete(f'predictor_{self.company_id}_{id}')

def delete_model(self, name):
    from mindsdb_native import F
    from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
    from mindsdb.interfaces.storage.db import session, Predictor

    predictor_record = Predictor.query.filter_by(
        company_id=self.company_id, name=name, is_custom=False
    ).first()
    id = predictor_record.id
    session.delete(predictor_record)
    session.commit()
    F.delete_model(name)
    self.dbw.unregister_predictor(name)
    self.fs_store.delete(f'predictor_{self.company_id}_{id}')
    return 0

def test_timeseries(self, tmp_path):
    ts_hours = 12
    data_len = 120
    train_file_name = os.path.join(str(tmp_path), 'train_data.csv')
    test_file_name = os.path.join(str(tmp_path), 'test_data.csv')

    features = generate_value_cols(['date', 'int'], data_len, ts_hours * 3600)
    labels = [generate_timeseries_labels(features)]

    feature_headers = list(map(lambda col: col[0], features))
    label_headers = list(map(lambda col: col[0], labels))

    # Create the training dataset and save it to a file
    columns_train = list(map(lambda col: col[1:int(len(col) * 3 / 4)], features))
    columns_train.extend(list(map(lambda col: col[1:int(len(col) * 3 / 4)], labels)))
    columns_to_file(
        columns_train,
        train_file_name,
        headers=[*feature_headers, *label_headers]
    )

    # Create the testing dataset and save it to a file
    columns_test = list(map(lambda col: col[int(len(col) * 3 / 4):], features))
    columns_to_file(columns_test, test_file_name, headers=feature_headers)

    mdb = Predictor(name='test_timeseries')
    mdb.learn(
        from_data=train_file_name,
        to_predict=label_headers,
        timeseries_settings={
            'order_by': [feature_headers[0]],
            'window': 3
        },
        stop_training_in_x_seconds=10,
        use_gpu=False,
        advanced_args={'force_predict': True}
    )

    results = mdb.predict(when_data=test_file_name, use_gpu=False)

    for row in results:
        expect_columns = [label_headers[0], label_headers[0] + '_confidence']
        for col in expect_columns:
            assert col in row

    models = F.get_models()
    model_data = F.get_model_data(models[0]['name'])
    assert model_data

def test_analyze_dataset(self):
    n_points = 100
    n_category_values = 4
    input_dataframe = pd.DataFrame({
        'numeric_int': [x % 10 for x in list(range(n_points))],
        'numeric_float': np.linspace(0, n_points, n_points),
        'date_timestamp': [
            (datetime.now() - timedelta(minutes=int(i))).isoformat()
            for i in range(n_points)
        ],
        'date_date': [
            (datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d')
            for i in range(n_points)
        ],
        'categorical_str': [
            f'a{x}' for x in (list(range(n_category_values)) * (n_points // n_category_values))
        ],
        'categorical_int': [
            x for x in (list(range(n_category_values)) * (n_points // n_category_values))
        ],
        'categorical_binary': [0, 1] * (n_points // 2),
        'sequential_array': [f"1,2,3,4,5,{i}" for i in range(n_points)]
    }, index=list(range(n_points)))

    model_data = F.analyse_dataset(from_data=input_dataframe)

    for col, col_data in model_data['data_analysis_v2'].items():
        expected_type = test_column_types[col][0]
        expected_subtype = test_column_types[col][1]
        assert col_data['typing']['data_type'] == expected_type
        assert col_data['typing']['data_subtype'] == expected_subtype

        assert col_data['empty']
        assert col_data['histogram']
        assert 'percentage_buckets' in col_data
        assert 'nr_warnings' in col_data
        assert not col_data['is_foreign_key']

    assert isinstance(json.dumps(model_data), str)

def analyse_dataset(self, ds):
    from mindsdb_datasources import (
        FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS,
        MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
    )
    from mindsdb_native import F

    ds = eval(ds['class'])(*ds['args'], **ds['kwargs'])
    analysis = F.analyse_dataset(ds)
    return self._pack(analysis)

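# Hedged sketch (assumption, not part of the original module): the `ds` payload handed
# to analyse_dataset() is expected to name one of the datasource classes imported above,
# plus the positional and keyword arguments needed to rebuild it. The exact payload shape
# and the `native_interface` call site below are illustrative only.
example_serialized_ds = {
    'class': 'FileDS',
    'args': ['https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/classics/german_credit_data/processed_data/test.csv'],
    'kwargs': {}
}
# analysis = native_interface.analyse_dataset(example_serialized_ds)  # hypothetical call site
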
def test_multilabel_prediction(self, tmp_path):
    train_file_name = os.path.join(str(tmp_path), 'train_data.csv')
    test_file_name = os.path.join(str(tmp_path), 'test_data.csv')
    data_len = 60

    features = generate_value_cols(['int', 'float', 'int', 'float'], data_len)
    labels = []
    labels.append(generate_log_labels(features))
    labels.append(generate_timeseries_labels(features))

    feature_headers = list(map(lambda col: col[0], features))
    label_headers = list(map(lambda col: col[0], labels))

    # Create the training dataset and save it to a file
    columns_train = list(map(lambda col: col[1:int(len(col) * 3 / 4)], features))
    columns_train.extend(list(map(lambda col: col[1:int(len(col) * 3 / 4)], labels)))
    columns_to_file(
        columns_train,
        train_file_name,
        headers=[*feature_headers, *label_headers]
    )

    # Create the testing dataset and save it to a file
    columns_test = list(map(lambda col: col[int(len(col) * 3 / 4):], features))
    columns_to_file(columns_test, test_file_name, headers=feature_headers)

    mdb = Predictor(name='test_multilabel_prediction')
    mdb.learn(
        from_data=train_file_name,
        to_predict=label_headers,
        stop_training_in_x_seconds=1,
        use_gpu=False,
        advanced_args={'force_predict': True}
    )

    results = mdb.predict(when_data=test_file_name)
    models = F.get_models()
    model_data = F.get_model_data(models[0]['name'])
    assert model_data

    for i in range(len(results)):
        row = results[i]
        for label in label_headers:
            expect_columns = [label, label + '_confidence']
            for col in expect_columns:
                assert col in row

def test_category_tags_output(self):
    vocab = random.sample(SMALL_VOCAB, 10)
    vocab = {i: word for i, word in enumerate(vocab)}

    # x1 contains the index of the first tag present
    # x2 contains the index of the second tag present
    # If a tag is missing, then x1/x2 contain -1 instead
    # Thus the dataset should be perfectly predictable
    n_points = 5000
    x1 = [
        random.randint(0, len(vocab) - 1) if random.random() > 0.1 else -1
        for i in range(n_points)
    ]
    x2 = [
        random.randint(0, len(vocab) - 1) if random.random() > 0.1 else -1
        for i in range(n_points)
    ]

    tags = []
    for x1_index, x2_index in zip(x1, x2):
        row_tags = set([vocab.get(x1_index), vocab.get(x2_index)])
        row_tags = [x for x in row_tags if x is not None]
        tags.append(','.join(row_tags))

    df = pd.DataFrame({'x1': x1, 'x2': x2, 'tags': tags})
    df_train = df.iloc[:round(n_points * 0.9)]
    df_test = df.iloc[round(n_points * 0.9):]

    predictor = Predictor('test')
    predictor.learn(
        from_data=df_train,
        to_predict='tags',
        advanced_args=dict(deduplicate_data=False),
        stop_training_in_x_seconds=60,
        use_gpu=False
    )

    model_data = F.get_model_data('test')
    assert model_data['data_analysis_v2']['tags']['typing']['data_type'] == DATA_TYPES.CATEGORICAL
    assert model_data['data_analysis_v2']['tags']['typing']['data_subtype'] == DATA_SUBTYPES.TAGS

    predictions = predictor.predict(when_data=df_test)
    test_tags = df_test.tags.apply(lambda x: x.split(','))

    predicted_tags = []
    for i in range(len(predictions)):
        predicted_tags.append(predictions[i]['tags'])

    test_tags_encoded = predictor.transaction.model_backend.predictor._mixer.encoders['tags'].encode(test_tags)
    pred_labels_encoded = predictor.transaction.model_backend.predictor._mixer.encoders['tags'].encode(predicted_tags)
    score = f1_score(test_tags_encoded, pred_labels_encoded, average='weighted')

    assert score >= 0.3

def test_mongodb_ds(self):
    mongodb_ds = MongoDS(
        collection=self.COLLECTION,
        query={},
        host=self.HOST,
        port=self.PORT,
        user=self.USER,
        password=self.PASSWORD,
        database=self.DATABASE
    )
    F.analyse_dataset(from_data=mongodb_ds)

    for val in mongodb_ds.filter([['location', 'like', 'ood']])['location']:
        assert val == 'good'

    assert len(mongodb_ds.filter([['rental_price', '>', 2500]], 3)) == 3
    assert len(mongodb_ds.filter([['initial_price', '<', 0]], 3)) == 0

def test_postgres_ds(self):
    from mindsdb_datasources import PostgresDS

    LIMIT = 100

    postgres_ds = PostgresDS(
        host=self.HOST,
        user=self.USER,
        password=self.PASSWORD,
        database=self.DATABASE,
        port=self.PORT,
        query='SELECT * FROM {}.{} LIMIT {}'.format('test_data', self.TABLE, LIMIT)
    )
    postgres_ds.df = break_dataset(postgres_ds.df)
    assert len(postgres_ds) <= LIMIT
    F.analyse_dataset(postgres_ds)

def test_postgres_ds():
    import pg8000
    from mindsdb_native.libs.data_sources.postgres_ds import PostgresDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DBNAME = 'postgres'
    PORT = 5432

    con = pg8000.connect(database=DBNAME, user=USER, password=PASSWORD, host=HOST, port=PORT)
    cur = con.cursor()

    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute('CREATE TABLE test_mindsdb(col_1 Text, col_2 Int, col_3 Boolean, col_4 Date, col_5 Int [])')
    for i in range(0, 200):
        dt = datetime.datetime.now() - datetime.timedelta(days=i)
        dt_str = dt.strftime('%Y-%m-%d')
        cur.execute(f'INSERT INTO test_mindsdb VALUES (\'String {i}\', {i}, {i % 2 == 0}, \'{dt_str}\', ARRAY [1, 2, {i}])')
    con.commit()
    con.close()

    postgres_ds = PostgresDS(
        table='test_mindsdb',
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DBNAME,
        port=PORT
    )
    assert postgres_ds.name() == 'PostgresDS: postgres/test_mindsdb'
    assert len(postgres_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=postgres_ds)

def test_data_source_setting(self):
    data_url = 'https://raw.githubusercontent.com/mindsdb/mindsdb-examples/master/classics/german_credit_data/processed_data/test.csv'

    data_source = FileDS(data_url)
    data_source.set_subtypes({})

    data_source_mod = FileDS(data_url)
    data_source_mod.set_subtypes({
        'credit_usage': 'Int',
        'Average_Credit_Balance': 'Short Text',
        'existing_credits': 'Binary Category'
    })

    analysis = F.analyse_dataset(data_source)
    analysis_mod = F.analyse_dataset(data_source_mod)

    a1 = analysis['data_analysis_v2']
    a2 = analysis_mod['data_analysis_v2']

    assert len(a1) == len(a2)

    assert a1['over_draft']['typing']['data_type'] == a2['over_draft']['typing']['data_type']

    assert a1['credit_usage']['typing']['data_type'] == a2['credit_usage']['typing']['data_type']
    assert a1['credit_usage']['typing']['data_subtype'] != a2['credit_usage']['typing']['data_subtype']
    assert a2['credit_usage']['typing']['data_subtype'] == DATA_SUBTYPES.INT

    assert a1['Average_Credit_Balance']['typing']['data_type'] != a2['Average_Credit_Balance']['typing']['data_type']
    assert a1['Average_Credit_Balance']['typing']['data_subtype'] != a2['Average_Credit_Balance']['typing']['data_subtype']
    assert a2['Average_Credit_Balance']['typing']['data_subtype'] == DATA_SUBTYPES.SHORT
    assert a2['Average_Credit_Balance']['typing']['data_type'] == DATA_TYPES.TEXT

    assert a1['existing_credits']['typing']['data_type'] == a2['existing_credits']['typing']['data_type']
    assert a1['existing_credits']['typing']['data_subtype'] != a2['existing_credits']['typing']['data_subtype']
    assert a2['existing_credits']['typing']['data_subtype'] == DATA_SUBTYPES.SINGLE

def test_analyze_dataset_empty_column(self):
    n_points = 100
    input_dataframe = pd.DataFrame({
        'numeric_int': [x % 10 for x in list(range(n_points))],
        'empty_column': [None for i in range(n_points)]
    }, index=list(range(n_points)))

    model_data = F.analyse_dataset(from_data=input_dataframe)

    assert model_data['data_analysis_v2']['empty_column']['empty']['is_empty'] is True

def test_mongodb_ds():
    from pymongo import MongoClient
    from mindsdb_native.libs.data_sources.mongodb_ds import MongoDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'database'
    COLLECTION_NAME = 'test_mindsdb'
    PORT = 27017

    con = MongoClient(host=HOST, port=PORT, username=USER, password=PASSWORD)

    db = con[DATABASE]
    if COLLECTION_NAME in db.list_collection_names():
        db[COLLECTION_NAME].drop()

    collection = db[COLLECTION_NAME]
    for i in range(0, 200):
        collection.insert_one({
            'col_1': "This is string number {}".format(i),
            'col_2': i,
            'col_3': (i % 2) == 0
        })

    mongodb_ds = MongoDS(
        collection=COLLECTION_NAME,
        query={},
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database=DATABASE
    )
    assert mongodb_ds.name() == 'MongoDS: database/test_mindsdb'
    assert len(mongodb_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=mongodb_ds)

def test_analyze_dataset_empty_values(self):
    n_points = 100
    input_dataframe = pd.DataFrame({
        'numeric_int': [x % 10 for x in list(range(n_points))],
        'numeric_int2': list(range(n_points)),
    }, index=list(range(n_points)))
    input_dataframe['numeric_int'].iloc[::2] = None

    model_data = F.analyse_dataset(from_data=input_dataframe)

    assert model_data['data_analysis_v2']['numeric_int']['empty']['empty_percentage'] == 50

def test_mysql_ds():
    import mysql.connector
    from mindsdb_native.libs.data_sources.mysql_ds import MySqlDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = ''
    DATABASE = 'mysql'
    PORT = 3306

    con = mysql.connector.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database=DATABASE
    )
    cur = con.cursor()

    cur.execute('DROP TABLE IF EXISTS test_mindsdb')
    cur.execute('CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BOOL)')
    for i in range(0, 200):
        cur.execute(f'INSERT INTO test_mindsdb VALUES ("This is string number {i}", {i}, {i % 2 == 0})')
    con.commit()
    con.close()

    mysql_ds = MySqlDS(
        table='test_mindsdb',
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE,
        port=PORT
    )
    assert mysql_ds.name() == 'MySqlDS: mysql/test_mindsdb'
    assert len(mysql_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=mysql_ds)

def test_mssql_ds():
    import pytds
    from mindsdb_native.libs.data_sources.ms_sql_ds import MSSQLDS

    HOST = 'localhost'
    USER = '******'
    PASSWORD = '******'
    DATABASE = 'master'
    PORT = 1433

    with pytds.connect(dsn=HOST, user=USER, password=PASSWORD, database=DATABASE) as con:
        with con.cursor() as cur:
            cur.execute("IF OBJECT_ID('dbo.test_mindsdb') IS NOT NULL DROP TABLE dbo.test_mindsdb")
            cur.execute('CREATE TABLE test_mindsdb(col_1 Text, col_2 BIGINT, col_3 BIT)')
            for i in range(0, 200):
                cur.execute(f"INSERT INTO test_mindsdb ([col_1], [col_2], [col_3]) VALUES ('This is string number {i}', {i}, {i % 2})")
        con.commit()

    mssql_ds = MSSQLDS(
        table='test_mindsdb',
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE,
        port=PORT
    )
    assert mssql_ds.name() == 'MSSQLDS: master/test_mindsdb'
    assert len(mssql_ds._df) == 200

    mdb = Predictor(name='analyse_dataset_test_predictor', log_level=logging.ERROR)
    F.analyse_dataset(from_data=mssql_ds)

def test_sample_for_analysis(self):
    n_points = 100
    n_category_values = 4
    input_dataframe = pd.DataFrame({
        'numeric_int': [x % 10 for x in list(range(n_points))],
        'numeric_float': np.linspace(0, n_points, n_points),
        'date_timestamp': [
            (datetime.now() - timedelta(minutes=int(i))).isoformat()
            for i in range(n_points)
        ],
        'date_date': [
            (datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d')
            for i in range(n_points)
        ],
        'categorical_str': [
            f'a{x}' for x in (list(range(n_category_values)) * (n_points // n_category_values))
        ],
        'categorical_int': [
            x for x in (list(range(n_category_values)) * (n_points // n_category_values))
        ],
        'categorical_binary': [0, 1] * (n_points // 2),
        'sequential_array': [f"1,2,3,4,5,{i}" for i in range(n_points)]
    }, index=list(range(n_points)))

    mock_function = PickableMock(spec=sample_data, wraps=sample_data)
    setattr(mock_function, '__name__', 'mock_sample_data')
    with mock.patch('mindsdb_native.libs.controllers.predictor.sample_data', mock_function):
        model_data = F.analyse_dataset(
            from_data=input_dataframe,
            sample_settings={'sample_for_analysis': True}
        )
        assert mock_function.called

    for col, col_data in model_data['data_analysis_v2'].items():
        if col == 'columns':
            continue

        expected_type = test_column_types[col][0]
        expected_subtype = test_column_types[col][1]
        assert col_data['typing']['data_type'] == expected_type
        assert col_data['typing']['data_subtype'] == expected_subtype

        assert col_data['empty']
        assert col_data['histogram']
        assert 'percentage_buckets' in col_data
        assert 'nr_warnings' in col_data
        assert col_data['identifier'] is None

def get_models(self, status='any'):
    models = F.get_models()
    if status != 'any':
        models = [x for x in models if x['status'] == status]

    for i in range(len(models)):
        for k in ['train_end_at', 'updated_at', 'created_at']:
            if k in models[i] and models[i][k] is not None:
                try:
                    models[i][k] = parse_datetime(str(models[i][k]).split('.')[0])
                except Exception:
                    models[i][k] = parse_datetime(str(models[i][k]))
    return models

def get_model_data(self, name, native_view=False):
    model = F.get_model_data(name)
    if native_view:
        return model

    data_analysis = model['data_analysis_v2']
    for column in data_analysis['columns']:
        if len(data_analysis[column]) == 0 or data_analysis[column].get('empty', {}).get('is_empty', False):
            data_analysis[column]['typing'] = {
                'data_subtype': DATA_SUBTYPES.INT
            }

    return model

def test_clickhouse_ds():
    from mindsdb_native.libs.data_sources.clickhouse_ds import ClickhouseDS

    HOST = 'localhost'
    PORT = 8123

    clickhouse_url = f'http://{HOST}:{PORT}'
    queries = [
        'CREATE DATABASE IF NOT EXISTS test',
        'DROP TABLE IF EXISTS test.mock',
        '''
            CREATE TABLE test.mock(
                col1 String
                ,col2 Int64
                ,col3 Array(UInt8)
            ) ENGINE = MergeTree()
                ORDER BY col2
                PARTITION BY col1
        ''',
        "INSERT INTO test.mock VALUES ('a',1,[1,2,3])",
        "INSERT INTO test.mock VALUES ('b',2,[2,3,1])",
        "INSERT INTO test.mock VALUES ('c',3,[3,1,2])"
    ]
    for q in queries:
        r = requests.post(clickhouse_url, data=q)
        assert r.status_code == 200

    clickhouse_ds = ClickhouseDS(
        'SELECT * FROM test.mock ORDER BY col2 DESC LIMIT 2',
        host=HOST,
        port=PORT
    )

    assert len(clickhouse_ds.df) == 2
    assert sum(map(int, clickhouse_ds.df['col2'])) == 5
    assert len(list(clickhouse_ds.df['col3'][1])) == 3
    assert set(clickhouse_ds.df.columns) == set(['col1', 'col2', 'col3'])

    mdb = Predictor(name='analyse_dataset_test_predictor')
    F.analyse_dataset(from_data=clickhouse_ds)

def test_maria_ds(self):
    from mindsdb_datasources import MariaDS

    LIMIT = 200

    maria_ds = MariaDS(
        host=self.HOST,
        user=self.USER,
        password=self.PASSWORD,
        database=self.DATABASE,
        port=self.PORT,
        query='SELECT * FROM `{}` LIMIT {}'.format(self.TABLE, LIMIT)
    )
    maria_ds.df = break_dataset(maria_ds.df)
    assert len(maria_ds) <= LIMIT
    F.analyse_dataset(from_data=maria_ds)

    # Our SQL parsing succeeds here, but the query fails; test whether we can still filter via the dataframe fallback
    maria_ds._query = maria_ds._query.replace(self.TABLE, 'wrongly_named_table')
    assert len(maria_ds.filter([['Population', '<', 33098932]], 8)) == 8
    assert len(maria_ds.filter([['Development_Index', '!=', 3]], 12)) == 12

def test_category_tags_input(self):
    vocab = random.sample(SMALL_VOCAB, 10)

    # `tags` contains up to 2 randomly selected tags
    # `y` contains the sum of the indices of those tags
    # The dataset should be nearly perfectly predictable
    n_points = 5000
    tags = []
    y = []
    for i in range(n_points):
        row_tags = []
        row_y = 0
        for k in range(2):
            if random.random() > 0.2:
                selected_index = random.randint(0, len(vocab) - 1)
                if vocab[selected_index] not in row_tags:
                    row_tags.append(vocab[selected_index])
                    row_y += selected_index
        tags.append(','.join(row_tags))
        y.append(row_y)

    df = pd.DataFrame({'tags': tags, 'y': y})
    df_train = df.iloc[:round(n_points * 0.9)]
    df_test = df.iloc[round(n_points * 0.9):]

    predictor = Predictor(name='test')
    predictor.learn(
        from_data=df_train,
        to_predict='y',
        advanced_args=dict(deduplicate_data=False),
        stop_training_in_x_seconds=40,
        use_gpu=False
    )

    model_data = F.get_model_data('test')
    assert model_data['data_analysis_v2']['tags']['typing']['data_type'] == DATA_TYPES.CATEGORICAL
    assert model_data['data_analysis_v2']['tags']['typing']['data_subtype'] == DATA_SUBTYPES.TAGS

    predictions = predictor.predict(when_data=df_test)
    test_y = df_test.y.apply(str)

    predicted_y = []
    for i in range(len(predictions)):
        predicted_y.append(predictions[i]['y'])

    score = accuracy_score(test_y, predicted_y)
    assert score >= 0.2

def get_model_data(self, name, db_fix=True):
    model = F.get_model_data(name)

    # Make some corrections for databases not to break when dealing with empty columns
    if db_fix:
        data_analysis = model['data_analysis_v2']
        for column in data_analysis['columns']:
            analysis = data_analysis.get(column)
            if isinstance(analysis, dict) and (len(analysis) == 0 or analysis.get('empty', {}).get('is_empty', False)):
                data_analysis[column]['typing'] = {
                    'data_subtype': DATA_SUBTYPES.INT
                }

    return model

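# Hedged sketch (assumption, for illustration only): with db_fix=True an entirely empty
# column still carries a 'typing' entry, so database integrations can map it to a concrete
# column type instead of failing. A column like 'empty_column' from the analysis tests
# above would then read back roughly as:
#   get_model_data('some_model')['data_analysis_v2']['empty_column']['typing']
#   # -> {'data_subtype': DATA_SUBTYPES.INT}
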
def test_mssql_ds(self):
    from mindsdb_datasources import MSSQLDS

    HOST = DB_CREDENTIALS['mssql']['host']
    USER = DB_CREDENTIALS['mssql']['user']
    PASSWORD = DB_CREDENTIALS['mssql']['password']
    DATABASE = DB_CREDENTIALS['mssql']['database']
    PORT = DB_CREDENTIALS['mssql']['port']

    mssql_ds = MSSQLDS(
        query='SELECT * FROM dbo.insurance',
        host=HOST,
        user=USER,
        password=PASSWORD,
        database=DATABASE,
        port=PORT
    )
    assert len(mssql_ds.df) > 200

    analysis = F.analyse_dataset(from_data=mssql_ds)

def get_model_data(self, name, db_fix=True):
    from mindsdb_native import F
    from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
    from mindsdb.interfaces.storage.db import session, Predictor

    predictor_record = Predictor.query.filter_by(
        company_id=self.company_id, name=name, is_custom=False
    ).first()
    predictor_record = self._try_outdate_db_status(predictor_record)

    model = predictor_record.data
    if model is None or model['status'] == 'training':
        try:
            self.fs_store.get(
                name,
                f'predictor_{self.company_id}_{predictor_record.id}',
                self.config['paths']['predictors']
            )
            new_model_data = F.get_model_data(name)
        except Exception:
            new_model_data = None

        if predictor_record.data is None or (
            new_model_data is not None and len(new_model_data) > len(predictor_record.data)
        ):
            predictor_record.data = new_model_data
            model = new_model_data
            session.commit()

    # Make some corrections for databases not to break when dealing with empty columns
    if db_fix:
        data_analysis = model['data_analysis_v2']
        for column in model['columns']:
            analysis = data_analysis.get(column)
            if isinstance(analysis, dict) and (len(analysis) == 0 or analysis.get('empty', {}).get('is_empty', False)):
                data_analysis[column]['typing'] = {
                    'data_subtype': DATA_SUBTYPES.INT
                }

    model['created_at'] = str(parse_datetime(str(predictor_record.created_at).split('.')[0]))
    model['updated_at'] = str(parse_datetime(str(predictor_record.updated_at).split('.')[0]))
    model['predict'] = predictor_record.to_predict
    model['update'] = predictor_record.update_status

    return self._pack(model)

def predict(self, name, when_data=None, kwargs={}):
    if name not in self.predictor_cache:
        # Clear the cache entirely if we have less than 1.2 GB of memory left
        if psutil.virtual_memory().available < 1.2 * pow(10, 9):
            self.predictor_cache = {}

        if F.get_model_data(name)['status'] == 'complete':
            self.predictor_cache[name] = {
                'predictor': mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'}),
                'created': datetime.datetime.now()
            }

    predictions = self.predictor_cache[name]['predictor'].predict(
        when_data=when_data,
        **kwargs
    )

    return predictions

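# Hedged usage sketch (assumption, not from the original code): predict() expects the name
# of a model that has already finished training, plus the input values to predict from.
# The `native_interface` instance and the input row below are hypothetical.
#   predictions = native_interface.predict(
#       name='test_drop_duplicates',                      # trained in test_predictor_deduplicate_data above
#       when_data={'numeric_int': 3, 'numeric_int_2': 7}  # hypothetical input row
#   )
#   print(predictions[0]['y'])
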
def get_models(self, status='any'):
    models = F.get_models()
    if status != 'any':
        models = [x for x in models if x['status'] == status]

    models = [
        x for x in models
        if x['status'] != 'training'
        or parse_datetime(x['created_at']) > parse_datetime(self.config['mindsdb_last_started_at'])
    ]

    for i in range(len(models)):
        for k in ['train_end_at', 'updated_at', 'created_at']:
            if k in models[i] and models[i][k] is not None:
                try:
                    models[i][k] = parse_datetime(str(models[i][k]).split('.')[0])
                except Exception:
                    models[i][k] = parse_datetime(str(models[i][k]))
    return models