Code example #1
File: datastore.py  Project: ZoranPandovski/mindsdb
class DataStore():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsStore()
        self.dir = self.config['paths']['datasources']
        self.model_interface = ModelInterface()

    def get_analysis(self, name, company_id=None):
        datasource_record = session.query(Datasource).filter_by(
            company_id=company_id, name=name).first()
        if datasource_record.analysis is None:
            return None
        analysis = json.loads(datasource_record.analysis)
        return analysis

    def start_analysis(self, name, company_id=None):
        datasource_record = session.query(Datasource).filter_by(
            company_id=company_id, name=name).first()
        if datasource_record.analysis is not None:
            return None
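        # A Semaphor row acts as a simple write lock: only the worker that
        # manages to create it runs the analysis for this datasource.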
        semaphor_record = session.query(Semaphor).filter_by(
            company_id=company_id,
            entity_id=datasource_record.id,
            entity_type='datasource').first()
        if semaphor_record is None:
            semaphor_record = Semaphor(company_id=company_id,
                                       entity_id=datasource_record.id,
                                       entity_type='datasource',
                                       action='write')
            session.add(semaphor_record)
            session.commit()
        else:
            return
        try:
            analysis = self.model_interface.analyse_dataset(
                ds=self.get_datasource_obj(name,
                                           raw=True,
                                           company_id=company_id),
                company_id=company_id)
            datasource_record = session.query(Datasource).filter_by(
                company_id=company_id, name=name).first()
            datasource_record.analysis = json.dumps(analysis,
                                                    cls=CustomJSONEncoder)
            session.commit()
        except Exception as e:
            log.error(e)
        finally:
            semaphor_record = session.query(Semaphor).filter_by(
                company_id=company_id,
                entity_id=datasource_record.id,
                entity_type='datasource').first()
            session.delete(semaphor_record)
            session.commit()

    def get_datasources(self, name=None, company_id=None):
        datasource_arr = []
        if name is not None:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=company_id, name=name)
        else:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=company_id)
        for datasource_record in datasource_record_arr:
            try:
                if datasource_record.data is None:
                    continue
                datasource = json.loads(datasource_record.data)
                datasource['created_at'] = datasource_record.created_at
                datasource['updated_at'] = datasource_record.updated_at
                datasource['name'] = datasource_record.name
                datasource['id'] = datasource_record.id
                datasource_arr.append(datasource)
            except Exception as e:
                log.error(e)
        return datasource_arr

    def get_data(self,
                 name,
                 where=None,
                 limit=None,
                 offset=None,
                 company_id=None):
        offset = 0 if offset is None else offset
        ds = self.get_datasource_obj(name, company_id=company_id)

        if limit is not None:
            # @TODO Add `offset` to the `filter` method of the datasource and get rid of `offset`
            filtered_ds = ds.filter(where=where,
                                    limit=limit + offset).iloc[offset:]
        else:
            filtered_ds = ds.filter(where=where)

        # Replace NaN with None so the records serialize cleanly in the response
        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': list(data[0].keys())
        }

    def get_datasource(self, name, company_id=None):
        datasource_arr = self.get_datasources(name, company_id=company_id)
        if len(datasource_arr) == 1:
            return datasource_arr[0]
        # @TODO: Remove when the db switch is more stable; this should never happen, but it is a good sanity check while this is still buggy
        elif len(datasource_arr) > 1:
            log.error(f'Two or more datasources with the same name '
                      f'({len(datasource_arr)}) | Full list: {datasource_arr}')
            raise Exception('Two or more datasources with the same name')
        return None

    def delete_datasource(self, name, company_id=None):
        datasource_record = Datasource.query.filter_by(company_id=company_id,
                                                       name=name).first()
        if not Config()["force_datasource_removing"]:
            linked_models = Predictor.query.filter_by(
                company_id=company_id,
                datasource_id=datasource_record.id).all()
            if linked_models:
                raise Exception(
                    "Can't delete the {} datasource because the following models are linked to it: {}"
                    .format(name, [model.name for model in linked_models]))
        session.query(Semaphor).filter_by(company_id=company_id,
                                          entity_id=datasource_record.id,
                                          entity_type='datasource').delete()
        session.delete(datasource_record)
        session.commit()
        self.fs_store.delete(f'datasource_{company_id}_{datasource_record.id}')
        try:
            shutil.rmtree(os.path.join(self.dir, f'{company_id}@@@@@{name}'))
        except Exception:
            pass

    def get_vacant_name(self, base=None, company_id=None):
        ''' Returns a datasource name that starts with 'base' and is not yet
        taken by an existing datasource.
        '''
        if base is None:
            base = 'datasource'
        datasources = session.query(
            Datasource.name).filter_by(company_id=company_id).all()
        datasources_names = [x[0] for x in datasources]
        if base not in datasources_names:
            return base
        for i in range(1, 1000):
            candidate = f'{base}_{i}'
            if candidate not in datasources_names:
                return candidate
        raise Exception(
            f"Cannot find an appropriate name for datasource '{base}'")

    def create_datasource(self,
                          source_type,
                          source,
                          file_path=None,
                          company_id=None,
                          ds_meta_dir=None):
        datasource_controller = DatasourceController()
        if source_type == 'file':
            source = os.path.join(ds_meta_dir, source)
            shutil.move(file_path, source)
            ds = FileDS(source)

            creation_info = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        elif datasource_controller.get_db_integration(source_type,
                                                      company_id) is not None:
            integration = datasource_controller.get_db_integration(
                source_type, company_id)

            ds_class_map = {
                'clickhouse': ClickhouseDS,
                'mariadb': MariaDS,
                'mysql': MySqlDS,
                'singlestore': MySqlDS,
                'postgres': PostgresDS,
                'cockroachdb': PostgresDS,
                'mssql': MSSQLDS,
                'mongodb': MongoDS,
                'snowflake': SnowflakeDS,
                'athena': AthenaDS,
                'cassandra': CassandraDS,
                'scylladb': ScyllaDS,
                'trinodb': TrinoDS
            }

            try:
                dsClass = ds_class_map[integration['type']]
            except KeyError:
                raise KeyError(
                    f"Unknown DS type: {source_type}, type is {integration['type']}"
                )

            if dsClass is None:
                raise Exception(
                    f"Unsupported datasource: {source_type}, type is {integration['type']}, please install required dependencies!"
                )

            if integration['type'] in ['clickhouse']:
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port']
                    }
                }
                ds = dsClass(**creation_info['kwargs'])

            elif integration['type'] in [
                    'mssql', 'postgres', 'cockroachdb', 'mariadb', 'mysql',
                    'singlestore', 'cassandra', 'scylladb'
            ]:
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port']
                    }
                }
                kwargs = creation_info['kwargs']

                integration_folder_name = f'integration_files_{company_id}_{integration["id"]}'
                if integration['type'] in ('mysql', 'mariadb'):
                    kwargs['ssl'] = integration.get('ssl')
                    kwargs['ssl_ca'] = integration.get('ssl_ca')
                    kwargs['ssl_cert'] = integration.get('ssl_cert')
                    kwargs['ssl_key'] = integration.get('ssl_key')
                    for key in ['ssl_ca', 'ssl_cert', 'ssl_key']:
                        if isinstance(kwargs[key],
                                      str) and len(kwargs[key]) > 0:
                            kwargs[key] = os.path.join(
                                self.integrations_dir, integration_folder_name,
                                kwargs[key])
                elif integration['type'] in ('cassandra', 'scylladb'):
                    kwargs['secure_connect_bundle'] = integration.get(
                        'secure_connect_bundle')
                    if (isinstance(kwargs['secure_connect_bundle'], str)
                            and len(kwargs['secure_connect_bundle']) > 0):
                        kwargs['secure_connect_bundle'] = os.path.join(
                            self.integrations_dir, integration_folder_name,
                            kwargs['secure_connect_bundle'])

                if 'database' in integration:
                    kwargs['database'] = integration['database']

                if 'database' in source:
                    kwargs['database'] = source['database']

                ds = dsClass(**kwargs)

            elif integration['type'] == 'snowflake':
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query':
                        source['query'].replace('"', "'"),
                        'schema':
                        source.get('schema', integration['schema']),
                        'warehouse':
                        source.get('warehouse', integration['warehouse']),
                        'database':
                        source.get('database', integration['database']),
                        'host':
                        integration['host'],
                        'password':
                        integration['password'],
                        'user':
                        integration['user'],
                        'account':
                        integration['account']
                    }
                }

                ds = dsClass(**creation_info['kwargs'])

            elif integration['type'] == 'mongodb':
                if isinstance(source['find'], str):
                    source['find'] = json.loads(source['find'])
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'database': source['database'],
                        'collection': source['collection'],
                        'query': source['find'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port']
                    }
                }

                ds = dsClass(**creation_info['kwargs'])

            elif integration['type'] == 'athena':
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'staging_dir': source['staging_dir'],
                        'database': source['database'],
                        'access_key': source['access_key'],
                        'secret_key': source['secret_key'],
                        'region_name': source['region_name']
                    }
                }

                ds = dsClass(**creation_info['kwargs'])

            elif integration['type'] == 'trinodb':
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port'],
                        'schema': integration['schema'],
                        'catalog': integration['catalog']
                    }
                }

                ds = dsClass(**creation_info['kwargs'])
        else:
            # This probably only happens for urls
            ds = FileDS(source)
            creation_info = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        return ds, creation_info

    def save_datasource(self,
                        name,
                        source_type,
                        source,
                        file_path=None,
                        company_id=None):
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        datasource_record = session.query(Datasource).filter_by(
            company_id=company_id, name=name).first()
        if datasource_record is not None:
            raise Exception(f'Datasource with name {name} already exists')

        try:
            datasource_record = Datasource(
                company_id=company_id,
                name=name,
                datasources_version=mindsdb_datasources.__version__,
                mindsdb_version=mindsdb_version)
            session.add(datasource_record)
            session.commit()

            ds_meta_dir = os.path.join(self.dir, f'{company_id}@@@@@{name}')
            os.mkdir(ds_meta_dir)

            ds, creation_info = self.create_datasource(source_type, source,
                                                       file_path, company_id,
                                                       ds_meta_dir)

            if hasattr(ds, 'get_columns') and hasattr(ds, 'get_row_count'):
                try:
                    column_names = ds.get_columns()
                    row_count = ds.get_row_count()
                except Exception:
                    df = ds.df
                    column_names = list(df.keys())
                    row_count = len(df)
            else:
                df = ds.df
                column_names = list(df.keys())
                row_count = len(df)

            if '' in column_names or len(column_names) != len(
                    set(column_names)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in the datasource must have a unique non-empty name'
                )

            datasource_record.creation_info = json.dumps(creation_info)
            datasource_record.data = json.dumps({
                'source_type':
                source_type,
                'source':
                source,
                'row_count':
                row_count,
                'columns': [dict(name=x) for x in column_names]
            })

            self.fs_store.put(
                f'{company_id}@@@@@{name}',
                f'datasource_{company_id}_{datasource_record.id}', self.dir)
            session.commit()

        except Exception as e:
            log.error(f'Error creating datasource {name}, exception: {e}')
            try:
                self.delete_datasource(name, company_id=company_id)
            except Exception:
                pass
            raise e

        return self.get_datasource_obj(name, raw=True, company_id=company_id)

    def get_datasource_obj(self,
                           name=None,
                           id=None,
                           raw=False,
                           company_id=None):
        try:
            if name is not None:
                datasource_record = session.query(Datasource).filter_by(
                    company_id=company_id, name=name).first()
            else:
                datasource_record = session.query(Datasource).filter_by(
                    company_id=company_id, id=id).first()

            self.fs_store.get(
                f'{company_id}@@@@@{name}',
                f'datasource_{company_id}_{datasource_record.id}', self.dir)
            creation_info = json.loads(datasource_record.creation_info)
            if raw:
                return creation_info
            else:
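                # Re-instantiate the datasource class recorded in creation_info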
                return eval(creation_info['class'])(*creation_info['args'],
                                                    **creation_info['kwargs'])
        except Exception as e:
            log.error(f'Error getting datasource {name}, exception: {e}')
            return None
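
Below is a brief usage sketch (not part of the original snippet). It assumes the module-level imports, Config, and SQLAlchemy session the class above relies on are already set up; the datasource name 'my_csv' and the file path are hypothetical.

# Hypothetical usage of DataStore, under the assumptions stated above.
store = DataStore()

# Register an uploaded CSV file as a datasource for company 1; the return
# value is the stored creation_info (get_datasource_obj is called with raw=True).
creation_info = store.save_datasource(name='my_csv',
                                      source_type='file',
                                      source='my_csv.csv',
                                      file_path='/tmp/upload.csv',
                                      company_id=1)

# Read back the first ten rows as a list of record dicts.
result = store.get_data('my_csv', limit=10, company_id=1)
print(result['rowcount'], result['columns_names'])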
Code example #2
class CustomModels():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.storage_dir = self.config['paths']['custom_models']
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = NativeInterface()
        self.dbw = DatabaseWrapper()

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, name))

    def _internal_load(self, name):
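        # Pull the model's files from the FS store, then import the
        # user-supplied module by its model name.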
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)
        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(
                os.path.join(self._dir(name), 'model.pickle'))
        except Exception as e:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model

        return model

    def learn(self, name, from_data, to_predict, datasource_id, kwargs=None):
        # Avoid a shared mutable default argument
        if kwargs is None:
            kwargs = {}
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict,
                                              list) else [to_predict]

        data_source = getattr(mindsdb_datasources,
                              from_data['class'])(*from_data['args'],
                                                  **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(
            data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis_v2'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        model_data['columns'] = list(data_analysis.keys())
        self.save_model_data(name, model_data)
        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)])

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)
        if kwargs is None:
            kwargs = {}
        if from_data is not None:
            if isinstance(from_data, dict):
                data_source = getattr(mindsdb_datasources, from_data['class'])(
                    *from_data['args'], **from_data['kwargs'])
            # assume that a DataSource instance was provided directly
            else:
                data_source = from_data
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        return predictor_record.data

    def save_model_data(self, name, data):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        if predictor_record is None:
            predictor_record = Predictor(company_id=self.company_id,
                                         name=name,
                                         is_custom=True,
                                         data=data)
            session.add(predictor_record)
        else:
            predictor_record.data = data
        session.commit()

    def get_models(self):
        predictor_names = [
            x.name
            for x in Predictor.query.filter_by(company_id=self.company_id,
                                               is_custom=True)
        ]
        models = []
        for name in predictor_names:
            models.append(self.get_model_data(name))

        return models

    def delete_model(self, name):
        Predictor.query.filter_by(company_id=self.company_id,
                                  name=name,
                                  is_custom=True).delete()
        session.commit()
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def rename_model(self, name, new_name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))

        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        predictor_record.name = new_name
        session.commit()

        self.dbw.register_predictors([self.get_model_data(new_name)])

        self.fs_store.put(new_name,
                          f'custom_model_{self.company_id}_{new_name}',
                          self.storage_dir)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def export_model(self, name):
        shutil.make_archive(base_name=name,
                            format='zip',
                            root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(
            model.to_predict, list) else [model.to_predict]
        self.save_model_data(
            name, {
                'name': name,
                'data_analysis_v2': model.column_type_map,
                'predict': model.to_predict,
                'status': trained_status,
                'is_custom': True,
                'columns': list(model.column_type_map.keys())
            })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)])
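
Below is a brief usage sketch (not part of the original snippet). It assumes the same module-level imports and database session as above, that a custom model archive exists at the given path, and that the names used here ('my_model', the zip path, the FileDS arguments, 'target') are hypothetical.

# Hypothetical usage of CustomModels, under the assumptions stated above.
custom_models = CustomModels()

# Unpack an uploaded model archive and register its metadata.
custom_models.load_model('/tmp/my_model.zip', 'my_model', trained_status='untrained')

# Train the model on a datasource described by a creation_info-style dict.
custom_models.learn('my_model',
                    from_data={'class': 'FileDS', 'args': ['my_data.csv'], 'kwargs': {}},
                    to_predict='target',
                    datasource_id=1)

# Run a prediction from ad-hoc column values.
predictions = custom_models.predict('my_model', when_data={'feature_a': 1})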
Code example #3
class DataStore():
    def __init__(self):
        self.config = Config()

        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dir = self.config.paths['datasources']
        self.mindsdb_native = NativeInterface()

    def get_analysis(self, name):
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()
        if datasource_record.analysis is None:
            return None
        analysis = json.loads(datasource_record.analysis)
        return analysis

    def start_analysis(self, name):
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()
        if datasource_record.analysis is not None:
            return None
        semaphor_record = session.query(Semaphor).filter_by(
            company_id=self.company_id,
            entity_id=datasource_record.id,
            entity_type='datasource').first()
        if semaphor_record is None:
            semaphor_record = Semaphor(company_id=self.company_id,
                                       entity_id=datasource_record.id,
                                       entity_type='datasource',
                                       action='write')
            session.add(semaphor_record)
            session.commit()
        else:
            return
        try:
            analysis = self.mindsdb_native.analyse_dataset(
                self.get_datasource_obj(name, raw=True))
            datasource_record = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name).first()
            datasource_record.analysis = json.dumps(analysis)
            session.commit()
        except Exception as e:
            log.error(e)
        finally:
            semaphor_record = session.query(Semaphor).filter_by(
                company_id=self.company_id,
                entity_id=datasource_record.id,
                entity_type='datasource').first()
            session.delete(semaphor_record)
            session.commit()

    def get_datasources(self, name=None):
        datasource_arr = []
        if name is not None:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name)
        else:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=self.company_id)
        for datasource_record in datasource_record_arr:
            try:
                if datasource_record.data is None:
                    continue
                datasource = json.loads(datasource_record.data)
                datasource['created_at'] = datasource_record.created_at
                datasource['updated_at'] = datasource_record.updated_at
                datasource['name'] = datasource_record.name
                datasource['id'] = datasource_record.id
                datasource_arr.append(datasource)
            except Exception as e:
                log.error(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        offset = 0 if offset is None else offset
        ds = self.get_datasource_obj(name)

        if limit is not None:
            # @TODO Add `offset` to the `filter` method of the datasource and get rid of `offset`
            filtered_ds = ds.filter(where=where,
                                    limit=limit + offset).iloc[offset:]
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        datasource_arr = self.get_datasources(name)
        if len(datasource_arr) == 1:
            return datasource_arr[0]
        # @TODO: Remove when the db switch is more stable; this should never happen, but it is a good sanity check while this is still buggy
        elif len(datasource_arr) > 1:
            log.error(f'Two or more datasources with the same name '
                      f'({len(datasource_arr)}) | Full list: {datasource_arr}')
            raise Exception('Two or more datasources with the same name')
        return None

    def delete_datasource(self, name):
        datasource_record = Datasource.query.filter_by(
            company_id=self.company_id, name=name).first()
        session.delete(datasource_record)
        session.commit()
        self.fs_store.delete(
            f'datasource_{self.company_id}_{datasource_record.id}')
        try:
            shutil.rmtree(os.path.join(self.dir, name))
        except Exception:
            pass

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()
        if datasource_record is not None:
            raise Exception(f'Datasource with name {name} already exists')

        try:
            datasource_record = Datasource(
                company_id=self.company_id,
                name=name,
                datasources_version=mindsdb_datasources.__version__,
                mindsdb_version=mindsdb_version)
            session.add(datasource_record)
            session.commit()
            datasource_record = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name).first()

            ds_meta_dir = os.path.join(self.dir, name)
            os.mkdir(ds_meta_dir)

            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)

                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS,
                    'athena': AthenaDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(
                        f"Unknown DS type: {source_type}, type is {integration['type']}"
                    )

                if dsClass is None:
                    raise Exception(
                        f'Unsupported datasource: {source_type}, please install required dependencies!'
                    )

                if integration['type'] in ['clickhouse']:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] in [
                        'mssql', 'postgres', 'mariadb', 'mysql'
                ]:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    if 'database' in integration:
                        creation_info['kwargs']['database'] = integration[
                            'database']

                    if 'database' in source:
                        creation_info['kwargs']['database'] = source[
                            'database']

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'snowflake':
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'mongodb':
                    if isinstance(source['find'], str):
                        source['find'] = json.loads(source['find'])
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'athena':
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'staging_dir': source['staging_dir'],
                            'database': source['database'],
                            'access_key': source['access_key'],
                            'secret_key': source['secret_key'],
                            'region_name': source['region_name']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in the datasource must have a unique non-empty name'
                )

            datasource_record.creation_info = json.dumps(creation_info)
            datasource_record.data = json.dumps({
                'source_type':
                source_type,
                'source':
                source,
                'row_count':
                len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            })

            self.fs_store.put(
                name, f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)
            session.commit()

        except Exception as e:
            log.error(f'{e}')
            try:
                self.delete_datasource(name)
            except Exception:
                pass
            raise e

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False, id=None):
        try:
            if name is None:
                datasource_record = session.query(Datasource).filter_by(
                    company_id=self.company_id, id=id).first()
            else:
                datasource_record = session.query(Datasource).filter_by(
                    company_id=self.company_id, name=name).first()

            self.fs_store.get(
                name, f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)
            creation_info = json.loads(datasource_record.creation_info)
            if raw:
                return creation_info
            else:
                return eval(creation_info['class'])(*creation_info['args'],
                                                    **creation_info['kwargs'])
        except Exception as e:
            log.error(f'\n{e}\n')
            return None