Code Example #1
File: datastore.py  Project: wojohowitz00/mindsdb
    def save_datasource(self, name, source_type, source, file_path=None):
        datasource_record = Datasource(company_id=self.company_id, name=name)

        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        # Commit first, then re-query so the record carries its generated id,
        # which is used below when storing the datasource to fs_store.
        session.add(datasource_record)
        session.commit()
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)

                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(
                        f"Unknown DS type: {source_type}, type is {integration['type']}"
                    )

                if integration['type'] in ['clickhouse']:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] in [
                        'mssql', 'postgres', 'mariadb', 'mysql'
                ]:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    # 'database' can come from the integration config, with a
                    # per-request value in 'source' taking precedence.
                    if 'database' in integration:
                        creation_info['kwargs']['database'] = integration[
                            'database']

                    if 'database' in source:
                        creation_info['kwargs']['database'] = source[
                            'database']

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'snowflake':
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'mongodb':
                    # 'find' may arrive as a JSON string; normalize it to a dict
                    if isinstance(source['find'], str):
                        source['find'] = json.loads(source['find'])
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])
            else:
                # This probably only happens for URLs
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in datasource must have a unique, non-empty name'
                )

            datasource_record.creation_info = json.dumps(creation_info)
            datasource_record.data = json.dumps({
                'source_type': source_type,
                'source': source,
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            })

            self.fs_store.put(
                name, f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        session.commit()
        return self.get_datasource_obj(name, raw=True), name
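
The JSON creation_info stored above is what makes the datasource reproducible: it records the class name plus the args/kwargs needed to rebuild it. A minimal restore sketch follows; the class map and the datasource_record variable are assumptions mirroring the snippet, not verified project code.

import json

# Hypothetical restore sketch: rebuild a datasource object from the
# 'creation_info' JSON persisted above. The class map mirrors the one in
# the snippet; 'datasource_record' is assumed to be a fetched DB row.
restore_class_map = {
    'FileDS': FileDS,
    'ClickhouseDS': ClickhouseDS,
    'MariaDS': MariaDS,
    'MySqlDS': MySqlDS,
    'PostgresDS': PostgresDS,
    'MSSQLDS': MSSQLDS,
    'MongoDS': MongoDS,
    'SnowflakeDS': SnowflakeDS
}

creation_info = json.loads(datasource_record.creation_info)
ds = restore_class_map[creation_info['class']](
    *creation_info['args'], **creation_info['kwargs'])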
Code Example #2
    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        # If the requested name is taken, append or bump a numeric suffix
        # (name__1__, name__2__, ...) until a free name is found.
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)

                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(
                        f"Unknown DS type: {source_type}, type is {integration['type']}"
                    )

                if integration['type'] in ['clickhouse']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] in [
                        'mssql', 'postgres', 'mariadb', 'mysql'
                ]:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    if 'database' in integration:
                        picklable['kwargs']['database'] = integration[
                            'database']

                    if 'database' in source:
                        picklable['kwargs']['database'] = source['database']

                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] == 'snowflake':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }

                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] == 'mongodb':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    ds = dsClass(**picklable['kwargs'])
            else:
                # This probably only happens for URLs
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in datasource must have a unique, non-empty name')

            # Not sure if needed
            #summary_analysis = self.get_analysis(ds.filter(limit=200))['data_analysis_v2']

            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'wb') as fp:
                pickle.dump(picklable, fp)

            with open(os.path.join(ds_meta_dir, 'metadata.json'), 'w') as fp:
                meta = {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }
                json.dump(meta, fp, indent=4, sort_keys=True)

            with open(os.path.join(ds_meta_dir, 'versions.json'), 'wt') as fp:
                json.dump(self.config.versions, fp, indent=4, sort_keys=True)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        return self.get_datasource_obj(name, raw=True), name
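
This version persists the same recipe on disk with pickle and plain JSON instead of a database record. A minimal load sketch, assuming the ds_meta_dir layout written above:

import json
import os
import pickle

# Hypothetical load sketch for the files written by this version;
# 'ds_meta_dir' is assumed to be the directory created above.
with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
    picklable = pickle.load(fp)  # {'class': ..., 'args': [...], 'kwargs': {...}}

with open(os.path.join(ds_meta_dir, 'metadata.json')) as fp:
    meta = json.load(fp)  # name, source_type, row_count, columns, ...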
Code Example #3
    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        # If the requested name is taken, append or bump a numeric suffix
        # (name__1__, name__2__, ...) until a free name is found.
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        os.mkdir(ds_dir)

        if source_type == 'file':
            try:
                source = os.path.join(ds_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise

            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        elif source_type in self.config['integrations']:
            integration = self.config['integrations'][source_type]
            dsClass = None
            picklable = {
                'args': [],
                'kwargs': {
                    'query': source,
                    'user': integration['user'],
                    'password': integration['password'],
                    'host': integration['host'],
                    'port': integration['port']
                }
            }
            if integration['type'] == 'clickhouse':
                dsClass = ClickhouseDS
                picklable['class'] = 'ClickhouseDS'
            elif integration['type'] == 'mariadb':
                dsClass = MariaDS
                picklable['class'] = 'MariaDS'
            elif integration['type'] == 'mysql':
                dsClass = MySqlDS
                picklable['class'] = 'MySqlDS'
            elif integration['type'] == 'postgres':
                dsClass = PostgresDS
                picklable['class'] = 'PostgresDS'
            elif integration['type'] == 'mssql':
                dsClass = MSSQLDS
                picklable['class'] = 'MSSQLDS'
            else:
                raise ValueError(f'Unknown DS source_type: {source_type}')
            try:
                ds = dsClass(query=source,
                             user=integration['user'],
                             password=integration['password'],
                             host=integration['host'],
                             port=integration['port'])
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
        else:
            # This probably only happens for URLs
            print('Creating URL data source!')
            try:
                ds = FileDS(source)
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        df = ds.df

        df_with_types = cast_df_columns_types(
            df,
            self.get_analysis(df)['data_analysis_v2'])
        create_sqlite_db(os.path.join(ds_dir, 'sqlite.db'), df_with_types)

        with open(os.path.join(ds_dir, 'ds.pickle'), 'wb') as fp:
            pickle.dump(picklable, fp)

        with open(os.path.join(ds_dir, 'metadata.json'), 'w') as fp:
            meta = {
                'name': name,
                'source_type': source_type,
                'source': source,
                'created_at': str(datetime.datetime.now()).split('.')[0],
                'updated_at': str(datetime.datetime.now()).split('.')[0],
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            }
            json.dump(meta, fp)

        return self.get_datasource_obj(name, raw=True), name
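
Besides the pickle and metadata files, this variant caches the typed dataframe in a SQLite database via create_sqlite_db. The table name is not visible in the snippet, so the inspection sketch below just enumerates whatever tables the file contains:

import os
import sqlite3

# Hypothetical inspection sketch: open the sqlite.db written above and
# list its tables; 'ds_dir' is the datasource directory from the snippet.
con = sqlite3.connect(os.path.join(ds_dir, 'sqlite.db'))
tables = con.execute(
    "SELECT name FROM sqlite_master WHERE type='table'").fetchall()
print(tables)
con.close()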
Code Example #4
    def save_datasource(self, name, source_type, source, file_path=None):
        print(name, source_type, source)
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        # If the requested name is taken, append or bump a numeric suffix
        # (name__1__, name__2__, ...) until a free name is found.
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        os.mkdir(ds_dir)

        print(source_type)
        if source_type == 'file':
            source = os.path.join(ds_dir, source)
            os.replace(file_path, source)
            ds = FileDS(source)
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        elif source_type == 'clickhouse':
            user = self.config['integrations']['default_clickhouse']['user']
            password = self.config['integrations']['default_clickhouse'][
                'password']
            # TODO add host port params
            ds = ClickhouseDS(source, user=user, password=password)
            picklable = {
                'class': 'ClickhouseDS',
                'args': [source],
                'kwargs': {
                    'user': user,
                    'password': password
                }
            }
        elif source_type == 'mariadb':
            user = self.config['integrations']['default_mariadb']['user']
            password = self.config['integrations']['default_mariadb'][
                'password']
            host = self.config['integrations']['default_mariadb']['host']
            port = self.config['integrations']['default_mariadb']['port']
            ds = MariaDS(source,
                         user=user,
                         password=password,
                         host=host,
                         port=port)
            picklable = {
                'class': 'MariaDS',
                'args': [source],
                'kwargs': {
                    'user': user,
                    'password': password,
                    'host': host,
                    'port': port
                }
            }
        else:
            # This probably only happens for URLs
            print('Creating URL data source!')
            ds = FileDS(source)
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        df = ds.df

        df_with_types = cast_df_columns_types(
            df,
            self.get_analysis(df)['data_analysis_v2'])
        create_sqlite_db(os.path.join(ds_dir, 'sqlite.db'), df_with_types)

        print(picklable)
        with open(os.path.join(ds_dir, 'ds.pickle'), 'wb') as fp:
            pickle.dump(picklable, fp)

        with open(os.path.join(ds_dir, 'metadata.json'), 'w') as fp:
            json.dump(
                {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }, fp)

        return self.get_datasource_obj(name, avoid_crash=True)
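
This earliest variant reads credentials from hard-coded 'default_clickhouse' and 'default_mariadb' integration entries. The config shape implied by those lookups would be roughly the following; every value is a placeholder, not taken from the project:

# Hypothetical config shape implied by the lookups above; all values
# are placeholders.
config = {
    'integrations': {
        'default_clickhouse': {
            'user': 'default',
            'password': ''
        },
        'default_mariadb': {
            'user': 'root',
            'password': '',
            'host': '127.0.0.1',
            'port': 3306
        }
    }
}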