Example #1
 def __init__(self):
     self._executor = ThreadPoolExecutor(max_workers=os.cpu_count() * 3)
     self._file_manager = FileManager()
     self._pending_imports = set()
     self._running_imports = set()
     self._import_lock = threading.Lock()
     self._app = None
Example #2
class FileManagerTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        os.environ[
            'CUSTOMIZED_FILE_MANAGER'] = 'testing.fake_file_manager:FakeFileManager'

    @classmethod
    def tearDownClass(cls):
        del os.environ['CUSTOMIZED_FILE_MANAGER']

    def setUp(self):
        self._fm = FileManager()

    def test_can_handle(self):
        self.assertTrue(self._fm.can_handle('fake://123'))
        # Falls back to default manager
        self.assertTrue(self._fm.can_handle('/data/123'))
        self.assertFalse(self._fm.can_handle('hdfs://123'))

    def test_ls(self):
        self.assertEqual(self._fm.ls('fake://data'), ['fake://data/f1.txt'])

    def test_move(self):
        self.assertTrue(self._fm.move('fake://move/123', 'fake://move/234'))
        self.assertFalse(
            self._fm.move('fake://do_not_move/123', 'fake://move/234'))
        # No file manager can handle this
        self.assertRaises(RuntimeError,
                          lambda: self._fm.move('hdfs://123', 'fake://abc'))

    def test_remove(self):
        self.assertTrue(self._fm.remove('fake://remove/123'))
        self.assertFalse(self._fm.remove('fake://do_not_remove/123'))
        # No file manager can handle this
        self.assertRaises(RuntimeError, lambda: self._fm.remove('hdfs://123'))
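The test above swaps in a fake backend through the CUSTOMIZED_FILE_MANAGER environment variable. A minimal sketch of what that testing.fake_file_manager:FakeFileManager could look like, inferred only from the assertions in FileManagerTest; the class body here is hypothetical, not the project's actual fake:

class FakeFileManager(object):
    # Sketch of a fake backend that only understands fake:// paths,
    # matching the behaviour asserted in FileManagerTest above.
    def can_handle(self, path):
        return path.startswith('fake://')

    def ls(self, path, recursive=False):
        # test_ls expects exactly one file under the listed directory
        return [f'{path}/f1.txt']

    def move(self, source, destination):
        # Fails only for sources marked do_not_move
        return 'do_not_move' not in source

    def remove(self, path):
        # Fails only for paths marked do_not_remove
        return 'do_not_remove' not in path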
Example #3
class FilesApi(Resource):
    def __init__(self):
        self._file_manager = FileManager()

    def get(self):
        # TODO: consider the security factor
        if 'directory' in request.args:
            directory = request.args['directory']
        else:
            directory = os.path.join(current_app.config.get('STORAGE_ROOT'),
                                     'upload')
        files = self._file_manager.ls(directory, recursive=True)
        return {'data': [dict(file._asdict()) for file in files]}
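FilesApi is a flask_restful Resource, so it has to be registered on an Api instance before its get() handler is reachable. A minimal wiring sketch, assuming Flask-RESTful; the route and the STORAGE_ROOT value are illustrative, not the project's actual configuration:

from flask import Flask
from flask_restful import Api

app = Flask(__name__)
app.config['STORAGE_ROOT'] = '/data'         # assumed value; the handler reads this key
api = Api(app)
api.add_resource(FilesApi, '/api/v2/files')  # hypothetical route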
Example #4
 def __init__(self, session: Session):
     self._session = session
     self._file_manager = FileManager()
Example #5
class DatasetService(object):
    def __init__(self, session: Session):
        self._session = session
        self._file_manager = FileManager()

    def get_dataset_preview(self, dataset_id: int = 0) -> dict:
        dataset = self._session.query(Dataset).filter(
            Dataset.id == dataset_id).first()
        if not dataset:
            raise NotFoundException(f'Failed to find dataset: {dataset_id}')
        dataset_path = dataset.path
        # meta is generated from sparkapp/pipeline/analyzer.py
        meta_path = dataset_meta_path(dataset_path)
        # data format:
        # {
        #   'dtypes': {
        #     'f01': 'bigint'
        #   },
        #   'samples': [
        #     [1],
        #     [0],
        #   ],
        #   'metrics': {
        #     'f01': {
        #       'count': '2',
        #       'mean': '0.0015716767309123998',
        #       'stddev': '0.03961485047808605',
        #       'min': '0',
        #       'max': '1',
        #       'missing_count': '0'
        #     }
        #   }
        # }
        val = {}
        try:
            val = json.loads(self._file_manager.read(meta_path))
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read meta file, path: {meta_path}, err: {e}')
            return {}
        # feature is generated from sparkapp/pipeline/analyzer.py
        feature_path = dataset_features_path(dataset_path)
        try:
            val['metrics'] = json.loads(self._file_manager.read(feature_path))
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read feature file, path: {feature_path}, err: {e}')
        return val

    def feature_metrics(self, name: str, dataset_id: int = 0) -> dict:
        dataset = self._session.query(Dataset).filter(
            Dataset.id == dataset_id).first()
        if not dataset:
            raise NotFoundException(f'Failed to find dataset: {dataset_id}')
        dataset_path = dataset.path
        feature_path = dataset_features_path(dataset_path)
        # data format:
        # {
        #    'name': 'f01',
        #    'metrics': {
        #      'count': '2',
        #      'mean': '0.0015716767309123998',
        #      'stddev': '0.03961485047808605',
        #      'min': '0',
        #      'max': '1',
        #      'missing_count': '0'
        #    },
        #    'hist': {
        #      'x': [0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5,
        #             0.6000000000000001, 0.7000000000000001, 0.8, 0.9, 1],
        #      'y': [12070, 0, 0, 0, 0, 0, 0, 0, 0, 19]
        #    }
        # }
        val = {}
        try:
            feature_data = json.loads(self._file_manager.read(feature_path))
            val['name'] = name
            val['metrics'] = feature_data.get(name, {})
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read feature file, path: {feature_path}, err: {e}')
        # hist is generated from sparkapp/pipeline/analyzer.py
        hist_path = dataset_hist_path(dataset_path)
        try:
            hist_data = json.loads(self._file_manager.read(hist_path))
            val['hist'] = hist_data.get(name, {})
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read hist file, path: {hist_path}, err: {e}')
        return val

    def get_datasets(self, project_id: int = 0) -> List[Dataset]:
        q = self._session.query(Dataset).order_by(Dataset.created_at.desc())
        if project_id > 0:
            q = q.filter(Dataset.project_id == project_id)
        return q.all()
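A hedged usage sketch of DatasetService, assuming a SQLAlchemy sessionmaker bound to the application's engine; the dataset id, project id, and feature name are illustrative only:

from sqlalchemy.orm import sessionmaker

Session = sessionmaker(bind=engine)  # `engine` is assumed to exist elsewhere
session = Session()
try:
    service = DatasetService(session=session)
    preview = service.get_dataset_preview(dataset_id=1)          # dtypes/samples/metrics
    metrics = service.feature_metrics(name='f01', dataset_id=1)  # per-feature metrics + hist
    datasets = service.get_datasets(project_id=1)
finally:
    session.close()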
Example #6
 def __init__(self):
     self._file_manager = FileManager()
Example #7
class ImportHandler(object):
    def __init__(self):
        self._executor = ThreadPoolExecutor(max_workers=os.cpu_count() * 3)
        self._file_manager = FileManager()
        self._pending_imports = set()
        self._running_imports = set()
        self._import_lock = threading.Lock()
        self._app = None

    def __del__(self):
        self._executor.shutdown()

    def init(self, app):
        self._app = app

    def schedule_to_handle(self, dataset_batch_ids):
        if isinstance(dataset_batch_ids, int):
            dataset_batch_ids = [dataset_batch_ids]
        self._pending_imports.update(dataset_batch_ids)

    def _copy_file(self, source_path, destination_path,
                   move=False, num_retry=3):
        logging.info('%s from %s to %s',
                     'moving' if move else 'copying',
                     source_path,
                     destination_path)
        # Creates parent folders if needed
        parent_folder = os.path.dirname(destination_path)
        self._file_manager.mkdir(parent_folder)
        success = False
        error_message = ''
        for _ in range(num_retry):
            try:
                if move:
                    success = self._file_manager.move(
                        source_path,
                        destination_path)
                else:
                    success = self._file_manager.copy(
                        source_path,
                        destination_path)
                if not success:
                    error_message = 'Unknown error'
                else:
                    break
            except Exception as e:  # pylint: disable=broad-except
                logging.error(
                    'Error occurred when importing file from %s to %s',
                    source_path,
                    destination_path)
                error_message = str(e)
        file = dataset_pb2.File(
            source_path=source_path,
            destination_path=destination_path
        )
        if not success:
            file.error_message = error_message
            file.state = dataset_pb2.File.State.FAILED
        else:
            file.size = self._file_manager.ls(
                destination_path)[0].size
            file.state = dataset_pb2.File.State.COMPLETED
        return file

    def _import_batch(self, batch_id):
        with self._import_lock:
            if batch_id in self._running_imports:
                # Another worker is already importing this batch; the early
                # return must not leave the lock held.
                return
            self._running_imports.add(batch_id)

        # Pushes app context to make db session work
        self._app.app_context().push()

        logging.info('Importing batch %d', batch_id)
        batch = DataBatch.query.get(batch_id)
        batch.state = BatchState.IMPORTING
        db.session.commit()
        db.session.refresh(batch)
        details = batch.get_details()

        for file in details.files:
            if file.state == dataset_pb2.File.State.UNSPECIFIED:
                # Recovers the state
                destination_existed = len(
                    self._file_manager.ls(file.destination_path)) > 0
                if destination_existed:
                    file.state = dataset_pb2.File.State.COMPLETED
                    continue
                # Moves/Copies
                file.MergeFrom(self._copy_file(
                    source_path=file.source_path,
                    destination_path=file.destination_path,
                    move=batch.move))

        batch.set_details(details)
        db.session.commit()

        with self._import_lock:
            self._running_imports.remove(batch_id)

    def handle(self, pull=False):
        """Handles all the batches in the queue or all batches which
        should be imported."""
        batches_to_run = self._pending_imports
        self._pending_imports = set()
        if pull:
            # TODO: should separate pull logic to a cron job,
            # otherwise there will be a race condition that two handlers
            # are trying to move the same batch
            one_hour_ago = datetime.utcnow() - timedelta(hours=1)
            pulled_batches = db.session.query(DataBatch.id).filter(
                    (DataBatch.state == BatchState.NEW) |
                    (DataBatch.state == BatchState.IMPORTING))\
                .filter(DataBatch.updated_at < one_hour_ago)\
                .all()
            pulled_ids = [bid for bid, in pulled_batches]
            batches_to_run.update(pulled_ids)

        for batch in batches_to_run:
            self._executor.submit(self._import_batch, batch)
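ImportHandler exposes a small surface: init(app) binds the Flask app for database sessions, schedule_to_handle() enqueues batch ids, and handle() drains the queue onto the thread pool. A sketch of how a caller might drive it; the batch ids and the periodic trigger are assumptions:

import_handler = ImportHandler()
import_handler.init(app)                     # Flask app, needed for db sessions
import_handler.schedule_to_handle(42)        # a single batch id ...
import_handler.schedule_to_handle([43, 44])  # ... or a list of them
import_handler.handle(pull=True)             # also pulls stale NEW/IMPORTING batches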
Example #8
 def setUp(self):
     self._fm = FileManager()
Example #9
    def __init__(self) -> None:
        self._base_dir = os.path.join(UPLOAD_PATH, 'sparkapp')
        self._file_client = FileManager()

        self._file_client.mkdir(self._base_dir)
Example #10
class SparkAppService(object):
    def __init__(self) -> None:
        self._base_dir = os.path.join(UPLOAD_PATH, 'sparkapp')
        self._file_client = FileManager()

        self._file_client.mkdir(self._base_dir)

    def _clear_and_make_an_empty_dir(self, dir_name: str):
        try:
            self._file_client.remove(dir_name)
        except Exception as err:  # pylint: disable=broad-except
            logging.error('failed to remove %s with exception %s', dir_name,
                          err)
        finally:
            self._file_client.mkdir(dir_name)

    def _get_sparkapp_upload_path(self, name: str) -> Tuple[bool, str]:
        """get upload path for specific sparkapp

        Args:
            name (str): sparkapp name

        Returns:
            Tuple[bool, str]:
                bool: True if this directory already exists
                str:  upload path for this sparkapp

        """
        sparkapp_path = os.path.join(self._base_dir, name)
        existable = False
        try:
            self._file_client.ls(sparkapp_path)
            existable = True
        except ValueError:
            existable = False

        return existable, sparkapp_path

    def _copy_files_to_target_filesystem(self, source_filesystem_path: str,
                                         target_filesystem_path: str) -> bool:
        """ copy files to remote filesystem
            - untar if file is tared
            - copy files to remote filesystem

        Args:
            source_filesystem_path (str): local filesystem
            target_filesystem_path (str): remote filesystem

        Returns:
            bool: whether success
        """
        temp_path = source_filesystem_path
        if '.tar' in source_filesystem_path:
            temp_path = os.path.abspath(
                os.path.join(source_filesystem_path, '../tmp'))
            os.makedirs(temp_path)
            TarCli.untar_file(source_filesystem_path, temp_path)

        for root, dirs, files in os.walk(temp_path):
            relative_path = os.path.relpath(root, temp_path)
            for f in files:
                file_path = os.path.join(root, f)
                remote_file_path = os.path.join(target_filesystem_path,
                                                relative_path, f)
                self._file_client.copy(file_path, remote_file_path)
            for d in dirs:
                remote_dir_path = os.path.join(target_filesystem_path,
                                               relative_path, d)
                self._file_client.mkdir(remote_dir_path)

        return True

    def submit_sparkapp(self, config: SparkAppConfig) -> SparkAppInfo:
        """submit sparkapp

        Args:
            config (SparkAppConfig): sparkapp config

        Raises:
            InternalException: if it fails to get the sparkapp

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        sparkapp_path = config.files_path
        if config.files_path is None:
            _, sparkapp_path = self._get_sparkapp_upload_path(config.name)
            self._clear_and_make_an_empty_dir(sparkapp_path)

            with tempfile.TemporaryDirectory() as temp_dir:
                tar_path = os.path.join(temp_dir, 'files.tar')
                with open(tar_path, 'wb') as fwrite:
                    fwrite.write(config.files)
                self._copy_files_to_target_filesystem(
                    source_filesystem_path=tar_path,
                    target_filesystem_path=sparkapp_path)

        config_dict = config.build_config(sparkapp_path)
        logging.info(f'submit sparkapp, config: {config_dict}')
        resp = k8s_client.create_sparkapplication(config_dict)
        return SparkAppInfo.from_k8s_resp(resp)

    def get_sparkapp_info(self, name: str) -> SparkAppInfo:
        """ get sparkapp info

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        resp = k8s_client.get_sparkapplication(name)
        return SparkAppInfo.from_k8s_resp(resp)

    def delete_sparkapp(self, name: str) -> SparkAppInfo:
        """delete sparkapp
            - delete sparkapp. If failed, raise exception
            - delete the tmp filesystem


        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        existable, sparkapp_path = self._get_sparkapp_upload_path(name)
        if existable:
            self._file_client.remove(sparkapp_path)

        resp = k8s_client.delete_sparkapplication(name)
        sparkapp_info = SparkAppInfo.from_k8s_resp(resp)

        return sparkapp_info
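Putting it together, a hedged sketch of the SparkAppService call flow; the SparkAppConfig construction is schematic and only touches fields referenced above (name, files_path, files), so treat it as an assumption rather than the real API:

service = SparkAppService()
config = SparkAppConfig()
config.name = 'analyzer'            # hypothetical sparkapp name
config.files_path = None            # no pre-uploaded path, so `files` is unpacked to storage
config.files = tar_bytes            # bytes of a tar archive produced by the caller
info = service.submit_sparkapp(config)         # creates the k8s SparkApplication
info = service.get_sparkapp_info('analyzer')
service.delete_sparkapp('analyzer')            # removes uploaded files and the k8s resource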