class FileManagerTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        os.environ['CUSTOMIZED_FILE_MANAGER'] = \
            'testing.fake_file_manager:FakeFileManager'

    @classmethod
    def tearDownClass(cls):
        del os.environ['CUSTOMIZED_FILE_MANAGER']

    def setUp(self):
        self._fm = FileManager()

    def test_can_handle(self):
        self.assertTrue(self._fm.can_handle('fake://123'))
        # Falls back to default manager
        self.assertTrue(self._fm.can_handle('/data/123'))
        self.assertFalse(self._fm.can_handle('hdfs://123'))

    def test_ls(self):
        self.assertEqual(self._fm.ls('fake://data'), ['fake://data/f1.txt'])

    def test_move(self):
        self.assertTrue(self._fm.move('fake://move/123', 'fake://move/234'))
        self.assertFalse(
            self._fm.move('fake://do_not_move/123', 'fake://move/234'))
        # No file manager can handle this
        self.assertRaises(RuntimeError,
                          lambda: self._fm.move('hdfs://123', 'fake://abc'))

    def test_remove(self):
        self.assertTrue(self._fm.remove('fake://remove/123'))
        self.assertFalse(self._fm.remove('fake://do_not_remove/123'))
        # No file manager can handle this
        self.assertRaises(RuntimeError,
                          lambda: self._fm.remove('hdfs://123'))
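# Illustrative sketch (not the real testing/fake_file_manager.py): a minimal
# fake that would satisfy the expectations encoded in the tests above. The
# real fake is loaded through the CUSTOMIZED_FILE_MANAGER environment
# variable as 'testing.fake_file_manager:FakeFileManager'; its base class
# and exact behavior are assumptions here.
class FakeFileManagerSketch(object):
    def can_handle(self, path):
        # Only claims fake:// paths; '/data/...' falls back to the default
        # manager and 'hdfs://...' is handled by no manager at all.
        return path.startswith('fake://')

    def ls(self, path, recursive=False):
        return [f'{path}/f1.txt']

    def move(self, source, destination):
        # Refuses to move anything under do_not_move, as the test expects.
        return not source.startswith('fake://do_not_move')

    def remove(self, path):
        return not path.startswith('fake://do_not_remove')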
class FilesApi(Resource):
    def __init__(self):
        self._file_manager = FileManager()

    def get(self):
        # TODO: consider the security factor
        if 'directory' in request.args:
            directory = request.args['directory']
        else:
            directory = os.path.join(
                current_app.config.get('STORAGE_ROOT'), 'upload')
        files = self._file_manager.ls(directory, recursive=True)
        return {'data': [dict(file._asdict()) for file in files]}
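# Usage sketch (assumption): FilesApi is a flask_restful.Resource, so it is
# typically mounted on an Api instance. The '/api/v2/files' route and the
# STORAGE_ROOT value below are hypothetical, not taken from the original code.
def create_files_app():
    from flask import Flask
    from flask_restful import Api

    app = Flask(__name__)
    app.config['STORAGE_ROOT'] = '/data'  # hypothetical storage root
    api = Api(app)
    api.add_resource(FilesApi, '/api/v2/files')
    return app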
class DatasetService(object):
    def __init__(self, session: Session):
        self._session = session
        self._file_manager = FileManager()

    def get_dataset_preview(self, dataset_id: int = 0) -> dict:
        dataset = self._session.query(Dataset).filter(
            Dataset.id == dataset_id).first()
        if not dataset:
            raise NotFoundException(f'Failed to find dataset: {dataset_id}')
        dataset_path = dataset.path
        # meta is generated from sparkapp/pipeline/analyzer.py
        meta_path = dataset_meta_path(dataset_path)
        # data format:
        # {
        #   'dtypes': {
        #     'f01': 'bigint'
        #   },
        #   'samples': [
        #     [1],
        #     [0],
        #   ],
        #   'metrics': {
        #     'f01': {
        #       'count': '2',
        #       'mean': '0.0015716767309123998',
        #       'stddev': '0.03961485047808605',
        #       'min': '0',
        #       'max': '1',
        #       'missing_count': '0'
        #     }
        #   }
        # }
        val = {}
        try:
            val = json.loads(self._file_manager.read(meta_path))
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read meta file, path: {meta_path}, err: {e}')
            return {}
        # feature is generated from sparkapp/pipeline/analyzer.py
        feature_path = dataset_features_path(dataset_path)
        try:
            val['metrics'] = json.loads(self._file_manager.read(feature_path))
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read feature file, path: {feature_path}, err: {e}')
        return val

    def feature_metrics(self, name: str, dataset_id: int = 0) -> dict:
        dataset = self._session.query(Dataset).filter(
            Dataset.id == dataset_id).first()
        if not dataset:
            raise NotFoundException(f'Failed to find dataset: {dataset_id}')
        dataset_path = dataset.path
        feature_path = dataset_features_path(dataset_path)
        # data format:
        # {
        #   'name': 'f01',
        #   'metrics': {
        #     'count': '2',
        #     'mean': '0.0015716767309123998',
        #     'stddev': '0.03961485047808605',
        #     'min': '0',
        #     'max': '1',
        #     'missing_count': '0'
        #   },
        #   'hist': {
        #     'x': [0.0, 0.1, 0.2, 0.30000000000000004, 0.4, 0.5,
        #           0.6000000000000001, 0.7000000000000001, 0.8, 0.9, 1],
        #     'y': [12070, 0, 0, 0, 0, 0, 0, 0, 0, 19]
        #   }
        # }
        val = {}
        try:
            feature_data = json.loads(self._file_manager.read(feature_path))
            val['name'] = name
            val['metrics'] = feature_data.get(name, {})
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read feature file, path: {feature_path}, err: {e}')
        # hist is generated from sparkapp/pipeline/analyzer.py
        hist_path = dataset_hist_path(dataset_path)
        try:
            hist_data = json.loads(self._file_manager.read(hist_path))
            val['hist'] = hist_data.get(name, {})
        except Exception as e:  # pylint: disable=broad-except
            logging.info(
                f'failed to read hist file, path: {hist_path}, err: {e}')
        return val

    def get_datasets(self, project_id: int = 0) -> List[Dataset]:
        q = self._session.query(Dataset).order_by(Dataset.created_at.desc())
        if project_id > 0:
            q = q.filter(Dataset.project_id == project_id)
        return q.all()
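# Usage sketch (assumption): DatasetService only needs a SQLAlchemy session,
# so a caller can read the preview and per-feature metrics like this. The
# dataset id and the feature name 'f01' are hypothetical examples.
def log_dataset_summary(session: Session, dataset_id: int):
    service = DatasetService(session)
    preview = service.get_dataset_preview(dataset_id)          # dtypes/samples/metrics
    f01_metrics = service.feature_metrics('f01', dataset_id)   # metrics + hist for one column
    logging.info('preview keys: %s', list(preview.keys()))
    logging.info('f01 metrics: %s', f01_metrics.get('metrics'))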
class ImportHandler(object):
    def __init__(self):
        self._executor = ThreadPoolExecutor(max_workers=os.cpu_count() * 3)
        self._file_manager = FileManager()
        self._pending_imports = set()
        self._running_imports = set()
        self._import_lock = threading.Lock()
        self._app = None

    def __del__(self):
        self._executor.shutdown()

    def init(self, app):
        self._app = app

    def schedule_to_handle(self, dataset_batch_ids):
        if isinstance(dataset_batch_ids, int):
            dataset_batch_ids = [dataset_batch_ids]
        self._pending_imports.update(dataset_batch_ids)

    def _copy_file(self,
                   source_path,
                   destination_path,
                   move=False,
                   num_retry=3):
        logging.info('%s from %s to %s', 'moving' if move else 'copying',
                     source_path, destination_path)
        # Creates parent folders if needed
        parent_folder = os.path.dirname(destination_path)
        self._file_manager.mkdir(parent_folder)
        success = False
        error_message = ''
        for _ in range(num_retry):
            try:
                if move:
                    success = self._file_manager.move(
                        source_path, destination_path)
                else:
                    success = self._file_manager.copy(
                        source_path, destination_path)
                if not success:
                    error_message = 'Unknown error'
                else:
                    break
            except Exception as e:  # pylint: disable=broad-except
                logging.error(
                    'Error occurred when importing file from %s to %s',
                    source_path, destination_path)
                error_message = str(e)
        file = dataset_pb2.File(
            source_path=source_path,
            destination_path=destination_path
        )
        if not success:
            file.error_message = error_message
            file.state = dataset_pb2.File.State.FAILED
        else:
            file.size = self._file_manager.ls(destination_path)[0].size
            file.state = dataset_pb2.File.State.COMPLETED
        return file

    def _import_batch(self, batch_id):
        # Uses the lock as a context manager so it is released even when the
        # batch is already being imported and we bail out early.
        with self._import_lock:
            if batch_id in self._running_imports:
                return
            self._running_imports.add(batch_id)

        # Pushes app context to make db session work
        self._app.app_context().push()

        logging.info('Importing batch %d', batch_id)
        batch = DataBatch.query.get(batch_id)
        batch.state = BatchState.IMPORTING
        db.session.commit()
        db.session.refresh(batch)

        details = batch.get_details()
        for file in details.files:
            if file.state == dataset_pb2.File.State.UNSPECIFIED:
                # Recovers the state
                destination_existed = len(
                    self._file_manager.ls(file.destination_path)) > 0
                if destination_existed:
                    file.state = dataset_pb2.File.State.COMPLETED
                    continue
                # Moves/Copies
                file.MergeFrom(self._copy_file(
                    source_path=file.source_path,
                    destination_path=file.destination_path,
                    move=batch.move))
        batch.set_details(details)
        db.session.commit()

        with self._import_lock:
            self._running_imports.remove(batch_id)

    def handle(self, pull=False):
        """Handles all the batches in the queue or all batches which
        should be imported."""
        batches_to_run = self._pending_imports
        self._pending_imports = set()
        if pull:
            # TODO: should separate pull logic to a cron job,
            # otherwise there will be a race condition that two handlers
            # are trying to move the same batch
            one_hour_ago = datetime.utcnow() - timedelta(hours=1)
            pulled_batches = db.session.query(DataBatch.id).filter(
                (DataBatch.state == BatchState.NEW) |
                (DataBatch.state == BatchState.IMPORTING))\
                .filter(DataBatch.updated_at < one_hour_ago)\
                .all()
            pulled_ids = [bid for bid, in pulled_batches]
            batches_to_run.update(pulled_ids)

        for batch in batches_to_run:
            self._executor.submit(self._import_batch, batch)
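# Usage sketch (assumption): the handler is initialized once with the Flask
# app so that worker threads can push an app context, then batch ids are
# queued and drained. The function name and the caller that invokes it
# periodically are hypothetical.
def start_import_cycle(app, batch_ids):
    handler = ImportHandler()
    handler.init(app)                     # app: the Flask application object
    handler.schedule_to_handle(batch_ids)  # DataBatch ids to import
    handler.handle(pull=True)             # also pulls stale NEW/IMPORTING batches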
class SparkAppService(object):
    def __init__(self) -> None:
        self._base_dir = os.path.join(UPLOAD_PATH, 'sparkapp')
        self._file_client = FileManager()
        self._file_client.mkdir(self._base_dir)

    def _clear_and_make_an_empty_dir(self, dir_name: str):
        try:
            self._file_client.remove(dir_name)
        except Exception as err:  # pylint: disable=broad-except
            logging.error('failed to remove %s with exception %s',
                          dir_name, err)
        finally:
            self._file_client.mkdir(dir_name)

    def _get_sparkapp_upload_path(self, name: str) -> Tuple[bool, str]:
        """Gets the upload path for a specific sparkapp.

        Args:
            name (str): sparkapp name

        Returns:
            Tuple[bool, str]:
                bool: True if this directory already exists
                str: upload path for this sparkapp
        """
        sparkapp_path = os.path.join(self._base_dir, name)
        existable = False
        try:
            self._file_client.ls(sparkapp_path)
            existable = True
        except ValueError:
            existable = False
        return existable, sparkapp_path

    def _copy_files_to_target_filesystem(self, source_filesystem_path: str,
                                         target_filesystem_path: str) -> bool:
        """Copies files to the remote filesystem.

        - untars the archive if the file is tarred
        - copies files to the remote filesystem

        Args:
            source_filesystem_path (str): local filesystem path
            target_filesystem_path (str): remote filesystem path

        Returns:
            bool: whether the copy succeeded
        """
        temp_path = source_filesystem_path
        if source_filesystem_path.find('.tar') != -1:
            temp_path = os.path.abspath(
                os.path.join(source_filesystem_path, '../tmp'))
            os.makedirs(temp_path)
            TarCli.untar_file(source_filesystem_path, temp_path)
        for root, dirs, files in os.walk(temp_path):
            relative_path = os.path.relpath(root, temp_path)
            for f in files:
                file_path = os.path.join(root, f)
                remote_file_path = os.path.join(target_filesystem_path,
                                                relative_path, f)
                self._file_client.copy(file_path, remote_file_path)
            for d in dirs:
                remote_dir_path = os.path.join(target_filesystem_path,
                                               relative_path, d)
                self._file_client.mkdir(remote_dir_path)
        return True

    def submit_sparkapp(self, config: SparkAppConfig) -> SparkAppInfo:
        """Submits a sparkapp.

        Args:
            config (SparkAppConfig): sparkapp config

        Raises:
            InternalException: if it fails to get the sparkapp

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        sparkapp_path = config.files_path
        if config.files_path is None:
            _, sparkapp_path = self._get_sparkapp_upload_path(config.name)
            self._clear_and_make_an_empty_dir(sparkapp_path)

            with tempfile.TemporaryDirectory() as temp_dir:
                tar_path = os.path.join(temp_dir, 'files.tar')
                with open(tar_path, 'wb') as fwrite:
                    fwrite.write(config.files)
                self._copy_files_to_target_filesystem(
                    source_filesystem_path=tar_path,
                    target_filesystem_path=sparkapp_path)

        config_dict = config.build_config(sparkapp_path)
        logging.info(f'submit sparkapp, config: {config_dict}')
        resp = k8s_client.create_sparkapplication(config_dict)
        return SparkAppInfo.from_k8s_resp(resp)

    def get_sparkapp_info(self, name: str) -> SparkAppInfo:
        """Gets sparkapp info.

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        resp = k8s_client.get_sparkapplication(name)
        return SparkAppInfo.from_k8s_resp(resp)

    def delete_sparkapp(self, name: str) -> SparkAppInfo:
        """Deletes a sparkapp.

        - deletes the sparkapp; raises an exception on failure
        - removes the temporary upload directory on the filesystem

        Args:
            name (str): sparkapp name

        Raises:
            WebConsoleApiException

        Returns:
            SparkAppInfo: resp of sparkapp
        """
        existable, sparkapp_path = self._get_sparkapp_upload_path(name)
        if existable:
            self._file_client.remove(sparkapp_path)
        resp = k8s_client.delete_sparkapplication(name)
        sparkapp_info = SparkAppInfo.from_k8s_resp(resp)
        return sparkapp_info
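# Usage sketch (assumption): submitting a sparkapp whose files arrive as an
# in-memory tar archive. Only the SparkAppConfig fields read by
# submit_sparkapp above (name, files, files_path) matter here; how the
# config object is constructed is left to the caller, and this wrapper
# function itself is hypothetical.
def submit_uploaded_sparkapp(service: SparkAppService,
                             config: SparkAppConfig) -> SparkAppInfo:
    # With files_path unset, submit_sparkapp uploads config.files (a tar
    # archive) under <UPLOAD_PATH>/sparkapp/<config.name> before creating
    # the SparkApplication through k8s_client.
    info = service.submit_sparkapp(config)
    logging.info('submitted sparkapp %s', config.name)
    return info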