def upload_zipped_flat_media_files(
    gc: GirderClient,
    manager: JobManager,
    folderId: str,
    working_directory: Path,
    create_subfolder=False,
):
    """Takes a flat folder of media files and/or annotations and generates a dataset from it"""
    listOfFileNames = os.listdir(working_directory)
    validation = gc.sendRestRequest('POST', '/dive_dataset/validate_files', json=listOfFileNames)
    root_folderId = folderId
    default_fps = gc.getFolder(root_folderId).get('meta', {}).get(constants.FPSMarker, -1)
    if validation.get('ok', False):
        manager.write(f"Annotations: {validation['annotations']}\n")
        manager.write(f"Media: {validation['media']}\n")
        dataset_type = validation['type']
        manager.write(f"Type: {dataset_type}\n")
        if create_subfolder:
            sub_folder = gc.createFolder(
                folderId,
                create_subfolder,
                reuseExisting=True,
            )
            root_folderId = str(sub_folder["_id"])
        # Upload all resulting items back into the root folder
        manager.updateStatus(JobStatus.PUSHING_OUTPUT)
        # upload everything from the working directory into the target folder
        gc.upload(f'{working_directory}/*', root_folderId)
        if dataset_type == constants.ImageSequenceType and default_fps == -1:
            default_fps = 1
        gc.addMetadataToFolder(
            str(root_folderId),
            {constants.TypeMarker: dataset_type, constants.FPSMarker: default_fps},
        )
        # After uploading the default files we run the postprocess for video conversion
        gc.sendRestRequest("POST", f"/dive_rpc/postprocess/{str(root_folderId)}")
    else:
        manager.write(f"Message: {validation['message']}\n")
        raise Exception("Could not validate media files")
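# Illustrative usage sketch (not part of the original module): how a job body might call
# upload_zipped_flat_media_files after extracting an uploaded zip. The Girder URL, token,
# destination folder id, zip path, and the _PrintManager stand-in for JobManager are all
# hypothetical placeholders for this example only.
import tempfile
import zipfile
from pathlib import Path

from girder_client import GirderClient


class _PrintManager:
    """Minimal JobManager stand-in for this sketch: reports progress on stdout."""

    def write(self, msg):
        print(msg, end='')

    def updateStatus(self, status):
        print(f'status -> {status}')


def example_upload_flat_zip_sketch():
    gc = GirderClient(apiUrl='http://localhost:8010/api/v1')
    gc.setToken('REPLACE_WITH_A_VALID_TOKEN')  # assumes a token issued elsewhere
    with tempfile.TemporaryDirectory() as tmp:
        working_directory = Path(tmp)
        # extract a flat zip of media/annotation files, then hand the folder to the uploader
        with zipfile.ZipFile('flat_media.zip') as zf:
            zf.extractall(working_directory)
        upload_zipped_flat_media_files(
            gc,
            _PrintManager(),
            folderId='REPLACE_WITH_FOLDER_ID',  # hypothetical destination folder
            working_directory=working_directory,
            create_subfolder='my_new_dataset',
        )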
def test_upload_zip_data(dataset: dict):
    user = zipUser
    client = GirderClient(apiUrl='http://localhost:8010/api/v1')
    client.authenticate(username=user['login'], password=user['password'])
    dsPath = localDataRoot / str(dataset['path'])
    privateFolder = getTestFolder(client)
    newDatasetFolder = client.createFolder(
        privateFolder['_id'],
        dataset['name'],
        metadata={
            'fps': dataset['fps'],
            'type': dataset['type'],
        },
    )
    if Path(dsPath).is_file():
        client.uploadFileToFolder(newDatasetFolder['_id'], str(dsPath))
    client.post(f'dive_rpc/postprocess/{newDatasetFolder["_id"]}')
    wait_for_jobs(client, max_wait_timeout=30, expected_status=dataset['job_status'])
    resultFolder = client.getFolder(newDatasetFolder['_id'])
    # verify sub datasets if they exist
    if dataset.get('subDatasets', False):
        folders = list(client.listFolder(newDatasetFolder['_id']))
        for item in dataset["subDatasets"]:
            matches = [x for x in folders if x["name"] == item["name"]]
            if len(matches) > 0:
                meta = matches[0].get("meta", {})
                assert meta.get("fps", -1) == item["fps"]
                assert meta.get("type", "") == item["type"]
                assert meta.get("annotate", False)
    elif dataset['job_status'] == JobStatus.SUCCESS:
        assert resultFolder['meta'].get("annotate", False)
        assert isinstance(resultFolder['meta'].get("fps"), (int, float))
        assert isinstance(resultFolder['meta'].get("type"), str)
    else:
        assert resultFolder['meta'].get("annotate", None) is None
class CumulusClient():
    '''Application interface to cumulus-based client for HPC systems
    supporting the NEWT API.

    Note: the methods must be called in a specific order!
        create_cluster()
        create_omega3p_script()
        create_job()
        upload_inputs()
        submit_job()

    Then optionally:
        monitor_job()
        download_results()
        release_resources()
    '''

    # ---------------------------------------------------------------------
    def __init__(self, girder_url, newt_sessionid):
        ''' '''
        self._client = None
        self._cluster_id = None
        self._girder_url = girder_url
        self._input_folder_id = None
        self._job_folder_id = None
        self._job_id = None
        self._output_folder_id = None
        self._private_folder_id = None
        self._script_id = None
        self._session = requests.Session()

        # Authenticate with Girder using the NEWT session id
        url = '%s/api/v1/newt/authenticate/%s' % \
            (self._girder_url, newt_sessionid)
        r = self._session.put(url)
        if r.status_code != 200:
            raise HttpError(r.status_code, r.text, r.url, r.request.method)

        # Instantiate Girder client
        url = '%s/api/v1' % self._girder_url
        self._client = GirderClient(apiUrl=url)
        self._client.token = self._session.cookies['girderToken']

        user = self._client.get('user/me')
        # print('user', user)
        user_id = user['_id']
        r = list(self._client.listFolder(user_id, 'user', name='Private'))
        if len(r) != 1:
            raise Exception('Wrong number of Private folders; should be 1, got %s' % len(r))
        self._private_folder_id = r[0]['_id']
        print('private_folder_id', self._private_folder_id)

    # ---------------------------------------------------------------------
    def job_id(self):
        '''Returns current job id (which may be None)
        '''
        return self._job_id

    # ---------------------------------------------------------------------
    def create_cluster(self, machine_name, cluster_name=None):
        ''' '''
        if cluster_name is None:
            user = self._client.get('user/me')
            user_name = user.get('firstName', 'user')
            cluster_name = '%s.%s' % (machine_name, user_name)

        cluster = None
        cluster_list = self._client.get('clusters')
        for extant_cluster in cluster_list:
            if extant_cluster['name'] == cluster_name:
                cluster = extant_cluster
                self._cluster_id = extant_cluster['_id']
                break

        if not cluster:
            body = {
                'config': {
                    'host': machine_name
                },
                'name': cluster_name,
                'type': 'newt'
            }
            r = self._client.post('clusters', data=json.dumps(body))
            self._cluster_id = r['_id']
        print('cluster_id', self._cluster_id)

        # Reset the state of the cluster
        body = {'status': 'created'}
        r = self._client.patch('clusters/%s' % self._cluster_id,
                               data=json.dumps(body))

        # Now test the connection
        r = self._client.put('clusters/%s/start' % self._cluster_id)
        sleeps = 0
        while True:
            time.sleep(1)
            r = self._client.get('clusters/%s/status' % self._cluster_id)
            if r['status'] == 'running':
                break
            elif r['status'] == 'error':
                r = self._client.get('clusters/%s/log' % self._cluster_id)
                print(r)
                raise Exception('ERROR creating cluster')

            if sleeps > 9:
                raise Exception('Cluster never moved into running state')
            sleeps += 1

    # ---------------------------------------------------------------------
    def create_omega3p_script(self, omega3p_filename, name=None, number_of_tasks=1):
        '''Creates script to submit omega3p job
        '''
        command = 'srun -n %s /project/projectdirs/ace3p/{{machine}}/omega3p %s' % \
            (number_of_tasks, omega3p_filename)
        if name is None:
            name = omega3p_filename

        body = {'commands': [command], 'name': name}
        r = self._client.post('scripts', data=json.dumps(body))
        self._script_id = r['_id']
        print('script_id', self._script_id)

    # ---------------------------------------------------------------------
    def create_input(self, input_paths, folder_name='input_files'):
        '''DEPRECATED Uploads input files
        '''
        folder_id = self.get_folder(self._private_folder_id, folder_name)
        if folder_id is None:
            return
        print('input_folder_id', folder_id)
        self._input_folder_id = folder_id

        def upload_file(path):
            name = os.path.basename(path)
            size = os.path.getsize(path)
            with open(path, 'rb') as fp:
                self._client.uploadFile(
                    self._input_folder_id, fp, name, size, parentType='folder')

        for input_path in input_paths:
            if not input_path or not os.path.exists(input_path):
                raise Exception('Input file not found: %s' % input_path)
            upload_file(input_path)

    # ---------------------------------------------------------------------
    def create_output_folder(self, folder_name='output_files'):
        '''DEPRECATED
        '''
        folder_id = self.get_folder(self._private_folder_id, folder_name)
        print('output_folder_id', folder_id)
        self._output_folder_id = folder_id

    # ---------------------------------------------------------------------
    def create_job(self, job_name, tail=None):
        ''' '''
        # Create job folders
        folder_name = uuid.uuid4().hex  # unique name
        self._job_folder_id = self.get_folder(self._private_folder_id, folder_name)
        print('Created job folder', folder_name)

        self._input_folder_id = self.get_folder(self._job_folder_id, 'input_files')
        self._output_folder_id = self.get_folder(self._job_folder_id, 'output_files')

        # Make sure job_name isn't null
        if not job_name:
            job_name = 'CumulusJob'

        # Create job spec
        body = {
            'name': job_name,
            'scriptId': self._script_id,
            'output': [{
                'folderId': self._output_folder_id,
                'path': '.'
            }],
            'input': [{
                'folderId': self._input_folder_id,
                'path': '.'
            }]
        }
        if tail:
            body['output'].append({"path": tail, "tail": True})

        job = self._client.post('jobs', data=json.dumps(body))
        self._job_id = job['_id']
        print('Created job_id', self._job_id)

    # ---------------------------------------------------------------------
    def upload_inputs(self, input_paths):
        '''Uploads input files to input folder
        '''
        if not self._input_folder_id:
            raise Exception('Input folder missing')

        def upload_file(path):
            name = os.path.basename(path)
            size = os.path.getsize(path)
            with open(path, 'rb') as fp:
                self._client.uploadFile(
                    self._input_folder_id, fp, name, size, parentType='folder')

        for input_path in input_paths:
            if not input_path or not os.path.exists(input_path):
                raise Exception('Input file not found: %s' % input_path)
            upload_file(input_path)

    # ---------------------------------------------------------------------
    def submit_job(self, machine, project_account, timeout_minutes,
                   queue='debug', qos=None, number_of_nodes=1, job_output_dir=None):
        ''' '''
        body = {
            'machine': machine,
            'account': project_account,
            'numberOfNodes': number_of_nodes,
            'maxWallTime': {
                'hours': 0,
                'minutes': timeout_minutes,
                'seconds': 0
            },
            'queue': queue,
        }
        if qos:
            body['qualityOfService'] = qos
        # print('jobOutputDir', job_output_dir)
        if job_output_dir:
            body['jobOutputDir'] = job_output_dir
            print('Setting jobOutputDir', job_output_dir)

        url = 'clusters/%s/job/%s/submit' % (self._cluster_id, self._job_id)
        self._client.put(url, data=json.dumps(body))
        print('Submitted job', self._job_id)

    # ---------------------------------------------------------------------
    def monitor_job(self, timeout_minutes=60, tail=None):
        '''Periodically monitors job status
        '''
        log_offset = 0
        job_timeout = 60 * timeout_minutes
        start = time.time()
        while True:
            time.sleep(2)
            # Provide some feedback at startup
            if log_offset == 0:
                sys.stdout.write('.')
            # print('Checking status')
            r = self._client.get('jobs/%s' % self._job_id)
            # print(r)
            if r['status'] in ['error', 'unexpectederror']:
                r = self._client.get('jobs/%s/log' % self._job_id)
                raise Exception(str(r))
            elif r['status'] == 'complete':
                break

            # Tail log file
            if tail:
                params = {'offset': log_offset, 'path': tail}
                # print('Checking tail')
                r = self._client.get('jobs/%s/output' % self._job_id, parameters=params)
                # print(r)
                output = r['content']
                if output and log_offset == 0:
                    print()  # end the user feedback dots
                log_offset += len(output)
                for line in output:
                    print(line)
                sys.stdout.flush()

            if time.time() - start > job_timeout:
                raise Exception('Job timeout')

    # ---------------------------------------------------------------------
    def download_results(self, destination_folder):
        '''Downloads all output files to a local directory
        '''
        if not os.path.exists(destination_folder):
            os.makedirs(destination_folder)

        self._client.downloadFolderRecursive(self._output_folder_id, destination_folder)
        print('Downloaded files to %s' % destination_folder)

    # ---------------------------------------------------------------------
    def release_resources(self):
        '''Closes/deletes any current resources
        '''
        resource_info = {
            'clusters': [self._cluster_id],
            'jobs': [self._job_id],
            'scripts': [self._script_id],
            'folder': [self._job_folder_id]
        }
        for resource_type, id_list in resource_info.items():
            for resource_id in id_list:
                if resource_id is not None:
                    url = '%s/%s' % (resource_type, resource_id)
                    self._client.delete(url)

        self._input_folder_id = None
        self._job_folder_id = None
        self._job_id = None
        self._output_folder_id = None
        self._script_id = None

    # ---------------------------------------------------------------------
    def get_folder(self, parent_id, name):
        '''Returns folder_id, creating one if needed
        '''
        # Check if folder already exists
        folder_list = list(self._client.listFolder(parent_id, name=name))
        if folder_list:
            folder = folder_list[0]
            # print('found folder %s: %s' % (name, str(folder)))
            return folder['_id']

        # (else) create the folder
        try:
            r = self._client.createFolder(parent_id, name)
            return r['_id']
        except HttpError as e:
            print(e.responseText)
            return None
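# Illustrative driver sketch (not part of the original class): exercises CumulusClient in the
# order its docstring requires. The Girder URL, NEWT session id, machine name, project account,
# and input file names below are hypothetical placeholders for this example only.
def run_omega3p_example_sketch():
    client = CumulusClient('https://girder.example.com', 'REPLACE_WITH_NEWT_SESSIONID')
    client.create_cluster('cori')  # machine name is an assumption
    client.create_omega3p_script('omega3p.in', number_of_tasks=32)
    client.create_job('Omega3PExample', tail='omega3p.out')
    client.upload_inputs(['./omega3p.in', './pillbox.ncdf'])  # hypothetical input files
    client.submit_job('cori', 'm0000', timeout_minutes=30, queue='debug')
    # Optional follow-up steps:
    client.monitor_job(timeout_minutes=30, tail='omega3p.out')
    client.download_results('./omega3p_results')
    client.release_resources()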
def upload_exported_zipped_dataset(
    gc: GirderClient,
    manager: JobManager,
    folderId: str,
    working_directory: Path,
    create_subfolder='',
):
    """Uploads a folder that is generated from the export of a zip file and sets metadata"""
    listOfFileNames = os.listdir(working_directory)
    potential_meta_files = list(filter(constants.metaRegex.match, listOfFileNames))
    if len(potential_meta_files) == 0:
        manager.write("Could not find meta.json or config.json file within the subdirectory\n")
        return
    print(listOfFileNames)
    # load meta.json to get datatype and verify list of files
    meta = {}
    for meta_name in potential_meta_files:
        with open(f"{working_directory}/{meta_name}") as f:
            meta = json.load(f)
    dataset_type = meta[constants.TypeMarker]
    if dataset_type == constants.ImageSequenceType:
        imageData = meta['imageData']
        for image in imageData:
            if image["filename"] not in listOfFileNames:
                manager.write(f"Could not find {image['filename']} file within the list of files\n")
                return
    elif dataset_type == constants.VideoType:
        video = meta["video"]
        if video["filename"] not in listOfFileNames:
            manager.write(f"Could not find {video['filename']} file within the list of files\n")
            return
    # remove the auxiliary directory so we don't have to tag them all
    if constants.AuxiliaryFolderName in listOfFileNames and os.path.isdir(
            f'{working_directory}/{constants.AuxiliaryFolderName}'):
        shutil.rmtree(f'{working_directory}/{constants.AuxiliaryFolderName}')
    root_folderId = folderId
    if create_subfolder != '':
        sub_folder = gc.createFolder(
            folderId,
            create_subfolder,
            reuseExisting=True,
        )
        root_folderId = str(sub_folder['_id'])
    manager.updateStatus(JobStatus.PUSHING_OUTPUT)
    # upload everything from the working directory into the target folder
    gc.upload(f'{working_directory}/*', root_folderId)
    # Now we set all the metadata for the folders and items
    all_files = list(gc.listItem(root_folderId))
    root_meta = {
        "type": dataset_type,
        "attributes": meta.get("attributes", None),
        "customTypeStyling": meta.get("customTypeStyling", None),
        "confidenceFilters": meta.get("confidenceFilters", None),
        "fps": meta["fps"],
        "version": meta["version"],
    }
    if dataset_type == constants.VideoType:
        # set transcoded and non-transcoded versions
        transcoded_video = list(gc.listItem(root_folderId, name=video["filename"]))
        if len(transcoded_video) == 1:
            ffprobe = meta["ffprobe_info"]
            avgFpsString = ffprobe["avg_frame_rate"]
            dividend, divisor = [int(v) for v in avgFpsString.split('/')]
            originalFps = dividend / divisor
            transcoded_metadata = {
                "codec": "h264",
                "originalFps": originalFps,
                "originalFpsString": avgFpsString,
                "source_video": False,
                "transcoder": "ffmpeg",
            }
            gc.addMetadataToItem(str(transcoded_video[0]['_id']), transcoded_metadata)
            # other video is tagged as the source video
            for item in all_files:
                if (item["name"].endswith(tuple(constants.validVideoFormats))
                        and item["name"] != video["filename"]):
                    source_metadata = {
                        "codec": ffprobe["codec_name"],
                        "originalFps": originalFps,
                        "originalFpsString": avgFpsString,
                        "source_video": True,
                    }
                    gc.addMetadataToItem(str(item['_id']), source_metadata)
            root_meta["originalFps"] = originalFps
            root_meta["originalFpsString"] = avgFpsString
    # Need to tag folder level data (annotate, and others)
    root_meta[constants.DatasetMarker] = True
    gc.addMetadataToFolder(root_folderId, root_meta)
    gc.post(f'dive_rpc/postprocess/{root_folderId}', data={"skipJobs": True})
class BaseIntegrationTest(unittest.TestCase):
    def __init__(self, name, girder_url, girder_user, girder_password,
                 job_timeout=60, cleanup=True):
        super(BaseIntegrationTest, self).__init__(name)
        self._job_id = None
        self._script_id = None
        self._output_folder_id = None
        self._input_folder_id = None
        self._girder_url = girder_url
        self._girder_user = girder_user
        self._girder_password = girder_password
        self._job_timeout = job_timeout
        self._data = 'Need more input!'
        self._cleanup = cleanup

    def setUp(self):
        url = '%s/api/v1' % self._girder_url
        self._client = GirderClient(apiUrl=url)
        self._client.authenticate(self._girder_user, self._girder_password)

        user = self._client.get('user/me')
        self._user_id = user['_id']
        r = list(self._client.listFolder(self._user_id, 'user', name='Private'))
        self.assertEqual(len(r), 1)
        self._private_folder_id = r[0]['_id']

    def tearDown(self):
        if not self._cleanup:
            return

        if self._job_id:
            try:
                url = 'jobs/%s' % self._job_id
                self._client.delete(url)
            except Exception:
                traceback.print_exc()

        if self._script_id:
            try:
                url = 'scripts/%s' % self._script_id
                self._client.delete(url)
            except Exception:
                traceback.print_exc()

        if self._output_folder_id:
            try:
                url = 'folder/%s' % self._output_folder_id
                self._client.delete(url)
            except Exception:
                traceback.print_exc()

        if self._input_folder_id:
            try:
                url = 'folder/%s' % self._input_folder_id
                self._client.delete(url)
            except Exception:
                traceback.print_exc()

    def create_script(self, commands=['sleep 10', 'cat CumulusIntegrationTestInput']):
        body = {'commands': commands, 'name': 'CumulusIntegrationTestLob'}
        r = self._client.post('scripts', data=json.dumps(body))
        self._script_id = r['_id']

    def create_input(self, folder_name='CumulusInput'):
        r = self._client.createFolder(self._private_folder_id, folder_name)
        self._input_folder_id = r['_id']

        size = len(self._data)
        item = self._client.uploadFile(self._input_folder_id, StringIO(self._data),
                                       'CumulusIntegrationTestInput', size,
                                       parentType='folder')
        self._item_id = item['itemId']

    def create_output_folder(self, folder_name='CumulusOutput'):
        r = self._client.createFolder(self._private_folder_id, folder_name)
        self._output_folder_id = r['_id']

    def create_job(self, job_name='CumulusIntegrationTestJob', tail=None):
        body = {
            'name': job_name,
            'scriptId': self._script_id,
            'output': [{
                'folderId': self._output_folder_id,
                'path': '.'
            }],
            'input': [{
                'folderId': self._input_folder_id,
                'path': '.'
            }]
        }
        if tail:
            body['output'].append({"path": tail, "tail": True})

        job = self._client.post('jobs', data=json.dumps(body))
        self._job_id = job['_id']

    def submit_job(self, job_params={}, timeout=None):
        if timeout is None:
            timeout = self._job_timeout
        url = 'clusters/%s/job/%s/submit' % (self._cluster_id, self._job_id)
        self._client.put(url, data=json.dumps(job_params))
        start = time.time()
        while True:
            time.sleep(1)
            r = self._client.get('jobs/%s' % self._job_id)
            if r['status'] in ['error', 'unexpectederror']:
                r = self._client.get('jobs/%s/log' % self._job_id)
                self.fail(str(r))
            elif r['status'] == 'complete':
                break

            if time.time() - start > timeout:
                self.fail('Job didn\'t complete in timeout')

    def assert_output(self):
        r = list(self._client.listItem(self._output_folder_id))
        self.assertEqual(len(r), 4)

        stdout_item = None
        for i in r:
            if i['name'].startswith('CumulusIntegrationTestJob-%s.o' % self._job_id):
                stdout_item = i
                break
        self.assertIsNotNone(stdout_item)

        r = self._client.get('item/%s/files' % stdout_item['_id'])
        self.assertEqual(len(r), 1)

        path = os.path.join(tempfile.gettempdir(), self._job_id)
        try:
            self._client.downloadFile(r[0]['_id'], path)
            with open(path, 'rb') as fp:
                self.assertEqual(fp.read().decode(), self._data)
        finally:
            if os.path.exists(path):
                os.remove(path)
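# Illustrative subclass sketch (not part of the original suite): shows how the helper methods
# above are typically chained into one end-to-end test. The cluster id is a hypothetical
# placeholder; a real subclass would set self._cluster_id from whatever cluster-creation step
# it performs, and the suite would construct the test with the constructor arguments
# BaseIntegrationTest requires (name, girder_url, girder_user, girder_password).
class ExampleTraditionalClusterTest(BaseIntegrationTest):
    def setUp(self):
        super(ExampleTraditionalClusterTest, self).setUp()
        self._cluster_id = 'REPLACE_WITH_EXISTING_CLUSTER_ID'  # assumed to exist already

    def test_run_script(self):
        self.create_script()
        self.create_input()
        self.create_output_folder()
        self.create_job(tail='CumulusIntegrationTestInput')
        self.submit_job()  # falls back to self._job_timeout
        self.assert_output()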