def create_directory(dir_data, session):
    working_dir = WorkingDir()
    working_dir.user_id = dir_data['user'].id
    working_dir.project_id = dir_data['project'].id

    if dir_data.get('jobs_to_sync'):
        working_dir.jobs_to_sync = dir_data.get('jobs_to_sync')

    session.add(working_dir)
    regular_methods.commit_with_rollback(session)

    if dir_data.get('files'):
        file_list = dir_data.get('files')
        for file in file_list:
            WorkingDirFileLink.add(session, working_dir.id, file)
        regular_methods.commit_with_rollback(session)

    return working_dir
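# Usage sketch (not part of the original module): shows the dir_data shape
# create_directory() expects, derived from the lookups above. `user`,
# `project`, and `files` are assumed to be existing ORM objects; the
# 'jobs_to_sync' value is stored as-is and its shape is not defined here.
def example_create_directory(session, user, project, files=None):
    dir_data = {
        'user': user,          # object with .id
        'project': project,    # object with .id
        'files': files,        # optional list; each gets a WorkingDirFileLink
        # 'jobs_to_sync': ...  # optional
    }
    return create_directory(dir_data, session)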
def directory_not_equal_to_status(session,
                                  directory_id,
                                  status="success",
                                  return_type="count"):
    """
    Returns 0 if every file in the directory has the given status,
    otherwise returns the count (or objects) of files whose Input
    status != status.
    """
    file_link_sub_query = WorkingDirFileLink.get_sub_query(
        session, directory_id)
    assert file_link_sub_query is not None

    # TODO should we exclude failed ones optionally?
    # We could do status not in list [failed_flag, success] etc.

    query = session.query(Input).filter(
        Input.file_id == file_link_sub_query.c.file_id,
        Input.status != status,
        Input.archived != True)

    if return_type == "count":
        return query.count()

    if return_type == "objects":
        return query.all()
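# Usage sketch (assumes this lives on the Input class, matching how it is
# called elsewhere in this file as Input.directory_not_equal_to_status):
# check whether a directory still has files processing before launch.
def example_directory_is_ready(session, directory_id):
    pending_count = Input.directory_not_equal_to_status(
        session=session,
        directory_id=directory_id,
        status="success",
        return_type="count")
    return pending_count == 0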
def create_label_file(label_file_data, session):
    label_file = File()
    label_file.label = label_file_data.get('label')
    label_file.label_id = label_file_data.get('label').id
    label_file.project_id = label_file_data['project_id']
    label_file.state = label_file_data.get('state', 'added')
    label_file.type = 'label'

    session.add(label_file)
    regular_methods.commit_with_rollback(session)

    project = Project.get_by_id(session, label_file.project_id)
    if project:
        WorkingDirFileLink.add(session, project.directory_default_id, label_file)
        project.refresh_label_dict(session)

    session.add(label_file)
    regular_methods.commit_with_rollback(session)

    return label_file
def refresh_label_dict(self, session):
    file_list = WorkingDirFileLink.file_list(
        session=session,
        working_dir_id=self.directory_default_id,
        limit=10000000,
        type="label",
        exclude_removed=False)  # eg for permissions

    if not self.label_dict:
        self.label_dict = {}

    self.label_dict['label_file_id_list'] = [file.id for file in file_list]
def build_name_to_file_id_dict(session, directory_id):
    directory_id = int(directory_id)

    sub_query = WorkingDirFileLink.get_sub_query(
        session=session,
        working_dir_id=directory_id,
        type="label")

    # Could also try and merge on labels, or just filter through here
    file_list = session.query(File).filter(
        File.id == sub_query.c.file_id,
        File.state != "removed").all()

    out = {}
    for file in file_list:
        out[file.label.name] = file.id

    return out, True
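# Usage sketch: the function returns a (dict, success) tuple, so callers
# unpack both values. `directory` is assumed to be an existing WorkingDir row.
def example_lookup_label_file_id(session, directory, label_name):
    name_to_file_id, success = build_name_to_file_id_dict(
        session, directory.id)
    if not success:
        return None
    return name_to_file_id.get(label_name)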
def update_file_count_statistic(self, session):
    """
    In theory we could count each file as it gets added,
    but that seems prone to off-by-one errors in a distributed
    systems context. Instead we just query the count and update it here.

    Slight problem is that this statistic could be stale if a file
    doesn't get removed properly from the job...
    """
    self.file_count_statistic = WorkingDirFileLink.file_list(
        session=session,
        working_dir_id=self.directory_id,
        counts_only=True,
        limit=None)

    session.add(self)
def regenerate_preview_file_list(self):
    """
    Returns an empty array if there are no files,
    otherwise an array of serialized preview files.

    Treating this like a "cache", so we can query it to get new ones.
    Thought process:
        a) We may want to use the files for other things
        b) Not clear if we want to use the preview image or not
        c) More work to try and parse it into URLs only upfront, and not
           clear of the benefit since we aren't actually storing that much
           data, and we may *want* to get a preview of instances or
           something else too...

    Assumes: uses self.session and the default directory
    """
    preview_file_list = []

    if self.session is None:
        return preview_file_list

    file_list = WorkingDirFileLink.file_list(
        session=self.session,
        working_dir_id=self.directory_default_id,
        limit=3,
        root_files_only=True)  # Excludes labels at time of writing

    if not file_list:
        return preview_file_list

    for file in file_list:
        preview_file_list.append(file.serialize_with_type(self.session))

    return preview_file_list
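# Usage sketch: regenerate_preview_file_list() is an instance method (it uses
# self.session and self.directory_default_id, so the owner is assumed to be a
# Project-like row with an attached session); the result is a list of
# already-serialized file dicts suitable for an API response.
def example_project_previews(project):
    return {'preview_file_list': project.regenerate_preview_file_list()}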
def new_external_export(session,
                        project,
                        export_id,
                        version=None,
                        working_dir=None,
                        use_request_context=True):
    """
    Create a new export data file.
    This is run on first load.

    session, session object
    project, project object

    Designed for external consumption.
    Returns (True, annotations) on success, otherwise False.

    Security model
        this is an internal function
        export web DOES the validation
        Job_permissions.check_job_after_project_already_valid()
    """
    logger.info("[Export processor] Started")
    result = False
    start_time = time.time()

    export = session.query(Export).filter(Export.id == export_id).first()

    member = None
    if use_request_context:
        user = User.get(session)
        export.user = user
        if user:
            member = user.member
        else:
            client_id = request.authorization.get('username', None)
            auth = Auth_api.get(session, client_id)
            member = auth.member

    session.add(export)

    if export.source == "task":
        if export.task and export.task.file:  # Caution, export.task not task
            file_list = [export.task.file]

    # While job could be None and still get files,
    # if we do have a job id we may want to get
    # files not yet replaced in the directory.
    if export.source == "job":
        file_list = WorkingDirFileLink.file_list(
            session=session,
            limit=None,
            root_files_only=True,
            job_id=export.job_id,
            ann_is_complete=export.ann_is_complete)

    if export.source == "directory":
        # Question, why are we declaring this here?
        # Doesn't really make sense as export already has
        # it when created?
        export.working_dir_id = working_dir.id

        file_list = WorkingDirFileLink.file_list(
            session=session,
            working_dir_id=working_dir.id,
            limit=None,
            root_files_only=True,
            ann_is_complete=export.ann_is_complete)

    result, annotations = annotation_export_core(
        session=session,
        project=project,
        export=export,
        file_list=file_list)

    if result is False or result is None:
        return False

    filename = generate_file_name_from_export(export, session)

    if export.kind == "Annotations":

        export.yaml_blob_name = settings.EXPORT_DIR + \
            str(export.id) + filename + '.yaml'
        export.json_blob_name = settings.EXPORT_DIR + \
            str(export.id) + filename + '.json'

        try:
            yaml_data = yaml.dump(annotations, default_flow_style=False)
            data_tools.upload_from_string(export.yaml_blob_name,
                                          yaml_data,
                                          content_type='text/yaml',
                                          bucket_type='ml')
        except Exception as exception:
            trace_data = traceback.format_exc()
            logger.error("[Export, YAML] {}".format(str(exception)))
            logger.error(trace_data)

        json_data = json.dumps(annotations)
        data_tools.upload_from_string(export.json_blob_name,
                                      json_data,
                                      content_type='text/json',
                                      bucket_type='ml')

    end_time = time.time()
    logger.info("[Export processor] ran in {}".format(end_time - start_time))

    Event.new(kind="export_generation",
              session=session,
              member=member,
              success=result,
              project_id=project.id,
              run_time=end_time - start_time)

    return True, annotations
def annotation_export_core(session, project, export, file_list):
    """
    Generic method to export a file list
    """
    images_dir = settings.PROJECT_IMAGES_BASE_DIR + \
        str(project.id) + "/"

    export.file_list_length = len(file_list)

    errors_result = check_for_errors(export=export, session=session)
    if errors_result is False:
        return False, None

    # If we build annotations directly then we could return them.
    # If tf records, then not.
    # But some clean up stuff (ie marking complete) is joint.
    # Also not clear where we would use a returned dict of annotations here.
    # Ohhh, it returns annotations since we upload in YAML or JSON format for that.
    # Maybe that should just be part of that process (instead of returning with a
    # separate flag?)
    annotations = None

    # This is here as it's shared between masks and
    # not masks, but needs to run before masks (if masks),
    # so we can have mask values increase in series
    # instead of using ids, for example.
    # Careful, want to use project default directory for labels for now
    label_file_list = WorkingDirFileLink.file_list(
        session=session,
        working_dir_id=export.project.directory_default_id,
        limit=None,
        type="label")

    if export.kind == "TF Records":
        label_dict = data_tools.label_dict_builder(file_list=label_file_list)

    export_label_map = {}
    for label_file in label_file_list:
        export_label_map[label_file.id] = label_file.label.name

    # TODO masks, if not part of TF records, are not really handled well right now
    # TODO pass export object to track it?
    """
    Would be good to allow masks for regular records / JSON too,
    but not supported yet, so for now we do the tf records check too.
    """
    if export.masks is True and export.kind == "TF Records":
        # Assumes deep lab style for now?
        semantic_prep = Semantic_segmentation_data_prep()
        semantic_prep.generate_mask_core(session=session,
                                         project=project,
                                         file_list=file_list,
                                         type="joint",
                                         label_dict=label_dict)

    if export.kind == "TF Records":

        export.tf_records_blob_name = settings.EXPORT_DIR + \
            str(export.id)

        # Still need to check masks again here
        # to determine which building method we are using.
        if export.masks is True:
            result = data_tools.tf_records_new(
                session=session,
                file_list=file_list,
                project_id=export.project_id,
                method="semantic_segmentation",
                output_blob_dir=export.tf_records_blob_name)
            export.tf_records_blob_name += "/train-0.record"

        if export.masks is False:
            result = data_tools.tf_records_new(
                session=session,
                project_id=export.project_id,
                file_list=file_list,
                method="object_detection",
                label_dict=label_dict,
                output_blob_dir=export.tf_records_blob_name)
            export.tf_records_blob_name += "/tfrecords_0.record"

    if export.kind == "Annotations":

        annotations = {}
        annotations['readme'] = export.serialize_readme()
        annotations['label_map'] = export_label_map
        annotations['label_colour_map'] = build_label_colour_map(
            session, export_label_map)

        # TODO maybe, would like "annotations"
        # to be one layer "deeper" in terms of nesting.
        annotations['export_info'] = export.serialize_for_inside_export_itself()

        # Other / shared stuff
        annotations["attribute_groups_reference"] = build_attribute_groups_reference(
            session=session,
            project=project)

        # TODO
        # so I guess the "new" yaml one can do it "on demand"
        # if you substitute version for working directory?

        for index, file in enumerate(file_list):

            # Image URL?
            packet = build_packet(
                file=file,
                session=session,
                file_comparison_mode=export.file_comparison_mode)

            # What about keying by filename?
            # Original filename is not guaranteed to be unique.
            # Careful! If this is not unique it will overwrite on export
            # and is difficult to debug, as it looks like it's working
            # (ie the file count is there) but the first file is "null"...
            # Prior we used hash here, but in the context of a task
            # we may not re-hash the file (something to look at in future,
            # maybe we do want to hash it...)
            annotations[file.id] = packet

            export.percent_complete = (index / export.file_list_length) * 100
            if index % 10 == 0:
                # TODO would need to commit the session for this to be useful right?
                logger.info("Percent done {}".format(export.percent_complete))
                try_to_commit(session=session)  # push update

    export.status = "complete"
    export.percent_complete = 100

    return True, annotations
def task_template_launch_limits(session, task_template, log):
    """
    """
    # Different permissions depending on conditions, ie share type.
    # For now, don't require billing to be enabled for non-market jobs;
    # sending to Market clearly needs billing enabled.
    # Future: may want to still restrict jobs to paid accounts.
    # For now, in the context of wanting trainer orgs to try it, this seems reasonable.
    # Potentially a lot to think about here...

    project = task_template.project

    if task_template.share_type == "Market":
        if project.api_billing_enabled is not True:
            log['error']['billing'] = "Please enable billing or select Project / Org for share type. "

    # TODO Limit count of active jobs? ie default to 3 active jobs?
    # Limit on number of files? ie default to 500 files max per job?

    # Basic info
    # For now this is checked by new job creation
    # so low priority to double check here
    if task_template.status not in ['draft']:
        log['error']['job_status'] = "Job already launched."

    # Files
    task_template.update_file_count_statistic(session=session)
    attached_dir_list = session.query(JobWorkingDir).filter(
        JobWorkingDir.job_id == task_template.id).all()

    if task_template.file_count_statistic == 0 and len(attached_dir_list) == 0:
        log['error']['attached_dir_list'] = "Must attach at least 1 file or directory"

    if task_template.file_count:
        if task_template.file_count_statistic != task_template.file_count:
            log['error']['file_count'] = str(task_template.file_count_statistic) + " processed files " + \
                "does not match set file_count: " + str(task_template.file_count)

    # Note we are querying the input table here.
    # Suspect this is better than getting all the files
    # and doing a query for each one's input,
    # ie for getting bulk file status.
    # For retrying we may want to exclude "removed" files,
    # but a challenge here is that we are querying input, not the files themselves.
    # Also not sure if this really handles "failed" ones well...
    result = Input.directory_not_equal_to_status(
        session=session,
        directory_id=task_template.directory_id)

    # TODO there may be some cases where this is overbearing / needs to be
    # handled better, ie could call directory_not_equal_to_status with a
    # return type of "objects" or something...
    print(result)
    if result > 0:
        log['error']['file_status'] = "Files processing. " + \
            "Try again in 30-60 minutes."

    # Credentials
    # ie Warn if missing ...
    # ie log['warn']['credentials'] = "No credentials required"
    # TODO if job type is exam, check that it grants at least one credential?

    # Guides
    if task_template.share_type in ["market"]:
        if task_template.guide_default_id is None:
            log['error']['guide_default'] = "Missing default guide"

        if task_template.type == "Normal":
            if task_template.guide_review_id is None:
                # Default the review guide to the default guide
                # until we can handle this in a better way
                task_template.guide_review = task_template.guide_default
                session.add(task_template)
                # Don't log error for now, see above default
                # log['error']['guide_review'] = "Missing review guide"

    # Bid(s)

    # Label check
    label_count = WorkingDirFileLink.file_list(
        session=session,
        working_dir_id=task_template.project.directory_default_id,
        type="label",
        counts_only=True,
    )
    if label_count == 0:
        log['error']['count'] = "Project must have at least 1 label"

    return log
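# Usage sketch: callers pass a log dict and launch only if no errors were
# added. The log shape ({'error': {}, 'info': {}, 'warn': {}}) is assumed
# from how log is read and written throughout these functions.
def example_can_launch(session, task_template):
    log = {'error': {}, 'info': {}, 'warn': {}}
    log = task_template_launch_limits(session, task_template, log)
    return len(log['error'].keys()) == 0, log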
def task_template_label_attach(
        session,
        task_template,
        project_directory=None,
):
    """
    Get label files from the project and attach them to the job.
    We want the full project_directory object for label_file_colour_map.

    A key part of the rationale here is that a job may have labels
    that are distinct from the project's.

    The main point of having this here is flexibility: if we change the way
    we represent jobs, we don't have to change the "upfront" logic in terms
    of attaching ids. ie decouple which ids are attached to a job from
    whatever muck we need to do at launch time / "run" time.
    """
    if task_template.label_mode == "closed_all_available":

        label_file_list_serialized = []

        # Provided
        label_file_list = task_template.label_dict.get('label_file_list')

        if label_file_list:
            file_list = File.get_by_id_list(session, label_file_list)
        else:
            # Temporary fall back for migration
            print("label file list did not exist, using fall back")
            file_list = WorkingDirFileLink.file_list(
                session=session,
                working_dir_id=project_directory.id,
                limit=25,
                type="label")

            # Store for future reference here
            task_template.label_dict['label_file_list'] = [
                file.id for file in file_list
            ]

        # Work
        for file in file_list:
            file_serialized = file.serialize_with_label_and_colour(
                session=session)
            # Make sure time stamps are wrapped in str() to avoid nested json / dict issues
            label_file_list_serialized.append(file_serialized)

        # For debugging issues with serialization here:
        # print(label_file_list_serialized)

        task_template.label_dict['label_file_list_serialized'] = label_file_list_serialized

        # Now, in the context of users being able to choose labels,
        # we rebuild this on launching
        task_template.label_dict['label_file_colour_map'] = rebuild_label_map(file_list)

    return True
def new(session,
        working_dir_id=None,
        project_id=None,
        file_type=None,
        image_id=None,
        text_file_id=None,
        video_id=None,
        frame_number=None,
        label_id=None,
        colour=None,
        original_filename=None,
        video_parent_file=None,
        input_id=None,
        parent_id=None,
        task=None,
        file_metadata=None):
    """
    "file_added" case
    Given a new image, create a new file to track this image.
    This assumes a new image is completely new.

    We are always creating a new file at init, so there will be
    a file; the question is whether there is a previous file too.

    It was confusing to have two different ways to assign project here,
    so removed in favour of just having one.

    Careful with object.id, since if the object can be None
    it won't work as expected...

    video_parent_file_id issue
        video_parent_file (not id) FAILS because it does NOT exist;
        we have it as a function due to a work-around issue with
        sqlalchemy, so we MUST store the actual id
    """
    from shared.database.source_control.working_dir import WorkingDirFileLink

    video_parent_file_id = None
    if video_parent_file:
        video_parent_file_id = video_parent_file.id

    file = File(original_filename=original_filename,
                image_id=image_id,
                state="added",
                type=file_type,
                project_id=project_id,
                label_id=label_id,
                text_file_id=text_file_id,
                video_id=video_id,
                video_parent_file_id=video_parent_file_id,
                frame_number=frame_number,
                colour=colour,
                input_id=input_id,
                parent_id=parent_id,
                task=task,
                file_metadata=file_metadata)

    File.new_file_new_frame(file, video_parent_file)

    session.add(file)
    session.flush()

    # Question: do we still need to be running this here?
    file.hash_update()

    # Video frames don't need a working dir?
    # Or should we still put them in anyway...
    # In the context of video frames,
    # we don't want them to be in a working directory directly,
    # so we can smoothly move files.
    if working_dir_id:
        WorkingDirFileLink.add(session, working_dir_id, file)

    return file
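# Usage sketch (assumes this is File.new() and that an Image row plus a
# project / working dir already exist): register a newly processed image
# as a root file in a directory. The filename is illustrative only.
def example_register_image_file(session, project, working_dir, image):
    return File.new(
        session=session,
        working_dir_id=working_dir.id,
        project_id=project.id,
        file_type="image",
        image_id=image.id,
        original_filename="example.jpg")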
def create_file_links_for_attached_dirs(self,
                                        sync_only=False,
                                        create_tasks=False,
                                        file_to_link=None,
                                        file_to_link_dataset=None,
                                        related_input=None,
                                        member=None):
    """
    Called once before launch.
    This function will check all directories in the JobWorkingDir table
    and create the file links for all the related files.
    It creates links for both "sync" type and "select" type dirs.
    "select" type dirs are linked just once (ie new files added to the dir
    won't be picked up); "sync" type dirs will update links on process_media
    when a new file is attached to the dir, or when a file is copied or
    moved to the sync directory.
    :param session:
    :param job:
    :param log:
    :return:
    """
    # Now create a file link for all the files in all the directories on the job and attach them.
    if sync_only:
        directory_list = self.job.get_attached_dirs(session=self.session)
    else:
        directory_list = self.job.get_attached_dirs(
            session=self.session,
            sync_types=['sync', 'select_once'])

    if len(directory_list) == 0:
        self.log['info']['attached_directories_list'] = 'No directories attached.'
        return directory_list

    if file_to_link is None or file_to_link_dataset is None:
        # Case where we do not provide a single file for sync
        # (i.e. no file_to_link or file_to_link_dataset)
        for directory in directory_list:
            if self.job.instance_type in ['text_tokens']:
                files = WorkingDirFileLink.file_list(
                    self.session,
                    working_dir_id=directory.id,
                    root_files_only=True,  # TODO do we need to get child files too?
                    limit=None,
                    type='text')
            else:
                files = WorkingDirFileLink.file_list(
                    self.session,
                    working_dir_id=directory.id,
                    root_files_only=True,  # TODO do we need to get child files too?
                    limit=None,
                )

            for file in files:
                logger.debug(
                    'Single file sync event with file: {} and folder {}'.format(
                        file, directory))
                sync_event_manager = SyncEventManager.create_sync_event_and_manager(
                    session=self.session,
                    dataset_source_id=directory.id,
                    dataset_destination=None,
                    description='Sync file {} from dataset {} to job {} and create task'.format(
                        file.original_filename,
                        directory.nickname,
                        self.job.name),
                    file=file,
                    job=self.job,
                    input=related_input,
                    project=self.job.project,
                    event_effect_type='create_task',
                    event_trigger_type='file_added',
                    status='init',
                    member_created=member)
                logger.debug('Created sync_event {}'.format(
                    sync_event_manager.sync_event.id))

                result, log = self.__add_file_into_job(
                    file,
                    directory,
                    create_tasks=create_tasks,
                    sync_event_manager=sync_event_manager)
                if result is not True:
                    log['error']['sync_file_dirs'] = 'Error syncing dirs for file id: {}'.format(
                        file.id)

                if len(log['error'].keys()) > 1:
                    return False, log
    else:
        logger.debug(
            'Single file sync event with file: {} and folder {}'.format(
                file_to_link.id, file_to_link_dataset.id))
        sync_event_manager = SyncEventManager.create_sync_event_and_manager(
            session=self.session,
            dataset_source_id=file_to_link_dataset.id,
            dataset_destination=None,
            description='Sync file {} from dataset {} to job {} and create task'.format(
                file_to_link.original_filename,
                file_to_link_dataset.nickname,
                self.job.name),
            file=file_to_link,
            job=self.job,
            input=related_input,
            project=self.job.project,
            event_effect_type='create_task',
            event_trigger_type='file_added',
            status='init',
            member_created=member)
        logger.debug('Created sync_event {}'.format(
            sync_event_manager.sync_event.id))

        result, log = self.__add_file_into_job(
            file_to_link,
            file_to_link_dataset,
            create_tasks=create_tasks,
            sync_event_manager=sync_event_manager,
        )
        if result is not True:
            log['error']['sync_file_dirs'] = 'Error syncing dirs for file id: {}'.format(
                file_to_link.id)

        if len(log['error'].keys()) > 1:
            return False, log

    self.job.update_file_count_statistic(session=self.session)
    return True, self.log
def __add_file_into_job(self,
                        file: File,
                        incoming_directory: WorkingDir,
                        job: Job = None,
                        create_tasks: bool = False,
                        sync_event_manager=None):
    """
    Given a file, add the link to the job directory and create a task
    if create_tasks=True.
    :param session:
    :param file:
    :param dir:
    :param job:
    :param log:
    :param create_tasks:
    :return:
    """
    job_obj = self.job
    if job is not None:
        job_obj = job

    result, log = WorkingDirFileLink.file_link_update(
        session=self.session,
        add_or_remove='add',
        incoming_directory=incoming_directory,
        directory=job_obj.directory,
        file_id=file.id,
        job=job_obj,
        log=self.log)
    logger.debug('File {} added to job {}'.format(file.id, job_obj.id))

    if create_tasks is False:
        log['info']['create_tasks flag'] = "create_tasks is False"
        return True, log

    valid_status_to_create_tasks = ['active', 'in_review', 'complete']
    if job_obj.status not in valid_status_to_create_tasks:
        log['info']['job status'] = "not in " + str(valid_status_to_create_tasks)
        logger.debug(
            'Job status not active, skipping. Statuses must be one of {}'.format(
                str(valid_status_to_create_tasks)))
        return True, log

    logger.debug('Creating task...')
    potential_existing_task = self.__check_if_task_exists(job=job_obj, file=file)
    if potential_existing_task is None:
        task = self.create_task_from_file(
            file,
            job=job_obj,
            incoming_directory=incoming_directory)
        task.is_root = True
        logger.debug('New task created. {}'.format(task.id))
    else:
        task = potential_existing_task

    if sync_event_manager:
        sync_event_manager.add_create_task(task)
        sync_event_manager.set_status('completed')

    if result is not True:
        log['error']['create_file_links'] = 'Error creating links for file id: {}'.format(
            file.id)
        return False, log

    if len(log['error'].keys()) > 1:
        return False, log

    return True, log
def file_transfer_core(
        session,
        source_directory,
        destination_directory,
        transfer_action: str,
        file,
        log: dict,
        member=None,
        copy_instances: bool = False,
        sync_event_manager=None,
        log_sync_events=True,
        defer_sync=False,
        defer_copy=True,
        batch_id=None,
        update_project_for_copy=False,
):
    """
    source_directory and destination_directory are trusted,
    assumed to be valid here

    copy_instances, bool
    """
    if transfer_action == "copy":

        new_file = File.copy_file_from_existing(
            session=session,
            working_dir=destination_directory,
            orginal_directory_id=source_directory.id if source_directory else None,
            existing_file=file,
            copy_instance_list=copy_instances,
            log=log,
            add_link=True,
            remove_link=False,
            flush_session=True,
            defer_copy=defer_copy,
            batch_id=batch_id)

        if defer_copy:
            return log

        perform_sync_events_after_file_transfer(
            session=session,
            source_directory=source_directory,
            destination_directory=destination_directory,
            log=log,
            log_sync_events=log_sync_events,
            transfer_action=transfer_action,
            file=file,
            member=member,
            new_file=new_file,
            defer_sync=defer_sync,
            sync_event_manager=None)

        if not log['info'].get('new_file', []):
            if new_file:
                log['info']['new_file'] = [new_file.serialize_with_type(session)]
        else:
            if new_file:
                log['info']['new_file'].append(new_file.serialize_with_type(session))

        if not log['info'].get('message'):
            log['info']['message'] = 'File Copy Success.'

        return log

    if transfer_action == "move":

        # Get existing link
        link = WorkingDirFileLink.file_link(
            session=session,
            working_dir_id=source_directory.id,
            file_id=file.id)

        if link is None:
            log["error"]['file_link'] = 'File link of file: {} and workingdir: {} does not exist'.format(
                file.id, source_directory.id)
            return log

        # TODO consider how this affects what gets committed.
        # Is it safe to just "update" the link this way?
        # Should this be a built-in method of WorkingDirFileLink?
        new_link = WorkingDirFileLink.file_link(
            session=session,
            working_dir_id=destination_directory.id,
            file_id=file.id)

        if new_link is not None:
            log["error"]['file_link'] = 'File link of file: {} and destination workingdir: {} already exists'.format(
                file.id, destination_directory.id)
            return log

        link.working_dir_id = destination_directory.id
        session.add(link)

        perform_sync_events_after_file_transfer(
            session=session,
            source_directory=source_directory,
            destination_directory=destination_directory,
            log=log,
            log_sync_events=log_sync_events,
            transfer_action=transfer_action,
            file=file,
            member=member,
            new_file=None,
            defer_sync=defer_sync,
            sync_event_manager=sync_event_manager)

        return log

    if transfer_action == "mirror":

        existing_link = WorkingDirFileLink.file_link(
            session=session,
            working_dir_id=destination_directory.id,
            file_id=file.id)

        if existing_link is not None:
            log["error"][str(file.id)] = "File already in dataset id: " + \
                str(destination_directory.id)
            return log

        link = WorkingDirFileLink.add(
            session=session,
            working_dir_id=destination_directory.id,
            file=file)

        log["info"][str(file.id)] = True

        return log
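# Usage sketch: move a single file between two existing directories and
# check the returned log. The log shape ({'error': {}, 'info': {}}) is
# assumed from how log is read and written above.
def example_move_file(session, file, source_directory, destination_directory):
    log = {'error': {}, 'info': {}}
    log = file_transfer_core(
        session=session,
        source_directory=source_directory,
        destination_directory=destination_directory,
        transfer_action="move",
        file=file,
        log=log)
    return len(log['error'].keys()) == 0, log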