def __format_frame_for_update(self, frame_number: int, parent_input: Input):
    """
    Build an Input object describing one frame of an update operation.

    The frame may or may not exist yet; downstream processing is expected
    to use frame_number / parent_file_id to locate or create it.
    Returns the Input after instance-list formatting has been applied.
    """
    frame_input = Input.new(
        project=None,  # required kwarg, but project_id is set below instead
        media_type="frame")

    # Cast defensively; callers may pass strings.
    frame_number = int(frame_number)

    # Assign the raw id (not the ORM object) to avoid detached-session
    # issues during parallel processing.
    frame_input.project_id = self.project.id

    # Propagate 'update_with_existing' when the parent requested it,
    # otherwise default to plain 'update'.
    frame_input.mode = (
        'update_with_existing'
        if parent_input.mode == 'update_with_existing'
        else "update")

    frame_input.parent_input_id = parent_input.id
    # Downstream process is assumed to use this to resolve the frame.
    frame_input.parent_file_id = parent_input.file.id
    frame_input.frame_number = frame_number
    frame_input.video_parent_length = self.highest_frame_encountered

    # This helper also performats/attaches the instance list,
    # hence reassigning the return value. TODO adjust function name.
    frame_input = self.get_instance_list_from_packet_map(
        input=frame_input,
        frame_number=frame_number)

    return frame_input
def generate_sample_files_for_dataset(self, dataset):
    """
    Seed a dataset with sample images fetched from picsum.photos.

    Skips seeding entirely if the dataset already holds at least
    NUM_IMAGES root image/video files. Each created Input is processed
    immediately via Process_Media and committed so subsequent queries
    can see it.

    :param dataset: WorkingDir-like object with .id and .project_id
    :return: None
    """
    NUM_IMAGES = 3
    # Removed unused NUM_VIDEOS constant; only images are generated below.

    files_list_count = WorkingDirFileLink.file_list(
        self.session,
        working_dir_id=dataset.id,
        root_files_only=True,  # TODO do we need to get child files too?
        limit=None,
        counts_only=True,
        type=['image', 'video'])

    # Already seeded — nothing to do.
    if files_list_count >= NUM_IMAGES:
        return

    for _ in range(NUM_IMAGES):
        diffgram_input = Input(project_id=dataset.project_id,
                               url='https://picsum.photos/1000',
                               media_type='image',
                               directory_id=dataset.id,
                               type='from_url')
        self.session.add(diffgram_input)
        self.session.flush()  # need the id for Process_Media

        process_media = Process_Media(session=self.session,
                                      input_id=diffgram_input.id,
                                      input=diffgram_input,
                                      item=None)
        process_media.main_entry()

        # Commit right away for future querying.
        commit_with_rollback(self.session)
def new_input(self):
    """
    Create and attach a new Input for a video split clip.

    Careful with video_split_duration... it is deliberately NOT set here
    since we condition on it elsewhere.
    NOTE: self.parent_input is available on the Video Preprocess class,
    but parent_input is not available on a generic input object —
    see notes in the Input class.
    """
    parent = self.parent_input

    self.input = Input.new(
        parent_input_id=parent.id,
        project=self.project,
        media_type="video",
        type="from_video_split",
        job_id=parent.job_id,
        directory_id=parent.directory_id)

    self.session.add(self.input)
    self.session.flush()  # need self.input.id for the blob path

    # Do we need .mp4 on end here?
    self.input.raw_data_blob_path = (
        settings.PROJECT_VIDEOS_BASE_DIR
        + str(self.project.id) + "/raw/" + str(self.input.id))
    self.extension = ".mp4"
def create_input(self, project, request, filename):
    """
    Create self.input for a resumable (dropzone) upload request.

    Reads job/directory/split-duration from the form, filename metadata
    from the sanitized filename, and mode/flow id from headers. On
    success, flushes for an id and opens a resumable upload session at
    the computed blob path. On failure (missing flow, invalid file
    type) marks self.input as failed and returns early.

    :param project: Project the upload belongs to
    :param request: incoming Flask-style request (form + headers)
    :param filename: user-supplied filename (untrusted)
    :return: None (state is stored on self.input)
    """
    self.input = Input.new(
        project=project,
        media_type=None,
        job_id=request.form.get('job_id'),
        directory_id=request.form.get('directory_id'),  # Not trusted
        video_split_duration=request.form.get('video_split_duration'))

    self.session.add(self.input)

    self.input = Upload.upload_limits(
        input=self.input,
        file_size=self.dztotalfilesize)

    # http://flask.pocoo.org/docs/0.12/patterns/fileuploads/
    self.input.original_filename = secure_filename(filename)
    self.input.extension = os.path.splitext(
        self.input.original_filename)[1].lower()
    self.input.original_filename = os.path.split(
        self.input.original_filename)[1]

    # At some point should really declare this from the UI.
    self.input.type = "from_resumable"
    self.input.dzuuid = self.dzuuid
    self.input.action_flow_id = request.headers.get('flow_id')

    if self.input.action_flow_id:
        if self.input.flow is None:
            self.input.status = "failed"
            self.input.status_text = "No flow found"
            return

    self.input.mode = request.headers.get('mode')

    self.input.media_type = Process_Media.determine_media_type(
        extension=self.input.extension)
    if not self.input.media_type:
        self.input.status = "failed"
        self.input.status_text = "Invalid file type: " + self.input.extension
        # Fix: return here like the flow-failure branch above; previously
        # this fell through and still created an upload session for a
        # failed input.
        return

    self.session.flush()  # For ID for path

    self.input.raw_data_blob_path = settings.PROJECT_RAW_IMPORT_BASE_DIR + \
        str(self.input.project.id) + "/raw/" + str(self.input.id)

    data_tools.create_resumable_upload_session(
        blob_path=self.input.raw_data_blob_path,
        content_type=None,
        input=self.input)
def push_frames_for_copy_to_queue(self, source_video_parent_file_id,
                                  destination_video_parent_file_id):
    """
    Given the current data at self.input, fetch the video frames of the
    existing source file and push one copy-job per frame onto the
    ProcessMedia queue.

    :return: the list of source video frame files (ordered by frame)
    """
    source_video_frames = WorkingDirFileLink.image_file_list_from_video(
        session=self.session,
        video_parent_file_id=source_video_parent_file_id,
        order_by_frame=True)

    # Register every frame as pending before enqueueing any work.
    frame_completion_controller = FrameCompletionControl()
    for frame in source_video_frames:
        frame_completion_controller.add_pending_frame(frame.frame_number)

    # Loop invariants, hoisted.
    frame_total = len(source_video_frames)
    last_frame_number = source_video_frames[-1].frame_number

    for frame in source_video_frames:
        # HOW TO AVOID DETACHED SESSION:
        # pass only IDs, never ORM objects, into the queue item.
        # Careful: the file id is the newly copied video; the previous
        # video id should come from the NEW file id, not the old one.
        frame_input = Input.new(
            parent_input_id=self.input.id,
            sequence_map=self.input.sequence_map,
            file_id=frame.id,  # existing frame file
            video_parent_length=frame_total,
            directory_id=self.input.directory_id,
            source_directory_id=self.input.source_directory_id,
            remove_link=self.input.remove_link,
            add_link=self.input.add_link,
            copy_instance_list=self.input.copy_instance_list,
            # Parent video file where all data is being copied to.
            parent_file_id=destination_video_parent_file_id,
            project_id=self.input.project_id,
            mode='copy_file',
            type=None,
            media_type='frame',
        )

        queue_item = process_media.PrioritizedItem(
            input=frame_input,
            frame_completion_controller=frame_completion_controller,
            total_frames=last_frame_number,
            num_frames_to_update=frame_total,
            media_type=frame_input.media_type,  # helps with routing
            priority=100 + frame.frame_number,  # process in frame order
            # Careful: downstream process currently expects frame_number.
            frame_number=frame.frame_number)
        process_media.add_item_to_queue(queue_item)

    return source_video_frames
def input_detail_core(session, project: Project, input_id: int, log: dict):
    """
    Serialize a single Input, verifying it exists and belongs to project.

    TODO put as part of Input class

    :param session: DB session
    :param project: Project the caller is scoped to
    :param input_id: id of the Input to fetch
    :param log: dict with an 'error' sub-dict for accumulating errors
    :return: (serialized_input, log) on success, (False, log) on error
    """
    input = Input.get_by_id(session, id=input_id)

    # Fix: previously a missing input raised AttributeError below.
    if input is None:
        log['error']['input_id'] = 'Input not found'
        return False, log

    if input.project_id != project.id:
        log['error']['project_id'] = 'Input and project ID mismatch'
        return False, log

    return input.serialize_with_frame_packet(), log
def test_s3_add_to_diffgram(self):
    """
    Fetch a test object through the S3 connector and verify that the
    connector produces an Input record.
    """
    created_input = self.s3conn.fetch_data({
        'action_type': 'fetch_object',
        'path': 'tests3connector/pablo/patrick.png',
        'bucket_name': '1',
        'event_data': {
            'request_user': 1,
            'date_time': datetime.datetime.now().strftime('%m/%d/%Y, %H:%M:%S'),
            'connection_id': -1
        }
    })
    # assertIsInstance instead of comparing type() objects — clearer
    # failure message and no throwaway Input() instantiation.
    self.assertIsInstance(created_input, Input)
def test_packet_endpoint_refactor(self):
    """
    Enqueue a packet from a URL and verify an Input record is created.
    """
    packet_data = {
        'media': {
            'url': 'https://thumbor.forbes.com/thumbor/250x382/https://blogs-images.forbes.com/dorothypomerantz/files/2011/09/Spongebob-squarepants.jpg?width=960',
            'type': 'image'
        }
    }
    created_input = packet.enqueue_packet(
        self.project_string_id,
        session=self.session,
        media_url=packet_data['media']['url'],
        media_type=packet_data['media']['type'],
        job_id=None,
        directory_id=None)
    self.session.commit()
    # assertIsInstance instead of comparing type() objects — clearer
    # failure message and no throwaway Input() instantiation.
    self.assertIsInstance(created_input, Input)
def enqueue_packet(project_string_id,
                   session,
                   media_url=None,
                   media_type=None,
                   file_id=None,
                   file_name=None,
                   job_id=None,
                   batch_id=None,
                   directory_id=None,
                   source_directory_id=None,
                   instance_list=None,
                   video_split_duration=None,
                   frame_packet_map=None,
                   remove_link=None,
                   add_link=None,
                   copy_instance_list=None,
                   commit_input=False,
                   task_id=None,
                   video_parent_length=None,
                   type=None,
                   task_action=None,
                   external_map_id=None,
                   original_filename=None,
                   external_map_action=None,
                   enqueue_immediately=False,
                   mode=None,
                   allow_duplicates=False,
                   extract_labels_from_batch=False):
    """
    Creates an Input() object and enqueues it for media processing.

    The Input is flushed (so it has an id) and then either queued
    immediately (when settings allow or enqueue_immediately is set) or
    marked processing_deferred.

    :return: the Input() object that was created
    """
    diffgram_input = Input()
    project = Project.get(session, project_string_id)

    diffgram_input.file_id = file_id
    diffgram_input.task_id = task_id
    diffgram_input.batch_id = batch_id
    diffgram_input.video_parent_length = video_parent_length
    diffgram_input.remove_link = remove_link
    diffgram_input.add_link = add_link
    diffgram_input.copy_instance_list = copy_instance_list
    diffgram_input.external_map_id = external_map_id
    diffgram_input.original_filename = original_filename
    diffgram_input.external_map_action = external_map_action
    diffgram_input.task_action = task_action
    diffgram_input.mode = mode
    diffgram_input.project = project
    diffgram_input.media_type = media_type
    diffgram_input.type = "from_url"
    diffgram_input.url = media_url
    diffgram_input.video_split_duration = video_split_duration
    diffgram_input.allow_duplicates = allow_duplicates

    if instance_list:
        diffgram_input.instance_list = {}
        diffgram_input.instance_list['list'] = instance_list

    if frame_packet_map:
        diffgram_input.frame_packet_map = frame_packet_map

    session.add(diffgram_input)
    session.flush()  # need the id before enqueueing

    if batch_id and extract_labels_from_batch:
        upload_tools = Upload(session=session, project=project, request=None)
        upload_tools.extract_instance_list_from_batch(
            input=diffgram_input,
            input_batch_id=batch_id,
            file_name=file_name)

    # Expect temp dir to be None here, because each machine should
    # assign its own temp dir. Something else to consider for future!
    # Once this is part of input it will be smoothly handled at the
    # right time as part of the processing queue.

    diffgram_input.job_id = job_id

    # Process media handles checking if the directory id is valid.
    diffgram_input.directory_id = directory_id
    diffgram_input.source_directory_id = source_directory_id

    diffgram_input_id = diffgram_input.id

    # Note: removed dead per-media-type queue_limit computation — the
    # value was assigned but never used anywhere in this function.

    if settings.PROCESS_MEDIA_ENQUEUE_LOCALLY_IMMEDIATELY is True or enqueue_immediately:
        print('diffgram_input_id', diffgram_input_id)
        if commit_input:
            regular_methods.commit_with_rollback(session=session)

        item = PrioritizedItem(
            priority=10000,  # individual frames have a priority here
            input_id=diffgram_input_id,
            media_type=media_type)
        add_item_to_queue(item)
    else:
        diffgram_input.processing_deferred = True  # Default

    return diffgram_input
def task_template_launch_limits(session, task_template, log): """ """ # Different permissions depending on conditions ie share type # For now don't require billing to be enabled for non market jobs # sending to Market clearly needs billing enabled # Future may want to still restrict jobs to paid accounts # For now in context of wanting trainer orgs to try it this seems reasonable # Potentially a lot to think about here... project = task_template.project if task_template.share_type == "Market": if project.api_billing_enabled is not True: log['error'][ 'billing'] = "Please enable billing or select Project / Org for share type. " # TODO Limit count of active jobs? ie default to 3 active jobs? # Limit on number of files? ie default to 500 files max per job? # Basic info # For now this is checked by new job creation # so low priorty to double check here if task_template.status not in ['draft']: log['error']['job_status'] = "Job already launched." # Files task_template.update_file_count_statistic(session=session) attached_dir_list = session.query(JobWorkingDir).filter( JobWorkingDir.job_id == task_template.id).all() if task_template.file_count_statistic == 0 and len(attached_dir_list) == 0: log['error'][ 'attached_dir_list'] = "Must attach at least 1 file or directory" if task_template.file_count: if task_template.file_count_statistic != task_template.file_count: log['error']['file_count'] = str(task_template.file_count_statistic) + " processed files " + \ "does not match set file_count: " + str(task_template.file_count) # note we are querying the input table here # suspect this is better then getting all the files # and doing a query for each to input # ie for getting bulk file status? # For retrying we may want to not include "removed" files # But a challenge here is that we are querying input not other thing # Also not sure if this really handles "failed" ones well... 
result = Input.directory_not_equal_to_status( session=session, directory_id=task_template.directory_id) # TODO may be some cases that this is overbearing / needs to be handled better # ie could call directory_not_equal_to_status with return type # of "objects" or something... print(result) if result > 0: log['error']['file_status'] = "Files processing. " + \ "Try again in 30-60 minutes." # Credentials # ie Warn if missing ... # ie log['warn']['credentials'] = "No credentials required" # TODO if job type is exam check if grants at least one credential? # Guides if task_template.share_type in ["market"]: if task_template.guide_default_id is None: log['error']['guide_default'] = "Missing default guide" if task_template.type == "Normal": if task_template.guide_review_id is None: # Default review guide to being same as defualt guide # until we can handle this in better way task_template.guide_review = task_template.guide_default session.add(task_template) # Don't log error for now, see above default # log['error']['guide_review'] = "Missing review guide" # Bid(S) # Label check label_count = WorkingDirFileLink.file_list( session=session, working_dir_id=task_template.project.directory_default_id, type="label", counts_only=True, ) if label_count == 0: log['error']['count'] = "Project must have at least 1 label" return log
def input_from_local(session, log, project_string_id, http_input, file, directory_id):
    """
    Create an Input from a locally-uploaded file, write the file bytes
    to a temp dir, and process it (immediately, in the current flow).

    Returns a (success, log, input) tuple; success is False on invalid
    file type or exceeded size limit.
    """
    # TODO review how we want to handle header options
    # Especially if needs to be outside of function for python requests...
    # immediate_mode = request.headers['immediate_mode']
    # Issues to be careful with ie string treamtment of 'True' vs True...
    # NOTE(review): immediate_mode is hard-coded True, so the queue path at
    # the bottom of this function appears to be dead code — confirm intent.
    immediate_mode = True

    input = Input()
    input.directory_id = directory_id

    if http_input['instance_list']:
        input.instance_list = {}
        input.instance_list['list'] = http_input['instance_list']

    if http_input['frame_packet_map']:
        input.frame_packet_map = http_input['frame_packet_map']

    # only need to make temp dir if file doesn't already exist...
    # http://flask.pocoo.org/docs/0.12/patterns/fileuploads/
    original_filename = secure_filename(file.filename)

    input.extension = os.path.splitext(original_filename)[1].lower()
    input.original_filename = os.path.split(original_filename)[1]

    input.temp_dir = tempfile.mkdtemp()
    # NOTE(review): original_filename already carries its extension, so
    # appending input.extension here seems to double it (e.g. "a.mp4.mp4").
    # Harmless internally since read/write both use this path, but verify.
    input.temp_dir_path_and_filename = input.temp_dir + \
        "/" + original_filename + input.extension

    project = Project.get(session, project_string_id)
    input.project = project
    input.media_type = None
    input.media_type = Process_Media.determine_media_type(input.extension)
    if not input.media_type:
        input.status = "failed"
        input.status_text = "Invalid file type: " + input.extension
        return False, log, input

    session.add(input)
    session.flush()

    # Persist the uploaded bytes to the temp path.
    with open(input.temp_dir_path_and_filename, "wb") as f:
        f.write(file.stream.read())

    # For LOCAL not normal upload
    file_size_limit = 9 * 1024 * 1024 * 1024
    file_size = os.path.getsize(
        input.temp_dir_path_and_filename)  # gets size in bytes

    if file_size > file_size_limit:
        input.status = "failed"
        input.status_text = "Exceeded max file size"
        return False, log, input

    if immediate_mode == True or immediate_mode is None:
        # Leave this as a direct call for time being, as we pass
        # the input back to thing on front end
        process_media = Process_Media(session=session,
                                      input=input)
        result = process_media.main_entry()

        # Always return input along with file?
        if result == True:
            return True, log, input
        if result == False:
            return False, log, input

    # Default
    priority = 100
    item = PrioritizedItem(priority=priority,
                           input_id=input.id,
                           media_type=input.media_type)
    add_item_to_queue(item)

    return True, log, input
def load(self,
         video_file_name,
         original_filename,
         extension,
         input: Input,
         directory_id=None):
    """
    Convert to .mp4 format if needed
    Upload .mp4 video
    Process each frame

    Arguments
        video_file_name, String, complete file path including directory,
            filename, and extension
        original_filename, String
        extension, String, includes ".", ie ".mp4"

    Returns
        None on failure; on success returns input.file (the new video file)
    """
    try:
        clip = moviepy_editor.VideoFileClip(video_file_name)

        input.status = "loaded_video"
        input.time_loaded_video = datetime.datetime.utcnow()
        input.percent_complete = 20.0
        self.try_to_commit()

    except Exception as exception:
        input.status = "failed"
        input.status_text = "Could not load video. Try again, try a different format or contact us."
        # only for internal use
        # could look at storing in DB later or Using event logging.
        logger.error(
            'Could not load video. Try again, try a different format or contact us. Exception: {}'
            .format(str(exception)))
        return None

    # https://stackoverflow.com/questions/43966523/getting-oserror-winerror-6-the-handle-is-invalid-in-videofileclip-function
    clip.reader.close()
    # Audio thing here too still doesn't seem to fix it...
    # clip.audio.reader.close_proc()

    # fps handling
    fps = self.project.settings_input_video_fps
    if fps is None:
        fps = 5

    if fps < 0 or fps > 120:
        input.status = "failed"
        # NOTE(review): string + int concatenation — this raises TypeError if
        # fps is numeric; probably wants str(fps). Flagging, not changing.
        input.status_text = "Invalid fps setting of " + fps
        return None

    original_fps = clip.fps  # Cache, since it will change

    # Always using original. FPS conversion is now deprecated
    fps = original_fps

    # https://zulko.github.io/moviepy/ref/VideoClip/VideoClip.html#moviepy.video.VideoClip.VideoClip.set_fps
    # Returns a copy of the clip with a new default fps for functions like
    # write_videofile, iterframe, etc.
    clip = clip.set_fps(fps)

    # TODO do we want to save original
    # note these statements need to be after here in order to make sure
    # we update fps properly
    # otherwise have fps of say 0 and it's funny
    # Frame count (ESTIMATED) otherwise requires iteration / loop to get exact
    length = int(clip.duration * fps)

    # temp higher limit for testing stuff
    # enough for a 120fps 5 minutes, or 60 fps 10 minutes
    frame_count_limit = 36000

    if length > frame_count_limit:
        input.status = "failed"
        input.status_text = "Frame count of " + str(length) + \
            " exceeded limit of " + str(frame_count_limit) + " (per video)" + \
            " Lower FPS conversion in settings, split into seperate files, or upgrade account."
        return None

    max_size = settings.DEFAULT_MAX_SIZE
    if clip.w > max_size or clip.h > max_size:
        clip = resize_video(clip)
        # Re-encode under a new name so the original is preserved.
        video_file_name = os.path.splitext(
            video_file_name)[0] + "_re_saved.mp4"

    if settings.PROCESS_MEDIA_TRY_BLOCK_ON is True:
        try:
            # See https://zulko.github.io/moviepy/ref/VideoClip/VideoClip.html?highlight=write_videofile#moviepy.video.io.VideoFileClip.VideoFileClip.write_videofile
            # And https://github.com/Zulko/moviepy/issues/645
            # BUT note it's been renamed to "logger"
            # TODO maybe capture log output somewhere else for debugging?
            # Maybe we could use log to update input status / percent complete
            """
            Feb 9 2020
                Audio to True seems to add issues ie
                    index -100001 is out of bounds for axis 0 with size 0
                ffmpeg found this but I don't think that's it
                https://stackoverflow.com/questions/59358680/how-to-fix-out-of-bounds-error-in-to-soundarray-in-moviepy
                The strange part is that some of it works...

                TODO IF audio is a common issue, could have 2 try blocks
                but would want to have this as a function then.
                ie video with no audio is perhaps better then total failure,
                or total no audio.
            """
            clip.write_videofile(video_file_name,
                                 audio=False,
                                 threads=4,
                                 logger=None)
        except Exception as exception:
            input.status = "failed"
            input.status_text = "Could not write video file. Try a different format or contact us."
            logger.error(
                'Could not write video file. Try a different format or contact us.)'
            )
            return None
    else:
        clip.write_videofile(video_file_name,
                             audio=False,
                             threads=4,
                             logger=None)

    if not directory_id:
        directory_id = self.project.directory_default_id

    # Video file gets created in advance so
    # be careful to add project here
    """
    This is in the context of Video potentially wanting
    more stuff from the "parent video".
    This needs a lot of work.
    For the moment we just get the parent input
    and copy a single attribute here for easier access later on.

    Directionally we want to think about stronger connections
    between split clips.

    And forest wise we need to grab this here because going back to
    get the input afterwards from file can be challenging becasue
    as the system does various modifications the parent gets further
    and further removed.
    """
    parent_video_split_duration = None
    try:
        parent_input = input.parent_input(self.session)
        if parent_input:
            parent_video_split_duration = parent_input.video_split_duration
    except:
        print("Could not get parent input")

    video, input.file = Video.new(
        session=self.session,
        project=self.project,
        filename=original_filename,
        frame_rate=clip.fps,
        frame_count=0,
        width=clip.w,
        height=clip.h,
        directory_id=directory_id,
        parent_input_id=input.parent_input_id,
        parent_video_split_duration=parent_video_split_duration,
        file_metadata=input.file_metadata,
    )

    if self.input.frame_packet_map:
        self.__prepare_sequences(parent_input=input)
        if self.check_update_log_errors() is False:
            return

    # reverse link is sometimes handy to have.
    input.file.input_id = input.id

    # Jan 13, 2020 these are both computed above
    # Video object is not created yet so stored locally and then used here...
    video.original_fps = original_fps
    video.fps = fps
    video.offset_in_seconds = input.offset_in_seconds

    video.root_blob_path_to_frames = settings.PROJECT_IMAGES_BASE_DIR + \
        str(self.project.id) + "/" + str(video.id) + "/frames/"

    self.upload_video_file(video_file_name, ".mp4", video)

    input.status = "finished_writing_video_file"
    input.time_video_write_finished = datetime.datetime.utcnow()
    input.percent_complete = 30.0
    self.try_to_commit()

    self.session.add(video)

    initial_global_frame = 0
    if input.type == 'from_video_split':
        initial_global_frame = video.fps * input.offset_in_seconds

    for index, frame in enumerate(clip.iter_frames()):
        # NOTE(review): for non-split inputs global_frame_number is left as
        # `frame` (the numpy image array), not a frame index — this looks
        # like it should be `index`; confirm what downstream expects.
        global_frame_number = frame
        if input.type == 'from_video_split':
            seconds_offset = input.offset_in_seconds
            offset_in_frames = video.fps * seconds_offset
            global_frame_number = index + offset_in_frames

        if index == 0:
            input.status = "pushing_frames_into_processing_queue"

        # This setups up input, see function below
        self.add_frame_to_queue(
            frame,
            index,
            original_filename,
            self.project,
            directory_id,
            video,
            length,
            input.file,  # assumes this is video_parent_file
            global_frame_number,
            initial_global_frame)

        # TODO clarify if this is actually showing up the queue as expected
        video.frame_count += 1

        # This is really key for monitoring efforts
        # Because at the moment this loop can be fairly slow
        if index % 10 == 0:
            # Where 10 is adding this every 10 frames
            # to be completed by next phase
            # at most this adds 1 when compelte so multiple by 30 to represent
            # this portion of the work
            input.percent_complete += (10 / length) * 30
            self.try_to_commit()

    # Clean up handled in process media..
    input.time_pushed_all_frames_to_queue = datetime.datetime.utcnow()

    return input.file
def add_frame_to_queue(self,
                       frame,
                       index: int,
                       original_filename: str,
                       project: Project,
                       directory_id,
                       video,
                       length,
                       video_parent_file: File,
                       global_frame_number=None,
                       initial_global_frame=None):
    """
    Build a local Input describing one video frame and push it onto the
    ProcessMedia queue.

    frame is a HxWxN np.array, where N=1 for mask clips and N=3 for RGB
    clips. https://zulko.github.io/moviepy/ref/VideoClip/VideoClip.html

    Careful: we don't have self. context here.

    Cautions
      * We purposely do NOT pass the job id, since only the original
        video should be added to the job.
      * The Input is purposely NOT added to the session (no commit) —
        it is used as a local object to keep the same concepts / format
        even for video frames, without db overhead. (Jan 20, 2020)

    Why an Input at all? It carries information like frame_end_number,
    keeps design consistent with the rest of the pipeline, helps
    logging, and reuses the component wholesale — better than stuffing
    everything into PrioritizedItem() long term.
    """
    # TODO use File.new() for consistency here (ie as we add new things)
    frame_input = Input()

    # Single-frame naming: parent filename plus the frame index.
    frame_input.original_filename = original_filename + "_" + str(index)
    frame_input.extension = ".jpg"
    frame_input.media_type = "frame"
    frame_input.temp_dir = tempfile.mkdtemp()

    frame_input.project = project
    frame_input.project_id = self.project.id
    frame_input.directory_id = directory_id
    frame_input.parent_file_id = video_parent_file.id
    frame_input.parent_input_id = self.input.id
    frame_input.frame_packet_map = self.input.frame_packet_map

    # Caution: length is estimated; exact frame_count is only known
    # after the whole clip has been iterated, so can't be used yet.
    frame_input.video_parent_length = length

    # Temporary-usage only (not database persisted): defined here so
    # existing instances can use it without fetching video from db each
    # time; prior versions defined this on the first frame only.
    frame_input.root_blob_path_to_frames = video.root_blob_path_to_frames

    frame_input = self.get_instance_list_from_packet_map(
        input=frame_input,
        frame_number=index,
        global_frame_number=global_frame_number,
        initial_global_frame=initial_global_frame,
        from_video_split=self.input.type == 'from_video_split')

    # Frame priority: originally to do the last frame last, but
    # processing in order generally makes sense. An alternative would be
    # a flag on the last frame, but ordering is the more general
    # solution, assuming no surprises or extra overhead.
    #
    # Storing frames: maybe don't attach video_parent_file because it
    # leads to not-bound errors in the ORM fairly easily.
    #
    # TODO consider sending data as a "raw" blob to cloud storage, then
    # setting "processing deferred" to True here.
    queue_item = process_media.PrioritizedItem(
        priority=100 + index,  # process frames of earlier videos first
        input=frame_input,
        raw_numpy_image=frame,
        file_is_numpy_array=True,
        video_id=video.id,
        frame_number=index,
        global_frame_number=global_frame_number,
        media_type=frame_input.media_type)
    process_media.add_item_to_queue(queue_item)