# Standard-library and Django imports used by the views below. Project-specific models
# and helpers (Take, TrackingAsset, StaticScanAsset, FarmJob, FarmNode, aws, etc.) are
# assumed to be imported from the project's own modules.
import json
import os

from django.core.exceptions import ObjectDoesNotExist
from django.db import transaction
from django.db.models import Q
from django.http import HttpResponse
from django.utils import timezone


def post_create_tracking_asset(request):
    if request.method == 'POST':
        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        # filter().first() returns None when the take is missing, unlike get(), which raises
        src_take = Take.objects.filter(pk=j['take_id']).first()
        if not src_take:
            return HttpResponse(status=404)

        newasset = TrackingAsset(take=src_take, start_time=j['start_time'], end_time=j['end_time'])
        newasset.save()
        newasset.work_folder = os.path.join(src_take.export_path, 'work_TA%d' % newasset.id)
        newasset.save()

        # Automatically launch thumbnail job
        job = FarmJob(job_class='jobs.thumbnails.GenerateThumbnail',
                      created_by=request.user.username,
                      status='ready',
                      ext_tracking_assets=newasset,
                      req_gpu=False)
        job.save()

    return HttpResponse()
def submit_test_job(request):
    # TODO Access rights
    if not request.user.is_superuser:
        return HttpResponse(status=403)

    # DEBUGGING ONLY
    job = FarmJob(job_class='jobs.test.DummyJob',
                  created_by='submit_test_job',
                  params=request.body,
                  status='ready')
    job.save()

    return HttpResponse()
def post_export_takes(request):
    if request.method == 'POST':
        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        # Submit jobs to export these Take IDs on each node
        if 'takeid_list' in j and 'export_path' in j:
            job_priority = 50

            # Create list of machines, and the list of files to copy for each machine
            list_of_take_ids = j['takeid_list']
            export_path = j['export_path']

            userdata = getUserData(request)
            userdata.export_path = export_path
            userdata.save()

            for take_id in list_of_take_ids:
                # Create a list of files to export for each node
                files_to_export = {}

                take = Take.objects.get(pk=take_id)
                take.export_path = os.path.join(export_path, take.shot.session.name, take.shot.name,
                                                '%s_%d' % (take.name, take_id))
                take.work_folder = os.path.join(take.export_path, 'work_T%d' % int(take_id))
                take.save()

                take_frame_count = take.frame_count()

                # Set job priority according to take type
                if take.flag == 'calib':
                    job_priority = 55
                elif take.flag == 'best':
                    job_priority = 52
                if take_frame_count == 1:
                    job_priority = job_priority + 1

                # Build list of nodes and files
                for cam in take.cameras.all():
                    if cam.machine_name not in files_to_export:
                        files_to_export[cam.machine_name] = []
                    for filepath in cam.all_files.split(';'):
                        files_to_export[cam.machine_name].append((filepath, take.export_path))

                params = {}
                params['root_export_path'] = export_path
                params['export_path'] = take.export_path
                params['nodes'] = files_to_export
                params_json = json.dumps(params)

                # Create Job
                job1 = FarmJob(job_class='jobs.archive.ExportTake',
                               created_by=request.user.username,
                               params=params_json,
                               status='ready',
                               ext_take=take,
                               req_gpu=False,
                               priority=job_priority)
                job1.save()

                if take.is_scan_burst:
                    # If there is no scan asset associated with this Take, create it
                    if not StaticScanAsset.objects.filter(take=take).exists():
                        asset = StaticScanAsset(
                            project=take.shot.session.project,
                            name=take.full_name(),
                            take=take,
                            image_folder=take.export_path,
                            has_tracking=False  # because this asset was created with the Burst button
                        )
                        asset.save()
                        asset.work_folder = os.path.join(take.export_path, 'work_SA%d' % asset.id)
                        asset.save()

                        # Create Job for GenerateThumbnails for the new static scan asset
                        params = {}
                        params['take_export_path'] = take.export_path
                        params_json = json.dumps(params)

                        job2 = FarmJob(job_class='jobs.thumbnails.GenerateThumbnail',
                                       created_by=request.user.username,
                                       params=params_json,
                                       status='created',
                                       ext_scan_assets=asset,
                                       req_gpu=False,
                                       priority=job_priority)
                        job2.save()
                        job2.dependencies.add(job1)
                        job2.status = 'ready'
                        job2.save()

                # Create Job for GenerateThumbnails
                if take_frame_count != 1:
                    params = {}
                    params['take_export_path'] = take.export_path
                    params_json = json.dumps(params)

                    job2 = FarmJob(job_class='jobs.thumbnails.GenerateThumbnail',
                                   created_by=request.user.username,
                                   params=params_json,
                                   status='created',
                                   ext_take=take,
                                   req_gpu=False,
                                   priority=job_priority)
                    job2.save()
                    job2.dependencies.add(job1)
                    job2.status = 'ready'
                    job2.save()

    return HttpResponse()
def post_delete_takes(request):
    if request.method == 'POST':
        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        if 'takeid_list' in j:
            # List of take ids to delete
            list_of_take_ids = j['takeid_list']

            # Go through the list of takes to delete, and check if we should delete the captured files
            local_files_to_delete = []
            remote_files_to_delete = {}

            # Create the list of files to delete, on the server, on the capture nodes, and in the exported location
            for take in Take.objects.filter(pk__in=list_of_take_ids):
                if take.video_thumb:
                    local_files_to_delete.append(full_thumb_path(take.video_thumb))  # delete thumbnail from server
                for cam in take.cameras.all():
                    if cam.thumbnail_filename:
                        local_files_to_delete.append(full_thumb_path(cam.thumbnail_filename))  # delete thumbnail from server

                    # If the take is not exported yet, delete on local capture machines
                    files = cam.all_files.split(';')
                    if take.export_path:
                        # Take is exported, need to delete files from the server
                        files.extend([
                            os.path.join(take.export_path, os.path.split(f.replace('\\', '/'))[1])
                            for f in cam.all_files.split(';')
                        ])

                    if cam.machine_name not in remote_files_to_delete:
                        remote_files_to_delete[cam.machine_name] = []
                    remote_files_to_delete[cam.machine_name].extend(files)

            # Delete takes from database
            Take.objects.filter(pk__in=list_of_take_ids).delete()

            # Create jobs to delete local files on nodes, or exported files
            for key in remote_files_to_delete:
                params = {}
                params['files'] = remote_files_to_delete[key]
                params['node'] = key
                params_json = json.dumps(params)

                # Create Job
                job = FarmJob(job_class='jobs.archive.DeleteFiles',
                              created_by=request.user.username,
                              params=params_json,
                              status='ready',
                              req_gpu=False,
                              node=make_sure_node_exists(key))
                job.save()

            # Delete local server files (thumbnails)
            for f in local_files_to_delete:
                try:
                    os.remove(f)
                except OSError:
                    pass

    return HttpResponse()
def post_client_discover(request):
    if request.method == 'POST':
        update_aws_status = False

        cleanup_dead_jobs(request)  # TODO This could be on a schedule

        # Update database from received data
        r = json.loads(request.body)
        if 'status' not in r:
            raise Exception('Invalid request')

        # Look for an existing machine in the database with the same name
        nodes = FarmNode.objects.filter(machine_name__iexact=r['machine_name'])
        if nodes:
            # Node exists in database, update it
            node = nodes[0]
            node.ip_address = r['ip_address']
            if node.aws_instance_state != 'running':
                update_aws_status = True
            node.last_seen = timezone.now()
        else:
            # Node does not exist, create it
            node = FarmNode(ip_address=r['ip_address'], machine_name=r['machine_name'])
            update_aws_status = True

        if 'system' in r:
            node.system = r['system']
        if 'system_bits' in r:
            node.system_bits = r['system_bits']
        if 'cpu_brand' in r:
            node.cpu_brand = r['cpu_brand']
        if 'cpu_cores' in r:
            node.cpu_cores = r['cpu_cores']
        if 'cuda_device_count' in r:
            node.gpu_count = r['cuda_device_count']
        if 'restarted' in r and r['restarted']:
            node.req_restart = False
        if 'cpu_percent' in r:
            node.cpu_percent = r['cpu_percent']
        if 'mem_used' in r:
            node.virt_percent = r['mem_used']
        if 'os_version' in r:
            node.os_version = r['os_version']

        # AWS Cloud integration
        if update_aws_status:
            node.aws_instance_id, node.aws_instance_region, node.aws_instance_state = aws.instance_id_from_private_ip(
                node.ip_address)
        else:
            # AWS, check if this instance should be stopped for inactivity
            if node.aws_instance_should_be_stopped():
                nb_aws_running = FarmNode.objects.filter(aws_instance_state='running').count()
                slack_notification(
                    'Stopping inactive AWS instance: *%s* (running:%d)' % (node.machine_name, nb_aws_running - 1),
                    color='warning')
                node.aws_instance_state = aws.stop_instance(node.aws_instance_id, node.aws_instance_region)

        if FarmJob.objects.filter(status='running', node=node).count() > 0:
            node.last_job_activity = timezone.now()

        node.code_version = r['code_version'] if 'code_version' in r else 0
        node.git_version = r['git_version'] if 'git_version' in r else ''
        node.status = r['status']
        node.save()

        # Update tags on the farm node if client_tags are supplied, otherwise keep the tags in the DB
        if 'client_tags' in r:
            tags = r.get('client_tags', [])
            if node.aws_instance_id:
                tags.append('aws')
            if tags != node.tags:
                with transaction.atomic():
                    node.tags.set(*tags, clear=True)  # no need for node.save()
        else:
            # Client did not specify any tags, use the ones in the DB
            tags = node.tags.names()

        # In order to filter jobs by tags, we start with the list of all possible tags, then
        # remove the tags supported by this node. What remains is the list of tags that
        # cannot be fulfilled. Jobs with these tags should be filtered out.
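        # For example (illustrative values only): if the tags known across all nodes are
        # {'aws', 'win64', 'gpu'} and this node reports tags {'win64'}, excluded_tags becomes
        # ['aws', 'gpu'], so ready jobs tagged 'aws' or 'gpu' are never scheduled on this node.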
        all_possible_tags = FarmNode.tags.all().values_list('name', flat=True)
        excluded_tags = [x for x in all_possible_tags if x not in tags]

        available_jobs = r['available_jobs'] if 'available_jobs' in r else []

        jobs_to_terminate = [job.id for job in FarmJob.objects.filter(status='terminating', node=node)]

        # Update database from running and finished jobs (if they are not 'terminating')
        if 'running_jobs_progress' in r:
            for job_id, progress in r['running_jobs_progress']:
                FarmJob.objects.filter(id=job_id).filter(~Q(status='terminating')).filter(
                    ~Q(status='running') | ~Q(progress=progress)).update(
                        status='running', progress=progress, modified=timezone.now())
        elif 'running_jobs' in r:
            FarmJob.objects.filter(id__in=r['running_jobs']).filter(~Q(status='terminating')).filter(
                ~Q(status='running')).update(status='running', modified=timezone.now())

        if 'finished_jobs' in r:
            for job in r['finished_jobs']:
                progress = job['progress'] if 'progress' in job else ''
                try:
                    # Update job with new status
                    this_job = FarmJob.objects.get(pk=job['job_id'])
                    this_job.progress = progress

                    if 'children' in job:
                        # Yield to children
                        for job_info in job['children']:
                            if isinstance(job_info, dict):
                                # Create child job
                                target_node = None
                                if 'node_name' in job_info:
                                    target_node = make_sure_node_exists(job_info['node_name'])
                                child = FarmJob(job_class=job_info['job_class'],
                                                created_by=this_job.created_by,
                                                params=job_info['params'],
                                                status='ready',
                                                parent=this_job,
                                                node=target_node,
                                                req_version=this_job.req_version,
                                                req_gpu=this_job.req_gpu)
                                child.save()
                            else:
                                # Backward compatibility code
                                child = FarmJob(job_class=job_info[0],
                                                created_by=this_job.created_by,
                                                params=job_info[1],
                                                status='ready',
                                                parent=this_job)
                                child.save()
                        g_logger.info('Job #%s set to WAITING' % (job['job_id']))
                        this_job.status = 'waiting'
                    elif 'success' in job and job['success']:
                        g_logger.info('Job #%s set to SUCCESS' % (job['job_id']))
                        this_job.status = 'success'
                        this_job.end_time = timezone.now()
                    else:
                        g_logger.info('Job #%s set to FAILED' % (job['job_id']))
                        this_job.status = 'failed'
                        this_job.exception = job['exception']
                        this_job.end_time = timezone.now()

                    # Update parent job, if it exists
                    this_job.save()
                    onJobChanged(this_job, request)
                except ObjectDoesNotExist:
                    pass  # Job does not exist anymore

        if 'running_jobs' in r:
            # Jobs that are running according to the DB, but not according to the node
            for lost_job in FarmJob.objects.filter(Q(status='running') | Q(status='terminating')).filter(
                    node=node).exclude(pk__in=r['running_jobs']):
                g_logger.info('Job #%d failed because not in running_jobs' % (lost_job.id))
                lost_job.status = 'failed'
                lost_job.save()
                onJobChanged(lost_job, request)

        data = {}
        if node.status == 'accepting' and (node.aws_instance_id is None or node.aws_instance_state == 'running'):
            data['jobs'] = []
            data['jobs_to_kill'] = []
            data['req_restart'] = node.req_restart

            # Scheduler: reserve some tasks for specific machines
            if not node.req_restart:
                try:
                    with transaction.atomic():
                        # Classes representing two different job channels, one for light jobs and one
                        # for heavy jobs. The two channels can execute concurrently on each machine.
                        light_job_classes = ['jobs.thumbnails.GenerateThumbnail', 'jobs.test.SpeedTest']

                        class Channel():
                            def __init__(self):
                                self.max_instances = 1
                                self.nb_running = FarmJob.objects.filter(
                                    status='running', node=node).filter(self.filter_q()).count()

                            def can_run(self):
                                return self.nb_running < self.max_instances

                        class LightChannel(Channel):
                            def filter_q(self):
                                return Q(job_class__in=light_job_classes)

                        class HeavyChannel(Channel):
                            def filter_q(self):
                                return ~Q(job_class__in=light_job_classes)

                        channels = [LightChannel(), HeavyChannel()]

                        if any(c.can_run() for c in channels):
                            # Query for all jobs we could run on this node
                            if node.active:
                                next_jobs = FarmJob.objects.select_for_update().filter(
                                    status='ready', req_version__lte=node.code_version).filter(
                                        Q(node=node) | Q(node=None)).filter(
                                            job_class__in=available_jobs).exclude(tags__name__in=excluded_tags)
                            else:
                                next_jobs = FarmJob.objects.select_for_update().filter(
                                    status='ready', req_version__lte=node.code_version).filter(
                                        Q(node=node)).filter(
                                            job_class__in=available_jobs).exclude(tags__name__in=excluded_tags)

                            # Add filter for GPU
                            if node.gpu_count <= 0:
                                next_jobs = next_jobs.filter(req_gpu=False)

                            # Sort jobs by priority
                            next_jobs = next_jobs.order_by('-priority')

                            # Create filters for each channel
                            filter_q_list = [c.filter_q() for c in channels if c.can_run()]
                            if filter_q_list:
                                # Apply filter for each channel
                                next_jobs = next_jobs.filter(or_list(filter_q_list))

                            # Go through each job, check its dependencies, and exit as soon as one good job is found
                            for next_job in next_jobs:
                                # Check job dependencies (only proceed if no dependency is in a state other than 'success')
                                if next_job.dependencies.filter(~Q(status='success')).count() == 0:
                                    # TODO This should be in the same query, otherwise we may be looping for no reason
                                    g_logger.info('Job #%s RESERVED for %s' % (next_job.id, node.machine_name))

                                    # Make sure there are no children on this job
                                    next_job.children.all().delete()

                                    # Send a single job to this machine
                                    next_job.status = 'reserved'
                                    next_job.node = node
                                    next_job.exception = None
                                    next_job.start_time = timezone.now()
                                    next_job.save()
                                    break
                except Exception as e:
                    g_logger.error('Scheduler failed %s' % e)

            # Send reserved jobs to node
            jobs = FarmJob.objects.filter(status='reserved', node=node)
            for job in jobs:
                g_logger.info('Job #%s SUBMIT to %s' % (job.id, node.machine_name))
                job_data = {'job_id': job.id, 'job_class': job.job_class, 'params': job.params}
                data['jobs'].append(job_data)

            # Send jobs to kill to node
            for job_id in jobs_to_terminate:
                g_logger.info('Job #%s KILL to %s' % (job_id, node.machine_name))
                data['jobs_to_kill'].append(job_id)

        return JSONResponse(data)
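
# Note: the scheduler above relies on an or_list() helper to OR-combine the per-channel Q
# filters. The real helper presumably lives elsewhere in this project; the definition below
# is only a minimal sketch of the assumed behavior, inferred from how it is called above.
from functools import reduce
import operator


def or_list(q_list):
    # Combine a non-empty list of Q objects into a single OR'ed filter,
    # e.g. or_list([Q(a=1), Q(b=2)]) is equivalent to Q(a=1) | Q(b=2).
    return reduce(operator.or_, q_list)
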
def restart_job(request):
    if 'job_id' not in request.data:
        return HttpResponse(status=500)

    clone_job = request.data['clone_job'] if 'clone_job' in request.data else False
    use_same_machine = request.data['use_same_machine'] if 'use_same_machine' in request.data else False

    # Find the job to be restarted; filter().first() returns None when it does not exist
    src_job = FarmJob.objects.filter(pk=request.data['job_id']).first()
    if not src_job:
        return HttpResponse(status=404)
    if not src_job.has_write_access(request.user):
        return HttpResponse(status=403)

    if clone_job:
        # Create duplicate job
        job = FarmJob(job_class=src_job.job_class,
                      created_by=request.user.username,
                      params=src_job.params,
                      node=src_job.node if use_same_machine else None,
                      ext_take=src_job.ext_take,
                      ext_scan_assets=src_job.ext_scan_assets,
                      ext_tracking_assets=src_job.ext_tracking_assets,
                      req_gpu=src_job.req_gpu,
                      priority=src_job.priority,
                      status='created')
        job.save()

        # Copy tags in a second pass (required for ManyToMany)
        job.tags.set(*src_job.tags.names(), clear=True)

        job.status = 'ready'
        job.save()

        g_logger.info('Job #%d restarted as job #%d' % (src_job.id, job.id))
    else:
        # If some children are still running, refuse to restart
        if src_job.children.filter(Q(status='running') | Q(status='waiting')).count() > 0:
            return JSONResponse({'message': 'Error, child running'}, status=403)

        # Delete all child jobs
        src_job.children.all().delete()

        on_job_restart(src_job.id)

        # Update job status
        src_job.status = 'ready'
        src_job.exception = None
        src_job.image_filename = None
        src_job.mesh_filename = None
        src_job.progress = None
        src_job.start_time = None
        src_job.end_time = None
        if not use_same_machine:
            src_job.node = None
        src_job.save()

        g_logger.info('Job #%d restarted' % (src_job.id))

    return HttpResponse()
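
# Example request body for restart_job (illustrative values only):
#   {"job_id": 1234, "clone_job": true, "use_same_machine": false}
# With clone_job set, a fresh copy of the job (including its tags) is created and marked
# 'ready'; otherwise the existing job is reset in place and its child jobs are deleted.
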
def post_export_takes(request):
    if request.method == 'POST':
        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        # Submit jobs to export these Take IDs on each node
        if 'takeid_list' in j and 'export_path' in j:
            # Create list of machines, and the list of files to copy for each machine
            list_of_take_ids = j['takeid_list']
            export_path = j['export_path']

            userdata = getUserData(request)
            userdata.export_path = export_path
            userdata.save()

            for take_id in list_of_take_ids:
                # Create a list of files to export for each node
                files_to_export = {}

                take = Take.objects.get(pk=take_id)
                take.export_path = os.path.join(export_path, take.shot.session.name, take.shot.name, take.name)
                take.save()

                # Build list of nodes and files
                for cam in take.cameras.all():
                    if cam.machine_name not in files_to_export:
                        files_to_export[cam.machine_name] = []
                    for filepath in cam.all_files.split(';'):
                        files_to_export[cam.machine_name].append((filepath, take.export_path))

                params = {}
                params['root_export_path'] = export_path
                params['export_path'] = take.export_path
                params['nodes'] = files_to_export
                params_json = json.dumps(params)

                # Create Job
                job1 = FarmJob(job_class='jobs.archive.ExportTake',
                               created_by=request.user.username,
                               params=params_json,
                               status='ready',
                               ext_take=take,
                               req_gpu=False)
                job1.save()

                # Create Job for GenerateThumbnails
                # TODO Only if AVI, not if single-shot TIF
                params = {}
                params['take_export_path'] = take.export_path
                params_json = json.dumps(params)

                job2 = FarmJob(job_class='jobs.thumbnails.GenerateThumbnail',
                               created_by=request.user.username,
                               params=params_json,
                               status='created',
                               ext_take=take,
                               req_gpu=False)
                job2.save()
                job2.dependencies.add(job1)
                job2.status = 'ready'
                job2.save()

    return HttpResponse('Ok')