Example #1
def post_create_tracking_asset(request):
    if request.method == 'POST':

        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        src_take = Take.objects.filter(pk=j['take_id']).first()
        if not src_take:
            return HttpResponse(status=404)

        newasset = TrackingAsset(take=src_take,
                                 start_time=j['start_time'],
                                 end_time=j['end_time'])
        newasset.save()

        newasset.work_folder = os.path.join(src_take.export_path,
                                            'work_TA%d' % newasset.id)
        newasset.save()

        # Automatically Launch thumbnail job
        job = FarmJob(job_class='jobs.thumbnails.GenerateThumbnail',
                      created_by=request.user.username,
                      status='ready',
                      ext_tracking_assets=newasset,
                      req_gpu=False)
        job.save()

    return HttpResponse()
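A minimal way to exercise this endpoint from a Django test, assuming a superuser account and a hypothetical route (the URL and payload values below are illustrative; only the JSON keys come from the view):

import json
from django.test import Client
from django.contrib.auth.models import User

superuser = User.objects.create_superuser('admin', 'admin@example.com', 'pw')
client = Client()
client.force_login(superuser)  # the view rejects non-superusers with 403
response = client.post('/tracking_asset/create',  # hypothetical route
                       data=json.dumps({'take_id': 1,
                                        'start_time': 0.0,
                                        'end_time': 2.5}),
                       content_type='application/json')
assert response.status_code == 200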
Example #2
def submit_test_job(request):

    # TODO Access rights
    if not request.user.is_superuser:
        return HttpResponse(status=403)

    # DEBUGGING ONLY

    job = FarmJob(job_class='jobs.test.DummyJob',
                  created_by='submit_test_job',
                  params=request.body.decode('utf-8'),  # raw body is bytes under Python 3
                  status='ready')
    job.save()

    return HttpResponse()
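The request body is stored verbatim in FarmJob.params, so its shape is entirely up to the job implementation. jobs.test.DummyJob itself is not shown in these examples; the sketch below is purely illustrative, and its constructor and run method are assumptions:

import json
import time

class DummyJob(object):
    """Hypothetical job class: the real farm job interface is not shown here."""

    def __init__(self, params):
        # params arrives as the raw request body stored on the FarmJob row
        self.params = json.loads(params) if params else {}

    def run(self):
        # Pretend to do work for a configurable amount of time
        time.sleep(self.params.get('sleep_seconds', 1))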
Example #3
def post_export_takes(request):
    if request.method == 'POST':

        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        # Submit jobs to export these Take IDs on each node

        if 'takeid_list' in j and 'export_path' in j:

            job_priority = 50

            # Create list of machines, and the list of files to copy for each machine
            list_of_take_ids = j['takeid_list']
            export_path = j['export_path']

            userdata = getUserData(request)
            userdata.export_path = export_path
            userdata.save()

            for take_id in list_of_take_ids:

                # Create a list of files to export for each node
                files_to_export = {}

                take = Take.objects.get(pk=take_id)
                take.export_path = os.path.join(export_path,
                                                take.shot.session.name,
                                                take.shot.name,
                                                '%s_%d' % (take.name, take_id))
                take.work_folder = os.path.join(take.export_path,
                                                'work_T%d' % int(take_id))
                take.save()

                take_frame_count = take.frame_count()

                # Set job priority according to take type
                if take.flag == 'calib':
                    job_priority = 55
                elif take.flag == 'best':
                    job_priority = 52
                if take_frame_count == 1:
                    job_priority += 1

                # Build list of nodes and files
                for cam in take.cameras.all():

                    if cam.machine_name not in files_to_export:
                        files_to_export[cam.machine_name] = []

                    for filepath in cam.all_files.split(';'):
                        files_to_export[cam.machine_name].append(
                            (filepath, take.export_path))

                params = {}
                params['root_export_path'] = export_path
                params['export_path'] = take.export_path
                params['nodes'] = files_to_export
                params_json = json.dumps(params)

                # Create Job
                job1 = FarmJob(job_class='jobs.archive.ExportTake',
                               created_by=request.user.username,
                               params=params_json,
                               status='ready',
                               ext_take=take,
                               req_gpu=False,
                               priority=job_priority)
                job1.save()

                if take.is_scan_burst:
                    # If there is no scan asset associated with this Take, create it
                    if not StaticScanAsset.objects.filter(take=take).exists():
                        asset = StaticScanAsset(
                            project=take.shot.session.project,
                            name=take.full_name(),
                            take=take,
                            image_folder=take.export_path,
                            # False because this asset was created with the Burst button
                            has_tracking=False)
                        asset.save()
                        asset.work_folder = os.path.join(
                            take.export_path, 'work_SA%d' % asset.id)
                        asset.save()

                        # Create Job for GenerateThumbnails for the new static scan asset
                        params = {}
                        params['take_export_path'] = take.export_path
                        params_json = json.dumps(params)
                        job2 = FarmJob(
                            job_class='jobs.thumbnails.GenerateThumbnail',
                            created_by=request.user.username,
                            params=params_json,
                            status='created',
                            ext_scan_assets=asset,
                            req_gpu=False,
                            priority=job_priority)
                        job2.save()
                        job2.dependencies.add(job1)
                        job2.status = 'ready'
                        job2.save()

                # Create Job for GenerateThumbnails
                if take_frame_count != 1:
                    params = {}
                    params['take_export_path'] = take.export_path
                    params_json = json.dumps(params)
                    job2 = FarmJob(
                        job_class='jobs.thumbnails.GenerateThumbnail',
                        created_by=request.user.username,
                        params=params_json,
                        status='created',
                        ext_take=take,
                        req_gpu=False,
                        priority=job_priority)
                    job2.save()
                    job2.dependencies.add(job1)
                    job2.status = 'ready'
                    job2.save()

    return HttpResponse()
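Note the dependency idiom used for the thumbnail jobs: job2 is created in the 'created' state, linked to job1 through the dependencies many-to-many field, and only then flipped to 'ready', so the scheduler cannot reserve it before the dependency row exists. A minimal sketch of the same idiom as a hypothetical helper (not part of the original code):

def submit_with_dependencies(job, *deps):
    """Hypothetical helper mirroring the create/link/release idiom above."""
    job.status = 'created'  # invisible to the scheduler while linking
    job.save()              # must be saved before M2M rows can be added
    for dep in deps:
        job.dependencies.add(dep)
    job.status = 'ready'    # now safe for the scheduler to reserve
    job.save()
    return job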
Example #4
def post_delete_takes(request):
    if request.method == 'POST':

        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        if 'takeid_list' in j:

            # List of take ids to delete
            list_of_take_ids = j['takeid_list']

            # Go through the list of takes to delete, and check if we should delete the captured files
            local_files_to_delete = []
            remote_files_to_delete = {}

            # Create list of files to delete, both on server and on the nodes, and on the exported location
            for take in Take.objects.filter(pk__in=list_of_take_ids):

                if take.video_thumb:
                    local_files_to_delete.append(
                        full_thumb_path(
                            take.video_thumb))  # delete thumbnail from server

                for cam in take.cameras.all():

                    if cam.thumbnail_filename:
                        local_files_to_delete.append(
                            full_thumb_path(cam.thumbnail_filename)
                        )  # delete thumbnail from server

                    # The original files on the local capture machines are always deleted
                    files = cam.all_files.split(';')

                    if take.export_path:
                        # Take is exported, need to delete files from the server
                        files.extend([
                            os.path.join(
                                take.export_path,
                                os.path.split(f.replace('\\', '/'))[1])
                            for f in cam.all_files.split(';')
                        ])

                    if cam.machine_name not in remote_files_to_delete:
                        remote_files_to_delete[cam.machine_name] = []
                    remote_files_to_delete[cam.machine_name].extend(files)

            # Delete takes from database
            Take.objects.filter(pk__in=list_of_take_ids).delete()

            # Create jobs to delete local files on nodes, or exported files
            for key in remote_files_to_delete:

                params = {}
                params['files'] = remote_files_to_delete[key]
                params['node'] = key
                params_json = json.dumps(params)

                # Create Job
                job = FarmJob(job_class='jobs.archive.DeleteFiles',
                              created_by=request.user.username,
                              params=params_json,
                              status='ready',
                              req_gpu=False,
                              node=make_sure_node_exists(key))
                job.save()

            # Delete local server files (thumbnails)
            for f in local_files_to_delete:
                try:
                    os.remove(f)
                except:
                    pass

    return HttpResponse()
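The view relies on a make_sure_node_exists helper that is not shown in these examples. A plausible minimal implementation, assuming FarmNode is keyed by machine_name, would be:

def make_sure_node_exists(machine_name):
    # Assumed helper: the real implementation is not shown in these examples
    node, _created = FarmNode.objects.get_or_create(machine_name=machine_name)
    return node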
Example #5
def post_client_discover(request):
    if request.method == 'POST':

        update_aws_status = False

        cleanup_dead_jobs(request)  # TODO This could be on a schedule

        # Update database from received data
        r = json.loads(request.body)

        if 'status' not in r:
            raise Exception('Invalid request')

        # Look for existing machine in the database, with the same ip
        nodes = FarmNode.objects.filter(machine_name__iexact=r['machine_name'])
        if nodes:
            # Node exists in database, update it
            node = nodes[0]
            node.ip_address = r['ip_address']

            if node.aws_instance_state != 'running':
                update_aws_status = True

            node.last_seen = timezone.now()

        else:
            # Node does not exist, create it
            node = FarmNode(ip_address=r['ip_address'],
                            machine_name=r['machine_name'])
            update_aws_status = True

        if 'system' in r:
            node.system = r['system']
        if 'system_bits' in r:
            node.system_bits = r['system_bits']
        if 'cpu_brand' in r:
            node.cpu_brand = r['cpu_brand']
        if 'cpu_cores' in r:
            node.cpu_cores = r['cpu_cores']
        if 'cuda_device_count' in r:
            node.gpu_count = r['cuda_device_count']
        if 'restarted' in r and r['restarted']:
            node.req_restart = False
        if 'cpu_percent' in r:
            node.cpu_percent = r['cpu_percent']
        if 'mem_used' in r:
            node.virt_percent = r['mem_used']
        if 'os_version' in r:
            node.os_version = r['os_version']

        # AWS Cloud integration
        if update_aws_status:
            node.aws_instance_id, node.aws_instance_region, node.aws_instance_state = aws.instance_id_from_private_ip(
                node.ip_address)
        else:
            # AWS, check if this instance should be stopped for inactivity
            if node.aws_instance_should_be_stopped():
                nb_aws_running = FarmNode.objects.filter(
                    aws_instance_state='running').count()
                slack_notification(
                    'Stopping inactive AWS instance: *%s* (running:%d)' %
                    (node.machine_name, nb_aws_running - 1),
                    color='warning')
                node.aws_instance_state = aws.stop_instance(
                    node.aws_instance_id, node.aws_instance_region)

        if FarmJob.objects.filter(status='running', node=node).count() > 0:
            node.last_job_activity = timezone.now()
        node.code_version = r.get('code_version', 0)
        node.git_version = r.get('git_version', '')
        node.status = r['status']
        node.save()

        # Update tags on the farm node if client_tags are supplied; otherwise keep the tags in the DB
        if 'client_tags' in r:
            tags = r.get('client_tags', [])
            if node.aws_instance_id:
                tags.append('aws')
            if set(tags) != set(node.tags.names()):
                with transaction.atomic():
                    node.tags.set(*tags, clear=True)  # don't need node.save()
        else:
            # client did not specify any tags, use the ones in DB
            tags = node.tags.names()

        # In order to filter jobs by tags, we start with the list of all possible tags, then
        # remove the tags supported by this node. What remains is the list of tags that
        # cannot be fulfilled. Jobs with these tags should be filtered out.
        all_possible_tags = FarmNode.tags.all().values_list('name', flat=True)
        excluded_tags = [x for x in all_possible_tags if x not in tags]

        available_jobs = r.get('available_jobs', [])
        jobs_to_terminate = list(
            FarmJob.objects.filter(status='terminating',
                                   node=node).values_list('id', flat=True))

        # Update database from running and finished jobs (if they are not 'terminating')
        if 'running_jobs_progress' in r:
            for job_id, progress in r['running_jobs_progress']:
                FarmJob.objects.filter(
                    id=job_id).filter(~Q(status='terminating')).filter(
                        ~Q(status='running') | ~Q(progress=progress)).update(
                            status='running',
                            progress=progress,
                            modified=timezone.now())

        elif 'running_jobs' in r:
            FarmJob.objects.filter(id__in=r['running_jobs']).filter(~Q(
                status='terminating')).filter(~Q(status='running')).update(
                    status='running', modified=timezone.now())

        if 'finished_jobs' in r:
            for job in r['finished_jobs']:

                progress = job.get('progress', '')

                try:

                    # Update job with new status
                    this_job = FarmJob.objects.get(pk=job['job_id'])
                    this_job.progress = progress
                    if 'children' in job:
                        # Yield to children
                        for job_info in job['children']:

                            if isinstance(job_info, dict):
                                # Create child job
                                target_node = None
                                if 'node_name' in job_info:
                                    target_node = make_sure_node_exists(
                                        job_info['node_name'])

                                child = FarmJob(
                                    job_class=job_info['job_class'],
                                    created_by=this_job.created_by,
                                    params=job_info['params'],
                                    status='ready',
                                    parent=this_job,
                                    node=target_node,
                                    req_version=this_job.req_version,
                                    req_gpu=this_job.req_gpu)
                                child.save()
                            else:
                                # Backward compatibility code
                                child = FarmJob(job_class=job_info[0],
                                                created_by=this_job.created_by,
                                                params=job_info[1],
                                                status='ready',
                                                parent=this_job)
                                child.save()

                        g_logger.info('Job #%s set to WAITING' %
                                      (job['job_id']))
                        this_job.status = 'waiting'
                    elif 'success' in job and job['success']:
                        g_logger.info('Job #%s set to SUCCESS' %
                                      (job['job_id']))
                        this_job.status = 'success'
                        this_job.end_time = timezone.now()
                    else:
                        g_logger.info('Job #%s set to FAILED' %
                                      (job['job_id']))
                        this_job.status = 'failed'
                        this_job.exception = job['exception']
                        this_job.end_time = timezone.now()

                    # Update parent job, if it exists
                    this_job.save()
                    onJobChanged(this_job, request)

                except ObjectDoesNotExist:
                    pass  # Job does not exist anymore

        if 'running_jobs' in r:
            # Jobs that are running according to the DB, but not according to the node
            for lost_job in FarmJob.objects.filter(
                    Q(status='running') | Q(status='terminating')).filter(
                        node=node).exclude(pk__in=r['running_jobs']):
                g_logger.info('Job #%d failed because not in running_jobs' %
                              (lost_job.id))
                lost_job.status = 'failed'
                lost_job.save()
                onJobChanged(lost_job, request)

        data = {}

        if node.status == 'accepting' and (node.aws_instance_id is None
                                           or node.aws_instance_state
                                           == 'running'):

            data['jobs'] = []
            data['jobs_to_kill'] = []
            data['req_restart'] = node.req_restart

            # Scheduler, reserve some tasks for specific machines
            if not node.req_restart:
                try:
                    with transaction.atomic():

                        # Two job channels, one for light jobs and one for heavy jobs;
                        # the two channels execute concurrently on each machine
                        light_job_classes = [
                            'jobs.thumbnails.GenerateThumbnail',
                            'jobs.test.SpeedTest'
                        ]

                        class Channel:
                            def __init__(self):
                                self.max_instances = 1
                                self.nb_running = FarmJob.objects.filter(
                                    status='running',
                                    node=node).filter(self.filter_q()).count()

                            def can_run(self):
                                return self.nb_running < self.max_instances

                        class LightChannel(Channel):
                            def filter_q(self):
                                return Q(job_class__in=light_job_classes)

                        class HeavyChannel(Channel):
                            def filter_q(self):
                                return ~Q(job_class__in=light_job_classes)

                        channels = [LightChannel(), HeavyChannel()]

                        if any(c.can_run() for c in channels):

                            # Query for all jobs we could run on this node;
                            # inactive nodes only receive jobs pinned to them
                            node_q = Q(node=node)
                            if node.active:
                                node_q |= Q(node=None)
                            next_jobs = FarmJob.objects.select_for_update(
                            ).filter(
                                status='ready',
                                req_version__lte=node.code_version).filter(
                                    node_q).filter(
                                        job_class__in=available_jobs).exclude(
                                            tags__name__in=excluded_tags)

                            # Add filter for GPU
                            if node.gpu_count <= 0:
                                next_jobs = next_jobs.filter(req_gpu=False)

                            # Sort jobs by priority
                            next_jobs = next_jobs.order_by('-priority')

                            # Create filters for each channel
                            filter_q_list = [
                                c.filter_q() for c in channels if c.can_run()
                            ]
                            if filter_q_list:

                                # Apply filter for each channel
                                next_jobs = next_jobs.filter(
                                    or_list(filter_q_list))

                                # Go thru each job, check dependency, and exit as soon as one good job is found
                                for next_job in next_jobs:

                                    # Check job dependencies: proceed only if
                                    # no dependency is in a non-'success' state
                                    if next_job.dependencies.filter(~Q(
                                            status='success')).count() == 0:

                                        # TODO This should be in the same query, otherwise we may be looping for no reason

                                        g_logger.info(
                                            'Job #%s RESERVED for %s' %
                                            (next_job.id, node.machine_name))

                                        # Remove any leftover child jobs from a previous run
                                        next_job.children.all().delete()

                                        # Send a single job to this machine
                                        next_job.status = 'reserved'
                                        next_job.node = node
                                        next_job.exception = None
                                        next_job.start_time = timezone.now()
                                        next_job.save()

                                        break

                except Exception as e:
                    g_logger.error('Scheduler failed %s' % e)

            # Send reserved jobs to node
            jobs = FarmJob.objects.filter(status='reserved', node=node)
            for job in jobs:

                g_logger.info('Job #%s SUBMIT to %s' %
                              (job.id, node.machine_name))

                job_data = {
                    'job_id': job.id,
                    'job_class': job.job_class,
                    'params': job.params
                }
                data['jobs'].append(job_data)

            # Send jobs to kill to node
            for job_id in jobs_to_terminate:

                g_logger.info('Job #%s KILL to %s' %
                              (job_id, node.machine_name))

                data['jobs_to_kill'].append(job_id)

        return JSONResponse(data)
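The scheduler calls an or_list helper that is not defined in these examples. A minimal sketch of what it presumably does (OR-combining a non-empty list of Q objects) is:

from functools import reduce
import operator

def or_list(q_list):
    # Assumed helper: combines a non-empty list of Q objects with OR,
    # e.g. or_list([Q(a=1), Q(b=2)]) behaves like Q(a=1) | Q(b=2)
    return reduce(operator.or_, q_list)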
Example #6
def restart_job(request):

    if 'job_id' not in request.data:
        return HttpResponse(status=400)  # missing parameter is a client error, not a server error

    clone_job = request.data.get('clone_job', False)
    use_same_machine = request.data.get('use_same_machine', False)

    # Find the job to be restarted
    src_job = FarmJob.objects.filter(pk=request.data['job_id']).first()
    if not src_job:
        return HttpResponse(status=404)

    if not src_job.has_write_access(request.user):
        return HttpResponse(status=403)

    if clone_job:

        # Create duplicate job
        job = FarmJob(job_class=src_job.job_class,
                      created_by=request.user.username,
                      params=src_job.params,
                      node=src_job.node if use_same_machine else None,
                      ext_take=src_job.ext_take,
                      ext_scan_assets=src_job.ext_scan_assets,
                      ext_tracking_assets=src_job.ext_tracking_assets,
                      req_gpu=src_job.req_gpu,
                      priority=src_job.priority,
                      status='created')
        job.save()
        # copy tags in a second pass (required for ManyToMany)
        job.tags.set(*src_job.tags.names(), clear=True)
        job.status = 'ready'
        job.save()

        g_logger.info('Job #%d restarted as job #%d' % (src_job.id, job.id))

    else:

        # If any child jobs are still running, refuse to restart
        if src_job.children.filter(Q(status='running')
                                   | Q(status='waiting')).count() > 0:
            return JSONResponse({'message': 'Error, child running'},
                                status=403)

        # Delete all child jobs
        src_job.children.all().delete()

        on_job_restart(src_job.id)

        # Update job status
        src_job.status = 'ready'
        src_job.exception = None
        src_job.image_filename = None
        src_job.mesh_filename = None
        src_job.progress = None
        src_job.start_time = None
        src_job.end_time = None
        if not use_same_machine:
            src_job.node = None
        src_job.save()

        g_logger.info('Job #%d restarted' % (src_job.id))

    return HttpResponse()
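A hedged usage sketch, assuming the view is exposed through a Django REST Framework route (the path below is an assumption; the use of request.data in the view implies a DRF request):

from django.contrib.auth.models import User
from rest_framework.test import APIClient

superuser = User.objects.create_superuser('admin', 'admin@example.com', 'pw')
client = APIClient()
client.force_authenticate(user=superuser)
response = client.post('/farm/restart_job',  # hypothetical route
                       {'job_id': 42, 'clone_job': True,
                        'use_same_machine': False},
                       format='json')
assert response.status_code == 200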
Example #7
def post_export_takes(request):
    if request.method == 'POST':

        # TODO Access rights
        if not request.user.is_superuser:
            return HttpResponse(status=403)

        j = json.loads(request.body)

        # Submit jobs to export these Take IDs on each node

        if 'takeid_list' in j and 'export_path' in j:

            # Create list of machines, and the list of files to copy for each machine
            list_of_take_ids = j['takeid_list']
            export_path = j['export_path']

            userdata = getUserData(request)
            userdata.export_path = export_path
            userdata.save()

            for take_id in list_of_take_ids:

                # Create a list of files to export for each node
                files_to_export = {}

                take = Take.objects.get(pk=take_id)
                take.export_path = os.path.join(export_path,
                                                take.shot.session.name,
                                                take.shot.name, take.name)
                take.save()

                # Build list of nodes and files
                for cam in take.cameras.all():

                    if cam.machine_name not in files_to_export:
                        files_to_export[cam.machine_name] = []

                    for filepath in cam.all_files.split(';'):
                        files_to_export[cam.machine_name].append(
                            (filepath, take.export_path))

                params = {}
                params['root_export_path'] = export_path
                params['export_path'] = take.export_path
                params['nodes'] = files_to_export
                params_json = json.dumps(params)

                # Create Job
                job1 = FarmJob(job_class='jobs.archive.ExportTake',
                               created_by=request.user.username,
                               params=params_json,
                               status='ready',
                               ext_take=take,
                               req_gpu=False)
                job1.save()

                # Create Job for GenerateThumbnails
                # TODO Only if AVI, not if Single shot TIF
                params = {}
                params['take_export_path'] = take.export_path
                params_json = json.dumps(params)
                job2 = FarmJob(job_class='jobs.thumbnails.GenerateThumbnail',
                               created_by=request.user.username,
                               params=params_json,
                               status='created',
                               ext_take=take,
                               req_gpu=False)
                job2.save()
                job2.dependencies.add(job1)
                job2.status = 'ready'
                job2.save()

    return HttpResponse('Ok')
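For reference, the nodes mapping built above pairs each capture file with the take's export path, grouped by machine name. An illustrative value (all paths are made up; the real entries come from Camera.all_files and Take.export_path):

files_to_export = {
    'capture-node-01': [
        ('D:/capture/cam01/frames.avi', '/exports/sessionA/shot1/take3'),
        ('D:/capture/cam01/audio.wav', '/exports/sessionA/shot1/take3'),
    ],
    'capture-node-02': [
        ('D:/capture/cam02/frames.avi', '/exports/sessionA/shot1/take3'),
    ],
}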