Beispiel #1
0
def extract_features(pid,job):
    """Compute and upload the features CSV (and multiblob CSV, if any) for a bin.

    Returns immediately when the features file already exists at its
    destination. Otherwise the bin zip and the blob zip are downloaded into
    a temporary directory, the MATLAB ``bin_features`` command is run on
    them, the resulting CSV file(s) are uploaded and ``client.wakeup()`` is
    called.

    Args:
        pid: bin identifier; ``parse_pid`` is used to extract its namespace
            and LID.
        job: not used by this function.

    Raises:
        Exception: if MATLAB exits without having written the features CSV.
    """
    def log_callback(msg):
        # every progress message is logged and also sent as a heartbeat
        # (logging.warn is a deprecated alias of logging.warning)
        logging.warning('FEATURES %s', msg)
        client.heartbeat(pid,message=msg)
    parsed_pid = parse_pid(pid)
    bin_lid = parsed_pid[LID]
    bin_pid = ''.join([parsed_pid[NAMESPACE], parsed_pid[LID]])
    binzip_url = bin_pid + '_binzip.zip'
    blob_url = bin_pid + '_blob.zip'
    features_url = bin_pid + '_features.csv'
    multiblob_url = bin_pid + '_multiblob.csv'
    if exists(features_url):
        log_callback('skipping %s - features exist' % pid)
        return
    log_callback('computing features for %s' % pid)
    with safe_tempdir() as binzip_dir:
        # download bin zip
        binzip_path = os.path.join(binzip_dir, '%s.zip' % bin_lid)
        log_callback('downloading %s to %s' % (binzip_url, binzip_path))
        download(binzip_url, binzip_path)
        # download blob zip next to the bin zip
        blob_path = os.path.join(binzip_dir, '%s_blob.zip' % bin_lid)
        log_callback('downloading %s to %s' % (blob_url, blob_path))
        download(blob_url, blob_path)
        # compute features
        with safe_tempdir() as job_dir:
            # expected output locations of the matlab job
            feature_csv = os.path.join(job_dir, csvname(bin_pid))
            multiblob_csv = os.path.join(job_dir, 'multiblob', multiblobname(bin_pid))
            # params for matlab job: the directory holding the zips (with a
            # trailing slash) and the file name of the bin zip
            namespace = os.path.dirname(binzip_path) + '/'
            lid = os.path.basename(binzip_path)
            matlab = Matlab(MATLAB_EXEC_PATH, MATLAB_PATH, output_callback=log_callback)
            cmd = 'bin_features(\'%s\',\'%s\',\'%s\',\'chatty\')' % (namespace, lid, job_dir + '/')
            log_callback('running %s' % cmd)
            matlab.run(cmd)
            log_callback('matlab exited')
            if not os.path.exists(feature_csv):
                raise Exception('no features found')
            log_callback('features found at %s' % feature_csv)
            log_callback('uploading %s' % features_url)
            upload(feature_csv, features_url)
            # the multiblob file is optional
            if os.path.exists(multiblob_csv):
                log_callback('multiblob found at %s' % multiblob_csv)
                log_callback('uploading %s' % multiblob_url)
                upload(multiblob_csv, multiblob_url)
            # report completion whether or not a multiblob file was produced
            log_callback('complete')
            client.wakeup()
Beispiel #2
0
 def run_callback(self,message):
     """Run the MATLAB ``bin_blobs`` job for one bin and store the resulting zip.

     Args:
         message: the pid of the bin to process.

     Returns:
         ``SKIP`` if the output already exists before the job starts,
         ``DIE`` if the job is interrupted by ``KeyboardInterrupt``,
         otherwise ``None``.

     Raises:
         JobExit: raised from the MATLAB output callback when the output is
             found to exist while the job is still running; it is not
             caught in this method.
     """
     jobid = gen_id()[:5]
     def selflog(line):
         self.log('%s %s' % (jobid, line))
     def self_check_log(line,bin_pid):
         # log the line; every CHECK_EVERY invocations also look whether the
         # output has appeared in the meantime so the job can stop early
         selflog(line)
         self.output_check -= 1
         if self.output_check <= 0:
             if self.exists(bin_pid):
                 selflog('STOPPING JOB - %s completed by another worker' % bin_pid)
                 raise JobExit(bin_pid, SKIP)
             self.output_check = CHECK_EVERY
     bin_pid = message
     dest_file = self.storage.dest(bin_pid)
     if self.exists(bin_pid):
         selflog('SKIPPING %s - already completed' % bin_pid)
         return SKIP
     job_dir = os.path.join(self.config.tmp_dir, gen_id())
     try:
         os.makedirs(job_dir)
     except OSError:
         # best effort: carry on, but do not swallow KeyboardInterrupt or
         # SystemExit the way a bare except would
         selflog('WARNING cannot create temporary directory %s' % job_dir)
     tmp_file = os.path.join(job_dir, self.storage.zipname(bin_pid))
     matlab = Matlab(self.config.matlab_exec_path,self.config.matlab_path,output_callback=lambda l: self_check_log(l, bin_pid))
     cmd = 'bin_blobs(\'%s\',\'%s\')' % (bin_pid, job_dir)
     try:
         self.output_check = CHECK_EVERY
         matlab.run(cmd)
         if not os.path.exists(tmp_file):
             selflog('WARNING bin_blobs succeeded but produced no output for %s' % bin_pid)
         elif not self.exists(bin_pid): # check to make sure another worker hasn't finished it in the meantime
             if self.deposit is not None:
                 selflog('DEPOSITING blob zip for %s to deposit service' % bin_pid)
                 self.deposit.deposit(bin_pid,tmp_file)
             else:
                 selflog('SAVING completed blob zip for %s to %s' % (bin_pid, dest_file))
                 local_deposit(bin_pid,tmp_file)
         else:
             selflog('NOT SAVING - blobs for %s already present at output destination' % bin_pid)
     except KeyboardInterrupt:
         selflog('KeyboardInterrupt, requeueing job before exit')
         return DIE
     finally:
         # always try to remove the temporary directory, whatever happened
         try:
             shutil.rmtree(job_dir)
         except OSError:
             selflog('WARNING cannot remove temporary directory %s' % job_dir)
Beispiel #3
0
def extract_blobs(pid,job):
    """Compute the blob zip for a bin with MATLAB and upload it.

    Returns immediately when the blob zip already exists at its
    destination. Otherwise the bin zip is downloaded into a temporary
    directory, the MATLAB ``bin_blobs`` command is run on it, the resulting
    zip is uploaded and ``client.wakeup()`` is called.

    Args:
        pid: bin identifier; ``parse_pid`` is used to extract its namespace
            and LID.
        job: not used by this function.

    Raises:
        Exception: if MATLAB exits without having written the blob zip.
    """
    def log_callback(msg):
        # every progress message is logged and also sent as a heartbeat
        # (logging.warn is a deprecated alias of logging.warning)
        logging.warning('BLOBS %s', msg)
        client.heartbeat(pid,message=msg)
    parsed_pid = parse_pid(pid)
    bin_lid = parsed_pid[LID]
    bin_pid = ''.join([parsed_pid[NAMESPACE], parsed_pid[LID]])
    binzip_url = bin_pid + '_binzip.zip'
    deposit_url = '%s_blobs.zip' % bin_pid
    if exists(deposit_url):
        log_callback('skipping %s - blobs exist' % pid)
        return
    log_callback('computing blobs for %s' % pid)
    with safe_tempdir() as binzip_dir:
        # first, copy the zipfile to a temp dir
        binzip_path = os.path.join(binzip_dir, '%s.zip' % bin_lid)
        log_callback('downloading %s to %s' % (binzip_url, binzip_path))
        download(binzip_url, binzip_path)
        # now run bin_blobs, writing its output into a second temp dir
        with safe_tempdir() as job_dir:
            # configure matlab
            matlab = Matlab(MATLAB_EXEC_PATH, MATLAB_PATH, output_callback=log_callback)
            # run command
            blobs_file = os.path.join(job_dir, blob_zip_name(bin_pid))
            cmd = 'bin_blobs(\'%s\',\'%s\',\'%s\')' % (bin_pid, binzip_path, job_dir)
            log_callback('running %s' % cmd)
            matlab.run(cmd)
            log_callback('MATLAB done, checking for %s' % blobs_file)
            if not os.path.exists(blobs_file):
                raise Exception('missing output file')
            log_callback('depositing %s' % blobs_file)
            upload(blobs_file, deposit_url)
            log_callback('deposited %s' % blobs_file)
    log_callback('completed %s' % bin_pid)
    client.wakeup()
Beispiel #4
0
 def extract_features(self,bin_pid):
     """Compute features for one bin with MATLAB and deposit the resulting CSVs.

     The bin zip is built with ``represent.binpid2zip`` and the blob zip is
     fetched from ``bin_pid + '_blob.zip'``; both are placed in one
     temporary directory, and the MATLAB ``bin_features`` command writes its
     output to a second one. Both directories are removed before returning,
     including when loading the inputs fails.

     Args:
         bin_pid: pid of the bin to process.

     Returns:
         ``SKIP`` if the bin is already complete before any work starts,
         ``DIE`` if interrupted by ``KeyboardInterrupt``, otherwise ``None``
         (including when the job ends early through ``JobExit``).
     """
     jobid = gen_id()[:5]
     def selflog(line):
         self.log('[%s] %s' % (jobid, line))
     def self_check_log(line,bin_pid):
         selflog(line)
         now = time.time()
         # Reset the timer only when a check is actually made. Resetting it
         # on every output line would measure the gap between two lines, so
         # steady MATLAB output would keep the check from ever running.
         if now - self.last_check > CHECK_EVERY:
             self.last_check = now
             if self.complete(bin_pid):
                 msg = 'STOPPING JOB - %s completed by another worker' % bin_pid
                 selflog(msg)
                 raise JobExit(msg, SKIP)
     if self.complete(bin_pid):
         selflog('SKIPPING %s - already completed' % bin_pid)
         return SKIP
     job_dir = os.path.join(self.config.tmp_dir, gen_id())
     zip_dir = os.path.join(self.config.tmp_dir, gen_id())
     bin_zip_path = os.path.join(zip_dir, binzipname(bin_pid))
     for tmp_dir in (job_dir, zip_dir):
         try:
             os.makedirs(tmp_dir)
         except OSError:
             # best effort: carry on, but do not swallow KeyboardInterrupt
             # or SystemExit the way a bare except would
             selflog('WARNING cannot create temporary directory %s for %s' % (tmp_dir, bin_pid))
         else:
             selflog('CREATED temporary directory %s for %s' % (tmp_dir, bin_pid))
     blobzipurl = bin_pid + '_blob.zip'
     # anchor the pattern so only the trailing extension is rewritten, not a
     # '.zip' that happens to occur earlier in the path
     blobzipfile = re.sub(r'\.zip$','_blob.zip',bin_zip_path)
     feature_csv = os.path.join(job_dir, csvname(bin_pid))
     multiblob_csv = os.path.join(job_dir, 'multiblob', multiblobname(bin_pid))
     # MATLAB is given the directory holding the zips (with trailing slash)
     # and the file name of the bin zip
     namespace = os.path.dirname(bin_zip_path) + '/'
     lid = os.path.basename(bin_zip_path)
     cmd = 'bin_features(\'%s\',\'%s\',\'%s\',\'chatty\')' % (namespace, lid, job_dir + '/')
     try:
         # loading happens inside the try so that the temporary directories
         # are removed even if stitching or the download fails
         selflog('LOADING and STITCHING %s' % bin_pid)
         with open(bin_zip_path,'wb') as binzip:
             represent.binpid2zip(bin_pid, binzip, resolver=self.resolver)
         selflog('LOADING blob zip from %s -> %s' % (blobzipurl, blobzipfile))
         drain(UrlSource(blobzipurl), LocalFileSink(blobzipfile))
         matlab = Matlab(self.config.matlab_exec_path,self.config.matlab_path,output_callback=lambda l: self_check_log(l, bin_pid))
         selflog('RUNNING %s' % cmd)
         self.output_check = CHECK_EVERY
         matlab.run(cmd)
         if not os.path.exists(feature_csv):
             msg = 'WARNING bin_features succeeded but no output file found at %s' % feature_csv
             selflog(msg)
             raise JobExit(msg,FAIL)
         if not self.complete(bin_pid): # check to make sure another worker hasn't finished it in the meantime
             selflog('DEPOSITING features csv for %s to deposit service at %s' % (bin_pid, self.config.features_deposit))
             self.deposit.deposit(bin_pid,feature_csv)
             selflog('DEPOSITED features csv for %s ' % bin_pid)
             # the multiblob file is optional
             if os.path.exists(multiblob_csv):
                 selflog('DEPOSITING multiblob csv for %s to deposit service at %s' % (bin_pid, self.config.features_deposit))
                 self.multiblob_deposit.deposit(bin_pid,multiblob_csv)
                 selflog('DEPOSITED multiblob csv for %s ' % bin_pid)
         else:
             selflog('NOT SAVING - features for %s already present at output destination' % bin_pid)
     except KeyboardInterrupt:
         selflog('KeyboardInterrupt, exiting')
         return DIE
     except JobExit:
         # the reason has already been logged where JobExit was raised
         pass
     finally:
         for tmp_dir in (job_dir, zip_dir):
             try:
                 shutil.rmtree(tmp_dir)
             except OSError:
                 selflog('WARNING cannot remove temporary directory %s for %s' % (tmp_dir, bin_pid))
             else:
                 selflog('DELETED temporary directory %s for %s' % (tmp_dir, bin_pid))
         selflog('DONE - no more actions for %s' % bin_pid)
Beispiel #5
0
 def extract_blobs(self,bin_pid):
     """Compute the blob zip for one bin with MATLAB and deposit it.

     The bin zip is built with ``represent.binpid2zip`` in one temporary
     directory and the MATLAB ``bin_blobs`` command writes its output to a
     second one. Both directories are removed before returning.

     Args:
         bin_pid: pid of the bin to process.

     Returns:
         ``SKIP`` if the output already exists or the bin has no targets,
         ``DIE`` if interrupted by ``KeyboardInterrupt``, otherwise ``None``
         (including when the job ends early through ``JobExit``).
     """
     try:
         jobid = self.config.task_id
     except Exception:
         # No task id available: fall back to a generated one. The config
         # type is not visible here, so any ordinary lookup error is
         # accepted, but KeyboardInterrupt/SystemExit are no longer swallowed.
         jobid = gen_id()[:5]
     def selflog(line):
         self.log('[%s] %s' % (jobid, line))
     def self_check_log(line,bin_pid):
         selflog(line)
         now = time.time()
         # Reset the timer only when a check is actually made. Resetting it
         # on every output line would measure the gap between two lines, so
         # steady MATLAB output would keep the check from ever running.
         if now - self.last_check > CHECK_EVERY:
             self.last_check = now
             if self.exists(bin_pid):
                 msg = 'STOPPING JOB - %s completed by another worker' % bin_pid
                 selflog(msg)
                 raise JobExit(msg, SKIP)
     if self.exists(bin_pid):
         selflog('SKIPPING %s - already completed' % bin_pid)
         return SKIP
     job_dir = os.path.join(self.config.tmp_dir, gen_id())
     zip_dir = os.path.join(self.config.tmp_dir, gen_id())
     bin_zip_path = os.path.join(zip_dir, binzipname(bin_pid))
     for tmp_dir in (job_dir, zip_dir):
         try:
             os.makedirs(tmp_dir)
         except OSError:
             # best effort: carry on, but do not swallow KeyboardInterrupt
             # or SystemExit the way a bare except would
             selflog('WARNING cannot create temporary directory %s for %s' % (tmp_dir, bin_pid))
         else:
             selflog('CREATED temporary directory %s for %s' % (tmp_dir, bin_pid))
     try:
         selflog('LOADING and STITCHING %s' % bin_pid)
         with open(bin_zip_path,'wb') as binzip:
             targets = represent.binpid2zip(bin_pid, binzip, resolver=self.resolver)
             if len(targets)==0:
                 # nothing to process; the finally clause still cleans up
                 selflog('SKIPPING %s - no targets in bin' % bin_pid)
                 return SKIP
         tmp_file = os.path.join(job_dir, zipname(bin_pid))
         matlab = Matlab(self.config.matlab_exec_path,self.config.matlab_path,output_callback=lambda l: self_check_log(l, bin_pid))
         cmd = 'bin_blobs(\'%s\',\'%s\',\'%s\')' % (bin_pid, bin_zip_path, job_dir)
         self.output_check = CHECK_EVERY
         matlab.run(cmd)
         if not os.path.exists(tmp_file):
             selflog('WARNING bin_blobs succeeded but no output file found at %s' % tmp_file)
         elif not self.exists(bin_pid): # check to make sure another worker hasn't finished it in the meantime
             selflog('DEPOSITING blob zip for %s to deposit service at %s' % (bin_pid, self.config.blob_deposit))
             self.deposit.deposit(bin_pid,tmp_file)
             selflog('DEPOSITED blob zip for %s ' % bin_pid)
         else:
             selflog('NOT SAVING - blobs for %s already present at output destination' % bin_pid)
     except KeyboardInterrupt:
         selflog('KeyboardInterrupt, exiting')
         return DIE
     except JobExit:
         # the reason has already been logged where JobExit was raised
         pass
     finally:
         for tmp_dir in (job_dir, zip_dir):
             try:
                 shutil.rmtree(tmp_dir)
             except OSError:
                 selflog('WARNING cannot remove temporary directory %s for %s' % (tmp_dir, bin_pid))
             else:
                 selflog('DELETED temporary directory %s for %s' % (tmp_dir, bin_pid))
         selflog('DONE - no more actions for %s' % bin_pid)