def prepare_extract_and_evaluate(ext_hash, image_certificate_file, model_certificate_file, task):
    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]

    # Clear any existing performance records and split / split-performance
    # gridfs entries for this extraction hash.
    perf_coll = db['performance']
    perf_coll.remove({'__hash__': ext_hash})
    split_coll = db['splits.files']
    split_fs = gridfs.GridFS(db, 'splits')
    remove_existing(split_coll, split_fs, ext_hash)
    splitperf_coll = db['split_performance.files']
    splitperf_fs = gridfs.GridFS(db, 'split_performance')
    remove_existing(splitperf_coll, splitperf_fs, ext_hash)

    # Recover the model and image hashes from their certificates.
    model_certdict = cPickle.load(open(model_certificate_file))
    model_hash = model_certdict['model_hash']
    model_coll = db['models.files']
    image_certdict = cPickle.load(open(image_certificate_file))
    image_hash = image_certdict['image_hash']
    image_config_gen = image_certdict['args']

    model_configs = get_most_recent_files(model_coll, {'__hash__': model_hash})

    task_list = task if isinstance(task, list) else [task]

    return (model_configs, image_config_gen, model_hash, image_hash, task_list,
            perf_coll, split_coll, split_fs, splitperf_coll, splitperf_fs)
def extract_features_inner_core(image_certificate, model_certificate, feature_hash,
                                image_hash, model_hash, convolve_func_name, device_id,
                                im_query, m_query, im_skip, im_limit, m_skip, m_limit):
    if im_query is None:
        im_query = {}
    if m_query is None:
        m_query = {}
    im_query['__hash__'] = image_hash
    m_query['__hash__'] = model_hash

    conn = pm.Connection(document_class=SON)
    db = conn[DB_NAME]
    image_col = db['images.files']
    model_col = db['models.files']
    image_fs = gridfs.GridFS(db, 'images')
    model_fs = gridfs.GridFS(db, 'models')
    feature_fs = gridfs.GridFS(db, 'features')

    # Select the convolution backend: the GPU FFT implementation (pyfft)
    # or the numpy fallback.
    if convolve_func_name == 'pyfft':
        context = v1_pyfft.setup_pyfft(device_id)
        context.push()
        convolve_func = functools.partial(v1f.v1like_filter_pyfft, device_id=device_id)
    else:
        convolve_func = v1f.v1like_filter_numpy

    image_configs = get_most_recent_files(image_col, im_query, skip=im_skip, limit=im_limit)
    model_configs = get_most_recent_files(model_col, m_query, skip=m_skip, limit=m_limit)

    # Compute and store features for every (image, model) pair in this batch.
    for image_config in image_configs:
        for model_config in model_configs:
            features = compute_features(image_config['filename'], image_fs,
                                        model_config, model_fs, convolve_func)
            features_string = cPickle.dumps(features)
            y = SON([('config', SON([('model', model_config['config']['model']),
                                     ('image', image_config['config']['image'])]))])
            filename = get_filename(y['config'])
            y['filename'] = filename
            y['__hash__'] = feature_hash
            feature_fs.put(features_string, **y)

    if convolve_func_name == 'pyfft':
        context.pop()
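# Illustrative sketch (not part of the original pipeline): reading back a
# feature blob that extract_features_inner_core stored. The hash value and
# the dotted query fields are hypothetical, and a live MongoDB with a
# populated 'features' collection is assumed.
def _example_load_stored_features(feature_hash, model_spec, image_spec):
    conn = pm.Connection(document_class=SON)
    db = conn[DB_NAME]
    feature_fs = gridfs.GridFS(db, 'features')
    feature_col = db['features.files']
    # Find the most recent feature file for this (model, image) pair ...
    rec = get_most_recent_files(feature_col,
                                {'__hash__': feature_hash,
                                 'config.model': model_spec,
                                 'config.image': image_spec})[0]
    # ... and unpickle the feature array that was put into GridFS.
    return cPickle.loads(feature_fs.get_version(rec['filename']).read())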
def extract_and_evaluate_parallel_core(image_config_gen, m, task, ext_hash, split_id,
                                       convolve_func_name, cache_port=None):
    if cache_port is None:
        cache_port = NETWORK_CACHE_PORT
    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]
    split_col = db['splits.files']
    split_fs = gridfs.GridFS(db, 'splits')

    # Fetch the stored split for this (model, task, split_id) and evaluate it.
    splitconf = get_most_recent_files(split_col,
                                      {'__hash__': ext_hash,
                                       'split_id': split_id,
                                       'model': m['config']['model'],
                                       'images': son_escape(image_config_gen['images'])})[0]
    split = cPickle.loads(split_fs.get_version(splitconf['filename']).read())['split']
    res = extract_and_evaluate_core(split, m, convolve_func_name, task, cache_port)

    splitperf_fs = gridfs.GridFS(db, 'split_performance')
    put_in_split_result(res, image_config_gen, m, task, ext_hash, split_id, splitperf_fs)
def extract_and_evaluate_parallel(outfile, image_certificate_file, model_certificate_file,
                                  cpath, convolve_func_name, task, ext_hash):
    (model_configs, image_config_gen, model_hash, image_hash, task_list,
     perf_col, split_coll, split_fs, splitperf_coll, splitperf_fs) = \
        prepare_extract_and_evaluate(ext_hash, image_certificate_file,
                                     model_certificate_file, task)

    # Route jobs to the CPU or GPU queue depending on the convolution backend.
    if convolve_func_name == 'numpy':
        opstring = '-l qname=extraction_cpu.q'
    elif convolve_func_name == 'pyfft':
        opstring = '-l qname=extraction_gpu.q -o /home/render -e /home/render'
    else:
        raise ValueError('Unknown convolve_func_name: %s' % convolve_func_name)

    # Generate splits for each (model, task) pair and submit one job per split.
    jobids = []
    for m in model_configs:
        print('Evaluating model', m)
        for task in task_list:
            classifier_kwargs = task.get('classifier_kwargs', {})
            print('task', task)
            splits = generate_splits(task, image_hash, 'images')
            for (ind, split) in enumerate(splits):
                put_in_split(split, image_config_gen, m, task, ext_hash, ind, split_fs)
                jobid = qsub(extract_and_evaluate_parallel_core,
                             (image_config_gen, m, task, ext_hash, ind, convolve_func_name),
                             opstring=opstring)
                jobids.append(jobid)
    print(jobids)

    statuses = wait_and_get_statuses(jobids)

    # Once all jobs have finished, aggregate the per-split results into a
    # single performance record per (model, task) pair.
    for m in model_configs:
        print('Aggregating results for model', m)
        for task in task_list:
            split_results = get_most_recent_files(splitperf_coll,
                                                  {'__hash__': ext_hash,
                                                   'task': son_escape(task),
                                                   'model': m['config']['model'],
                                                   'images': son_escape(image_config_gen['images'])})
            put_in_performance(split_results, image_config_gen, m, model_hash,
                               image_hash, perf_col, task, ext_hash)

    createCertificateDict(outfile, {'image_file': image_certificate_file,
                                    'models_file': model_certificate_file})
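# Illustrative usage sketch (not from the original source): all file paths,
# hashes, and the task specification below are hypothetical placeholders.
# A running MongoDB at DB_NAME and a configured qsub grid are assumed.
def _example_run_parallel_evaluation():
    task = SON([('N', 10),                      # number of splits (hypothetical key)
                ('ntrain', 30), ('ntest', 30),  # examples per split (hypothetical keys)
                ('query', SON([('config.image.object', 'car')])),
                ('classifier_kwargs', SON([('classifier_type', 'liblinear')]))])
    extract_and_evaluate_parallel('evaluation_certificate.pkl',
                                  'image_certificate.pkl',
                                  'model_certificate.pkl',
                                  'config/experiment.py',
                                  'numpy',  # or 'pyfft' to target the GPU queue
                                  task,
                                  ext_hash='<extraction hash>')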
def evaluate(outfile, feature_certificate, cpath, task, ext_hash):
    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]
    perf_fs = gridfs.GridFS(db, 'performance')
    perf_coll = db['performance.files']
    remove_existing(perf_coll, perf_fs, ext_hash)

    # Recover the feature/image/model hashes from the feature certificate.
    feature_certdict = cPickle.load(open(feature_certificate))
    feature_hash = feature_certdict['feature_hash']
    image_hash = feature_certdict['image_hash']
    model_hash = feature_certdict['model_hash']
    image_config_gen = feature_certdict['args']['images']
    model_col = db['models.files']
    feature_fs = gridfs.GridFS(db, 'features')
    feature_col = db['features.files']

    stats = ['test_accuracy', 'ap', 'auc', 'mean_ap', 'mean_auc', 'train_accuracy']

    task_list = task if isinstance(task, list) else [task]

    model_configs = get_most_recent_files(model_col, {'__hash__': model_hash})

    for m in model_configs:
        print('Evaluating model', m)
        for task in task_list:
            task['universe'] = task.get('universe', SON([]))
            task['universe']['model'] = m['config']['model']
            print('task', task)
            classifier_kwargs = task.get('classifier_kwargs', {})
            split_results = []
            splits = generate_splits(task, feature_hash, 'features')
            for (ind, split) in enumerate(splits):
                print('split', ind)
                train_data = split['train_data']
                test_data = split['test_data']
                train_filenames = [t['filename'] for t in train_data]
                test_filenames = [t['filename'] for t in test_data]
                # Train and test sets must be disjoint.
                assert set(train_filenames).intersection(test_filenames) == set([])

                print('train feature extraction ...')
                train_features = sp.row_stack([load_features(f['filename'], feature_fs, m, task)
                                               for f in train_data])
                print('test feature extraction ...')
                test_features = sp.row_stack([load_features(f['filename'], feature_fs, m, task)
                                              for f in test_data])
                train_labels = split['train_labels']
                test_labels = split['test_labels']

                print('classifier ...')
                res = svm.classify(train_features, train_labels, test_features,
                                   test_labels, classifier_kwargs)
                print('Split test accuracy', res['test_accuracy'])
                split_results.append(res)

            # Average each statistic over the splits.
            model_results = SON([])
            for stat in stats:
                if stat in split_results[0] and split_results[0][stat] is not None:
                    model_results[stat] = sp.array([split_result[stat]
                                                    for split_result in split_results]).mean()

            out_record = SON([('model', m['config']['model']),
                              ('model_hash', model_hash),
                              ('model_filename', m['filename']),
                              ('images', son_escape(image_config_gen)),
                              ('image_hash', image_hash),
                              ('task', son_escape(task)),
                              ])
            filename = get_filename(out_record)
            out_record['filename'] = filename
            out_record['config_path'] = cpath
            out_record['__hash__'] = ext_hash
            out_record.update(model_results)

            print('dump out ...')
            out_data = cPickle.dumps(SON([('split_results', split_results),
                                          ('splits', splits)]))
            perf_fs.put(out_data, **out_record)

    createCertificateDict(outfile, {'feature_file': feature_certificate})
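# Illustrative sketch (not part of the original module): reading back the
# aggregate performance records that evaluate() writes. A live database and
# a valid ext_hash are assumed; the field names follow out_record above.
def _example_read_performance(ext_hash):
    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]
    perf_coll = db['performance.files']
    for rec in get_most_recent_files(perf_coll, {'__hash__': ext_hash}):
        # Each record carries the per-stat means computed in evaluate(),
        # when the corresponding stat was present in the split results.
        print(rec['model'], rec.get('test_accuracy'))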
def generate_splits(dbname, collectionname, task_query, N, ntrain, ntest,
                    ntrain_pos=None, ntest_pos=None, universe=None,
                    use_negate=False, overlap=None):
    task_query = copy.deepcopy(task_query)
    print('Generating splits ...')
    if universe is None:
        universe = SON([])

    connection = pm.Connection(document_class=SON)
    db = connection[dbname]
    data = db[collectionname + '.files']
    fs = gridfs.GridFS(db, collection=collectionname)

    combine_things(task_query, universe)
    print('T', task_query)

    # "Task" records match the task query (positive examples); everything
    # else in the universe is a non-task (negative) example.
    task_data = list(data.find(task_query, fields=["filename"]))
    task_fnames = [str(x['filename']) for x in task_data]
    N_task = len(task_data)

    if use_negate:
        task_fnames = np.array(task_fnames)
        all_data = list(data.find(universe, fields=["filename"]))
        all_fnames = np.array([str(x['filename']) for x in all_data])
        I = np.invert(tb.isin(all_fnames, task_fnames)).nonzero()[0]
        nontask_data = [all_data[ind] for ind in I]
        nontask_fnames = [str(x['filename']) for x in nontask_data]
        assert set(task_fnames).intersection(nontask_fnames) == set([]), \
            set(task_fnames).intersection(nontask_fnames)
    else:
        nontask_query = {'filename': {'$nin': task_fnames}}
        nontask_query.update(universe)
        nontask_data = get_most_recent_files(data, nontask_query)
    N_nontask = len(nontask_data)

    assert ntrain + ntest <= N_task + N_nontask, \
        "Not enough training and/or testing examples " + str([N_task, N_nontask])

    splits = []
    for ind in range(N):
        print('... split', ind)
        if ntrain_pos is not None:
            # Fix the number of positive (task) examples in the training set.
            ntrain_neg = ntrain - ntrain_pos
            assert ntrain_pos <= N_task, \
                "Not enough positive training examples, there are: " + str(N_task)
            assert ntrain_neg <= N_nontask, \
                "Not enough negative training examples, there are: " + str(N_nontask)
            perm_pos = sp.random.permutation(len(task_data))
            perm_neg = sp.random.permutation(len(nontask_data))
            train_data = [task_data[i] for i in perm_pos[:ntrain_pos]] + \
                         [nontask_data[i] for i in perm_neg[:ntrain_neg]]
            if ntest_pos is not None:
                # Also fix the number of positives in the test set.
                ntest_neg = ntest - ntest_pos
                assert ntest_pos <= N_task - ntrain_pos, \
                    "Not enough positive test examples, there are: " + str(N_task - ntrain_pos)
                assert ntest_neg <= N_nontask - ntrain_neg, \
                    "Not enough negative test examples, there are: " + str(N_nontask - ntrain_neg)
                test_data = [task_data[i] for i in perm_pos[ntrain_pos:ntrain_pos + ntest_pos]] + \
                            [nontask_data[i] for i in perm_neg[ntrain_neg:ntrain_neg + ntest_neg]]
            else:
                # Draw the test set at random from the remaining examples.
                nontrain_data = [task_data[i] for i in perm_pos[ntrain_pos:]] + \
                                [nontask_data[i] for i in perm_neg[ntrain_neg:]]
                new_perm = sp.random.permutation(len(nontrain_data))
                test_data = [nontrain_data[i] for i in new_perm[:ntest]]
        else:
            if ntest_pos is not None:
                # Fix the number of positives in the test set only.
                ntest_neg = ntest - ntest_pos
                assert ntest_pos <= N_task, \
                    "Not enough positive test examples, there are: " + str(N_task)
                assert ntest_neg <= N_nontask, \
                    "Not enough negative test examples, there are: " + str(N_nontask)
                perm_pos = sp.random.permutation(len(task_data))
                perm_neg = sp.random.permutation(len(nontask_data))
                test_data = [task_data[i] for i in perm_pos[:ntest_pos]] + \
                            [nontask_data[i] for i in perm_neg[:ntest_neg]]
                nontest_data = [task_data[i] for i in perm_pos[ntest_pos:]] + \
                               [nontask_data[i] for i in perm_neg[ntest_neg:]]
                new_perm = sp.random.permutation(len(nontest_data))
                train_data = [nontest_data[i] for i in new_perm[:ntrain]]
            else:
                # No class-balance constraints: split uniformly at random.
                all_data = task_data + nontask_data
                perm = sp.random.permutation(len(all_data))
                train_data = [all_data[i] for i in perm[:ntrain]]
                test_data = [all_data[i] for i in perm[ntrain:ntrain + ntest]]

        train_filenames = np.array([str(_t['filename']) for _t in train_data])
        test_filenames = np.array([str(_t['filename']) for _t in test_data])
        # An example is labeled positive iff its filename matches the task query.
        train_labels = tb.isin(train_filenames, task_fnames)
        test_labels = tb.isin(test_filenames, task_fnames)
        assert set(train_filenames).intersection(test_filenames) == set([]), \
            str(set(train_filenames).intersection(test_filenames))
        split = {'train_data': train_data,
                 'test_data': test_data,
                 'train_labels': train_labels,
                 'test_labels': test_labels}
        splits.append(split)

    return splits
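# Illustrative sketch (not from the original source): drawing 5 balanced
# train/test splits for a hypothetical binary task. The database name,
# collection, and query fields below are placeholders.
def _example_generate_splits():
    splits = generate_splits('mydb', 'images',
                             {'config.image.object': 'car'},  # positive class
                             N=5, ntrain=20, ntest=20,
                             ntrain_pos=10, ntest_pos=10)
    for (ind, split) in enumerate(splits):
        # Labels are boolean arrays, so summing counts the positives.
        print('split', ind,
              'train positives:', split['train_labels'].sum(),
              'test positives:', split['test_labels'].sum())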