def put_in_split(split, image_config_gen, m, task, ext_hash, split_id, split_fs):
    out_record = SON([('model', m['config']['model']),
                      ('images', son_escape(image_config_gen['images'])),
                      ('task', son_escape(task)),
                      ('split_id', split_id),
                     ])
    filename = get_filename(out_record)
    out_record['filename'] = filename
    out_record['__hash__'] = ext_hash
    print('dump out ...')
    out_data = cPickle.dumps(SON([('split', split)]))
    split_fs.put(out_data, **out_record)
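# Round-trip sketch: put_in_split is the write half of the split hand-off; the
# worker side (extract_and_evaluate_parallel_core below) queries 'splits.files'
# on (__hash__, split_id, model, images) and unpickles the stored blob, e.g.:
#
#     split_fs = gridfs.GridFS(db, 'splits')
#     put_in_split(split, image_config_gen, m, task, ext_hash, ind, split_fs)
#     ...
#     blob = split_fs.get_version(filename).read()
#     split = cPickle.loads(blob)['split']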
def train_test_loop(outfile, extract_creates, task_config, feature_config_path, hash):
    feature_config = get_config(feature_config_path)

    base_query = SON([('__config_hash__', hash)])
    image_params = SON([('image', feature_config['image'])])
    models_params = feature_config['models']

    ntrain = task_config['ntrain']
    ntest = task_config['ntest']
    ntrain_pos = task_config.get('ntrain_pos')
    N = task_config.get('N', 10)
    query = task_config['query']

    base_query.update(reach_in('config', task_config.get('universe', SON([]))))

    print('\n')
    print('BASE', base_query)
    print('\n')

    conn = pm.Connection(document_class=SON)
    db = conn['v1']
    fs = gridfs.GridFS(db, collection='model_performance')

    cquery = reach_in('config', query)
    for m in models_params:
        base_query_copy = base_query.copy()
        base_query_copy.update(reach_in('config.model', m))
        splitdata, results = train_test(cquery, 'v1', 'features', ntrain, ntest,
                                        ntrain_pos=ntrain_pos, N=N,
                                        universe=base_query_copy)

        splitpickle = cPickle.dumps(splitdata)

        data = SON([('feature_config_path', feature_config_path),
                    ('model', m),
                    ('task', son_escape(task_config)),
                    ('image__aggregate__', son_escape(feature_config['image']))])
        filename = get_filename(data)
        data.update(results)
        data['filename'] = filename

        fs.put(splitpickle, **data)

    createCertificateDict(outfile, {'task_config': task_config,
                                    'feature_config': feature_config,
                                    'feature_config_path': feature_config_path})
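# Usage sketch (paths and query values are hypothetical): the task config
# supplies the split sizes and a query over feature records, the feature config
# module supplies the candidate models, and `hash` must match the
# '__config_hash__' written by the extraction run.
#
#     task_config = {'ntrain': 30, 'ntest': 30, 'N': 10, 'ntrain_pos': 15,
#                    'query': {'image.model_id': 'car_1'}}
#     train_test_loop('perf.cert', None, task_config,
#                     'config/my_features.py', extraction_hash)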
def put_in_split_result(res, image_config_gen, m, task, ext_hash, split_id, splitres_fs):
    out_record = SON([('model', m['config']['model']),
                      ('images', son_escape(image_config_gen['images'])),
                      ('task', son_escape(task)),
                      ('split_id', split_id),
                     ])
    split_result = SON([])
    for stat in STATS:
        if stat in res and res[stat] is not None:
            split_result[stat] = res[stat]
    filename = get_filename(out_record)
    out_record['filename'] = filename
    out_record['__hash__'] = ext_hash
    out_record.update(split_result)
    print('dump out split result ...')
    out_data = cPickle.dumps(SON([('split_result', res)]))
    splitres_fs.put(out_data, **out_record)
def put_in_performance(split_results, image_config_gen, m, model_hash, image_hash,
                       perf_coll, task, ext_hash):
    model_results = SON([])
    for stat in STATS:
        if stat in split_results[0] and split_results[0][stat] is not None:
            model_results[stat] = sp.array([split_result[stat]
                                            for split_result in split_results]).mean()

    out_record = SON([('model', m['config']['model']),
                      ('model_hash', model_hash),
                      ('model_filename', m['filename']),
                      ('images', son_escape(image_config_gen['images'])),
                      ('image_hash', image_hash),
                      ('task', son_escape(task)),
                      ('__hash__', ext_hash)
                     ])
    out_record.update(model_results)
    perf_coll.insert(out_record)
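# Reduction sketch: put_in_performance keeps only the stats present (and
# non-None) in the first split result and stores their plain mean over splits;
# e.g. with 'test_accuracy' in STATS and two splits scoring 0.80 and 0.90, the
# performance document gets test_accuracy = 0.85.  No variance or per-split
# values are stored here; those remain in the split_performance records.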
def greedy_optimization(outfile, task, image_certificate_file, initial_model,
                        convolve_func, rep_limit, modifier_args, modifier):
    conn = pm.Connection(document_class=bson.SON)
    db = conn['v1']
    opt_fs = gridfs.GridFS(db, 'optimized_performance')

    image_coll = db['raw_images.files']
    image_fs = gridfs.GridFS(db, 'raw_images')

    image_certdict = cPickle.load(open(image_certificate_file))
    print('using image certificate', image_certificate_file)
    image_hash = image_certdict['run_hash']
    image_args = image_certdict['out_args']

    if convolve_func == v1f.v1like_filter_pyfft:
        v1_pyfft.setup_pyfft()

    filterbanks = []
    perfs = []
    model_configs = []
    center_config = initial_model

    i = 0
    while rep_limit is None or i < rep_limit:
        i += 1
        print('Round', i)
        next_configs = [m for m in get_consistent_deltas(center_config, modifier)
                        if m not in model_configs]
        if next_configs:
            next_results = [get_performance(task, image_hash, image_fs, m, convolve_func)
                            for m in next_configs]
            next_perfs = [x[0] for x in next_results]
            next_filterbanks = [x[1] for x in next_results]
            next_perf_ac_max = np.array([x['test_accuracy'] for x in next_perfs]).max()
            perf_ac_max = max([x['test_accuracy'] for x in perfs]) if perfs else 0
            if next_perf_ac_max > perf_ac_max:
                next_perf_ac_argmax = np.array([x['test_accuracy'] for x in next_perfs]).argmax()
                center_config = next_configs[next_perf_ac_argmax]
                print('\n\n')
                print('new best performance is', next_perf_ac_max, 'from model', center_config)
                print('\n\n')
                perfs.extend(next_perfs)
                model_configs.extend(next_configs)
                filterbanks.extend(next_filterbanks)
            else:
                print('Breaking because no further optimization could be done.',
                      'Best existing performance was', perf_ac_max,
                      'while best next performance was', next_perf_ac_max)
                break
        else:
            print('Breaking because no next configs')
            break

    perfargmax = np.array([p['test_accuracy'] for p in perfs]).argmax()
    best_model = model_configs[perfargmax]
    best_performance = perfs[perfargmax]

    out_record = SON([('initial_model', initial_model),
                      ('task', son_escape(task)),
                      ('images', son_escape(image_args)),
                      ('images_hash', image_hash),
                      ('modifier_args', son_escape(modifier_args)),
                      ('modifier', modifier.__class__.__module__ + '.' + modifier.__class__.__name__)
                     ])
    filename = get_filename(out_record)
    out_record['filename'] = filename
    out_record.update(SON([('performances', perfs)]))
    out_record.update(SON([('best_model', best_model)]))
    out_record.update(SON([('best_performance', best_performance)]))
    out_record.update(SON([('num_steps', len(model_configs))]))
    out_record.update(SON([('models', model_configs)]))
    outdata = cPickle.dumps(filterbanks)
    opt_fs.put(outdata, **out_record)

    if convolve_func == v1f.v1like_filter_pyfft:
        v1_pyfft.cleanup_pyfft()

    createCertificateDict(outfile, {'image_file': image_certificate_file})
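# Usage sketch (GaborParamModifier is a hypothetical stand-in, not a class from
# this codebase): greedy_optimization hill-climbs from initial_model, scoring
# every neighbor proposed by get_consistent_deltas(center_config, modifier) and
# recentering on the best test_accuracy; it stops when no neighbor improves,
# the neighborhood is exhausted, or rep_limit rounds elapse (rep_limit=None
# means no round cap).
#
#     modifier = GaborParamModifier(**modifier_args)  # hypothetical
#     greedy_optimization('optimization.cert', task, 'images.cert',
#                         initial_model, v1f.v1like_filter_pyfft, None,
#                         modifier_args, modifier)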
""" Example Parameters module """ #from collections import OrderedDict import copy import itertools from bson import SON import config.ten_categories_images as Images import config.renderman_correlation_tasks2 as Tasks from dbutils import son_escape import config.ht_l1_gabor_models_for_corr as l1_models corr_layer = SON([(u'filter',SON([(u'model_name','correlation'), (u'random_subset',SON([('const',.5)])), (u'images',son_escape(Images.config['images']))])), (u'activ', SON([(u'min_out', 0), (u'max_out', 1)]))]) models = [] for M in l1_models.config['models']: for num_filters in [256,384]: for task in Tasks.config['extractions']: m = copy.deepcopy(M) m['layers'].append(copy.deepcopy(corr_layer)) m['layers'][2]['filter']['task'] = son_escape(task) m['layers'][2]['filter']['num_filters'] = num_filters models.append(m)
                              (u'ker_shape', [13, 13]),
                              (u'divfreqs', [2, 4, 7, 8, 11]),
                              (u'norients', 7)])),
         (u'activ', SON([(u'min_out', 0), (u'max_out', 1)])),
         (u'lnorm', SON([(u'inker_shape', [9, 9]),
                         (u'outker_shape', [9, 9]),
                         (u'threshold', 10.0),
                         (u'stretch', 0.1)])),
         (u'lpool', SON([(u'order', 2), (u'stride', 2), (u'ker_shape', [5, 5])]))]),
    SON([(u'filter', SON([(u'model_name', 'correlation'),
                          (u'num_filters', 256),
                          (u'random_subset', SON([('const', .5)])),
                          (u'task', son_escape(Tasks.config['extractions'][0])),
                          (u'images', son_escape(Images.config['images']))])),
         (u'activ', SON([(u'min_out', 0), (u'max_out', 1)]))])
    ]),
])

config = {'models': [model]}
(u"stretch", 0.1), ] ), ), (u"lpool", SON([(u"order", 2), (u"stride", 2), (u"ker_shape", [5, 5])])), ] ), SON( [ ( u"filter", SON( [ (u"model_name", "correlation"), (u"random_subset", SON([("const", 0.5)])), (u"num_filters", 256), (u"task", son_escape(Tasks.config["extractions"][0])), (u"images", son_escape(Images.config["images"])), ] ), ), (u"activ", SON([(u"min_out", 0), (u"max_out", 1)])), ] ), ], ), ] ) config = {"models": [model]}
def extract_and_evaluate_parallel(outfile, image_certificate_file, model_certificate_file,
                                  cpath, convolve_func_name, task, ext_hash):
    (model_configs, image_config_gen, model_hash, image_hash, task_list,
     perf_col, split_coll, split_fs, splitperf_coll, splitperf_fs) = \
        prepare_extract_and_evaluate(ext_hash, image_certificate_file,
                                     model_certificate_file, task)

    jobids = []
    if convolve_func_name == 'numpy':
        opstring = '-l qname=extraction_cpu.q'
    elif convolve_func_name == 'pyfft':
        opstring = '-l qname=extraction_gpu.q -o /home/render -e /home/render'

    for m in model_configs:
        print('Evaluating model', m)
        for task in task_list:
            classifier_kwargs = task.get('classifier_kwargs', {})
            print('task', task)
            splits = generate_splits(task, image_hash, 'images')
            for (ind, split) in enumerate(splits):
                put_in_split(split, image_config_gen, m, task, ext_hash, ind, split_fs)
                jobid = qsub(extract_and_evaluate_parallel_core,
                             (image_config_gen, m, task, ext_hash, ind, convolve_func_name),
                             opstring=opstring)
                jobids.append(jobid)

    print(jobids)
    statuses = wait_and_get_statuses(jobids)

    for m in model_configs:
        print('Aggregating split results for model', m)
        for task in task_list:
            split_results = get_most_recent_files(splitperf_coll,
                                                  {'__hash__': ext_hash,
                                                   'task': son_escape(task),
                                                   'model': m['config']['model'],
                                                   'images': son_escape(image_config_gen['images'])})
            put_in_performance(split_results, image_config_gen, m, model_hash,
                               image_hash, perf_col, task, ext_hash)

    createCertificateDict(outfile, {'image_file': image_certificate_file,
                                    'models_file': model_certificate_file})
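# Flow sketch: pass one writes each split to the 'splits' collection and
# submits one extract_and_evaluate_parallel_core job per (model, task, split)
# through qsub; after wait_and_get_statuses returns, pass two pulls the
# per-split records back out of 'split_performance' and reduces them into a
# single document per (model, task) with put_in_performance.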
def extract_and_evaluate_parallel_core(image_config_gen, m, task, ext_hash, split_id,
                                       convolve_func_name, cache_port=None):
    if cache_port is None:
        cache_port = NETWORK_CACHE_PORT

    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]
    split_col = db['splits.files']
    split_fs = gridfs.GridFS(db, 'splits')
    splitconf = get_most_recent_files(split_col,
                                      {'__hash__': ext_hash,
                                       'split_id': split_id,
                                       'model': m['config']['model'],
                                       'images': son_escape(image_config_gen['images'])})[0]
    split = cPickle.loads(split_fs.get_version(splitconf['filename']).read())['split']
    res = extract_and_evaluate_core(split, m, convolve_func_name, task, cache_port)
    splitperf_fs = gridfs.GridFS(db, 'split_performance')
    put_in_split_result(res, image_config_gen, m, task, ext_hash, split_id, splitperf_fs)
def evaluate(outfile, feature_certificate, cpath, task, ext_hash):
    conn = pm.Connection(document_class=bson.SON)
    db = conn[DB_NAME]

    perf_fs = gridfs.GridFS(db, 'performance')
    perf_coll = db['performance.files']

    remove_existing(perf_coll, perf_fs, ext_hash)

    feature_certdict = cPickle.load(open(feature_certificate))
    feature_hash = feature_certdict['feature_hash']
    image_hash = feature_certdict['image_hash']
    model_hash = feature_certdict['model_hash']
    image_config_gen = feature_certdict['args']['images']
    model_col = db['models.files']
    feature_fs = gridfs.GridFS(db, 'features')
    feature_col = db['features.files']

    if isinstance(task, list):
        task_list = task
    else:
        task_list = [task]

    model_configs = get_most_recent_files(model_col, {'__hash__': model_hash})

    for m in model_configs:
        print('Evaluating model', m)
        for task in task_list:
            task['universe'] = task.get('universe', SON([]))
            task['universe']['model'] = m['config']['model']
            print('task', task)
            classifier_kwargs = task.get('classifier_kwargs', {})

            split_results = []
            splits = generate_splits(task, feature_hash, 'features')
            for (ind, split) in enumerate(splits):
                print('split', ind)
                train_data = split['train_data']
                test_data = split['test_data']

                train_filenames = [t['filename'] for t in train_data]
                test_filenames = [t['filename'] for t in test_data]
                assert set(train_filenames).intersection(test_filenames) == set([])

                print('train feature extraction ...')
                train_features = sp.row_stack([load_features(f['filename'], feature_fs, m, task)
                                               for f in train_data])
                print('test feature extraction ...')
                test_features = sp.row_stack([load_features(f['filename'], feature_fs, m, task)
                                              for f in test_data])
                train_labels = split['train_labels']
                test_labels = split['test_labels']

                print('classifier ...')
                res = svm.classify(train_features, train_labels, test_features,
                                   test_labels, classifier_kwargs)
                print('Split test accuracy', res['test_accuracy'])
                split_results.append(res)

            model_results = SON([])
            for stat in STATS:
                if stat in split_results[0] and split_results[0][stat] is not None:
                    model_results[stat] = sp.array([split_result[stat]
                                                    for split_result in split_results]).mean()

            out_record = SON([('model', m['config']['model']),
                              ('model_hash', model_hash),
                              ('model_filename', m['filename']),
                              ('images', son_escape(image_config_gen)),
                              ('image_hash', image_hash),
                              ('task', son_escape(task)),
                             ])
            filename = get_filename(out_record)
            out_record['filename'] = filename
            out_record['config_path'] = cpath
            out_record['__hash__'] = ext_hash
            out_record.update(model_results)
            print('dump out ...')
            out_data = cPickle.dumps(SON([('split_results', split_results),
                                          ('splits', splits)]))
            perf_fs.put(out_data, **out_record)

    createCertificateDict(outfile, {'feature_file': feature_certificate})
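# Usage sketch (certificate and config paths are hypothetical): evaluate runs
# once feature extraction has produced a feature certificate; with a
# list-valued `task`, every entry is scored against each model matching the
# certificate's model_hash, and one 'performance' record is written per
# (model, task) pair.
#
#     evaluate('performance.cert', 'features.cert',
#              'config/my_evaluation.py', task_config, ext_hash)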