def getModel(self, exp_uid, args_json):
    try:
        args_dict = self.helper.convert_json(args_json)
        args_dict = verifier.verify(args_dict, self.reference_dict['getModel']['args'])
        alg_label = args_dict['args']['alg_label']
        args = self.butler.experiment.get(key='args')
        for algorithm in args['alg_list']:
            if alg_label == algorithm['alg_label']:
                alg_id = algorithm['alg_id']
        myapp_response = self.call_app_fn(alg_label, alg_id, 'getModel', args_dict)
        myapp_response['exp_uid'] = exp_uid
        myapp_response['alg_label'] = alg_label
        # Log the response of the getModel in ALG-EVALUATION
        if args_dict['args']['logging']:
            alg_log_entry = {'exp_uid': exp_uid, 'alg_label': alg_label,
                             'task': 'getModel',
                             'timestamp': str(utils.datetimeNow())}
            alg_log_entry.update(myapp_response)
            self.butler.log('ALG-EVALUATION', alg_log_entry)
        return json.dumps({'args': myapp_response,
                           'meta': {'log_entry_durations': self.log_entry_durations,
                                    'timestamp': str(utils.datetimeNow())}}), True, ''
    except Exception as error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        full_error = str(traceback.format_exc()) + '\n' + str(error)
        utils.debug_print("getModel Exception: " + full_error, color='red')
        log_entry = {'exp_uid': exp_uid, 'task': 'getModel', 'error': full_error,
                     'timestamp': utils.datetimeNow(), 'args_json': args_json}
        self.butler.ell.log(self.app_id + ':APP-EXCEPTION', log_entry)
        traceback.print_tb(exc_traceback)
        raise Exception(error)
def processAnswer(self, exp_uid, args_json):
    try:
        args_dict = self.helper.convert_json(args_json)
        args_dict = verifier.verify(args_dict, self.reference_dict['processAnswer']['args'])
        # Update timing info in query
        query = self.butler.queries.get(uid=args_dict['args']['query_uid'])
        timestamp_answer_received = args_dict['args'].get('timestamp_answer_received', None)
        delta_datetime = utils.str2datetime(timestamp_answer_received) - \
            utils.str2datetime(query['timestamp_query_generated'])
        round_trip_time = delta_datetime.total_seconds()
        response_time = float(args_dict['args'].get('response_time', 0.))
        query_update = self.call_app_fn(query['alg_label'], query['alg_id'],
                                        'processAnswer', args_dict)
        query_update.update({'response_time': response_time,
                             'network_delay': round_trip_time - response_time,
                             'timestamp_answer_received': timestamp_answer_received})
        self.butler.queries.set_many(uid=args_dict['args']['query_uid'],
                                     key_value_dict=query_update)
        return json.dumps({'args': {},
                           'meta': {'log_entry_durations': self.log_entry_durations}}), True, ''
    except Exception as error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        full_error = str(traceback.format_exc()) + '\n' + str(error)
        utils.debug_print("processAnswer Exception: " + full_error, color='red')
        log_entry = {'exp_uid': exp_uid, 'task': 'processAnswer', 'error': full_error,
                     'timestamp': utils.datetimeNow(), 'args_json': args_json}
        self.butler.ell.log(self.app_id + ':APP-EXCEPTION', log_entry)
        traceback.print_tb(exc_traceback)
        raise Exception(error)
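# Tiny stand-alone illustration of the timing bookkeeping in processAnswer()
# above: network_delay is the round trip (answer received minus query
# generated) minus the time the participant actually spent responding.  The
# timestamps and response time below are made up for the example.
from datetime import datetime

generated = datetime(2016, 1, 1, 12, 0, 0)
received = datetime(2016, 1, 1, 12, 0, 5)
response_time = 3.2
round_trip_time = (received - generated).total_seconds()
network_delay = round_trip_time - response_time
assert abs(network_delay - 1.8) < 1e-9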
def upload_target(filename, file_obj, bucket_name, aws_key, aws_secret_key,
                  i=None, get_bucket=True):
    if get_bucket:
        bucket = s3.get_bucket(bucket_name, aws_key, aws_secret_key)
    else:
        bucket = s3.create_bucket(bucket_name, aws_key, aws_secret_key)
    utils.debug_print('Uploading target: {}'.format(filename))
    url = s3.upload(filename, StringIO(file_obj), bucket)
    target_types = {'png': 'image', 'jpeg': 'image', 'jpg': 'image', 'gif': 'image',
                    'mp4': 'video', 'mov': 'video',
                    'txt': 'text', 'csv': 'text'}
    filetype = filename.split('.')[-1]
    if filetype not in target_types:
        msg = ('Target not recognized (extension: "{}"). '
               'Available extensions: {}').format(filetype, list(target_types.keys()))
        raise ValueError(msg)
    utils.debug_print('Done uploading target: {}'.format(filename))
    return {'target_id': str(i),
            'primary_type': target_types[filetype],
            'primary_description': url,
            'alt_type': 'text',
            'alt_description': filename}
def unpack(s, aws_key, aws_secret_key, bucket_name, n_jobs=None, get_bucket=True):
    base64_zip = io.BytesIO(s)
    zip_file = zipfile.ZipFile(base64_zip)
    files = zipfile_to_dictionary(zip_file)
    if not n_jobs:
        n_jobs = min(len(files), 50)
    if not bucket_name:
        bucket_name = '{}{}'.format(aws_key.lower(), utils.random_string(length=20))
    # TODO: trim here for JSON object to append to dictionaries
    # TODO: manage CSV targets here
    # TODO: how come creating a S3 bucket isn't working for me?
    utils.debug_print('Beginning to upload targets')
    try:
        targets = Parallel(n_jobs=n_jobs, backend='threading')(
            delayed(upload_target, check_pickle=False)
            (name, file, bucket_name, aws_key, aws_secret_key, i=i, get_bucket=True)
            for i, (name, file) in enumerate(files.items()))
    except Exception:
        utils.debug_print('Whoops, parallel upload failed. '
                          'Falling back to a serial upload.')
        targets = [upload_target(name, file, bucket_name, aws_key, aws_secret_key,
                                 i=i, get_bucket=True)
                   for i, (name, file) in enumerate(files.items())]
    return targets
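# A minimal, self-contained sketch (not the real upload path) of the joblib
# "parallel with serial fallback" pattern used in unpack() above.  The worker
# function here is a stand-in for upload_target, so it needs no AWS credentials.
from joblib import Parallel, delayed

def _echo_upload(name, data, i=None):
    # Placeholder worker: just reports what would have been uploaded.
    return {'target_id': str(i), 'name': name, 'size': len(data)}

files = {'a.png': b'\x89PNG...', 'b.txt': b'hello'}
try:
    targets = Parallel(n_jobs=2, backend='threading')(
        delayed(_echo_upload)(name, data, i=i)
        for i, (name, data) in enumerate(files.items()))
except Exception:
    targets = [_echo_upload(name, data, i=i)
               for i, (name, data) in enumerate(files.items())]
print(targets)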
def lock(self, name, **kwargs):
    try:
        self.ensure_connection()
        name = self.key_prefix + name
        return self.cache.lock(name, **kwargs)
    except Exception as e:
        utils.debug_print("Butler.Collection.Memory.lock exception: {}".format(e))
        return None
def exists(self, key):
    try:
        self.ensure_connection()
        key = self.key_prefix + key
        return self.cache.exists(key)
    except Exception as e:
        utils.debug_print("Butler.Collection.Memory.exists exception: {}".format(e))
        return None
def pop_list(self, database_id, bucket_id, doc_uid, key, value):
    """
    Pops a value from a list.
    If value=0, pops from the start of the list.
    If value=-1, pops from the end of the list.
    (Note this is inconsistent with Mongo's API so that it stays consistent
    with Python's pop.)  See the assignment of mongo_index below for more info.

    Inputs:
        (string) database_id, (string) bucket_id, (string) doc_uid,
        (string) key, (int) value
    Outputs:
        (any) value, (bool) didSucceed, (string) message
    Usage: ::\n
        value, didSucceed, message = db.pop_list(database_id, bucket_id, doc_uid, key, value)
    """
    if self.client is None:
        didSucceed, message = self.connectToMongoServer()
        if not didSucceed:
            return None, False, message
    # For Mongo's $pop, 1 means last element, -1 means first element
    try:
        if value == -1:
            mongo_index = 1
        elif value == 0:
            mongo_index = -1
        else:
            raise DatabaseException("can only pop first (value=0) or last (value=-1) element")
        try:
            return_value = self.client[database_id][bucket_id].find_and_modify(
                {"_id": doc_uid}, {'$pop': {key: mongo_index}})[key]
        except KeyError as e:
            if e.args[0] == key:
                raise DatabaseException("key '{}' not found in document '{}.{}'".format(key, database_id, bucket_id))
            elif e.args[0] == bucket_id:
                raise DatabaseException("bucket '{}' not found in database '{}'".format(bucket_id, database_id))
            elif e.args[0] == database_id:
                raise DatabaseException("database '{}' not found".format(database_id))
            else:
                raise DatabaseException("unknown KeyError: '{}' not found".format(e))
        except OperationFailure:
            # This gets thrown if you try to pop from a non-list
            raise DatabaseException("cannot pop from non-list")
        if return_value:
            return_value = return_value[value]
        else:
            raise DatabaseException("cannot pop from empty list")
        return_value = self.undoDatabaseFormat(return_value)
        return return_value, True, 'From Mongo'
    except DatabaseException as e:
        error = "PermStore.pop_list failed with exception: {}".format(e)
        utils.debug_print(error)
        return None, False, error
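# Pure-Python illustration of the index convention documented above: callers
# pass Python-style positions (0 = front, -1 = back), which pop_list translates
# into Mongo's $pop values (-1 pops the first element, 1 pops the last).
def python_to_mongo_pop(value):
    if value == 0:
        return -1   # Mongo: pop from the front
    if value == -1:
        return 1    # Mongo: pop from the back
    raise ValueError("can only pop first (value=0) or last (value=-1) element")

doc = {'queries': [1, 2, 3]}
assert python_to_mongo_pop(0) == -1 and doc['queries'].pop(0) == 1
assert python_to_mongo_pop(-1) == 1 and doc['queries'].pop(-1) == 3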
def getQuery(self, butler):
    num_ans = butler.algorithms.get(key='num_reported_answers')
    query_list = butler.algorithms.get(key='query_list')
    i = num_ans % len(query_list)
    query = query_list[i]
    utils.debug_print(query)
    # butler.participants.set(key='query', value=query)
    # append the current query to do_not_ask
    # butler.algorithms.append(key='do_not_ask', value=query)
    return query[2], query[0], query[1]
def set(self, key, value):
    self.check_prefix()
    key = self.key_prefix + key
    try:
        self.ensure_connection()
        l = len(value)
        n = self.num_entries(l)
        utils.debug_print("Setting {} bytes in {} entries".format(l, n))
        for i in range(n):
            k = key + ":" + str(i)
            self.cache.set(k, value[i * self.max_entry_size:(i + 1) * self.max_entry_size])
        return self.cache.set(key, "{}:{}".format(str(n), str(l)))
    except Exception as e:
        utils.debug_print("Butler.Collection.Memory.set exception: {}".format(e))
        return False
def docs(app_id=None, form="raw"):
    if app_id:
        filename = '{0}/myApp.yaml'.format(app_id)
        utils.debug_print(filename)
        api, blank, pretty = doc_gen.get_docs(filename, 'apps/')
        if form == "pretty":
            return render_template('doc.html', doc_string=pretty,
                                   base_dir="/assistant/static")
        elif form == "blank":
            return render_template('raw.html', doc=blank)
        elif form == "raw":
            return render_template('raw.html', doc=api)
    message = ('Welcome to the next.discovery system.\n '
               'Available apps {}'.format(', '.join(utils.get_supported_apps())))
    return render_template('raw.html', doc=message)
def set_file(self, key, f):
    self.check_prefix()
    key = self.key_prefix + key
    try:
        self.ensure_connection()
        f.seek(0, os.SEEK_END)
        l = f.tell()
        f.seek(0, 0)
        n = self.num_entries(l)
        utils.debug_print("Setting {} bytes in {} entries".format(l, n))
        for i in range(n):
            k = key + ":" + str(i)
            v = f.read(self.max_entry_size)
            self.cache.set(k, v)
        return self.cache.set(key, "{}:{}".format(str(n), str(l)))
    except Exception as e:
        utils.debug_print("Butler.Collection.Memory.set_file exception: {}".format(e))
        return False
def get(self, key):
    self.check_prefix()
    try:
        self.ensure_connection()
        key = self.key_prefix + key
        d = self.cache.get(key)
        n, l = d.split(":")
        l = int(l)
        n = int(n)
        ans = ""
        utils.debug_print("Getting {} bytes in {} entries".format(l, n))
        for i in range(n):
            k = key + ":" + str(i)
            ans += self.cache.get(k)
        return ans
    except Exception as e:
        utils.debug_print("Butler.Collection.Memory.get exception: {}".format(e))
        return None
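# In-memory illustration (a plain dict stands in for the Redis cache) of the
# chunking scheme shared by set()/get() above: the main key stores
# "num_chunks:total_length" and chunk i lives under "key:i".  MAX_ENTRY_SIZE is
# tiny here only to make the chunking visible.
MAX_ENTRY_SIZE = 4
_cache = {}

def chunked_set(key, value):
    n = (len(value) + MAX_ENTRY_SIZE - 1) // MAX_ENTRY_SIZE
    for i in range(n):
        _cache[key + ":" + str(i)] = value[i * MAX_ENTRY_SIZE:(i + 1) * MAX_ENTRY_SIZE]
    _cache[key] = "{}:{}".format(n, len(value))

def chunked_get(key):
    n, _ = _cache[key].split(":")
    return "".join(_cache[key + ":" + str(i)] for i in range(int(n)))

chunked_set("doc", "hello world")
assert chunked_get("doc") == "hello world"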
def initExp(self, butler, init_algs, args):
    utils.debug_print("AA: " + str(args))
    if 'targetset' in args['targets'].keys():
        n = len(args['targets']['targetset'])
        self.TargetManager.set_targetset(butler.exp_uid, args['targets']['targetset'])
        d = len(args['targets']['targetset'][0]['meta']['features'])
    args['n'] = n
    args['d'] = d
    del args['targets']
    alg_data = {}
    algorithm_keys = ['n', 'failure_probability']
    for key in algorithm_keys:
        if key in args:
            alg_data[key] = args[key]
    init_algs(alg_data)
    return args
def get_file(self, key):
    self.check_prefix()
    try:
        self.ensure_connection()
        key = self.key_prefix + key
        d = self.cache.get(key)
        f = StringIO.StringIO()
        n, l = d.split(":")
        l = int(l)
        n = int(n)
        utils.debug_print("Getting {} bytes in {} entries".format(l, n))
        for i in range(n):
            k = key + ":" + str(i)
            f.write(self.cache.get(k))
        f.seek(0, 0)
        return f
    except Exception as e:
        utils.debug_print("Butler.Collection.Memory.get_file exception: {}".format(e))
        return None
def modelUpdate(self, butler, task_args):
    arm_id = task_args['arm_id']
    reward = task_args['reward']
    participant_uid = task_args['participant_uid']
    d = butler.algorithms.get(key='d')
    R = butler.algorithms.get(key='R')
    S = butler.algorithms.get(key='S')
    delta = butler.algorithms.get(key='delta')
    ridge = butler.algorithms.get(key='ridge')
    invVt = np.array(butler.participants.get(uid=participant_uid, key='invVt'))
    b = np.array(butler.participants.get(uid=participant_uid, key='b'))
    features = np.load('features.npy')
    x_invVt_norm = butler.participants.get(uid=participant_uid, key='x_invVt_norm')
    t = butler.participants.get(uid=participant_uid, key='num_reported_answers')
    xt = features[arm_id, :]
    u = invVt.dot(xt)
    invVt -= np.outer(u, u) / (1 + np.inner(xt, u))
    x_invVt_norm -= np.dot(features, u) ** 2 / (1 + np.inner(xt, u))
    b += reward * xt
    theta_hat = invVt.dot(b)
    utils.debug_print((1 + t / (ridge * d)))
    sqrt_beta = R * np.sqrt(d * np.log((1 + t / (ridge * d)) / delta)) + np.sqrt(ridge) * S
    expected_rewards = np.dot(features, theta_hat) + sqrt_beta * np.sqrt(x_invVt_norm)
    butler.participants.set(uid=participant_uid, key='arm_order',
                            value=np.argsort(expected_rewards)[::-1])
    butler.participants.set(uid=participant_uid, key='invVt', value=invVt)
    butler.participants.set(uid=participant_uid, key='b', value=b)
    butler.participants.set(uid=participant_uid, key='x_invVt_norm', value=x_invVt_norm)
    return True
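# Numerical sanity check of the rank-one update used in modelUpdate() above:
# for a positive-definite V, inv(V + x x^T) equals
# invV - (invV x)(invV x)^T / (1 + x^T invV x)  (the Sherman-Morrison formula).
# The matrix and vector below are random test data, not experiment data.
import numpy as np

d = 4
rng = np.random.RandomState(0)
A = rng.randn(d, d)
V = A.dot(A.T) + np.eye(d)          # a "ridge"-regularized covariance matrix
x = rng.randn(d)

invV = np.linalg.inv(V)
u = invV.dot(x)
updated = invV - np.outer(u, u) / (1 + np.inner(x, u))
assert np.allclose(updated, np.linalg.inv(V + np.outer(x, x)))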
def initExp(self, exp_uid, args_json):
    try:
        self.helper.ensure_indices(self.app_id, self.butler.db, self.butler.ell)
        args_dict = self.helper.convert_json(args_json)
        args_dict = verifier.verify(args_dict, self.reference_dict['initExp']['args'])
        args_dict['exp_uid'] = exp_uid  # to get doc from db
        args_dict['start_date'] = utils.datetime2str(utils.datetimeNow())
        self.butler.admin.set(uid=exp_uid, value={'exp_uid': exp_uid,
                                                  'app_id': self.app_id,
                                                  'start_date': str(utils.datetimeNow())})
        self.butler.experiment.set(value={'exp_uid': exp_uid})
        args_dict['args'] = self.init_app(exp_uid, args_dict['args']['alg_list'],
                                          args_dict['args'])
        args_dict['git_hash'] = git_hash
        self.butler.experiment.set_many(key_value_dict=args_dict)
        return '{}', True, ''
    except Exception as error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        full_error = str(traceback.format_exc()) + '\n' + str(error)
        utils.debug_print("initExp Exception: " + full_error, color='red')
        log_entry = {'exp_uid': exp_uid, 'task': 'initExp', 'error': full_error,
                     'timestamp': utils.datetimeNow(), 'args_json': args_json}
        self.butler.ell.log(self.app_id + ':APP-EXCEPTION', log_entry)
        traceback.print_tb(exc_traceback)
        return '{}', False, str(error)
def initExp(self, exp_uid, args_json):
    try:
        self.helper.ensure_indices(self.app_id, self.butler.db, self.butler.ell)
        args_dict = self.helper.convert_json(args_json)
        args_dict = verifier.verify(args_dict, self.reference_dict['initExp']['args'])
        args_dict['exp_uid'] = exp_uid  # to get doc from db
        args_dict['start_date'] = utils.datetime2str(utils.datetimeNow())
        self.butler.admin.set(uid=exp_uid, value={'exp_uid': exp_uid,
                                                  'app_id': self.app_id,
                                                  'start_date': str(utils.datetimeNow())})
        utils.debug_print("initExp args_dict: " + str(args_dict))
        args_dict['args'] = self.init_app(exp_uid, args_dict['args']['alg_list'],
                                          args_dict['args'])
        args_dict['git_hash'] = git_hash
        self.butler.experiment.set(value=args_dict)
        return '{}', True, ''
    except Exception as error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        full_error = str(traceback.format_exc()) + '\n' + str(error)
        utils.debug_print("initExp Exception: " + full_error, color='red')
        log_entry = {'exp_uid': exp_uid, 'task': 'initExp', 'error': full_error,
                     'timestamp': utils.datetimeNow(), 'args_json': args_json}
        self.butler.ell.log(self.app_id + ':APP-EXCEPTION', log_entry)
        traceback.print_tb(exc_traceback)
        return '{}', False, str(error)
def processAnswer(self, butler, alg, args):
    query = butler.queries.get(uid=args['query_uid'])
    target = query['target_indices']
    target_label = args['target_label']
    # DEBUG
    # utils.debug_print("type(target_label)")
    # utils.debug_print(type(target_label))
    num_reported_answers = butler.experiment.increment(
        key='num_reported_answers_for_' + query['alg_label'])
    labelled_row = pickle.loads(butler.memory.get(str(target['index'])))
    if labelled_row is None:
        utils.debug_print("Labelled row doesn't exist")
        return {}
    # make a getModel call ~ every n/4 queries - note that this query will NOT
    # be included in the predict
    experiment = butler.experiment.get()
    d = experiment['args']['d']
    # if num_reported_answers % ((d+4)/4) == 0:
    #     butler.job('getModel', json.dumps({'exp_uid': butler.exp_uid,
    #                                        'args': {'alg_label': query['alg_label'],
    #                                                 'logging': True}}))
    alg({'target_index': target['index'], 'target_label': target_label})
    return {'target_index': target['index'], 'target_label': target_label}
def getQuery(self, butler, alg, args):
    experiment = butler.experiment.get()
    n = experiment['args']['n']
    exp_uid = experiment['exp_uid']
    participant_uid = args['participant_uid']
    num_responses = butler.participants.get(uid=participant_uid, key='num_responses')
    init_arm = int(args['init_arm'])
    utils.debug_print('init_arm:', init_arm)
    if num_responses == 0 or num_responses is None:
        butler.participants.set(uid=participant_uid, key='init_arm', value=init_arm)
        arm_order = range(n)
        np.random.shuffle(arm_order)
        butler.participants.set(uid=participant_uid, key='arm_order', value=arm_order)
        butler.participants.set(uid=participant_uid, key='do_not_ask', value=[init_arm])
        utils.debug_print('Initialized lists in getQuery')
    alg_response = alg({'participant_uid': participant_uid})
    exp_uid = butler.exp_uid
    utils.debug_print('Alg_resp:', alg_response)
    target = self.TargetManager.get_target_item(exp_uid, alg_response)
    init_target = init_arm and self.TargetManager.get_target_item(exp_uid, init_arm)
    return_dict = {
        'target_indices': [alg_response],
        'targets': [target],
        'init_target': init_target,
        'instructions': 'Is this the kind of document you are looking for?',
        # 'instructions': 'Is this the kind of image you are looking for?',
        'count': 1,
    }
    return return_dict
def post(self):
    utils.debug_print('experiment POST data:', request.data)
    post_parser = exp_parser.copy()
    post_parser.add_argument('app_id', type=str, required=True)
    post_parser.add_argument('args', type=dict, required=True)
    # Validate args with post_parser
    args_data = post_parser.parse_args()
    utils.debug_print(args_data)
    app_id = args_data['app_id']
    utils.debug_print(app_id)
    # Create and set exp_uid
    exp_uid = '%030x' % random.randrange(16**30)
    # Args from dict to json type
    args_json = json.dumps(args_data)
    # Execute initExp through the broker
    response_json, didSucceed, message = broker.applyAsync(app_id, exp_uid, 'initExp',
                                                           json.dumps(args_data))
    if not didSucceed:
        return attach_meta({}, meta_error['InitExpError'], backend_error=message), 400
    return attach_meta({'exp_uid': exp_uid}, meta_success), 200
def __get_domain_for_job(self, job_id):
    """
    Computes which domain to run a given job_id on.

    Git commit c1e4f8aacaa42fae80e111979e3f450965643520 has support for
    multiple worker nodes. See the code in broker.py, cluster_monitor.py, and
    the docker-compose file in that commit to see how to get that up and
    running. It uses a simple circular hashing scheme to load balance
    getQuery/processAnswer calls.

    This implementation assumes just a single master node and no workers, so
    only a single hostname (e.g. localhost) has celery workers.
    """
    if self.r.exists('MINIONWORKER_HOSTNAME'):
        self.hostname = self.r.get('MINIONWORKER_HOSTNAME')
        utils.debug_print('Found hostname: {} (Redis)'.format(self.hostname))
    else:
        with open('/etc/hosts', 'r') as fid:
            for line in fid:
                if 'MINIONWORKER' in line:
                    self.hostname = line.split('\t')[1].split(' ')[1]
                    self.r.set('MINIONWORKER_HOSTNAME', self.hostname, ex=360)  # expire after 6 minutes
                    utils.debug_print('Found hostname: {} (/etc/hosts)'.format(self.hostname))
                    break
        if self.hostname is None:
            import socket
            self.hostname = socket.gethostname()
            self.r.set('MINIONWORKER_HOSTNAME', self.hostname, ex=360)  # expire after 6 minutes
            utils.debug_print('Found hostname: {} (socket.gethostname())'.format(self.hostname))
    return self.hostname
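# Hedged sketch of the "circular hashing" load-balancing idea mentioned in the
# docstring above (the current implementation always returns the single master
# hostname).  The hostnames and the choice of hash here are illustrative only.
import hashlib

def domain_for_job(job_id, hostnames):
    digest = int(hashlib.md5(job_id.encode('utf-8')).hexdigest(), 16)
    return hostnames[digest % len(hostnames)]

print(domain_for_job('exp123:getQuery', ['worker1', 'worker2', 'worker3']))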
def basic_info(self, app, butler):
    """
    Returns basic statistics like the number of queries, participants, etc.
    """
    utils.debug_print("butler algo corona print2")
    utils.debug_print(butler.algorithms.get(pattern={'exp_uid': app.exp_uid}))
    experiment_dict = butler.experiment.get()
    # utils.debug_print("experiment_dict dasboard corona")
    # utils.debug_print(experiment_dict)
    alg_list = butler.experiment.get(key='args')['alg_list']
    # utils.debug_print("ALGO list app dasboard corona")
    # for algorithm in alg_list:
    #     utils.debug_print(algorithm)
    algo_dict = butler.algorithms.get(pattern={'exp_uid': app.exp_uid})
    utils.debug_print("algo_dict dasboard corona")
    utils.debug_print(algo_dict)
    # git_hash = rm.get_git_hash_for_exp_uid(exp_uid)
    git_hash = experiment_dict.get('git_hash', 'None')
    # labelled_list = butler.algorithms.get(key="labelled_list")
    # utils.debug_print("labelled_list app dasboard corona")
    # utils.debug_print(labelled_list)
    # start_date = utils.str2datetime(butler.admin.get(uid=app.exp_uid)['start_date'])
    start_date = experiment_dict.get('start_date', 'Unknown') + ' UTC'
    # participant_uids = rm.get_participant_uids(exp_uid)
    participants = butler.participants.get(pattern={'exp_uid': app.exp_uid})
    num_participants = len(participants)
    queries = butler.queries.get(pattern={'exp_uid': app.exp_uid})
    num_queries = len(queries)
    return_dict = {
        'git_hash': git_hash,
        'exp_start_data': start_date,
        'num_participants': num_participants,
        'num_queries': num_queries,
        'meta': {'last_dashboard_update': '<1 minute ago'}
    }
    return return_dict
def pop_list(self, bucket_id, doc_uid, key, value):
    """
    Inputs:
        (string) bucket_id, (string) doc_uid, (string) key, (int) value
        value=-1 pops the last item of the list
        value=0 pops the first item of the list
    Outputs:
        (python object) value, (bool) didSucceed, (string) message
    Usage: ::\n
        value, didSucceed, message = db.pop_list(bucket_id, doc_uid, key, value)
    """
    try:
        response, dt = utils.timeit(self.permStore.pop_list)(
            constants.app_data_database_id, bucket_id, doc_uid, key, value)
        value, didSucceedPerm, messagePerm = response
        self.duration_permStoreSet += dt
        return value, didSucceedPerm, messagePerm
    except Exception as e:
        error = "DatabaseAPI.pop_list failed with exception: {}".format(e)
        utils.debug_print(error)
        return None, False, error
def getModel(self, butler):
    # The model is simply the vector of weights and a record of the number of
    # reported answers.
    utils.debug_print(butler.algorithms.get(key=['weights', 'num_reported_answers']))
    return butler.algorithms.get(key=['weights', 'num_reported_answers'])
def post(self): utils.debug_print('POSTED!') utils.debug_print('H', request.headers) try: utils.debug_print('L', len(request.get_data())) except Exception as exc: print(exc) print('OH NO an error in assistant_blueprint!', exc, sys.exc_info()) # TODO? replace with msgpack args = self.deserialise(request.get_data()) # Unpacking the YAML/ZIP file for key in args: if key not in {'bucket_id', 'key_id', 'secret_key'}: comma_idx = args[key].find(',') args[key] = args[key][comma_idx + 1:] if args[key] in {'True', 'False'}: args[key] = True if args[key] == 'True' else False else: args[key] = base64.decodestring(args[key]) if all([ key not in args for key in ['bucket_id', 'key_id', 'sercret_key'] ]): args['upload'] = False else: args['upload'] = True utils.debug_print('args.keys() = ', args.keys()) args['args'] = yaml.load(args['args']) try: init_exp_args = args['args'] utils.debug_print("args.keys = ", args['args'].keys()) if 'targets' in args.keys(): target_zipfile = args['targets'] utils.debug_print("args = ", args) if args.get('upload', True): bucket_id = args['bucket_id'] key_id = args['key_id'] secret_key = args['secret_key'] for x_ in ['bucket_id', 'secret_key', 'key_id']: utils.debug_print(x_, args[x_]) # Unpack the targets targets = target_unpacker.unpack(target_zipfile, key_id, secret_key, bucket_id) else: targets = target_unpacker.unpack_csv_file(target_zipfile) init_exp_args['args']['targets'] = {'targetset': targets} # Init the experiment: app_id = init_exp_args['app_id'] exp_uid = '%030x' % random.randrange(16**30) r = broker.applyAsync(app_id, exp_uid, 'initExp', json.dumps(init_exp_args)) response_json, didSucceed, message = r if not didSucceed: raise ValueError(message) except: tb = traceback.format_exc() info = sys.exc_info() if hasattr(info[1], 'message') and len(info[1].message) > 0: message = info[1].message if 'time' in message: message += ( "\nNOTE: error has to do with time; try " "restarting docker, more detail at " "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and" ) else: message = str(info[1]) + str(info[-1]) message = '\n'.join(tb.split('\n')[-5:]) message = message + '\n\nDetails:\n' + tb return {'success': False, 'message': message, 'exp_uid': None} return { 'success': didSucceed, 'message': message, 'exp_uid': exp_uid, 'app_id': args['args']['app_id'] }
def getQuery(self, butler, alg, args): sttime = time.time() alg_response = alg({'participant_uid': args['participant_uid']}) # Get Unlabelled Set #alg_response contains index returned from LogisticRegressionActive getQuery method #Retrieve the row using this index unlabelled_row = butler.memory.get(str(alg_response)) if unlabelled_row is None: utils.debug_print("No row was retrieved") return {} unlabelled_row = pickle.loads(unlabelled_row).replace(np.nan, "None") unlabelled_row_dict = unlabelled_row.to_dict() sra_study_id = unlabelled_row_dict.get('sra_study_id') sra_sample_id = unlabelled_row_dict.get('sra_sample_id') key_value = unlabelled_row_dict.get('key_value') #Convert from str to dict key_value_dict = ast.literal_eval(key_value) ontology_mapping = unlabelled_row_dict.get('ontology_mapping') # Convert from str to list ontology_mapping_list = ast.literal_eval(ontology_mapping) ont_mapping_dict = {} if ontology_mapping_list is None: ontology_mapping_list = [] for ont in ontology_mapping_list: ont_org = ont return_link = "" #pre-processing steps ont = ont.replace(":", "_") ''' "DOID": "DOID.17-01-30.obo", "UBERON": "UBERON.17-01-30.obo", "CL": "CL.18-11-13.obo", "CVCL": "CVCL.17-01-30.obo", "UO": "UO.17-01-30.obo", "EFO": "EFO.17-01-30.obo", "CHEXBI": "CHEBI.17-01-30.obo", "GO": "GO.19-01-18.obo" ''' #TODO: Other terms link if "CL" in ont: return_link = "https://www.ebi.ac.uk/ols/ontologies/cl/terms?short_form=" + ont elif "UBERON" in ont: return_link = "https://www.ebi.ac.uk/ols/ontologies/uberon/terms?short_form=" + ont elif "DOID" in ont: return_link = "https://www.ebi.ac.uk/ols/ontologies/doid/terms?short_form=" + ont elif "EFO" in ont: return_link = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + ont elif "CVCL" in ont: return_link = "https://web.expasy.org/cellosaurus/" + ont ont_mapping_dict[ont_org] = return_link #retrieve study row based on study_id study_row_str = pickle.loads(butler.memory.get(sra_study_id)).replace( np.nan, "None") study_row_json = study_row_str.to_dict() #Class-wise confidence of all classes cur_confidence = butler.memory.get("cur_confidence") if cur_confidence is None: cur_confidence = pickle.dumps([]) cur_confidence = pickle.loads(cur_confidence) utils.debug_print(cur_confidence) #Get name of classes lr_classes = butler.memory.get("lr_classes") if lr_classes is None: lr_classes = pickle.dumps([]) lr_classes = pickle.loads(lr_classes) #this is what is received in widgets/getQuery_widget.html ret = { 'target_indices': unlabelled_row_dict, 'study': study_row_json, 'key_value': key_value_dict, 'ontology_mapping': ont_mapping_dict, 'cur_confidence': cur_confidence, 'lr_classes': lr_classes, 'sra_sample_id': sra_sample_id } return ret
def getQuery(self, exp_uid, args_json):
    try:
        args_dict = self.helper.convert_json(args_json)
        args_dict = verifier.verify(args_dict, self.reference_dict['getQuery']['args'])
        experiment_dict = self.butler.experiment.get()
        alg_list = experiment_dict['args']['alg_list']
        participant_to_algorithm_management = experiment_dict['args']['participant_to_algorithm_management']
        algorithm_management_settings = experiment_dict['args']['algorithm_management_settings']
        # Create the participant dictionary in the participants bucket if needed.
        # Also pull out the label and id for this algorithm.
        participant_uid = args_dict['args'].get('participant_uid', args_dict['exp_uid'])
        # Check to see if the first participant has come by and, if not, save to db
        participant_doc = self.butler.participants.get(uid=participant_uid)
        first_participant_query = participant_doc is None
        if first_participant_query:
            participant_doc = {}
            self.butler.participants.set(uid=participant_uid,
                                         value={'exp_uid': exp_uid,
                                                'participant_uid': participant_uid})
        if (participant_uid == exp_uid) or (participant_to_algorithm_management == 'one_to_many') or first_participant_query:
            if algorithm_management_settings['mode'] == 'fixed_proportions':
                labels = [alg['alg_label'] for alg in algorithm_management_settings['params']]
                prop = [prop_item['proportion'] for prop_item in algorithm_management_settings['params']]
                # reorder prop and alg_list to have the same order
                new_alg_list = []
                broken = False
                for label in labels:
                    broken = False
                    for alg in alg_list:
                        if label == alg['alg_label']:
                            new_alg_list += [alg]
                            broken = True
                            break
                    if not broken:
                        raise Exception('alg_label not present for both proportions and labels')
                chosen_alg = numpy.random.choice(new_alg_list, p=prop)
            elif algorithm_management_settings['mode'] == 'custom':
                chosen_alg = self.myApp.chooseAlg(self.butler, alg_list, args_dict['args'])
            else:
                chosen_alg = numpy.random.choice(alg_list)
            alg_id = chosen_alg['alg_id']
            alg_label = chosen_alg['alg_label']
            if first_participant_query and (participant_to_algorithm_management == 'one_to_one'):
                self.butler.participants.set(uid=participant_uid, key='alg_id', value=alg_id)
                self.butler.participants.set(uid=participant_uid, key='alg_label', value=alg_label)
        elif participant_to_algorithm_management == 'one_to_one':
            alg_id = participant_doc['alg_id']
            alg_label = participant_doc['alg_label']
        query_uid = utils.getNewUID()
        args_dict['args'].update(query_uid=query_uid)
        query_doc = self.call_app_fn(alg_label, alg_id, 'getQuery', args_dict)
        query_doc.update({'participant_uid': participant_uid,
                          'alg_id': alg_id,
                          'exp_uid': exp_uid,
                          'alg_label': alg_label,
                          'timestamp_query_generated': str(utils.datetimeNow()),
                          'query_uid': query_uid})
        self.butler.queries.set(uid=query_uid, value=query_doc)
        return json.dumps({'args': query_doc,
                           'meta': {'log_entry_durations': self.log_entry_durations}}), True, ''
    except Exception as error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        full_error = str(traceback.format_exc()) + '\n' + str(error)
        utils.debug_print("getQuery Exception: " + full_error, color='red')
        log_entry = {'exp_uid': exp_uid, 'task': 'getQuery', 'error': full_error,
                     'timestamp': utils.datetimeNow(), 'args_json': args_json}
        self.butler.ell.log(self.app_id + ':APP-EXCEPTION', log_entry)
        traceback.print_tb(exc_traceback)
        return '{}', False, str(error)
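# Small stand-alone sketch of the fixed_proportions branch in getQuery() above:
# an algorithm is drawn according to the configured proportions.  The labels
# and weights below are made up for illustration, not taken from a real config.
import numpy

alg_list = [{'alg_label': 'RandomSampling', 'alg_id': 'RandomSampling'},
            {'alg_label': 'UncertaintySampling', 'alg_id': 'UncertaintySampling'}]
prop = [0.25, 0.75]
chosen_alg = numpy.random.choice(alg_list, p=prop)
print(chosen_alg['alg_label'])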
def cumulative_reward_plot(self, app, butler):
    """
    Description: Returns a multiline plot where there is a one-to-one mapping
    between lines and algorithms, and each line indicates the error on the
    validation set with respect to number of reported answers

    Expected input: None
    Expected output (in dict): (dict) MPLD3 plot dictionary
    """
    # get list of algorithms associated with project
    args = butler.experiment.get(key='args')
    # num_algs = len(args['alg_list'])
    # alg_labels = []
    # for i in range(num_algs):
    #     alg_labels += [args['alg_list'][i]['alg_label']]
    plot_data = butler.experiment.get(key='plot_data')
    # utils.debug_print('butler.algs.plot_data in Dashboard: ', plot_data)
    df = pd.DataFrame(plot_data)
    df.columns = [u'alg', u'arm_pulled', u'initial_arm', u'participant_uid',
                  u'rewards', u'time']
    # df = df.pivot_table(columns='initial arm', index='time', values='rewards', aggfunc=np.mean)
    algs = list(df['alg'].unique())
    utils.debug_print('algs: ', algs)
    init_arms = df['initial_arm'].unique()
    utils.debug_print('init_arms: ', init_arms)
    import matplotlib.pyplot as plt
    import mpld3
    fig, ax = plt.subplots(nrows=1, ncols=1, subplot_kw=dict(axisbg='#EEEEEE'))
    T = args['num_tries']
    # utils.debug_print('T: ', T)
    for alg in algs:
        alg_results = np.zeros(T)
        for i, init_arm in enumerate(init_arms):
            utils.debug_print(alg)
            utils.debug_print(init_arm)
            result = df.query('alg == "{alg}" and initial_arm == {iarm}'.format(
                alg=alg, iarm=init_arm))[['time', 'rewards',
                                          'participant_uid']].groupby('time').mean()
            rewards = np.array(result['rewards'])
            alg_results[0:len(rewards)] += rewards / float(len(init_arms))
            ax.plot(range(len(rewards)), np.cumsum(rewards),
                    label='{alg}'.format(alg=alg))
    ax.set_xlabel('Time')
    ax.set_ylabel('Average cumulative rewards')
    ax.set_title('Cumulative rewards', size=10)
    legend = ax.legend(loc=2, ncol=2, mode="expand")
    for label in legend.get_texts():
        label.set_fontsize('xx-small')
    plot_dict = mpld3.fig_to_dict(fig)
    plt.close()
    return plot_dict
def __init__(self, db):
    self.app_id = 'PoolBasedBinaryClassification'
    self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(db)
    utils.debug_print("initialized myApp again")
def initExp(self, butler, n, d, failure_probability): # Importing here as myApp gets initialized many times during #course of experiment from ..onto_lib import general_ontology_tools as ob from ..onto_lib import load_ontology nltk.downloader.download('punkt') #This is needed for rules_vector ONT_IDS = ["4"] OGS = [load_ontology.load(ont_id)[0] for ont_id in ONT_IDS] cvcl_og = OGS[0] mongo_mem_set(butler, 'n', n) mongo_mem_set(butler, 'delta', failure_probability) mongo_mem_set(butler, 'd', d) # Initialize the weight to an empty list of 0's mongo_mem_set(butler, 'num_reported_answers', 0) #Train data form butler mem train_df = redis_mem_get(butler, "train_data") test_df = redis_mem_get(butler, "test_data") unlabelled_df = redis_mem_get(butler, "unlabelled_data") #Initializing rules #Rules are negated as all custom rules were based on a sample being NOT of certain type rules_dict = { "not_tissue": 0, "not_cell_line": 0, "not_primary_cells": 0, "not_in_vitro_differentiated_cells": 0, "not_induced_pluripotent_stem_cells": 0, "not_stem_cells": 0 } bag_of_words = [] #Creating string so that it can be vectorized later X_train_str, y_train, train_rules = create_word_vector( train_df, True, bag_of_words, ob, cvcl_og) X_test_str, y_test, test_rules = create_word_vector( test_df, True, bag_of_words, ob, cvcl_og) X_unlabelled_str, empty_y_unlabelled, unlabelled_rules = create_word_vector( unlabelled_df, False, bag_of_words, ob, cvcl_og) #Encode y_train and y_test y_train = pd.Series(y_train) sample_dict = get_encode_dict() y_train = y_train.replace(sample_dict).values y_test = pd.Series(y_test) y_test = y_test.replace(sample_dict).values # create the transform #This vectorizer is used to vectorize key-value pairs, #ontologies and ancestors of each of the ontology word_vectorizer = TfidfVectorizer(decode_error='ignore', binary=True, max_features=75, lowercase=False, token_pattern=r'\S+', stop_words='english') # This vectorizer is used to vectorize custom rules rules_vectorizer = DictVectorizer() # transform word and rule vectors # Concatenate both vectors word_vectorizer.fit(bag_of_words) rules_vectorizer.fit([rules_dict]) #DEBUG utils.debug_print("word feature name") utils.debug_print(word_vectorizer.get_feature_names()) utils.debug_print("rules feature name") utils.debug_print(rules_vectorizer.get_feature_names()) X_train_word = word_vectorizer.transform(X_train_str) X_test_word = word_vectorizer.transform(X_test_str) X_unlabelled_word = word_vectorizer.transform(X_unlabelled_str) X_train_rules = rules_vectorizer.transform(train_rules) X_test_rules = rules_vectorizer.transform(test_rules) X_unlabelled_rules = rules_vectorizer.transform(unlabelled_rules) #Combining bot vectors X_train = hstack([X_train_word, X_train_rules], format="csr") X_test = hstack([X_test_word, X_test_rules], format="csr") X_unlabelled = hstack([X_unlabelled_word, X_unlabelled_rules], format="csr") #DEBUG # utils.debug_print("size") # utils.debug_print(X_train_word) # utils.debug_print(X_test_word) # utils.debug_print(X_unlabelled_word) # # utils.debug_print(X_train_rules) # utils.debug_print(X_test_rules) # utils.debug_print(X_unlabelled_rules) # # utils.debug_print("the matrices") # utils.debug_print(X_train) # utils.debug_print(X_test) # utils.debug_print(X_unlabelled) lr_model = LogisticRegression(solver='saga', penalty='l1') lr_model.fit(X_train, y_train) y_pred = lr_model.predict(X_test) acc_init = accuracy_score(y_test, y_pred) debug_print("acc in init") debug_print(acc_init) set_updated_acc(butler, X_train.shape[0], 
acc_init) unlabelled_list = redis_mem_get(butler, "unlabelled_list") study_id_list = redis_mem_get(butler, "study_id_list") largest_val = get_largest_values(X_unlabelled, lr_model, unlabelled_list, study_id_list, d) debug_print(largest_val) sample_list = largest_val['index'].tolist() sample_probs = largest_val['prob'].tolist() #DEBUG # debug_print("largest val init") # debug_print(largest_val) # debug_print("sample prob init") # debug_print(sample_probs) lr_classes = get_decode_list(lr_model.classes_) # Print model parameters - the names and coefficients are in same order utils.debug_print(lr_model.coef_) mongo_mem_set(butler, 'lr_classes', lr_classes) redis_mem_set(butler, 'lr_classes', lr_classes) mongo_mem_set(butler, 'X_train', X_train) mongo_mem_set(butler, 'y_train', y_train) mongo_mem_set(butler, 'X_test', X_test) mongo_mem_set(butler, 'y_test', y_test) debug_print(len(unlabelled_df)) mongo_mem_set(butler, 'unlabelled_len', len(unlabelled_df)) mongo_mem_set(butler, 'labelled_list', []) redis_mem_set(butler, "X_unlabelled", X_unlabelled) #Setting sample list and probability redis_mem_set(butler, 'sample_probs', sample_probs) redis_mem_set(butler, 'sample_list', sample_list) mongo_mem_set(butler, "S_trial", json.dumps({})) cm = confusion_matrix(y_test, y_pred) mongo_mem_set(butler, "confusion_matrix", cm) return True
def check_prefix(self):
    if self.key_prefix == '':
        utils.debug_print("butler.memory is deprecated."
                          " Change to butler.experiment.memory or butler.algorithm.memory, etc."
                          " wherever appropriate")
def full_embedding_update(self, butler, args): debug_print("inside update new lrmodel") # Main function to update the model. labelled_items = mongo_mem_get(butler, 'S') X_test = mongo_mem_get(butler, "X_test") y_test = mongo_mem_get(butler, "y_test") #DEBUG # debug_print("X_test") # debug_print(X_test) X_train = mongo_mem_get(butler, "X_train") y_train = mongo_mem_get(butler, "y_train") #DEBUG # debug_print("X_train") # debug_print(X_train) debug_print("inside full update") X_unlabelled = redis_mem_get(butler, "X_unlabelled") #Get unlabelled lists unlabelled_list = redis_mem_get(butler, "unlabelled_list") labelled_list = mongo_mem_get(butler, "labelled_list") # Build a list of feature vectors and associated labels. utils.debug_print(X_unlabelled) X_unlabelled_list = list(X_unlabelled) X_labelled_list = [] y_labelled = [] labelled_index = [] bucket_id = redis_mem_get(butler, "bucket_id") batch_no = redis_mem_get(butler, "batch_no") #Use this if you need to integrate with Amazon s3 # s3.modify_csv_contents(bucket_id, 'Labels.csv', labelled_items,batch_no) #Modify contents - getting labels df labels_filename = os.path.join(self.FILE_PATH, "Labels.csv") labels_df = pd.read_csv(labels_filename) redis_mem_set(butler, "batch_no", batch_no + 1) #Iterate through the labelled items and change labels file for index, label in labelled_items: X_labelled_list.append(X_unlabelled_list[index]) y_labelled.append(label) labelled_index.append(index) labelled_row = redis_mem_get(butler, str(index)) if labelled_row is not None: labelled_list.append({labelled_row['sra_sample_id']: label}) #Updating labels file labels_df.loc[index, 'label'] = get_decode(label) labels_df.loc[index, 'dataset_type'] = 'train' labels_df.loc[index, 'batch_no'] = batch_no labels_df.to_csv(labels_filename, index=False) debug_print("X_labelled_list") debug_print(X_labelled_list) X_labelled = vstack(X_labelled_list) #Combine X_train and newly labelled vector X_train = vstack([X_labelled, X_train]) y_train = np.concatenate((y_labelled, y_train)) mongo_mem_set(butler, "X_train", X_train) mongo_mem_set(butler, "y_train", y_train) study_id_list = redis_mem_get(butler, "study_id_list") #Drop newly labelled data from unlabelled for curr_index in sorted(labelled_index, reverse=True): debug_print("removing curr_index") debug_print(curr_index) if (curr_index in unlabelled_list): debug_print("curr_index exists") #unlabelled_list consist of indices that are unlabelles #Remove indices that currently got labelled from this list #Remove from lists parallel to this which are X_unlabelled_list and study_id_list curr_index_pos = unlabelled_list.index(curr_index) X_unlabelled_list.pop(curr_index_pos) unlabelled_list.pop(curr_index_pos) study_id_list.pop(curr_index_pos) if (curr_index in unlabelled_list): utils.debug_print("index did not get removed oops") # Performing training (Retraining model along with newly labelled samples) lr_model = LogisticRegression(solver='saga', penalty='l1') utils.debug_print("X_train_len") utils.debug_print(X_train.shape[0]) lr_model.fit(X_train, y_train) lr_classes = get_decode_list(lr_model.classes_) redis_mem_set(butler, 'lr_classes', lr_classes) y_pred = lr_model.predict(X_test) cm = confusion_matrix(y_test, y_pred) redis_mem_set(butler, "confusion_matrix", cm) acc_update = accuracy_score(y_test, y_pred) debug_print("acc in update") debug_print(acc_update) set_updated_acc(butler, X_train.shape[0], acc_update) #Get d d = mongo_mem_get(butler, 'd') sample_list = redis_mem_get(butler, "sample_list") #Get queries with largest 
entropy if (len(sample_list) <= 2 * d): X_unlabelled = vstack(X_unlabelled_list) largest_val = get_largest_values(X_unlabelled, lr_model, unlabelled_list, study_id_list, d) sample_probs = redis_mem_get(butler, "sample_probs") #Contains next set of queries to be asked sample_list = sample_list + largest_val['index'].tolist() # Contains probabilities of the next set of queries to be asked sample_probs = sample_probs + largest_val['prob'].tolist() #DEBUG debug_print("sample_list") debug_print(sample_list) redis_mem_set(butler, 'sample_list', sample_list) redis_mem_set(butler, 'sample_probs', sample_probs) mongo_mem_set(butler, 'S', []) mongo_mem_set(butler, 'labelled_list', labelled_list)
def reset_redis(self, app, butler): bucket_id = os.environ.get("AWS_BUCKET_NAME") file_name_list = ['samples.csv', 'Labels.csv', 'studies.csv'] csv_content_dict = s3.get_csv_content_dict(bucket_id, file_name_list) for filename, content in csv_content_dict.items(): if filename is 'samples.csv': samples_df = pd.read_csv(io.BytesIO(content)) # utils.debug_print("database_lib corona") # utils.debug_print(samples_df.head()) elif filename is 'Labels.csv': labels_df = pd.read_csv(io.BytesIO(content)) # utils.debug_print("database_lib corona") # utils.debug_print(labels_df.head()) elif filename is 'studies.csv': study_df = pd.read_csv(io.BytesIO(content)) # utils.debug_print("database_lib corona") # utils.debug_print(study_df.head()) df_sort = labels_df.groupby(['dataset_type']) for dataset_type, df_cur in df_sort: if (dataset_type == constants.UNLABELLED_TAG): unlabelled_indices = df_cur['index'].tolist() elif (dataset_type == constants.TRAIN_TAG): train_indices = df_cur['index'].values elif (dataset_type == constants.TEST_TAG): test_indices = df_cur['index'].values batch_no = labels_df['batch_no'].max() if pd.isnull(batch_no): batch_no = 0 butler.memory.set("batch_no", pickle.dumps(batch_no + 1)) train_df = samples_df.loc[samples_df['index'].isin(train_indices)] train_df['label'] = labels_df.loc[ labels_df['index'].isin(train_indices), 'label'] test_df = samples_df.loc[samples_df['index'].isin(test_indices)] test_df['label'] = labels_df.loc[labels_df['index'].isin(test_indices), 'label'] unlabelled_df = samples_df.loc[samples_df['index'].isin( unlabelled_indices)] X_unlabelled_str = lr.create_vector(unlabelled_df) N = 2 bag_of_words = [ "differentiated", "cell", "hela", "derived", "CL:0000057", "CL:0000115", "EFO:0000322", "EFO:0000324", "EFO:0000313", "CL:0000034" ] # create the transform vectorizer = TfidfVectorizer(ngram_range=(1, N + 1), decode_error='ignore') # tokenize and build vocab try: vectorizer.fit(bag_of_words) except Exception as e: utils.debug_print(e) X_train_str, y_train = lr.create_dict(train_df) X_test_str, y_test = lr.create_dict(test_df) # Encode y_train and y_test y_train = pd.Series(y_train) sample_dict = lr.get_encode_dict() y_train = y_train.replace(sample_dict).values y_test = pd.Series(y_test) y_test = y_test.replace(sample_dict).values # encode document try: X_train = vectorizer.transform(X_train_str) except Exception as e: utils.debug_print(e) try: X_test = vectorizer.transform(X_test_str) except Exception as e: utils.debug_print(e) try: X_unlabelled = vectorizer.transform(X_unlabelled_str) except Exception as e: utils.debug_print(e) study_id_list = unlabelled_df['sra_study_id'].tolist() for i, row in unlabelled_df.iterrows(): utils.debug_print(str(row['index'])) redis_mem_set(butler, str(row['index']), row) utils.debug_print("done setting unlabelled") lr_model = LogisticRegression(penalty='l1') lr_model.fit(X_train, y_train) y_pred = lr_model.predict(X_test) acc_init = accuracy_score(y_test, y_pred) utils.debug_print("acc in app dashboard") utils.debug_print(acc_init) largest_val = lr.get_largest_values(X_unlabelled, lr_model, unlabelled_indices, study_id_list, 5) sample_list = largest_val['index'].tolist() sample_probs = largest_val['prob'].tolist() lr_classes = lr.get_decode_list(lr_model.classes_) redis_mem_set(butler, 'lr_classes', lr_classes) utils.debug_print("sample_list init") utils.debug_print(sample_list) utils.debug_print("sample prob init") utils.debug_print(sample_probs) for i, row in study_df.iterrows(): redis_mem_set(butler, row['sra_study_id'], row) 
#Making sure that the eariler batch hasnt been labelled before algo_list = butler.algorithms.get(pattern={'exp_uid': app.exp_uid}) S_trial = {} # for cur_algo in algo_list: # if cur_algo.get("alg_id") is "LogisticRegressionActive": # S_trial = json.loads(cur_algo.get("S_trial")) #Remove hardcode cur_algo = algo_list[0] cur_algo["sample_probs"] = sample_probs cur_algo["sample_list"] = sample_list lr.redis_mem_set(butler, 'sample_probs', sample_probs) lr.redis_mem_set(butler, 'sample_list', sample_list) lr.redis_mem_set(butler, "study_id_list", study_id_list) # Set data in mem # redis_mem_set(butler, "does_this_work", 5) lr.redis_mem_set(butler, "train_data", train_df) lr.redis_mem_set(butler, "bucket_id", bucket_id) lr.redis_mem_set(butler, "test_data", test_df) lr.redis_mem_set(butler, "unlabelled_data", unlabelled_df[['key_value', 'ontology_mapping']]) lr.redis_mem_set(butler, "label_data", labels_df) lr.redis_mem_set(butler, "unlabelled_list", unlabelled_indices) lr.redis_mem_set(butler, "X_unlabelled", X_unlabelled) return {}
def redis_mem_set(butler, key, value):
    # butler.memory is essentially backed by Redis; values are pickled before storage
    try:
        butler.memory.set(key, pickle.dumps(value))
    except Exception as e:
        utils.debug_print("Could not set " + key + " in redis: {}".format(e))
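# Hypothetical read-side counterpart to redis_mem_set (the real redis_mem_get
# used elsewhere in this code is not shown in this excerpt): values were
# pickled on the way in, so they are unpickled on the way out, and missing
# keys come back as None.
def redis_mem_get(butler, key):
    try:
        raw = butler.memory.get(key)
        return pickle.loads(raw) if raw is not None else None
    except Exception as e:
        utils.debug_print("Could not get " + key + " from redis: {}".format(e))
        return None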
class MyApp:
    # Data files stored inside NEXT/local
    DIR_PATH = os.path.dirname(os.path.realpath(__file__))
    FILE_PATH = os.path.join(DIR_PATH, constants.PATH_FROM_myApp)
    utils.debug_print("myApp " + FILE_PATH)

    def __init__(self, db):
        self.app_id = 'PoolBasedBinaryClassification'
        self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(db)
        utils.debug_print("initialized myApp again")

    def append(butler, row, key="data"):
        butler.memory.cache.lpush(key, pickle.dumps(row))

    def df2bytes(df):
        with BytesIO() as f:
            df.to_pickle(f)
            df_bytes = f.getvalue()
        return df_bytes

    def getitem(butler, index, key="data"):
        unlabelled_len = butler.algorithms.get(key="unlabelled_len")
        bytes_ = butler.memory.cache.lindex(key, unlabelled_len - index - 1)
        row = pickle.loads(bytes_)
        return row

    def initExp(self, butler, init_algs, args):
        utils.debug_print("experiment initialized again")
        args['n'] = len(args['targets']['targetset'])

        # Use when Amazon s3 is needed
        # bucket_id = args['bucket_id']
        # key_id = args['key_id']
        # secret_key = args['secret_key']
        # set(butler.memory, "bucket_id", bucket_id)
        # set(butler.memory, "key_id", key_id)
        # set(butler.memory, "secret_key", secret_key)

        samples_filename = self.FILE_PATH + "/samples.csv"
        samples_df = pd.read_csv(samples_filename)
        studies_filename = self.FILE_PATH + "/studies.csv"
        study_df = pd.read_csv(studies_filename)
        labels_filename = self.FILE_PATH + "/Labels.csv"
        labels_df = pd.read_csv(labels_filename)

        # Loading ontologies.
        # This particular index contains the ontologies we are concerned with.
        # Use this to integrate with s3:
        # file_name_list = ['samples.csv', 'Labels.csv', 'studies.csv']
        # csv_content_dict = s3.get_csv_content_dict(bucket_id, file_name_list)
        # for filename, content in csv_content_dict.items():
        #     if filename == 'samples.csv':
        #         samples_df = pd.read_csv(io.BytesIO(content))
        #     elif filename == 'Labels.csv':
        #         labels_df = pd.read_csv(io.BytesIO(content))
        #     elif filename == 'studies.csv':
        #         study_df = pd.read_csv(io.BytesIO(content))

        experiment = butler.experiment.get()

        # Split the label file into unlabelled / train / test index sets
        df_sort = labels_df.groupby(['dataset_type'])
        for dataset_type, df_cur in df_sort:
            if dataset_type == constants.UNLABELLED_TAG:
                unlabelled_indices = df_cur['index'].tolist()
            elif dataset_type == constants.TRAIN_TAG:
                train_indices = df_cur['index'].values
            elif dataset_type == constants.TEST_TAG:
                test_indices = df_cur['index'].values

        batch_no = labels_df['batch_no'].max()
        if pd.isnull(batch_no):
            batch_no = 0
        butler.memory.set("batch_no", pickle.dumps(batch_no + 1))

        train_df = samples_df.loc[samples_df['index'].isin(train_indices)]
        train_df['label'] = labels_df.loc[labels_df['index'].isin(train_indices), 'label']
        test_df = samples_df.loc[samples_df['index'].isin(test_indices)]
        test_df['label'] = labels_df.loc[labels_df['index'].isin(test_indices), 'label']
        unlabelled_df = samples_df.loc[samples_df['index'].isin(unlabelled_indices)]

        study_id_list = unlabelled_df['sra_study_id'].tolist()
        unlabelled_indices = unlabelled_df['index'].tolist()

        for i, row in unlabelled_df.iterrows():
            set_debug(butler.memory, str(row['index']), row, i, verbose=i % 10000 == 0)
        utils.debug_print("done setting unlabelled")

        for i, row in study_df.iterrows():
            set(butler.memory, row['sra_study_id'], row)

        train_list = []
        acc_list = []
        set(butler.memory, "study_id_list", study_id_list)

        # Set data in memory
        set(butler.memory, "train_data", train_df)
        set(butler.memory, "test_data", test_df)
        set(butler.memory, "unlabelled_data", unlabelled_df[['key_value', 'ontology_mapping']])
        set(butler.memory, "label_data", labels_df)
        set(butler.memory, "unlabelled_list", unlabelled_indices)
        butler.memory.set("train_list", pickle.dumps(train_list))
        butler.memory.set("acc_list", pickle.dumps(acc_list))

        alg_data = {
            'n': args['n'],
            'failure_probability': args['failure_probability'],
            'd': args['d']
        }
        init_algs(alg_data)
        return args

    def getQuery(self, butler, alg, args):
        sttime = time.time()
        alg_response = alg({'participant_uid': args['participant_uid']})

        # Get the unlabelled set.
        # alg_response contains the index returned from LogisticRegressionActive's
        # getQuery method; retrieve the row using this index.
        unlabelled_row = butler.memory.get(str(alg_response))
        if unlabelled_row is None:
            utils.debug_print("No row was retrieved")
            return {}
        unlabelled_row = pickle.loads(unlabelled_row).replace(np.nan, "None")
        unlabelled_row_dict = unlabelled_row.to_dict()

        sra_study_id = unlabelled_row_dict.get('sra_study_id')
        sra_sample_id = unlabelled_row_dict.get('sra_sample_id')
        key_value = unlabelled_row_dict.get('key_value')
        # Convert from str to dict
        key_value_dict = ast.literal_eval(key_value)
        ontology_mapping = unlabelled_row_dict.get('ontology_mapping')
        # Convert from str to list
        ontology_mapping_list = ast.literal_eval(ontology_mapping)

        ont_mapping_dict = {}
        if ontology_mapping_list is None:
            ontology_mapping_list = []
        for ont in ontology_mapping_list:
            ont_org = ont
            return_link = ""
            # pre-processing steps
            ont = ont.replace(":", "_")
            '''
            "DOID": "DOID.17-01-30.obo",
            "UBERON": "UBERON.17-01-30.obo",
            "CL": "CL.18-11-13.obo",
            "CVCL": "CVCL.17-01-30.obo",
            "UO": "UO.17-01-30.obo",
            "EFO": "EFO.17-01-30.obo",
            "CHEBI": "CHEBI.17-01-30.obo",
            "GO": "GO.19-01-18.obo"
            '''
            # TODO: links for the other ontology terms
            if "CL" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/cl/terms?short_form=" + ont
            elif "UBERON" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/uberon/terms?short_form=" + ont
            elif "DOID" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/doid/terms?short_form=" + ont
            elif "EFO" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + ont
            elif "CVCL" in ont:
                return_link = "https://web.expasy.org/cellosaurus/" + ont
            ont_mapping_dict[ont_org] = return_link

        # Retrieve the study row based on the study id
        study_row_str = pickle.loads(butler.memory.get(sra_study_id)).replace(np.nan, "None")
        study_row_json = study_row_str.to_dict()

        # Class-wise confidence of all classes
        cur_confidence = butler.memory.get("cur_confidence")
        if cur_confidence is None:
            cur_confidence = pickle.dumps([])
        cur_confidence = pickle.loads(cur_confidence)
        utils.debug_print(cur_confidence)

        # Get the names of the classes
        lr_classes = butler.memory.get("lr_classes")
        if lr_classes is None:
            lr_classes = pickle.dumps([])
        lr_classes = pickle.loads(lr_classes)

        # This is what is received in widgets/getQuery_widget.html
        ret = {
            'target_indices': unlabelled_row_dict,
            'study': study_row_json,
            'key_value': key_value_dict,
            'ontology_mapping': ont_mapping_dict,
            'cur_confidence': cur_confidence,
            'lr_classes': lr_classes,
            'sra_sample_id': sra_sample_id
        }
        return ret

    def processAnswer(self, butler, alg, args):
        query = butler.queries.get(uid=args['query_uid'])
        target = query['target_indices']
        target_label = args['target_label']
        # DEBUG
        # utils.debug_print("type(target_label)")
        # utils.debug_print(type(target_label))
        num_reported_answers = butler.experiment.increment(
            key='num_reported_answers_for_' + query['alg_label'])
        labelled_row = pickle.loads(butler.memory.get(str(target['index'])))
        if labelled_row is None:
            utils.debug_print("Labelled row doesn't exist")
            return {}

        # Make a getModel call ~ every n/4 queries - note that this query will
        # NOT be included in the predict.
        experiment = butler.experiment.get()
        d = experiment['args']['d']
        # if num_reported_answers % ((d + 4) / 4) == 0:
        #     butler.job('getModel', json.dumps(
        #         {'exp_uid': butler.exp_uid,
        #          'args': {'alg_label': query['alg_label'], 'logging': True}}))
        alg({'target_index': target['index'], 'target_label': target_label})
        return {'target_index': target['index'], 'target_label': target_label}

    def getModel(self, butler, alg, args):
        return alg()
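# initExp above stores rows through set(...) and set_debug(...) helpers that are
# not defined in this section. The sketches below are assumptions inferred from
# the call sites (set(butler.memory, key, value) and
# set_debug(butler.memory, key, row, i, verbose=...)), mirroring redis_mem_set's
# pickle convention; they are illustrative, not the original implementations.
def set(memory, key, value):
    # Pickle `value` and store it under `key` in Butler.memory (redis).
    try:
        memory.set(key, pickle.dumps(value))
    except Exception as e:
        utils.debug_print("Could not set " + str(key) + " in redis: " + str(e))


def set_debug(memory, key, value, i, verbose=False):
    # Same as `set`, but optionally logs progress while looping over a large
    # DataFrame (initExp calls it once per unlabelled row).
    if verbose:
        utils.debug_print("setting row {} (key={})".format(i, key))
    try:
        memory.set(key, pickle.dumps(value))
    except Exception as e:
        utils.debug_print("Could not set " + str(key) + " in redis: " + str(e))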
def get(self, exp_uid):
    """
    .. http:get:: /experiment/<exp_uid>/participants

    Get all participant response data associated with a given exp_uid.

    **Example request**:

    .. sourcecode:: http

       GET /experiment/<exp_uid>/participants HTTP/1.1
       Host: next_backend.next.discovery.wisc.edu

    **Example response**:

    .. sourcecode:: http

       HTTP/1.1 200 OK
       Vary: Accept
       Content-Type: application/json

       {
         participant_responses: [participant_responses],
         status: {
           code: 200,
           status: OK,
         },
       }

    :>json all_participant_responses: list of all participant_responses

    :statuscode 200: Participant responses successfully returned
    :statuscode 400: Participant responses failed to be generated
    """
    true_values = {1, '1', 'True', 'true'}
    zip_true = False
    if 'zip' in request.args.keys():
        zip_true = True if request.args.get('zip') in true_values else False
    csv = False
    if 'csv' in request.args.keys():
        csv = True if request.args.get('csv') in true_values else False

    # Get all participants for exp_uid from resource_manager
    participant_uids = resource_manager.get_participant_uids(exp_uid)
    participant_responses = {}

    # Iterate through the list of all participants for the specified exp_uid
    for participant in participant_uids:
        response = resource_manager.get_participant_data(participant, exp_uid)
        # Append participant query responses to the result
        participant_responses[participant] = response

    if csv:
        responses = []
        for participant in participant_uids:
            response = resource_manager.get_participant_data(participant, exp_uid)
            for r in response:
                responses += [r]
        try:
            response_file = parse_responses(responses)
        except ValueError as e:
            message = str(e)
            message += '\n\n' + str(traceback.format_exc())
            utils.debug_print(message)
            return message

    all_responses = {'participant_responses': participant_responses}
    if zip_true:
        filename, content = ('responses.json', json.dumps(all_responses))
        if request.args.get('csv'):
            filename, content = ('responses.csv', response_file.getvalue())
        zip_responses = BytesIO()
        with zipfile.ZipFile(zip_responses, 'w',
                             compression=zipfile.ZIP_DEFLATED) as zf:
            zf.writestr(filename, content)
        zip_responses.seek(0)
        return send_file(zip_responses,
                         attachment_filename=filename + '.zip',
                         as_attachment=True)
    else:
        return api_util.attach_meta(all_responses, meta_success), 200
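# Usage illustration (not part of the handler): a client can fetch the zipped
# CSV of participant responses with zip=1&csv=1. The host/port and URL prefix
# below are placeholders, and `requests` is only an assumed client dependency.
import requests

exp_uid = 'your_exp_uid_here'
url = 'http://localhost:8000/api/experiment/{}/participants'.format(exp_uid)
r = requests.get(url, params={'zip': 1, 'csv': 1})
with open('responses.csv.zip', 'wb') as f:
    f.write(r.content)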
def post(self):
    # TODO? replace with msgpack
    args = self.deserialise(request.get_data())

    # Unpacking the YAML/ZIP file
    for key in args:
        if key not in {'bucket_id', 'key_id', 'secret_key'}:
            comma_idx = args[key].find(',')
            args[key] = args[key][comma_idx + 1:]
            if args[key] in {'True', 'False'}:
                args[key] = True if args[key] == 'True' else False
            else:
                args[key] = base64.decodestring(args[key])

    if all([key not in args for key in ['bucket_id', 'key_id', 'secret_key']]):
        args['upload'] = False
    else:
        args['upload'] = True

    args['args'] = yaml.load(args['args'])

    try:
        init_exp_args = args['args']
        if 'targets' in args.keys():
            target_zipfile = args['targets']
            if args.get('upload', True):
                bucket_id = args['bucket_id']
                key_id = args['key_id']
                secret_key = args['secret_key']
                init_exp_args['args']['bucket_id'] = bucket_id
                init_exp_args['args']['key_id'] = key_id
                init_exp_args['args']['secret_key'] = secret_key
                targets = target_unpacker.unpack(target_zipfile, key_id,
                                                 secret_key, bucket_id)
            else:
                filenames = target_unpacker.get_filenames_from_zip(target_zipfile)
                utils.debug_print("This will be bold and yellow!")
                utils.debug_print(filenames)
                utils.debug_print("This will be bold and yellow!")
                if len(filenames) != 1:
                    raise ValueError('Specify exactly one file in the ZIP file')
                filename = filenames[0]
                extension = filename.split('.')[-1]
                targets = target_unpacker.unpack_text_file(target_zipfile,
                                                           kind=extension)
            init_exp_args['args']['targets'] = {'targetset': targets}

        if 'keys_for_all_targets' in init_exp_args['args']:
            pairs = init_exp_args['args']['keys_for_all_targets']
            for pair in pairs:
                map(lambda target: target.update({pair['key']: pair['value']}),
                    init_exp_args['args']['targets']['targetset'])

        # Init the experiment:
        app_id = init_exp_args['app_id']
        exp_uid = '%030x' % random.randrange(16**30)

        r = broker.applyAsync(app_id, exp_uid, 'initExp',
                              json.dumps(init_exp_args))
        response_json, didSucceed, message = r

        if not didSucceed:
            raise ValueError(message)
    except:
        tb = traceback.format_exc()
        info = sys.exc_info()
        if hasattr(info[1], 'message') and len(info[1].message) > 0:
            message = info[1].message
            if 'time' in message:
                message += ("\nNOTE: error has to do with time; try "
                            "restarting docker, more detail at "
                            "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and")
        else:
            message = str(info[1]) + str(info[-1])
            message = '\n'.join(tb.split('\n')[-5:])
        message = message + '\n\nDetails:\n' + tb
        return {'success': False, 'message': message, 'exp_uid': None}

    return {
        'success': didSucceed,
        'message': message,
        'exp_uid': exp_uid,
        'app_id': args['args']['app_id']
    }
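# Illustration of the keys_for_all_targets expansion above: each key/value pair
# is copied onto every target in the targetset. The data here is hypothetical.
targetset = [{'target_id': '0'}, {'target_id': '1'}]
pairs = [{'key': 'primary_type', 'value': 'text'}]
for pair in pairs:
    for target in targetset:
        target.update({pair['key']: pair['value']})
# targetset is now:
# [{'target_id': '0', 'primary_type': 'text'},
#  {'target_id': '1', 'primary_type': 'text'}]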
def pop_list(self, database_id, bucket_id, doc_uid, key, value):
    """
    Pops a value from a list. If value=0, pops from the start of the list;
    if value=-1, pops from the end of the list. (Note this is inconsistent
    with Mongo's API in order to be consistent with Python's pop.)
    See the declaration of mongo_index for more info.

    Inputs:
        (string) database_id, (string) bucket_id, (string) doc_uid,
        (string) key, (int) value

    Outputs:
        (any) value, (bool) didSucceed, (string) message

    Usage: ::\n
        value, didSucceed, message = db.pop_list(database_id, bucket_id, doc_uid, key, value)
    """
    if self.client is None:
        didSucceed, message = self.connectToMongoServer()
        if not didSucceed:
            return None, False, message

    # For Mongo's $pop, 1 means last element, -1 means first element
    try:
        if value == -1:
            mongo_index = 1
        elif value == 0:
            mongo_index = -1
        else:
            raise DatabaseException(
                "can only pop first (value=0) or last (value=-1) element")

        try:
            return_value = self.client[database_id][bucket_id].find_and_modify(
                {"_id": doc_uid}, {'$pop': {key: mongo_index}})[key]
        except KeyError as e:
            if e.args[0] == key:
                raise DatabaseException(
                    "key '{}' not found in document '{}.{}'".format(
                        key, database_id, bucket_id))
            elif e.args[0] == bucket_id:
                raise DatabaseException(
                    "bucket '{}' not found in database '{}'".format(
                        bucket_id, database_id))
            elif e.args[0] == database_id:
                raise DatabaseException(
                    "database '{}' not found".format(database_id))
            else:
                raise DatabaseException(
                    "unknown KeyError: '{}' not found".format(e))
        except OperationFailure:
            # This gets thrown if you try to pop from a non-list
            raise DatabaseException("cannot pop from non-list")

        if return_value:
            return_value = return_value[value]
        else:
            raise DatabaseException("cannot pop from empty list")

        return_value = self.undoDatabaseFormat(return_value)
        return return_value, True, 'From Mongo'
    except DatabaseException as e:
        error = "PermStore.pop_list failed with exception: {}".format(e)
        utils.debug_print(error)
        return None, False, error
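# Usage sketch for pop_list (the database/bucket identifiers below are
# placeholders, not from the original code): value=0 pops the first element
# of the stored list and value=-1 pops the last.
first, didSucceed, message = db.pop_list('next_app', 'experiments', doc_uid,
                                         'query_queue', 0)
last, didSucceed, message = db.pop_list('next_app', 'experiments', doc_uid,
                                        'query_queue', -1)
if not didSucceed:
    utils.debug_print(message)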
def mongo_mem_set(butler, key, value):
    # Butler.algorithms is essentially a set in MongoDB
    try:
        butler.algorithms.set(key=key, value=value)
    except Exception as e:
        utils.debug_print("Could not set " + key + " in mongodb: " + str(e))
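# Purely for symmetry, a hypothetical companion getter (not in the original):
# read `key` back from Butler.algorithms (MongoDB), returning `default` if the
# lookup fails or the key is missing.
def mongo_mem_get(butler, key, default=None):
    try:
        value = butler.algorithms.get(key=key)
        return default if value is None else value
    except Exception as e:
        utils.debug_print("Could not get " + key + " from mongodb: " + str(e))
        return default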
def getModel(self, butler):
    # The model is simply the vector of weights and a record of the number of
    # reported answers.
    model = butler.algorithms.get(key=['weights', 'num_reported_answers'])
    utils.debug_print(model)
    return model