Example #1
File: App.py Project: nextml/NEXT
    def getModel(self, exp_uid, args_json):
        try:
            args_dict = self.helper.convert_json(args_json)
            args_dict = verifier.verify(args_dict, self.reference_dict['getModel']['args'])
            alg_label = args_dict['args']['alg_label']
            args = self.butler.experiment.get(key='args')
            for algorithm in args['alg_list']:
                if alg_label == algorithm['alg_label']:
                    alg_id = algorithm['alg_id']

            myapp_response = self.call_app_fn(alg_label, alg_id, 'getModel', args_dict)

            myapp_response['exp_uid'] = exp_uid
            myapp_response['alg_label'] = alg_label
            # Log the response of the getModel in ALG-EVALUATION
            if args_dict['args']['logging']:
                alg_log_entry = {'exp_uid': exp_uid, 'alg_label':alg_label, 'task': 'getModel', 'timestamp': str(utils.datetimeNow())}
                alg_log_entry.update(myapp_response)
                self.butler.log('ALG-EVALUATION', alg_log_entry)
            return json.dumps({'args': myapp_response,
                               'meta': {'log_entry_durations':self.log_entry_durations,
                                        'timestamp': str(utils.datetimeNow())}}), True, ''
        except Exception, error:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            full_error = str(traceback.format_exc())+'\n'+str(error)
            utils.debug_print("getModel Exception: " + full_error, color='red')
            log_entry = { 'exp_uid':exp_uid,'task':'getModel','error':full_error,'timestamp':utils.datetimeNow(),'args_json':args_json }
            self.butler.ell.log( self.app_id+':APP-EXCEPTION', log_entry  )
            traceback.print_tb(exc_traceback)
            return '{}', False, str(error)
Example #2
File: App.py Project: nextml/NEXT
    def processAnswer(self, exp_uid, args_json):
        try:
            args_dict = self.helper.convert_json(args_json)
            args_dict = verifier.verify(args_dict, self.reference_dict['processAnswer']['args'])
            # Update timing info in query
            query = self.butler.queries.get(uid=args_dict['args']['query_uid'])
            timestamp_answer_received = args_dict['args'].get('timestamp_answer_received', None)
            delta_datetime = utils.str2datetime(timestamp_answer_received) - \
                             utils.str2datetime(query['timestamp_query_generated'])
            round_trip_time = delta_datetime.total_seconds()
            response_time = float(args_dict['args'].get('response_time',0.))

            query_update = self.call_app_fn(query['alg_label'], query['alg_id'], 'processAnswer', args_dict)
            query_update.update({'response_time':response_time,
                                 'network_delay':round_trip_time - response_time,
                                 'timestamp_answer_received': timestamp_answer_received
                                 })
            self.butler.queries.set_many(uid=args_dict['args']['query_uid'],key_value_dict=query_update)

            return json.dumps({'args': {}, 'meta': {'log_entry_durations':self.log_entry_durations}}), True, ''

        except Exception, error:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            full_error = str(traceback.format_exc())+'\n'+str(error)
            utils.debug_print("processAnswer Exception: " + full_error, color='red')
            log_entry = { 'exp_uid':exp_uid,'task':'processAnswer','error':full_error,'timestamp':utils.datetimeNow(),'args_json':args_json }
            self.butler.ell.log( self.app_id+':APP-EXCEPTION', log_entry  )
            traceback.print_tb(exc_traceback)
            raise Exception(error)
Example #3
def upload_target(filename, file_obj, bucket_name, aws_key, aws_secret_key,
                  i=None, get_bucket=True):
    if get_bucket:
        bucket = s3.get_bucket(bucket_name, aws_key, aws_secret_key)
    else:
        bucket = s3.create_bucket(bucket_name, aws_key, aws_secret_key)

    utils.debug_print('Uploading target: {}'.format(filename))
    url = s3.upload(filename,  StringIO(file_obj), bucket)
    target_types = {'png': 'image', 'jpeg': 'image', 'jpg': 'image', 'gif': 'image',
                    'mp4': 'video', 'mov': 'video',
                    'txt': 'text', 'csv': 'text'}
    filetype = filename.split('.')[-1]
    if filetype not in target_types:
        msg = ('Target not recognized (extension: "{}"). '
               'Available extensions: {}').format(filetype, list(target_types.keys()))
        raise ValueError(msg)

    utils.debug_print('Done uploading target: {}'.format(filename))

    return {'target_id': str(i),
            'primary_type': target_types[filetype],
            'primary_description': url,
            'alt_type': 'text',
            'alt_description': filename}
Example #4
def unpack(s, aws_key, aws_secret_key, bucket_name, n_jobs=None,
           get_bucket=True):
    base64_zip = io.BytesIO(s)
    zip_file = zipfile.ZipFile(base64_zip)
    files = zipfile_to_dictionary(zip_file)

    if not n_jobs:
        n_jobs = min(len(files), 50)

    if not bucket_name:
        bucket_name = '{}{}'.format(aws_key.lower(), utils.random_string(length=20))

    # TODO: trim here for JSON object to append to dictionaries
    # TODO: manage CSV targets here
    # TODO: how come creating a S3 bucket isn't working for me?
    utils.debug_print('Beginning to upload targets')
    try:
        targets = Parallel(n_jobs=n_jobs, backend='threading') \
                    (delayed(upload_target, check_pickle=False)
                              (name, file, bucket_name, aws_key, aws_secret_key,
                               i=i, get_bucket=True)
                   for i, (name, file) in enumerate(files.items()))
    except:
        utils.debug_print('Whoops, parallel upload failed. '
                          'Trying with {} threads'.format(n_jobs))
        targets = [upload_target(name, file, bucket_name, aws_key, aws_secret_key,
                                 i=i, get_bucket=True)
                   for i, (name, file) in enumerate(files.items())]
    return targets
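A note on the snippet above: the check_pickle keyword passed to delayed is only accepted by older joblib releases. Below is a minimal, self-contained sketch of the same Parallel/delayed threading pattern; upload_one and the sample files are invented stand-ins for upload_target and the real zip contents.

from joblib import Parallel, delayed

def upload_one(name, data, i=None):
    # stand-in for upload_target: pretend to upload and return a target record
    return {'target_id': str(i), 'filename': name, 'size': len(data)}

files = {'a.txt': 'hello', 'b.txt': 'world'}
targets = Parallel(n_jobs=2, backend='threading')(
    delayed(upload_one)(name, data, i=i)
    for i, (name, data) in enumerate(files.items()))
print(targets)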
Example #5
 def lock(self, name, **kwargs):
     try:
         self.ensure_connection()
         name = self.key_prefix + name
         return self.cache.lock(name, **kwargs)
     except Exception as e:
         utils.debug_print("Butler.Collection.Memory.lock exception: {}".format(e))
         return None
Example #6
 def exists(self, key):
     try:
         self.ensure_connection()
         key = self.key_prefix + key
         return self.cache.exists(key)
     except Exception as e:
         utils.debug_print("Butler.Collection.Memory.exists exception: {}".format(e))
         return None
Example #7
    def pop_list(self, database_id, bucket_id, doc_uid, key, value):
        """
        pops a value from a list.
        If value=0, pops from start of list
        If value=-1, pops from end of list.
        (note this is inconsistent with Mongo's api to be consistent with python's pop)
        See declaration of mongo_index for more info.
        
        Inputs: 
            (string) database_id, (string) bucket_id, (string) doc_uid, (string) key, (int) value
        
        Outputs:
            (any) value, (bool) didSucceed, (string) message 
        
        Usage: ::\n
            value, didSucceed, message = db.pop_list(database_id, bucket_id, doc_uid, key, value)
        """
        if self.client is None:
            didSucceed, message = self.connectToMongoServer()
            if not didSucceed:
                return None, False, message
        # For Mongo's $pop, 1 means last element, -1 means first element
        try:
            if value == -1:
                mongo_index = 1
            elif value == 0:
                mongo_index = -1
            else:
                raise DatabaseException("can only pop first (value=0) or last (value=-1) element")
            try:
                return_value = self.client[database_id][bucket_id].find_and_modify({"_id": doc_uid},
                                                                                   {'$pop': {key: mongo_index}})[key]
            except KeyError as e:
                if e.args[0] == key:
                    raise DatabaseException("key '{}' not found in document '{}.{}'".format(key, database_id, bucket_id))
                elif e.args[0] == bucket_id:
                    raise DatabaseException("bucket '{}' not found in database '{}'".format(bucket_id, database_id))
                elif e.args[0] == database_id:
                    raise DatabaseException("database '{}' not found".format(database_id))
                else:
                    raise DatabaseException("unknown KeyError: '{}' not found".format(e))
            except OperationFailure:  # This gets thrown if you try to pop from a non-list
                raise DatabaseException("cannot pop from non-list")
            if return_value:
                return_value = return_value[value]
            else:
                raise DatabaseException("cannot pop from empty list")
            return_value = self.undoDatabaseFormat(return_value)

            return return_value, True, 'From Mongo'
        except DatabaseException as e:
            error = "PermStore.pop_list failed with exception: {}".format(e)
            utils.debug_print(error)
            return None, False, error
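The value-to-mongo_index translation above is easy to misread. Here is a stand-alone sketch (plain Python, no Mongo involved) of the convention the docstring describes; pop_like_pop_list is an invented name used only for illustration.

def pop_like_pop_list(lst, value):
    # value=0 pops from the start, value=-1 pops from the end (Python-style);
    # the code above translates this to Mongo's $pop, where -1 = first and 1 = last.
    if value not in (0, -1):
        raise ValueError("can only pop first (value=0) or last (value=-1) element")
    return lst.pop(0 if value == 0 else -1)

items = ['a', 'b', 'c']
assert pop_like_pop_list(items, 0) == 'a'   # corresponds to mongo_index = -1
assert pop_like_pop_list(items, -1) == 'c'  # corresponds to mongo_index = 1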
Example #8
File: myAlg.py Project: nextml/NEXT
    def getQuery(self, butler):
        num_ans = butler.algorithms.get(key='num_reported_answers')
        query_list = butler.algorithms.get(key='query_list')
        i = num_ans % len(query_list)

        query = query_list[i]
        utils.debug_print(query)
        #  butler.participants.set(key='query', value=query)

        # append the current query to do_not_ask
        #  butler.algorithms.append(key='do_not_ask', value=query)
        return query[2], query[0], query[1]
Example #9
 def set(self, key, value):
     self.check_prefix()
     key = self.key_prefix + key
     try:
         self.ensure_connection()
         l = len(value)
         n = self.num_entries(l)
         utils.debug_print("Setting {} in {} entries".format(l, n))
         for i in range(n):
             k = key + ":" + str(i)
             self.cache.set(k, value[i*self.max_entry_size:(i+1)*self.max_entry_size])
         return self.cache.set(key, "{}:{}".format(str(n), str(l)))
     except Exception as e:
         utils.debug_print("Butler.Collection.Memory.set exception: {}".format(e))
         return False
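The set method above depends on a num_entries helper that is not shown in these examples; presumably it computes how many max_entry_size chunks the payload needs. A self-contained sketch of that chunking scheme follows; MAX_ENTRY_SIZE, num_entries and chunk are assumptions made for illustration, not the project's actual code.

import math

MAX_ENTRY_SIZE = 4  # unrealistically small, just to make the chunking visible

def num_entries(length):
    # assumed behaviour: ceiling division of the payload length by the chunk size
    return int(math.ceil(length / float(MAX_ENTRY_SIZE)))

def chunk(key, value):
    # mirrors the loop above: pieces go to key:0, key:1, ...,
    # and the master key records "<num_chunks>:<total_length>"
    parts = {"{}:{}".format(key, i): value[i * MAX_ENTRY_SIZE:(i + 1) * MAX_ENTRY_SIZE]
             for i in range(num_entries(len(value)))}
    parts[key] = "{}:{}".format(num_entries(len(value)), len(value))
    return parts

print(chunk("doc", "hello world"))
# e.g. {'doc:0': 'hell', 'doc:1': 'o wo', 'doc:2': 'rld', 'doc': '3:11'}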
Example #10
def docs(app_id=None,form="raw"):
    if app_id:
        filename = '{0}/myApp.yaml'.format(app_id)

        utils.debug_print(filename)
        api,blank,pretty = doc_gen.get_docs(filename,'apps/')
        
        if form == "pretty":
            return render_template('doc.html',doc_string=pretty, base_dir="/assistant/static")
        elif form == "blank":
            return render_template('raw.html',doc=blank)
        elif form == "raw":
            return render_template('raw.html',doc=api)

    message = ('Welcome to the next.discovery system.\n '
               'Available apps {}'.format(', '.join(utils.get_supported_apps())))

    return render_template('raw.html',doc=message)
Example #11
 def set_file(self, key, f):
     self.check_prefix()
     key = self.key_prefix + key
     try:
         self.ensure_connection()
         f.seek(0, os.SEEK_END)
         l = f.tell()
         f.seek(0, 0)
         n = self.num_entries(l)
         utils.debug_print("Setting {} bytes in {} entries".format(l, n))
         for i in range(n):
             k = key + ":" + str(i)
             v = f.read(self.max_entry_size)
             self.cache.set(k, v)
         return self.cache.set(key, "{}:{}".format(str(n), str(l)))
     except Exception as e:
         utils.debug_print("Butler.Collection.Memory.set_file exception: {}".format(e))
         return False
Example #12
 def get(self, key):
     self.check_prefix()
     try:
         self.ensure_connection()
         key = self.key_prefix + key
         d = self.cache.get(key)
         n, l = d.split(":")
         l = int(l)
         n = int(n)
         ans = ""
         utils.debug_print("Getting {} bytes in {} entries".format(l, n))
         for i in range(n):
             k = key + ":" + str(i)
             ans += self.cache.get(k)
         return ans
     except Exception as e:
         utils.debug_print("Butler.Collection.Memory.get exception: {}".format(e))
         return None
Example #13
    def initExp(self, butler, init_algs, args):
        utils.debug_print("AA: "+str(args))
        if 'targetset' in args['targets'].keys():
            n  = len(args['targets']['targetset'])
            self.TargetManager.set_targetset(butler.exp_uid, args['targets']['targetset'])

        d = len(args['targets']['targetset'][0]['meta']['features'])
        args['n'] = n
        args['d'] = d
        del args['targets']

        alg_data = {}
        algorithm_keys = ['n','failure_probability']
        for key in algorithm_keys:
            if key in args:
                alg_data[key]=args[key]
        init_algs(alg_data)
        return args
Example #14
 def get_file(self, key):
     self.check_prefix()
     try:
         self.ensure_connection()
         key = self.key_prefix + key
         d = self.cache.get(key)
         f = StringIO.StringIO()
         n, l = d.split(":")
         l = int(l)
         n = int(n)
         utils.debug_print("Getting {} bytes in {} entries".format(l, n))
         for i in range(n):
             k = key + ":" + str(i)
             f.write(self.cache.get(k))
         f.seek(0, 0)
         return f
     except Exception as e:
         utils.debug_print("Butler.Collection.Memory.get_file exception: {}".format(e))
         return None
Example #15
def upload_target(filename, file_obj, bucket_name, aws_key, aws_secret_key,
                  i=None, get_bucket=True):
    if get_bucket:
        bucket = s3.get_bucket(bucket_name, aws_key, aws_secret_key)
    else:
        bucket = s3.create_bucket(bucket_name, aws_key, aws_secret_key)

    utils.debug_print('begin ' + filename)
    url = s3.upload(filename,  StringIO(file_obj), bucket)
    target_types = {'png': 'image', 'jpeg': 'image', 'jpg': 'image',
                    'mp4': 'movie', 'mov': 'movie',
                    'txt': 'text', 'csv': 'text'}
    utils.debug_print('end ' + filename)

    return {'target_id': str(i),
            'primary_type': target_types[filename.split('.')[-1]],
            'primary_description': url,
            'alt_type': 'text',
            'alt_description': filename}
Example #16
 def set_file(self, key, f):
     self.check_prefix()
     key = self.key_prefix + key
     try:
         self.ensure_connection()
         f.seek(0, os.SEEK_END)
         l = f.tell()
         f.seek(0, 0)
         n = self.num_entries(l)
         utils.debug_print("Setting {} bytes in {} entries".format(l, n))
         for i in range(n):
             k = key + ":" + str(i)
             v = f.read(self.max_entry_size)
             self.cache.set(k, v)
         return self.cache.set(key, "{}:{}".format(str(n), str(l)))
     except Exception as e:
         utils.debug_print(
             "Butler.Collection.Memory.set_file exception: {}".format(e))
         return False
Example #17
def unpack(s,
           aws_key,
           aws_secret_key,
           bucket_name,
           n_jobs=None,
           get_bucket=True):
    base64_zip = io.BytesIO(s)
    zip_file = zipfile.ZipFile(base64_zip)
    files = zipfile_to_dictionary(zip_file)

    if not n_jobs:
        n_jobs = min(len(files), 50)

    if not bucket_name:
        bucket_name = '{}{}'.format(aws_key.lower(),
                                    utils.random_string(length=20))

    # TODO: trim here for JSON object to append to dictionaries
    # TODO: manage CSV targets here
    # TODO: how come creating a S3 bucket isn't working for me?
    utils.debug_print('=== Starting upload of targets to S3 ===')
    try:
        targets = Parallel(n_jobs=n_jobs, backend='threading') \
                    (delayed(upload_target, check_pickle=False)
                              (name, file, bucket_name, aws_key, aws_secret_key,
                               i=i, get_bucket=True)
                   for i, (name, file) in enumerate(files.items()))
    except:
        utils.debug_print(
            'Whoops, parallel S3 upload failed. Trying serially.')
        targets = [
            upload_target(name,
                          file,
                          bucket_name,
                          aws_key,
                          aws_secret_key,
                          i=i,
                          get_bucket=True)
            for i, (name, file) in enumerate(files.items())
        ]

    return targets
Example #18
    def modelUpdate(self, butler, task_args):
        arm_id = task_args['arm_id']
        reward = task_args['reward']
        participant_uid = task_args['participant_uid']

        d = butler.algorithms.get(key='d')
        R = butler.algorithms.get(key='R')
        S = butler.algorithms.get(key='S')
        delta = butler.algorithms.get(key='delta')
        ridge = butler.algorithms.get(key='ridge')

        invVt = np.array(
            butler.participants.get(uid=participant_uid, key='invVt'))
        b = np.array(butler.participants.get(uid=participant_uid, key='b'))
        features = np.load('features.npy')
        x_invVt_norm = butler.participants.get(uid=participant_uid,
                                               key='x_invVt_norm')
        t = butler.participants.get(uid=participant_uid,
                                    key='num_reported_answers')

        xt = features[arm_id, :]
        u = invVt.dot(xt)
        invVt -= np.outer(u, u) / (1 + np.inner(xt, u))
        x_invVt_norm -= np.dot(features, u)**2 / (1 + np.inner(xt, u))
        b += reward * xt
        theta_hat = invVt.dot(b)
        utils.debug_print((1 + t / (ridge * d)))
        sqrt_beta = R * np.sqrt(d * np.log(
            (1 + t / (ridge * d)) / delta)) + np.sqrt(ridge) * S
        expected_rewards = np.dot(
            features, theta_hat) + sqrt_beta * np.sqrt(x_invVt_norm)

        butler.participants.set(uid=participant_uid,
                                key='arm_order',
                                value=np.argsort(expected_rewards)[::-1])
        butler.participants.set(uid=participant_uid, key='invVt', value=invVt)
        butler.participants.set(uid=participant_uid, key='b', value=b)
        butler.participants.set(uid=participant_uid,
                                key='x_invVt_norm',
                                value=x_invVt_norm)

        return True
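The invVt update above is a rank-one (Sherman-Morrison) update of the inverse design matrix, as used in LinUCB-style linear bandits. The short numpy check below verifies that identity with arbitrary values; it only illustrates what the lines around np.outer(u, u) compute.

import numpy as np

np.random.seed(0)
d = 5
A = np.random.rand(d, d)
V = np.eye(d) + A.dot(A.T)      # a positive-definite design matrix
invV = np.linalg.inv(V)
x = np.random.rand(d)           # feature vector of the pulled arm

# Sherman-Morrison: (V + x x^T)^-1 = V^-1 - (V^-1 x)(V^-1 x)^T / (1 + x^T V^-1 x)
u = invV.dot(x)
invV_updated = invV - np.outer(u, u) / (1 + np.inner(x, u))

assert np.allclose(invV_updated, np.linalg.inv(V + np.outer(x, x)))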
Example #19
 def get_file(self, key):
     self.check_prefix()
     try:
         self.ensure_connection()
         key = self.key_prefix + key
         d = self.cache.get(key)
         f = StringIO.StringIO()
         n, l = d.split(":")
         l = int(l)
         n = int(n)
         utils.debug_print("Getting {} bytes in {} entries".format(l, n))
         for i in range(n):
             k = key + ":" + str(i)
             f.write(self.cache.get(k))
         f.seek(0, 0)
         return f
     except Exception as e:
         utils.debug_print(
             "Butler.Collection.Memory.get_file exception: {}".format(e))
         return None
Example #20
File: App.py Project: nextml/NEXT
 def initExp(self, exp_uid, args_json):
     try:
         self.helper.ensure_indices(self.app_id,self.butler.db, self.butler.ell)
         args_dict = self.helper.convert_json(args_json)
         args_dict = verifier.verify(args_dict, self.reference_dict['initExp']['args'])
         args_dict['exp_uid'] = exp_uid # to get doc from db
         args_dict['start_date'] = utils.datetime2str(utils.datetimeNow())
         self.butler.admin.set(uid=exp_uid,value={'exp_uid': exp_uid, 'app_id':self.app_id, 'start_date':str(utils.datetimeNow())})
         self.butler.experiment.set(value={'exp_uid': exp_uid})
         args_dict['args'] = self.init_app(exp_uid, args_dict['args']['alg_list'], args_dict['args'])
         args_dict['git_hash'] = git_hash
         self.butler.experiment.set_many(key_value_dict=args_dict)
         return '{}', True, ''
     except Exception, error:
         exc_type, exc_value, exc_traceback = sys.exc_info()
         full_error = str(traceback.format_exc())+'\n'+str(error)
         utils.debug_print("initExp Exception: " + full_error, color='red')
         log_entry = { 'exp_uid':exp_uid,'task':'initExp','error':full_error,'timestamp':utils.datetimeNow(),'args_json':args_json }
         self.butler.ell.log( self.app_id+':APP-EXCEPTION', log_entry  )
         traceback.print_tb(exc_traceback)
         return '{}', False, str(error)
Example #21
def docs(app_id=None, form="raw"):
    if app_id:
        filename = '{0}/myApp.yaml'.format(app_id)

        utils.debug_print(filename)
        api, blank, pretty = doc_gen.get_docs(filename, 'apps/')

        if form == "pretty":
            return render_template('doc.html',
                                   doc_string=pretty,
                                   base_dir="/assistant/static")
        elif form == "blank":
            return render_template('raw.html', doc=blank)
        elif form == "raw":
            return render_template('raw.html', doc=api)

    message = ('Welcome to the next.discovery system.\n '
               'Available apps {}'.format(', '.join(
                   utils.get_supported_apps())))

    return render_template('raw.html', doc=message)
Example #22
 def initExp(self, exp_uid, args_json):
     try:
         self.helper.ensure_indices(self.app_id,self.butler.db, self.butler.ell)
         args_dict = self.helper.convert_json(args_json)
         args_dict = verifier.verify(args_dict, self.reference_dict['initExp']['args'])
         args_dict['exp_uid'] = exp_uid # to get doc from db
         args_dict['start_date'] = utils.datetime2str(utils.datetimeNow())
         self.butler.admin.set(uid=exp_uid,value={'exp_uid': exp_uid, 'app_id':self.app_id, 'start_date':str(utils.datetimeNow())})            
         utils.debug_print("ASD "+str(args_dict))
         args_dict['args'] = self.init_app(exp_uid, args_dict['args']['alg_list'], args_dict['args'])
         args_dict['git_hash'] = git_hash
         self.butler.experiment.set(value=args_dict)
         return '{}', True, ''
     except Exception, error:
         exc_type, exc_value, exc_traceback = sys.exc_info()
         full_error = str(traceback.format_exc())+'\n'+str(error)
         utils.debug_print("initExp Exception: " + full_error, color='red')
         log_entry = { 'exp_uid':exp_uid,'task':'initExp','error':full_error,'timestamp':utils.datetimeNow(),'args_json':args_json } 
         self.butler.ell.log( self.app_id+':APP-EXCEPTION', log_entry  )
         traceback.print_tb(exc_traceback)
         return '{}', False, str(error)
Example #23
    def processAnswer(self, butler, alg, args):
        query = butler.queries.get(uid=args['query_uid'])

        target = query['target_indices']
        target_label = args['target_label']
        #DEBUG
        # utils.debug_print("type(target_label)")
        # utils.debug_print(type(target_label))
        num_reported_answers = butler.experiment.increment(
            key='num_reported_answers_for_' + query['alg_label'])
        labelled_row = pickle.loads(butler.memory.get(str(target['index'])))
        if labelled_row is None:
            utils.debug_print("Labelled row doesnt exist")
            return {}
        # make a getModel call ~ every n/4 queries - note that this query will NOT be included in the predict
        experiment = butler.experiment.get()
        d = experiment['args']['d']
        # if num_reported_answers % ((d+4)/4) == 0:
        #     butler.job('getModel', json.dumps({'exp_uid':butler.exp_uid,'args':{'alg_label':query['alg_label'], 'logging':True}}))
        alg({'target_index': target['index'], 'target_label': target_label})
        return {'target_index': target['index'], 'target_label': target_label}
Example #24
    def getQuery(self, butler, alg, args):
        experiment = butler.experiment.get()
        n = experiment['args']['n']
        exp_uid = experiment['exp_uid']
        participant_uid = args['participant_uid']
        num_responses = butler.participants.get(uid=participant_uid, key='num_responses')
        init_arm = int(args['init_arm'])
        print('init_arm:', init_arm)
        if num_responses == 0 or num_responses is None:
            butler.participants.set(uid=participant_uid, key='init_arm', value=init_arm)
            arm_order = range(n)
            np.random.shuffle(arm_order)
            butler.participants.set(uid=participant_uid, key='arm_order', value=arm_order)
            butler.participants.set(uid=participant_uid, key='do_not_ask', value=[init_arm])
            print('Initialized lists in getQuery')

        alg_response = alg({'participant_uid': participant_uid})
        exp_uid = butler.exp_uid

        if num_responses == 0 or num_responses is None:
            butler.participants.set(uid=participant_uid, key='init_arm', value=init_arm)
            arm_order = range(n)
            np.random.shuffle(arm_order)
            butler.participants.set(uid=participant_uid, key='arm_order', value=arm_order)

        utils.debug_print('Alg_resp:', alg_response)

        target = self.TargetManager.get_target_item(exp_uid, alg_response)
        init_target = init_arm and self.TargetManager.get_target_item(exp_uid, init_arm)

        return_dict = {
            'target_indices': [alg_response],
            'targets': [target],
            'init_target': init_target,
            'instructions': 'Is this the kind of document you are looking for?',
            # 'instructions': 'Is this the kind of image you are looking for?',
            'count': 1,
        }

        return return_dict
Example #25
def upload_target(filename,
                  file_obj,
                  bucket_name,
                  aws_key,
                  aws_secret_key,
                  i=None,
                  get_bucket=True):
    if get_bucket:
        bucket = s3.get_bucket(bucket_name, aws_key, aws_secret_key)
    else:
        bucket = s3.create_bucket(bucket_name, aws_key, aws_secret_key)

    utils.debug_print('Uploading target: {}'.format(filename))
    url = s3.upload(filename, StringIO(file_obj), bucket)
    target_types = {
        'png': 'image',
        'jpeg': 'image',
        'jpg': 'image',
        'gif': 'image',
        'mp4': 'video',
        'mov': 'video',
        'txt': 'text',
        'csv': 'text'
    }
    filetype = filename.split('.')[-1]
    if filetype not in target_types:
        msg = ('Target not recognized (extension: "{}"). '
               'Available extensions: {}').format(filetype,
                                                  list(target_types.keys()))
        raise ValueError(msg)

    utils.debug_print('Done uploading target: {}'.format(filename))

    return {
        'target_id': str(i),
        'primary_type': target_types[filetype],
        'primary_description': url,
        'alt_type': 'text',
        'alt_description': filename
    }
Example #26
    def post(self):
        utils.debug_print('experiment:58',request.data)
        post_parser = exp_parser.copy()
        post_parser.add_argument('app_id', type=str, required=True)
        post_parser.add_argument('args', type=dict, required=True)
        # Validate args with post_parser
        args_data = post_parser.parse_args()
        utils.debug_print(args_data)
        app_id = args_data['app_id']
        utils.debug_print(app_id)
        # Create and set exp_uid
        exp_uid = '%030x' % random.randrange(16**30)
        # Args from dict to json type
        args_json = json.dumps(args_data)
        print('experiment:69')
        # Execute initExp through the broker
        response_json,didSucceed,message = broker.applyAsync(app_id,
                                                             exp_uid,
                                                             'initExp',
                                                             json.dumps(args_data))

        if not didSucceed:
            return attach_meta({}, meta_error['InitExpError'], backend_error=message), 400

        return attach_meta({'exp_uid':exp_uid}, meta_success), 200
Example #27
    def __get_domain_for_job(self, job_id):
        """
        Computes which domain to run a given job_id on.
        Git Commit: c1e4f8aacaa42fae80e111979e3f450965643520 has support
        for multiple worker nodes. See the code in broker.py, cluster_monitor.py, and the docker-compose
        file in that commit to see how to get that up and running. It uses
        a simple circular hashing scheme to load balance getQuery/processAnswer calls.
        This implementation assumes just a single master node and no workers
        so only a single hostname (e.g. localhost) has celery workers.
        """
        if self.r.exists('MINIONWORKER_HOSTNAME'):
            self.hostname = self.r.get('MINIONWORKER_HOSTNAME')
            utils.debug_print('Found hostname: {} (Redis)'.format(self.hostname))
        else:
            with open('/etc/hosts', 'r') as fid:
                for line in fid:
                    if 'MINIONWORKER' in line:
                        self.hostname = line.split('\t')[1].split(' ')[1]
                        self.r.set('MINIONWORKER_HOSTNAME', self.hostname, ex=360)  # expire after 10 minutes
                        utils.debug_print('Found hostname: {} (/etc/hosts)'.format(self.hostname))
                        break
        if self.hostname is None:
            import socket
            self.hostname = socket.gethostname()
            self.r.set('MINIONWORKER_HOSTNAME', self.hostname, ex=360)  # expire after 10 minutes
            utils.debug_print('Found hostname: {} (socket.gethostname())'.format(self.hostname))

        return self.hostname
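The docstring refers to a circular hashing scheme for spreading getQuery/processAnswer jobs over worker hostnames. The toy sketch below only illustrates that idea; the worker names are made up and this is not the broker's actual implementation.

import hashlib

workers = ['worker-0', 'worker-1', 'worker-2']

def domain_for(job_id):
    # deterministically map a job id onto one of the workers
    digest = int(hashlib.md5(job_id.encode('utf-8')).hexdigest(), 16)
    return workers[digest % len(workers)]

print(domain_for('getQuery:abc123'))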
Example #28
    def basic_info(self, app, butler):
        """
    returns basic statistics like number of queries, participants, etc.
    """
        utils.debug_print("butler algo corona print2")
        utils.debug_print(
            butler.algorithms.get(pattern={'exp_uid': app.exp_uid}))

        experiment_dict = butler.experiment.get()
        # utils.debug_print("experiment_dict dasboard corona")
        # utils.debug_print(experiment_dict)
        alg_list = butler.experiment.get(key='args')['alg_list']
        # utils.debug_print("ALGO list app dasboard corona")
        # for algorithm in alg_list:
        #    utils.debug_print(algorithm)

        algo_dict = butler.algorithms.get(pattern={'exp_uid': app.exp_uid})
        utils.debug_print("algo_dict dasboard corona")
        utils.debug_print(algo_dict)

        #git_hash = rm.get_git_hash_for_exp_uid(exp_uid)
        git_hash = experiment_dict.get('git_hash', 'None')
        # labelled_list = butler.algorithms.get(key="labelled_list")
        # utils.debug_print("labelled_list app dasboard corona")
        # utils.debug_print(labelled_list)
        # start_date = utils.str2datetime(butler.admin.get(uid=app.exp_uid)['start_date'])
        start_date = experiment_dict.get('start_date', 'Unknown') + ' UTC'

        # participant_uids = rm.get_participant_uids(exp_uid)
        participants = butler.participants.get(
            pattern={'exp_uid': app.exp_uid})
        num_participants = len(participants)

        queries = butler.queries.get(pattern={'exp_uid': app.exp_uid})
        num_queries = len(queries)

        return_dict = {
            'git_hash': git_hash,
            'exp_start_data': start_date,
            'num_participants': num_participants,
            'num_queries': num_queries,
            'meta': {
                'last_dashboard_update': '<1 minute ago'
            }
        }
        return return_dict
Example #29
    def pop_list(self, bucket_id, doc_uid, key, value):
        """
        Inputs:
            (string) bucket_id, (string) doc_uid, (string) key, (int) value
            value=-1 pops the last item of the list
            value=0 pops the first item of the list

        Outputs:
            (python object) value, (bool) didSucceed, (string) message

        Usage: ::\n
            value, didSucceed, message = db.pop_list(bucket_id, doc_uid, key, value)
        """

        try:
            response, dt = utils.timeit(self.permStore.pop_list)(constants.app_data_database_id,
                                                                 bucket_id, doc_uid, key, value)
            value, didSucceedPerm, messagePerm = response
            self.duration_permStoreSet += dt
            return value, didSucceedPerm, messagePerm
        except Exception as e:
            error = "DatabaseAPI.pop_list failed with exception: {}".format(e)
            utils.debug_print(error)
            return None, False, error
Example #30
    def pop_list(self, bucket_id, doc_uid, key, value):
        """
        Inputs:
            (string) bucket_id, (string) doc_uid, (string) key, (int) value
            value=-1 pops the last item of the list
            value=0 pops the first item of the list

        Outputs:
            (python object) value, (bool) didSucceed, (string) message

        Usage: ::\n
            value, didSucceed, message = db.pop_list(bucket_id, doc_uid, key, value)
        """

        try:
            response, dt = utils.timeit(self.permStore.pop_list)(
                constants.app_data_database_id, bucket_id, doc_uid, key, value)
            value, didSucceedPerm, messagePerm = response
            self.duration_permStoreSet += dt
            return value, didSucceedPerm, messagePerm
        except Exception as e:
            error = "DatabaseAPI.pop_list failed with exception: {}".format(e)
            utils.debug_print(error)
            return None, False, error
Example #31
    def __get_domain_for_job(self, job_id):
        """
        Computes which domain to run a given job_id on.
        Git Commit: c1e4f8aacaa42fae80e111979e3f450965643520 has support
        for multiple worker nodes. See the code in broker.py, cluster_monitor.py, and the docker-compose
        file in that commit to see how to get that up and running. It uses
        a simple circular hashing scheme to load balance getQuery/processAnswer calls.
        This implementation assumes just a single master node and no workers
        so only a single hostname (e.g. localhost) has celery workers.
        """
        if self.r.exists('MINIONWORKER_HOSTNAME'):
            self.hostname = self.r.get('MINIONWORKER_HOSTNAME')
            utils.debug_print('Found hostname: {} (Redis)'.format(
                self.hostname))
        else:
            with open('/etc/hosts', 'r') as fid:
                for line in fid:
                    if 'MINIONWORKER' in line:
                        self.hostname = line.split('\t')[1].split(' ')[1]
                        self.r.set('MINIONWORKER_HOSTNAME',
                                   self.hostname,
                                   ex=360)  # expire after 10 minutes
                        utils.debug_print(
                            'Found hostname: {} (/etc/hosts)'.format(
                                self.hostname))
                        break
        if self.hostname is None:
            import socket
            self.hostname = socket.gethostname()
            self.r.set('MINIONWORKER_HOSTNAME', self.hostname,
                       ex=360)  # expire after 10 minutes
            utils.debug_print(
                'Found hostname: {} (socket.gethostname())'.format(
                    self.hostname))

        return self.hostname
Example #32
 def getModel(self, butler):
     # The model is simply the vector of weights and a record of the number of reported answers.
     utils.debug_print(
         butler.algorithms.get(key=['weights', 'num_reported_answers']))
     return butler.algorithms.get(key=['weights', 'num_reported_answers'])
Example #33
    def post(self):
        utils.debug_print('POSTED!')
        utils.debug_print('H', request.headers)
        try:
            utils.debug_print('L', len(request.get_data()))
        except Exception as exc:
            print(exc)
            print('OH NO an error in assistant_blueprint!', exc,
                  sys.exc_info())

        # TODO? replace with msgpack
        args = self.deserialise(request.get_data())

        # Unpacking the YAML/ZIP file
        for key in args:
            if key not in {'bucket_id', 'key_id', 'secret_key'}:
                comma_idx = args[key].find(',')
                args[key] = args[key][comma_idx + 1:]
                if args[key] in {'True', 'False'}:
                    args[key] = True if args[key] == 'True' else False
                else:
                    args[key] = base64.decodestring(args[key])

        if all([
                key not in args
                for key in ['bucket_id', 'key_id', 'secret_key']
        ]):
            args['upload'] = False
        else:
            args['upload'] = True

        utils.debug_print('args.keys() = ', args.keys())

        args['args'] = yaml.load(args['args'])

        try:
            init_exp_args = args['args']
            utils.debug_print("args.keys = ", args['args'].keys())
            if 'targets' in args.keys():
                target_zipfile = args['targets']
                utils.debug_print("args = ", args)
                if args.get('upload', True):
                    bucket_id = args['bucket_id']
                    key_id = args['key_id']
                    secret_key = args['secret_key']

                    for x_ in ['bucket_id', 'secret_key', 'key_id']:
                        utils.debug_print(x_, args[x_])
                    # Unpack the targets
                    targets = target_unpacker.unpack(target_zipfile, key_id,
                                                     secret_key, bucket_id)
                else:
                    targets = target_unpacker.unpack_csv_file(target_zipfile)
                init_exp_args['args']['targets'] = {'targetset': targets}

            # Init the experiment:
            app_id = init_exp_args['app_id']
            exp_uid = '%030x' % random.randrange(16**30)

            r = broker.applyAsync(app_id, exp_uid, 'initExp',
                                  json.dumps(init_exp_args))
            response_json, didSucceed, message = r
            if not didSucceed:
                raise ValueError(message)
        except:
            tb = traceback.format_exc()
            info = sys.exc_info()
            if hasattr(info[1], 'message') and len(info[1].message) > 0:
                message = info[1].message
                if 'time' in message:
                    message += (
                        "\nNOTE: error has to do with time; try "
                        "restarting docker, more detail at "
                        "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and"
                    )
            else:
                message = str(info[1]) + str(info[-1])
                message = '\n'.join(tb.split('\n')[-5:])
            message = message + '\n\nDetails:\n' + tb

            return {'success': False, 'message': message, 'exp_uid': None}

        return {
            'success': didSucceed,
            'message': message,
            'exp_uid': exp_uid,
            'app_id': args['args']['app_id']
        }
Example #34
    def getQuery(self, butler, alg, args):

        sttime = time.time()
        alg_response = alg({'participant_uid': args['participant_uid']})

        # Get Unlabelled Set
        #alg_response contains index returned from LogisticRegressionActive getQuery method
        #Retrieve the row using this index
        unlabelled_row = butler.memory.get(str(alg_response))
        if unlabelled_row is None:
            utils.debug_print("No row was retrieved")
            return {}
        unlabelled_row = pickle.loads(unlabelled_row).replace(np.nan, "None")
        unlabelled_row_dict = unlabelled_row.to_dict()
        sra_study_id = unlabelled_row_dict.get('sra_study_id')
        sra_sample_id = unlabelled_row_dict.get('sra_sample_id')
        key_value = unlabelled_row_dict.get('key_value')
        #Convert from str to dict
        key_value_dict = ast.literal_eval(key_value)

        ontology_mapping = unlabelled_row_dict.get('ontology_mapping')
        # Convert from str to list
        ontology_mapping_list = ast.literal_eval(ontology_mapping)
        ont_mapping_dict = {}
        if ontology_mapping_list is None:
            ontology_mapping_list = []
        for ont in ontology_mapping_list:
            ont_org = ont
            return_link = ""
            #pre-processing steps
            ont = ont.replace(":", "_")
            '''
            "DOID": "DOID.17-01-30.obo",
            "UBERON": "UBERON.17-01-30.obo",
            "CL": "CL.18-11-13.obo",
            "CVCL": "CVCL.17-01-30.obo",
            "UO": "UO.17-01-30.obo",
            "EFO": "EFO.17-01-30.obo",
            "CHEXBI": "CHEBI.17-01-30.obo",
            "GO": "GO.19-01-18.obo"   '''

            #TODO: Other terms link
            if "CL" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/cl/terms?short_form=" + ont
            elif "UBERON" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/uberon/terms?short_form=" + ont
            elif "DOID" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/doid/terms?short_form=" + ont
            elif "EFO" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + ont
            elif "CVCL" in ont:
                return_link = "https://web.expasy.org/cellosaurus/" + ont
            ont_mapping_dict[ont_org] = return_link
        #retrieve study row based on study_id
        study_row_str = pickle.loads(butler.memory.get(sra_study_id)).replace(
            np.nan, "None")
        study_row_json = study_row_str.to_dict()
        #Class-wise confidence of all classes
        cur_confidence = butler.memory.get("cur_confidence")
        if cur_confidence is None:
            cur_confidence = pickle.dumps([])
        cur_confidence = pickle.loads(cur_confidence)
        utils.debug_print(cur_confidence)
        #Get name of classes
        lr_classes = butler.memory.get("lr_classes")
        if lr_classes is None:
            lr_classes = pickle.dumps([])
        lr_classes = pickle.loads(lr_classes)
        #this is what is received in widgets/getQuery_widget.html
        ret = {
            'target_indices': unlabelled_row_dict,
            'study': study_row_json,
            'key_value': key_value_dict,
            'ontology_mapping': ont_mapping_dict,
            'cur_confidence': cur_confidence,
            'lr_classes': lr_classes,
            'sra_sample_id': sra_sample_id
        }
        return ret
Example #35
File: App.py Project: nextml/NEXT
    def getQuery(self, exp_uid, args_json):
        try:
            args_dict = self.helper.convert_json(args_json)
            args_dict = verifier.verify(args_dict, self.reference_dict['getQuery']['args'])
            experiment_dict = self.butler.experiment.get()
            alg_list = experiment_dict['args']['alg_list']
            participant_to_algorithm_management = experiment_dict['args']['participant_to_algorithm_management']
            algorithm_management_settings = experiment_dict['args']['algorithm_management_settings']
            # Create the participant dictionary in participants bucket if needed. Also pull out label and id for this algorithm
            participant_uid = args_dict['args'].get('participant_uid', args_dict['exp_uid'])
            # Check to see if the first participant has come by and if not, save to db
            participant_doc = self.butler.participants.get(uid=participant_uid)
            first_participant_query = participant_doc==None
            if first_participant_query:
                participant_doc = {}
                self.butler.participants.set(uid=participant_uid, value={'exp_uid':exp_uid, 'participant_uid':participant_uid})
            if (participant_uid == exp_uid) or (participant_to_algorithm_management == 'one_to_many') or (first_participant_query):

                if algorithm_management_settings['mode'] == 'fixed_proportions':
                    labels = [alg['alg_label'] for alg in algorithm_management_settings['params']]
                    prop = [prop_item['proportion'] for prop_item in algorithm_management_settings['params']]
                    # reorder prop and alg_list to have same order
                    new_alg_list = []
                    broken = False
                    for label in labels:
                        broken = False
                        for alg in alg_list:
                            if label == alg['alg_label']:
                                new_alg_list += [alg]
                                broken = True
                                break
                        if not broken:
                            raise Exception('alg_label not present for both porportions and labels')
                    chosen_alg = numpy.random.choice(new_alg_list, p=prop)
                elif algorithm_management_settings['mode'] == 'custom' :
                    chosen_alg = self.myApp.chooseAlg(self.butler, alg_list, args_dict['args'])
                else:
                    chosen_alg = numpy.random.choice(alg_list)

                alg_id = chosen_alg['alg_id']
                alg_label = chosen_alg['alg_label']
                if (first_participant_query) and (participant_to_algorithm_management=='one_to_one'):
                    self.butler.participants.set(uid=participant_uid, key='alg_id',value=alg_id)
                    self.butler.participants.set(uid=participant_uid, key='alg_label',value=alg_label)
            elif (participant_to_algorithm_management=='one_to_one'):
                alg_id = participant_doc['alg_id']
                alg_label = participant_doc['alg_label']

            query_uid = utils.getNewUID()
            args_dict['args'].update(query_uid=query_uid)
            query_doc = self.call_app_fn(alg_label, alg_id, 'getQuery', args_dict)

            query_doc.update({'participant_uid':participant_uid,
                              'alg_id':alg_id,
                              'exp_uid':exp_uid,
                              'alg_label':alg_label,
                              'timestamp_query_generated':str(utils.datetimeNow()),
                              'query_uid':query_uid})
            self.butler.queries.set(uid=query_uid, value=query_doc)
            return json.dumps({'args':query_doc,'meta':{'log_entry_durations':self.log_entry_durations}}), True,''
        except Exception, error:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            full_error = str(traceback.format_exc())+'\n'+str(error)
            utils.debug_print("getQuery Exception: " + full_error, color='red')
            log_entry = { 'exp_uid':exp_uid,'task':'getQuery','error':full_error,'timestamp':utils.datetimeNow(),'args_json':args_json }
            self.butler.ell.log( self.app_id+':APP-EXCEPTION', log_entry  )
            traceback.print_tb(exc_traceback)
            return '{}', False, str(error)
Example #36
    def cumulative_reward_plot(self, app, butler):
        """
        Description: Returns multiline plot where there is a one-to-one mapping lines to
        algorithms and each line indicates the error on the validation set with respect to number of reported answers

        Expected input:
          None

        Expected output (in dict):
          (dict) MPLD3 plot dictionary
        """
        # get list of algorithms associated with project
        # utils.debug_print('came into Dashboard')
        args = butler.experiment.get(key='args')
        # num_algs = len(args['alg_list'])
        # utils.debug_print('num_tries: ', args['num_tries'])
        # alg_labels = []
        # for i in range(num_algs):
        #     alg_labels += [args['alg_list'][i]['alg_label']]

        plot_data = butler.experiment.get(key='plot_data')
        # utils.debug_print('butler.algs.plot_data in Dashboard: ', plot_data)
        df = pd.DataFrame(plot_data)
        df.columns = [
            u'alg', u'arm_pulled', u'initial_arm', u'participant_uid',
            u'rewards', u'time'
        ]
        # utils.debug_print('df: ', df)
        # df = df.pivot_table(columns='initial arm', index='time', values='rewards', aggfunc=np.mean)
        # utils.debug_print('df: ', df)
        # utils.debug_print('Came into Dashbord, trying to print algs and init_arms')
        algs = list(df['alg'].unique())
        utils.debug_print('algs: ', algs)
        init_arms = df['initial_arm'].unique()
        utils.debug_print('init_arms: ', init_arms)
        import matplotlib.pyplot as plt
        import mpld3
        fig, ax = plt.subplots(nrows=1,
                               ncols=1,
                               subplot_kw=dict(axisbg='#EEEEEE'))

        T = args['num_tries']
        # utils.debug_print('T: ', T)

        for alg in algs:
            alg_results = np.zeros(T)
            for i, init_arm in enumerate(init_arms):
                print alg
                print init_arm
                result = df.query(
                    'alg == "{alg}" and initial_arm == {iarm}'.format(
                        alg=alg,
                        iarm=init_arm))[['time', 'rewards', 'participant_uid'
                                         ]].groupby('time').mean()
                rewards = np.array(result['rewards'])
                # utils.debug_print('rewards: ', rewards)
                # utils.debug_print('len rewards: ', len(rewards))
                # utils.debug_print('alg_results: ', alg_results)
                alg_results[0:len(rewards)] += rewards / float(len(init_arms))

            ax.plot(range(len(rewards)),
                    np.cumsum(rewards),
                    label='{alg}'.format(alg=alg))
            ax.set_xlabel('Time')
            ax.set_ylabel('Average cumulative rewards')

        ax.set_title('Cumulative rewards', size=10)
        legend = ax.legend(loc=2, ncol=2, mode="expand")
        for label in legend.get_texts():
            label.set_fontsize('xx-small')

        plot_dict = mpld3.fig_to_dict(fig)
        plt.close()
        return plot_dict
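For reference, the dashboard hands the figure to mpld3.fig_to_dict so it can be returned as a JSON-serialisable dictionary. A self-contained sketch of that step with dummy reward data (unrelated to the experiment's real plot_data):

import matplotlib
matplotlib.use('Agg')            # render off-screen; no display needed
import matplotlib.pyplot as plt
import mpld3
import numpy as np

rewards = np.random.rand(50)
fig, ax = plt.subplots()
ax.plot(np.arange(len(rewards)), np.cumsum(rewards), label='some_alg')
ax.set_xlabel('Time')
ax.set_ylabel('Average cumulative rewards')
ax.legend(loc=2)

plot_dict = mpld3.fig_to_dict(fig)   # plain dict, ready for JSON
plt.close(fig)
print(sorted(plot_dict.keys()))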
Example #37
 def __init__(self, db):
     self.app_id = 'PoolBasedBinaryClassification'
     self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(
         db)
     utils.debug_print("initialized myApp again")
Example #38
    def post(self):
        utils.debug_print('POSTED!')
        utils.debug_print('H',request.headers)
        try:
            utils.debug_print('L',len(request.get_data()))
        except Exception as exc:
            print(exc)
            print('OH NO an error in assistant_blueprint!',exc,sys.exc_info())

        # TODO? replace with msgpack
        args = self.deserialise(request.get_data())

        # Unpacking the YAML/ZIP file
        for key in args:
            if key not in {'bucket_id', 'key_id', 'secret_key'}:
                comma_idx = args[key].find(',')
                args[key] = args[key][comma_idx + 1:]
                if args[key] in {'True', 'False'}:
                    args[key] = True if args[key] == 'True' else False
                else:
                    args[key] = base64.decodestring(args[key])

        if all([key not in args for key in ['bucket_id', 'key_id', 'secret_key']]):
            args['upload'] = False
        else:
            args['upload'] = True

        utils.debug_print('args.keys() = ', args.keys())

        args['args'] = yaml.load(args['args'])

        try:
            init_exp_args = args['args']
            utils.debug_print("args.keys = ", args['args'].keys())
            if 'targets' in args.keys():
                target_zipfile = args['targets']
                utils.debug_print("args = ", args)
                if args.get('upload', True):
                    bucket_id = args['bucket_id']
                    key_id = args['key_id']
                    secret_key = args['secret_key']

                    for x_ in ['bucket_id', 'secret_key', 'key_id']:
                        utils.debug_print(x_, args[x_])
                    # Unpack the targets
                    targets = target_unpacker.unpack(target_zipfile, key_id,
                                                     secret_key, bucket_id)
                else:
                    targets = target_unpacker.unpack_csv_file(target_zipfile)
                init_exp_args['args']['targets'] = {'targetset':  targets}

            # Init the experiment:
            app_id = init_exp_args['app_id']
            exp_uid = '%030x' % random.randrange(16**30)

            r = broker.applyAsync(app_id, exp_uid, 'initExp',
                                  json.dumps(init_exp_args))
            response_json, didSucceed, message = r
            if not didSucceed:
                raise ValueError(message)
        except:
            tb = traceback.format_exc()
            info = sys.exc_info()
            if hasattr(info[1], 'message') and len(info[1].message) > 0:
                message = info[1].message
                if 'time' in message:
                    message += ("\nNOTE: error has to do with time; try "
                                "restarting docker, more detail at "
                                "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and")
            else:
                message = str(info[1]) + str(info[-1])
                message = '\n'.join(tb.split('\n')[-5:])
            message = message + '\n\nDetails:\n' + tb

            return {'success': False, 'message': message, 'exp_uid': None}

        return {'success': didSucceed, 'message': message, 'exp_uid': exp_uid,
                'app_id': args['args']['app_id']}
Example #39
    def initExp(self, butler, n, d, failure_probability):

        # Importing here as myApp gets initialized many times during
        #course of experiment
        from ..onto_lib import general_ontology_tools as ob
        from ..onto_lib import load_ontology
        nltk.downloader.download('punkt')

        #This is needed for rules_vector
        ONT_IDS = ["4"]
        OGS = [load_ontology.load(ont_id)[0] for ont_id in ONT_IDS]
        cvcl_og = OGS[0]

        mongo_mem_set(butler, 'n', n)
        mongo_mem_set(butler, 'delta', failure_probability)
        mongo_mem_set(butler, 'd', d)
        # Initialize the weight to an empty list of 0's
        mongo_mem_set(butler, 'num_reported_answers', 0)

        #Train data form butler mem

        train_df = redis_mem_get(butler, "train_data")
        test_df = redis_mem_get(butler, "test_data")
        unlabelled_df = redis_mem_get(butler, "unlabelled_data")
        #Initializing rules
        #Rules are negated as all custom rules were based on a sample being NOT of certain type
        rules_dict = {
            "not_tissue": 0,
            "not_cell_line": 0,
            "not_primary_cells": 0,
            "not_in_vitro_differentiated_cells": 0,
            "not_induced_pluripotent_stem_cells": 0,
            "not_stem_cells": 0
        }
        bag_of_words = []
        #Creating string so that it can be vectorized later
        X_train_str, y_train, train_rules = create_word_vector(
            train_df, True, bag_of_words, ob, cvcl_og)
        X_test_str, y_test, test_rules = create_word_vector(
            test_df, True, bag_of_words, ob, cvcl_og)
        X_unlabelled_str, empty_y_unlabelled, unlabelled_rules = create_word_vector(
            unlabelled_df, False, bag_of_words, ob, cvcl_og)

        #Encode y_train and y_test
        y_train = pd.Series(y_train)
        sample_dict = get_encode_dict()
        y_train = y_train.replace(sample_dict).values

        y_test = pd.Series(y_test)
        y_test = y_test.replace(sample_dict).values

        # create the transform
        #This vectorizer is used to vectorize key-value pairs,
        #ontologies and ancestors of each of the ontology
        word_vectorizer = TfidfVectorizer(decode_error='ignore',
                                          binary=True,
                                          max_features=75,
                                          lowercase=False,
                                          token_pattern=r'\S+',
                                          stop_words='english')
        # This vectorizer is used to vectorize custom rules
        rules_vectorizer = DictVectorizer()

        # transform word and rule vectors
        # Concatenate both vectors
        word_vectorizer.fit(bag_of_words)
        rules_vectorizer.fit([rules_dict])

        #DEBUG
        utils.debug_print("word feature name")
        utils.debug_print(word_vectorizer.get_feature_names())
        utils.debug_print("rules feature name")
        utils.debug_print(rules_vectorizer.get_feature_names())
        X_train_word = word_vectorizer.transform(X_train_str)
        X_test_word = word_vectorizer.transform(X_test_str)
        X_unlabelled_word = word_vectorizer.transform(X_unlabelled_str)

        X_train_rules = rules_vectorizer.transform(train_rules)
        X_test_rules = rules_vectorizer.transform(test_rules)
        X_unlabelled_rules = rules_vectorizer.transform(unlabelled_rules)
        # Combining both vectors
        X_train = hstack([X_train_word, X_train_rules], format="csr")
        X_test = hstack([X_test_word, X_test_rules], format="csr")
        X_unlabelled = hstack([X_unlabelled_word, X_unlabelled_rules],
                              format="csr")

        #DEBUG
        # utils.debug_print("size")
        # utils.debug_print(X_train_word)
        # utils.debug_print(X_test_word)
        # utils.debug_print(X_unlabelled_word)
        #
        # utils.debug_print(X_train_rules)
        # utils.debug_print(X_test_rules)
        # utils.debug_print(X_unlabelled_rules)
        #
        # utils.debug_print("the matrices")
        # utils.debug_print(X_train)
        # utils.debug_print(X_test)
        # utils.debug_print(X_unlabelled)

        lr_model = LogisticRegression(solver='saga', penalty='l1')
        lr_model.fit(X_train, y_train)
        y_pred = lr_model.predict(X_test)
        acc_init = accuracy_score(y_test, y_pred)
        debug_print("acc in init")
        debug_print(acc_init)
        set_updated_acc(butler, X_train.shape[0], acc_init)
        unlabelled_list = redis_mem_get(butler, "unlabelled_list")
        study_id_list = redis_mem_get(butler, "study_id_list")
        largest_val = get_largest_values(X_unlabelled, lr_model,
                                         unlabelled_list, study_id_list, d)

        debug_print(largest_val)
        sample_list = largest_val['index'].tolist()
        sample_probs = largest_val['prob'].tolist()

        #DEBUG
        # debug_print("largest val init")
        # debug_print(largest_val)
        # debug_print("sample prob init")
        # debug_print(sample_probs)

        lr_classes = get_decode_list(lr_model.classes_)
        # Print model parameters - the names and coefficients are in same order
        utils.debug_print(lr_model.coef_)
        mongo_mem_set(butler, 'lr_classes', lr_classes)
        redis_mem_set(butler, 'lr_classes', lr_classes)
        mongo_mem_set(butler, 'X_train', X_train)
        mongo_mem_set(butler, 'y_train', y_train)
        mongo_mem_set(butler, 'X_test', X_test)
        mongo_mem_set(butler, 'y_test', y_test)
        debug_print(len(unlabelled_df))
        mongo_mem_set(butler, 'unlabelled_len', len(unlabelled_df))
        mongo_mem_set(butler, 'labelled_list', [])
        redis_mem_set(butler, "X_unlabelled", X_unlabelled)
        #Setting sample list and probability
        redis_mem_set(butler, 'sample_probs', sample_probs)
        redis_mem_set(butler, 'sample_list', sample_list)
        mongo_mem_set(butler, "S_trial", json.dumps({}))
        cm = confusion_matrix(y_test, y_pred)
        mongo_mem_set(butler, "confusion_matrix", cm)
        return True
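The feature construction in the initExp above combines two sparse blocks: TF-IDF features over the key-value/ontology strings and DictVectorizer features over the negated rule counts, concatenated column-wise with scipy's hstack before fitting the logistic regression. Below is a minimal, self-contained sketch of that pattern; the strings and rule names are toy placeholders, not the real NEXT inputs.

# Sketch: combine TF-IDF text features with DictVectorizer rule features
# into one sparse CSR matrix, as done in initExp above. Toy data only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

texts = ["hela cell line CVCL_0030", "liver tissue sample UBERON_0002107"]
rules = [{"not_tissue": 1, "not_cell_line": 0},
         {"not_tissue": 0, "not_cell_line": 1}]

word_vectorizer = TfidfVectorizer(token_pattern=r'\S+', lowercase=False)
rules_vectorizer = DictVectorizer()

X_words = word_vectorizer.fit_transform(texts)    # (2, n_word_features)
X_rules = rules_vectorizer.fit_transform(rules)   # (2, n_rule_features)

X = hstack([X_words, X_rules], format="csr")      # single matrix for the classifier
print(X.shape)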
Example #40
 def check_prefix(self):
     if self.key_prefix == '':
         utils.debug_print("butler.memory is deprecated."
                           " Change to butler.experiment.memory or butler.algorithm.memory, etc."
                           " wherever appropriate")
Example #41
    def full_embedding_update(self, butler, args):
        debug_print("inside update new lrmodel")
        # Main function to update the model.
        labelled_items = mongo_mem_get(butler, 'S')
        X_test = mongo_mem_get(butler, "X_test")
        y_test = mongo_mem_get(butler, "y_test")
        #DEBUG
        # debug_print("X_test")
        # debug_print(X_test)
        X_train = mongo_mem_get(butler, "X_train")
        y_train = mongo_mem_get(butler, "y_train")
        #DEBUG
        # debug_print("X_train")
        # debug_print(X_train)
        debug_print("inside full update")
        X_unlabelled = redis_mem_get(butler, "X_unlabelled")

        #Get unlabelled lists
        unlabelled_list = redis_mem_get(butler, "unlabelled_list")
        labelled_list = mongo_mem_get(butler, "labelled_list")

        # Build a list of feature vectors and associated labels.
        utils.debug_print(X_unlabelled)
        X_unlabelled_list = list(X_unlabelled)
        X_labelled_list = []
        y_labelled = []
        labelled_index = []
        bucket_id = redis_mem_get(butler, "bucket_id")
        batch_no = redis_mem_get(butler, "batch_no")
        #Use this if you need to integrate with Amazon s3
        # s3.modify_csv_contents(bucket_id, 'Labels.csv', labelled_items,batch_no)
        #Modify contents - getting labels df
        labels_filename = os.path.join(self.FILE_PATH, "Labels.csv")
        labels_df = pd.read_csv(labels_filename)
        redis_mem_set(butler, "batch_no", batch_no + 1)

        # Iterate through the labelled items and update the labels file
        for index, label in labelled_items:
            X_labelled_list.append(X_unlabelled_list[index])
            y_labelled.append(label)
            labelled_index.append(index)
            labelled_row = redis_mem_get(butler, str(index))
            if labelled_row is not None:
                labelled_list.append({labelled_row['sra_sample_id']: label})
                #Updating labels file
                labels_df.loc[index, 'label'] = get_decode(label)
                labels_df.loc[index, 'dataset_type'] = 'train'
                labels_df.loc[index, 'batch_no'] = batch_no

        labels_df.to_csv(labels_filename, index=False)
        debug_print("X_labelled_list")
        debug_print(X_labelled_list)
        X_labelled = vstack(X_labelled_list)
        #Combine X_train and newly labelled vector
        X_train = vstack([X_labelled, X_train])
        y_train = np.concatenate((y_labelled, y_train))

        mongo_mem_set(butler, "X_train", X_train)
        mongo_mem_set(butler, "y_train", y_train)
        study_id_list = redis_mem_get(butler, "study_id_list")
        #Drop newly labelled data from unlabelled

        for curr_index in sorted(labelled_index, reverse=True):
            debug_print("removing curr_index")
            debug_print(curr_index)

            if (curr_index in unlabelled_list):
                debug_print("curr_index exists")
                # unlabelled_list consists of indices that are still unlabelled
                # Remove indices that just got labelled from this list
                # Remove them from the parallel lists X_unlabelled_list and study_id_list as well
                curr_index_pos = unlabelled_list.index(curr_index)
                X_unlabelled_list.pop(curr_index_pos)
                unlabelled_list.pop(curr_index_pos)
                study_id_list.pop(curr_index_pos)

                if (curr_index in unlabelled_list):
                    utils.debug_print("index did not get removed oops")

        # Performing training (Retraining model along with newly labelled samples)
        lr_model = LogisticRegression(solver='saga', penalty='l1')
        utils.debug_print("X_train_len")
        utils.debug_print(X_train.shape[0])
        lr_model.fit(X_train, y_train)
        lr_classes = get_decode_list(lr_model.classes_)
        redis_mem_set(butler, 'lr_classes', lr_classes)

        y_pred = lr_model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        redis_mem_set(butler, "confusion_matrix", cm)
        acc_update = accuracy_score(y_test, y_pred)
        debug_print("acc in update")
        debug_print(acc_update)
        set_updated_acc(butler, X_train.shape[0], acc_update)
        #Get d
        d = mongo_mem_get(butler, 'd')
        sample_list = redis_mem_get(butler, "sample_list")
        #Get queries with largest entropy
        if (len(sample_list) <= 2 * d):

            X_unlabelled = vstack(X_unlabelled_list)
            largest_val = get_largest_values(X_unlabelled, lr_model,
                                             unlabelled_list, study_id_list, d)
            sample_probs = redis_mem_get(butler, "sample_probs")
            #Contains next set of queries to be asked
            sample_list = sample_list + largest_val['index'].tolist()
            # Contains probabilities of the next set of queries to be asked
            sample_probs = sample_probs + largest_val['prob'].tolist()
            #DEBUG
            debug_print("sample_list")
            debug_print(sample_list)
            redis_mem_set(butler, 'sample_list', sample_list)
            redis_mem_set(butler, 'sample_probs', sample_probs)

        mongo_mem_set(butler, 'S', [])
        mongo_mem_set(butler, 'labelled_list', labelled_list)
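The core of the update above is stacking the freshly labelled rows on top of the existing training matrix (and prepending their labels in the same order) before refitting the model. A small sketch of that augmentation step with toy matrices:

# Sketch of the training-set augmentation in full_embedding_update above:
# newly labelled rows go on top of X_train, labels are prepended to match.
import numpy as np
from scipy.sparse import csr_matrix, vstack

X_train = csr_matrix(np.eye(3))           # existing training features (toy)
y_train = np.array([0, 1, 0])             # existing labels
X_labelled = csr_matrix([[1., 0., 1.]])   # freshly labelled sample's features
y_labelled = [1]                          # its label

X_train = vstack([X_labelled, X_train])             # rows: new first, then old
y_train = np.concatenate((y_labelled, y_train))     # labels in matching order
print(X_train.shape, y_train)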
Example #42
    def reset_redis(self, app, butler):

        bucket_id = os.environ.get("AWS_BUCKET_NAME")
        file_name_list = ['samples.csv', 'Labels.csv', 'studies.csv']
        csv_content_dict = s3.get_csv_content_dict(bucket_id, file_name_list)

        for filename, content in csv_content_dict.items():
            if filename == 'samples.csv':
                samples_df = pd.read_csv(io.BytesIO(content))
                # utils.debug_print("database_lib corona")
                # utils.debug_print(samples_df.head())
            elif filename == 'Labels.csv':
                labels_df = pd.read_csv(io.BytesIO(content))
                # utils.debug_print("database_lib corona")
                # utils.debug_print(labels_df.head())
            elif filename == 'studies.csv':
                study_df = pd.read_csv(io.BytesIO(content))
                # utils.debug_print("database_lib corona")
                # utils.debug_print(study_df.head())

        df_sort = labels_df.groupby(['dataset_type'])

        for dataset_type, df_cur in df_sort:
            if (dataset_type == constants.UNLABELLED_TAG):
                unlabelled_indices = df_cur['index'].tolist()
            elif (dataset_type == constants.TRAIN_TAG):
                train_indices = df_cur['index'].values
            elif (dataset_type == constants.TEST_TAG):
                test_indices = df_cur['index'].values

        batch_no = labels_df['batch_no'].max()
        if pd.isnull(batch_no):
            batch_no = 0

        butler.memory.set("batch_no", pickle.dumps(batch_no + 1))

        train_df = samples_df.loc[samples_df['index'].isin(train_indices)]
        train_df['label'] = labels_df.loc[
            labels_df['index'].isin(train_indices), 'label']

        test_df = samples_df.loc[samples_df['index'].isin(test_indices)]
        test_df['label'] = labels_df.loc[labels_df['index'].isin(test_indices),
                                         'label']

        unlabelled_df = samples_df.loc[samples_df['index'].isin(
            unlabelled_indices)]
        X_unlabelled_str = lr.create_vector(unlabelled_df)
        N = 2
        bag_of_words = [
            "differentiated", "cell", "hela", "derived", "CL:0000057",
            "CL:0000115", "EFO:0000322", "EFO:0000324", "EFO:0000313",
            "CL:0000034"
        ]
        # create the transform
        vectorizer = TfidfVectorizer(ngram_range=(1, N + 1),
                                     decode_error='ignore')
        # tokenize and build vocab
        try:
            vectorizer.fit(bag_of_words)
        except Exception as e:
            utils.debug_print(e)

        X_train_str, y_train = lr.create_dict(train_df)
        X_test_str, y_test = lr.create_dict(test_df)

        # Encode y_train and y_test
        y_train = pd.Series(y_train)
        sample_dict = lr.get_encode_dict()
        y_train = y_train.replace(sample_dict).values

        y_test = pd.Series(y_test)
        y_test = y_test.replace(sample_dict).values

        # encode document
        try:
            X_train = vectorizer.transform(X_train_str)
        except Exception as e:
            utils.debug_print(e)

        try:
            X_test = vectorizer.transform(X_test_str)
        except Exception as e:
            utils.debug_print(e)

        try:
            X_unlabelled = vectorizer.transform(X_unlabelled_str)
        except Exception as e:
            utils.debug_print(e)

        study_id_list = unlabelled_df['sra_study_id'].tolist()

        for i, row in unlabelled_df.iterrows():
            utils.debug_print(str(row['index']))
            redis_mem_set(butler, str(row['index']), row)

        utils.debug_print("done setting unlabelled")

        lr_model = LogisticRegression(penalty='l1')
        lr_model.fit(X_train, y_train)
        y_pred = lr_model.predict(X_test)
        acc_init = accuracy_score(y_test, y_pred)
        utils.debug_print("acc in app dashboard")
        utils.debug_print(acc_init)
        largest_val = lr.get_largest_values(X_unlabelled, lr_model,
                                            unlabelled_indices, study_id_list,
                                            5)
        sample_list = largest_val['index'].tolist()
        sample_probs = largest_val['prob'].tolist()
        lr_classes = lr.get_decode_list(lr_model.classes_)
        redis_mem_set(butler, 'lr_classes', lr_classes)

        utils.debug_print("sample_list init")
        utils.debug_print(sample_list)
        utils.debug_print("sample prob init")
        utils.debug_print(sample_probs)

        for i, row in study_df.iterrows():
            redis_mem_set(butler, row['sra_study_id'], row)

        # Making sure that the earlier batch hasn't been labelled before
        algo_list = butler.algorithms.get(pattern={'exp_uid': app.exp_uid})
        S_trial = {}
        # for cur_algo in algo_list:
        #     if cur_algo.get("alg_id") is "LogisticRegressionActive":
        #         S_trial = json.loads(cur_algo.get("S_trial"))
        # TODO: remove hard-coded choice of the first algorithm
        cur_algo = algo_list[0]
        cur_algo["sample_probs"] = sample_probs
        cur_algo["sample_list"] = sample_list

        lr.redis_mem_set(butler, 'sample_probs', sample_probs)
        lr.redis_mem_set(butler, 'sample_list', sample_list)
        lr.redis_mem_set(butler, "study_id_list", study_id_list)
        # Set  data in mem
        # redis_mem_set(butler, "does_this_work", 5)
        lr.redis_mem_set(butler, "train_data", train_df)
        lr.redis_mem_set(butler, "bucket_id", bucket_id)
        lr.redis_mem_set(butler, "test_data", test_df)
        lr.redis_mem_set(butler, "unlabelled_data",
                         unlabelled_df[['key_value', 'ontology_mapping']])
        lr.redis_mem_set(butler, "label_data", labels_df)
        lr.redis_mem_set(butler, "unlabelled_list", unlabelled_indices)
        lr.redis_mem_set(butler, "X_unlabelled", X_unlabelled)

        return {}
Example #43
    def getQuery(self, exp_uid, args_json):
        try:
            args_dict = self.helper.convert_json(args_json)
            args_dict = verifier.verify(args_dict, self.reference_dict['getQuery']['args'])
            experiment_dict = self.butler.experiment.get()
            alg_list = experiment_dict['args']['alg_list']
            participant_to_algorithm_management = experiment_dict['args']['participant_to_algorithm_management']
            algorithm_management_settings = experiment_dict['args']['algorithm_management_settings']
            # Create the participant dictionary in participants bucket if needed. Also pull out label and id for this algorithm
            participant_uid = args_dict['args'].get('participant_uid', args_dict['exp_uid'])
            # Check to see if the first participant has come by and if not, save to db
            participant_doc = self.butler.participants.get(uid=participant_uid)
            first_participant_query = participant_doc is None
            if first_participant_query:
                participant_doc = {}
                self.butler.participants.set(uid=participant_uid, value={'exp_uid':exp_uid, 'participant_uid':participant_uid})
            if (participant_uid == exp_uid) or (participant_to_algorithm_management == 'one_to_many') or (first_participant_query):

                if algorithm_management_settings['mode'] == 'fixed_proportions':
                    labels = [alg['alg_label'] for alg in algorithm_management_settings['params']]
                    prop = [prop_item['proportion'] for prop_item in algorithm_management_settings['params']]
                    # reorder prop and alg_list to have same order
                    new_alg_list = []
                    broken = False
                    for label in labels:
                        broken = False
                        for alg in alg_list:
                            if label == alg['alg_label']:
                                new_alg_list += [alg]
                                broken = True
                                break
                        if not broken:
                            raise Exception('alg_label not present for both proportions and labels')
                    chosen_alg = numpy.random.choice(new_alg_list, p=prop)
                elif algorithm_management_settings['mode'] == 'custom' :
                    chosen_alg = self.myApp.chooseAlg(self.butler, alg_list, args_dict['args'])
                else:
                    chosen_alg = numpy.random.choice(alg_list)

                alg_id = chosen_alg['alg_id']
                alg_label = chosen_alg['alg_label']
                if (first_participant_query) and (participant_to_algorithm_management=='one_to_one'):
                    self.butler.participants.set(uid=participant_uid, key='alg_id',value=alg_id)
                    self.butler.participants.set(uid=participant_uid, key='alg_label',value=alg_label)
            elif (participant_to_algorithm_management=='one_to_one'):
                alg_id = participant_doc['alg_id']
                alg_label = participant_doc['alg_label']

            query_uid = utils.getNewUID()
            args_dict['args'].update(query_uid=query_uid)
            query_doc = self.call_app_fn(alg_label, alg_id, 'getQuery', args_dict)
            
            query_doc.update({'participant_uid':participant_uid,
                              'alg_id':alg_id,
                              'exp_uid':exp_uid,
                              'alg_label':alg_label,
                              'timestamp_query_generated':str(utils.datetimeNow()),
                              'query_uid':query_uid})
            self.butler.queries.set(uid=query_uid, value=query_doc)
            return json.dumps({'args':query_doc,'meta':{'log_entry_durations':self.log_entry_durations}}), True,''
        except Exception, error:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            full_error = str(traceback.format_exc())+'\n'+str(error)
            utils.debug_print("getQuery Exception: " + full_error, color='red')
            log_entry = { 'exp_uid':exp_uid,'task':'getQuery','error':full_error,'timestamp':utils.datetimeNow(),'args_json':args_json } 
            self.butler.ell.log( self.app_id+':APP-EXCEPTION', log_entry  )
            traceback.print_tb(exc_traceback)
            return '{}', False, str(error)
Example #44
def redis_mem_set(butler, key, value):
    # Butler.memory is essentially set in redis
    try:
        butler.memory.set(key, pickle.dumps(value))
    except Exception as e:
        utils.debug_print("Could not set " + key + " in redis")
Example #45
class MyApp:

    #Data files stored inside NEXT/local
    DIR_PATH = os.path.dirname(os.path.realpath(__file__))
    FILE_PATH = os.path.join(DIR_PATH, constants.PATH_FROM_myApp)
    utils.debug_print("myApp " + FILE_PATH)

    def __init__(self, db):
        self.app_id = 'PoolBasedBinaryClassification'
        self.TargetManager = next.apps.SimpleTargetManager.SimpleTargetManager(
            db)
        utils.debug_print("initialized myApp again")

    def append(butler, row, key="data"):
        butler.memory.cache.lpush(key, pickle.dumps(row))

    def df2bytes(df):
        with BytesIO() as f:
            df.to_pickle(f)
            df_bytes = f.getvalue()
        return df_bytes

    def getitem(butler, index, key="data"):
        unlabelled_len = butler.algorithms.get(key="unlabelled_len")
        bytes_ = butler.memory.cache.lindex(key, unlabelled_len - index - 1)
        row = pickle.loads(bytes_)
        return row

    def initExp(self, butler, init_algs, args):
        utils.debug_print("experiment initialized again")
        args['n'] = len(args['targets']['targetset'])
        #Use when Amazon s3 is needed
        # bucket_id = args['bucket_id']
        # key_id = args['key_id']
        # secret_key = args['secret_key']
        # set(butler.memory, "bucket_id", bucket_id)
        # set(butler.memory, "key_id", key_id)
        # set(butler.memory, "secret_key", secret_key)

        samples_filename = self.FILE_PATH + "/samples.csv"
        samples_df = pd.read_csv(samples_filename)
        studies_filename = self.FILE_PATH + "/studies.csv"
        study_df = pd.read_csv(studies_filename)
        labels_filename = self.FILE_PATH + "/Labels.csv"
        labels_df = pd.read_csv(labels_filename)

        #Loading ontologies,
        #This particular index contains ontologies that we are concerned with

        #Use this to integrate with s3
        # file_name_list = ['samples.csv','Labels.csv','studies.csv']
        # csv_content_dict = s3.get_csv_content_dict(bucket_id,file_name_list)
        # for filename,content in csv_content_dict.items():
        #     if filename is 'samples.csv':
        #         samples_df =  pd.read_csv(io.BytesIO(content))
        #     elif filename is 'Labels.csv':
        #         labels_df = pd.read_csv(io.BytesIO(content))
        #     elif filename is 'studies.csv':
        #         study_df = pd.read_csv(io.BytesIO(content))

        experiment = butler.experiment.get()

        df_sort = labels_df.groupby(['dataset_type'])
        for dataset_type, df_cur in df_sort:
            if (dataset_type == constants.UNLABELLED_TAG):
                unlabelled_indices = df_cur['index'].tolist()
            elif (dataset_type == constants.TRAIN_TAG):
                train_indices = df_cur['index'].values
            elif (dataset_type == constants.TEST_TAG):
                test_indices = df_cur['index'].values

        batch_no = labels_df['batch_no'].max()
        if pd.isnull(batch_no):
            batch_no = 0

        butler.memory.set("batch_no", pickle.dumps(batch_no + 1))
        train_df = samples_df.loc[samples_df['index'].isin(train_indices)]
        train_df['label'] = labels_df.loc[
            labels_df['index'].isin(train_indices), 'label']
        test_df = samples_df.loc[samples_df['index'].isin(test_indices)]
        test_df['label'] = labels_df.loc[labels_df['index'].isin(test_indices),
                                         'label']
        unlabelled_df = samples_df.loc[samples_df['index'].isin(
            unlabelled_indices)]
        study_id_list = unlabelled_df['sra_study_id'].tolist()
        unlabelled_indices = unlabelled_df['index'].tolist()

        for i, row in unlabelled_df.iterrows():
            set_debug(butler.memory,
                      str(row['index']),
                      row,
                      i,
                      verbose=i % 10000 == 0)

        utils.debug_print("done setting unlabelled")
        for i, row in study_df.iterrows():
            set(butler.memory, row['sra_study_id'], row)

        train_list = []
        acc_list = []
        set(butler.memory, "study_id_list", study_id_list)
        # Set  data in memory
        set(butler.memory, "train_data", train_df)
        set(butler.memory, "test_data", test_df)
        set(butler.memory, "unlabelled_data",
            unlabelled_df[['key_value', 'ontology_mapping']])
        set(butler.memory, "label_data", labels_df)
        set(butler.memory, "unlabelled_list", unlabelled_indices)
        butler.memory.set("train_list", pickle.dumps(train_list))
        butler.memory.set("acc_list", pickle.dumps(acc_list))

        alg_data = {
            'n': args['n'],
            'failure_probability': args['failure_probability'],
            'd': args['d']
        }

        init_algs(alg_data)
        return args

    def getQuery(self, butler, alg, args):

        sttime = time.time()
        alg_response = alg({'participant_uid': args['participant_uid']})

        # Get Unlabelled Set
        #alg_response contains index returned from LogisticRegressionActive getQuery method
        #Retrieve the row using this index
        unlabelled_row = butler.memory.get(str(alg_response))
        if unlabelled_row is None:
            utils.debug_print("No row was retrieved")
            return {}
        unlabelled_row = pickle.loads(unlabelled_row).replace(np.nan, "None")
        unlabelled_row_dict = unlabelled_row.to_dict()
        sra_study_id = unlabelled_row_dict.get('sra_study_id')
        sra_sample_id = unlabelled_row_dict.get('sra_sample_id')
        key_value = unlabelled_row_dict.get('key_value')
        #Convert from str to dict
        key_value_dict = ast.literal_eval(key_value)

        ontology_mapping = unlabelled_row_dict.get('ontology_mapping')
        # Convert from str to list
        ontology_mapping_list = ast.literal_eval(ontology_mapping)
        ont_mapping_dict = {}
        if ontology_mapping_list is None:
            ontology_mapping_list = []
        for ont in ontology_mapping_list:
            ont_org = ont
            return_link = ""
            #pre-processing steps
            ont = ont.replace(":", "_")
            '''
            "DOID": "DOID.17-01-30.obo",
            "UBERON": "UBERON.17-01-30.obo",
            "CL": "CL.18-11-13.obo",
            "CVCL": "CVCL.17-01-30.obo",
            "UO": "UO.17-01-30.obo",
            "EFO": "EFO.17-01-30.obo",
            "CHEXBI": "CHEBI.17-01-30.obo",
            "GO": "GO.19-01-18.obo"   '''

            #TODO: Other terms link
            if "CL" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/cl/terms?short_form=" + ont
            elif "UBERON" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/uberon/terms?short_form=" + ont
            elif "DOID" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/doid/terms?short_form=" + ont
            elif "EFO" in ont:
                return_link = "https://www.ebi.ac.uk/ols/ontologies/efo/terms?short_form=" + ont
            elif "CVCL" in ont:
                return_link = "https://web.expasy.org/cellosaurus/" + ont
            ont_mapping_dict[ont_org] = return_link
        #retrieve study row based on study_id
        study_row_str = pickle.loads(butler.memory.get(sra_study_id)).replace(
            np.nan, "None")
        study_row_json = study_row_str.to_dict()
        #Class-wise confidence of all classes
        cur_confidence = butler.memory.get("cur_confidence")
        if cur_confidence is None:
            cur_confidence = pickle.dumps([])
        cur_confidence = pickle.loads(cur_confidence)
        utils.debug_print(cur_confidence)
        #Get name of classes
        lr_classes = butler.memory.get("lr_classes")
        if lr_classes is None:
            lr_classes = pickle.dumps([])
        lr_classes = pickle.loads(lr_classes)
        #this is what is received in widgets/getQuery_widget.html
        ret = {
            'target_indices': unlabelled_row_dict,
            'study': study_row_json,
            'key_value': key_value_dict,
            'ontology_mapping': ont_mapping_dict,
            'cur_confidence': cur_confidence,
            'lr_classes': lr_classes,
            'sra_sample_id': sra_sample_id
        }
        return ret

    def processAnswer(self, butler, alg, args):
        query = butler.queries.get(uid=args['query_uid'])

        target = query['target_indices']
        target_label = args['target_label']
        #DEBUG
        # utils.debug_print("type(target_label)")
        # utils.debug_print(type(target_label))
        num_reported_answers = butler.experiment.increment(
            key='num_reported_answers_for_' + query['alg_label'])
        labelled_row = pickle.loads(butler.memory.get(str(target['index'])))
        if labelled_row is None:
            utils.debug_print("Labelled row doesnt exist")
            return {}
        # make a getModel call ~ every n/4 queries - note that this query will NOT be included in the predict
        experiment = butler.experiment.get()
        d = experiment['args']['d']
        # if num_reported_answers % ((d+4)/4) == 0:
        #     butler.job('getModel', json.dumps({'exp_uid':butler.exp_uid,'args':{'alg_label':query['alg_label'], 'logging':True}}))
        alg({'target_index': target['index'], 'target_label': target_label})
        return {'target_index': target['index'], 'target_label': target_label}

    def getModel(self, butler, alg, args):
        return alg()
Example #48
    def get(self, exp_uid):
        """
        .. http:get:: /experiment/<exp_uid>/participants

        Get all participant response data associated with a given exp_uid.

        **Example request**:

        .. sourcecode:: http

        GET /experiment/<exp_uid>/participants HTTP/1.1
        Host: next_backend.next.discovery.wisc.edu

        **Example response**:

        .. sourcecode:: http
        
        HTTP/1.1 200 OK
        Vary: Accept
        Content-Type: application/json

        {
            "participant_responses": [participant_responses],
            "status": {
                "code": 200,
                "status": "OK"
            }
        }
        
        :>json all_participant_responses: list of all participant_responses

        :statuscode 200: Participants responses successfully returned
        :statuscode 400: Participants responses failed to be generated
    	"""
        true_values = {1, '1', 'True', 'true'}
        zip_true = False
        if 'zip' in request.args.keys():
            zip_true = True if request.args.get('zip') in true_values else False
        csv = False
        if 'csv' in request.args.keys():
            csv = True if request.args.get('csv') in true_values else False

        # Get all participants for exp_uid from resource_manager
        participant_uids = resource_manager.get_participant_uids(exp_uid)
        participant_responses = {}

        # Iterate through list of all participants for specified exp_uid
        for participant in participant_uids:
            response = resource_manager.get_participant_data(participant,
                                                             exp_uid)
            # Append participant query responses to list
            participant_responses[participant] = response

        if csv:
            responses = []
            for participant in participant_uids:
                response = resource_manager.get_participant_data(participant,
                                                                 exp_uid)
                for r in response:
                    responses += [r]

            try:
                response_file = parse_responses(responses)
            except ValueError as e:
                message = str(e)
                message += '\n\n' + str(traceback.format_exc())
                utils.debug_print(message)
                return message

        all_responses = {'participant_responses': participant_responses}
        if zip_true:
            filename, content = ('responses.json', json.dumps(all_responses))
            if csv:
                filename, content = ('responses.csv', response_file.getvalue())

            zip_responses = BytesIO()
            with zipfile.ZipFile(zip_responses, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as zf:
                zf.writestr(filename, content)
            zip_responses.seek(0)

            return send_file(zip_responses,
                             attachment_filename=filename + '.zip',
                             as_attachment=True)
        else:
            return api_util.attach_meta(all_responses, meta_success), 200
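A hedged usage sketch for this endpoint: the host is a placeholder, the path follows the docstring above, and the query-string names (zip, csv) come from the handler. Requesting both should stream a ZIP archive containing responses.csv; omitting zip returns the JSON body shown in the example response.

# Hypothetical client call; <host> and <exp_uid> are placeholders.
import requests

resp = requests.get("http://<host>/experiment/<exp_uid>/participants",
                    params={"zip": "1", "csv": "1"})
with open("responses.zip", "wb") as f:
    f.write(resp.content)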
Example #49
    def post(self):
        # TODO? replace with msgpack
        args = self.deserialise(request.get_data())

        # Unpacking the YAML/ZIP file
        for key in args:
            if key not in {'bucket_id', 'key_id', 'secret_key'}:
                comma_idx = args[key].find(',')
                args[key] = args[key][comma_idx + 1:]
                if args[key] in {'True', 'False'}:
                    args[key] = True if args[key] == 'True' else False
                else:
                    args[key] = base64.decodestring(args[key])

        if all(
            [key not in args
             for key in ['bucket_id', 'key_id', 'secret_key']]):
            args['upload'] = False
        else:
            args['upload'] = True

        args['args'] = yaml.load(args['args'])

        try:
            init_exp_args = args['args']
            if 'targets' in args.keys():
                target_zipfile = args['targets']
                if args.get('upload', True):
                    bucket_id = args['bucket_id']
                    key_id = args['key_id']
                    secret_key = args['secret_key']
                    init_exp_args['args']['bucket_id'] = bucket_id
                    init_exp_args['args']['key_id'] = key_id
                    init_exp_args['args']['secret_key'] = secret_key
                    targets = target_unpacker.unpack(target_zipfile, key_id,
                                                     secret_key, bucket_id)
                else:
                    filenames = target_unpacker.get_filenames_from_zip(
                        target_zipfile)
                    utils.debug_print("This will be bold and yellow!")
                    utils.debug_print(filenames)
                    utils.debug_print("This will be bold and yellow!")
                    if len(filenames) != 1:
                        raise ValueError(
                            'Specify exactly one file in the ZIP file')
                    filename = filenames[0]
                    extension = filename.split('.')[-1]
                    targets = target_unpacker.unpack_text_file(target_zipfile,
                                                               kind=extension)
                init_exp_args['args']['targets'] = {'targetset': targets}

                if 'keys_for_all_targets' in init_exp_args['args']:
                    pairs = init_exp_args['args']['keys_for_all_targets']

                    for pair in pairs:
                        map(
                            lambda target: target.update(
                                {pair['key']: pair['value']}),
                            init_exp_args['args']['targets']['targetset'])

            # Init the experiment:
            app_id = init_exp_args['app_id']

            exp_uid = '%030x' % random.randrange(16**30)

            r = broker.applyAsync(app_id, exp_uid, 'initExp',
                                  json.dumps(init_exp_args))
            response_json, didSucceed, message = r
            if not didSucceed:
                raise ValueError(message)
        except:
            tb = traceback.format_exc()
            info = sys.exc_info()
            if hasattr(info[1], 'message') and len(info[1].message) > 0:
                message = info[1].message
                if 'time' in message:
                    message += (
                        "\nNOTE: error has to do with time; try "
                        "restarting docker, more detail at "
                        "https://stackoverflow.com/questions/27674968/amazon-s3-docker-403-forbidden-the-difference-between-the-request-time-and"
                    )
            else:
                message = '\n'.join(tb.split('\n')[-5:])
            message = message + '\n\nDetails:\n' + tb

            return {'success': False, 'message': message, 'exp_uid': None}

        return {
            'success': didSucceed,
            'message': message,
            'exp_uid': exp_uid,
            'app_id': args['args']['app_id']
        }
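The loop at the top of this handler expects every form field (except the S3 credentials) to arrive as "<prefix>,<base64 payload>": it keeps only what follows the first comma and base64-decodes it (unless the remainder is literally 'True' or 'False'), with 'args' carrying a YAML experiment description and 'targets' a ZIP of target files. Below is a hedged sketch of how a client might encode such a payload; the data-URL prefix and the file names are placeholders.

# Hypothetical client-side encoding for this endpoint: each value is sent as
# "<prefix>,<base64 payload>"; only the part after the first comma is decoded.
import base64

def encode_field(raw_bytes):
    return "data:application/octet-stream;base64," + base64.b64encode(raw_bytes)

with open("init.yaml", "rb") as f:       # hypothetical experiment description
    args_field = encode_field(f.read())
with open("targets.zip", "rb") as f:     # hypothetical target archive
    targets_field = encode_field(f.read())

payload = {"args": args_field, "targets": targets_field}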
Example #50
    def pop_list(self, database_id, bucket_id, doc_uid, key, value):
        """
        pops a value from a list.
        If value=0, pops from start of list
        If value=-1, pops from end of list.
        (note this is inconsistent with Mongo's api to be consistent with python's pop)
        See declaration of mongo_index for more info.
        
        Inputs: 
            (string) database_id, (string) bucket_id, (string) doc_uid, (string) key, (int) value
        
        Outputs:
            (any) value, (bool) didSucceed, (string) message 
        
        Usage: ::\n
            value, didSucceed, message = db.pop_list(database_id, bucket_id, doc_uid, key, value)
        """
        if self.client is None:
            didSucceed, message = self.connectToMongoServer()
            if not didSucceed:
                return None, False, message
        # For Mongo's $pop, 1 means last element, -1 means first element
        try:
            if value == -1:
                mongo_index = 1
            elif value == 0:
                mongo_index = -1
            else:
                raise DatabaseException(
                    "can only pop first (value=0) or last (value=-1) element")
            try:
                return_value = self.client[database_id][
                    bucket_id].find_and_modify({"_id": doc_uid},
                                               {'$pop': {
                                                   key: mongo_index
                                               }})[key]
            except KeyError as e:
                if e.args[0] == key:
                    raise DatabaseException(
                        "key '{}' not found in document '{}.{}'".format(
                            key, database_id, bucket_id))
                elif e.args[0] == bucket_id:
                    raise DatabaseException(
                        "bucket '{}' not found in database '{}'".format(
                            bucket_id, database_id))
                elif e.args[0] == database_id:
                    raise DatabaseException(
                        "database '{}' not found".format(database_id))
                else:
                    raise DatabaseException(
                        "unknown KeyError: '{}' not found".format(e))
            except OperationFailure:  # This gets thrown if you try to pop from a non-list
                raise DatabaseException("cannot pop from non-list")
            if return_value:
                return_value = return_value[value]
            else:
                raise DatabaseException("cannot pop from empty list")
            return_value = self.undoDatabaseFormat(return_value)

            return return_value, True, 'From Mongo'
        except DatabaseException as e:
            error = "PermStore.pop_list failed with exception: {}".format(e)
            utils.debug_print(error)
            return None, False, error
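A short usage sketch matching the documented semantics; database_id, bucket_id and doc_uid are placeholders for an existing document whose 'my_list' field holds a list.

# value=0 pops from the start of the list, value=-1 pops from the end.
first_item, didSucceed, message = db.pop_list(database_id, bucket_id, doc_uid, 'my_list', 0)
last_item, didSucceed, message = db.pop_list(database_id, bucket_id, doc_uid, 'my_list', -1)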
Example #51
def mongo_mem_set(butler, key, value):
    #Butler.algorithms is essentially set in mongoDB
    try:
        butler.algorithms.set(key=key, value=value)
    except Exception as e:
        utils.debug_print("Could not set " + key + " in mongodb")
Example #52
 def getModel(self, butler):
     # The model is simply the vector of weights and a record of the number of reported answers.
     utils.debug_print(butler.algorithms.get(key=['weights', 'num_reported_answers']))
     return butler.algorithms.get(key=['weights', 'num_reported_answers'])