def post_table(self, params, files):
    # Create a new image-class annotation task from table slices.
    #
    # All incoming param keys and values are base64-encoded by the client and
    # decoded up front. File uploads are rejected (400). Only the
    # 'images/class' path is currently supported; anything else aborts 400.
    #
    # Returns: {'task': <b64 task id>} on success.
    if files:
        bottle.abort(400)
    params = {base64.b64decode(k): base64.b64decode(v) for k, v in params.items()}
    path = params['path']
    # NOTE(review): parse_slices() is called with no arguments here — presumably
    # it reads the slice spec from the request context; confirm against its
    # definition elsewhere in the file.
    start_stop_rows = parse_slices()
    if path in ('images/class',):
        # Validate read access on every requested slice before doing any work.
        data_table = get_table(self._auth_user, path.split('/', 1)[0])
        for start_row, stop_row in start_stop_rows:
            data_table._slice_validate(start_row, stop_row, 'r')
        # We never need to decode these, they just need to be
        # random strings that can be in a url
        secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
        task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
        p = {}  # Keyword arguments for mturk_vision.manager / ANNOTATORS
        image_column = params['imageColumn']
        ub64 = base64.urlsafe_b64encode
        if path == 'images/class':
            class_column = params['classColumn']
            assert class_column.startswith('meta:')
            # Encode each (start, stop) pair into the data URL path.
            suffix = '/'.join(ub64(x) + '/' + ub64(y) for x, y in start_stop_rows)
            data = 'hbase://localhost:9090/images/%s?class=%s&image=%s' % (suffix, ub64(class_column), ub64(image_column))
            p['type'] = 'image_class'
            # Optional parameters: silently omitted when absent.
            try:
                p['class_descriptions'] = params['classDescriptions']
            except KeyError:
                pass
            try:
                p['class_thumbnails'] = params['classThumbnails']
            except KeyError:
                pass
        else:
            bottle.abort(400)
        if 'instructions' in params:
            p['instructions'] = params['instructions']
        p['num_tasks'] = int(params['numTasks'])
        assert 0 < p['num_tasks']
        assert params['mode'] in ('standalone', 'amt')
        p['mode'] = params['mode']
        p['task_key'] = task
        # NOTE(review): here add_task's return value is unpacked directly into
        # (host, port), while the annotate branch of post_slice does
        # .split(':') on it — confirm which return shape ANNOTATORS.add_task
        # actually has; these two call sites disagree.
        redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p)
        p['sync'] = True
        p['secret'] = secret
        p['redis_address'] = redis_host
        p['redis_port'] = int(redis_port)
        # Instantiated for its side effects (task setup); instance discarded.
        mturk_vision.manager(data=data, **p)
        return {'task': base64.b64encode(task)}
    else:
        bottle.abort(400)
def get_manager(self, task):
    """Return the annotation manager for *task*, building and caching it on first use.

    Raises via self.exists() if the task is unknown.
    """
    self.exists(task)
    if task not in self.cache:
        # Cache miss: rebuild the manager from the persisted task record.
        record = self.db.hgetall(task + ':annot')
        kwargs = json.loads(record['params'])
        kwargs.update(sync=False,
                      secret=record['_secret'],
                      redis_address=self.redis_host,
                      redis_port=int(self.redis_port))
        self.cache[task] = mturk_vision.manager(data=record['_data'], **kwargs)
    return self.cache[task]
def get_annotation_manager(self, task, data_connection, sync=False):
    """Build a fresh (uncached) annotation manager for *task*.

    Validates that the task exists and is of type 'annotation' before
    reconstructing the manager from its stored public and private params.
    """
    self._exists(task)
    self._check_type(task, 'annotation')
    record = self.db.hgetall(self._task_prefix + task)
    kwargs = json.loads(record['params'])
    private = json.loads(record['_params'])
    kwargs.update(sync=sync,
                  secret=str(private['secret']),
                  redis_address=self.annotation_redis_host,
                  redis_port=int(self.annotation_redis_port),
                  task_key=task)
    # TODO: Currently only compatible with thrift based datastores
    if data_connection:
        data_connection = data_connection._thrift
    return mturk_vision.manager(data=str(private['data']), data_connection=data_connection, **kwargs)
def get_manager(self, task):
    # Return the annotator manager for *task*, using self.cache when possible.
    #
    # TODO: Need to ensure no races due to a worker running while the DB is destroyed
    # can fix by putting a unique key in state_db, that is verified each call
    # but needs to lock the race down (check and set?)
    #
    # The DB record is fetched even on a cache hit — see the comment below:
    # the fetch doubles as an existence check before the cached instance is
    # reused. NOTE(review): hgetall on a missing key typically returns an
    # empty mapping, so data['params'] would raise KeyError for an unknown
    # task — confirm that is the intended failure mode.
    data = self.db.hgetall(self.annotator_prefix + task)
    p = json.loads(data['params'])
    p['setup'] = False
    p['reset'] = False
    p['secret'] = data['_secret']
    # '_redis_host_port' is stored as "host:port".
    redis_host, redis_port = data['_redis_host_port'].split(':')
    p['redis_address'] = redis_host
    p['redis_port'] = int(redis_port)
    # This ensures that the task still exists before we reuse the cache
    try:
        return self.cache[task]
    except KeyError:
        self.cache[task] = mturk_vision.manager(data=data['_data'], **p)
        return self.cache[task]
def server(**args):
    """Create the global MANAGER and serve the bottle app over gevent (blocking).

    Requires a 'port' entry in *args*; remaining keyword args are forwarded
    to mturk_vision.manager. Never returns under normal operation.
    """
    global MANAGER, SERVER
    MANAGER = mturk_vision.manager(**args)
    bind_address = ('0.0.0.0', int(args['port']))
    SERVER = gevent.pywsgi.WSGIServer(bind_address, bottle.app())
    SERVER.serve_forever()
def post_slice(self, start_row, stop_row, params, files):
    """Dispatch an action over a table slice [start_row, stop_row).

    Supported actions: io/* batch jobs (thumbnail, exif, preprocess,
    classify, feature, link, chain, hash), i/dedupe/identical,
    o/crawl/flickr, and the io/annotate/image/* task creators.
    Unknown actions abort 400.
    """
    action = params['action']
    with thrift_lock() as thrift:
        manager = PicarusManager(thrift=thrift)
        if action == 'io/thumbnail':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/exif':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_exif(start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/preprocess':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.image_preprocessor(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/classify':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/feature':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.takeout_link_job(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/link':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = base64.urlsafe_b64decode(params['model'])
            chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
            manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/chain':
            self._slice_validate(start_row, stop_row, 'rw')
            model_key = base64.urlsafe_b64decode(params['model'])
            chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
            # The chain consumes the first link's input; subsequent inputs are
            # produced internally by the chain itself.
            manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'io/hash':
            self._slice_validate(start_row, stop_row, 'rw')
            manager.feature_to_hash(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
            return {}
        elif action == 'i/dedupe/identical':
            self._slice_validate(start_row, stop_row, 'r')
            col = base64.urlsafe_b64decode(params['column'])
            # Group rows by the MD5 of the column value; any group with more
            # than one row is a set of byte-identical duplicates.
            features = {}
            dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.urlsafe_b64encode(x))
            for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                     start_row=start_row, per_call=10,
                                                                     stop_row=stop_row):
                dedupe_feature(cur_row, cur_col)
            bottle.response.headers["Content-type"] = "application/json"
            return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
        elif action == 'o/crawl/flickr':
            self._slice_validate(start_row, stop_row, 'w')
            # Only slices where the start_row can be used as a prefix may be used
            assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
            p = {}
            row_prefix = start_row
            assert row_prefix.find(':') != -1
            class_name = params['className']
            query = params.get('query')
            query = class_name if query is None else query
            # BUG FIX: these three lines previously read
            #   p['lat'] = query = params.get('lat')
            # (and likewise for lon/radius), which clobbered the search query
            # computed above with the last geo parameter before it was passed
            # to flickr_crawl. The chained `query =` assignments are removed.
            p['lat'] = params.get('lat')
            p['lon'] = params.get('lon')
            p['radius'] = params.get('radius')
            # NOTE(review): missing lat/lon/radius still produce None-valued
            # kwargs (original behavior) — confirm flickr_crawl tolerates None.
            p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
            p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
            if 'hasGeo' in params:
                p['has_geo'] = params['hasGeo'] == '1'
            # Optional numeric parameters: silently omitted when absent.
            try:
                p['min_upload_date'] = int(params['minUploadDate'])
            except KeyError:
                pass
            try:
                p['max_upload_date'] = int(params['maxUploadDate'])
            except KeyError:
                pass
            try:
                p['page'] = int(params['page'])
            except KeyError:
                pass
            return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name, query, **p)}
        elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
            self._slice_validate(start_row, stop_row, 'r')
            # Random url-safe tokens; never decoded, only used in URLs.
            secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
            task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
            p = {}
            image_column = base64.urlsafe_b64decode(params['imageColumn'])
            if action == 'io/annotate/image/entity':
                entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), entity_column, image_column)
                p['type'] = 'image_entity'
            elif action == 'io/annotate/image/query':
                query = params['query']
                data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                p['type'] = 'image_query'
                p['query'] = query
            elif action == 'io/annotate/image/query_batch':
                query = params['query']
                data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                p['type'] = 'image_query_batch'
                p['query'] = query
            else:
                bottle.abort(400)
            p['num_tasks'] = 100
            p['mode'] = 'standalone'
            try:
                redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
            except annotators.CapacityException:
                bottle.abort(503)
            p['setup'] = True
            p['reset'] = True
            p['secret'] = secret
            p['redis_address'] = redis_host
            p['redis_port'] = int(redis_port)
            # Instantiated for its side effects (task setup); instance discarded.
            mturk_vision.manager(data=data, **p)
            return {'task': task}
        else:
            bottle.abort(400)