Example #1
 def post_slice(self, start_row, stop_row, params, files):
     action = params['action']
     with thrift_lock() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/preprocess':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_preprocessor(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/classify':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_prediction(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/feature':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.takeout_link_job(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_input, model_link = _takeout_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = base64.urlsafe_b64decode(params['model'])
             chain_inputs, model_chain = zip(*_takeout_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/hash':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.feature_to_hash(base64.urlsafe_b64decode(params['model']), start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = base64.urlsafe_b64decode(params['column'])
             features = {}
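             # Map md5(column value) -> list of row keys; any digest that ends up
             # with more than one row key marks a set of identical duplicates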
             dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.urlsafe_b64encode(x))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
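             # i.e. stop_row must be start_row with its last byte incremented by one,
             # so the scan range [start_row, stop_row) covers exactly that prefix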
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             class_name = params['className']
             query = params.get('query')
             query = class_name if query is None else query
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name, query, **p)}
         elif action in ('io/annotate/image/query', 'io/annotate/image/entity', 'io/annotate/image/query_batch'):
             self._slice_validate(start_row, stop_row, 'r')
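             # Set up a crowdsourcing annotation job: ANNOTATORS.add_task returns a
             # Redis host:port and mturk_vision is configured against the HBase
             # slice described by the data URI built below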
             secret = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             task = base64.urlsafe_b64encode(uuid.uuid4().bytes)[:-2]
             p = {}
             image_column = base64.urlsafe_b64decode(params['imageColumn'])
             if action == 'io/annotate/image/entity':
                 entity_column = base64.urlsafe_b64decode(params['entityColumn'])
                 data = 'hbase://localhost:9090/images/%s/%s?entity=%s&image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row),
                                                                                    entity_column, image_column)
                 p['type'] = 'image_entity'
             elif action == 'io/annotate/image/query':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query'
                 p['query'] = query
             elif action == 'io/annotate/image/query_batch':
                 query = params['query']
                 data = 'hbase://localhost:9090/images/%s/%s?image=%s' % (base64.urlsafe_b64encode(start_row), base64.urlsafe_b64encode(stop_row), image_column)
                 p['type'] = 'image_query_batch'
                 p['query'] = query
             else:
                 bottle.abort(400)
             p['num_tasks'] = 100
             p['mode'] = 'standalone'
             try:
                 redis_host, redis_port = ANNOTATORS.add_task(task, self.owner, secret, data, p).split(':')
             except annotators.CapacityException:
                 bottle.abort(503)
             p['setup'] = True
             p['reset'] = True
             p['secret'] = secret
             p['redis_address'] = redis_host
             p['redis_port'] = int(redis_port)
             mturk_vision.manager(data=data, **p)
             return {'task': task}
         else:
             bottle.abort(400)
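
The `o/crawl/flickr` branch only accepts slices that correspond exactly to a row-key prefix, which is what the assert above enforces. Below is a minimal sketch of how such a (start_row, stop_row) pair can be derived from a prefix, assuming Python 2 byte strings as in the code above; the prefix value is illustrative.

    def prefix_to_slice(row_prefix):
        # stop_row is the prefix with its last byte incremented, so the HBase
        # scan range [start_row, stop_row) covers exactly the prefixed rows
        assert row_prefix and ord(row_prefix[-1]) != 255
        return row_prefix, row_prefix[:-1] + chr(ord(row_prefix[-1]) + 1)

    start_row, stop_row = prefix_to_slice('flickr:sunset')
    # start_row == 'flickr:sunset', stop_row == 'flickr:sunseu'
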
Example #2
 def post_slice(self, start_row, stop_row, params, files):
     if files:
         bottle.abort(400)
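     # Parameter values arrive base64-encoded; decode them all up front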
     params = {k: base64.b64decode(v) for k, v in params.items()}
     action = params['action']
     with thrift_new() as thrift:
         manager = PicarusManager(thrift=thrift)
         if action == 'io/thumbnail':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_thumbnail(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/exif':
             self._slice_validate(start_row, stop_row, 'rw')
             manager.image_exif(start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/link':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             chain_input, model_link = _takeout_input_model_link_from_key(manager, model_key)
             manager.takeout_chain_job([model_link], chain_input, model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'io/chain':
             self._slice_validate(start_row, stop_row, 'rw')
             model_key = params['model']
             chain_inputs, model_chain = zip(*_takeout_input_model_chain_from_key(manager, model_key))
             manager.takeout_chain_job(list(model_chain), chain_inputs[0], model_key, start_row=start_row, stop_row=stop_row)
             return {}
         elif action == 'i/faces':
             # TODO: Temporary, remove when done
             names = set(['George_W_Bush', 'Colin_Powell', 'Tony_Blair', 'Donald_Rumsfeld', 'Gerhard_Schroeder',
                          'Ariel_Sharon', 'Hugo_Chavez', 'Junichiro_Koizumi', 'Serena_Williams', 'John_Ashcroft'])
             self._slice_validate(start_row, stop_row, 'r')
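             # Train an LBPH face recognizer on the first num_train cropped faces,
             # then report prediction accuracy on the remaining rows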
             import cv2
             r = None
             labels = {}
             pos = 0
             neg = 0
             data = []
             lab = []
             num_train = 2000
             for n, (cur_row, cur_cols) in enumerate(hadoopy_hbase.scanner(thrift, self.table,
                                                                           start_row=start_row, per_call=10,
                                                                           stop_row=stop_row, columns=['data:image', 'meta:class'])):
                 cur_class = cur_cols['meta:class']
                 if cur_class not in names:
                     continue
                 if cur_class not in labels:
                     labels[cur_class] = len(labels)
                 label = labels[cur_class]
                 image = cv2.imdecode(np.fromstring(cur_cols['data:image'], np.uint8), 0)
                 # Crop
                 image = np.ascontiguousarray(image[62:-62, 62:-62])
                 #if n == 0:
                 #    cv2.imwrite('out.png', image)
                 if n < num_train:
                     lab.append(label)
                     data.append(image)
                 else:
                     if r is None:
                         r = cv2.createLBPHFaceRecognizer()
                         r.train(data, np.array(lab))
                         print('TRAINED-----------------------')
                     pred = r.predict(image)[0]
                     print((pred, label))
                     if pred == label:
                         pos += 1
                     else:
                         neg += 1
                 print((cur_class, image.shape, n, pos, neg, pos / float(pos + neg + .00000001)))
         elif action == 'io/garbage':
             self._slice_validate(start_row, stop_row, 'rw')
             columns_removed = set()
             columns_kept = set()
             # TODO: Get all user models and save those too
             active_models = set()
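             # Scan with a KeyOnlyFilter (no cell values) and bucket every column into
             # kept vs. removable: anything outside meta:/thum:/data:image and the
             # active models is considered removable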
             for cur_row, cur_cols in hadoopy_hbase.scanner(thrift, self.table, filter='KeyOnlyFilter()',
                                                            start_row=start_row, per_call=10,
                                                            stop_row=stop_row):
                 for k in cur_cols.keys():
                     if not (k.startswith('meta:') or k.startswith('thum:') or k == 'data:image' or k in active_models):
                         if k not in columns_removed:
                             columns_removed.add(k)
                             print(columns_removed)
                             print(len(columns_removed))
                     else:
                         if k not in columns_kept:
                             columns_kept.add(k)
                             print(columns_kept)
                             print(len(columns_kept))
             return {'columnsRemoved': list(columns_removed), 'columnsKept': list(columns_kept)}
         elif action == 'i/dedupe/identical':
             self._slice_validate(start_row, stop_row, 'r')
             col = params['column']
             features = {}
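             # Same duplicate detection as Example #1: bucket row keys by the
             # md5 digest of the column value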
             dedupe_feature = lambda x, y: features.setdefault(base64.b64encode(hashlib.md5(y).digest()), []).append(base64.b64encode(x))
             for cur_row, cur_col in hadoopy_hbase.scanner_row_column(thrift, self.table, column=col,
                                                                      start_row=start_row, per_call=10,
                                                                      stop_row=stop_row):
                 dedupe_feature(cur_row, cur_col)
             bottle.response.headers["Content-type"] = "application/json"
             return json.dumps([{'rows': y} for x, y in features.items() if len(y) > 1])
         elif action == 'o/crawl/flickr':
             self._slice_validate(start_row, stop_row, 'w')
             # Only slices where the start_row can be used as a prefix may be used
             assert start_row and ord(start_row[-1]) != 255 and start_row[:-1] + chr(ord(start_row[-1]) + 1) == stop_row
             p = {}
             row_prefix = start_row
             assert row_prefix.find(':') != -1
             print('params[%r]' % params)
             class_name = params.get('className')
             query = params['query']
             p['lat'] = params.get('lat')
             p['lon'] = params.get('lon')
             p['radius'] = params.get('radius')
             p['api_key'] = params.get('apiKey', FLICKR_API_KEY)
             p['api_secret'] = params.get('apiSecret', FLICKR_API_SECRET)
             if not p['api_key'] or not p['api_secret']:
                 bottle.abort(400)  # Either we don't have a default or the user provided an empty key
             if 'hasGeo' in params:
                 p['has_geo'] = params['hasGeo'] == '1'
             if 'onePerOwner' in params:
                 p['one_per_owner'] = params['onePerOwner'] == '1'
             try:
                 p['min_upload_date'] = int(params['minUploadDate'])
             except KeyError:
                 pass
             try:
                 p['max_rows'] = int(params['maxRows'])
             except KeyError:
                 pass
             try:
                 p['max_upload_date'] = int(params['maxUploadDate'])
             except KeyError:
                 pass
             try:
                 p['page'] = int(params['page'])
             except KeyError:
                 pass
             return {'numRows': crawlers.flickr_crawl(crawlers.HBaseCrawlerStore(thrift, row_prefix), class_name=class_name, query=query, **p)}
         else:
             bottle.abort(400)
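
Unlike Example #1, Example #2 expects every parameter value to arrive base64-encoded and decodes them all before dispatching. Below is a small sketch of how a caller might prepare the params dict, assuming Python 2 strings as above; the action and model values are illustrative, and the surrounding handler and table setup are omitted.

    import base64

    raw = {'action': 'io/link', 'model': 'pred:example-model'}  # illustrative values
    params = {k: base64.b64encode(v) for k, v in raw.items()}
    # post_slice() then recovers the raw values via base64.b64decode before
    # dispatching on params['action'].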