def delete_rows(prefix):
    assert ord(prefix[-1]) != 255
    stop_row = prefix[:-1] + chr(ord(prefix[-1]) + 1)
    for x, y in hadoopy_hbase.scanner(c, "images", start_row=prefix, stop_row=stop_row):
        assert x.startswith(prefix)
        print(repr(x))
        c.deleteAllRow("images", x)
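# Usage sketch for delete_rows, assuming a local Thrift gateway bound to the
# module-level client `c` used above and an 'images' table; the prefix is
# illustrative. The stop row is formed by incrementing the last byte of the
# prefix, so the scan covers exactly the keys sharing that prefix.
import hadoopy_hbase
c = hadoopy_hbase.connect('localhost')
delete_rows('sun397train')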
def _features_to_classifier(self, classifier, class_positive, input_table, input_feature_column, input_class_column,
                            output_table, output_row, output_column, max_per_label=None, **kw):
    row_dict = hadoopy_hbase.HBaseRowDict(output_table, output_column, db=self.hb)
    row_cols = hadoopy_hbase.scanner(self.hb, input_table, columns=[input_feature_column, input_class_column], **kw)
    label_features = {0: [], 1: []}
    for row, cols in row_cols:
        if max_per_label is not None and len(label_features[0]) >= max_per_label and len(label_features[1]) >= max_per_label:
            break
        label = int(cols[input_class_column] == class_positive)
        if max_per_label is None or len(label_features[label]) < max_per_label:
            print(label)
            print(cols[input_class_column])
            label_features[label].append(cols[input_feature_column])
        else:
            print('Skipping[%d]' % label)
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features_to_classifier(classifier, labels, features)
    cp = picarus.api.Classifier()
    cp.name = '%s-%s-indoor' % (self.images_table, self.feature_name)  # TODO(brandyn): Indoor specific ATM
    if isinstance(self.feature_dict, dict):
        cp.feature = json.dumps(self.feature_dict)
        cp.feature_format = cp.JSON_IMPORT
    else:
        cp.feature = pickle.dumps(self.feature_dict, -1)
        cp.feature_format = cp.PICKLE
    cp.classifier = pickle.dumps(classifier, -1)
    cp.classifier_format = cp.PICKLE
    row_dict[output_row] = cp.SerializeToString()
    print('Train')
def evaluate_classifier_class_distance_list(self, classifier_key, **kw):
    classifier = picarus.api.feature_classifier_frompb(self.key_to_classifier_pb(classifier_key))
    input_dict = self.key_to_input_model_param(classifier_key)[0]
    feature_key = input_dict['feature']
    metadata_key = input_dict['meta']
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[feature_key, metadata_key], **kw)
    cm = {}  # [true][pred]
    total = 0
    correct = 0
    for row, columns in row_cols:
        feature = picarus.api.np_fromstring(columns[feature_key])
        print(feature.shape)
        c = classifier(feature)
        print(c)
        total += 1
        try:
            pred_class = c[0]['class']
        except IndexError:
            pred_class = ''
        true_class = columns[metadata_key]
        if pred_class == true_class:
            correct += 1
        try:
            cm.setdefault(true_class, {})[pred_class] += 1
        except KeyError:
            cm.setdefault(true_class, {})[pred_class] = 1
        print(cm)
        print(correct / float(total))
    print(correct / float(total))
    return {'cm': cm, 'total': total, 'correct': correct}
def livedata():
    print_request()
    row_to_time = lambda row: 2**31 - int(row[6:])
    time_to_row = lambda t: 'camera' + str(2**31 - t)
    out = []
    try:
        THRIFT_LOCK.acquire()
        start_row = 'camera'
        row_skip = 5
        cur_time = 0
        for _ in range(5):
            for row, cols in hadoopy_hbase.scanner(THRIFT, 'testtable', start_row=start_row, stop_row='camerb', max_rows=1):
                cur_time = row_to_time(row)
                out.append({'row': row, 'time': cur_time, 'columns': cols})
            cur_time -= row_skip
            start_row = time_to_row(cur_time)
            row_skip *= 2
            if cur_time < 0:
                break
    finally:
        THRIFT_LOCK.release()
    return {'data': out}
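# Round-trip sketch of the reversed-timestamp row keys used above: keys are
# 'camera' + str(2**31 - t), so newer timestamps produce lexicographically
# smaller keys and a forward scan returns the most recent row first. The
# timestamp value here is illustrative.
t = 1360000000
row = 'camera' + str(2**31 - t)
assert 2**31 - int(row[6:]) == t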
def get_slice(self, start_row, stop_row, columns, params, files):
    self._slice_validate(start_row, stop_row, 'r')
    max_rows = min(10000, int(params.get('maxRows', 1)))
    print('MaxRows[%d]' % max_rows)
    max_bytes = min(5242880, int(params.get('maxBytes', 5242880)))
    filter_string = params.get('filter')
    print('filter string[%s]' % filter_string)
    exclude_start = bool(int(params.get('excludeStart', 0)))
    out = []
    per_call = 1
    max_byte_count = 0
    with thrift_lock() as thrift:
        scanner = hadoopy_hbase.scanner(thrift, self.table, per_call=per_call, columns=columns,
                                        start_row=start_row, stop_row=stop_row, filter=filter_string)
        cur_row = start_row
        byte_count = 0
        for row_num, (cur_row, cur_columns) in enumerate(scanner, 1):
            if exclude_start and row_num == 1:
                continue
            out.append(encode_row(cur_row, cur_columns))
            cur_byte_count = self._byte_count_rows(out[-1:])
            byte_count += cur_byte_count
            # Compute the number of rows we should try to get by using the max sized row
            # that we have seen as an upper bound.
            max_byte_count = max(1, max(max_byte_count, cur_byte_count))
            per_call = max(1, min((max_bytes - byte_count) / max_byte_count, max_rows - len(out)))
            if len(out) >= max_rows or byte_count >= max_bytes:
                break
    bottle.response.headers["Content-type"] = "application/json"
    return json.dumps(out)
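# Minimal sketch of the batch-size estimate computed above, isolated for
# clarity (function name and inputs are hypothetical): the largest row seen
# so far serves as an upper bound on row size, so the remaining byte budget
# bounds how many rows are worth requesting per Thrift call.
def next_per_call(max_bytes, byte_count, max_byte_count, max_rows, num_fetched):
    return max(1, min((max_bytes - byte_count) // max_byte_count, max_rows - num_fetched))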
def classifier_sklearn(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    label_features = {0: [], 1: []}
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['feature'], inputs['meta']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            try:
                label = int(cols[inputs['meta']] == params['class_positive'])
                label_features[label].append(cols[inputs['feature']])
            except KeyError:
                continue
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features = np.asfarray([msgpack.loads(x)[0] for x in features])
    import sklearn.svm
    classifier = sklearn.svm.LinearSVC()
    classifier.fit(features, np.asarray(labels))
    factory_info = {'slices': slices, 'num_rows': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.LinearClassifier',
                  'kw': {'coefficients': classifier.coef_.tolist()[0], 'intercept': classifier.intercept_[0]}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'binary_class_confidence', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
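# Sketch of how the exported LinearClassifier parameters could be applied at
# prediction time (helper name hypothetical): the stored coefficients and
# intercept give a signed decision value, matching the
# 'binary_class_confidence' output type above.
import numpy as np

def linear_confidence(feature, coefficients, intercept):
    return float(np.dot(coefficients, feature) + intercept)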
def index_hamming_feature2d(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    hashes = []
    #keypoints = []
    labels = []
    indeces = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['feature2d_binary']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            f = msgpack.loads(cols[inputs['feature2d_binary']])
            print(f[2][0])
            hashes.append(f[0])
            #keypoints += f[1]
            indeces += [len(labels)] * f[2][0]
            labels.append(row)
            print(len(labels))
    hashes = ''.join(hashes)
    factory_info = {'slices': slices, 'num_hashes': len(indeces), 'num_images': len(labels),
                    'data': 'slices', 'params': params, 'inputs': inputsb64}  #'keypoints': keypoints,
    model_link = {'name': 'picarus.HammingFeature2dHashIndex',
                  'kw': {'hashes': hashes, 'indeces': indeces, 'labels': labels,
                         'max_results': params['max_results'],
                         'max_keypoint_results': params['max_keypoint_results'],
                         'hamming_thresh': params['hamming_thresh']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature2d_binary']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature2d_binary'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'feature2d_binary',
                                                  'output_type': 'distance_image_rows', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
def classifier_kernel_sklearn(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    label_features = {0: [], 1: []}
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['feature'], inputs['meta']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            try:
                label = int(cols[inputs['meta']] == params['class_positive'])
                label_features[label].append(cols[inputs['feature']])
            except KeyError:
                continue
    kernel = {'hik': kernels.histogram_intersection}[params['kernel']]
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features = np.asfarray([msgpack.loads(x)[0] for x in features])
    gram = kernel(features, features)
    import sklearn.svm
    classifier = sklearn.svm.SVC(kernel='precomputed')
    classifier.fit(gram, np.asarray(labels))
    factory_info = {'slices': slices, 'num_rows': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    support_vectors = features[classifier.support_, :].ravel().tolist()
    dual_coef = classifier.dual_coef_.ravel().tolist()
    intercept = float(classifier.intercept_.ravel()[0])
    model_link = {'name': 'picarus.KernelClassifier',
                  'kw': {'support_vectors': support_vectors, 'dual_coef': dual_coef,
                         'intercept': intercept, 'kernel': params['kernel']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'binary_class_confidence', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
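# Sketch of evaluating the exported KernelClassifier (helper name
# hypothetical; assumes the same kernel used at training time, e.g. the
# histogram-intersection kernel keyed 'hik' above, and that support_vectors
# has been reshaped from the raveled list back to (num_sv, dim)): the
# decision value is the dual-weighted kernel response against the stored
# support vectors plus the intercept.
import numpy as np

def kernel_confidence(feature, support_vectors, dual_coef, intercept, kernel):
    k = kernel(np.asfarray([feature]), np.asfarray(support_vectors))[0]
    return float(np.dot(dual_coef, k) + intercept)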
def classifier_localnbnn(params, inputsub64, schema):
    print(inputsub64)
    inputs = {x: base64.urlsafe_b64decode(y) for x, y in inputsub64.items()}
    print(inputs)
    features = []
    indeces = []
    num_features = 0
    feature_size = 0
    labels_dict = {}
    labels = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, data_table.table, columns=[inputs['multi_feature'], inputs['meta']],
                                         start_row=start_row, stop_row=stop_row)
        for _, cols in row_cols:
            try:
                label = cols[inputs['meta']]
                f, s = msgpack.unpackb(cols[inputs['multi_feature']])
                if label not in labels_dict:
                    labels_dict[label] = len(labels_dict)
                    labels.append(label)
                feature_size = s[1]
                num_features += s[0]
                features += f
                indeces += [labels_dict[label]] * s[0]
            except KeyError:
                pass
    factory_info = {'slices': slices, 'data': 'slices', 'params': params, 'inputs': inputsub64}
    model = {'name': 'picarus.LocalNBNNClassifier',
             'kw': {'features': features, 'indeces': indeces, 'labels': labels,
                    'feature_size': feature_size, 'max_results': params['max_results']}}
    return {'input': inputsub64['multi_feature'], 'model': model, 'input_type': 'feature',
            'output_type': 'multi_class_distance', 'email': self.owner,
            'name': manager.model_to_name(model), 'factory_info': json.dumps(factory_info)}
def classifier_sklearn(params, inputs, schema):
    label_features = {0: [], 1: []}
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, data_table.table,
                                         columns=[base64.urlsafe_b64decode(inputs['feature']),
                                                  base64.urlsafe_b64decode(inputs['meta'])],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            try:
                label = int(cols[base64.urlsafe_b64decode(inputs['meta'])] == params['class_positive'])
                label_features[label].append(cols[base64.urlsafe_b64decode(inputs['feature'])])
            except KeyError:
                continue
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features = np.asfarray([msgpack.unpackb(x)[0] for x in features])
    num_nans = 0
    for feature in features:
        if np.any(np.isnan(feature)):
            num_nans += 1
            print(feature)
    print('NumNans[%d]' % num_nans)
    import sklearn.svm
    classifier = sklearn.svm.LinearSVC()
    classifier.fit(features, np.asarray(labels))
    factory_info = {'slices': slices, 'num_rows': len(features), 'data': 'slices', 'params': params, 'inputs': inputs}
    model = {'name': 'picarus.LinearClassifier',
             'kw': {'coefficients': classifier.coef_.tolist()[0], 'intercept': classifier.intercept_[0]}}
    return {'input': inputs['feature'], 'model': model, 'input_type': 'feature',
            'output_type': 'binary_class_confidence', 'email': self.owner,
            'name': manager.model_to_name(model), 'factory_info': json.dumps(factory_info)}
def classifier_localnbnn(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    features = []
    indeces = []
    num_features = 0
    feature_size = 0
    labels_dict = {}
    labels = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['multi_feature'], inputs['meta']],
                                         start_row=start_row, stop_row=stop_row)
        for _, cols in row_cols:
            try:
                label = cols[inputs['meta']]
                f, s = msgpack.loads(cols[inputs['multi_feature']])
                if label not in labels_dict:
                    labels_dict[label] = len(labels_dict)
                    labels.append(label)
                feature_size = s[1]
                num_features += s[0]
                features += f
                indeces += [labels_dict[label]] * s[0]
            except KeyError:
                pass
    factory_info = {'slices': slices, 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.LocalNBNNClassifier',
                  'kw': {'features': features, 'indeces': indeces, 'labels': labels,
                         'feature_size': feature_size, 'max_results': params['max_results']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['multi_feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['multi_feature'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'multi_feature',
                                                  'output_type': 'multi_class_distance', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
def inner(num_rows, **kw):
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[self.image_column, self.indoor_class_column], **kw)
    for x, (_, cols) in enumerate(row_cols):
        print(repr(x))
        if x >= num_rows:
            break
        yield cols[self.indoor_class_column], imfeat.image_fromstring(cols[self.image_column])
def _build_index(self, si, index, input_table, input_hash_column, input_class_column,
                 output_table, output_row, output_column, **kw):
    row_dict = hadoopy_hbase.HBaseRowDict(output_table, output_column, db=self.hb)
    row_cols = hadoopy_hbase.scanner(self.hb, input_table, columns=[input_hash_column, input_class_column], **kw)
    metadata, hashes = zip(*[(json.dumps([cols[input_class_column], base64.b64encode(row)]), cols[input_hash_column])
                             for row, cols in row_cols])
    row_dict[output_row] = hashes_to_index(si, index, metadata, hashes)
def delete_slice(self, start_row, stop_row):
    self._slice_validate(start_row, stop_row, 'w')
    # NOTE: This only fetches rows that have a column in data:image (it is a significant optimization)
    # NOTE: Only parameters allowed, no "files" due to memory restrictions
    with thrift_lock() as thrift:
        for row, _ in hadoopy_hbase.scanner(thrift, self.table, start_row=start_row, stop_row=stop_row,
                                            filter='KeyOnlyFilter()'):
            thrift.deleteAllRow(self.table, row)
    return {}
def _scanner(self, *args, **kw):
    import hadoopy_hbase
    for x, y in self._slices:
        print((x, y))
        for z in hadoopy_hbase.scanner(self._hbase, self._table, start_row=x, stop_row=y, *args, **kw):
            yield z
def index_train(model_dict, model_param, inputs):
    index = call_import(model_dict)
    row_cols = hadoopy_hbase.scanner(thrift, self.table, columns=[inputs['hash'], inputs['meta']],
                                     start_row=start_row, stop_row=stop_row)
    metadata, hashes = zip(*[(json.dumps([cols[inputs['meta']], base64.urlsafe_b64encode(row)]), cols[inputs['hash']])
                             for row, cols in row_cols])
    hashes = np.ascontiguousarray(np.asfarray([np.fromstring(h, dtype=np.uint8) for h in hashes]))
    index = index.store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
    index.metadata = metadata
    return index
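# Sketch of the hash layout index_train assumes (values illustrative): each
# stored hash is a raw byte string, decoded as one np.uint8 per byte and then
# stacked into a contiguous (num_rows, hash_bytes) float array.
import numpy as np
h = '\x0f\xa0'  # a 16-bit hash as two raw bytes
print(np.fromstring(h, dtype=np.uint8))  # [ 15 160]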
def kmeans_cluster_mfeat(model_dict, model_param, inputs):
    # TODO: This needs to be finished, determine if we want quantizer level or cluster level
    clusterer = call_import(model_dict)
    features = []
    row_cols = hadoopy_hbase.scanner(thrift, self.table, columns=[inputs['multi_feature']],
                                     start_row=start_row, stop_row=stop_row)
    # TODO: We'll want to check that we aren't clustering too much data by placing constraints
    for row, columns in row_cols:
        features.append(picarus.api.np_fromstring(columns[inputs['multi_feature']]))
    features = np.vstack(features)
    return clusterer.cluster(features)
def patch_slice(self, start_row, stop_row, params, files):
    self._slice_validate(start_row, stop_row, 'w')
    # NOTE: This only fetches rows that have a column in data:image (it is a significant optimization)
    # NOTE: Only parameters allowed, no "files" due to memory restrictions
    mutations = []
    for x, y in params.items():
        mutations.append(hadoopy_hbase.Mutation(column=base64.urlsafe_b64decode(x), value=base64.b64decode(y)))
    if mutations:
        with thrift_lock() as thrift:
            for row, _ in hadoopy_hbase.scanner(thrift, self.table, start_row=start_row, stop_row=stop_row,
                                                filter='KeyOnlyFilter()', columns=['data:image']):
                thrift.mutateRow(self.table, row, mutations)
    return {}
def features_to_classifier_class_distance_list(self, feature_key, metadata_column, classifier, **kw):
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[feature_key, metadata_column], **kw)
    label_values = ((cols[metadata_column], np.asfarray(picarus.api.np_fromstring(cols[feature_key])))
                    for _, cols in row_cols)
    classifier.train(label_values)
    feature_input, feature, _ = self.key_to_input_model_param(feature_key)
    classifier_ser = pickle.dumps(classifier, -1)
    print(len(classifier_ser))
    k = image_retrieval.input_model_param_to_key('pred:', input={'feature': feature_key, 'meta': metadata_column},
                                                 model=classifier, param={'classifier_type': 'class_distance_list'})
    print(repr(k))
    return k
def evaluate_masks(self, cm_ilp):
    # Go through each mask and compare it to the annotation results
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[self.masks_gt_column])
    cms = {'train': np.zeros((self.texton_num_classes, self.texton_num_classes), dtype=np.int32),
           'test': np.zeros((self.texton_num_classes, self.texton_num_classes), dtype=np.int32)}
    ilps = []
    if cm_ilp:
        ilp_weights = json.load(open('ilp_weights.js'))  # load weights from previous run
        ilp_weights['ilp_tables'] = np.asfarray(ilp_weights['ilp_tables'])
    for row, columns in row_cols:
        gt = picarus.api.np_fromstring(columns[self.masks_gt_column])
        ilp_pred = np.fromstring(self.hb.get(self.images_table, row, self.feature_prediction_column)[0].value,
                                 dtype=np.double)[0]
        print(ilp_pred)
        masks = picarus.api.np_fromstring(self.hb.get(self.images_table, row, self.masks_column)[0].value)
        if cm_ilp:
            try:
                bin_index = [x for x, y in enumerate(ilp_weights['bins']) if y >= ilp_pred][0]
            except IndexError:
                bin_index = ilp_weights['ilp_tables'].shape[1]
            if bin_index != 0:
                bin_index -= 1
            print('bin_index[%d][%f]' % (bin_index, ilp_pred))
            masks *= ilp_weights['ilp_tables'][:, bin_index]
        masks_argmax = np.argmax(masks, 2)
        gt_sums = np.sum(gt.reshape(-1, gt.shape[2]), 0).tolist()
        print(gt_sums)
        if row.startswith('sun397train'):
            cm = cms['train']
            ilps.append({'gt_sums': gt_sums, 'ilp_pred': ilp_pred, 'gt_size': gt.shape[0] * gt.shape[1]})
        else:
            cm = cms['test']
        for mask_num in range(gt.shape[2]):
            if not np.any(gt[:, :, mask_num]):
                continue
            print(mask_num)
            preds = masks_argmax[gt[:, :, mask_num].nonzero()]
            h, bins = np.histogram(preds, np.arange(self.texton_num_classes + 1))
            np.testing.assert_equal(bins, np.arange(self.texton_num_classes + 1))
            cm[mask_num] += h
    json.dump({'cms': {'train': cms['train'].tolist(), 'test': cms['test'].tolist()}, 'cm_ilp': cm_ilp, 'ilps': ilps},
              open('eval.js', 'w'))
    for split in ['train', 'test']:
        cm = cms[split]
        print(split)
        print(cm)
        if np.any(cm):
            print(((cm / float(np.sum(cm))) * 100).astype(np.int32))
    classes = [z[1] for z in sorted([(y['mask_num'], x) for x, y in self.texton_classes.items()])]
    title_suffix = 'w ilp)' if cm_ilp else 'w/o ilp)'
    fn_suffix = '_ilp.png' if cm_ilp else '.png'
    save_confusion_matrix(cms['test'], classes, 'confmat_test' + fn_suffix, title='Confusion Matrix (test ' + title_suffix)
    save_confusion_matrix(cms['train'], classes, 'confmat_train' + fn_suffix, title='Confusion Matrix (train ' + title_suffix)
def cluster_points_local(self, **kw):
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[self.image_column], **kw)
    feature_func = imfeat.HOGLatent(16)
    num_clusters = 100
    features = []
    for row, columns in row_cols:
        image = imfeat.image_fromstring(columns[self.image_column])
        features.append(feature_func.compute_dense(image))
    features = np.vstack(features)
    clusters = sp.cluster.vq.kmeans(features, num_clusters)[0]
    print(clusters.shape)
    json.dump(clusters.tolist(), open('clusters.js', 'w'))
def get_table(self, columns):
    # NOTE: The next two assignments were redacted ("******") in the source;
    # reconstructed here under the assumption that output_user records whether
    # the caller explicitly requested the user column.
    user_column = 'user:' + self.owner
    output_user = user_column in columns
    hbase_filter = "SingleColumnValueFilter ('user', '%s', =, 'binaryprefix:r', true, true)" % self.owner
    outs = []
    with thrift_lock() as thrift:
        for row, cols in hadoopy_hbase.scanner(thrift, self.table, columns=columns + [user_column], filter=hbase_filter):
            self._row_validate(row, 'r', thrift)
            if not output_user:
                del cols[user_column]
            outs.append(encode_row(row, cols))
    bottle.response.headers["Content-type"] = "application/json"
    return json.dumps(outs)
def hashes_to_index(self, hasher_key, metadata_column, index, **kw):
    hasher_input, hasher, _ = self.key_to_input_model_param(hasher_key)
    feature_input, feature, _ = self.key_to_input_model_param(hasher_input['feature'])
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[hasher_key, metadata_column], **kw)
    metadata, hashes = zip(*[(json.dumps([cols[metadata_column], base64.urlsafe_b64encode(row)]), cols[hasher_key])
                             for row, cols in row_cols])
    hashes = np.ascontiguousarray(np.asfarray([np.fromstring(h, dtype=np.uint8) for h in hashes]))
    index = index.store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
    index.metadata = metadata
    k = image_retrieval.input_model_param_to_key('srch:', input={'hash': hasher_key, 'meta': metadata_column}, model=index)
    print(repr(k))
    return k
def _prediction_to_conf_gt(self, class_positive, input_table, input_prediction_column, input_class_column, **kw):
    row_cols = hadoopy_hbase.scanner(self.hb, input_table, columns=[input_prediction_column, input_class_column], **kw)
    pos_confs = []
    neg_confs = []
    for row, cols in row_cols:
        pred = float(np.fromstring(cols[input_prediction_column], dtype=np.double)[0])
        print(repr(row))
        if cols[input_class_column] == class_positive:
            pos_confs.append(pred)
        else:
            neg_confs.append(pred)
    pos_confs.sort()
    neg_confs.sort()
    open('confs.js', 'w').write(json.dumps({'pos_confs': pos_confs, 'neg_confs': neg_confs}))
    print(len(pos_confs))
    print(len(neg_confs))
def index_spherical(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    hashes = []
    labels = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['hash']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            hashes.append(cols[inputs['hash']])
            labels.append(row)
    hashes = ''.join(hashes)
    factory_info = {'slices': slices, 'num_hashes': len(labels), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.SphericalHashIndex',
                  'kw': {'hashes': hashes, 'indeces': range(len(labels)), 'labels': labels,
                         'max_results': params['max_results']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['hash']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['hash'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'hash',
                                                  'output_type': 'distance_image_rows', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
def get_slice(self, start_row, stop_row, columns, params, files):
    self._slice_validate(start_row, stop_row, 'r')
    max_rows = min(10000, int(params.get('maxRows', 1)))
    max_bytes = min(1048576, int(params.get('maxBytes', 1048576)))
    filter_string = params.get('filter')
    print('filter string[%s]' % filter_string)
    exclude_start = bool(int(params.get('excludeStart', 0)))
    with thrift_lock() as thrift:
        scanner = hadoopy_hbase.scanner(thrift, self.table, per_call=10, columns=columns,
                                        start_row=start_row, stop_row=stop_row, filter=filter_string)
        out = []
        cur_row = start_row
        byte_count = 0
        for row_num, (cur_row, cur_columns) in enumerate(scanner, 1):
            if exclude_start and row_num == 1:
                continue
            out.append(encode_row(cur_row, cur_columns))
            byte_count += self._byte_count_rows(out[-1:])
            if len(out) >= max_rows or byte_count >= max_bytes:
                break
    bottle.response.headers["Content-type"] = "application/json"
    return json.dumps(out)
def feature_bovw_mask(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    features = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['mask_feature']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            cur_feature = msgpack.loads(cols[inputs['mask_feature']])
            cur_feature = np.array(cur_feature[0]).reshape((-1, cur_feature[1][2]))
            features += random.sample(cur_feature, min(len(cur_feature), params['max_per_row']))
            print(len(features))
    features = np.asfarray(features)
    clusters = sp.cluster.vq.kmeans(features, params['num_clusters'])[0]
    num_clusters = clusters.shape[0]
    factory_info = {'slices': slices, 'num_features': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.BOVWImageFeature',
                  'kw': {'clusters': clusters.ravel().tolist(), 'num_clusters': num_clusters, 'levels': params['levels']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['mask_feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['mask_feature'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'feature', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
def scanner(self, table, start_row=None, stop_row=None, columns=None, keys_only=False, per_call=1, column_filter=None):
    filts = ['KeyOnlyFilter()'] if keys_only else []
    if column_filter:
        sanitary = lambda x: re.search(r"^[a-zA-Z0-9@\.:]+$", x)
        filter_family, filter_column = column_filter[0].split(':')
        if column_filter[1] == '=':
            filter_relation = '='
            filter_value = 'binary:' + column_filter[2]
        elif column_filter[1] == '!=':
            filter_relation = '!='
            filter_value = 'binary:' + column_filter[2]
        elif column_filter[1] == 'startswith':
            filter_relation = '='
            filter_value = 'binaryprefix:' + column_filter[2]
        else:
            bottle.abort(400)  # Bad filter
        if any(not sanitary(x) for x in [filter_family, filter_column, filter_value]):
            bottle.abort(400)
        filts.append("SingleColumnValueFilter ('%s', '%s', %s, '%s', true, true)"
                     % (filter_family, filter_column, filter_relation, filter_value))
    filt = ' AND '.join(filts)
    if not filt:
        filt = None
    return hadoopy_hbase.scanner(self._thrift, table, columns=columns, start_row=start_row,
                                 stop_row=stop_row, filter=filt, per_call=per_call)
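# Worked example of the filter string assembled above (family, qualifier, and
# value are hypothetical): a 'startswith' filter maps to the '=' relation with
# a 'binaryprefix:' comparator.
print("SingleColumnValueFilter ('%s', '%s', %s, '%s', true, true)"
      % ('meta', 'class', '=', 'binaryprefix:indoor'))
# -> SingleColumnValueFilter ('meta', 'class', =, 'binaryprefix:indoor', true, true)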
def hasher_spherical(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    features = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table, columns=[inputs['feature']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            cur_feature = msgpack.loads(cols[inputs['feature']])
            features.append(np.array(cur_feature[0]))
    print('num_features[%d]' % len(features))
    features = np.asfarray(features)
    out = picarus_takeout.spherical_hasher_train(features, params['num_pivots'], params['eps_m'],
                                                 params['eps_s'], params['max_iters'])
    out = {'pivots': out['pivots'].ravel().tolist(), 'threshs': out['threshs'].tolist()}
    #out = picarus.modules.spherical_hash.train_takeout(features, params['num_pivots'], params['eps_m'], params['eps_s'], params['max_iters'])
    factory_info = {'slices': slices, 'num_features': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.SphericalHasher', 'kw': out}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature'], 'model_link': model_link,
                                                  'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'hash', 'email': owner,
                                                  'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
def features_to_classifier_sklearn_decision_func(self, feature_key, metadata_column, class_positive, classifier,
                                                 max_per_label=None, **kw):
    row_cols = hadoopy_hbase.scanner(self.hb, self.images_table, columns=[feature_key, metadata_column], **kw)
    label_features = {0: [], 1: []}
    for row, cols in row_cols:
        if max_per_label is not None and len(label_features[0]) >= max_per_label and len(label_features[1]) >= max_per_label:
            break
        label = int(cols[metadata_column] == class_positive)
        if max_per_label is None or len(label_features[label]) < max_per_label:
            label_features[label].append(cols[feature_key])
            print((label, cols[metadata_column]))
        else:
            print('Skipping[%d]' % label)
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features = np.asfarray([picarus.api.np_fromstring(x) for x in features])
    classifier.fit(features, np.asarray(labels))
    k = image_retrieval.input_model_param_to_key('pred:', input={'feature': feature_key, 'meta': metadata_column},
                                                 model=classifier,
                                                 param={'class_positive': class_positive,
                                                        'classifier_type': 'sklearn_decision_func'})
    print(repr(k))
    return k
import hadoopy_hbase
import time

c = hadoopy_hbase.connect('localhost')
cnt = 0
st = time.time()
N = 5000
for x in hadoopy_hbase.scanner(c, 'flickr', per_call=N, columns=['metadata:license']):
    cnt += 1
    if cnt % N == 0:
        print(((time.time() - st) / N, cnt))
        st = time.time()
import logging
import tempfile
import zlib
import json
import os
import random
import numpy as np
import imfeat
import hadoopy_hbase
import picarus.modules
import picarus.api

logging.basicConfig(level=logging.DEBUG)
a = hadoopy_hbase.connect()
hrc = picarus.modules.HashRetrievalClassifier()
hrc.load(open('sun397_feature_index.pb').read())
for num, (row, cols) in enumerate(hadoopy_hbase.scanner(a, 'images', start_row='sun397train')):
    if num > 2:
        break
    print(cols['feat:superpixel'][:50])
    image = imfeat.image_fromstring(cols['data:image_320'])
    print(imfeat.image_fromstring(cols['data:image']).shape)
    print(imfeat.image_fromstring(cols['data:image_320']).shape)
    print('image_75sq[%d]' % len(cols['data:image_75sq']))
    print(row)
    cur_f = picarus.api.np_fromstring(cols['feat:gist'])
    cur_h = np.fromstring(cols['hash:gist'], dtype=np.uint8)
    print('HOG')
    print(picarus.api.np_fromstring(cols['feat:bovw_hog_levels2_sbin16_blocks1_clusters100']))
    print('Hash Bits[%d]' % (cur_h.size * 8,))
    print('Feature Dims[%d]' % (cur_f.size,))
    f = hrc.feature(image)
def display():
    client = hadoopy_hbase.connect('localhost')
    for x in hadoopy_hbase.scanner(client, 'flickr', columns=['metadata:title']):
        print(x)
def scanner_row_column(client, table, column, **kw):
    scanner = hadoopy_hbase.scanner(client, table, columns=[column], **kw)
    for row, cols in scanner:
        yield row, cols[column]
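# Usage sketch for scanner_row_column (table and column names hypothetical):
# iterate (row, value) pairs for a single column across a table.
import hadoopy_hbase
client = hadoopy_hbase.connect('localhost')
for row, title in scanner_row_column(client, 'flickr', 'metadata:title'):
    print((row, title))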