Example #1
0
def delete_rows(prefix):
    assert ord(prefix[-1]) != 255
    stop_row = prefix[:-1] + chr(ord(prefix[-1]) + 1)
    for x, y in hadoopy_hbase.scanner(c, "images", start_row=prefix, stop_row=stop_row):
        assert x.startswith(prefix)
        print(repr(x))
        c.deleteAllRow("images", x)
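The stop_row above is the exclusive upper bound for a prefix scan: incrementing the last byte of the prefix gives the smallest key that no longer starts with it. A minimal read-only sketch of the same pattern (the prefix_rows helper and the row count are illustrative, not from the original source):
import hadoopy_hbase

def prefix_rows(client, table, prefix):
    # Exclusive stop_row: the prefix with its last byte incremented, so the scan
    # covers exactly the row keys that start with the prefix.
    assert ord(prefix[-1]) != 255
    stop_row = prefix[:-1] + chr(ord(prefix[-1]) + 1)
    return hadoopy_hbase.scanner(client, table, start_row=prefix, stop_row=stop_row,
                                 filter='KeyOnlyFilter()')

c = hadoopy_hbase.connect('localhost')
print(sum(1 for _ in prefix_rows(c, 'images', 'sun397train')))  # rows under the prefix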
Example #2
0
 def _features_to_classifier(self, classifier, class_positive, input_table, input_feature_column, input_class_column, output_table, output_row, output_column, max_per_label=None, **kw):
     row_dict = hadoopy_hbase.HBaseRowDict(output_table,
                                           output_column, db=self.hb)
     row_cols = hadoopy_hbase.scanner(self.hb, input_table,
                                      columns=[input_feature_column, input_class_column], **kw)
     label_features = {0: [], 1: []}
     for row, cols in row_cols:
         if max_per_label is not None and len(label_features[0]) >= max_per_label and len(label_features[1]) >= max_per_label:
             break
         label = int(cols[input_class_column] == class_positive)
         if max_per_label is None or len(label_features[label]) < max_per_label:
             print(label)
             print(cols[input_class_column])
             label_features[label].append(cols[input_feature_column])
         else:
             print('Skipping[%d]' % label)
     labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
     features = label_features[0] + label_features[1]
     features_to_classifier(classifier, labels, features)
     cp = picarus.api.Classifier()
     cp.name = '%s-%s-indoor' % (self.images_table, self.feature_name)  # TODO(brandyn): Indoor specific ATM
     if isinstance(self.feature_dict, dict):
         cp.feature = json.dumps(self.feature_dict)
         cp.feature_format = cp.JSON_IMPORT
     else:
         cp.feature = pickle.dumps(self.feature_dict, -1)
         cp.feature_format = cp.PICKLE
     cp.classifier = pickle.dumps(classifier, -1)
     cp.classifier_format = cp.PICKLE
     row_dict[output_row] = cp.SerializeToString()
     print('Train')
Example #3
0
 def evaluate_classifier_class_distance_list(self, classifier_key, **kw):
     classifier = picarus.api.feature_classifier_frompb(self.key_to_classifier_pb(classifier_key))
     input_dict = self.key_to_input_model_param(classifier_key)[0]
     feature_key = input_dict['feature']
     metadata_key = input_dict['meta']
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[feature_key, metadata_key], **kw)
     cm = {}  # [true][pred]
     total = 0
     correct = 0
     for row, columns in row_cols:
         feature = picarus.api.np_fromstring(columns[feature_key])
         print(feature.shape)
         c = classifier(feature)
         print(c)
         total += 1
         try:
             pred_class = c[0]['class']
         except IndexError:
             pred_class = ''
         true_class = columns[metadata_key]
         if pred_class == true_class:
             correct += 1
         try:
             cm.setdefault(true_class, {})[pred_class] += 1
         except KeyError:
             cm.setdefault(true_class, {})[pred_class] = 1
         print(cm)
         print(correct / float(total))
     print(correct / float(total))
     return {'cm': cm, 'total': total, 'correct': correct}
Example #4
0
def livedata():
    print_request()
    row_to_time = lambda row: 2**31 - int(row[6:])
    time_to_row = lambda t: 'camera' + str(2**31 - t)
    out = []
    try:
        THRIFT_LOCK.acquire()
        start_row = 'camera'
        row_skip = 5
        cur_time = 0
        for _ in range(5):
            for row, cols in hadoopy_hbase.scanner(THRIFT,
                                                   'testtable',
                                                   start_row=start_row,
                                                   stop_row='camerb',
                                                   max_rows=1):
                cur_time = row_to_time(row)
                out.append({'row': row, 'time': cur_time, 'columns': cols})
                cur_time -= row_skip
                start_row = time_to_row(cur_time)
                row_skip *= 2
            if cur_time < 0:
                break
    finally:
        THRIFT_LOCK.release()
    return {'data': out}
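livedata() assumes row keys of the form 'camera' + str(2**31 - t), so newer timestamps sort first and each max_rows=1 scan returns the next reading going back in time. A small sketch, not part of the original, of that reverse-timestamp key scheme:
def time_to_row(t):
    # Larger (newer) timestamps map to lexicographically smaller keys, so a
    # forward HBase scan returns the newest readings first.
    return 'camera' + str(2**31 - t)

def row_to_time(row):
    return 2**31 - int(row[len('camera'):])

assert row_to_time(time_to_row(1361000000)) == 1361000000
assert time_to_row(1361000000) < time_to_row(1360999999)  # newer key sorts earlier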
Example #5
0
 def get_slice(self, start_row, stop_row, columns, params, files):
     self._slice_validate(start_row, stop_row, 'r')
     max_rows = min(10000, int(params.get('maxRows', 1)))
     print('MaxRows[%d]' % max_rows)
     max_bytes = min(5242880, int(params.get('maxBytes', 5242880)))
     filter_string = params.get('filter')
     print('filter string[%s]' % filter_string)
     exclude_start = bool(int(params.get('excludeStart', 0)))
     out = []
     per_call = 1
     max_byte_count = 0
     with thrift_lock() as thrift:
         scanner = hadoopy_hbase.scanner(thrift, self.table, per_call=per_call, columns=columns,
                                         start_row=start_row, stop_row=stop_row, filter=filter_string)
         cur_row = start_row
         byte_count = 0
         for row_num, (cur_row, cur_columns) in enumerate(scanner, 1):
             if exclude_start and row_num == 1:
                 continue
             out.append(encode_row(cur_row, cur_columns))
             cur_byte_count = self._byte_count_rows(out[-1:])
             byte_count += cur_byte_count
             # Compute the number of rows we should try to get by using the max sized row
             # that we have seen as an upper bound.
             max_byte_count = max(1, max(max_byte_count, cur_byte_count))
             per_call = max(1, min((max_bytes - byte_count) / max_byte_count, max_rows - len(out)))
             if len(out) >= max_rows or byte_count >= max_bytes:
                 break
     bottle.response.headers["Content-type"] = "application/json"
     return json.dumps(out)
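The loop above grows per_call from the remaining byte budget divided by the largest row seen so far, capped by the remaining row budget. A quick check of that arithmetic with assumed numbers (5 MB budget, 1 MB already returned, 256 KB largest row, 40 rows still allowed):
max_bytes, byte_count, max_byte_count = 5242880, 1048576, 262144
max_rows, rows_returned = 50, 10
per_call = max(1, min((max_bytes - byte_count) / max_byte_count, max_rows - rows_returned))
assert per_call == 16  # next Thrift call fetches at most 16 rows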
Example #6
0
def classifier_sklearn(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    label_features = {0: [], 1: []}
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['feature'], inputs['meta']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            try:
                label = int(cols[inputs['meta']] == params['class_positive'])
                label_features[label].append(cols[inputs['feature']])
            except KeyError:
                continue
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features = np.asfarray([msgpack.loads(x)[0] for x in features])
    import sklearn.svm
    classifier = sklearn.svm.LinearSVC()
    classifier.fit(features, np.asarray(labels))
    factory_info = {'slices': slices, 'num_rows': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.LinearClassifier', 'kw': {'coefficients': classifier.coef_.tolist()[0],
                                                             'intercept': classifier.intercept_[0]}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature'], 'model_link': model_link, 'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'binary_class_confidence', 'email': owner, 'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
Example #7
0
def index_hamming_feature2d(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    hashes = []
    #keypoints = []
    labels = []
    indeces = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['feature2d_binary']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            f = msgpack.loads(cols[inputs['feature2d_binary']])
            print(f[2][0])
            hashes.append(f[0])
            #keypoints += f[1]
            indeces += [len(labels)] * f[2][0]
            labels.append(row)
            print(len(labels))
    hashes = ''.join(hashes)
    factory_info = {'slices': slices, 'num_hashes': len(indeces), 'num_images': len(labels), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    #'keypoints': keypoints,
    model_link = {'name': 'picarus.HammingFeature2dHashIndex', 'kw': {'hashes': hashes,
                                                                      'indeces': indeces, 'labels': labels,
                                                                      'max_results': params['max_results'],
                                                                      'max_keypoint_results': params['max_keypoint_results'],
                                                                      'hamming_thresh': params['hamming_thresh']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature2d_binary']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature2d_binary'], 'model_link': model_link, 'model_chain': model_chain, 'input_type': 'feature2d_binary',
                                                  'output_type': 'distance_image_rows', 'email': owner, 'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
Example #8
0
def classifier_kernel_sklearn(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    label_features = {0: [], 1: []}
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['feature'], inputs['meta']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            try:
                label = int(cols[inputs['meta']] == params['class_positive'])
                label_features[label].append(cols[inputs['feature']])
            except KeyError:
                continue

    kernel = {'hik': kernels.histogram_intersection}[params['kernel']]
    labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
    features = label_features[0] + label_features[1]
    features = np.asfarray([msgpack.loads(x)[0] for x in features])
    gram = kernel(features, features)
    import sklearn.svm
    classifier = sklearn.svm.SVC(kernel='precomputed')
    classifier.fit(gram, np.asarray(labels))
    factory_info = {'slices': slices, 'num_rows': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    support_vectors = features[classifier.support_, :].ravel().tolist()
    dual_coef = classifier.dual_coef_.ravel().tolist()
    intercept = float(classifier.intercept_.ravel()[0])
    model_link = {'name': 'picarus.KernelClassifier', 'kw': {'support_vectors': support_vectors,
                                                             'dual_coef': dual_coef,
                                                             'intercept': intercept,
                                                             'kernel': params['kernel']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature'], 'model_link': model_link, 'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'binary_class_confidence', 'email': owner, 'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
Example #9
0
 def classifier_localnbnn(params, inputsub64, schema):
     print(inputsub64)
     inputs = {x: base64.urlsafe_b64decode(y) for x, y in inputsub64.items()}
     print(inputs)
     features = []
     indeces = []
     num_features = 0
     feature_size = 0
     labels_dict = {}
     labels = []
     for start_row, stop_row in start_stop_rows:
         row_cols = hadoopy_hbase.scanner(thrift, data_table.table,
                                          columns=[inputs['multi_feature'], inputs['meta']], start_row=start_row, stop_row=stop_row)
         for _, cols in row_cols:
             try:
                 label = cols[inputs['meta']]
                 f, s = msgpack.unpackb(cols[inputs['multi_feature']])
                 if label not in labels_dict:
                     labels_dict[label] = len(labels_dict)
                     labels.append(label)
                 feature_size = s[1]
                 num_features += s[0]
                 features += f
                 indeces += [labels_dict[label]] * s[0]
             except KeyError:
                 pass
     factory_info = {'slices': slices, 'data': 'slices', 'params': params, 'inputs': inputsub64}
     model = {'name': 'picarus.LocalNBNNClassifier', 'kw': {'features': features, 'indeces': indeces, 'labels': labels,
                                                            'feature_size': feature_size, 'max_results': params['max_results']}}
     return {'input': inputsub64['multi_feature'], 'model': model, 'input_type': 'feature', 'output_type': 'multi_class_distance',
             'email': self.owner, 'name': manager.model_to_name(model), 'factory_info': json.dumps(factory_info)}
Example #10
0
 def classifier_sklearn(params, inputs, schema):
     label_features = {0: [], 1: []}
     for start_row, stop_row in start_stop_rows:
         row_cols = hadoopy_hbase.scanner(thrift, data_table.table,
                                          columns=[base64.urlsafe_b64decode(inputs['feature']), base64.urlsafe_b64decode(inputs['meta'])],
                                          start_row=start_row, stop_row=stop_row)
         for row, cols in row_cols:
             try:
                 label = int(cols[base64.urlsafe_b64decode(inputs['meta'])] == params['class_positive'])
                 label_features[label].append(cols[base64.urlsafe_b64decode(inputs['feature'])])
             except KeyError:
                 continue
     labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
     features = label_features[0] + label_features[1]
     features = np.asfarray([msgpack.unpackb(x)[0] for x in features])
     num_nans = 0
     for feature in features:
         if np.any(np.isnan(feature)):
             num_nans += 1
             print(feature)
     print('NumNans[%d]' % num_nans)
     import sklearn.svm
     classifier = sklearn.svm.LinearSVC()
     classifier.fit(features, np.asarray(labels))
     factory_info = {'slices': slices, 'num_rows': len(features), 'data': 'slices', 'params': params, 'inputs': inputs}
     model = {'name': 'picarus.LinearClassifier', 'kw': {'coefficients': classifier.coef_.tolist()[0],
                                                         'intercept': classifier.intercept_[0]}}
     return {'input': inputs['feature'], 'model': model, 'input_type': 'feature', 'output_type': 'binary_class_confidence',
             'email': self.owner, 'name': manager.model_to_name(model), 'factory_info': json.dumps(factory_info)}
Example #11
0
def classifier_localnbnn(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    features = []
    indeces = []
    num_features = 0
    feature_size = 0
    labels_dict = {}
    labels = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['multi_feature'], inputs['meta']], start_row=start_row, stop_row=stop_row)
        for _, cols in row_cols:
            try:
                label = cols[inputs['meta']]
                f, s = msgpack.loads(cols[inputs['multi_feature']])
                if label not in labels_dict:
                    labels_dict[label] = len(labels_dict)
                    labels.append(label)
                feature_size = s[1]
                num_features += s[0]
                features += f
                indeces += [labels_dict[label]] * s[0]
            except KeyError:
                pass
    factory_info = {'slices': slices, 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.LocalNBNNClassifier', 'kw': {'features': features, 'indeces': indeces, 'labels': labels,
                                                                'feature_size': feature_size, 'max_results': params['max_results']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['multi_feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['multi_feature'], 'model_link': model_link, 'model_chain': model_chain,
                                                  'input_type': 'multi_feature', 'output_type': 'multi_class_distance',
                                                  'email': owner, 'name': manager.model_to_name(model_link), 'factory_info': json.dumps(factory_info)}))
Example #12
0
 def inner(num_rows, **kw):
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[self.image_column, self.indoor_class_column], **kw)
     for x, (_, cols) in enumerate(row_cols):
         print(repr(x))
         if x >= num_rows:
             break
         yield cols[self.indoor_class_column], imfeat.image_fromstring(cols[self.image_column])
Example #13
0
 def _build_index(self, si, index, input_table, input_hash_column, input_class_column, output_table, output_row, output_column, **kw):
     row_dict = hadoopy_hbase.HBaseRowDict(output_table,
                                           output_column, db=self.hb)
     row_cols = hadoopy_hbase.scanner(self.hb, input_table,
                                      columns=[input_hash_column, input_class_column], **kw)
     metadata, hashes = zip(*[(json.dumps([cols[input_class_column], base64.b64encode(row)]), cols[input_hash_column])
                              for row, cols in row_cols])
     row_dict[output_row] = hashes_to_index(si, index, metadata, hashes)
Example #14
0
 def delete_slice(self, start_row, stop_row):
     self._slice_validate(start_row, stop_row, 'w')
     # NOTE: This only fetches rows that have a column in data:image (it is a significant optimization)
     # NOTE: Only parameters allowed, no "files" due to memory restrictions
     with thrift_lock() as thrift:
         for row, _ in hadoopy_hbase.scanner(thrift, self.table, start_row=start_row, stop_row=stop_row, filter='KeyOnlyFilter()'):
             thrift.deleteAllRow(self.table, row)
     return {}
Example #15
0
 def _scanner(self, *args, **kw):
     import hadoopy_hbase
     for x, y in self._slices:
         print((x, y))
         for z in hadoopy_hbase.scanner(self._hbase, self._table,
                                        start_row=x,
                                        stop_row=y,
                                        *args, **kw):
             yield z
Example #16
0
 def index_train(model_dict, model_param, inputs):
     index = call_import(model_dict)
     row_cols = hadoopy_hbase.scanner(thrift, self.table,
                                      columns=[inputs['hash'], inputs['meta']], start_row=start_row, stop_row=stop_row)
     metadata, hashes = zip(*[(json.dumps([cols[inputs['meta']], base64.urlsafe_b64encode(row)]), cols[inputs['hash']])
                              for row, cols in row_cols])
     hashes = np.ascontiguousarray(np.asfarray([np.fromstring(h, dtype=np.uint8) for h in hashes]))
     index = index.store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
     index.metadata = metadata
     return index
Example #17
0
 def kmeans_cluster_mfeat(model_dict, model_param, inputs):
     # TODO: This needs to be finished, determine if we want quantizer level or cluster level
     clusterer = call_import(model_dict)
     features = []
     row_cols = hadoopy_hbase.scanner(thrift, self.table,
                                      columns=[inputs['multi_feature']], start_row=start_row, stop_row=stop_row)
     # TODO: We'll want to check that we aren't clustering too much data by placing constraints
     for row, columns in row_cols:
         features.append(picarus.api.np_fromstring(columns[inputs['multi_feature']]))
     features = np.vstack(features)
     return clusterer.cluster(features)
Example #18
0
 def patch_slice(self, start_row, stop_row, params, files):
     self._slice_validate(start_row, stop_row, 'w')
     # NOTE: This only fetches rows that have a column in data:image (it is a significant optimization)
     # NOTE: Only parameters allowed, no "files" due to memory restrictions
     mutations = []
     for x, y in params.items():
         mutations.append(hadoopy_hbase.Mutation(column=base64.urlsafe_b64decode(x), value=base64.b64decode(y)))
     if mutations:
         with thrift_lock() as thrift:
             for row, _ in hadoopy_hbase.scanner(thrift, self.table, start_row=start_row, stop_row=stop_row, filter='KeyOnlyFilter()', columns=['data:image']):
                 thrift.mutateRow(self.table, row, mutations)
     return {}
Example #19
0
 def features_to_classifier_class_distance_list(self, feature_key, metadata_column, classifier, **kw):
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[feature_key, metadata_column], **kw)
     label_values = ((cols[metadata_column], np.asfarray(picarus.api.np_fromstring(cols[feature_key]))) for _, cols in row_cols)
     classifier.train(label_values)
     feature_input, feature, _ = self.key_to_input_model_param(feature_key)
     classifier_ser = pickle.dumps(classifier, -1)
     print(len(classifier_ser))
     k = image_retrieval.input_model_param_to_key('pred:', input={'feature': feature_key, 'meta': metadata_column},
                                                  model=classifier, param={'classifier_type': 'class_distance_list'})
     print(repr(k))
     return k
Example #20
0
 def evaluate_masks(self, cm_ilp):
     # Go through each mask and compare it to the annotation results
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[self.masks_gt_column])
     cms = {'train': np.zeros((self.texton_num_classes, self.texton_num_classes), dtype=np.int32),
            'test': np.zeros((self.texton_num_classes, self.texton_num_classes), dtype=np.int32)}
     ilps = []
     if cm_ilp:
         ilp_weights = json.load(open('ilp_weights.js'))  # load weights from previous run
         ilp_weights['ilp_tables'] = np.asfarray(ilp_weights['ilp_tables'])
     for row, columns in row_cols:
         gt = picarus.api.np_fromstring(columns[self.masks_gt_column])
         ilp_pred = np.fromstring(self.hb.get(self.images_table, row, self.feature_prediction_column)[0].value, dtype=np.double)[0]
         print(ilp_pred)
         masks = picarus.api.np_fromstring(self.hb.get(self.images_table, row, self.masks_column)[0].value)
         if cm_ilp:
             try:
                 bin_index = [x for x, y in enumerate(ilp_weights['bins']) if y >= ilp_pred][0]
             except IndexError:
                 bin_index = ilp_weights['ilp_tables'].shape[1]
             if bin_index != 0:
                 bin_index -= 1
             print('bin_index[%d][%f]' % (bin_index, ilp_pred))
             masks *= ilp_weights['ilp_tables'][:, bin_index]
         masks_argmax = np.argmax(masks, 2)
         gt_sums = np.sum(gt.reshape(-1, gt.shape[2]), 0).tolist()
         print(gt_sums)
         if row.startswith('sun397train'):
             cm = cms['train']
             ilps.append({'gt_sums': gt_sums, 'ilp_pred': ilp_pred, 'gt_size': gt.shape[0] * gt.shape[1]})
         else:
             cm = cms['test']
         for mask_num in range(gt.shape[2]):
             if not np.any(gt[:, :, mask_num]):
                 continue
             print(mask_num)
             preds = masks_argmax[gt[:, :, mask_num].nonzero()]
             h, bins = np.histogram(preds, np.arange(self.texton_num_classes + 1))
             np.testing.assert_equal(bins, np.arange(self.texton_num_classes + 1))
             cm[mask_num] += h
         json.dump({'cms': {'train': cms['train'].tolist(), 'test': cms['test'].tolist()}, 'cm_ilp': cm_ilp, 'ilps': ilps}, open('eval.js', 'w'))
         for split in ['train', 'test']:
             cm = cms[split]
             print(split)
             print(cm)
             if np.any(cm):
                 print(((cm / float(np.sum(cm))) * 100).astype(np.int32))
     classes = [z[1] for z in sorted([(y['mask_num'], x) for x, y in self.texton_classes.items()])]
     title_suffix = 'w ilp)' if cm_ilp else 'w/o ilp)'
     fn_suffix = '_ilp.png' if cm_ilp else '.png'
     save_confusion_matrix(cms['test'], classes, 'confmat_test' + fn_suffix, title='Confusion Matrix (test ' + title_suffix)
     save_confusion_matrix(cms['train'], classes, 'confmat_train' + fn_suffix, title='Confusion Matrix (train ' + title_suffix)
Example #21
0
 def cluster_points_local(self, **kw):
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[self.image_column], **kw)
     feature_func = imfeat.HOGLatent(16)
     num_clusters = 100
     features = []
     for row, columns in row_cols:
         image = imfeat.image_fromstring(columns[self.image_column])
         features.append(feature_func.compute_dense(image))
     features = np.vstack(features)
     clusters = sp.cluster.vq.kmeans(features, num_clusters)[0]
     print(clusters.shape)
     json.dump(clusters.tolist(), open('clusters.js', 'w'))
Example #22
0
 def cluster_points_local(self, **kw):
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[self.image_column], **kw)
     feature_func = imfeat.HOGLatent(16)
     num_clusters = 100
     features = []
     for row, columns in row_cols:
         image = imfeat.image_fromstring(columns[self.image_column])
         features.append(feature_func.compute_dense(image))
     features = np.vstack(features)
     clusters = sp.cluster.vq.kmeans(features, num_clusters)[0]
     print(clusters.shape)
     json.dump(clusters.tolist(), open('clusters.js', 'w'))
Example #23
0
 def get_table(self, columns):
     user_column = 'user:'******'user:'******'user', '%s', =, 'binaryprefix:r', true, true)" % self.owner
     outs = []
     with thrift_lock() as thrift:
         for row, cols in hadoopy_hbase.scanner(thrift, self.table, columns=columns + [user_column], filter=hbase_filter):
             self._row_validate(row, 'r', thrift)
             if not output_user:
                 del cols[user_column]
             outs.append(encode_row(row, cols))
     bottle.response.headers["Content-type"] = "application/json"
     return json.dumps(outs)
Example #24
0
 def hashes_to_index(self, hasher_key, metadata_column, index, **kw):
     hasher_input, hasher, _ = self.key_to_input_model_param(hasher_key)
     feature_input, feature, _ = self.key_to_input_model_param(hasher_input['feature'])
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[hasher_key, metadata_column], **kw)
     metadata, hashes = zip(*[(json.dumps([cols[metadata_column], base64.urlsafe_b64encode(row)]), cols[hasher_key])
                              for row, cols in row_cols])
     hashes = np.ascontiguousarray(np.asfarray([np.fromstring(h, dtype=np.uint8) for h in hashes]))
     index = index.store_hashes(hashes, np.arange(len(metadata), dtype=np.uint64))
     index.metadata = metadata
     k = image_retrieval.input_model_param_to_key('srch:', input={'hash': hasher_key, 'meta': metadata_column}, model=index)
     print(repr(k))
     return k
Example #25
0
 def _prediction_to_conf_gt(self, class_positive, input_table, input_prediction_column, input_class_column, **kw):
     row_cols = hadoopy_hbase.scanner(self.hb, input_table,
                                      columns=[input_prediction_column, input_class_column], **kw)
     pos_confs = []
     neg_confs = []
     for row, cols in row_cols:
         pred = float(np.fromstring(cols[input_prediction_column], dtype=np.double)[0])
         print(repr(row))
         if cols[input_class_column] == class_positive:
             pos_confs.append(pred)
         else:
             neg_confs.append(pred)
     pos_confs.sort()
     neg_confs.sort()
     open('confs.js', 'w').write(json.dumps({'pos_confs': pos_confs, 'neg_confs': neg_confs}))
     print(len(pos_confs))
     print(len(neg_confs))
Example #26
0
def index_spherical(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    hashes = []
    labels = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['hash']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            hashes.append(cols[inputs['hash']])
            labels.append(row)
    hashes = ''.join(hashes)
    factory_info = {'slices': slices, 'num_hashes': len(labels), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.SphericalHashIndex', 'kw': {'hashes': hashes,
                                                               'indeces': range(len(labels)), 'labels': labels,
                                                               'max_results': params['max_results']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['hash']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['hash'], 'model_link': model_link, 'model_chain': model_chain, 'input_type': 'hash',
                                                  'output_type': 'distance_image_rows', 'email': owner, 'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
Example #27
0
 def get_slice(self, start_row, stop_row, columns, params, files):
     self._slice_validate(start_row, stop_row, 'r')
     max_rows = min(10000, int(params.get('maxRows', 1)))
     max_bytes = min(1048576, int(params.get('maxBytes', 1048576)))
     filter_string = params.get('filter')
     print('filter string[%s]' % filter_string)
     exclude_start = bool(int(params.get('excludeStart', 0)))
     with thrift_lock() as thrift:
         scanner = hadoopy_hbase.scanner(thrift, self.table, per_call=10, columns=columns,
                                         start_row=start_row, stop_row=stop_row, filter=filter_string)
     out = []
     cur_row = start_row
     byte_count = 0
     for row_num, (cur_row, cur_columns) in enumerate(scanner, 1):
         if exclude_start and row_num == 1:
             continue
         out.append(encode_row(cur_row, cur_columns))
         byte_count += self._byte_count_rows(out[-1:])
         if len(out) >= max_rows or byte_count >= max_bytes:
             break
     bottle.response.headers["Content-type"] = "application/json"
     return json.dumps(out)
Example #28
0
def feature_bovw_mask(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    features = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['mask_feature']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            cur_feature = msgpack.loads(cols[inputs['mask_feature']])
            cur_feature = np.array(cur_feature[0]).reshape((-1, cur_feature[1][2]))
            features += random.sample(cur_feature, min(len(cur_feature), params['max_per_row']))
            print(len(features))
    features = np.asfarray(features)
    clusters = sp.cluster.vq.kmeans(features, params['num_clusters'])[0]
    num_clusters = clusters.shape[0]
    factory_info = {'slices': slices, 'num_features': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.BOVWImageFeature', 'kw': {'clusters': clusters.ravel().tolist(), 'num_clusters': num_clusters,
                                                             'levels': params['levels']}}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['mask_feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['mask_feature'], 'model_link': model_link, 'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'feature', 'email': owner, 'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
Example #29
0
 def scanner(self,
             table,
             start_row=None,
             stop_row=None,
             columns=None,
             keys_only=False,
             per_call=1,
             column_filter=None):
     filts = ['KeyOnlyFilter()'] if keys_only else []
     if column_filter:
         sanitary = lambda x: re.search("^[a-zA-Z0-9@\.:]+$", x)
         filter_family, filter_column = column_filter[0].split(':')
         if column_filter[1] == '=':
             filter_relation = '='
             filter_value = 'binary:' + column_filter[2]
         elif column_filter[1] == '!=':
             filter_relation = '!='
             filter_value = 'binary:' + column_filter[2]
         elif column_filter[1] == 'startswith':
             filter_relation = '='
             filter_value = 'binaryprefix:' + column_filter[2]
         else:
             bottle.abort(400)  # Bad filter
         if any(not sanitary(x)
                for x in [filter_family, filter_column, filter_value]):
             bottle.abort(400)
         filts.append(
             "SingleColumnValueFilter ('%s', '%s', %s, '%s', true, true)" %
             (filter_family, filter_column, filter_relation, filter_value))
     filt = ' AND '.join(filts)
     if not filt:
         filt = None
     return hadoopy_hbase.scanner(self._thrift,
                                  table,
                                  columns=columns,
                                  start_row=start_row,
                                  stop_row=stop_row,
                                  filter=filt,
                                  per_call=per_call)
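The wrapper above joins a KeyOnlyFilter and a SingleColumnValueFilter clause with ' AND ' into a single Thrift filter string. A minimal sketch of passing an equivalent filter string to hadoopy_hbase.scanner directly; the 'meta'/'class' column and 'indoor' value are hypothetical stand-ins:
import hadoopy_hbase

thrift = hadoopy_hbase.connect('localhost')
filt = ("KeyOnlyFilter() AND "
        "SingleColumnValueFilter ('meta', 'class', =, 'binary:indoor', true, true)")
for row, _ in hadoopy_hbase.scanner(thrift, 'images', filter=filt, per_call=100):
    print(row)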
Example #30
0
def hasher_spherical(queue, params, inputs, schema, start_stop_rows, table, owner):
    thrift, manager, slices, inputsb64 = _setup(start_stop_rows, inputs)
    features = []
    for start_row, stop_row in start_stop_rows:
        row_cols = hadoopy_hbase.scanner(thrift, table,
                                         columns=[inputs['feature']],
                                         start_row=start_row, stop_row=stop_row)
        for row, cols in row_cols:
            cur_feature = msgpack.loads(cols[inputs['feature']])
            features.append(np.array(cur_feature[0]))
    print('num_features[%d]' % len(features))
    features = np.asfarray(features)
    out = picarus_takeout.spherical_hasher_train(features, params['num_pivots'], params['eps_m'], params['eps_s'], params['max_iters'])
    out = {'pivots': out['pivots'].ravel().tolist(),
           'threshs': out['threshs'].tolist()}
    #out = picarus.modules.spherical_hash.train_takeout(features, params['num_pivots'], params['eps_m'], params['eps_s'], params['max_iters'])
    factory_info = {'slices': slices, 'num_features': len(features), 'data': 'slices', 'params': params, 'inputs': inputsb64}
    model_link = {'name': 'picarus.SphericalHasher', 'kw': out}
    model_chain = tables._takeout_model_chain_from_key(manager, inputs['feature']) + [model_link]
    queue.put(manager.input_model_param_to_key(**{'input': inputs['feature'], 'model_link': model_link, 'model_chain': model_chain, 'input_type': 'feature',
                                                  'output_type': 'hash', 'email': owner, 'name': manager.model_to_name(model_link),
                                                  'factory_info': json.dumps(factory_info)}))
Example #31
0
def livedata():
    print_request()
    row_to_time = lambda row: 2**31 - int(row[6:])
    time_to_row = lambda t: 'camera' + str(2**31 - t)
    out = []
    try:
        THRIFT_LOCK.acquire()
        start_row = 'camera'
        row_skip = 5
        cur_time = 0
        for _ in range(5):
            for row, cols in hadoopy_hbase.scanner(THRIFT, 'testtable', start_row=start_row, stop_row='camerb', max_rows=1):
                cur_time = row_to_time(row)
                out.append({'row': row, 'time': cur_time, 'columns': cols})
                cur_time -= row_skip
                start_row = time_to_row(cur_time)
                row_skip *= 2
            if cur_time < 0:
                break
    finally:
        THRIFT_LOCK.release()
    return {'data': out}
Example #32
0
 def features_to_classifier_sklearn_decision_func(self, feature_key, metadata_column, class_positive, classifier, max_per_label=None, **kw):
     row_cols = hadoopy_hbase.scanner(self.hb, self.images_table,
                                      columns=[feature_key, metadata_column], **kw)
     label_features = {0: [], 1: []}
     for row, cols in row_cols:
         if max_per_label is not None and len(label_features[0]) >= max_per_label and len(label_features[1]) >= max_per_label:
             break
         label = int(cols[metadata_column] == class_positive)
         if max_per_label is None or len(label_features[label]) < max_per_label:
             label_features[label].append(cols[feature_key])
             print((label, cols[metadata_column]))
         else:
             print('Skipping[%d]' % label)
     labels = [0] * len(label_features[0]) + [1] * len(label_features[1])
     features = label_features[0] + label_features[1]
     features = np.asfarray([picarus.api.np_fromstring(x) for x in features])
     classifier.fit(features, np.asarray(labels))
     k = image_retrieval.input_model_param_to_key('pred:', input={'feature': feature_key, 'meta': metadata_column},
                                                  model=classifier, param={'class_positive': class_positive,
                                                                           'classifier_type': 'sklearn_decision_func'})
     print(repr(k))
     return k
Example #33
0
import hadoopy_hbase
import time

c = hadoopy_hbase.connect('localhost')
cnt = 0
st = time.time()
N = 5000
for x in hadoopy_hbase.scanner(c, 'flickr', per_call=N, columns=['metadata:license']):
    cnt += 1
    if cnt % N == 0:
        print(((time.time() - st) / N, cnt))
        st = time.time()
Example #34
0
import logging
import tempfile
import zlib
import json
import os
import random
import numpy as np
import imfeat
import hadoopy_hbase
import picarus.modules
import picarus.api
logging.basicConfig(level=logging.DEBUG)

a = hadoopy_hbase.connect()
hrc = picarus.modules.HashRetrievalClassifier()
hrc.load(open('sun397_feature_index.pb').read())
for num, (row, cols) in enumerate(
        hadoopy_hbase.scanner(a, 'images', start_row='sun397train')):
    if num > 2:
        break
    print(cols['feat:superpixel'][:50])
    image = imfeat.image_fromstring(cols['data:image_320'])
    print(imfeat.image_fromstring(cols['data:image']).shape)
    print(imfeat.image_fromstring(cols['data:image_320']).shape)
    print('image_75sq[%d]' % len(cols['data:image_75sq']))
    print(row)
    cur_f = picarus.api.np_fromstring(cols['feat:gist'])
    cur_h = np.fromstring(cols['hash:gist'], dtype=np.uint8)
    print(('HOG', picarus.api.np_fromstring(
        cols['feat:bovw_hog_levels2_sbin16_blocks1_clusters100'])))
    print('Hash Bits[%d]' % (cur_h.size * 8, ))
    print('Feature Dims[%d]' % (cur_f.size, ))
    f = hrc.feature(image)
Example #35
0
def display():
    client = hadoopy_hbase.connect('localhost')
    for x in hadoopy_hbase.scanner(client, 'flickr', ['metadata:title']):
        print(x)
Example #36
0
def scanner_row_column(client, table, column, **kw):
    scanner = hadoopy_hbase.scanner(client, table, columns=[column], **kw)
    for row, cols in scanner:
        yield row, cols[column]
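A short usage sketch for the generator above, reusing the 'flickr' table and 'metadata:license' column from the timing snippet earlier; per_call is forwarded to hadoopy_hbase.scanner through **kw:
client = hadoopy_hbase.connect('localhost')
for row, value in scanner_row_column(client, 'flickr', 'metadata:license', per_call=1000):
    print((row, value))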