def Run(self):
  print 'pid: %s' % os.getpid()
  print 'id(py_pert): %s' % id(py_pert)
  ok, scheme, path = py_pert.ParseUri(self.uri)
  print 'path: %s' % path
  print 'exists: %s' % py_pert.Exists(self.uri)
  if py_pert.Exists(self.uri):
    print 'num shards: %s' % py_pert.GetNumShards(self.uri)
  super(MyOperation, self).Run()
  return True
def CreateTestData(self):
  self.test_data_path = '/tmp/TestWgcFilter/'
  self.images_uri = 'local://%s/images.pert' % self.test_data_path
  self.features_uri = 'local://%s/features.pert' % self.test_data_path
  self.feature_counts_uri = 'local://%s/features_counts.pert' % self.test_data_path
  test_images_path = os.path.dirname(__file__)
  if not py_pert.Exists(self.images_uri):
    randomize = False
    max_images = None
    iwutil.PackImagesDirectoryToPert(test_images_path, self.images_uri, randomize, max_images)
  if not py_pert.Exists(self.features_uri):
    iwutil.ExtractFeatures(self.images_uri, self.features_uri)
  if not py_pert.Exists(self.feature_counts_uri):
    cbirutil.CreateFeatureCountsTable(self.features_uri, self.feature_counts_uri)
  return
def Run(self):
  matches_uri = self.GetInput('matches').GetUri()
  photoids_uri = self.GetInput('photoids').GetUri()
  image_graph_uri = self.GetOutput().GetUri()
  ok, ig = py_imagegraph.CreateImageGraph2(matches_uri, photoids_uri)
  CHECK(ok)
  py_imagegraph.SaveImageGraph(ig, image_graph_uri)
  CHECK(py_pert.Exists(image_graph_uri))
  return
def Run(self):
  print 'pid: %s' % os.getpid()
  print 'id(py_pert): %s' % id(py_pert)
  ok, scheme, path = py_pert.ParseUri(self.uri)
  print 'path: %s' % path
  print 'exists: %s' % py_pert.Exists(self.uri)
  if py_pert.Exists(self.uri):
    print 'num shards: %s' % py_pert.GetNumShards(self.uri)
  reader = py_pert.StringTableReader()
  print 'about to open reader'
  reader.Open(self.uri)
  print 'about to use reader'
  count = 0
  for k, v in reader:
    print k
    count += 1
    if count > 5:
      break
  return True
def __CreateValidUriCacheString(self, uris_to_cache):
  # TODO(kheath): If the hadoop pipes -files flag is passed a URI with fewer
  # than three slashes after the scheme, it fails. Rewrite URIs here to work
  # around this; remove this hack once the bug is fixed in Hadoop.
  clean_cache_uris = []
  for uri in uris_to_cache:
    ok, scheme, path, error = py_pert.ParseUri(uri)
    CHECK(ok)
    CHECK_EQ(scheme, 'maprfs')
    CHECK(py_pert.Exists(uri), 'uri not there: %s' % uri)
    clean_cache_uris.append(py_pert.CanonicalizeUri(uri))
  uri_cache_string = ','.join(clean_cache_uris)
  return uri_cache_string
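# The workaround above relies on py_pert.CanonicalizeUri emitting URIs with
# three slashes after the scheme, since hadoop pipes rejects anything shorter.
# A minimal sketch of that normalization, assuming a simple rewrite suffices
# (_canonicalize_maprfs_uri is hypothetical and not part of py_pert):
import re

def _canonicalize_maprfs_uri(uri):
  # Collapse whatever follows 'maprfs:' to exactly three slashes before the
  # path, e.g. 'maprfs:/data/x.pert' -> 'maprfs:///data/x.pert'.
  match = re.match(r'^(maprfs):/*(.*)$', uri)
  assert match, 'not a maprfs uri: %s' % uri
  scheme, path = match.groups()
  return '%s:///%s' % (scheme, path)

assert _canonicalize_maprfs_uri('maprfs:/data/x.pert') == 'maprfs:///data/x.pert'
assert _canonicalize_maprfs_uri('maprfs:///data/x.pert') == 'maprfs:///data/x.pert'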
def __init__(self, resource_prefix, images, feature_extractor_params):
  super(ExtractFeaturesFlow, self).__init__()
  CHECK(feature_extractor_params.IsInitialized())
  resource_prefix = str(resource_prefix)
  self.SetPipesBinary(__file__, 'mr_extract_features')
  self.AddInput('images', images)
  # Write a placeholder shard so the resource_prefix location exists.
  if not py_pert.Exists(resource_prefix):
    writer = py_pert.StringTableShardWriter()
    writer.Open('%s/dummy.foo' % resource_prefix)
    writer.Close()
  self.AddOutput('features', core.PertResource(self, '%s/features.pert' % resource_prefix))
  self.output_chunk_size_bytes = 1024 * (2**20)  # 1 GB is max
  self.AddParam('feature_extractor_params', feature_extractor_params)
  return
def Run(self):
  matches_uri = self.GetInput('matches').GetUri()
  image_region_graph_uri = self.GetOutput().GetUri()
  tide_uri = self.GetInput('tide').GetUri()
  #ok, irg = py_imageregiongraph.CreateImageRegionGraph(matches_uri, tide_uri)
  #CHECK(ok)
  #py_imageregiongraph.SaveImageRegionGraph(irg, image_region_graph_uri)
  ok = py_imageregiongraph.CreateAndSaveImageRegionGraph(matches_uri, tide_uri, image_region_graph_uri)
  CHECK(ok)
  CHECK(py_pert.Exists(image_region_graph_uri))
  return
def Run(self):
  bow_uri = self.GetInput('bow').GetUri()
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(bow_uri))
  visual_vocab_size = self.cbir_bow_params.visual_vocab_size
  num_docs = reader.Entries()
  index = None
  if self.cbir_bow_params.implementation == 'inria':
    index = py_inria.InriaIndex()
  elif self.cbir_bow_params.implementation == 'ctis':
    index = py_ctis.CtisIndex()
    index.StartCreate(visual_vocab_size, num_docs)
  else:
    LOG(FATAL, 'unexpected')
  #vv_uri = self.GetInput('visual_vocab').GetUri()
  temp_ivf_filepath = tempfile.mkdtemp()
  bag_of_words = bow_pb2.BagOfWords()
  progress = iwutil.MakeProgressBar(reader.Entries())
  for i, (key, value) in enumerate(reader):
    image_id = iwutil.KeyToUint64(key)
    bag_of_words.ParseFromString(value)
    index.Add(image_id, bag_of_words)
    progress.update(i)
  # Build the index locally, then copy it to the final location.
  index.Save(temp_ivf_filepath)
  py_pert.Remove(self.index_base_uri)
  mr.CopyUri('local://' + temp_ivf_filepath, self.index_base_uri)
  CHECK(py_pert.Exists(self.index_base_uri + '/index.ivf'))
  CHECK(py_pert.Exists(self.index_base_uri + '/index.ivfids'))
  shutil.rmtree(temp_ivf_filepath, ignore_errors=True)
  return True
def Run(self): """ this is a dummy class... doesn't need to do anything.""" vv_uri = self.GetOutput().GetUri() CHECK(py_pert.Exists(vv_uri), 'expected uri to exist: %s' % (vv_uri)) self.is_done = True # hack: needed because the timestamp condition doesn't hold here! return
def PreRunConfig(self):
  sorted_match_batches_uri = self.GetInput('sorted_match_batches').GetUri()
  CHECK(py_pert.Exists(sorted_match_batches_uri), 'expected uri to exist: %s' % sorted_match_batches_uri)
  return
def Run(self):
  LOG(INFO, 'waiting to let running processes give up memory... I need a lot and may not get enough if we rush things...')
  time.sleep(30)
  itergraph_state = LoadObjectFromUri(self.GetInput('prev_state').GetUri())
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(self.GetInput('candidates').GetUri()))
  self.match_groups = {}
  num_selected_candidates = 0
  pbar = iwutil.MakeProgressBar(self.max_candidates_per_phase)
  num_edges_skipped_max_degree_constraint = 0
  num_edges_skipped_max_replication_constraint = 0
  prev_score = -float('inf')
  for ordering_key, candidate_pair_data in reader:
    image_a_id, image_b_id = iwutil.ParseUint64KeyPair(candidate_pair_data)
    if itergraph_state.PreviouslyAttempted(image_a_id, image_b_id):
      #print 'skipping previously attempted edge'
      continue
    # Precondition: the candidates pert is sorted (increasing by rank or by
    # negative cbir score).
    score = iwutil.KeyToDouble(ordering_key)
    CHECK_GE(score, prev_score)
    prev_score = score
    if image_a_id not in self.match_groups:
      self.match_groups[image_a_id] = []
    match_group_size = len(self.match_groups[image_a_id])
    if match_group_size < self.max_batch_size:
      # Test the vertex degree condition.
      degree_a = itergraph_state.GetDegree(image_a_id)
      degree_b = itergraph_state.GetDegree(image_b_id)
      # version 1: skip candidate edge if either of the vertices has many edges
      #if degree_a < self.max_vertex_degree and degree_b < self.max_vertex_degree:
      # version 2: skip candidate edge only if both of the vertices have many edges
      if degree_a < self.max_vertex_degree or degree_b < self.max_vertex_degree:
        # Test the max replication condition.
        num_replications = self._GetNumReplications(image_b_id)
        if num_replications < self.max_replication_factor:
          self._IncrementReplications(image_b_id)
          self.match_groups[image_a_id].append(image_b_id)
          num_selected_candidates += 1
          pbar.update(num_selected_candidates)
        else:
          num_edges_skipped_max_replication_constraint += 1
      else:
        num_edges_skipped_max_degree_constraint += 1
    if num_selected_candidates >= self.max_candidates_per_phase:
      break
  pbar.finish()
  print ''
  print ''
  print 'num_edges_skipped_max_replication_constraint: %d' % num_edges_skipped_max_replication_constraint
  print 'num_edges_skipped_max_degree_constraint: %d' % num_edges_skipped_max_degree_constraint
  print ''
  print ''

  # Write out the match plan (must be sorted by key for the future join stage).
  metadata_entries = []
  for batch_id, (batch_primary_image, batch_image_ids) in enumerate(self.match_groups.iteritems()):
    if len(batch_image_ids) == 0:
      continue
    batch_name = iwutil.Uint64ToKey(batch_id)
    CHECK(batch_name)
    CHECK(len(batch_name))
    match_batch_metadata = iw_pb2.MatchBatchMetadata()
    match_batch_metadata.image_id = batch_primary_image
    match_batch_metadata.batch_name = batch_name
    match_batch_metadata.is_primary = True
    metadata_entries.append(match_batch_metadata)
    for image_id in batch_image_ids:
      next_metadata = iw_pb2.MatchBatchMetadata()
      next_metadata.image_id = image_id
      next_metadata.batch_name = batch_name
      next_metadata.is_primary = False
      metadata_entries.append(next_metadata)
  # image_id will be the key of the output, so sort by image_id.
  metadata_entries.sort(key=lambda m: m.image_id)
  match_batches_uri = self.GetOutput('sorted_match_batches').GetUri()

  # TODO(heathkh): "closing" doesn't flush to disk... this is a bug!
  # match_plan_writer = py_pert.ProtoTableWriter()
  # num_shards_features = py_pert.GetNumShards(self.features.GetUri())
  # CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), match_batches_uri, num_shards_features))
  # for metadata in metadata_entries:
  #   CHECK(metadata.IsInitialized())
  #   key = iwutil.Uint64ToKey(metadata.image_id)
  #   CHECK(match_plan_writer.Add(key, metadata.SerializeToString()))
  # match_plan_writer.Close()

  # TODO(kheath): Workaround for the above bug: run a MR stage to reshard.
  tmp_match_batches_uri = self.GetOutput('sorted_match_batches').GetUri() + '_to_be_sharded'
  match_plan_writer = py_pert.ProtoTableWriter()
  num_shards_features = py_pert.GetNumShards(self.features.GetUri())
  CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), tmp_match_batches_uri, 1))
  for metadata in metadata_entries:
    CHECK(metadata.IsInitialized())
    CHECK(match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id), metadata.SerializeToString()))
  match_plan_writer.Close()

  # Manually reshard.
  pertedit_bin = 'pertedit'
  cmd = '%s --input %s --output %s --new_block_size_mb=10 --num_output_shards=%d' % (
      pertedit_bin, tmp_match_batches_uri, match_batches_uri, num_shards_features)
  print cmd
  CHECK_EQ(ExecuteCmd(cmd), 0)
  CHECK(py_pert.Exists(match_batches_uri))
  ok, fp = py_pert.GetShardSetFingerprint(match_batches_uri)
  CHECK(ok)
  CHECK_EQ(len(fp), 32)
  CHECK_NE(fp, 'd41d8cd98f00b204e9800998ecf8427e', 'fingerprint is the md5 of the empty string')
  return
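# The plan above is keyed by iwutil.Uint64ToKey(metadata.image_id) after
# sorting by image_id, which only yields a key-sorted pert if the key encoding
# is order-preserving. A minimal sketch of such an encoding, assuming iwutil
# uses fixed-width big-endian packing (these helpers are illustrative, not
# iwutil's actual implementation):
import struct

def Uint64ToKey(value):
  # Fixed-width big-endian bytes: lexicographic key order matches numeric order.
  return struct.pack('>Q', value)

def KeyToUint64(key):
  return struct.unpack('>Q', key)[0]

assert Uint64ToKey(1) < Uint64ToKey(2 ** 32) < Uint64ToKey(2 ** 63)
assert KeyToUint64(Uint64ToKey(12345)) == 12345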