def __init__(self, output_uri, raw_images, crop_fraction,
             min_dimension_pixels, min_area_pixels, max_area_pixels):
  super(ScrubImagesFlow, self).__init__()
  self.parameters['crop_fraction'] = crop_fraction
  self.parameters['min_dimension_pixels'] = min_dimension_pixels
  self.parameters['min_area_pixels'] = min_area_pixels
  self.parameters['max_area_pixels'] = max_area_pixels
  self.AddInput('raw_images', raw_images)
  self.AddOutput('scrubbed_images',
                 core.PertResource(self, '%s/photoid_to_image.pert' % output_uri))
  self.SetPipesBinary(__file__, 'mr_scrub_images')
  # One reduce job per input shard.
  self.num_reduce_jobs = py_pert.GetNumShards(raw_images.GetUri())
  return
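# A minimal construction sketch for ScrubImagesFlow. All values below are
# hypothetical, and 'upstream_flow' stands in for whatever flow produced the
# raw photoid_to_image.pert resource (GetOutput is assumed to be its accessor);
# only the keyword names come from the constructor above.
scrub_flow = ScrubImagesFlow(
    output_uri='local://tmp/scrubbed',                       # hypothetical URI
    raw_images=upstream_flow.GetOutput('photoid_to_image'),  # assumed accessor
    crop_fraction=0.05,        # trim a 5% border from each edge
    min_dimension_pixels=32,   # reject images with a side shorter than 32 px
    min_area_pixels=32 * 32,
    max_area_pixels=2048 * 2048,
)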
def Run(self):
  print 'pid: %s' % os.getpid()
  print 'id(py_pert): %s' % id(py_pert)
  ok, scheme, path = py_pert.ParseUri(self.uri)
  print 'path: %s' % path
  print 'exists: %s' % py_pert.Exists(self.uri)
  if py_pert.Exists(self.uri):
    print 'num shards: %s' % py_pert.GetNumShards(self.uri)
  super(MyOperation, self).Run()
  return True
def Run(self):
  print 'pid: %s' % os.getpid()
  print 'id(py_pert): %s' % id(py_pert)
  ok, scheme, path = py_pert.ParseUri(self.uri)
  print 'path: %s' % path
  print 'exists: %s' % py_pert.Exists(self.uri)
  if py_pert.Exists(self.uri):
    print 'num shards: %s' % py_pert.GetNumShards(self.uri)
  reader = py_pert.StringTableReader()
  print 'about to open reader'
  reader.Open(self.uri)
  print 'about to use reader'
  # Print the first few keys as a smoke test.
  count = 0
  for k, v in reader:
    print k
    count += 1
    if count > 5:
      break
  return True
def ComputeMaxNumSplits(input_uri):
  """Calculate the number of splits that are possible for a given pert file."""
  total_size_bytes = CalculatePertFileSize(input_uri)
  num_shards = py_pert.GetNumShards(input_uri)
  num_entries, max_block_size = GetUriSplitInfo(input_uri)
  min_split_size = max_block_size
  if min_split_size == 0:
    LOG(FATAL, 'The input is empty: %s' % input_uri)
  if not num_entries:
    LOG(FATAL, 'pert file has no entries: %s' % input_uri)
  # If we had more splits than this, one of them would be too small.
  max_num_splits = max(1, int(total_size_bytes / float(min_split_size)))
  # We can't have more splits than we have entries.
  if num_entries < max_num_splits:
    max_num_splits = num_entries
  return max_num_splits
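# A worked example of the split arithmetic above, with made-up numbers: a
# 1 GiB pert file whose largest block is 64 MiB supports at most 16 splits
# (any more and one split would be smaller than a block), and the result is
# further capped by the entry count.
total_size_bytes = 1024 ** 3       # hypothetical: 1 GiB total
min_split_size = 64 * 1024 ** 2    # hypothetical: largest block is 64 MiB
num_entries = 10                   # hypothetical entry count
max_num_splits = max(1, int(total_size_bytes / float(min_split_size)))  # 16
max_num_splits = min(max_num_splits, num_entries)  # capped at 10 entries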
def Run(self):
  LOG(INFO, 'waiting to let running processes give up memory... '
      'I need a lot and may not get enough if we rush things...')
  time.sleep(30)
  itergraph_state = LoadObjectFromUri(self.GetInput('prev_state').GetUri())
  reader = py_pert.StringTableReader()
  CHECK(reader.Open(self.GetInput('candidates').GetUri()))
  self.match_groups = {}
  num_selected_candidates = 0
  pbar = iwutil.MakeProgressBar(self.max_candidates_per_phase)
  num_edges_skipped_max_degree_constraint = 0
  num_edges_skipped_max_replication_constraint = 0
  prev_score = -float('inf')
  for ordering_key, candidate_pair_data in reader:
    image_a_id, image_b_id = iwutil.ParseUint64KeyPair(candidate_pair_data)
    if itergraph_state.PreviouslyAttempted(image_a_id, image_b_id):
      continue
    # Precondition: the candidates pert is sorted (increasing by rank or by
    # negative cbir score).
    score = iwutil.KeyToDouble(ordering_key)
    CHECK_GE(score, prev_score)
    prev_score = score
    if image_a_id not in self.match_groups:
      self.match_groups[image_a_id] = []
    match_group_size = len(self.match_groups[image_a_id])
    if match_group_size < self.max_batch_size:
      # Test the vertex degree condition.
      degree_a = itergraph_state.GetDegree(image_a_id)
      degree_b = itergraph_state.GetDegree(image_b_id)
      # version 1: skip candidate edge if either of the vertices has many edges
      #if degree_a < self.max_vertex_degree and degree_b < self.max_vertex_degree:
      # version 2: skip candidate edge only if both of the vertices have many edges
      if degree_a < self.max_vertex_degree or degree_b < self.max_vertex_degree:
        # Test the max replication condition.
        num_replications = self._GetNumReplications(image_b_id)
        if num_replications < self.max_replication_factor:
          self._IncrementReplications(image_b_id)
          self.match_groups[image_a_id].append(image_b_id)
          num_selected_candidates += 1
          pbar.update(num_selected_candidates)
        else:
          num_edges_skipped_max_replication_constraint += 1
      else:
        num_edges_skipped_max_degree_constraint += 1
    if num_selected_candidates >= self.max_candidates_per_phase:
      break
  pbar.finish()
  print ''
  print 'num_edges_skipped_max_replication_constraint: %d' % num_edges_skipped_max_replication_constraint
  print 'num_edges_skipped_max_degree_constraint: %d' % num_edges_skipped_max_degree_constraint
  print ''
  # Write out the match plan (must be sorted by key for the future join stage).
  metadata_entries = []
  for batch_id, (batch_primary_image, batch_image_ids) in enumerate(self.match_groups.iteritems()):
    if len(batch_image_ids) == 0:
      continue
    batch_name = iwutil.Uint64ToKey(batch_id)
    CHECK(batch_name)
    CHECK(len(batch_name))
    match_batch_metadata = iw_pb2.MatchBatchMetadata()
    match_batch_metadata.image_id = batch_primary_image
    match_batch_metadata.batch_name = batch_name
    match_batch_metadata.is_primary = True
    metadata_entries.append(match_batch_metadata)
    for image_id in batch_image_ids:
      next_metadata = iw_pb2.MatchBatchMetadata()
      next_metadata.image_id = image_id
      next_metadata.batch_name = batch_name
      next_metadata.is_primary = False
      metadata_entries.append(next_metadata)
  # image_id will be the key of the output, so sort by image_id.
  metadata_entries.sort(key=lambda m: m.image_id)
  match_batches_uri = self.GetOutput('sorted_match_batches').GetUri()
  # TODO(heathkh): "closing" doesn't flush to disk... this is a bug!
  # match_plan_writer = py_pert.ProtoTableWriter()
  # num_shards_features = py_pert.GetNumShards(self.features.GetUri())
  # CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), match_batches_uri, num_shards_features))
  # for metadata in metadata_entries:
  #   CHECK(metadata.IsInitialized())
  #   key = iwutil.Uint64ToKey(metadata.image_id)
  #   CHECK(match_plan_writer.Add(key, metadata.SerializeToString()))
  # match_plan_writer.Close()

  # TODO(kheath): Workaround for the above bug is to run a MR stage to reshard.
  tmp_match_batches_uri = self.GetOutput('sorted_match_batches').GetUri() + '_to_be_sharded'
  match_plan_writer = py_pert.ProtoTableWriter()
  num_shards_features = py_pert.GetNumShards(self.features.GetUri())
  # Write everything to a single temporary shard, then reshard below.
  CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), tmp_match_batches_uri, 1))
  for metadata in metadata_entries:
    CHECK(metadata.IsInitialized())
    CHECK(match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id), metadata.SerializeToString()))
  match_plan_writer.Close()
  # Manually reshard to match the shard count of the features pert.
  pertedit_bin = 'pertedit'
  cmd = '%s --input %s --output %s --new_block_size_mb=10 --num_output_shards=%d' % (
      pertedit_bin, tmp_match_batches_uri, match_batches_uri, num_shards_features)
  print cmd
  CHECK_EQ(ExecuteCmd(cmd), 0)
  CHECK(py_pert.Exists(match_batches_uri))
  ok, fp = py_pert.GetShardSetFingerprint(match_batches_uri)
  CHECK(ok)
  CHECK_EQ(len(fp), 32)
  CHECK_NE(fp, 'd41d8cd98f00b204e9800998ecf8427e', 'invalid hash of empty string')  # md5 of ''
  return
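# Why sorting metadata_entries by image_id above yields key-sorted output: the
# writer keys each entry with iwutil.Uint64ToKey(metadata.image_id), so the
# sort is only correct if that encoding is order-preserving. A minimal sketch
# of such an encoding (an assumption about iwutil's actual implementation):
# fixed-width big-endian bytes compare lexicographically in the same order as
# the uint64 values they encode.
import struct

def uint64_to_key_sketch(value):
  # 8-byte big-endian packing: byte-wise comparison matches numeric order.
  return struct.pack('>Q', value)

assert uint64_to_key_sketch(2) > uint64_to_key_sketch(1)
assert sorted(uint64_to_key_sketch(v) for v in (3, 1, 2)) == \
       [uint64_to_key_sketch(v) for v in (1, 2, 3)]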