def __init__(self, lpbench_root_uri, config):
  super(MainFlow, self).__init__()
  root_uri = '%s/%s' % (lpbench_root_uri, EnsureString(config.dataset_name))
  input_images = core.PertResource(self, '%s/photoid_to_image.pert' % root_uri,
                                   check_exists=True, is_generated=False)
  tide = core.PertResource(self, '%s/objectid_to_object.pert' % root_uri,
                           check_exists=True, is_generated=False)
  params = config.iter_graph
  # extract features
  feature_uri_base = '%s/%s/' % (
      root_uri, iwutil.HashProto(params.tuned_feature_extractor_params))
  feature_flow = util.ExtractFeaturesFlow(
      feature_uri_base, input_images,
      params.tuned_feature_extractor_params.params)
  features = feature_flow.GetOutput()
  # run CBIR to generate candidate matches
  cbir_params = params.cbir
  cbir_uri_base = '%s/cbir/%s/' % (feature_uri_base,
                                   iwutil.HashProto(cbir_params))
  cbir_flow = util.CbirMetaFlow(feature_uri_base, features, cbir_params)
  cbir_results = cbir_flow.cbir_results
  # build the match graph iteratively
  itergraph_uri_base = '%s/itergraph/%s/' % (cbir_uri_base,
                                             iwutil.HashProto(params))
  build_graph_flow = itergraph.BuildIterativeGraphFlow(
      itergraph_uri_base, features, cbir_results, params, tide)
  matches = build_graph_flow.merged_matches
  # eval match graph
  matches_to_ig_flow = util.MatchesToImageGraphFlow(
      itergraph_uri_base, matches, tide)
  image_graph = matches_to_ig_flow.GetOutput()
  eval1_flow = util.LabelpropEval1Flow(itergraph_uri_base, image_graph, tide)
  matches_to_irg_flow = util.MatchesToImageRegionGraphFlow(
      itergraph_uri_base, matches, tide)
  image_region_graph = matches_to_irg_flow.GetOutput()
  eval2_flow = util.LabelpropEval2Flow(itergraph_uri_base,
                                       image_region_graph, tide)
  self.matches = matches
  self.eval1 = eval1_flow.GetOutput()
  self.eval2 = eval2_flow.GetOutput()
  return
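
# A minimal usage sketch for MainFlow, assuming a config proto that exposes
# the dataset_name and iter_graph fields read above. The proto module and
# type name (lpbench_pb2.LpBenchConfig) and the uris are illustrative
# assumptions, not part of this codebase.
def _example_run_main_flow():
  config = lpbench_pb2.LpBenchConfig()  # hypothetical config proto type
  config.dataset_name = 'my_dataset'    # hypothetical dataset name
  # config.iter_graph.* must also be populated before constructing the flow
  flow = MainFlow('hdfs://namenode/data/lpbench', config)
  print flow.eval1.GetUri()
  print flow.eval2.GetUri()
  return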
def __init__(self, root_uri, features, index_shard, shard_num,
             num_neighbors_per_shard):
  super(CbirQueryIndexShardFlow, self).__init__()
  # Modify PipesFlow defaults
  self.SetPipesBinary(__file__, 'mr_cbir_query_index_shard')
  # Specify resource dependencies
  CHECK(isinstance(num_neighbors_per_shard, int),
        'expected integer but got: %s' % num_neighbors_per_shard)
  self.AddInput('features', features, is_primary=True)
  self.AddInput('index_shard', index_shard, add_to_cache=True)
  self.AddOutput('query_results', core.PertResource(
      self, '%s/query_results/shard_%05d.pert' % (root_uri, shard_num)))
  # set the required parameters the MR job expects to find
  self.parameters['num_neighbors_per_shard'] = num_neighbors_per_shard
  # only include keypoints when querying the first shard
  include_keypoints = 0
  if shard_num == 0:
    include_keypoints = 1
  self.parameters['include_keypoints'] = include_keypoints
  # the map stage is memory intensive: make sure only a few mappers run
  # on each node
  self.map_slots_per_node = 1
  self.desired_splits_per_map_slot = 8.0
  # override the default ram allocation for the JVM to leave ram for pipes
  self.parameters['mapred.child.java.opts'] = '-Xmx1024m'
  self.parameters['mapred.map.child.java.opts'] = '-Xmx1024m'
  self.parameters['mapred.reduce.child.java.opts'] = '-Xmx1024m'
  # don't allow speculative execution which might launch duplicate mappers
  self.parameters['mapred.map.tasks.speculative.execution'] = 'false'
  # the task could be quite slow... make sure framework doesn't kill it
  timeout_min = 120
  timeout_ms = 60000 * timeout_min
  self.parameters['mapred.task.timeout'] = str(timeout_ms)
  #self.parameters['profiler'] = 'on'
  #self.parameters['profile_timeout_sec'] = '600'
  return
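
# A sketch of the fan-out pattern this flow supports: one query flow per
# index shard, with per-shard results later combined by
# CbirMergeQueryResultShardsFlow (defined below). The index_shards list is an
# assumed input, and GetOutput() is assumed to return a flow's single output
# resource, as in MainFlow above.
def _example_query_all_shards(root_uri, features, index_shards):
  query_results = []
  for shard_num, index_shard in enumerate(index_shards):
    flow = CbirQueryIndexShardFlow(root_uri, features, index_shard,
                                   shard_num, num_neighbors_per_shard=40)
    query_results.append(flow.GetOutput())
  merge_flow = CbirMergeQueryResultShardsFlow(root_uri, query_results)
  return merge_flow.GetOutput()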
def __init__(self, input, output_uri, num_shards):
  super(SortFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_sort')
  self.num_reduce_jobs = num_shards
  self.AddInput('input', input)
  self.AddOutput('output', core.PertResource(self, output_uri))
  return
def __init__(self, feature_uri_base, features):
  super(CountFeaturesFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_cbir_count_features')
  self.num_reduce_jobs = 1
  self.AddInput('features', features)
  self.AddOutput('feature_count', core.PertResource(
      self, '%s/feature_counts.pert' % feature_uri_base))
  return
def __init__(self, resource_prefix, ordering_method, match_candidates):
  super(SortMatchCandidatesFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_sort_match_candidates')
  self.AddInput('match_candidates', match_candidates)
  self.AddOutput('sorted_match_candidates', core.PertResource(
      self, '%s/sorted_match_candidates.pert' % resource_prefix))
  self.parameters['ordering_method'] = ordering_method
  return
def __init__(self, resource_prefix, ordering_method, cbir_results):
  super(CreateUniqueMatchCandidatesFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_create_unique_match_candidates')
  self.AddInput('cbir_results', cbir_results)
  self.AddOutput('match_candidates', core.PertResource(
      self, '%s/unique_match_candidates.pert' % resource_prefix))
  self.parameters['ordering_method'] = ordering_method
  return
def __init__(self, output_uri, images, scale):
  super(CropAndScaleImagesFlow, self).__init__()
  LOG(INFO, images.GetFlow())
  self.parameters['scale'] = scale
  self.AddInput('images', images)
  self.AddOutput('cropped_images', core.PertResource(
      self, '%s/cropped_scaled_photoid_to_image.pert' % output_uri))
  self.SetPipesBinary(__file__, 'mr_crop_border')
  return
def __init__(self, input_uri, dst_uri):
  super(CopyPertFlow, self).__init__()
  # Every flow must have an input; the input uri acts as a dummy input that
  # triggers generation of the proto record.
  self.input_uri = input_uri
  ok, scheme, path, error = py_pert.ParseUri(self.input_uri)
  CHECK(ok, error)
  input_basename = os.path.basename(path)
  self.AddOutput('output', core.PertResource(
      self, '%s/%s' % (dst_uri, input_basename)))
  return
def __init__(self, output_uri, raw_images, crop_fraction,
             min_dimension_pixels, min_area_pixels, max_area_pixels):
  super(ScrubImagesFlow, self).__init__()
  self.parameters['crop_fraction'] = crop_fraction
  self.parameters['min_dimension_pixels'] = min_dimension_pixels
  self.parameters['min_area_pixels'] = min_area_pixels
  self.parameters['max_area_pixels'] = max_area_pixels
  self.AddInput('raw_images', raw_images)
  self.AddOutput('scrubbed_images', core.PertResource(
      self, '%s/photoid_to_image.pert' % output_uri))
  self.SetPipesBinary(__file__, 'mr_scrub_images')
  # match the output sharding to the input sharding
  self.num_reduce_jobs = py_pert.GetNumShards(raw_images.GetUri())
  return
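
# A sketch of chaining the image preprocessing flows above: scrub the raw
# images first, then crop and scale the result. Parameter values below are
# illustrative, not recommended defaults.
def _example_preprocess_images(output_uri, raw_images):
  scrub_flow = ScrubImagesFlow(output_uri, raw_images,
                               crop_fraction=0.05,
                               min_dimension_pixels=100,
                               min_area_pixels=100 * 100,
                               max_area_pixels=4000 * 4000)
  crop_flow = CropAndScaleImagesFlow(output_uri, scrub_flow.GetOutput(),
                                     scale=0.5)
  return crop_flow.GetOutput()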
def __init__(self, resource_prefix, features, sorted_match_batches,
             image_matcher_config, match_phase):
  super(MatchBatchesFlow, self).__init__()
  # keep scheduler from running other jobs at the same time
  self.reserve_all_resources = True
  self.SetPipesBinary(__file__, 'mr_match_batches')
  self.output_is_sorted = False
  self.AddInput('features', features)
  self.AddInput('sorted_match_batches', sorted_match_batches)
  self.AddOutput('matches', core.PertResource(
      self, '%s/matches.pert' % resource_prefix))
  self.primary_input_uri = self.GetInput('features').GetUri()
  self.secondary_input_uri = self.GetInput('sorted_match_batches').GetUri()
  self.AddParam('image_matcher_config_proto', image_matcher_config)
  self.AddParam('match_phase', match_phase)
  return
def __init__(self, resource_prefix, images, feature_extractor_params):
  super(ExtractFeaturesFlow, self).__init__()
  CHECK(feature_extractor_params.IsInitialized())
  resource_prefix = str(resource_prefix)
  self.SetPipesBinary(__file__, 'mr_extract_features')
  self.AddInput('images', images)
  # create the output location if it doesn't yet exist by writing a
  # placeholder shard
  if not py_pert.Exists(resource_prefix):
    writer = py_pert.StringTableShardWriter()
    writer.Open('%s/dummy.foo' % resource_prefix)
    writer.Close()
  self.AddOutput('features', core.PertResource(
      self, '%s/features.pert' % resource_prefix))
  self.output_chunk_size_bytes = 1024 * (2**20)  # 1 GB is max
  self.AddParam('feature_extractor_params', feature_extractor_params)
  return
def __init__(self, base_uri, dataset_name, feature_type,
             desired_features_per_megapixel):
  super(TuneFeatureExtractorDensityFlow, self).__init__()
  images_uri = '%s/%s/photoid_to_image.pert' % (base_uri, dataset_name)
  self.dataset_name = dataset_name
  self.feature_type = feature_type
  self.desired_features_per_megapixel = desired_features_per_megapixel
  self.images = core.PertResource(self, images_uri, check_exists=True,
                                  is_generated=False)
  self.AddOutput('tuned_params', core.PertResource(
      self, '%s/%s/%s_fpm%s.pert' % (base_uri, dataset_name, feature_type,
                                     desired_features_per_megapixel)))
  p = iw_pb2.FeatureExtractorParams()
  if feature_type == 'sift':
    p.ocv_sift_params.num_octave_layers = 3
    p.ocv_sift_params.contrast_threshold = 0.04
    p.ocv_sift_params.edge_threshold = 30
    p.ocv_sift_params.sigma = 1.2
    p.ocv_sift_params.upright = False
    p.ocv_sift_params.root_sift_normalization = False
    CHECK(p.ocv_sift_params.IsInitialized())
  elif feature_type == 'usift':
    p.ocv_sift_params.num_octave_layers = 3
    p.ocv_sift_params.contrast_threshold = 0.04
    p.ocv_sift_params.edge_threshold = 30
    p.ocv_sift_params.sigma = 0.6
    p.ocv_sift_params.upright = True
    p.ocv_sift_params.root_sift_normalization = False
    CHECK(p.ocv_sift_params.IsInitialized())
  elif feature_type == 'ahess':
    p.vgg_affine_sift_params.type = p.vgg_affine_sift_params.AFFINE_HESSIAN
    p.vgg_affine_sift_params.threshold = 200
    p.vgg_affine_sift_params.root_sift_normalization = False
    CHECK(p.vgg_affine_sift_params.IsInitialized())
  else:
    LOG(FATAL, 'unknown feature type specified: %s' % feature_type)
  self.initial_extractor_params = p
  return
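
# The parameter blocks above double as documentation of the starting values
# for each feature type. A sketch of building the 'ahess' proto by hand and
# feeding it straight to ExtractFeaturesFlow, bypassing tuning; the uri and
# images arguments are assumed inputs.
def _example_manual_feature_params(feature_uri_base, images):
  p = iw_pb2.FeatureExtractorParams()
  p.vgg_affine_sift_params.type = p.vgg_affine_sift_params.AFFINE_HESSIAN
  p.vgg_affine_sift_params.threshold = 200
  p.vgg_affine_sift_params.root_sift_normalization = False
  CHECK(p.IsInitialized())
  flow = ExtractFeaturesFlow(feature_uri_base, images, p)
  return flow.GetOutput()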
def __init__(self, resource_prefix, features, visual_vocab):
  super(CreateBagOfWordsFlow, self).__init__()
  # keep scheduler from running other jobs at the same time
  self.reserve_all_resources = True
  # Specify resource dependencies
  self.AddInput('features', features)
  self.AddInput('visual_vocab', visual_vocab, add_to_cache=True)
  self.AddOutput('bow', core.PertResource(
      self, '%s/bow.pert' % resource_prefix))
  # Modify PipesFlow defaults
  self.SetPipesBinary(__file__, 'mr_create_bag_of_words')
  self.desired_splits_per_core = 2
  self.input_path = features.GetUri()
  # TODO(kheath): compute slots from the memory requirement instead of
  # depending on a fixed size node and fixed size input
  self.force_max_map_slots_per_node = 4
  return
def __init__(self, resource_prefix, matches_list):
  super(MergeMatchesFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_merge_matches')
  CHECK_GE(len(matches_list), 1)  # make sure this is a list of resources
  input_uris = []
  for i, matches in enumerate(matches_list):
    CHECK(isinstance(matches, core.Resource))
    self.AddInput('matches_%d' % i, matches)
    input_uris.append(matches.GetUri())
  self.input_path = ','.join(input_uris)
  self.AddOutput('merged_matches', core.PertResource(
      self, '%s/merged_matches.pert' % resource_prefix))
  #self.num_reduce_jobs = 1
  #self.num_reduce_jobs = mr.GetNumActiveTaskTrackers()*2
  return
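
# A sketch of merging the per-phase match outputs produced by several
# MatchBatchesFlow instances into a single matches PERT. The match_flows
# list is an assumed input; GetOutput() returns each flow's single output
# resource as used elsewhere in this module.
def _example_merge_phase_matches(resource_prefix, match_flows):
  matches_list = [f.GetOutput() for f in match_flows]
  merge_flow = MergeMatchesFlow(resource_prefix, matches_list)
  return merge_flow.GetOutput()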
def __init__(self, resource_prefix, bow, bow_index, cbir_bow_params):
  super(BowCbirQueryIndexFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_cbir_bow_query_index')
  number_of_cores_per_machine = 7.0
  ram_per_machine_gb = 6.0
  min_ram_per_mapper_gb = 6.0
  mappers_per_machine = max(1, int(ram_per_machine_gb / min_ram_per_mapper_gb))
  CHECK_LE(mappers_per_machine, number_of_cores_per_machine)
  self.desired_splits_per_core = 4 / number_of_cores_per_machine
  self.force_max_map_slots_per_node = mappers_per_machine
  print 'desired_splits_per_core: %f' % self.desired_splits_per_core
  print 'force_max_map_slots_per_node: %d' % self.force_max_map_slots_per_node
  self.AddInput('bow', bow, is_primary=True)
  self.AddInput('bow_index', bow_index, add_to_cache=True)
  self.AddOutput('cbir_results', core.PertResource(
      self, '%s/match_candidates.pert' % resource_prefix))
  self.AddParam('cbir_bow_params', cbir_bow_params)
  self.AddParam('mapred.tasktracker.map.tasks.maximum', 1)
  return
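
# A sketch of the bag-of-words pipeline this flow participates in: load a
# pretrained visual vocabulary (GetBagOfWordsVisualVocabFlow, defined below),
# quantize features into BoW vectors, then query a BoW index. The uris and
# the bow_index resource are assumed inputs.
def _example_bow_pipeline(resource_prefix, features, visual_vocab_uri,
                          bow_index, cbir_bow_params):
  vocab_flow = GetBagOfWordsVisualVocabFlow(features, visual_vocab_uri)
  bow_flow = CreateBagOfWordsFlow(resource_prefix, features,
                                  vocab_flow.GetOutput())
  query_flow = BowCbirQueryIndexFlow(resource_prefix, bow_flow.GetOutput(),
                                     bow_index, cbir_bow_params)
  return query_flow.GetOutput()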
def __init__(self, resource_prefix, neighbors, feature_counts,
             query_scorer_params):
  super(CbirScoreResultsFlow, self).__init__()
  self.resource_prefix = resource_prefix
  # Configure PipesFlow parent class
  self.SetPipesBinary(__file__, 'mr_cbir_score_results')
  self.AddInput('neighbors', neighbors, is_primary=True)
  self.AddInput('feature_counts', feature_counts, add_to_cache=True)
  self.AddOutput('cbir_results', core.PertResource(
      self, '%s/cbir_results.pert' % resource_prefix))
  #self.AddOutput('cbir_config', core.FileResource(self, "%s/cbir_config.txt" % resource_prefix))
  self.config_pert = core.FileResource(
      self, '%s/cbir_config.txt' % resource_prefix)
  self.parameters['feature_counts_uri'] = feature_counts.GetUri()
  CHECK(query_scorer_params.IsInitialized())
  # TODO(heathkh): use getters / setters to simplify setting proto
  # parameters correctly since we have multiple inputs...
  self.parameters['query_scorer_params'] = mr.Base64EncodeProto(
      query_scorer_params)
  self.query_scorer_params = query_scorer_params
  timeout_min = 90
  timeout_ms = 60000 * timeout_min
  self.parameters['mapred.task.timeout'] = str(timeout_ms)
  #self.parameters['profiler'] = 'on'
  #self.parameters['profile_timeout_sec'] = '60'
  return
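
# The Base64EncodeProto call above is how structured parameters cross the
# process boundary to the MR binary as plain-text job parameters. A minimal
# sketch of that pattern using only the standard protobuf and base64 APIs;
# mr.Base64EncodeProto is assumed to be equivalent to the encode half.
import base64

def _encode_proto_param(proto):
  # serialize, then base64 so the bytes survive as a text job parameter
  return base64.b64encode(proto.SerializeToString())

def _decode_proto_param(proto_class, encoded):
  proto = proto_class()
  proto.ParseFromString(base64.b64decode(encoded))
  return proto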
def __init__(self, root_uri, query_results):
  super(CbirMergeQueryResultShardsFlow, self).__init__()
  # Specify resource dependencies
  CHECK_GT(len(query_results), 0)
  # create comma separated list of input uris
  input_uris = []
  for i, query_result in enumerate(query_results):
    self.AddInput('query_results_%d' % i, query_result)
    input_uris.append(query_result.GetUri())
  self.input_path = ','.join(input_uris)
  self.AddOutput('merged_query_results', core.PertResource(
      self, '%s/merged_query_results.pert' % root_uri))
  # Modify PipesFlow defaults
  self.SetPipesBinary(__file__, 'mr_cbir_merge_query_results')
  # override the default ram allocation for the JVM to leave ram for the
  # pipes mapper
  self.parameters['mapred.child.java.opts'] = '-Xmx1024m'
  self.parameters['mapred.reduce.child.java.opts'] = '-Xmx4024m'
  self.parameters['mapred.reduce.tasks.speculative.execution'] = 'false'
  return
def __init__(self, base_uri, matches, tide):
  super(MatchesToImageRegionGraphFlow, self).__init__()
  self.AddInput('matches', matches)
  self.AddInput('tide', tide)
  self.AddOutput('image_region_graph', core.PertResource(
      self, '%s/image_region_graph.pert' % base_uri))
  return
def __init__(self, base_uri, matches, photoids):
  super(MatchesToImageGraphFlow, self).__init__()
  self.AddInput('matches', matches)
  self.AddInput('photoids', photoids)
  self.AddOutput('image_graph', core.PertResource(
      self, '%s/image_graph.pert' % base_uri))
  return
def __init__(self, input, output_uri):
  super(PertDropValueFlow, self).__init__()
  self.SetPipesBinary(__file__, 'mr_drop_value')
  self.AddInput('input', input)
  self.AddOutput('output', core.PertResource(self, output_uri))
  return
def __init__(self, output_uri, input):
  super(FindDuplicatesFlow, self).__init__()
  self.AddInput('input', input)
  self.AddOutput('duplicated_keys', core.PertResource(
      self, '%s/duplicated_keys.pert' % output_uri))
  self.SetPipesBinary(__file__, 'mr_find_duplicates')
  return
def __init__(self, features, visual_vocab_uri):
  super(GetBagOfWordsVisualVocabFlow, self).__init__()
  # dummy dependency because all flows must have an input to get
  # scheduled... to be fixed!
  self.AddInput('features', features)
  self.AddOutput('visual_vocab', core.PertResource(
      self, visual_vocab_uri, is_generated=False, check_exists=True))
  return
def __init__(self, base_uri, image_region_graph, tide):
  super(LabelpropEval2Flow, self).__init__()
  self.AddInput('image_region_graph', image_region_graph)
  self.AddInput('tide', tide)
  self.AddOutput('labelprop_eval2', core.PertResource(
      self, '%s/labelprop_eval2.pert' % base_uri))
  return