Example #1
    def __init__(self, lpbench_root_uri, config):
        super(MainFlow, self).__init__()
        root_uri = '%s/%s' % (lpbench_root_uri,
                              EnsureString(config.dataset_name))
        input_images = core.PertResource(self,
                                         '%s/photoid_to_image.pert' %
                                         (root_uri),
                                         check_exists=True,
                                         is_generated=False)
        tide = core.PertResource(self,
                                 '%s/objectid_to_object.pert' % (root_uri),
                                 check_exists=True,
                                 is_generated=False)
        params = config.iter_graph

        # extract features
        feature_uri_base = '%s/%s/' % (
            root_uri, iwutil.HashProto(params.tuned_feature_extractor_params))
        feature_flow = util.ExtractFeaturesFlow(
            feature_uri_base, input_images,
            params.tuned_feature_extractor_params.params)
        features = feature_flow.GetOutput()

        cbir_params = params.cbir
        cbir_uri_base = '%s/cbir/%s/' % (feature_uri_base,
                                         iwutil.HashProto(cbir_params))

        cbir_flow = util.CbirMetaFlow(feature_uri_base, features, cbir_params)
        cbir_results = cbir_flow.cbir_results

        itergraph_uri_base = "%s/itergraph/%s/" % (cbir_uri_base,
                                                   iwutil.HashProto(params))
        build_graph_flow = itergraph.BuildIterativeGraphFlow(
            itergraph_uri_base, features, cbir_results, params, tide)
        matches = build_graph_flow.merged_matches

        # eval match graph
        matches_to_ig_flow = util.MatchesToImageGraphFlow(
            itergraph_uri_base, matches, tide)
        image_graph = matches_to_ig_flow.GetOutput()
        eval1_flow = util.LabelpropEval1Flow(itergraph_uri_base, image_graph,
                                             tide)

        matches_to_irg_flow = util.MatchesToImageRegionGraphFlow(
            itergraph_uri_base, matches, tide)
        image_region_graph = matches_to_irg_flow.GetOutput()
        eval2_flow = util.LabelpropEval2Flow(itergraph_uri_base,
                                             image_region_graph, tide)

        self.matches = matches
        self.eval1 = eval1_flow.GetOutput()
        self.eval2 = eval2_flow.GetOutput()
        return
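
Example 1 chains the flows defined in the examples below into one pipeline: extract features, run the CBIR queries, build the iterative match graph, then evaluate it two ways. A minimal driver sketch follows; LoadConfig() and Run() are assumptions (only the MainFlow(lpbench_root_uri, config) constructor signature comes from the code above):

# Hypothetical driver; LoadConfig() and Run() are assumptions -- only the
# MainFlow(lpbench_root_uri, config) signature is taken from Example 1.
config = LoadConfig('itergraph_config.pbtxt')  # must supply dataset_name and iter_graph
main_flow = MainFlow('maprfs://data/lpbench', config)
main_flow.Run()  # a scheduler would walk the flow graph and run stale stages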
Example #2
File: util.py Project: heathkh/iwct
 def __init__(self, root_uri, features, index_shard, shard_num, num_neighbors_per_shard):
   super(CbirQueryIndexShardFlow, self).__init__()
   # Modify PipesFlow defaults
   self.SetPipesBinary(__file__, 'mr_cbir_query_index_shard')
   # Specify resource dependencies
   CHECK(isinstance(num_neighbors_per_shard, int), 'expected integer but got: %s' % num_neighbors_per_shard)
   self.AddInput('features', features, is_primary=True)
   self.AddInput('index_shard', index_shard, add_to_cache=True) 
   self.AddOutput('query_results', core.PertResource(self, "%s/query_results/shard_%05d.pert" % (root_uri, shard_num)))
   # set the required parameters the MR job expects to find    
   
   self.parameters['num_neighbors_per_shard'] = num_neighbors_per_shard
   # only include keypoints when querying the first shard
   include_keypoints = 1 if shard_num == 0 else 0
   self.parameters['include_keypoints'] = include_keypoints
   # the map stage is memory intensive, so make sure only a few mappers run on each node
   self.map_slots_per_node = 1
   self.desired_splits_per_map_slot = 8.0
   # override the default RAM allocation for the JVM to leave RAM for the pipes process
   self.parameters['mapred.child.java.opts'] = '-Xmx1024m'
   self.parameters['mapred.map.child.java.opts'] = '-Xmx1024m'
   self.parameters['mapred.reduce.child.java.opts'] = '-Xmx1024m'    
   self.parameters['mapred.map.tasks.speculative.execution'] = 'false'  # don't allow speculative execution, which might launch duplicate mappers
   # the task could be quite slow... make sure the framework doesn't kill it
   timeout_min = 120
   timeout_ms = 60000 * timeout_min  # 120 min * 60,000 ms/min = 7,200,000 ms
   self.parameters['mapred.task.timeout'] = str(timeout_ms)
   #self.parameters['profiler'] = 'on'
   #self.parameters['profile_timeout_sec'] = '600'
   return
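
Because include_keypoints is enabled only for shard 0, a caller is expected to fan a single feature set out across every index shard and merge the per-shard results afterwards (Example 17). A sketch of that fan-out; index_shards, features, and root_uri are placeholder assumptions, and GetOutput() follows the single-output accessor pattern from Example 1:

# Hypothetical fan-out; index_shards, features, and root_uri are assumptions.
query_results = []
for shard_num, index_shard in enumerate(index_shards):
  flow = CbirQueryIndexShardFlow(root_uri, features, index_shard,
                                 shard_num, num_neighbors_per_shard=10)
  # only the shard 0 output carries keypoints (see include_keypoints above)
  query_results.append(flow.GetOutput())
merge_flow = CbirMergeQueryResultShardsFlow(root_uri, query_results)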
Example #3
File: util.py Project: heathkh/iwct
 def __init__(self, input, output_uri, num_shards):
   super(SortFlow,self).__init__()
   self.SetPipesBinary(__file__, 'mr_sort')
   self.num_reduce_jobs = num_shards
   self.AddInput('input', input)   
   self.AddOutput('output', core.PertResource(self, output_uri) )    
   return
Example #4
File: util.py Project: heathkh/iwct
 def __init__(self, feature_uri_base, features):
   super(CountFeaturesFlow,self).__init__()
   self.SetPipesBinary(__file__, 'mr_cbir_count_features')
   self.num_reduce_jobs = 1
   self.AddInput('features', features)   
   self.AddOutput('feature_count', core.PertResource(self, '%s/feature_counts.pert' % feature_uri_base) )    
   return
Example #5
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix,ordering_method, match_candidates):
   super(SortMatchCandidatesFlow, self).__init__()
   self.SetPipesBinary(__file__, 'mr_sort_match_candidates')        
   self.AddInput('match_candidates', match_candidates)
   self.AddOutput('sorted_match_candidates', core.PertResource(self, "%s/sorted_match_candidates.pert" % resource_prefix) )
   self.parameters['ordering_method'] = ordering_method
   return  
Example #6
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, ordering_method, cbir_results):
   super(CreateUniqueMatchCandidatesFlow, self).__init__()
   self.SetPipesBinary(__file__, 'mr_create_unique_match_candidates')    
   self.AddInput('cbir_results', cbir_results)
   self.AddOutput('match_candidates', core.PertResource(self, "%s/unique_match_candidates.pert" % resource_prefix))
   self.parameters['ordering_method'] = ordering_method    
   return
Example #7
File: util.py Project: heathkh/iwct
 def __init__(self, output_uri, images, scale):
   super(CropAndScaleImagesFlow, self).__init__()
   LOG(INFO, images.GetFlow())
   self.parameters['scale'] = scale
   self.AddInput('images', images)    
   self.AddOutput('cropped_images', core.PertResource(self, '%s/cropped_scaled_photoid_to_image.pert' % (output_uri)))
   self.SetPipesBinary(__file__, 'mr_crop_border')
   return
Example #8
File: util.py Project: heathkh/iwct
 def __init__(self, input_uri, dst_uri):
   super(CopyPertFlow,self).__init__()
   #self.input = core.PertResource(self, input_uri, is_generated=False, check_exists=True)
   #self.AddInput('input', input)  # every flow must have an input... this is a dummy input that will trigger the generation of the proto record
   self.input_uri = input_uri
   ok, scheme, path, error = py_pert.ParseUri(self.input_uri)
   CHECK(ok, 'failed to parse uri: %s' % self.input_uri)
   input_basename = os.path.basename(path)
   self.AddOutput('output', core.PertResource(self, "%s/%s" % (dst_uri, input_basename) ))    
   return
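
CopyPertFlow keeps the input's basename under the new destination: copying maprfs://a/features.pert with dst_uri maprfs://b yields maprfs://b/features.pert. A short usage sketch with placeholder URIs:

# Hypothetical URIs; the (input_uri, dst_uri) signature is from the code above.
copy_flow = CopyPertFlow('maprfs://a/features.pert', 'maprfs://b')
copied = copy_flow.GetOutput()  # resolves to maprfs://b/features.pert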
Example #9
File: util.py Project: heathkh/iwct
 def __init__(self, output_uri, raw_images, crop_fraction, min_dimension_pixels, min_area_pixels, max_area_pixels):
   super(ScrubImagesFlow, self).__init__()    
   self.parameters['crop_fraction'] = crop_fraction
   self.parameters['min_dimension_pixels'] = min_dimension_pixels
   self.parameters['min_area_pixels'] = min_area_pixels
   self.parameters['max_area_pixels'] = max_area_pixels
   
   self.AddInput('raw_images', raw_images)    
   self.AddOutput('scrubbed_images', core.PertResource(self, '%s/photoid_to_image.pert' % (output_uri)))
   self.SetPipesBinary(__file__, 'mr_scrub_images')
   self.num_reduce_jobs = py_pert.GetNumShards(raw_images.GetUri())
   return
Example #10
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, features, sorted_match_batches, image_matcher_config, match_phase):
   super(MatchBatchesFlow, self).__init__()
   self.reserve_all_resources = True  # keep the scheduler from running other jobs at the same time
   self.SetPipesBinary(__file__, 'mr_match_batches')
   self.output_is_sorted = False
   self.AddInput('features', features)
   self.AddInput('sorted_match_batches', sorted_match_batches)
   self.AddOutput('matches', core.PertResource(self, "%s/matches.pert" % resource_prefix) )
   self.primary_input_uri = self.GetInput('features').GetUri()
   self.secondary_input_uri = self.GetInput('sorted_match_batches').GetUri()
   self.AddParam('image_matcher_config_proto', image_matcher_config)
   self.AddParam('match_phase', match_phase)
   return
Example #11
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, images, feature_extractor_params):
   super(ExtractFeaturesFlow,self).__init__()
   CHECK(feature_extractor_params.IsInitialized())
   resource_prefix = str(resource_prefix)    
   self.SetPipesBinary(__file__, 'mr_extract_features')    
   self.AddInput('images', images)    
   # pre-create the output location with a placeholder shard if it doesn't exist yet
   if not py_pert.Exists(resource_prefix):
     writer = py_pert.StringTableShardWriter()
     writer.Open('%s/dummy.foo' % resource_prefix)
     writer.Close()
   self.AddOutput('features', core.PertResource(self, "%s/features.pert" % resource_prefix) )
   self.output_chunk_size_bytes = 1024 * (2**20)  # 1024 MiB = 1 GiB, the max chunk size
   self.AddParam('feature_extractor_params', feature_extractor_params)
   return
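
A minimal usage sketch for ExtractFeaturesFlow; parent_flow, the URIs, and the parameter values are placeholder assumptions (the FeatureExtractorParams fields are copied from the 'ahess' branch of Example 12):

# Hypothetical usage; parent_flow and the URIs are assumptions.
images = core.PertResource(parent_flow,
                           'maprfs://data/oxford/photoid_to_image.pert',
                           check_exists=True, is_generated=False)
params = iw_pb2.FeatureExtractorParams()
params.vgg_affine_sift_params.type = params.vgg_affine_sift_params.AFFINE_HESSIAN
params.vgg_affine_sift_params.threshold = 200
params.vgg_affine_sift_params.root_sift_normalization = False
feature_flow = ExtractFeaturesFlow('maprfs://data/oxford/features', images, params)
features = feature_flow.GetOutput()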
Example #12
File: util.py Project: heathkh/iwct
 def __init__(self, base_uri, dataset_name, feature_type, desired_features_per_megapixel):
   super(TuneFeatureExtractorDensityFlow, self).__init__()
   images_uri = '%s/%s/photoid_to_image.pert' % (base_uri, dataset_name)
   self.dataset_name = dataset_name
   self.feature_type = feature_type
   self.desired_features_per_megapixel = desired_features_per_megapixel
   self.images = core.PertResource(self, images_uri, check_exists=True, is_generated = False)
   self.AddOutput('tuned_params', core.PertResource(self, "%s/%s/%s_fpm%s.pert" % (base_uri, dataset_name, feature_type, desired_features_per_megapixel)))
   
   p = iw_pb2.FeatureExtractorParams()
   if feature_type == 'sift':      
     p.ocv_sift_params.num_octave_layers = 3
     p.ocv_sift_params.contrast_threshold = 0.04
     p.ocv_sift_params.edge_threshold = 30
     p.ocv_sift_params.sigma = 1.2
     p.ocv_sift_params.upright = False
     p.ocv_sift_params.root_sift_normalization = False
     CHECK(p.ocv_sift_params.IsInitialized())
   elif feature_type == 'usift':  
     p.ocv_sift_params.num_octave_layers = 3
     p.ocv_sift_params.contrast_threshold = 0.04
     p.ocv_sift_params.edge_threshold = 30
     p.ocv_sift_params.sigma = 0.6
     p.ocv_sift_params.upright = True
     p.ocv_sift_params.root_sift_normalization = False
     CHECK(p.ocv_sift_params.IsInitialized())      
   elif feature_type == 'ahess':
     p.vgg_affine_sift_params.type = p.vgg_affine_sift_params.AFFINE_HESSIAN
     p.vgg_affine_sift_params.threshold = 200
     p.vgg_affine_sift_params.root_sift_normalization = False
     CHECK(p.vgg_affine_sift_params.IsInitialized())
   else:
     LOG(FATAL, 'unknown feature type specified: %s' % (feature_type))
   
   self.initial_extractor_params = p
   return
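
Only 'sift', 'usift', and 'ahess' are accepted; any other feature_type aborts through LOG(FATAL). A brief usage sketch with placeholder base URI, dataset name, and density:

# Hypothetical values; the constructor signature is taken from the code above.
tune_flow = TuneFeatureExtractorDensityFlow(
    'maprfs://data/lpbench', 'oxford', 'ahess', 150)
tuned_params = tune_flow.GetOutput()  # single-output accessor, as in Example 1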
Example #13
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, features, visual_vocab):
   super(CreateBagOfWordsFlow, self).__init__()
   
   self.reserve_all_resources = True  # keep the scheduler from running other jobs at the same time
    
   # Specify resource dependencies
   self.AddInput('features', features)     
   self.AddInput('visual_vocab', visual_vocab, add_to_cache = True)    
   self.AddOutput('bow', core.PertResource(self, "%s/bow.pert" % resource_prefix) )    
       
   # Modify PipesFlow defaults
   self.SetPipesBinary(__file__, 'mr_create_bag_of_words')
   self.desired_splits_per_core = 2
   self.input_path = features.GetUri()
   self.force_max_map_slots_per_node = 4 #TODO(kheath): change this to compute slots from memory requirement instead of depending on a fixed size node and fixed size input
   return
Example #14
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, matches_list):
   super(MergeMatchesFlow,self).__init__()
   self.SetPipesBinary(__file__, 'mr_merge_matches')
   CHECK_GE(len(matches_list), 1)  # expect a non-empty list of match resources
   
   input_uris = []
   for i, matches in enumerate(matches_list):
     CHECK(isinstance(matches, core.Resource))
     self.AddInput('matches_%d' % i, matches)
     input_uris.append(matches.GetUri())
   self.input_path = ','.join(input_uris)
   self.AddOutput('merged_matches', core.PertResource(self, "%s/merged_matches.pert" % resource_prefix) )    
   
   #self.num_reduce_jobs = 1
   #self.num_reduce_jobs = mr.GetNumActiveTaskTrackers()*2
   return
Example #15
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, bow, bow_index, cbir_bow_params):
   super(BowCbirQueryIndexFlow,self).__init__()
   self.SetPipesBinary(__file__, 'mr_cbir_bow_query_index')    
   number_of_cores_per_machine = 7.0
   ram_per_machine_gb = 6.0
   min_ram_per_mapper_gb = 6.0
   mappers_per_machine = max(1, int(ram_per_machine_gb/min_ram_per_mapper_gb))
   CHECK_LE(mappers_per_machine, number_of_cores_per_machine)
   self.desired_splits_per_core = 4 / number_of_cores_per_machine  # 4/7.0 ~= 0.57
   self.force_max_map_slots_per_node = mappers_per_machine
   print 'desired_splits_per_core: %f' % self.desired_splits_per_core
   print 'force_max_map_slots_per_node: %d' % self.force_max_map_slots_per_node
   self.AddInput('bow', bow, is_primary=True)   
   self.AddInput('bow_index', bow_index, add_to_cache = True)
   self.AddOutput('cbir_results', core.PertResource(self, "%s/match_candidates.pert" % resource_prefix) )
   self.AddParam('cbir_bow_params', cbir_bow_params)    
   self.AddParam('mapred.tasktracker.map.tasks.maximum', 1)
   return
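
Worked through, the constants above pin one mapper per machine and a fractional split target; a quick check of the arithmetic (values copied from the constructor):

# Worked check of the slot math above; nothing here is new behavior.
number_of_cores_per_machine = 7.0
ram_per_machine_gb = 6.0
min_ram_per_mapper_gb = 6.0
mappers_per_machine = max(1, int(ram_per_machine_gb / min_ram_per_mapper_gb))  # = 1
desired_splits_per_core = 4 / number_of_cores_per_machine  # = 4/7.0 ~= 0.57
# printing this with '%d' would truncate 0.57 to 0, which is why the print
# statement above uses '%f'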
Example #16
File: util.py Project: heathkh/iwct
 def __init__(self, resource_prefix, neighbors, feature_counts, query_scorer_params):
   super(CbirScoreResultsFlow,self).__init__()
   self.resource_prefix = resource_prefix
   # Configure PipesFlow parent class   
   self.SetPipesBinary(__file__, 'mr_cbir_score_results')        
   self.AddInput('neighbors', neighbors, is_primary=True)
   self.AddInput('feature_counts', feature_counts, add_to_cache=True)
   self.AddOutput('cbir_results', core.PertResource(self, "%s/cbir_results.pert" % resource_prefix) )
   #self.AddOutput('cbir_config', core.FileResource(self, "%s/cbir_config.txt" % resource_prefix) )
   self.config_pert = core.FileResource(self, "%s/cbir_config.txt" % resource_prefix)
   self.parameters['feature_counts_uri'] = feature_counts.GetUri() 
   CHECK(query_scorer_params.IsInitialized()) 
   self.parameters['query_scorer_params'] = mr.Base64EncodeProto(query_scorer_params)  # TODO(heathkh): use getters/setters to simplify setting proto parameters correctly
   # since we have multiple inputs...
   self.query_scorer_params = query_scorer_params
   timeout_min = 90
   timeout_ms = 60000*timeout_min
   self.parameters['mapred.task.timeout'] = str(timeout_ms)
   #self.parameters['profiler'] = 'on'
   #self.parameters['profile_timeout_sec'] = '60'
   return
Example #17
File: util.py Project: heathkh/iwct
 def __init__(self, root_uri, query_results):
   super(CbirMergeQueryResultShardsFlow, self).__init__()
   # Specify resource dependencies
   CHECK_GT(len(query_results), 0)
   
   # create a comma-separated list of input URIs
   input_uris = []
   for i, query_result in enumerate(query_results):
     self.AddInput('query_results_%d' % i, query_result)
     input_uris.append(query_result.GetUri())    
   self.input_path = ','.join(input_uris)
    
   self.AddOutput('merged_query_results', core.PertResource(self, "%s/merged_query_results.pert" % (root_uri) ))    
           
   # Modify PipesFlow defaults
   self.SetPipesBinary(__file__, 'mr_cbir_merge_query_results')
   
   # override the default RAM allocation for the JVM to leave RAM for the pipes mapper
   self.parameters['mapred.child.java.opts'] = '-Xmx1024m'
   self.parameters['mapred.reduce.child.java.opts'] = '-Xmx4024m'
   self.parameters['mapred.reduce.tasks.speculative.execution'] = 'false'
   return
Example #18
File: util.py Project: heathkh/iwct
 def __init__(self, base_uri, matches, tide):
   super(MatchesToImageRegionGraphFlow, self).__init__()
   self.AddInput('matches', matches )    
   self.AddInput('tide', tide )        
   self.AddOutput('image_region_graph', core.PertResource(self, "%s/image_region_graph.pert" % base_uri)  )    
   return
Example #19
File: util.py Project: heathkh/iwct
 def __init__(self, base_uri, matches, photoids):
   super(MatchesToImageGraphFlow, self).__init__()
   self.AddInput('matches', matches )    
   self.AddInput('photoids', photoids )        
   self.AddOutput('image_graph', core.PertResource(self, "%s/image_graph.pert" % base_uri)  )    
   return
Example #20
File: util.py Project: heathkh/iwct
 def __init__(self, input, output_uri):
   super(PertDropValueFlow,self).__init__()
   self.SetPipesBinary(__file__, 'mr_drop_value')
   self.AddInput('input', input)   
   self.AddOutput('output', core.PertResource(self, output_uri) )    
   return  
Example #21
File: util.py Project: heathkh/iwct
 def __init__(self, output_uri, input):
   super(FindDuplicatesFlow, self).__init__()
   self.AddInput('input', input)    
   self.AddOutput('duplicated_keys', core.PertResource(self, '%s/duplicated_keys.pert' % (output_uri)))
   self.SetPipesBinary(__file__, 'mr_find_duplicates')    
   return
Example #22
File: util.py Project: heathkh/iwct
 def __init__(self, features, visual_vocab_uri):
   super(GetBagOfWordsVisualVocabFlow, self).__init__()
   self.AddInput('features', features)  # dummy dependency because all flows must have an input to get scheduled... to be fixed!
   self.AddOutput('visual_vocab', core.PertResource(self, visual_vocab_uri, is_generated=False, check_exists=True))    
   return
Example #23
File: util.py Project: heathkh/iwct
 def __init__(self, base_uri, image_region_graph, tide):
   super(LabelpropEval2Flow, self).__init__()
   self.AddInput('image_region_graph', image_region_graph )
   self.AddInput('tide', tide )        
   self.AddOutput('labelprop_eval2', core.PertResource(self, "%s/labelprop_eval2.pert" % base_uri)  )    
   return