Beispiel #1
0
    def Run(self):
        """Print diagnostic info about self.uri, then delegate to the parent
        class's Run().

        Returns:
            True (always).
        """
        print 'pid: %s' % os.getpid()
        print 'id(py_pert): %s' % id(py_pert)
        # NOTE(review): elsewhere in this file ParseUri is unpacked into four
        # values (ok, scheme, path, error) -- confirm which arity is correct.
        ok, scheme, path = py_pert.ParseUri(self.uri)
        print 'path: %s' % path
        print 'exists: %s' % py_pert.Exists(self.uri)
        if py_pert.Exists(self.uri):
            print 'num shards: %s' % py_pert.GetNumShards(self.uri)

        super(MyOperation, self).Run()

        return True
Beispiel #2
0
 def CreateTestData(self):
     """Build the on-disk test fixtures (images, features, feature counts).

     Each pert table under /tmp/TestWgcFilter/ is generated only when it does
     not already exist, so repeated test runs reuse the cached data.
     """
     base = '/tmp/TestWgcFilter/'
     self.test_data_path = base
     self.images_uri = 'local://%s/images.pert' % base
     self.features_uri = 'local://%s/features.pert' % base
     self.feature_counts_uri = 'local://%s/features_counts.pert' % base
     source_image_dir = os.path.dirname(__file__)
     if not py_pert.Exists(self.images_uri):
         # deterministic ordering (no randomize), pack all images (no cap)
         iwutil.PackImagesDirectoryToPert(source_image_dir, self.images_uri,
                                          False, None)
     if not py_pert.Exists(self.features_uri):
         iwutil.ExtractFeatures(self.images_uri, self.features_uri)
     if not py_pert.Exists(self.feature_counts_uri):
         cbirutil.CreateFeatureCountsTable(self.features_uri,
                                           self.feature_counts_uri)
     return
Beispiel #3
0
 def Run(self):
   """Create the image graph from 'matches' and 'photoids', save it, and
   verify the output was written."""
   src_matches = self.GetInput('matches').GetUri()
   src_photoids = self.GetInput('photoids').GetUri()
   dst_uri = self.GetOutput().GetUri()
   ok, graph = py_imagegraph.CreateImageGraph2(src_matches, src_photoids)
   CHECK(ok)
   py_imagegraph.SaveImageGraph(graph, dst_uri)
   CHECK(py_pert.Exists(dst_uri))
   return
Beispiel #4
0
    def Run(self):
        print 'pid: %s' % os.getpid()
        print 'id(py_pert): %s' % id(py_pert)
        ok, scheme, path = py_pert.ParseUri(self.uri)
        print 'path: %s' % path
        print 'exists: %s' % py_pert.Exists(self.uri)
        if py_pert.Exists(self.uri):
            print 'num shards: %s' % py_pert.GetNumShards(self.uri)
            reader = py_pert.StringTableReader()
            print 'about to open reader'
            reader.Open(self.uri)
            print 'about to use reader'
            count = 0
            for k, v in reader:
                print k
                count += 1
                if count > 5:
                    break

        return True
Beispiel #5
0
    def __CreateValidUriCacheString(self, uris_to_cache):
        """Return a comma-separated string of canonicalized maprfs uris.

        Every uri must parse, use the 'maprfs' scheme, and already exist.
        """
        # TODO(kheath): hadoop pipes -files fails when a uri has fewer than
        # /// slashes after the scheme; canonicalizing rewrites uris to work
        # around that bug. Remove this hack once it is fixed in hadoop.
        canonical = []
        for raw_uri in uris_to_cache:
            ok, scheme, path, error = py_pert.ParseUri(raw_uri)
            CHECK(ok)
            CHECK_EQ(scheme, 'maprfs')
            CHECK(py_pert.Exists(raw_uri), 'uri not there: %s' % raw_uri)
            canonical.append(py_pert.CanonicalizeUri(raw_uri))
        return ','.join(canonical)
Beispiel #6
0
 def __init__(self, resource_prefix, images, feature_extractor_params):
   """Configure the MapReduce flow that extracts features from images.

   Args:
     resource_prefix: base uri under which the 'features' output is written.
     images: input resource holding the images pert table.
     feature_extractor_params: initialized params proto for the extractor.
   """
   super(ExtractFeaturesFlow,self).__init__()
   CHECK(feature_extractor_params.IsInitialized())
   resource_prefix = str(resource_prefix)    
   self.SetPipesBinary(__file__, 'mr_extract_features')    
   self.AddInput('images', images)    
   # Ensure the output location exists by writing a placeholder shard.
   if not py_pert.Exists(resource_prefix):
     writer = py_pert.StringTableShardWriter()
     writer.Open('%s/dummy.foo' % (resource_prefix))
     writer.Close()        
   self.AddOutput('features', core.PertResource(self, "%s/features.pert" % resource_prefix) )
   self.output_chunk_size_bytes = 1024 * (2**20) # 1 GB is max    
   self.AddParam('feature_extractor_params', feature_extractor_params)
   return
Beispiel #7
0
 def Run(self):
   """Build and persist the image region graph, then verify it exists."""
   matches = self.GetInput('matches').GetUri()
   output_uri = self.GetOutput().GetUri()
   tide = self.GetInput('tide').GetUri()
   # Single-call variant: creates the graph and writes it in one step.
   ok = py_imageregiongraph.CreateAndSaveImageRegionGraph(matches, tide,
                                                          output_uri)
   CHECK(ok)
   CHECK(py_pert.Exists(output_uri))
   return
Beispiel #8
0
 def Run(self):             
   """Build a CBIR index from the bag-of-words input and publish it.

   Reads every (image_id, BagOfWords) entry from the 'bow' input, adds it to
   the backend index selected by cbir_bow_params.implementation, saves the
   index into a temp directory, and copies it to self.index_base_uri.

   Returns:
     True on completion (failures abort via CHECK / LOG(FATAL)).
   """
   bow_uri = self.GetInput('bow').GetUri()
   reader = py_pert.StringTableReader()    
   CHECK(reader.Open(bow_uri))    
   visual_vocab_size = self.cbir_bow_params.visual_vocab_size
   num_docs = reader.Entries()
   index = None
   if self.cbir_bow_params.implementation == 'inria':
     index = py_inria.InriaIndex()
   elif self.cbir_bow_params.implementation == 'ctis':
     index = py_ctis.CtisIndex()
     # NOTE(review): only the ctis backend is pre-sized here -- confirm the
     # inria backend really needs no StartCreate-style step.
     index.StartCreate(visual_vocab_size, num_docs)
   else:
     LOG(FATAL, 'unexpected')  
   
   #vv_uri = self.GetInput('visual_vocab').GetUri()
   temp_ivf_filepath = tempfile.mkdtemp()
       
   # Reuse one message object across iterations; ParseFromString overwrites it.
   bag_of_words = bow_pb2.BagOfWords()
   progress = iwutil.MakeProgressBar(reader.Entries())
   for i, (key, value) in enumerate(reader):
     image_id = iwutil.KeyToUint64(key)
     bag_of_words.ParseFromString(value)
     index.Add(image_id, bag_of_words)
     progress.update(i)
   
   index.Save(temp_ivf_filepath)
   
   
   # Replace any previous index at the destination before copying.
   py_pert.Remove(self.index_base_uri)
   mr.CopyUri('local://' + temp_ivf_filepath , self.index_base_uri)    
   CHECK(py_pert.Exists(self.index_base_uri + '/index.ivf'))
   CHECK(py_pert.Exists(self.index_base_uri + '/index.ivfids'))
   
   shutil.rmtree(temp_ivf_filepath, ignore_errors=True)
   return True     
Beispiel #9
0
 def Run(self):
   """No-op runner: only verifies the expected output uri already exists."""
   uri = self.GetOutput().GetUri()
   CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % (uri))
   # hack: the timestamp-based done condition does not hold here, so mark
   # completion explicitly.
   self.is_done = True
   return
Beispiel #10
0
 def PreRunConfig(self):
   """Precondition check: the sorted match batches input must exist."""
   uri = self.GetInput('sorted_match_batches').GetUri()
   CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % (uri))
   return
Beispiel #11
0
    def Run(self):
        LOG(
            INFO,
            'waiting to let running processes give up memory... I need a lot and may not get enough if we rush things...'
        )
        time.sleep(30)
        itergraph_state = LoadObjectFromUri(
            self.GetInput('prev_state').GetUri())
        reader = py_pert.StringTableReader()
        CHECK(reader.Open(self.GetInput('candidates').GetUri()))
        self.match_groups = {}
        num_selected_candidates = 0

        pbar = iwutil.MakeProgressBar(self.max_candidates_per_phase)
        num_edges_skipped_max_degree_constraint = 0
        num_edges_skipped_max_replication_constraint = 0
        prev_score = -float('inf')

        for ordering_key, candidate_pair_data in reader:
            image_a_id, image_b_id = iwutil.ParseUint64KeyPair(
                candidate_pair_data)
            if itergraph_state.PreviouslyAttempted(image_a_id, image_b_id):
                #print 'skipping previous attempted edge'
                continue
            # check precondition... candidates pert is sorted (increasing by rank or by negative cbir score)
            score = iwutil.KeyToDouble(ordering_key)
            CHECK_GE(score, prev_score)
            prev_score = score

            if image_a_id not in self.match_groups:
                self.match_groups[image_a_id] = []

            match_group_size = len(self.match_groups[image_a_id])

            if match_group_size < self.max_batch_size:
                # test vertex degree condition
                degree_a = itergraph_state.GetDegree(image_a_id)
                degree_b = itergraph_state.GetDegree(image_b_id)

                # version 1: skip candidate edge if either of the vertices has many edges
                #if degree_a < self.max_vertex_degree and degree_b < self.max_vertex_degree:

                # version 2: skip candidate edge only if both of the vertices have many edges
                if degree_a < self.max_vertex_degree or degree_b < self.max_vertex_degree:
                    # test max replication condition
                    num_replications = self._GetNumReplications(image_b_id)
                    if num_replications < self.max_replication_factor:
                        self._IncrementReplications(image_b_id)
                        self.match_groups[image_a_id].append(image_b_id)
                        num_selected_candidates += 1
                        pbar.update(num_selected_candidates)
                    else:
                        num_edges_skipped_max_replication_constraint += 1
                else:
                    num_edges_skipped_max_degree_constraint += 1

            if num_selected_candidates >= self.max_candidates_per_phase:
                break

        pbar.finish()

        print ''
        print ''
        print 'num_edges_skipped_max_replication_constraint: %d' % (
            num_edges_skipped_max_replication_constraint)
        print 'num_edges_skipped_max_degree_constraint: %d' % (
            num_edges_skipped_max_degree_constraint)
        print ''
        print ''

        # write out the match plan (must be sorted by key for future join stage)
        metadata_entries = []

        for batch_id, (batch_primary_image, batch_image_ids) in enumerate(
                self.match_groups.iteritems()):
            if len(batch_image_ids) == 0:
                continue
            batch_name = iwutil.Uint64ToKey(batch_id)
            CHECK(batch_name)
            CHECK(len(batch_name))
            match_batch_metadata = iw_pb2.MatchBatchMetadata()
            match_batch_metadata.image_id = batch_primary_image
            match_batch_metadata.batch_name = batch_name
            match_batch_metadata.is_primary = True
            metadata_entries.append(match_batch_metadata)

            for image_id in batch_image_ids:
                next_metadata = iw_pb2.MatchBatchMetadata()
                next_metadata.image_id = image_id
                next_metadata.batch_name = batch_name
                next_metadata.is_primary = False
                metadata_entries.append(next_metadata)

        # image_id will be the key of output, so need to sort by image_id
        metadata_entries.sort(key=lambda m: m.image_id)
        match_batches_uri = self.GetOutput('sorted_match_batches').GetUri()

        # TODO(heathkh): "closing" doesn't flush to disk... this is a bug!
        #    match_plan_writer = py_pert.ProtoTableWriter()
        #    num_shards_features = py_pert.GetNumShards(self.features.GetUri())
        #    CHECK(match_plan_writer.Open(iw_pb2.MatchBatchMetadata(), match_batches_uri, num_shards_features))
        #    for metadata in metadata_entries:
        #      CHECK(metadata.IsInitialized())
        #      key = iwutil.Uint64ToKey(metadata.image_id)
        #      CHECK(match_plan_writer.Add(key, metadata.SerializeToString()))
        #    match_plan_writer.Close()

        # TODO(kheath):   Work around for above bug is to run a MR stage to reshard
        tmp_match_batches_uri = self.GetOutput(
            'sorted_match_batches').GetUri() + '_to_be_sharded'
        match_plan_writer = py_pert.ProtoTableWriter()
        num_shards_features = py_pert.GetNumShards(self.features.GetUri())
        CHECK(
            match_plan_writer.Open(iw_pb2.MatchBatchMetadata(),
                                   tmp_match_batches_uri, 1))

        for metadata in metadata_entries:
            CHECK(metadata.IsInitialized())
            CHECK(
                match_plan_writer.Add(iwutil.Uint64ToKey(metadata.image_id),
                                      metadata.SerializeToString()))
        match_plan_writer.Close()

        # manually reshard
        pertedit_bin = 'pertedit'
        cmd = '%s --input %s --output %s --new_block_size_mb=10 --num_output_shards=%d' % (
            pertedit_bin, tmp_match_batches_uri, match_batches_uri,
            num_shards_features)
        print cmd
        CHECK_EQ(ExecuteCmd(cmd), 0)

        CHECK(py_pert.Exists(match_batches_uri))

        ok, fp = py_pert.GetShardSetFingerprint(match_batches_uri)
        CHECK(ok)
        CHECK(len(fp), 32)
        CHECK_NE(fp, 'd41d8cd98f00b204e9800998ecf8427e',
                 'invalid hash of empty string')

        return