Example no. 1
0
 def Exists(self):
     """ Returns True when self.uri exists and holds a complete shard set. """
     # Detecting a PERT as "existing" while some shards are still being
     # written is tricky.  A general fix would record the expected number
     # of part files in the filenames or payload; the simpler heuristic
     # used here relies on the _SUCCESS flag file the MR framework drops
     # into its output directory once all part files are deposited.
     if not py_pert.Exists(self.uri):
         return False
     # Outputs produced by MapReduce contain a _logs subdirectory; for
     # those we require the _SUCCESS marker before declaring it ready.
     generated_by_mr = py_pert.Exists(self.uri + '/_logs')
     if generated_by_mr and not py_pert.Exists(self.uri + '/_SUCCESS'):
         return False
     # Otherwise validate the shard set: every part id between the
     # smallest and the largest must be present.
     # TODO(heathkh): parts with the largest ids could still be missing --
     # filenames don't say how many shards the complete set contains.
     shard_uris = py_pert.GetShardUris(self.uri)
     if not shard_uris:
         return False
     return bool(py_pert.ShardSetIsValid(shard_uris))
Example no. 2
0
def test_CopyLocalToUri():
  local_uri = "local://tmp/data/test_ufs.pert";
  remote_uri = "maprfs://data/tmp/test_ufs.pert";
  
  CreateTestFile(local_uri)
  
  ok, scheme, path, error = py_pert.ParseUri(local_uri)
  CHECK(ok)
  
  py_pert.CopyLocalToUri(path, remote_uri)
  
  CHECK(py_pert.Exists(local_uri))
  CHECK(py_pert.Exists(remote_uri))
  
  reader = py_pert.StringTableReader()
  reader.Open(remote_uri)
  
  expected_count = 1000
  count = 0
  for (key, value), (expected_key, expected_value) in zip(reader, GenerateTestData()):
    CHECK_EQ(key, expected_key)
    CHECK_EQ(value, expected_value)
    count += 1
    
  CHECK_EQ(count, expected_count)
  
  
  print py_pert.ListDirectory(local_uri)
  print py_pert.ListDirectory(remote_uri)
  
  return
Example no. 3
0
def EnsureChunkSizeForUri(uri, desired_block_size):
    """ Asserts that the file at uri was stored with the desired chunk size. """
    # Chunk sizes must be a multiple of 2**16.
    CHECK_EQ(desired_block_size % (2**16), 0)
    CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)
    CHECK(py_pert.IsFile(uri), 'expected uri to be a file: %s' % uri)
    ok, actual_chunk_size = py_pert.ChunkSize(uri)
    CHECK(ok)
    mismatch_msg = (
        'Expected chunk size of %d but actual chunk size is %d for uri: %s' %
        (desired_block_size, actual_chunk_size, uri))
    CHECK_EQ(desired_block_size, actual_chunk_size, mismatch_msg)
    return True
Example no. 4
0
def GetChunkSizeForUri(uri):
    """ Returns the MapR chunk size (in bytes) configured for a directory uri.

    The value is read from the hidden .dfs_attributes control file that MapR
    keeps inside each directory (chunk size is only defined for directories;
    see the MapR docs for details).
    """
    CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)
    CHECK(
        py_pert.IsDirectory(uri),
        'Chunk size only defined for directories... See mapr docs for details')
    nfs_path = mr.UriToNfsPath(uri)
    dfs_attribute_path = '%s/.dfs_attributes' % (nfs_path)
    # Use a context manager so the control file handle is always closed
    # (the original leaked the open file object).
    with open(dfs_attribute_path, 'r') as control_file:
        lines = control_file.readlines()
    # Expected layout (as written by SetChunkSizeForUri): a comment line,
    # a Compression=... line, then ChunkSize=<bytes> on the third line.
    tokens = lines[2].split('=')
    CHECK_EQ(tokens[0], 'ChunkSize')
    chunksize = long(tokens[1])
    return chunksize
Example no. 5
0
def test_OpenSplit():
  remote_uri = "maprfs://data/itergraph/tide_v13/photoid_to_image.pert/part-00046"
  
  CHECK(py_pert.Exists(remote_uri))
  
  reader = py_pert.StringTableShardReader()
  split_start = 4598228
  split_length = 1255113
  split_end = split_start + split_length
  reader.OpenSplit(remote_uri, split_start, split_end)
  
  count = 0
  for key, value in reader:
    count += 1
    
  print count
  return
Example no. 6
0
 def Run(self):
     print 'about to run pipes flow: %s' % (self.pipes_binary)
     mr_driver = self.MakeDriver()
     # set output directory property to create files with required chunk size
     if self.output_chunk_size_bytes != None:
         if not py_pert.Exists(self.output_path):
             nfs_path = mr.UriToNfsPath(self.output_path)
             os.makedirs(nfs_path)
         SetChunkSizeForUri(self.output_path, self.output_chunk_size_bytes)
         CHECK_EQ(
             GetChunkSizeForUri(self.output_path),
             self.output_chunk_size_bytes
         )  # verify the features file will have a block size of 4 GB
     status = mr_driver.Run()
     # ensure output was created with the required chunk size
     if self.output_chunk_size_bytes != None:
         # ensure the created output has the requested chunk size
         for uri in py_pert.GetShardUris(self.output_path):
             EnsureChunkSizeForUri(uri, self.output_chunk_size_bytes)
     return status
Example no. 7
0
def SetChunkSizeForUri(uri, block_size):
    """ Sets the MapR chunk size for a directory uri via its .dfs_attributes.

    block_size must be a multiple of 2**16 and at most 1 GB (a libmaprfs
    limitation).  Returns True after verifying the new setting took effect.
    """
    CHECK_EQ(block_size % (2**16), 0)  # must be a multiple of 2**16
    CHECK_LE(
        block_size, 1024 * (2**20),
        'Currently libmaprfs has a limitation that prevents chunk sizes greater than 1GB.'
    )
    CHECK(py_pert.Exists(uri), 'expected uri to exist: %s' % uri)
    CHECK(
        py_pert.IsDirectory(uri),
        'Chunk size only defined for directories... See mapr docs for details')
    nfs_path = mr.UriToNfsPath(uri)
    dfs_attribute_path = '%s/.dfs_attributes' % (nfs_path)
    # A context manager guarantees the control file is flushed and closed
    # even if the write raises, and before we read the setting back.
    with open(dfs_attribute_path, 'w') as control_file:
        control_file.write(
            '# lines beginning with # are treated as comments\nCompression=true\nChunkSize=%d'
            % (block_size))
    new_block_size = GetChunkSizeForUri(uri)
    CHECK_EQ(new_block_size, block_size)
    return True
Example no. 8
0
def test_CopyLocalToUri():
  """ Reads the fingerprint resource into a string.

  NOTE(review): this snippet looks truncated -- nothing uses `ok` or
  `fingerprint` afterwards; confirm against the full file.
  """
  # `fingerprint_uri` is not defined in this snippet; presumably a
  # module-level constant -- verify against the rest of the file.
  CHECK(py_pert.Exists(fingerprint_uri))
  input_file = py_pert.OpenInput(fingerprint_uri)    
  ok, fingerprint = input_file.ReadToString()
Example no. 9
0
 def Exists(self):
     """ Returns whether the resource's uri currently exists. """
     uri = self.uri
     return py_pert.Exists(uri)
Example no. 10
0
def CheckUriExistsOrDie(uri):
    """ Aborts with a diagnostic message unless uri exists. """
    error_msg = 'expected uri to exist: %s' % uri
    CHECK(py_pert.Exists(uri), error_msg)
    return