def test_proto_to_proto(self):
    """Round-trip a proto value through an S3ProtoDictionary with proto keys.

    Builds a key and a value with CreateTestProto, asserts that looking the
    key up in a newly constructed dictionary yields None, stores the value,
    and asserts that the value read back compares equal to the one stored.
    """
    key = CreateTestProto('key_foo')
    expected = CreateTestProto('value_foo')
    # Timestamp suffix: presumably keeps bucket names from colliding between
    # test runs -- confirm against S3ProtoDictionary's bucket handling.
    bucket = 's3dict_test_foobar_%s' % time.time()
    proto_dict = s3dict.S3ProtoDictionary(
        bucket,
        deluge_pb2.ResourceProvenance(),  # key prototype
        deluge_pb2.ResourceProvenance())  # value prototype

    # Nothing has been stored under this key yet.
    self.assertEqual(proto_dict.Get(key), None)

    proto_dict.Set(key, expected)
    self.assertEqual(proto_dict.Get(key), expected)
def GetProvenanceDict():
    """Return the module-level provenance dictionary, creating it on demand.

    The dictionary is an s3dict.S3ProtoDictionary named
    'deluge_resource_provenance_db_v02' whose values are
    deluge_pb2.ResourceProvenance messages. It is stored in the module
    global `provenance_dict` so later calls can reuse it.

    NOTE(review): the cache test relies on truthiness, so a falsy cached
    object would be rebuilt on every call -- confirm S3ProtoDictionary
    instances are always truthy.
    """
    global provenance_dict
    if provenance_dict:
        return provenance_dict
    provenance_dict = s3dict.S3ProtoDictionary(
        'deluge_resource_provenance_db_v02',
        value_proto=deluge_pb2.ResourceProvenance())
    return provenance_dict
def CreateTestProto(param):
    """Build a ResourceProvenance message populated with fixed test data.

    Args:
      param: value assigned to the fingerprint, flow, name and uri fields.

    Returns:
      A deluge_pb2.ResourceProvenance with those four fields set to `param`,
      start_time_sec=1000, end_time_sec=1010 and input_fingerprints extended
      with 'fp1', 'fp2', 'fp3'.
    """
    proto = deluge_pb2.ResourceProvenance()
    # The four identifying fields all receive the same value.
    for field_name in ('fingerprint', 'flow', 'name', 'uri'):
        setattr(proto, field_name, param)
    proto.start_time_sec = 1000
    proto.end_time_sec = 1010
    proto.input_fingerprints.extend(['fp1', 'fp2', 'fp3'])
    return proto
def Execute(self):
    """Run the flow and record provenance metadata for the outputs it owns.

    To be called only by the scheduler.

    Before running, the fingerprint of every input is collected. For each
    input flagged `is_generated`, the provenance store is queried; when no
    record exists the problem is logged, one line is read from stdin (the
    answer 'yes' removes the resource at its URI), and a FATAL message asks
    for the flow to be restarted.

    After self.Run() returns, a ResourceProvenance record is stored for each
    output whose `flow` attribute is this flow. The record carries the
    output's fingerprint, name and URI, this flow's class name, the
    time.time() readings taken just before and after Run(), and the
    fingerprints of all inputs.
    """
    # Get fingerprints of all inputs.
    input_fingerprints = []
    for _, input_resource in self.inputs.iteritems():
        fingerprint = input_resource.GetFingerprint()
        input_fingerprints.append(fingerprint)
        if not input_resource.is_generated:
            continue
        record = provenance.GetResourceProvenanceRecord(fingerprint)
        # Identity test: never routes through the message's own __eq__.
        if record is None:
            LOG(INFO, 'failed to find record for fingerprint: %s' % fingerprint)
            LOG(INFO, 'uri: %s' % (input_resource.GetUri()))
            LOG(INFO,
                'You are missing provenance record for a generated resource: '
                '%s. You probably terminated a flow... but the MR was not '
                'stopped. The resource appears to be there, but has no '
                '"finish time" record. You probably want to delete it so it '
                'gets recreated along with proper metadata.'
                % (input_resource))
            # No prompt is printed; the log message above is the question.
            delete_file = raw_input()
            if delete_file == 'yes':
                py_pert.Remove(input_resource.GetUri())
            # NOTE(review): nesting reconstructed from flattened source --
            # the FATAL is assumed to fire whether or not the resource was
            # deleted; confirm against the original layout.
            LOG(FATAL, 'please restart the flow...')

    start_time_sec = time.time()
    self.Run()
    end_time_sec = time.time()

    for output_name, output_resource in self.outputs.iteritems():
        if output_resource.flow != self:
            LOG(INFO,
                "I don't own this output... not creating provenance record "
                "for it: %s" % output_resource)
            # Only the flow that created the output should generate a
            # provenance record for it... otherwise later flows may
            # overwrite the provenance info and cause a loss of info about
            # the intermediate steps.
            continue
        record = deluge_pb2.ResourceProvenance()
        record.fingerprint = output_resource.GetFingerprint()
        record.flow = self.__class__.__name__
        record.name = output_name
        record.uri = output_resource.GetUri()
        record.start_time_sec = start_time_sec
        record.end_time_sec = end_time_sec
        record.input_fingerprints.extend(input_fingerprints)
        CHECK(record.IsInitialized())
        provenance.SetResourceProvenanceRecord(record)