def test_simple_output_value_file(self):
    """test writing a simple output value file"""
    collection_id = 1001
    segment_id = 42
    data_size = 1024
    data = random_string(data_size)

    output_value_file = OutputValueFile(
        self._database_connection, _repository_path
    )
    self.assertEqual(output_value_file.size, 0)

    output_value_file.write_data_for_one_sequence(
        collection_id, segment_id, data
    )
    self.assertEqual(output_value_file.size, data_size)
    output_value_file.close()

    value_file_row = _retrieve_value_file_row(
        self._database_connection, output_value_file._value_file_id
    )

    self.assertEqual(value_file_row.size, data_size)
    data_md5_hash = hashlib.md5(data).digest()
    self.assertEqual(str(value_file_row.hash), data_md5_hash)
    self.assertEqual(value_file_row.sequence_count, 1)
    self.assertEqual(value_file_row.min_segment_id, segment_id)
    self.assertEqual(value_file_row.max_segment_id, segment_id)
    self.assertEqual(value_file_row.distinct_collection_count, 1)
    self.assertEqual(value_file_row.collection_ids, [collection_id, ])
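# The test above relies on a random_string helper and on fixtures
# (self._database_connection, _repository_path) set up elsewhere in the
# test module.  A minimal sketch of the helper, assuming it only needs to
# produce `size` arbitrary bytes for the payload (hypothetical
# implementation, not the project's own):
import os

def random_string(size):
    """return `size` random bytes to use as a test payload"""
    return os.urandom(size)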
def store_sequence(self, collection_id, key, unified_id, timestamp_repr,
                   conjoined_part, segment_num, segment_size,
                   zfec_padding_size, segment_md5_digest, segment_adler32,
                   sequence_num, data, user_request_id):
    """
    store one piece (sequence) of segment data
    """
    segment_key = (unified_id, conjoined_part, segment_num, )
    self._log.info("request {0}: "
                   "store_sequence {1} {2} {3} {4} {5}: {6} ({7})".format(
                   user_request_id, collection_id, key, unified_id,
                   timestamp_repr, segment_num, sequence_num, segment_size))
    segment_entry = self._active_segments[segment_key]

    # if this write would put us over the max size,
    # start a new output value file
    if self._value_file.size + segment_size > _max_value_file_size:
        self._value_file.close()
        space_id = find_least_volume_space_id("journal",
                                              self._file_space_info)
        self._value_file = OutputValueFile(self._connection,
                                           space_id,
                                           self._repository_path)

    segment_sequence_row = segment_sequence_template(
        collection_id=collection_id,
        segment_id=segment_entry["segment-id"],
        zfec_padding_size=zfec_padding_size,
        value_file_id=self._value_file.value_file_id,
        sequence_num=sequence_num,
        value_file_offset=self._value_file.size,
        size=segment_size,
        hash=psycopg2.Binary(segment_md5_digest),
        adler32=segment_adler32,
    )

    self._value_file.write_data_for_one_sequence(
        collection_id, segment_entry["segment-id"], data
    )

    _insert_segment_sequence_row(self._connection, segment_sequence_row)
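# The size check in store_sequence above implements a simple rollover
# policy: a sequence is never split across value files.  A standalone
# sketch of that policy; MAX_VALUE_FILE_SIZE and the open_new_value_file
# callable are hypothetical names for illustration, not the module's own.
MAX_VALUE_FILE_SIZE = 1024 * 1024 * 1024  # assumed 1 GB cap, for illustration

def value_file_with_room(current_value_file, incoming_size,
                         open_new_value_file):
    """return a value file that can hold incoming_size more bytes,
    rolling over to a freshly opened file when the current one would
    exceed the size cap"""
    if current_value_file.size + incoming_size > MAX_VALUE_FILE_SIZE:
        current_value_file.close()
        return open_new_value_file()
    return current_value_file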
def __init__(self, connection, repository_path):
    self._log = logging.getLogger("Writer")
    self._connection = connection
    self._repository_path = repository_path
    self._active_segments = dict()

    # open a new value file at startup
    self._value_file = OutputValueFile(
        self._connection, self._repository_path
    )
def store_sequence(self, collection_id, key, unified_id, timestamp_repr,
                   conjoined_part, segment_num, segment_size,
                   zfec_padding_size, segment_md5_digest, segment_adler32,
                   sequence_num, data):
    """
    store one piece (sequence) of segment data
    """
    segment_key = (unified_id, conjoined_part, segment_num, )
    self._log.info("store_sequence %s %s %s %s %s: %s (%s)" % (
        collection_id, key, unified_id, timestamp_repr, segment_num,
        sequence_num, segment_size
    ))
    segment_entry = self._active_segments[segment_key]

    # if this write would put us over the max size,
    # start a new output value file
    if self._value_file.size + segment_size > _max_value_file_size:
        self._value_file.close()
        self._value_file = OutputValueFile(
            self._connection, self._repository_path
        )

    segment_sequence_row = segment_sequence_template(
        collection_id=collection_id,
        segment_id=segment_entry["segment-id"],
        zfec_padding_size=zfec_padding_size,
        value_file_id=self._value_file.value_file_id,
        sequence_num=sequence_num,
        value_file_offset=self._value_file.size,
        size=segment_size,
        hash=psycopg2.Binary(segment_md5_digest),
        adler32=segment_adler32,
    )

    self._value_file.write_data_for_one_sequence(
        collection_id, segment_entry["segment-id"], data
    )

    _insert_segment_sequence_row(self._connection, segment_sequence_row)
def __init__(self, connection, file_space_info, repository_path,
             active_segments, completions):
    self._log = logging.getLogger("Writer")
    self._connection = connection
    self._file_space_info = file_space_info
    self._repository_path = repository_path
    self._active_segments = active_segments
    self._completions = completions

    space_id = find_least_volume_space_id("journal", self._file_space_info)

    # open a new value file at startup
    self._value_file = OutputValueFile(self._connection,
                                       space_id,
                                       self._repository_path)
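# find_least_volume_space_id comes from the project's file-space module;
# only its call signature is visible in these snippets.  The sketch below
# illustrates the contract the constructor appears to rely on -- picking a
# space_id for the "journal" purpose from the volume with the most room --
# under the assumption that file_space_info maps a purpose to a list of
# dicts carrying "space_id" and "avail_bytes" (both keys are hypothetical).
def find_least_volume_space_id_sketch(purpose, file_space_info):
    """pick the space_id for `purpose` on the least-used volume"""
    entries = file_space_info[purpose]
    best_entry = max(entries, key=lambda entry: entry["avail_bytes"])
    return best_entry["space_id"]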
class Writer(object):
    """
    Manage writing segment values to disk
    """
    def __init__(self, connection, file_space_info, repository_path,
                 active_segments, completions):
        self._log = logging.getLogger("Writer")
        self._connection = connection
        self._file_space_info = file_space_info
        self._repository_path = repository_path
        self._active_segments = active_segments
        self._completions = completions

        space_id = find_least_volume_space_id("journal",
                                              self._file_space_info)

        # open a new value file at startup
        self._value_file = OutputValueFile(self._connection,
                                           space_id,
                                           self._repository_path)

    @property
    def value_file_hash(self):
        """
        return the hash of the currently open value file
        """
        assert self._value_file is not None
        return hash(self._value_file)

    def sync_value_file(self):
        """
        sync the current value file
        """
        assert self._value_file is not None
        self._value_file.sync()

        # Ticket #70 Data writer causes "already a transaction in progress"
        # warning in the PostgreSQL log
        if len(self._completions) == 0:
            return

        # at this point we can complete all pending archives
        self._connection.begin_transaction()
        try:
            for completion in self._completions:
                completion.pre_commit_process()
        except Exception:
            self._log.exception("sync_value_file")
            self._connection.rollback()
            raise
        self._connection.commit()

        for completion in self._completions:
            completion.post_commit_process()

        self._completions[:] = []

    @property
    def value_file_is_synced(self):
        """true if the current value file has been synced to disk"""
        assert self._value_file is not None
        return self._value_file.is_synced

    def close(self):
        assert self._value_file is not None
        self.sync_value_file()
        self._value_file.close()
        self._value_file = None

    def start_new_segment(self, collection_id, key, unified_id,
                          timestamp_repr, conjoined_part, segment_num,
                          source_node_id, handoff_node_id, user_request_id):
        """
        Initiate storing a segment of data for a file
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: "
                       "start_new_segment {1} {2} {3} {4} {5} {6} {7}".format(
                       user_request_id, collection_id, key, unified_id,
                       timestamp_repr, conjoined_part, segment_num,
                       source_node_id, ))
        if segment_key in self._active_segments:
            raise ValueError("duplicate segment %s" % (segment_key, ))

        timestamp = parse_timestamp_repr(timestamp_repr)

        self._active_segments[segment_key] = {
            "segment-id" : _insert_new_segment_row(self._connection,
                                                   collection_id,
                                                   unified_id,
                                                   key,
                                                   timestamp,
                                                   conjoined_part,
                                                   segment_num,
                                                   source_node_id,
                                                   handoff_node_id),
        }

    def store_sequence(self, collection_id, key, unified_id, timestamp_repr,
                       conjoined_part, segment_num, segment_size,
                       zfec_padding_size, segment_md5_digest,
                       segment_adler32, sequence_num, data, user_request_id):
        """
        store one piece (sequence) of segment data
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: "
                       "store_sequence {1} {2} {3} {4} {5}: {6} ({7})".format(
                       user_request_id, collection_id, key, unified_id,
                       timestamp_repr, segment_num, sequence_num,
                       segment_size))
        segment_entry = self._active_segments[segment_key]

        # if this write would put us over the max size,
        # start a new output value file
        if self._value_file.size + segment_size > _max_value_file_size:
            self._value_file.close()
            space_id = find_least_volume_space_id("journal",
                                                  self._file_space_info)
            self._value_file = OutputValueFile(self._connection,
                                               space_id,
                                               self._repository_path)

        segment_sequence_row = segment_sequence_template(
            collection_id=collection_id,
            segment_id=segment_entry["segment-id"],
            zfec_padding_size=zfec_padding_size,
            value_file_id=self._value_file.value_file_id,
            sequence_num=sequence_num,
            value_file_offset=self._value_file.size,
            size=segment_size,
            hash=psycopg2.Binary(segment_md5_digest),
            adler32=segment_adler32,
        )

        self._value_file.write_data_for_one_sequence(
            collection_id, segment_entry["segment-id"], data
        )

        _insert_segment_sequence_row(self._connection, segment_sequence_row)

    def set_tombstone(self, collection_id, key, unified_id_to_delete,
                      unified_id, timestamp, segment_num, source_node_id,
                      handoff_node_id, user_request_id):
        """
        mark a key as deleted
        """
        _insert_segment_tombstone_row(self._connection,
                                      collection_id,
                                      key,
                                      unified_id,
                                      timestamp,
                                      segment_num,
                                      unified_id_to_delete,
                                      source_node_id,
                                      handoff_node_id)

    def cancel_active_archives_from_node(self, source_node_id, timestamp):
        """
        cancel all segment rows
            * from a specific source node
            * that are in active status
            * with a timestamp earlier than the specified time

        This is triggered by a web server restart
        """
        _cancel_segment_rows(self._connection, source_node_id, timestamp)

    def cancel_active_archive(self, unified_id, conjoined_part, segment_num,
                              user_request_id):
        """
        cancel an archive that is in progress, presumably due to failure
        at the web server
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: "
                       "cancel_active_archive {1}".format(user_request_id,
                                                          segment_key))

        # 2012-02-27 dougfort -- there is a race condition where the web
        # server sends out cancellations on an archive that has completed
        # because it hasn't received the final message yet
        try:
            self._active_segments.pop(segment_key)
        except KeyError:
            pass

        _cancel_segment_row(self._connection,
                            unified_id,
                            conjoined_part,
                            segment_num)

    def start_conjoined_archive(self, collection_id, key, unified_id,
                                timestamp, handoff_node_id):
        """
        start a conjoined archive
        """
        conjoined_dict = {
            "collection_id"   : collection_id,
            "key"             : key,
            "unified_id"      : unified_id,
            "create_timestamp": timestamp,
            "handoff_node_id" : handoff_node_id,
        }
        _insert_conjoined_row(self._connection, conjoined_dict)

    def abort_conjoined_archive(self, collection_id, key, unified_id,
                                timestamp, handoff_node_id):
        """
        mark a conjoined archive as aborted
        """
        conjoined_dict = {
            "collection_id"   : collection_id,
            "key"             : key,
            "unified_id"      : unified_id,
            "abort_timestamp" : timestamp,
            "handoff_node_id" : handoff_node_id,
        }
        _set_conjoined_abort_timestamp(self._connection, conjoined_dict)

    def finish_conjoined_archive(self, collection_id, key, unified_id,
                                 timestamp, handoff_node_id):
        """
        mark a conjoined archive as finished
        """
        conjoined_dict = {
            "collection_id"     : collection_id,
            "key"               : key,
            "unified_id"        : unified_id,
            "complete_timestamp": timestamp,
            "handoff_node_id"   : handoff_node_id,
        }
        _set_conjoined_complete_timestamp(self._connection, conjoined_dict)
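# A hedged usage sketch of the newer Writer above.  Everything passed in
# (the writer instance, ids, timestamp_repr, the per-sequence dicts and
# their keys) is a hypothetical placeholder, not taken from the project's
# message handlers; conjoined_part 0 is assumed to mark a non-conjoined
# archive.
def archive_one_segment(writer, collection_id, key, unified_id,
                        timestamp_repr, segment_num, sequences,
                        source_node_id, user_request_id):
    """drive a single (non-conjoined) archive through the Writer,
    one store_sequence call per zfec-encoded sequence, then sync"""
    conjoined_part = 0
    writer.start_new_segment(collection_id, key, unified_id, timestamp_repr,
                             conjoined_part, segment_num, source_node_id,
                             None, user_request_id)
    for sequence_num, sequence in enumerate(sequences):
        writer.store_sequence(collection_id, key, unified_id, timestamp_repr,
                              conjoined_part, segment_num,
                              len(sequence["data"]),
                              sequence["zfec_padding_size"],
                              sequence["md5_digest"],
                              sequence["adler32"],
                              sequence_num,
                              sequence["data"],
                              user_request_id)
    # push the value file to disk and run any queued completions
    writer.sync_value_file()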
class Writer(object):
    """
    Manage writing segment values to disk
    """
    def __init__(self, connection, repository_path):
        self._log = logging.getLogger("Writer")
        self._connection = connection
        self._repository_path = repository_path
        self._active_segments = dict()

        # open a new value file at startup
        self._value_file = OutputValueFile(
            self._connection, self._repository_path
        )

    def close(self):
        self._value_file.close()

    def start_new_segment(self, collection_id, key, unified_id,
                          timestamp_repr, conjoined_part, segment_num,
                          source_node_id, handoff_node_id):
        """
        Initiate storing a segment of data for a file
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("start_new_segment %s %s %s %s %s %s %s" % (
            collection_id, key, unified_id, timestamp_repr, conjoined_part,
            segment_num, source_node_id,
        ))
        if segment_key in self._active_segments:
            raise ValueError("duplicate segment %s" % (segment_key, ))

        timestamp = parse_timestamp_repr(timestamp_repr)

        self._active_segments[segment_key] = {
            "segment-id" : _insert_new_segment_row(self._connection,
                                                   collection_id,
                                                   unified_id,
                                                   key,
                                                   timestamp,
                                                   conjoined_part,
                                                   segment_num,
                                                   source_node_id,
                                                   handoff_node_id),
        }

    def store_sequence(self, collection_id, key, unified_id, timestamp_repr,
                       conjoined_part, segment_num, segment_size,
                       zfec_padding_size, segment_md5_digest,
                       segment_adler32, sequence_num, data):
        """
        store one piece (sequence) of segment data
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("store_sequence %s %s %s %s %s: %s (%s)" % (
            collection_id, key, unified_id, timestamp_repr, segment_num,
            sequence_num, segment_size
        ))
        segment_entry = self._active_segments[segment_key]

        # if this write would put us over the max size,
        # start a new output value file
        if self._value_file.size + segment_size > _max_value_file_size:
            self._value_file.close()
            self._value_file = OutputValueFile(
                self._connection, self._repository_path
            )

        segment_sequence_row = segment_sequence_template(
            collection_id=collection_id,
            segment_id=segment_entry["segment-id"],
            zfec_padding_size=zfec_padding_size,
            value_file_id=self._value_file.value_file_id,
            sequence_num=sequence_num,
            value_file_offset=self._value_file.size,
            size=segment_size,
            hash=psycopg2.Binary(segment_md5_digest),
            adler32=segment_adler32,
        )

        self._value_file.write_data_for_one_sequence(
            collection_id, segment_entry["segment-id"], data
        )

        _insert_segment_sequence_row(self._connection, segment_sequence_row)

    def finish_new_segment(self, collection_id, unified_id, timestamp_repr,
                           conjoined_part, segment_num, file_size,
                           file_adler32, file_hash, meta_dict):
        """
        finalize storing one segment of data for a file
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("finish_new_segment %s %s" % (
            unified_id, segment_num,
        ))
        segment_entry = self._active_segments.pop(segment_key)

        timestamp = parse_timestamp_repr(timestamp_repr)

        meta_rows = list()
        for meta_key, meta_value in meta_dict.items():
            meta_row = meta_row_template(
                collection_id=collection_id,
                segment_id=segment_entry["segment-id"],
                meta_key=meta_key,
                meta_value=meta_value,
                timestamp=timestamp
            )
            meta_rows.append(meta_row)

        _finalize_segment_row(self._connection,
                              segment_entry["segment-id"],
                              file_size,
                              file_adler32,
                              file_hash,
                              meta_rows)

    def set_tombstone(self, collection_id, key, unified_id_to_delete,
                      unified_id, timestamp, segment_num, source_node_id,
                      handoff_node_id):
        """
        mark a key as deleted
        """
        _insert_segment_tombstone_row(self._connection,
                                      collection_id,
                                      key,
                                      unified_id,
                                      timestamp,
                                      segment_num,
                                      unified_id_to_delete,
                                      source_node_id,
                                      handoff_node_id)

    def cancel_active_archives_from_node(self, source_node_id, timestamp):
        """
        cancel all segment rows
            * from a specific source node
            * that are in active status
            * with a timestamp earlier than the specified time

        This is triggered by a web server restart
        """
        _cancel_segment_rows(self._connection, source_node_id, timestamp)

    def cancel_active_archive(self, unified_id, conjoined_part, segment_num):
        """
        cancel an archive that is in progress, presumably due to failure
        at the web server
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("cancel_active_archive %s %s" % (
            unified_id, segment_num
        ))

        # 2012-02-27 dougfort -- there is a race condition where the web
        # server sends out cancellations on an archive that has completed
        # because it hasn't received the final message yet
        try:
            segment_entry = self._active_segments.pop(segment_key)
        except KeyError:
            pass
        else:
            _cancel_segment_row(self._connection,
                                segment_entry["segment-id"])

    def purge_handoff_source(self, collection_id, unified_id,
                             handoff_node_id):
        """
        delete rows for a handoff source
        """
        _purge_handoff_source(self._connection,
                              collection_id,
                              unified_id,
                              handoff_node_id)

    def start_conjoined_archive(self, collection_id, key, unified_id,
                                timestamp):
        """
        start a conjoined archive
        """
        conjoined_dict = {
            "collection_id"   : collection_id,
            "key"             : key,
            "unified_id"      : unified_id,
            "create_timestamp": timestamp
        }
        _insert_conjoined_row(self._connection, conjoined_dict)

    def abort_conjoined_archive(self, collection_id, key, unified_id,
                                timestamp):
        """
        mark a conjoined archive as aborted
        """
        conjoined_dict = {
            "collection_id"   : collection_id,
            "key"             : key,
            "unified_id"      : unified_id,
            "abort_timestamp" : timestamp
        }
        _set_conjoined_abort_timestamp(self._connection, conjoined_dict)

    def finish_conjoined_archive(self, collection_id, key, unified_id,
                                 timestamp):
        """
        mark a conjoined archive as finished
        """
        conjoined_dict = {
            "collection_id"     : collection_id,
            "key"               : key,
            "unified_id"        : unified_id,
            "complete_timestamp": timestamp
        }
        _set_conjoined_complete_timestamp(self._connection, conjoined_dict)
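# Unlike the newer Writer, this older version finalizes a segment itself
# via finish_new_segment rather than deferring to a completions list.
# A hedged sketch of that finishing step; the digest and meta_dict values
# are hypothetical placeholders, and conjoined_part 0 is again assumed to
# mark a non-conjoined archive.
def finish_archived_segment(writer, collection_id, unified_id,
                            timestamp_repr, segment_num, file_size,
                            file_adler32, file_md5_digest, meta_dict):
    """close out an archive whose sequences have all been stored"""
    writer.finish_new_segment(collection_id, unified_id, timestamp_repr,
                              0, segment_num, file_size, file_adler32,
                              file_md5_digest, meta_dict)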