    def test_simple_output_value_file(self):
        """test writing a simple output value file"""
        collection_id = 1001
        segment_id = 42
        data_size = 1024
        data = random_string(data_size)
        output_value_file = OutputValueFile(
            self._database_connection, _repository_path
        )
        self.assertEqual(output_value_file.size, 0)
        output_value_file.write_data_for_one_sequence(
            collection_id, segment_id, data
        )
        self.assertEqual(output_value_file.size, data_size)
        output_value_file.close()
        
        value_file_row = _retrieve_value_file_row(
            self._database_connection, output_value_file._value_file_id
        )

        self.assertEqual(value_file_row.size, data_size)
        data_md5_hash = hashlib.md5(data).digest()
        self.assertEqual(str(value_file_row.hash), data_md5_hash)
        self.assertEqual(value_file_row.sequence_count, 1)
        self.assertEqual(value_file_row.min_segment_id, segment_id)
        self.assertEqual(value_file_row.max_segment_id, segment_id)
        self.assertEqual(value_file_row.distinct_collection_count, 1)
        self.assertEqual(value_file_row.collection_ids, [collection_id, ])
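The test above pins down what the value_file row must record on close: the total size, the MD5 digest of every byte written, the sequence count, and the collection/segment bookkeeping. A minimal sketch of that accumulation, purely for illustration and not the real OutputValueFile (which also manages the on-disk file and the database row):

import hashlib

class _ValueFileBookkeeping(object):
    """illustrative stand-in: accumulates the totals the test asserts on"""
    def __init__(self):
        self.size = 0
        self.sequence_count = 0
        self.collection_ids = set()
        self._md5 = hashlib.md5()

    def write_data_for_one_sequence(self, collection_id, segment_id, data):
        # every byte written contributes to the running size and digest
        self._md5.update(data)
        self.size += len(data)
        self.sequence_count += 1
        self.collection_ids.add(collection_id)

    @property
    def hash(self):
        # matches hashlib.md5(data).digest() in the assertion above
        return self._md5.digest()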
Example No. 2
    def store_sequence(
        self, 
        collection_id, 
        key, 
        unified_id,
        timestamp_repr, 
        conjoined_part,
        segment_num, 
        segment_size,
        zfec_padding_size,
        segment_md5_digest,
        segment_adler32,
        sequence_num, 
        data,
        user_request_id
    ):
        """
        store one piece (sequence) of segment data
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: " \
                       "store_sequence {1} {2} {3} {4} {5}: {6} ({7})".format(
                       user_request_id,
                       collection_id, 
                       key, 
                       unified_id,
                       timestamp_repr, 
                       segment_num, 
                       sequence_num,
                       segment_size))
        segment_entry = self._active_segments[segment_key]

        # if this write would put us over the max size,
        # start a new output value file
        if self._value_file.size + segment_size > _max_value_file_size:
            self._value_file.close()
            space_id = find_least_volume_space_id("journal",
                                                  self._file_space_info)
            self._value_file = OutputValueFile(self._connection, 
                                               space_id,
                                               self._repository_path)

        segment_sequence_row = segment_sequence_template(
            collection_id=collection_id,
            segment_id=segment_entry["segment-id"],
            zfec_padding_size=zfec_padding_size,
            value_file_id=self._value_file.value_file_id,
            sequence_num=sequence_num,
            value_file_offset=self._value_file.size,
            size=segment_size,
            hash=psycopg2.Binary(segment_md5_digest),
            adler32=segment_adler32,
        )

        self._value_file.write_data_for_one_sequence(
            collection_id, segment_entry["segment-id"], data
        )

        _insert_segment_sequence_row(self._connection, segment_sequence_row)
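The segment_sequence_template(...) call above builds the row handed to _insert_segment_sequence_row. A plausible definition, assuming it is a namedtuple over exactly the fields used here (an assumption; the real template may carry additional columns):

from collections import namedtuple

# field list inferred from the keyword arguments used in store_sequence
segment_sequence_template = namedtuple("segment_sequence_template", [
    "collection_id",
    "segment_id",
    "zfec_padding_size",
    "value_file_id",
    "sequence_num",
    "value_file_offset",
    "size",
    "hash",
    "adler32",
])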
Example No. 3
    def __init__(self, connection, repository_path):
        self._log = logging.getLogger("Writer")
        self._connection = connection
        self._repository_path = repository_path
        self._active_segments = dict()

        # open a new value file at startup
        self._value_file = OutputValueFile(
            self._connection, self._repository_path
        )
Example No. 4
    def store_sequence(
        self, 
        collection_id, 
        key, 
        unified_id,
        timestamp_repr, 
        conjoined_part,
        segment_num, 
        segment_size,
        zfec_padding_size,
        segment_md5_digest,
        segment_adler32,
        sequence_num, 
        data
    ):
        """
        store one piece (sequence) of segment data
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("store_sequence %s %s %s %s %s: %s (%s)" % (
            collection_id, 
            key, 
            unified_id,
            timestamp_repr, 
            segment_num, 
            sequence_num,
            segment_size
        ))
        segment_entry = self._active_segments[segment_key]

        # if this write would put us over the max size,
        # start a new output value file
        if self._value_file.size + segment_size > _max_value_file_size:
            self._value_file.close()
            self._value_file = OutputValueFile(
                self._connection, self._repository_path
            )

        segment_sequence_row = segment_sequence_template(
            collection_id=collection_id,
            segment_id=segment_entry["segment-id"],
            zfec_padding_size=zfec_padding_size,
            value_file_id=self._value_file.value_file_id,
            sequence_num=sequence_num,
            value_file_offset=self._value_file.size,
            size=segment_size,
            hash=psycopg2.Binary(segment_md5_digest),
            adler32=segment_adler32,
        )

        self._value_file.write_data_for_one_sequence(
            collection_id, segment_entry["segment-id"], data
        )

        _insert_segment_sequence_row(self._connection, segment_sequence_row)
Example No. 5
    def __init__(self, 
                 connection, 
                 file_space_info, 
                 repository_path, 
                 active_segments, 
                 completions
    ):
        self._log = logging.getLogger("Writer")
        self._connection = connection
        self._file_space_info = file_space_info
        self._repository_path = repository_path
        self._active_segments = active_segments
        self._completions = completions
        
        space_id = find_least_volume_space_id("journal", self._file_space_info)

        # open a new value file at startup
        self._value_file = OutputValueFile(self._connection, 
                                           space_id, 
                                           self._repository_path)
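This variant of the constructor spreads value files across configured file spaces: find_least_volume_space_id("journal", ...) chooses which space the new OutputValueFile lands in. The structure of file_space_info is not shown in these examples; as a hypothetical illustration only, a stand-in that picks the emptiest journal volume might look like this (the dict layout and key names are assumptions, not the real API):

def _pick_journal_space(file_space_info):
    # hypothetical stand-in for find_least_volume_space_id("journal", ...):
    # assume file_space_info maps a purpose name to a list of dicts carrying
    # "space_id" and "avail_bytes", and choose the volume with the most room
    candidates = file_space_info["journal"]
    best = max(candidates, key=lambda entry: entry["avail_bytes"])
    return best["space_id"]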
Example No. 6
class Writer(object):
    """
    Manage writing segment values to disk
    """
    def __init__(self, 
                 connection, 
                 file_space_info, 
                 repository_path, 
                 active_segments, 
                 completions
    ):
        self._log = logging.getLogger("Writer")
        self._connection = connection
        self._file_space_info = file_space_info
        self._repository_path = repository_path
        self._active_segments = active_segments
        self._completions = completions
        
        space_id = find_least_volume_space_id("journal", self._file_space_info)

        # open a new value file at startup
        self._value_file = OutputValueFile(self._connection, 
                                           space_id, 
                                           self._repository_path)

    @property
    def value_file_hash(self):
        """
        return the hash of the currently open value file
        """
        assert self._value_file is not None
        return hash(self._value_file)

    def sync_value_file(self):
        """
        sync the current value file
        """
        assert self._value_file is not None
        self._value_file.sync()

        # Ticket #70 Data writer causes "already a transaction in progress" 
        # warning in the PostgreSQL log
        if len(self._completions) == 0:
            return

        # at this point we can complete all pending archives

        self._connection.begin_transaction()
        try:
            for completion in self._completions:
                completion.pre_commit_process()
        except Exception:
            self._log.exception("sync_value_file")
            self._connection.rollback()
            raise
        self._connection.commit()

        for completion in self._completions:
            completion.post_commit_process()

        self._completions[:] = []

    @property
    def value_file_is_synced(self):
        assert self._value_file is not None
        return self._value_file.is_synced 

    def close(self):
        assert self._value_file is not None
        self.sync_value_file()
        self._value_file.close()
        self._value_file = None

    def start_new_segment(
        self, 
        collection_id, 
        key, 
        unified_id,
        timestamp_repr, 
        conjoined_part,
        segment_num,
        source_node_id,
        handoff_node_id,
        user_request_id
    ):
        """
        Initiate storing a segment of data for a file
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: " \
                       "start_new_segment {1} {2} {3} {4} {5} {6} {7}".format(
                       user_request_id,
                       collection_id, 
                       key, 
                       unified_id, 
                       timestamp_repr, 
                       conjoined_part,
                       segment_num, 
                       source_node_id,))
        if segment_key in self._active_segments:
            raise ValueError("duplicate segment %s" % (segment_key, ))

        timestamp = parse_timestamp_repr(timestamp_repr)

        self._active_segments[segment_key] = {
            "segment-id" : _insert_new_segment_row(self._connection,
                                                   collection_id, 
                                                   unified_id,
                                                   key, 
                                                   timestamp, 
                                                   conjoined_part,
                                                   segment_num,
                                                   source_node_id,
                                                   handoff_node_id),
        }

    def store_sequence(
        self, 
        collection_id, 
        key, 
        unified_id,
        timestamp_repr, 
        conjoined_part,
        segment_num, 
        segment_size,
        zfec_padding_size,
        segment_md5_digest,
        segment_adler32,
        sequence_num, 
        data,
        user_request_id
    ):
        """
        store one piece (sequence) of segment data
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: " \
                       "store_sequence {1} {2} {3} {4} {5}: {6} ({7})".format(
                       user_request_id,
                       collection_id, 
                       key, 
                       unified_id,
                       timestamp_repr, 
                       segment_num, 
                       sequence_num,
                       segment_size))
        segment_entry = self._active_segments[segment_key]

        # if this write would put us over the max size,
        # start a new output value file
        if self._value_file.size + segment_size > _max_value_file_size:
            self._value_file.close()
            space_id = find_least_volume_space_id("journal",
                                                  self._file_space_info)
            self._value_file = OutputValueFile(self._connection, 
                                               space_id,
                                               self._repository_path)

        segment_sequence_row = segment_sequence_template(
            collection_id=collection_id,
            segment_id=segment_entry["segment-id"],
            zfec_padding_size=zfec_padding_size,
            value_file_id=self._value_file.value_file_id,
            sequence_num=sequence_num,
            value_file_offset=self._value_file.size,
            size=segment_size,
            hash=psycopg2.Binary(segment_md5_digest),
            adler32=segment_adler32,
        )

        self._value_file.write_data_for_one_sequence(
            collection_id, segment_entry["segment-id"], data
        )

        _insert_segment_sequence_row(self._connection, segment_sequence_row)

    def set_tombstone(
        self, 
        collection_id, 
        key, 
        unified_id_to_delete,
        unified_id, 
        timestamp, 
        segment_num, 
        source_node_id,
        handoff_node_id,
        user_request_id,
    ):
        """
        mark a key as deleted
        """
        _insert_segment_tombstone_row(
            self._connection,
            collection_id, 
            key, 
            unified_id,
            timestamp, 
            segment_num,
            unified_id_to_delete,
            source_node_id,
            handoff_node_id
        )

    def cancel_active_archives_from_node(self, source_node_id, timestamp):
        """
        cancel all segment rows that
           * come from a specific source node
           * are in active status
           * have a timestamp earlier than the specified time.
        This is triggered by a web server restart
        """
        _cancel_segment_rows(self._connection, source_node_id, timestamp)

    def cancel_active_archive(self, 
                              unified_id, 
                              conjoined_part, 
                              segment_num,
                              user_request_id):
        """
        cancel an archive that is in progress, presumably due to failure
        at the web server
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("request {0}: " \
                       "cancel_active_archive {1}".format(user_request_id,
                                                          segment_key))
        # 2012-02-27 dougfort -- there is a race condition where the web
        # server sends out cancellations on an archive that has completed
        # because it hasn't received the final message yet
        try:
            self._active_segments.pop(segment_key)
        except KeyError:
            pass
        
        _cancel_segment_row(self._connection, 
                            unified_id, 
                            conjoined_part, 
                            segment_num)

    def start_conjoined_archive(
        self, collection_id, key, unified_id, timestamp, handoff_node_id
    ):
        """
        start a conjoined archive
        """
        conjoined_dict = {
            "collection_id"     : collection_id,
            "key"               : key,
            "unified_id"        : unified_id,
            "create_timestamp"  : timestamp,
            "handoff_node_id"   : handoff_node_id,
        }
        _insert_conjoined_row(self._connection, conjoined_dict)

    def abort_conjoined_archive(
        self, collection_id, key, unified_id, timestamp, handoff_node_id
    ):
        """
        mark a conjoined archive as aborted
        """
        conjoined_dict = {
            "collection_id"     : collection_id,
            "key"               : key,
            "unified_id"        : unified_id,
            "abort_timestamp"   : timestamp,
            "handoff_node_id"   : handoff_node_id,
        }
        _set_conjoined_abort_timestamp(self._connection, conjoined_dict)

    def finish_conjoined_archive(
        self, collection_id, key, unified_id, timestamp, handoff_node_id,
    ):
        """
        mark a conjoined archive as finished
        """
        conjoined_dict = {
            "collection_id"      : collection_id,
            "key"                : key,
            "unified_id"         : unified_id,
            "complete_timestamp" : timestamp,
            "handoff_node_id"    : handoff_node_id,
        }
        _set_conjoined_complete_timestamp(self._connection, conjoined_dict)
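sync_value_file above follows a two-phase pattern: flush the value file to disk, run every pending completion's database work inside one shared transaction, commit, and only then run the post-commit steps. A minimal sketch of an object that fits that protocol (hypothetical; the real completion classes are not shown in these examples):

class _ExampleCompletion(object):
    """hypothetical completion illustrating the protocol sync_value_file expects"""
    def __init__(self, reply_socket, reply_message):
        self._reply_socket = reply_socket
        self._reply_message = reply_message

    def pre_commit_process(self):
        # database work that must ride in the shared transaction,
        # e.g. finalizing the segment row for a completed archive
        pass

    def post_commit_process(self):
        # side effects that may only happen once the data is durable,
        # e.g. acknowledging the archive back to the web server
        self._reply_socket.send(self._reply_message)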
Example No. 7
class Writer(object):
    """
    Manage writing segment values to disk
    """
    def __init__(self, connection, repository_path):
        self._log = logging.getLogger("Writer")
        self._connection = connection
        self._repository_path = repository_path
        self._active_segments = dict()

        # open a new value file at startup
        self._value_file = OutputValueFile(
            self._connection, self._repository_path
        )

    def close(self):
        self._value_file.close()

    def start_new_segment(
        self, 
        collection_id, 
        key, 
        unified_id,
        timestamp_repr, 
        conjoined_part,
        segment_num,
        source_node_id,
        handoff_node_id,
    ):
        """
        Initiate storing a segment of data for a file
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("start_new_segment %s %s %s %s %s %s %s" % (
            collection_id, 
            key, 
            unified_id, 
            timestamp_repr, 
            conjoined_part,
            segment_num, 
            source_node_id,
        ))
        if segment_key in self._active_segments:
            raise ValueError("duplicate segment %s" % (segment_key, ))

        timestamp = parse_timestamp_repr(timestamp_repr)

        self._active_segments[segment_key] = {
            "segment-id" : _insert_new_segment_row(self._connection,
                                                   collection_id, 
                                                   unified_id,
                                                   key, 
                                                   timestamp, 
                                                   conjoined_part,
                                                   segment_num,
                                                   source_node_id,
                                                   handoff_node_id),
        }

    def store_sequence(
        self, 
        collection_id, 
        key, 
        unified_id,
        timestamp_repr, 
        conjoined_part,
        segment_num, 
        segment_size,
        zfec_padding_size,
        segment_md5_digest,
        segment_adler32,
        sequence_num, 
        data
    ):
        """
        store one piece (sequence) of segment data
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("store_sequence %s %s %s %s %s: %s (%s)" % (
            collection_id, 
            key, 
            unified_id,
            timestamp_repr, 
            segment_num, 
            sequence_num,
            segment_size
        ))
        segment_entry = self._active_segments[segment_key]

        # if this write would put us over the max size,
        # start a new output value file
        if self._value_file.size + segment_size > _max_value_file_size:
            self._value_file.close()
            self._value_file = OutputValueFile(
                self._connection, self._repository_path
            )

        segment_sequence_row = segment_sequence_template(
            collection_id=collection_id,
            segment_id=segment_entry["segment-id"],
            zfec_padding_size=zfec_padding_size,
            value_file_id=self._value_file.value_file_id,
            sequence_num=sequence_num,
            value_file_offset=self._value_file.size,
            size=segment_size,
            hash=psycopg2.Binary(segment_md5_digest),
            adler32=segment_adler32,
        )

        self._value_file.write_data_for_one_sequence(
            collection_id, segment_entry["segment-id"], data
        )

        _insert_segment_sequence_row(self._connection, segment_sequence_row)

    def finish_new_segment(
        self, 
        collection_id,
        unified_id,
        timestamp_repr,
        conjoined_part,
        segment_num,
        file_size,
        file_adler32,
        file_hash,
        meta_dict,
    ): 
        """
        finalize storing one segment of data for a file
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("finish_new_segment %s %s" % (
            unified_id, 
            segment_num, 
        ))
        segment_entry = self._active_segments.pop(segment_key)

        timestamp = parse_timestamp_repr(timestamp_repr)

        meta_rows = list()
        for meta_key, meta_value in meta_dict.items():
            meta_row = meta_row_template(
                collection_id=collection_id,
                segment_id=segment_entry["segment-id"],
                meta_key=meta_key,
                meta_value=meta_value,
                timestamp=timestamp
            )
            meta_rows.append(meta_row)

        _finalize_segment_row(
            self._connection, 
            segment_entry["segment-id"],
            file_size, 
            file_adler32, 
            file_hash, 
            meta_rows
        )
    
    def set_tombstone(
        self, 
        collection_id, 
        key, 
        unified_id_to_delete,
        unified_id, 
        timestamp, 
        segment_num, 
        source_node_id,
        handoff_node_id
    ):
        """
        mark a key as deleted
        """
        _insert_segment_tombstone_row(
            self._connection,
            collection_id, 
            key, 
            unified_id,
            timestamp, 
            segment_num,
            unified_id_to_delete,
            source_node_id,
            handoff_node_id
        )

    def cancel_active_archives_from_node(self, source_node_id, timestamp):
        """
        cancel all segment rows that
           * come from a specific source node
           * are in active status
           * have a timestamp earlier than the specified time.
        This is triggered by a web server restart
        """
        _cancel_segment_rows(self._connection, source_node_id, timestamp)

    def cancel_active_archive(self, unified_id, conjoined_part, segment_num):
        """
        cancel an archive that is in progress, presumably due to failure
        at the web server
        """
        segment_key = (unified_id, conjoined_part, segment_num, )
        self._log.info("cancel_active_archive %s %s" % (
            unified_id, segment_num
        ))
        # 2012-02-27 dougfort -- there is a race condition where the web
        # server sends out cancellations on an archive that has completed
        # because it hasn't received the final message yet
        try:
            segment_entry = self._active_segments.pop(segment_key)
        except KeyError:
            pass
        else:
            _cancel_segment_row(self._connection, segment_entry["segment-id"])

    def purge_handoff_source(
        self, collection_id, unified_id, handoff_node_id
    ):
        """
        delete rows for a handoff source
        """
        _purge_handoff_source(
            self._connection, 
            collection_id, 
            unified_id, 
            handoff_node_id
        )

    def start_conjoined_archive(
        self, collection_id, key, unified_id, timestamp
    ):
        """
        start a conjoined archive
        """
        conjoined_dict = {
            "collection_id"     : collection_id,
            "key"               : key,
            "unified_id"        : unified_id,
            "create_timestamp"  : timestamp
        }
        _insert_conjoined_row(self._connection, conjoined_dict)

    def abort_conjoined_archive(
        self, collection_id, key, unified_id, timestamp
    ):
        """
        mark a conjoined archive as aborted
        """
        conjoined_dict = {
            "collection_id"     : collection_id,
            "key"               : key,
            "unified_id"        : unified_id,
            "abort_timestamp"   : timestamp
        }
        _set_conjoined_abort_timestamp(self._connection, conjoined_dict)

    def finish_conjoined_archive(
        self, collection_id, key, unified_id, timestamp
    ):
        """
        mark a conjoined archive as finished
        """
        conjoined_dict = {
            "collection_id"      : collection_id,
            "key"                : key,
            "unified_id"         : unified_id,
            "complete_timestamp" : timestamp
        }
        _set_conjoined_complete_timestamp(self._connection, conjoined_dict)
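Putting Example No. 7 together, a single archive flows through start_new_segment, one or more store_sequence calls, and finish_new_segment, with close() at shutdown. A hedged end-to-end sketch: connection, repository_path, timestamp_repr, and source_node_id are placeholders supplied by the node's setup, and the literal ids and key are made up for illustration.

import hashlib
import zlib

data = "example segment data"
writer = Writer(connection, repository_path)

writer.start_new_segment(collection_id=1001, key="test-key", unified_id=1,
                         timestamp_repr=timestamp_repr, conjoined_part=0,
                         segment_num=1, source_node_id=source_node_id,
                         handoff_node_id=None)
writer.store_sequence(collection_id=1001, key="test-key", unified_id=1,
                      timestamp_repr=timestamp_repr, conjoined_part=0,
                      segment_num=1, segment_size=len(data),
                      zfec_padding_size=0,
                      segment_md5_digest=hashlib.md5(data).digest(),
                      segment_adler32=zlib.adler32(data),
                      sequence_num=0, data=data)
writer.finish_new_segment(collection_id=1001, unified_id=1,
                          timestamp_repr=timestamp_repr, conjoined_part=0,
                          segment_num=1, file_size=len(data),
                          file_adler32=zlib.adler32(data),
                          file_hash=hashlib.md5(data).digest(),
                          meta_dict={})
writer.close()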