Example #1
import errno
import hashlib
import logging
import os

def _value_file_status(connection, entry):
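    """Return the integrity status of the value file referenced by entry:
    _value_file_valid, _value_file_questionable, or _value_file_missing."""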
    log = logging.getLogger("_value_file_status")
    batch_key = make_batch_key(entry)

    value_file_path = compute_value_file_path(_repository_path, 
                                              entry.space_id, 
                                              entry.value_file_id)
    # Always do a stat on the value file. 
    try:
        stat_result = os.stat(value_file_path)
    except OSError as instance:
        # If the value file is missing, consider all of the segment_sequences 
        # to be missing, and handle it as such.
        if instance.errno == errno.ENOENT:
            log.error("value file missing {0} {1}".format(batch_key,
                                                          value_file_path))
            return _value_file_missing
        log.error("Error stat'ing value file {0} {1} {2}".format(
            str(instance), batch_key, value_file_path))
        raise

    # If the value file is still open, consider all data in it undamaged.
    if entry.value_file_close_time is None:
        return _value_file_valid

    # If the value file exists, is closed, has an md5 hash and a size in
    # the database, the size from the stat matches the size in the
    # database, and its close_time or last_integrity_check_time is younger
    # than MAX_TIME_BETWEEN_VALUE_FILE_INTEGRITY_CHECK, consider all
    # records in the file undamaged. (This is the common case.)
    if entry.value_file_hash is None:
        log.info("Value file row has no md5 hash {0} {1}".format(batch_key,
                                                                 entry))
        return _value_file_questionable

    if entry.value_file_size is None:
        log.info("Value file row has no size {0} {1}".format(batch_key,
                                                             entry))
        return _value_file_questionable

    if entry.value_file_size != stat_result.st_size:
        log.info("Value file row size {0} != stat size {1} {2}".format(
            entry.value_file_size, stat_result.st_size, batch_key))
        return _value_file_questionable
    
    current_time = create_timestamp()
    value_file_row_age = current_time - entry.value_file_close_time
    if entry.value_file_last_integrity_check_time is not None:
        value_file_row_age = \
                current_time - entry.value_file_last_integrity_check_time

    if value_file_row_age < _max_value_file_time:
        return _value_file_valid

    value_file_result = _value_file_valid

    # If the value file matches all of the previous criteria EXCEPT
    # MAX_TIME_BETWEEN_VALUE_FILE_INTEGRITY_CHECK, read the whole file and
    # calculate the md5. If it matches, consider the whole file good as
    # above. Update last_integrity_check_time regardless.

    md5_sum = hashlib.md5()
    try:
        with open(value_file_path, "rb") as input_file:
            while True:
                data = input_file.read(_read_buffer_size)
                if len(data) == 0:
                    break
                md5_sum.update(data)
    except (OSError, IOError) as instance:
        log.error("Error reading {0} {1}".format(value_file_path, 
                                                 instance))
        value_file_result = _value_file_questionable

    if value_file_result == _value_file_valid and \
       md5_sum.digest() != bytes(entry.value_file_hash):
        log.error(
            "md5 mismatch {0} {1} {2} {3}".format(md5_sum.digest(),
                                                  bytes(entry.value_file_hash),
                                                  batch_key,
                                                  value_file_path))
        value_file_result = _value_file_questionable

    # We're only supposed to update the integrity check time after we've
    # also read the file and inserted any damage, not before. Otherwise
    # it's a race condition: we may crash before finishing the check of
    # the file, and then the file never gets checked, but it is marked
    # as checked.

    _update_value_file_last_integrity_check_time(connection,
                                                 entry.value_file_id,
                                                 create_timestamp())

    return value_file_result
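
The function above depends on module-level names defined elsewhere in the original module (_repository_path, _value_file_valid, _value_file_questionable, _value_file_missing, _max_value_file_time, _read_buffer_size, create_timestamp, compute_value_file_path). A minimal sketch of what those definitions might look like, assuming string sentinels for the status values and datetime-based timestamps; the concrete values and the path layout here are illustrative guesses, not the original code:

import datetime
import os

# Status sentinels returned by _value_file_status(); assumed to be
# distinct module-level constants (strings keep log output readable).
_value_file_valid = "valid"
_value_file_questionable = "questionable"
_value_file_missing = "missing"

# Illustrative repository root; the real value comes from configuration.
_repository_path = "/var/lib/repository"

# Maximum age before a closed value file must be re-read and re-hashed
# (the 30-day value is an assumption).
_max_value_file_time = datetime.timedelta(days=30)

# Chunk size used when streaming a value file through md5.
_read_buffer_size = 1024 * 1024

def create_timestamp():
    # Assumed helper: subtracting two of these timestamps yields a
    # timedelta comparable against _max_value_file_time.
    return datetime.datetime.utcnow()

def compute_value_file_path(repository_path, space_id, value_file_id):
    # Assumed layout: one file per value_file_id in a per-space directory.
    return os.path.join(repository_path, str(space_id), str(value_file_id))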
Example #2
import logging

def _process_work_batch(connection, known_value_files, batch):
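    """Check one batch of segment_sequence entries sharing a batch key,
    storing any missing or defective sequence numbers as damaged segments."""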
    log = logging.getLogger("_process_work_batch")

    assert len(batch) > 0
    batch_key = make_batch_key(batch[0])
    log.info("batch {0}".format(batch_key))

    missing_sequence_numbers = list()
    defective_sequence_numbers = list()

    expected_slice_count = compute_expected_slice_count(batch[0].file_size)
    expected_sequence_numbers = set(range(0, expected_slice_count))
    actual_sequence_numbers = {entry.sequence_num for entry in batch}
    missing_sequence_numbers.extend(
            expected_sequence_numbers - actual_sequence_numbers)

    for entry in batch:
        if entry.value_file_id not in known_value_files:
            known_value_files[entry.value_file_id] = \
                    _value_file_status(connection, entry)
        value_file_status = known_value_files[entry.value_file_id]

        # if we don't have a value_file for any sequence, 
        # treat that as missing too
        if value_file_status == _value_file_missing:
            log.info("Missing value file {0} for {1} sequence {2}".format(
                entry.value_file_id, batch_key, entry.sequence_num))
            missing_sequence_numbers.append(entry.sequence_num)
            continue

        if not _always_check_entries:
            if value_file_status == _value_file_valid:
                continue

            # If none of the above branches were fruitful, then every
            # record in the database that points to this value file must
            # be verified by opening, seeking, reading, and hashing the
            # record pointed to in the value file (sketched after this
            # example). This will be terribly costly in terms of IO
            # because our work is not sorted by value file. Fortunately,
            # data corruption should be rare enough that the inefficiency
            # is irrelevant.
            assert value_file_status == _value_file_questionable

        if not _verify_entry_against_value_file(entry):
            log.info("Defective value file {0} for {1} sequence {2}".format(
                entry.value_file_id, batch_key, entry.sequence_num))
            defective_sequence_numbers.append(entry.sequence_num)
            continue

    if missing_sequence_numbers:
        missing_sequence_numbers.sort()
        log.info("missing sequence numbers {0}".format(
            missing_sequence_numbers))
        _store_damaged_segment(connection, 
                               batch[0], 
                               damaged_segment_missing_sequence,
                               missing_sequence_numbers)

    if defective_sequence_numbers:
        defective_sequence_numbers.sort()
        log.info("defective sequence numbers {0}".format(
            defective_sequence_numbers))
        _store_damaged_segment(connection, 
                               batch[0], 
                               damaged_segment_defective_sequence,
                               defective_sequence_numbers)
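
_verify_entry_against_value_file() is not shown in the listing. Based on the comment above (open, seek, read, and hash the record the entry points to), here is a minimal sketch of what it might look like, assuming the imports and module-level helpers from Example #1 are in scope; the entry attributes value_file_offset, data_size, and data_hash are assumptions about the database row, not names taken from the original code:

def _verify_entry_against_value_file(entry):
    # Sketch: open the value file, seek to the record, hash exactly
    # data_size bytes, and compare against the hash stored in the
    # database row. Attribute names here are illustrative guesses.
    log = logging.getLogger("_verify_entry_against_value_file")
    value_file_path = compute_value_file_path(_repository_path,
                                              entry.space_id,
                                              entry.value_file_id)
    md5_sum = hashlib.md5()
    try:
        with open(value_file_path, "rb") as input_file:
            input_file.seek(entry.value_file_offset)
            remaining = entry.data_size
            while remaining > 0:
                data = input_file.read(min(remaining, _read_buffer_size))
                if len(data) == 0:
                    # Truncated file: the hash comparison below will fail.
                    break
                md5_sum.update(data)
                remaining -= len(data)
    except (OSError, IOError) as instance:
        log.error("Error reading {0} {1}".format(value_file_path, instance))
        return False
    return md5_sum.digest() == bytes(entry.data_hash)

Note that known_value_files is supplied by the caller and survives across batches, so each value file's status, which may involve hashing the entire file, is computed at most once per run even when many batches reference the same value file.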