import hashlib

import numpy as np
import pandas as pd
import pymongo
import pytest
from bson.binary import Binary
from pymongo.errors import OperationFailure

# The following names are assumed to be provided elsewhere in this test module
# and its fixtures/helpers: `symbol`, `create_test_data`, `get_large_ts`,
# `df_serializer`, `_TEST_DATA`, `NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS`, and
# `vsu` (the version-store utils module exposing analyze_symbol,
# fast_is_corrupted and is_safe_to_append).  `library` and `library_name` are
# pytest fixtures.


def _corrupt_with_append_only(library, library_name):
    def do_fail(version):
        raise Exception('test')

    large_ts = create_test_data(size=2000, cols=100, index=True, multiindex=False,
                                random_data=True, random_ids=True)

    library.write(symbol, large_ts[0:1000])  # v1
    library.snapshot('snap_write_a')
    library.append(symbol, large_ts[1000:1010])  # v2
    library.snapshot('snap_write_b')

    # Here we simulate a scenario where an append succeeds in inserting the data
    # segments but fails to insert the version document (i.e. a Mongo error occurred)
    orig_insert_version = library._insert_version
    library._insert_version = do_fail
    try:
        library.append(symbol, large_ts[1010:1020])  # v3
    except Exception:
        pass
    library._insert_version = orig_insert_version

    library.write_metadata(symbol, {'hello': 'there'})

    # Subsequently appending overlapping, non-SHA-matching data causes data corruption
    library.append(symbol, large_ts[1018:1030])

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
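# The tests below call an `n_append` helper that is defined elsewhere in this
# test module.  The following is a minimal, hedged sketch of the behaviour the
# call sites rely on; the names and defaults of the two trailing booleans
# (`prune`, `snap`) are assumptions, not the real helper's signature, and
# `library_name` is unused in this sketch.
def n_append(library, library_name, n, rows_per_append, df, start_row,
             prune=False, snap=True):
    """Append `n` consecutive slices of `df` (`rows_per_append` rows each)
    starting at `start_row`, optionally snapshotting as 'snap_<end_row>'
    after each append, and return the row index reached."""
    last_row = start_row
    for _ in range(n):
        end_row = last_row + rows_per_append
        library.append(symbol, df[last_row:end_row], prune_previous_version=prune)
        if snap:
            library.snapshot('snap_{}'.format(end_row))
        last_row = end_row
    return last_row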
def test_no_corruption_restore_writemeta_append(library, library_name):
    large_ts = create_test_data(size=2000, cols=100, index=True, multiindex=False,
                                random_data=True, random_ids=True)
    rows_per_append = 100

    last_row = n_append(library, library_name, 9, rows_per_append, large_ts, 0)
    library.write_metadata(symbol, metadata={'abc': 'xyz'})
    n_append(library, library_name, 9, rows_per_append, large_ts, last_row)
    library.write_metadata(symbol, metadata={'abc2': 'xyz2'})

    # This restore/write_metadata/append sequence used to corrupt all versions
    # from the one at which row "restore_from_row" was written
    restore_from_row = rows_per_append * 10
    library.restore_version(symbol, 'snap_{}'.format(restore_from_row))
    library.write_metadata(symbol, metadata={'abc3': 'xyz3'})
    library.append(symbol, large_ts[restore_from_row:restore_from_row + 50])
    library.write_metadata(symbol, metadata={'abc4': 'xyz4'})

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def test_no_corruption_restore_append_non_overlapping_tstamps(library, library_name):
    large_ts = create_test_data(size=2000, cols=100, index=True, multiindex=False,
                                random_data=True, random_ids=True)

    # Append 50 small uncompressed segments (no new base yet)
    last_row_b = n_append(library, library_name, 50, 25, large_ts, 0, False, True)
    library.snapshot('snap_A')

    # Append 15 more small segments, which causes one copy-rewrite with a new
    # base, followed by some more small appended segments
    n_append(library, library_name, 15, 25, large_ts, last_row_b, True, True)

    library.restore_version(symbol, as_of='snap_A')
    last_row = n_append(library, library_name, 1, 40, large_ts, last_row_b, False, True)
    library.snapshot('snap_B')

    # This append pattern used to corrupt all versions
    last_row = n_append(library, library_name, 1, 10, large_ts, last_row, False, True)
    last_row = n_append(library, library_name, 8, 20, large_ts, last_row, False, True)
    library.snapshot('snap_C')

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def get_random_df(num_chunks):
    data_to_write = create_test_data(size=25000, index=True, multiindex=False,
                                     random_data=True, random_ids=True,
                                     use_hours=True, date_offset=0, cols=10)
    # Replicate the frame num_chunks times (num_chunks - 1 extra copies)
    data_to_write = data_to_write.append([data_to_write] * (num_chunks - 1))
    return data_to_write
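# `create_test_data` and `get_large_ts` come from the shared test utilities
# and are not shown in this section.  Below is a rough, hedged stand-in that
# illustrates the shape of data these tests operate on; how the real helper
# interprets `index` and `random_ids` may differ.
import uuid


def _create_test_data_sketch(size=100, cols=10, index=True, multiindex=False,
                             random_data=True, random_ids=True,
                             use_hours=False, date_offset=0):
    """`size` rows of `cols` float columns on a DatetimeIndex (hourly when
    `use_hours`, else daily), shifted by `date_offset` days; `multiindex`
    stacks a second integer level onto the index."""
    idx = (pd.date_range('2016-01-01', periods=size, freq='H' if use_hours else 'D')
           + pd.Timedelta(days=date_offset))
    data = np.random.randn(size, cols) if random_data else np.ones((size, cols))
    names = [str(uuid.uuid4())[:8] if random_ids else 'col%d' % i for i in range(cols)]
    df = pd.DataFrame(data, index=idx, columns=names)
    if multiindex:
        df.index = pd.MultiIndex.from_arrays([idx, np.arange(size)], names=['date', 'id'])
    return df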
def test_restore_append_overlapping_corrupts_last(library, library_name):
    large_ts = create_test_data(size=2000, cols=100, index=True, multiindex=False,
                                random_data=True, random_ids=True)

    library.write(symbol, large_ts[0:1000])
    library.snapshot('snap_write_a')
    library.append(symbol, large_ts[1000:1010])

    library.restore_version(symbol, as_of='snap_write_a', prune_previous_version=True)
    library.append(symbol, large_ts[1000:1012])

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def test_fast_check_corruption(library, library_name):
    ts = create_test_data(size=100, cols=100, index=True, multiindex=False,
                          random_data=True, random_ids=True)

    library.write(symbol, ts[0:10])  # v1
    assert not vsu.fast_is_corrupted(library, symbol, input_v=1)

    library.append(symbol, ts[10:20], prune_previous_version=False)  # v2
    assert not vsu.fast_is_corrupted(library, symbol, input_v=2)

    library.append(symbol, ts[20:30], prune_previous_version=False)  # v3
    assert not vsu.fast_is_corrupted(library, symbol, input_v=3)

    # Now the dangerous part
    last_segment = library._collection.find_one({}, sort=[('_id', pymongo.DESCENDING)])
    library._collection.delete_one({'_id': last_segment['_id']})

    assert vsu.fast_is_corrupted(library, symbol, input_v=3)
def test_fast_is_safe_to_append(library, library_name):
    def modify_segment(segment, item):
        segment['segment'] -= 2
        sha = hashlib.sha1()
        sha.update(item.encode('ascii'))
        segment['sha'] = Binary(sha.digest())
        segment.pop('_id')

    ts = create_test_data(size=100, cols=100, index=True, multiindex=False,
                          random_data=True, random_ids=True)

    library.write(symbol, ts[0:10])  # v1
    assert vsu.is_safe_to_append(library, symbol, input_v=1)

    library.append(symbol, ts[10:20], prune_previous_version=False)  # v2
    assert vsu.is_safe_to_append(library, symbol, input_v=2)

    library.append(symbol, ts[20:30], prune_previous_version=False)  # v3
    assert vsu.is_safe_to_append(library, symbol, input_v=3)

    # Corrupt the data by removing a segment
    last_segment = library._collection.find_one({}, sort=[('_id', pymongo.DESCENDING)])
    library._collection.delete_one({'_id': last_segment['_id']})
    assert not vsu.is_safe_to_append(library, symbol, input_v=3)
    with pytest.raises(OperationFailure):
        library.read(symbol)

    # Fix the library by adding back the deleted segment
    library._collection.insert_one(last_segment)
    assert vsu.is_safe_to_append(library, symbol, input_v=3)

    # Corrupt the data by adding an unnecessary segment
    modify_segment(last_segment, 'abcd')
    library._collection.insert_one(last_segment)
    assert not vsu.is_safe_to_append(library, symbol, input_v=3)
    with pytest.raises(OperationFailure):
        library.read(symbol)
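# Note on the probes exercised above: both checks operate on segment documents
# in `library._collection` (each carrying at least '_id', 'segment' and 'sha'
# fields, which is what `modify_segment` manipulates) and version documents in
# `library._versions` (carrying 'version').  The two tests show that a deleted
# segment is caught by both `fast_is_corrupted` and `is_safe_to_append`, while
# the duplicated/modified-segment case is exercised only against
# `is_safe_to_append`.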
def test_append_fail_after_delete_noupsert(library, library_name):
    large_ts = create_test_data(size=2000, cols=100, index=True, multiindex=False,
                                random_data=True, random_ids=True)

    library.write(symbol, large_ts[0:1000])  # v1
    library.snapshot('snap_a')
    library.append(symbol, large_ts[1000:1010])  # v2
    library.snapshot('snap_b')
    library.append(symbol, large_ts[1010:1020])  # v3
    library.snapshot('snap_c')
    library.append(symbol, large_ts[1030:1040])  # v4

    library.delete(symbol)  # v5
    library.append(symbol, large_ts[1040:1050], upsert=False)  # v6

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def _mixed_test_data():
    global _TEST_DATA
    if _TEST_DATA is None:
        onerow_ts = get_large_ts(1)
        small_ts = get_large_ts(10)
        medium_ts = get_large_ts(600)
        large_ts = get_large_ts(1800)
        empty_ts = pd.DataFrame()
        empty_index = create_test_data(size=0, cols=10, index=True, multiindex=False,
                                       random_data=True, random_ids=True)

        with_some_objects_ts = medium_ts.copy(deep=True)
        with_some_objects_ts.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 0] = None
        with_some_objects_ts.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 1] = 'A string'

        large_with_some_objects = create_test_data(size=10000, cols=64, index=True,
                                                   multiindex=False, random_data=True,
                                                   random_ids=True, use_hours=True)
        large_with_some_objects.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 0] = None
        large_with_some_objects.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 1] = 'A string'

        with_string_ts = medium_ts.copy(deep=True)
        with_string_ts['str_col'] = 'abc'
        with_unicode_ts = medium_ts.copy(deep=True)
        with_unicode_ts['ustr_col'] = u'abc'

        with_some_none_ts = medium_ts.copy(deep=True)
        with_some_none_ts.iloc[10:10] = None
        with_some_none_ts.iloc[-10:-10] = np.nan
        with_some_none_ts = with_some_none_ts.replace({np.nan: None})

        # Multi-index data frames
        multiindex_ts = create_test_data(size=500, cols=10, index=True, multiindex=True,
                                         random_data=True, random_ids=True)
        empty_multiindex_ts = create_test_data(size=0, cols=10, index=True, multiindex=True,
                                               random_data=True, random_ids=True)
        large_multi_index = create_test_data(size=50000, cols=10, index=True, multiindex=True,
                                             random_data=True, random_ids=True, use_hours=True)

        # Multi-column data frames
        columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]],
                                             names=["first", "second"])
        empty_multi_column_ts = pd.DataFrame([], columns=columns)
        multi_column_no_multiindex = pd.DataFrame(np.random.randn(2, 8), index=[0, 1],
                                                  columns=columns)
        large_multi_column = pd.DataFrame(np.random.randn(100000, 8), index=range(100000),
                                          columns=columns)
        columns = pd.MultiIndex.from_product([[1, 2, 'a'], ['c', 5]])
        multi_column_int_levels = pd.DataFrame([[9, 2, 8, 1, 2, 3], [3, 4, 2, 9, 10, 11]],
                                               index=['x', 'y'], columns=columns)

        # Multi-index and multi-column data frames
        columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]])
        index = pd.MultiIndex.from_product([["x", "y", "z"], ["a", "b"]])
        multi_column_and_multi_index = pd.DataFrame(np.random.randn(6, 8), index=index,
                                                    columns=columns)

        # Nested n-dimensional
        def _new_np_nd_array(val):
            return np.rec.array([(val, ['A', 'BC'])],
                                dtype=[('index', '<M8[ns]'), ('values', 'S2', (2,))])

        n_dimensional_df = pd.DataFrame(
            {'a': [_new_np_nd_array(1356998400000000000), _new_np_nd_array(1356998400000000001)],
             'b': [_new_np_nd_array(1356998400000000002), _new_np_nd_array(1356998400000000003)]},
            index=(0, 1))

        # With mixed types (i.e. string / numbers) in multi-index
        input_dict = {'POSITION': {
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'SYMA', 'XX', 0): 0.0,
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'SYMA', 'FFL', '201312'): -558.0,
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'AG', 'FFL', '201312'): -74.0,
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'AG', 'XX', 0): 0.0}
        }
        multi_index_with_object = pd.DataFrame.from_dict(input_dict)

        # Exhaust all dtypes
        mixed_dtypes_df = pd.DataFrame({
            'string': list('abc'),
            'int64': list(range(1, 4)),
            'uint8': np.arange(3, 6).astype('u1'),
            'uint64': np.arange(3, 6).astype('u8'),
            'float64': np.arange(4.0, 7.0),
            'bool1': [True, False, True],
            'dates': pd.date_range('now', periods=3).values,
            'other_dates': pd.date_range('20130101', periods=3).values,
            # 'category': pd.Series(list("ABC")).astype('category'),
            'tz_aware_dates': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'complex': np.array([1. + 4.j, 2. + 5.j, 3. + 6.j])
        })
        mixed_dtypes_df['timedeltas'] = mixed_dtypes_df.dates.diff()

        # Multi-column with some objects
        multi_column_with_some_objects = multi_column_no_multiindex.copy()
        multi_column_with_some_objects.iloc[1:, 1:2] = 'Convert this column dtype to object'

        # Index with timezone-aware datetime
        index_tz_aware = pd.DataFrame(
            data={'colA': range(10),
                  'colB': pd.date_range('20130101', periods=10, tz='US/Eastern')},
            index=pd.date_range('20130101', periods=10, tz='US/Eastern'))
        index_tz_aware.index.name = 'index'

        _TEST_DATA = {
            'onerow': (onerow_ts, df_serializer.serialize(onerow_ts),
                       df_serializer.can_convert_to_records_without_objects(onerow_ts, 'symA')),
            'small': (small_ts, df_serializer.serialize(small_ts),
                      df_serializer.can_convert_to_records_without_objects(small_ts, 'symA')),
            'medium': (medium_ts, df_serializer.serialize(medium_ts),
                       df_serializer.can_convert_to_records_without_objects(medium_ts, 'symA')),
            'large': (large_ts, df_serializer.serialize(large_ts),
                      df_serializer.can_convert_to_records_without_objects(large_ts, 'symA')),
            'empty': (empty_ts, df_serializer.serialize(empty_ts),
                      df_serializer.can_convert_to_records_without_objects(empty_ts, 'symA')),
            'empty_index': (empty_index, df_serializer.serialize(empty_index),
                            df_serializer.can_convert_to_records_without_objects(empty_index, 'symA')),
            'with_some_objects': (with_some_objects_ts, df_serializer.serialize(with_some_objects_ts),
                                  df_serializer.can_convert_to_records_without_objects(with_some_objects_ts, 'symA')),
            'large_with_some_objects': (large_with_some_objects, df_serializer.serialize(large_with_some_objects),
                                        df_serializer.can_convert_to_records_without_objects(large_with_some_objects, 'symA')),
            'with_string': (with_string_ts, df_serializer.serialize(with_string_ts),
                            df_serializer.can_convert_to_records_without_objects(with_string_ts, 'symA')),
            'with_unicode': (with_unicode_ts, df_serializer.serialize(with_unicode_ts),
                             df_serializer.can_convert_to_records_without_objects(with_unicode_ts, 'symA')),
            'with_some_none': (with_some_none_ts, df_serializer.serialize(with_some_none_ts),
                               df_serializer.can_convert_to_records_without_objects(with_some_none_ts, 'symA')),
            'multiindex': (multiindex_ts, df_serializer.serialize(multiindex_ts),
                           df_serializer.can_convert_to_records_without_objects(multiindex_ts, 'symA')),
            'multiindex_with_object': (multi_index_with_object, df_serializer.serialize(multi_index_with_object),
                                       df_serializer.can_convert_to_records_without_objects(multi_index_with_object, 'symA')),
            'empty_multiindex': (empty_multiindex_ts, df_serializer.serialize(empty_multiindex_ts),
                                 df_serializer.can_convert_to_records_without_objects(empty_multiindex_ts, 'symA')),
            'large_multi_index': (large_multi_index, df_serializer.serialize(large_multi_index),
                                  df_serializer.can_convert_to_records_without_objects(large_multi_index, 'symA')),
            'empty_multicolumn': (empty_multi_column_ts, df_serializer.serialize(empty_multi_column_ts),
                                  df_serializer.can_convert_to_records_without_objects(empty_multi_column_ts, 'symA')),
            'multi_column_no_multiindex': (multi_column_no_multiindex, df_serializer.serialize(multi_column_no_multiindex),
                                           df_serializer.can_convert_to_records_without_objects(multi_column_no_multiindex, 'symA')),
            'large_multi_column': (large_multi_column, df_serializer.serialize(large_multi_column),
                                   df_serializer.can_convert_to_records_without_objects(large_multi_column, 'symA')),
            'multi_column_int_levels': (multi_column_int_levels, df_serializer.serialize(multi_column_int_levels),
                                        df_serializer.can_convert_to_records_without_objects(multi_column_int_levels, 'symA')),
            'multi_column_and_multi_index': (multi_column_and_multi_index, df_serializer.serialize(multi_column_and_multi_index),
                                             df_serializer.can_convert_to_records_without_objects(multi_column_and_multi_index, 'symA')),
            'multi_column_with_some_objects': (multi_column_with_some_objects, df_serializer.serialize(multi_column_with_some_objects),
                                               df_serializer.can_convert_to_records_without_objects(multi_column_with_some_objects, 'symA')),
            'n_dimensional_df': (n_dimensional_df, Exception, None),
            'mixed_dtypes_df': (mixed_dtypes_df, df_serializer.serialize(mixed_dtypes_df),
                                df_serializer.can_convert_to_records_without_objects(mixed_dtypes_df, 'symA')),
            'index_tz_aware': (index_tz_aware, df_serializer.serialize(index_tz_aware),
                               df_serializer.can_convert_to_records_without_objects(index_tz_aware, 'symA')),
        }
    return _TEST_DATA
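# Hedged usage sketch (not part of the original module): each `_TEST_DATA`
# entry is a (dataframe, expected-serialization-or-Exception, can-convert)
# triple, and a parametrized consumer along these lines shows the intended
# access pattern.
@pytest.mark.parametrize("df_kind", sorted(_mixed_test_data()))
def test_mixed_data_fixture_shape_sketch(df_kind):
    df, expected, can_convert = _mixed_test_data()[df_kind]
    assert isinstance(df, pd.DataFrame)
    if expected is Exception:
        # Inputs the serializer is expected to reject are tagged with the
        # Exception class itself
        with pytest.raises(Exception):
            df_serializer.serialize(df)
    else:
        # can_convert records whether the frame converts to records without
        # falling back to object dtype
        assert can_convert in (True, False)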