def _corrupt_with_append_only(library, library_name):
    def do_fail(version):
        raise Exception('test')

    large_ts = create_test_data(size=2000, cols=100,
                                index=True, multiindex=False,
                                random_data=True, random_ids=True)
    library.write(symbol, large_ts[0:1000])  # v1
    library.snapshot('snap_write_a')
    library.append(symbol, large_ts[1000:1010])  # v2
    library.snapshot('snap_write_b')

    # Here we simulate a scenario where an append succeeds to insert the data segments,
    # but fails to insert the version document (i.e. Mongo error occurred)
    orig_insert_version = library._insert_version
    library._insert_version = do_fail
    try:
        library.append(symbol, large_ts[1010:1020])  # v3
    except Exception:
        pass
    library._insert_version = orig_insert_version

    library.write_metadata(symbol, {'hello': 'there'})  # , prune_previous_version=False)

    # Subsequently appending overlapping, non-SHA-matching data causes data corruption
    library.append(symbol, large_ts[1018:1030])  # , prune_previous_version=False)

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
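# The read-back loop above recurs in most of these tests. A small helper along the
# following lines (a sketch, not part of the library or of these tests) would factor
# it out: it walks every version document, newest first, and reads the symbol as of
# that version, so any corrupted version raises during the read.
def assert_all_versions_readable(library, symbol):
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])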
def test_no_corruption_restore_writemeta_append(library, library_name):
    large_ts = create_test_data(size=2000, cols=100,
                                index=True, multiindex=False,
                                random_data=True, random_ids=True)
    rows_per_append = 100

    last_row = n_append(library, library_name, 9, rows_per_append, large_ts, 0)

    library.write_metadata(symbol, metadata={'abc': 'xyz'})

    n_append(library, library_name, 9, rows_per_append, large_ts, last_row)

    library.write_metadata(symbol, metadata={'abc2': 'xyz2'})

    # Corrupts all versions between the version at which row "restore_from_row" was written and the latest version
    restore_from_row = rows_per_append * 10
    library.restore_version(symbol, 'snap_{}'.format(restore_from_row))

    library.write_metadata(symbol, metadata={'abc3': 'xyz3'})

    library.append(symbol, large_ts[restore_from_row:restore_from_row + 50])

    library.write_metadata(symbol, metadata={'abc4': 'xyz4'})

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
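# The n_append() helper used above is not shown in this listing. A plausible sketch,
# assuming it appends consecutive chunks of 'rows_per_append' rows, snapshots after
# each append under a 'snap_<row>' name (matching the 'snap_1000' restore above), and
# returns the next unwritten row index. The extra boolean arguments seen in later
# tests are not modelled here; 'library_name' is kept only to mirror the call sites.
def n_append_sketch(library, library_name, n_appends, rows_per_append, data, start_row):
    row = start_row
    for _ in range(n_appends):
        library.append(symbol, data[row:row + rows_per_append])
        row += rows_per_append
        library.snapshot('snap_{}'.format(row))
    return row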
def test_no_corruption_restore_append_non_overlapping_tstamps(library, library_name):
    large_ts = create_test_data(size=2000, cols=100,
                                index=True, multiindex=False,
                                random_data=True, random_ids=True)

    # Append 50 small uncompressed segments (no new base yet)
    last_row_b = n_append(library, library_name, 50, 25, large_ts, 0, False, True)

    library.snapshot('snap_A')

    # Append more small segments; this causes one copy/rewrite with a new base, followed by some small appended segments
    n_append(library, library_name, 15, 25, large_ts, last_row_b, True, True)

    library.restore_version(symbol, as_of='snap_A')

    last_row = n_append(library, library_name, 1, 40, large_ts, last_row_b, False, True)
    library.snapshot('snap_B')

    # Corrupts all versions
    last_row = n_append(library, library_name, 1, 10, large_ts, last_row, False, True)
    last_row = n_append(library, library_name, 8, 20, large_ts, last_row, False, True)
    library.snapshot('snap_C')

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def get_random_df(num_chunks):
    data_to_write = create_test_data(size=25000,
                                     index=True,
                                     multiindex=False,
                                     random_data=True,
                                     random_ids=True,
                                     use_hours=True,
                                     date_offset=0,
                                     cols=10)
    # Repeat the frame num_chunks times (pd.concat; DataFrame.append is removed in pandas 2.x)
    data_to_write = pd.concat([data_to_write] * num_chunks)
    return data_to_write
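# For instance (a usage sketch): get_random_df(4) returns a frame of
# 4 * 25000 = 100000 rows built from four identical 25000-row chunks.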
def test_restore_append_overlapping_corrupts_last(library, library_name):
    large_ts = create_test_data(size=2000, cols=100,
                                index=True, multiindex=False,
                                random_data=True, random_ids=True)
    library.write(symbol, large_ts[0:1000])
    library.snapshot('snap_write_a')

    library.append(symbol, large_ts[1000:1010])

    library.restore_version(symbol, as_of='snap_write_a', prune_previous_version=True)
    library.append(symbol, large_ts[1000:1012])

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def test_fast_check_corruption(library, library_name):
    ts = create_test_data(size=100, cols=100,
                          index=True, multiindex=False,
                          random_data=True, random_ids=True)
    library.write(symbol, ts[0:10])  # v1

    assert not vsu.fast_is_corrupted(library, symbol, input_v=1)

    library.append(symbol, ts[10:20], prune_previous_version=False)  # v2
    assert not vsu.fast_is_corrupted(library, symbol, input_v=2)

    library.append(symbol, ts[20:30], prune_previous_version=False)  # v3
    assert not vsu.fast_is_corrupted(library, symbol, input_v=3)

    # Now corrupt the symbol by deleting its last data segment
    last_segment = library._collection.find_one({}, sort=[('_id', pymongo.DESCENDING)])
    library._collection.delete_one({'_id': last_segment['_id']})

    assert vsu.fast_is_corrupted(library, symbol, input_v=3)
def test_fast_is_safe_to_append(library, library_name):
    from bson.binary import Binary
    import hashlib
    def modify_segment(segment, item):
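        # Assumed intent: fake a stale segment by shifting its segment number back
        # and giving it a SHA derived from 'item' so it matches no existing version;
        # dropping '_id' lets it be re-inserted as a new document.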
        segment['segment'] -= 2
        sha = hashlib.sha1()
        sha.update(item.encode('ascii'))
        segment['sha'] = Binary(sha.digest())
        segment.pop('_id')

    ts = create_test_data(size=100, cols=100,
                          index=True, multiindex=False,
                          random_data=True, random_ids=True)
    library.write(symbol, ts[0:10])  # v1
    assert vsu.is_safe_to_append(library, symbol, input_v=1)

    library.append(symbol, ts[10:20], prune_previous_version=False)  # v2
    assert vsu.is_safe_to_append(library, symbol, input_v=2)

    library.append(symbol, ts[20:30], prune_previous_version=False)  # v3
    assert vsu.is_safe_to_append(library, symbol, input_v=3)

    # Corrupt the data by removing a segment
    last_segment = library._collection.find_one({}, sort=[('_id', pymongo.DESCENDING)])
    library._collection.delete_one({'_id': last_segment['_id']})
    assert not vsu.is_safe_to_append(library, symbol, input_v=3)
    with pytest.raises(OperationFailure):
        library.read(symbol)

    # Fix the library by adding back the deleted segment
    library._collection.insert_one(last_segment)
    assert vsu.is_safe_to_append(library, symbol, input_v=3)

    # Corrupt the data by adding an unnecessary segment
    modify_segment(last_segment, 'abcd')
    library._collection.insert_one(last_segment)
    assert not vsu.is_safe_to_append(library, symbol, input_v=3)
    with pytest.raises(OperationFailure):
        library.read(symbol)
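# A guarded-append pattern suggested by the test above (a sketch, not an API these
# tests define): check appendability of the latest version first and fall back to a
# full rewrite when the symbol is not in an appendable state.
def append_or_rewrite(library, symbol, df, last_version):
    if vsu.is_safe_to_append(library, symbol, input_v=last_version):
        library.append(symbol, df)
    else:
        library.write(symbol, df)  # rewriting creates a fresh, self-contained version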
def test_append_fail_after_delete_noupsert(library, library_name):
    large_ts = create_test_data(size=2000, cols=100,
                                index=True, multiindex=False,
                                random_data=True, random_ids=True)
    library.write(symbol, large_ts[0:1000])  # v1
    library.snapshot('snap_a')
    library.append(symbol, large_ts[1000:1010])  # v2
    library.snapshot('snap_b')
    library.append(symbol, large_ts[1010:1020])  # v3
    library.snapshot('snap_c')

    library.append(symbol, large_ts[1030:1040])  # v4

    library.delete(symbol)  # v5

    library.append(symbol, large_ts[1040:1050], upsert=False)  # v6

    last_v = library._versions.find_one(sort=[('version', pymongo.DESCENDING)])
    vsu.analyze_symbol(library, symbol, 0, last_v['version'] + 1)

    # Verify no versions have been corrupted
    for v in library._versions.find(sort=[('version', pymongo.DESCENDING)]):
        library.read(symbol, as_of=v['version'])
def _mixed_test_data():
    global _TEST_DATA
    if _TEST_DATA is None:
        onerow_ts = get_large_ts(1)
        small_ts = get_large_ts(10)
        medium_ts = get_large_ts(600)
        large_ts = get_large_ts(1800)
        empty_ts = pd.DataFrame()
        empty_index = create_test_data(size=0, cols=10, index=True, multiindex=False, random_data=True, random_ids=True)

        with_some_objects_ts = medium_ts.copy(deep=True)
        with_some_objects_ts.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 0] = None
        with_some_objects_ts.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 1] = 'A string'
        large_with_some_objects = create_test_data(size=10000, cols=64, index=True, multiindex=False, random_data=True,
                                                   random_ids=True, use_hours=True)
        large_with_some_objects.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 0] = None
        large_with_some_objects.iloc[0:NON_HOMOGENEOUS_DTYPE_PATCH_SIZE_ROWS, 1] = 'A string'

        with_string_ts = medium_ts.copy(deep=True)
        with_string_ts['str_col'] = 'abc'
        with_unicode_ts = medium_ts.copy(deep=True)
        with_unicode_ts['ustr_col'] = u'abc'

        with_some_none_ts = medium_ts.copy(deep=True)
        # Introduce some missing values over non-empty row ranges
        with_some_none_ts.iloc[10:20] = None
        with_some_none_ts.iloc[-20:-10] = np.nan
        with_some_none_ts = with_some_none_ts.replace({np.nan: None})

        # Multi-index data frames
        multiindex_ts = create_test_data(size=500, cols=10, index=True, multiindex=True, random_data=True,
                                         random_ids=True)
        empty_multiindex_ts = create_test_data(size=0, cols=10, index=True, multiindex=True, random_data=True,
                                               random_ids=True)
        large_multi_index = create_test_data(
            size=50000, cols=10, index=True, multiindex=True, random_data=True, random_ids=True, use_hours=True)

        # Multi-column data frames
        columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]], names=["first", "second"])
        empty_multi_column_ts = pd.DataFrame([], columns=columns)

        columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]], names=["first", "second"])
        multi_column_no_multiindex = pd.DataFrame(np.random.randn(2, 8), index=[0, 1], columns=columns)

        large_multi_column = pd.DataFrame(np.random.randn(100000, 8), index=range(100000), columns=columns)

        columns = pd.MultiIndex.from_product([[1, 2, 'a'], ['c', 5]])
        multi_column_int_levels = pd.DataFrame([[9, 2, 8, 1, 2, 3], [3, 4, 2, 9, 10, 11]],
                                               index=['x', 'y'], columns=columns)

        # Multi-index and multi-column data frames
        columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]])
        index = pd.MultiIndex.from_product([["x", "y", "z"], ["a", "b"]])
        multi_column_and_multi_index = pd.DataFrame(np.random.randn(6, 8), index=index, columns=columns)

        # Nested n-dimensional
        def _new_np_nd_array(val):
            return np.rec.array([(val, ['A', 'BC'])],
                                dtype=[('index', '<M8[ns]'), ('values', 'S2', (2,))])
        n_dimensional_df = pd.DataFrame(
            {'a': [_new_np_nd_array(1356998400000000000), _new_np_nd_array(1356998400000000001)],
             'b': [_new_np_nd_array(1356998400000000002), _new_np_nd_array(1356998400000000003)]
             },
            index=(0, 1))

        # With mixed types (i.e. string / numbers) in multi-index
        input_dict = {'POSITION': {
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'SYMA', 'XX', 0): 0.0,
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'SYMA', 'FFL', '201312'): -558.0,
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'AG', 'FFL', '201312'): -74.0,
            (pd.Timestamp('2013-10-07 15:45:43'), 'MYSTRT', 'AG', 'XX', 0): 0.0}
        }
        multi_index_with_object = pd.DataFrame.from_dict(input_dict)

        # Exhaust all dtypes
        mixed_dtypes_df = pd.DataFrame({
            'string': list('abc'),
            'int64': list(range(1, 4)),
            'uint8': np.arange(3, 6).astype('u1'),
            'uint64': np.arange(3, 6).astype('u8'),
            'float64': np.arange(4.0, 7.0),
            'bool1': [True, False, True],
            'dates': pd.date_range('now', periods=3).values,
            'other_dates': pd.date_range('20130101', periods=3).values,
            # 'category': pd.Series(list("ABC")).astype('category'),
            'tz_aware_dates': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'complex': np.array([1. + 4.j, 2. + 5.j, 3. + 6.j])
        })
        mixed_dtypes_df['timedeltas'] = mixed_dtypes_df.dates.diff()

        # Multi-column with some objects
        multi_column_with_some_objects = multi_column_no_multiindex.copy()
        multi_column_with_some_objects.iloc[1:, 1:2] = 'Convert this column dtype to object'

        # Index with timezone-aware datetime
        index_tz_aware = pd.DataFrame(data={'colA': range(10),
                                            'colB': pd.date_range('20130101', periods=10, tz='US/Eastern')},
                                      index=pd.date_range('20130101', periods=10, tz='US/Eastern'))
        index_tz_aware.index.name = 'index'

        _TEST_DATA = {
            'onerow': (onerow_ts, df_serializer.serialize(onerow_ts),
                       df_serializer.can_convert_to_records_without_objects(onerow_ts, 'symA')),
            'small': (small_ts, df_serializer.serialize(small_ts),
                      df_serializer.can_convert_to_records_without_objects(small_ts, 'symA')),
            'medium': (medium_ts, df_serializer.serialize(medium_ts),
                       df_serializer.can_convert_to_records_without_objects(medium_ts, 'symA')),
            'large': (large_ts, df_serializer.serialize(large_ts),
                      df_serializer.can_convert_to_records_without_objects(large_ts, 'symA')),
            'empty': (empty_ts, df_serializer.serialize(empty_ts),
                      df_serializer.can_convert_to_records_without_objects(empty_ts, 'symA')),
            'empty_index': (empty_index, df_serializer.serialize(empty_index),
                            df_serializer.can_convert_to_records_without_objects(empty_index, 'symA')),
            'with_some_objects': (with_some_objects_ts, df_serializer.serialize(with_some_objects_ts),
                                  df_serializer.can_convert_to_records_without_objects(with_some_objects_ts, 'symA')),
            'large_with_some_objects': (
                large_with_some_objects, df_serializer.serialize(large_with_some_objects),
                df_serializer.can_convert_to_records_without_objects(large_with_some_objects, 'symA')),
            'with_string': (with_string_ts, df_serializer.serialize(with_string_ts),
                            df_serializer.can_convert_to_records_without_objects(with_string_ts, 'symA')),
            'with_unicode': (with_unicode_ts, df_serializer.serialize(with_unicode_ts),
                             df_serializer.can_convert_to_records_without_objects(with_unicode_ts, 'symA')),
            'with_some_none': (with_some_none_ts, df_serializer.serialize(with_some_none_ts),
                               df_serializer.can_convert_to_records_without_objects(with_some_none_ts, 'symA')),
            'multiindex': (multiindex_ts, df_serializer.serialize(multiindex_ts),
                           df_serializer.can_convert_to_records_without_objects(multiindex_ts, 'symA')),
            'multiindex_with_object': (
                multi_index_with_object, df_serializer.serialize(multi_index_with_object),
                df_serializer.can_convert_to_records_without_objects(multi_index_with_object, 'symA')),
            'empty_multiindex': (empty_multiindex_ts, df_serializer.serialize(empty_multiindex_ts),
                                 df_serializer.can_convert_to_records_without_objects(empty_multiindex_ts, 'symA')),
            'large_multi_index': (large_multi_index, df_serializer.serialize(large_multi_index),
                                  df_serializer.can_convert_to_records_without_objects(large_multi_index, 'symA')),
            'empty_multicolumn': (empty_multi_column_ts, df_serializer.serialize(empty_multi_column_ts),
                                  df_serializer.can_convert_to_records_without_objects(empty_multi_column_ts, 'symA')),
            'multi_column_no_multiindex': (
                multi_column_no_multiindex, df_serializer.serialize(multi_column_no_multiindex),
                df_serializer.can_convert_to_records_without_objects(multi_column_no_multiindex, 'symA')),
            'large_multi_column': (large_multi_column, df_serializer.serialize(large_multi_column),
                                   df_serializer.can_convert_to_records_without_objects(large_multi_column, 'symA')),
            'multi_column_int_levels': (
                multi_column_int_levels, df_serializer.serialize(multi_column_int_levels),
                df_serializer.can_convert_to_records_without_objects(multi_column_int_levels, 'symA')),
            'multi_column_and_multi_index': (
                multi_column_and_multi_index, df_serializer.serialize(multi_column_and_multi_index),
                df_serializer.can_convert_to_records_without_objects(multi_column_and_multi_index, 'symA')),
            'multi_column_with_some_objects': (
                multi_column_with_some_objects, df_serializer.serialize(multi_column_with_some_objects),
                df_serializer.can_convert_to_records_without_objects(multi_column_with_some_objects, 'symA')),
            'n_dimensional_df': (n_dimensional_df, Exception, None),
            'mixed_dtypes_df': (mixed_dtypes_df, df_serializer.serialize(mixed_dtypes_df),
                                df_serializer.can_convert_to_records_without_objects(mixed_dtypes_df, 'symA')),
            'index_tz_aware': (index_tz_aware, df_serializer.serialize(index_tz_aware),
                               df_serializer.can_convert_to_records_without_objects(index_tz_aware, 'symA'))
        }
    return _TEST_DATA
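# Each value in the fixture dict above is a 3-tuple: the dataframe, its expected
# serialization (or an expected exception class, as for 'n_dimensional_df'), and the
# can_convert_to_records_without_objects flag. A consuming test can unpack one
# directly, e.g. (sketch):
#     df, expected, can_convert = _mixed_test_data()['small']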