def generate_all_datatypes_doc():
    """Build a randomized document exercising every supported MongoDB datatype.

    Returns:
        dict: a document ready to be inserted as a MongoDB record.
    """
    # Build a BSON regex from a native pattern, then toggle off the
    # UNICODE flag that Python adds to every compiled str pattern.
    native_pattern = re.compile('.*')
    bson_regex = bson.Regex.from_native(native_pattern)
    bson_regex.flags ^= re.UNICODE

    document = {
        '_id': bson.ObjectId(),
        'double_field': random.randrange(-15, 15) / 3,
        'string_field': random_string_generator(100),
        'object_field': {
            'obj_field_1_key': 'obj_field_1_val',
            'obj_field_2_key': 'obj_field_2_val',
        },
        'array_field': [
            None,
            random.randrange(-1, 1) / 5,
            {'k': 'v'},
            'array_item',
            bson.Decimal128(
                decimal.Decimal(f'{random.randrange(-10, 10) / 6}')),
        ],
        'binary_data_field': b'a binary string',
        'object_id_field': bson.objectid.ObjectId(),
        'boolean_field': True,
        'date_field': datetime.datetime.now(),
        'null_field': None,
        'regex_field': bson_regex,
        '32_bit_integer_field': 32,
        'timestamp_field': bson.timestamp.Timestamp(int(time.time()),
                                                    random.randint(0, 100)),
        '64_bit_integer_field': 34359738368,
        'decimal_field': bson.Decimal128(
            decimal.Decimal(f'{random.randrange(-100, 100) / 33}')),
        'javaScript_field': bson.code.Code('var x, y, z;'),
        'javaScript_with_scope_field': bson.code.Code(
            'function incrementX() { x++; }', scope={'x': 1}),
        'min_key_field': bson.min_key.MinKey(),
        'max_key_field': bson.max_key.MaxKey(),
    }
    return document
def test_format_json_with_pymongo(self):
    """format_json must stringify dates, datetimes, Decimal and
    bson.Decimal128, honour ensure_ascii, and pass extra json.dumps
    options (e.g. sort_keys) through."""
    test_date = datetime.date(2006, 1, 2)
    test_datetime = datetime.datetime(2006, 1, 2, 15, 4, 5)
    chinese = "强哥"
    test_case = {
        "chinese": chinese,
        "date": test_date,
        "datetime": test_datetime,
        "decimal": decimal.Decimal("3.14"),
        "bson_decimal": bson.Decimal128("0.01"),
    }
    expected = {
        "chinese": "强哥",
        "date": "2006-01-02",
        "datetime": "2006-01-02 15:04:05",
        "decimal": "3.14",
        "bson_decimal": "0.01",
    }
    result = hutils.data_types.format_json(test_case)
    self.assertDictEqual(json.loads(result), expected)

    # Test ensure_ascii: non-ASCII text must come back \u-escaped.
    # (local renamed from `ascii` — it shadowed the builtin ascii())
    escaped = chinese.encode("unicode-escape").decode()
    result = hutils.data_types.format_json(test_case, ensure_ascii=True)
    self.assertIn(escaped, result)

    # Test json options are forwarded to json.dumps
    sorted_result = hutils.data_types.format_json(test_case, sort_keys=True)
    sorted_expected = json.dumps(expected, ensure_ascii=False, sort_keys=True)
    self.assertEqual(sorted_result, sorted_expected)
def test_format_json_with_pymongo(self):
    """format_json must stringify dates, datetimes, Decimal and
    bson.Decimal128, honour ensure_ascii, and pass extra json.dumps
    options (e.g. sort_keys) through."""
    test_date = datetime.date(2006, 1, 2)
    test_datetime = datetime.datetime(2006, 1, 2, 15, 4, 5)
    chinese = '强哥'
    test_case = {
        'chinese': chinese,
        'date': test_date,
        'datetime': test_datetime,
        'decimal': decimal.Decimal('3.14'),
        'bson_decimal': bson.Decimal128('0.01'),
    }
    expected = {
        'chinese': '强哥',
        'date': '2006-01-02',
        'datetime': '2006-01-02 15:04:05',
        'decimal': '3.14',
        'bson_decimal': '0.01',
    }
    result = hutils.data_types.format_json(test_case)
    self.assertDictEqual(json.loads(result), expected)

    # Test ensure_ascii: non-ASCII text must come back \u-escaped.
    # (local renamed from `ascii` — it shadowed the builtin ascii())
    escaped = chinese.encode('unicode-escape').decode()
    result = hutils.data_types.format_json(test_case, ensure_ascii=True)
    self.assertIn(escaped, result)

    # Test json options are forwarded to json.dumps
    sorted_result = hutils.data_types.format_json(test_case, sort_keys=True)
    sorted_expected = json.dumps(expected, ensure_ascii=False, sort_keys=True)
    self.assertEqual(sorted_result, sorted_expected)
def test_decimal_and_date(self):
    """Feeding a Timestamp and then a Decimal128 for the same field must
    accumulate both subschemas under that field's anyOf."""
    ts_row = {"a_field": bson.timestamp.Timestamp(1565897157, 1)}
    dec_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    schema = {"type": "object", "properties": {}}

    changed_date = common.row_to_schema(schema, ts_row)
    changed_decimal = common.row_to_schema(schema, dec_row)

    # Both rows mutated the schema, and the field now accepts either type.
    field_schema = {
        "anyOf": [
            {"type": "string", "format": "date-time"},
            {"type": "number", "multipleOf": decimal.Decimal('1e-34')},
            {}
        ]
    }
    expected = {"type": "object", "properties": {"a_field": field_schema}}
    self.assertTrue(changed_date)
    self.assertTrue(changed_decimal)
    self.assertEqual(expected, schema)
def setUp(self):
    """Reset the test MongoDB and seed one collection per datatype.

    Drops every existing db/collection, inserts a document exercising all
    BSON datatypes into datatype_db.datatype_coll_1, then inserts an
    out-of-range ISODate into datatype_db.invalid_datatype_coll.

    Raises:
        Exception: if any TAP_MONGODB_* connection env var is unset.
    """
    # `all([x for x in [...]])` was a redundant comprehension; pass the
    # list straight to all().
    if not all([os.getenv('TAP_MONGODB_HOST'),
                os.getenv('TAP_MONGODB_USER'),
                os.getenv('TAP_MONGODB_PASSWORD'),
                os.getenv('TAP_MONGODB_PORT'),
                os.getenv('TAP_MONGODB_DBNAME')]):
        #pylint: disable=line-too-long
        raise Exception("set TAP_MONGODB_HOST, TAP_MONGODB_USER, TAP_MONGODB_PASSWORD, TAP_MONGODB_PORT, TAP_MONGODB_DBNAME")
    with get_test_connection() as client:
        ############# Drop all dbs/collections #############
        drop_all_collections(client)

        ############# Add datatype collections #############
        # Toggle off the re.UNICODE flag Python adds to every compiled
        # str pattern, so the BSON regex carries no implicit flags.
        pattern = re.compile('.*')
        regex = bson.Regex.from_native(pattern)
        regex.flags ^= re.UNICODE
        datatype_doc = {
            "double_field": 4.3,
            "string_field": "a sample string",
            "object_field": {
                "obj_field_1_key": "obj_field_1_val",
                "obj_field_2_key": "obj_field_2_val"
            },
            "array_field": [
                "array_item_1",
                "array_item_2",
                "array_item_3"
            ],
            "binary_data_field": bson.Binary(b"a binary string"),
            "object_id_field": bson.objectid.ObjectId(b'123456789123'),
            "boolean_field": True,
            "date_field": datetime.datetime(2019, 8, 15, 19, 29, 14, 578000),
            "null_field": None,
            "regex_field": regex,
            "32_bit_integer_field": 32,
            "timestamp_field": bson.timestamp.Timestamp(1565897157, 1),
            "64_bit_integer_field": 34359738368,
            "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
            "javaScript_field": bson.code.Code("var x, y, z;"),
            "javaScript_with_scope_field": bson.code.Code("function incrementX() { x++; }", scope={"x": 1}),
            # BUG FIX: these were the bare MinKey/MaxKey *classes*;
            # pymongo can only encode instances (a class raises
            # InvalidDocument on insert), so instantiate them.
            "min_key_field": bson.min_key.MinKey(),
            "max_key_field": bson.max_key.MaxKey(),
            "uuid_field": uuid.UUID('3e139ff5-d622-45c6-bf9e-1dfec72820c4'),
            "dbref_field": bson.dbref.DBRef("some_collection",
                                            bson.objectid.ObjectId(b'123456789123'),
                                            database='some_database')
        }
        client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc)

        # NB: Insert an invalid datetime to confirm that works correctly
        run_mongodb_javascript("datatype_db", "db.invalid_datatype_coll.insert({ \"date_field\": new ISODate(\"0000-01-01T00:00:00.000Z\") });")
def import_products(products_file, db):
    """Full-refresh import of products from a JSON file into MongoDB.

    Drops the product collection, then inserts each product with its
    `id` promoted to `_id` and its price converted to Decimal128 GBP.
    """
    collection = ProductList(db).col
    collection.drop()
    # NOTE(review): ProductList is re-instantiated after the drop —
    # presumably its constructor (re)creates collection state; confirm
    # before simplifying to reuse the first handle.
    collection = ProductList(db).col

    with open(products_file) as fd:
        products = json.load(fd)

    for entry in products:
        # Promote the source `id` to Mongo's `_id` primary key.
        entry['_id'] = entry.pop('id')
        entry['currency'] = 'GBP'
        # Prices arrive as strings like "GBP12.34"; strip the currency
        # marker and store as an exact Decimal128.
        entry['price'] = bson.Decimal128(entry['price'].replace('GBP', ''))
        collection.insert_one(entry)
def convert_synthia_to_bson(item: SynthiaExample) -> Dict[str, Any]:
    """Map a SynthiaExample onto a plain dict ready for MongoDB insertion.

    Returns:
        Dict[str, Any]: document mirroring the example's fields, with
        sparsity_level wrapped in Decimal128 and timestamp coerced to int.
    """
    document: Dict[str, Any] = {
        "path": item.path,
        "is_training": item.is_training,
        "is_raw_data": item.is_raw_data,
        "sparsity_level": bson.Decimal128(item.sparsity_level),
        "sequence": item.sequence,
        "is_right_camera": item.is_right_camera,
        "camera_index": item.camera_index,
        "order_in_sequence": item.order_in_sequence,
        "timestamp": int(item.timestamp),
        # NOTE(review): utcnow() is naive UTC — presumably downstream
        # treats created_at as UTC; confirm before switching to aware.
        "created_at": datetime.datetime.utcnow()
    }
    return document
def test_decimal_then_decimal(self):
    """A second Decimal128 for an already-known field must report no
    schema change and leave the schema untouched."""
    first = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    second = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    schema = {"type": "object", "properties": {}}

    changed_first = common.row_to_schema(schema, first)
    changed_second = common.row_to_schema(schema, second)

    number_schema = {"type": "number",
                     "multipleOf": decimal.Decimal('1e-34')}
    expected = {
        "type": "object",
        "properties": {
            "a_field": {"anyOf": [number_schema, {}]}
        }
    }
    self.assertTrue(changed_first)     # first row adds the subschema
    self.assertFalse(changed_second)   # duplicate row is a no-op
    self.assertEqual(expected, schema)
def test_simple_decimal(self):
    """A Decimal128 value must register a number subschema with the
    fixed 1e-34 multipleOf granularity."""
    row = {"a_decimal": bson.Decimal128(decimal.Decimal('1.34'))}
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)

    number_schema = {"type": "number",
                     "multipleOf": decimal.Decimal('1e-34')}
    expected = {
        "type": "object",
        "properties": {
            "a_decimal": {"anyOf": [number_schema, {}]}
        }
    }
    self.assertTrue(changed)
    self.assertEqual(expected, schema)
def process_item(self, item, spider):
    """Validate a scraped product and upsert it into MongoDB keyed by URL.

    Raises:
        DropItem: when the required price/title fields are missing.
    """
    if 'preco' not in item or 'titulo' not in item:
        raise DropItem('Invalid item (required fields missing!): %s' % item)

    # Build the replacement document up front, then upsert by URL so
    # re-scrapes of the same page overwrite the previous record.
    replacement = {
        'url': item['url'],
        'categoria': item['categoria'],
        'titulo': item['titulo'],
        'disponivel': item.get('disponivel', None),
        'moeda': item.get('moeda', 'R$'),
        'preco': bson.Decimal128(item['preco']),
        'descricao': item['descricao'],
        'caracteristicas': item['caracteristicas'],
    }
    self.db[self.collection].find_one_and_replace(
        {'url': item['url']}, replacement, upsert=True)
    return item
def test_array_multiple_types(self):
    """An array holding both a Timestamp and a Decimal128 must yield an
    array schema whose items accept either subtype."""
    row = {
        "foo": [
            bson.timestamp.Timestamp(1565897157, 1),
            bson.Decimal128(decimal.Decimal('1.34')),
        ]
    }
    schema = {"type": "object", "properties": {}}

    changed = common.row_to_schema(schema, row)

    item_options = {
        "anyOf": [
            {"type": "string", "format": "date-time"},
            {"type": "number", "multipleOf": decimal.Decimal('1e-34')},
            {}
        ]
    }
    expected = {
        "type": "object",
        "properties": {
            "foo": {
                "anyOf": [
                    {"type": "array", "items": item_options},
                    {}
                ]
            }
        }
    }
    self.assertTrue(changed)
    self.assertEqual(expected, schema)
def test_decimal_then_float(self):
    """A plain float after a Decimal128 must widen the field to a bare
    number schema (the multipleOf constraint is dropped)."""
    dec_row = {"a_field": bson.Decimal128(decimal.Decimal('1.34'))}
    float_row = {"a_field": 1.34}
    schema = {"type": "object", "properties": {}}

    changed_decimal = common.row_to_schema(schema, dec_row)
    changed_float = common.row_to_schema(schema, float_row)

    expected = {
        "type": "object",
        "properties": {
            "a_field": {"anyOf": [{"type": "number"}, {}]}
        }
    }
    self.assertTrue(changed_decimal)
    self.assertTrue(changed_float)
    self.assertEqual(expected, schema)
def setUp(self):
    """Drop every existing db/collection and seed the test databases.

    Creates:
      * simple_db with two collections (50 and 100 docs) plus a view
      * admin.admin_coll_1 (50 docs)
      * simple_db_2 with same-named / case-variant collections
      * special_db with non-ASCII and punctuated collection names
      * datatype_db.datatype_coll_1 with one doc covering all datatypes

    Raises:
        Exception: if any TAP_MONGODB_* connection env var is unset.
    """
    # `all([x for x in [...]])` was a redundant comprehension; pass the
    # list straight to all().
    if not all([
            os.getenv('TAP_MONGODB_HOST'),
            os.getenv('TAP_MONGODB_USER'),
            os.getenv('TAP_MONGODB_PASSWORD'),
            os.getenv('TAP_MONGODB_PORT'),
            os.getenv('TAP_MONGODB_DBNAME')
    ]):
        #pylint: disable=line-too-long
        raise Exception(
            "set TAP_MONGODB_HOST, TAP_MONGODB_USER, TAP_MONGODB_PASSWORD, TAP_MONGODB_PORT, TAP_MONGODB_DBNAME"
        )
    with get_test_connection() as client:
        # drop all dbs/collections
        drop_all_collections(client)

        # simple_coll_1 has 50 documents
        client["simple_db"]["simple_coll_1"].insert_many(
            generate_simple_coll_docs(50))

        # simple_coll_2 has 100 documents
        client["simple_db"]["simple_coll_2"].insert_many(
            generate_simple_coll_docs(100))

        # admin_coll_1 has 50 documents
        client["admin"]["admin_coll_1"].insert_many(
            generate_simple_coll_docs(50))

        # create view on simple_coll_1
        client["simple_db"].command(
            bson.son.SON([("create", "simple_view_1"),
                          ("viewOn", "simple_coll_1"), ("pipeline", [])]))

        # collections with same names as others in different dbs
        client["simple_db_2"]["simple_coll_1"].insert_many(
            generate_simple_coll_docs(50))
        client["simple_db_2"]["SIMPLE_COLL_1"].insert_many(
            generate_simple_coll_docs(50))

        # collections with special characters in names
        client["special_db"]["hebrew_ישראל"].insert_many(
            generate_simple_coll_docs(50))
        client['special_db']['hello!world?'].insert_many(
            generate_simple_coll_docs(50))

        # Add datatype collections. Toggle off the re.UNICODE flag Python
        # adds to every compiled str pattern so the BSON regex carries no
        # implicit flags.
        pattern = re.compile('.*')
        regex = bson.Regex.from_native(pattern)
        regex.flags ^= re.UNICODE
        datatype_doc = {
            "double_field": 4.3,
            "string_field": "a sample string",
            "object_field": {
                "obj_field_1_key": "obj_field_1_val",
                "obj_field_2_key": "obj_field_2_val"
            },
            "array_field": ["array_item_1", "array_item_2", "array_item_3"],
            "binary_data_field": b"a binary string",
            "object_id_field": bson.objectid.ObjectId(b'123456789123'),
            "boolean_field": True,
            "date_field": datetime.datetime.now(),
            "null_field": None,
            "regex_field": regex,
            "32_bit_integer_field": 32,
            "timestamp_field":
            bson.timestamp.Timestamp(int(time.time()), 1),
            "64_bit_integer_field": 34359738368,
            "decimal_field": bson.Decimal128(decimal.Decimal('1.34')),
            "javaScript_field": bson.code.Code("var x, y, z;"),
            "javaScript_with_scope_field":
            bson.code.Code("function incrementX() { x++; }", scope={"x": 1}),
            # BUG FIX: these were the bare MinKey/MaxKey *classes*;
            # pymongo can only encode instances (a class raises
            # InvalidDocument on insert), so instantiate them.
            "min_key_field": bson.min_key.MinKey(),
            "max_key_field": bson.max_key.MaxKey()
        }
        client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc)
def transform_python(self, value):
    """Wrap a Python value in a bson.Decimal128 for BSON storage."""
    return bson.Decimal128(value)
"boolean_field": True, "date_field": datetime.datetime.now(), "null_field": None, "regex_field": regex, "32_bit_integer_field": 32, "timestamp_field": bson.timestamp.Timestamp(int(time.time()), 1), "64_bit_integer_field": 34359738368, "decimal_field": bson.Decimal128(decimal.Decimal('1.34')), "javaScript_field": bson.code.Code("var x, y, z;"), "javaScript_with_scope_field": bson.code.Code("function incrementX() { x++; }", scope={"x": 1}), "min_key_field": bson.min_key.MinKey, "max_key_field": bson.max_key.MaxKey } client["datatype_db"]["datatype_coll_1"].insert_one(datatype_doc) client["datatype_db"]["datatype_coll_2"].insert_one(datatype_doc) print("\nPrinting database contents") for db_name in client.list_database_names():
def test_replicate_mongodb_to_sf(self):
    """Replicate mongodb to Snowflake

    End-to-end flow: a first run (fastsync + singer) does the initial
    load, then an insert/delete/update batch is replicated by a
    singer-only run and verified against the Snowflake target.
    """

    def assert_columns_exist(table):
        """Helper inner function to test if every table and column exists in the target"""
        assertions.assert_cols_in_table(
            self.run_query_target_snowflake,
            'ppw_e2e_tap_mongodb',
            table,
            [
                '_ID',
                'DOCUMENT',
                '_SDC_EXTRACTED_AT',
                '_SDC_BATCHED_AT',
                '_SDC_DELETED_AT',
            ],
        )

    def assert_row_counts_equal(target_schema, table, count_in_source):
        # Compare the source-side document count against the target
        # table's row count in Snowflake.
        assert (count_in_source == self.run_query_target_snowflake(
            f'select count(_id) from {target_schema}.{table}')[0][0])

    # Run tap first time - fastsync and singer should be triggered
    assertions.assert_run_tap_success(TAP_MONGODB_ID, TARGET_ID,
                                      ['fastsync', 'singer'])
    assert_columns_exist('listings')
    assert_columns_exist('my_collection')
    assert_columns_exist('all_datatypes')

    listing_count = self.mongodb_con['listings'].count_documents({})
    my_coll_count = self.mongodb_con['my_collection'].count_documents({})
    all_datatypes_count = self.mongodb_con[
        'all_datatypes'].count_documents({})

    # Initial load must land every source row in the target.
    assert_row_counts_equal('ppw_e2e_tap_mongodb', 'listings',
                            listing_count)
    assert_row_counts_equal('ppw_e2e_tap_mongodb', 'my_collection',
                            my_coll_count)
    assert_row_counts_equal('ppw_e2e_tap_mongodb', 'all_datatypes',
                            all_datatypes_count)

    # Insert documents exercising uuid/timestamp/regex/decimal/nested-json
    # values so the follow-up singer run has fresh changes to replicate.
    result_insert = self.mongodb_con.my_collection.insert_many([
        {
            'age': randint(10, 30),
            'id': 1001,
            'uuid': uuid.uuid4(),
            'ts': Timestamp(12030, 500),
        },
        {
            'date': datetime.utcnow(),
            'id': 1002,
            'uuid': uuid.uuid4(),
            'regex': bson.Regex(r'^[A-Z]\\w\\d{2,6}.*$'),
        },
        {
            'uuid': uuid.uuid4(),
            'id': 1003,
            'decimal':
            bson.Decimal128(decimal.Decimal('5.64547548425446546546644')),
            'nested_json': {
                'a': 1,
                'b': 3,
                'c': {
                    'key': bson.datetime.datetime(2020, 5, 3, 10, 0, 0)
                },
            },
        },
    ])
    my_coll_count += len(result_insert.inserted_ids)

    # Delete one of the freshly inserted docs and track the count delta.
    result_del = self.mongodb_con.my_collection.delete_one(
        {'_id': result_insert.inserted_ids[0]})
    my_coll_count -= result_del.deleted_count

    # Update every remaining document so the replicated JSON changes.
    result_update = self.mongodb_con.my_collection.update_many(
        {}, {'$set': {
            'id': 0
        }})

    # Second run - only singer should pick up the incremental changes.
    assertions.assert_run_tap_success(TAP_MONGODB_ID, TARGET_ID, ['singer'])

    # The number of updated docs must match the rows whose VARIANT
    # document now has id = 0 in Snowflake.
    assert (result_update.modified_count == self.run_query_target_snowflake(
        'select count(_id) from ppw_e2e_tap_mongodb.my_collection where document:id = 0'
    )[0][0])

    assert_row_counts_equal('ppw_e2e_tap_mongodb', 'my_collection',
                            my_coll_count)
def adapt_decimalfield_value(self, value, max_digits=None, decimal_places=None):
    """Adapt via the parent implementation, then wrap in Decimal128.

    Returns None unchanged so nullable decimal columns round-trip.
    """
    if value is None:
        return None
    adapted = super().adapt_decimalfield_value(value, max_digits,
                                               decimal_places)
    return bson.Decimal128(adapted)
def test_array_nested(self):
    """Nested arrays/objects: the first row registers date-time and
    decimal subschemas at every nesting level; a second row of plain
    strings must report no schema change, and its transformed form must
    pass jsonschema validation against the accumulated schema."""
    row = {
        "foo": [
            [
                bson.timestamp.Timestamp(1565897157, 1),
                bson.Decimal128(decimal.Decimal('1.34'))
            ],
            {
                "bar": bson.timestamp.Timestamp(1565897157, 1),
                "bat": bson.Decimal128(decimal.Decimal('1.34'))
            }
        ]
    }
    # Same shape as `row` but string-valued everywhere; strings always
    # match the empty {} subschema, so this must not mutate the schema.
    row_2 = {
        "bar": "1",
        "foo": [
            ["bob", "roger"],
            {
                "bar": "bob",
                "bat": "roger"
            }
        ]
    }
    schema = {"type": "object", "properties": {}}
    changed = common.row_to_schema(schema, row)
    changed_2 = common.row_to_schema(schema, row_2)
    # Note: no "bar" property expected — row_2 changed nothing.
    expected = {
        "type": "object",
        "properties": {
            "foo": {
                "anyOf": [
                    {
                        "type": "array",
                        "items": {
                            "anyOf": [
                                {
                                    "type": "array",
                                    "items": {
                                        "anyOf": [
                                            {
                                                "type": "string",
                                                "format": "date-time"
                                            },
                                            {
                                                "type": "number",
                                                "multipleOf": decimal.Decimal('1e-34')
                                            },
                                            {}
                                        ]
                                    }
                                },
                                {
                                    "type": "object",
                                    "properties": {
                                        "bar": {
                                            "anyOf": [
                                                {
                                                    "type": "string",
                                                    "format": "date-time"
                                                },
                                                {}
                                            ]
                                        },
                                        "bat": {
                                            "anyOf": [
                                                {
                                                    "type": "number",
                                                    "multipleOf": decimal.Decimal('1e-34')
                                                },
                                                {}
                                            ]
                                        }
                                    }
                                },
                                {}
                            ]
                        }
                    },
                    {}
                ]
            }
        }
    }
    # Transform row_2 the way a record would be before emission,
    # filtering MinKey/MaxKey values — presumably because they have no
    # transformable JSON representation (TODO confirm against
    # transform_value).
    singer_row = {k:common.transform_value(v, [k]) for k, v in row_2.items() if type(v) not in [bson.min_key.MinKey, bson.max_key.MaxKey]}
    # High precision so Decimal multipleOf validation doesn't trip on
    # rounding.
    decimal.getcontext().prec=100000
    validate(instance=singer_row, schema=schema)
    self.assertTrue(changed)
    self.assertFalse(changed_2)
    self.assertEqual(expected, schema)
def test_replicate_mongodb_to_sf(self): """ Test replicate MongoDB to Snowflake """ # Run tap first time - fastsync and singer should be triggered assertions.assert_run_tap_success(self.tap_id, self.target_id, ['fastsync', 'singer']) self.assert_columns_exist('listings') self.assert_columns_exist('my_collection') self.assert_columns_exist('all_datatypes') listing_count = self.mongodb_con['listings'].count_documents({}) my_coll_count = self.mongodb_con['my_collection'].count_documents({}) all_datatypes_count = self.mongodb_con[ 'all_datatypes'].count_documents({}) self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'listings', listing_count, ) self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'my_collection', my_coll_count, ) self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'all_datatypes', all_datatypes_count, ) result_insert = self.mongodb_con.my_collection.insert_many([ { 'age': randint(10, 30), 'id': 1001, 'uuid': uuid.uuid4(), 'ts': bson.Timestamp(12030, 500), }, { 'date': datetime.utcnow(), 'id': 1002, 'uuid': uuid.uuid4(), 'regex': bson.Regex(r'^[A-Z]\\w\\d{2,6}.*$'), }, { 'uuid': uuid.uuid4(), 'id': 1003, 'decimal': bson.Decimal128(decimal.Decimal('5.64547548425446546546644')), 'nested_json': { 'a': 1, 'b': 3, 'c': { 'key': bson.datetime.datetime(2020, 5, 3, 10, 0, 0) }, }, }, ]) my_coll_count += len(result_insert.inserted_ids) result_del = self.mongodb_con.my_collection.delete_one( {'_id': result_insert.inserted_ids[0]}) my_coll_count -= result_del.deleted_count result_update = self.mongodb_con.my_collection.update_many( {}, {'$set': { 'id': 0 }}) assertions.assert_run_tap_success(self.tap_id, self.target_id, ['singer']) self.assertEqual( result_update.modified_count, self.e2e_env.run_query_target_snowflake( f'select count(_id) from ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}.my_collection' f' where document:id = 0')[0][0], ) 
self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'my_collection', my_coll_count, )