def test_upsert_records_does_not_replace_with_null_values_in_meta_data( postgres_with_load_and_image_table, tmpdir ): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID = 'a' PROVIDER = 'images_provider' IMG_URL = 'https://images.com/a/img.jpg' LICENSE = 'by' META_DATA_A = '{"description": "a cool picture", "test": "should stay"}' META_DATA_B = '{"description": "I updated my description", "test": null}' load_data_query_a = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}',null,'{IMG_URL}',null,null,null,null,'{LICENSE}',null,null," f"null,null,'{META_DATA_A}',null,null,'{PROVIDER}',null" f");" ) load_data_query_b = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}',null,'{IMG_URL}',null,null,null,null,'{LICENSE}',null,null," f"null,null,'{META_DATA_B}',null,null,'{PROVIDER}',null" f");" ) postgres_with_load_and_image_table.cursor.execute(load_data_query_a) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};" ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute(load_data_query_b) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) actual_rows = postgres_with_load_and_image_table.cursor.fetchall() actual_row = actual_rows[0] assert len(actual_rows) == 1 expected_meta_data = { 'description': json.loads(META_DATA_B)['description'], 'test': json.loads(META_DATA_A)['test'] } assert actual_row[22] == expected_meta_data
def load_local_data(output_dir, postgres_conn_id, identifier): tsv_file_name = paths.get_staged_file(output_dir, identifier) sql.load_local_data_to_intermediate_table( postgres_conn_id, tsv_file_name, identifier ) sql.upsert_records_to_image_table(postgres_conn_id, identifier)
def test_update_flickr_sub_providers(postgres_with_load_and_image_table): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID_A = 'a' FID_B = 'b' IMG_URL_A = 'https://images.com/a/img.jpg' IMG_URL_B = 'https://images.com/b/img.jpg' CREATOR_URL_A = 'https://www.flickr.com/photos/29988733@N04' CREATOR_URL_B = 'https://www.flickr.com/photos/other_user' PROVIDER = 'flickr' LICENSE = 'by' TAGS = [ {'name': 'tagone', 'provider': 'test'}, {'name': 'tagtwo', 'provider': 'test'} ] insert_data_query = ( f"INSERT INTO {load_table} VALUES" f"('{FID_A}',null,'{IMG_URL_A}',null,null,null,null,'{LICENSE}',null," f"null,'{CREATOR_URL_A}',null,null,'{json.dumps(TAGS)}',null," f"'{PROVIDER}','{PROVIDER}')," f"('{FID_B}',null,'{IMG_URL_B}',null,null,null,null,'{LICENSE}',null," f"null,'{CREATOR_URL_B}',null,null,'{json.dumps(TAGS)}',null," f"'{PROVIDER}','{PROVIDER}');" ) postgres_with_load_and_image_table.cursor.execute(insert_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};" ) postgres_with_load_and_image_table.connection.commit() sql.update_flickr_sub_providers( postgres_conn_id, image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) actual_rows = postgres_with_load_and_image_table.cursor.fetchall() assert len(actual_rows) == 2 for actual_row in actual_rows: if actual_row[6] == 'a': assert actual_row[5] == 'nasa' else: assert actual_row[6] == 'b' and actual_row[5] == 'flickr'
def test_upsert_records_inserts_one_record_to_empty_image_table( postgres_with_load_and_image_table, tmpdir): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID = 'a' LAND_URL = 'https://images.com/a' IMG_URL = 'https://images.com/a/img.jpg' THM_URL = 'https://images.com/a/img_small.jpg' WIDTH = 1000 HEIGHT = 500 FILESIZE = 2000 LICENSE = 'cc0' VERSION = '1.0' CREATOR = 'Alice' CREATOR_URL = 'https://alice.com' TITLE = 'My Great Pic' META_DATA = '{"description": "what a cool picture"}' TAGS = '["fun", "great"]' WATERMARKED = 'f' PROVIDER = 'images_provider' SOURCE = 'images_source' load_data_query = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}','{LAND_URL}','{IMG_URL}','{THM_URL}','{WIDTH}','{HEIGHT}'," f"'{FILESIZE}','{LICENSE}','{VERSION}','{CREATOR}','{CREATOR_URL}'," f"'{TITLE}','{META_DATA}','{TAGS}','{WATERMARKED}','{PROVIDER}'," f"'{SOURCE}'" f");") postgres_with_load_and_image_table.cursor.execute(load_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table(postgres_conn_id, identifier, image_table=image_table) postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};") actual_rows = postgres_with_load_and_image_table.cursor.fetchall() actual_row = actual_rows[0] assert len(actual_rows) == 1 assert actual_row[5] == PROVIDER assert actual_row[6] == SOURCE assert actual_row[7] == FID assert actual_row[8] == LAND_URL assert actual_row[9] == IMG_URL assert actual_row[10] == THM_URL assert actual_row[11] == WIDTH assert actual_row[12] == HEIGHT assert actual_row[14] == LICENSE assert actual_row[15] == VERSION assert actual_row[16] == CREATOR assert actual_row[17] == CREATOR_URL assert actual_row[18] == TITLE assert actual_row[22] == json.loads(META_DATA) assert actual_row[23] == json.loads(TAGS) assert actual_row[24] is False
def test_upsert_records_replaces_updated_on_and_last_synced_with_source( postgres_with_load_and_image_table, tmpdir ): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID = 'a' LAND_URL = 'https://images.com/a' IMG_URL = 'images.com/a/img.jpg' LICENSE = 'cc0' VERSION = '1.0' PROVIDER = 'images' load_data_query = ( f"INSERT INTO {load_table} (" f"foreign_identifier, foreign_landing_url, url," f" license, license_version, provider, source" f") VALUES (" f"'{FID}','{LAND_URL}','{IMG_URL}'," f"'{LICENSE}','{VERSION}','{PROVIDER}', '{PROVIDER}'" f");" ) postgres_with_load_and_image_table.cursor.execute(load_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) original_row = postgres_with_load_and_image_table.cursor.fetchall()[0] original_updated_on = original_row[2] original_last_synced = original_row[20] time.sleep(0.001) sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) updated_result = postgres_with_load_and_image_table.cursor.fetchall() updated_row = updated_result[0] updated_updated_on = updated_row[2] updated_last_synced = updated_row[20] assert len(updated_result) == 1 assert updated_updated_on > original_updated_on assert updated_last_synced > original_last_synced
def test_update_europeana_sub_providers(postgres_with_load_and_image_table): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID_A = 'a' FID_B = 'b' IMG_URL_A = 'https://images.com/a/img.jpg' IMG_URL_B = 'https://images.com/b/img.jpg' PROVIDER = 'europeana' LICENSE = 'by-nc-nd' META_DATA_A = { 'country': ['Sweden'], 'dataProvider': ['Wellcome Collection'], 'description': 'A', 'license_url': 'http://creativecommons.org/licenses/by-nc-nd/4.0/' } META_DATA_B = { 'country': ['Sweden'], 'dataProvider': ['Other Collection'], 'description': 'B', 'license_url': 'http://creativecommons.org/licenses/by-nc-nd/4.0/' } insert_data_query = ( f"INSERT INTO {load_table} VALUES" f"('{FID_A}',null,'{IMG_URL_A}',null,null,null,null,'{LICENSE}',null," f"null,null,null,'{json.dumps(META_DATA_A)}',null,null," f"'{PROVIDER}','{PROVIDER}')," f"('{FID_B}',null,'{IMG_URL_B}',null,null,null,null,'{LICENSE}',null," f"null,null,null,'{json.dumps(META_DATA_B)}',null,null," f"'{PROVIDER}','{PROVIDER}');") postgres_with_load_and_image_table.cursor.execute(insert_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table(postgres_conn_id, identifier, image_table=image_table) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};") postgres_with_load_and_image_table.connection.commit() sql.update_europeana_sub_providers(postgres_conn_id, image_table) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};") actual_rows = postgres_with_load_and_image_table.cursor.fetchall() assert len(actual_rows) == 2 for actual_row in actual_rows: if actual_row[6] == 'a': assert actual_row[5] == 'wellcome_collection' else: assert actual_row[6] == 'b' and actual_row[5] == 'europeana'
def load_local_data(output_dir, postgres_conn_id, identifier, overwrite=False): tsv_file_name = paths.get_staged_file(output_dir, identifier) ingestion_column.check_and_fix_tsv_file(tsv_file_name) sql.load_local_data_to_intermediate_table( postgres_conn_id, tsv_file_name, identifier ) if overwrite is True: sql.overwrite_records_in_image_table(postgres_conn_id, identifier) else: sql.upsert_records_to_image_table(postgres_conn_id, identifier)
def test_update_smithsonian_sub_providers(postgres_with_load_and_image_table): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID_A = 'a' FID_B = 'b' IMG_URL_A = 'https://images.com/a/img.jpg' IMG_URL_B = 'https://images.com/b/img.jpg' PROVIDER = 'smithsonian' LICENSE = 'by-nc-nd' META_DATA_A = { 'unit_code': 'SIA', 'data_source': 'Smithsonian Institution Archives' } META_DATA_B = { 'unit_code': 'NMNHBIRDS', 'data_source': 'NMNH - Vertebrate Zoology - Birds Division' } insert_data_query = ( f"INSERT INTO {load_table} VALUES" f"('{FID_A}',null,'{IMG_URL_A}',null,null,null,null,'{LICENSE}',null," f"null,null,null,'{json.dumps(META_DATA_A)}',null,null," f"'{PROVIDER}','{PROVIDER}')," f"('{FID_B}',null,'{IMG_URL_B}',null,null,null,null,'{LICENSE}',null," f"null,null,null,'{json.dumps(META_DATA_B)}',null,null," f"'{PROVIDER}','{PROVIDER}');") postgres_with_load_and_image_table.cursor.execute(insert_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table(postgres_conn_id, identifier, image_table=image_table) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};") postgres_with_load_and_image_table.connection.commit() sql.update_smithsonian_sub_providers(postgres_conn_id, image_table) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};") actual_rows = postgres_with_load_and_image_table.cursor.fetchall() assert len(actual_rows) == 2 for actual_row in actual_rows: if actual_row[6] == 'a': assert actual_row[5] == 'smithsonian_institution_archives' else: assert actual_row[6] == 'b' and actual_row[5] == \ 'smithsonian_national_museum_of_natural_history'
def load_s3_data( bucket, aws_conn_id, postgres_conn_id, identifier ): tsv_key = s3.get_staged_s3_object(identifier, bucket, aws_conn_id) sql.load_s3_data_to_intermediate_table( postgres_conn_id, bucket, tsv_key, identifier ) sql.upsert_records_to_image_table(postgres_conn_id, identifier)
def test_upsert_records_inserts_two_records_to_image_table( postgres_with_load_and_image_table, tmpdir ): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID_A = 'a' FID_B = 'b' LAND_URL_A = 'https://images.com/a' LAND_URL_B = 'https://images.com/b' IMG_URL_A = 'images.com/a/img.jpg' IMG_URL_B = 'images.com/b/img.jpg' LICENSE = 'cc0' VERSION = '1.0' PROVIDER = 'images' test_rows = [ (FID_A, LAND_URL_A, IMG_URL_A, LICENSE, VERSION, PROVIDER), (FID_B, LAND_URL_B, IMG_URL_B, LICENSE, VERSION, PROVIDER) ] for r in test_rows: load_data_query = ( f"INSERT INTO {load_table} (" f"foreign_identifier, foreign_landing_url, url," f" license, license_version, provider, source" f") VALUES (" f"'{r[0]}', '{r[1]}', '{r[2]}'," f"'{r[3]}', '{r[4]}', '{r[5]}', '{r[5]}'" f");" ) postgres_with_load_and_image_table.cursor.execute(load_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) actual_rows = postgres_with_load_and_image_table.cursor.fetchall() assert actual_rows[0][7] == FID_A assert actual_rows[1][7] == FID_B
def load_s3_data( bucket, aws_conn_id, postgres_conn_id, identifier, overwrite=False, ): tsv_key = s3.get_staged_s3_object(identifier, bucket, aws_conn_id) sql.load_s3_data_to_intermediate_table( postgres_conn_id, bucket, tsv_key, identifier ) if overwrite is True: sql.overwrite_records_in_image_table(postgres_conn_id, identifier) else: sql.upsert_records_to_image_table(postgres_conn_id, identifier)
def test_upsert_records_does_not_replace_tags_with_null( postgres_with_load_and_image_table, tmpdir ): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID = 'a' PROVIDER = 'images_provider' IMG_URL = 'https://images.com/a/img.jpg' LICENSE = 'by' TAGS = [ {'name': 'tagone', 'provider': 'test'}, {'name': 'tagtwo', 'provider': 'test'} ] load_data_query_a = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}',null,'{IMG_URL}',null,null,null,null,'{LICENSE}',null,null," f"null,null,null,'{json.dumps(TAGS)}',null,'{PROVIDER}',null" f");" ) load_data_query_b = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}',null,'{IMG_URL}',null,null,null,null,'{LICENSE}',null,null," f"null,null,null,null,null,'{PROVIDER}',null" f");" ) postgres_with_load_and_image_table.cursor.execute(load_data_query_a) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};" ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute(load_data_query_b) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) actual_rows = postgres_with_load_and_image_table.cursor.fetchall() actual_row = actual_rows[0] assert len(actual_rows) == 1 expect_tags = [ {'name': 'tagone', 'provider': 'test'}, {'name': 'tagtwo', 'provider': 'test'}, ] actual_tags = actual_row[23] assert len(actual_tags) == 2 assert all([t in expect_tags for t in actual_tags]) assert all([t in actual_tags for t in expect_tags])
def test_upsert_records_does_not_replace_with_nulls( postgres_with_load_and_image_table, tmpdir ): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID = 'a' PROVIDER = 'images_provider' SOURCE = 'images_source' WATERMARKED = 'f' IMG_URL = 'https://images.com/a/img.jpg' FILESIZE = 2000 TAGS = '["fun", "great"]' LAND_URL_A = 'https://images.com/a' THM_URL_A = 'https://images.com/a/img_small.jpg' WIDTH_A = 1000 HEIGHT_A = 500 LICENSE_A = 'by' VERSION_A = '4.0' CREATOR_A = 'Alice' CREATOR_URL_A = 'https://alice.com' TITLE_A = 'My Great Pic' META_DATA_A = '{"description": "what a cool picture"}' LAND_URL_B = 'https://images.com/b' LICENSE_B = 'cc0' VERSION_B = '1.0' load_data_query_a = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}','{LAND_URL_A}','{IMG_URL}','{THM_URL_A}'," f"'{WIDTH_A}','{HEIGHT_A}','{FILESIZE}','{LICENSE_A}','{VERSION_A}'," f"'{CREATOR_A}','{CREATOR_URL_A}','{TITLE_A}','{META_DATA_A}'," f"'{TAGS}','{WATERMARKED}','{PROVIDER}','{SOURCE}'" f");" ) postgres_with_load_and_image_table.cursor.execute(load_data_query_a) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() load_data_query_b = ( f"INSERT INTO {load_table} VALUES(" f"'{FID}','{LAND_URL_B}','{IMG_URL}',null," f"null,null,null,'{LICENSE_B}','{VERSION_B}'," f"null,null,null,null," f"'{TAGS}',null,'{PROVIDER}','{SOURCE}'" f");" ) postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};" ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute(load_data_query_b) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) actual_rows = postgres_with_load_and_image_table.cursor.fetchall() actual_row = actual_rows[0] assert len(actual_rows) == 1 assert actual_row[8] == LAND_URL_B assert actual_row[10] == THM_URL_A assert actual_row[11] == WIDTH_A assert actual_row[12] == HEIGHT_A assert actual_row[14] == LICENSE_B assert actual_row[15] == VERSION_B assert actual_row[16] == CREATOR_A assert actual_row[17] == CREATOR_URL_A assert actual_row[18] == TITLE_A assert actual_row[22] == json.loads(META_DATA_A)
def test_image_expiration(postgres_with_load_and_image_table): postgres_conn_id = POSTGRES_CONN_ID load_table = TEST_LOAD_TABLE image_table = TEST_IMAGE_TABLE identifier = TEST_ID FID_A = 'a' FID_B = 'b' IMG_URL_A = 'https://images.com/a/img.jpg' IMG_URL_B = 'https://images.com/b/img.jpg' PROVIDER_A = 'smithsonian' PROVIDER_B = 'flickr' LICENSE = 'by-nc-nd' insert_data_query = ( f"INSERT INTO {load_table} VALUES" f"('{FID_A}',null,'{IMG_URL_A}',null,null,null,null,'{LICENSE}',null," f"null,null,null,null,null,null," f"'{PROVIDER_A}','{PROVIDER_A}')," f"('{FID_B}',null,'{IMG_URL_B}',null,null,null,null,'{LICENSE}',null," f"null,null,null,null,null,null," f"'{PROVIDER_B}','{PROVIDER_B}');" ) postgres_with_load_and_image_table.cursor.execute(insert_data_query) postgres_with_load_and_image_table.connection.commit() sql.upsert_records_to_image_table( postgres_conn_id, identifier, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"DELETE FROM {load_table};" ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"UPDATE {image_table} SET updated_on = NOW() - INTERVAL '1 year' " f"WHERE provider = 'flickr';" ) postgres_with_load_and_image_table.connection.commit() sql.expire_old_images( postgres_conn_id, PROVIDER_A, image_table=image_table ) sql.expire_old_images( postgres_conn_id, PROVIDER_B, image_table=image_table ) postgres_with_load_and_image_table.connection.commit() postgres_with_load_and_image_table.cursor.execute( f"SELECT * FROM {image_table};" ) actual_rows = postgres_with_load_and_image_table.cursor.fetchall() assert len(actual_rows) == 2 for actual_row in actual_rows: if actual_row[6] == 'a': assert not actual_row[22] else: assert actual_row[6] == 'b' and actual_row[22]