def test_table_query():
    """Test command line ability to do table query."""

    cols = [
        synapseclient.Column(name='name', columnType='STRING', maximumSize=1000),
        synapseclient.Column(name='foo', columnType='STRING', enumValues=['foo', 'bar', 'bat']),
        synapseclient.Column(name='x', columnType='DOUBLE'),
        synapseclient.Column(name='age', columnType='INTEGER'),
        synapseclient.Column(name='cartoon', columnType='BOOLEAN'),
    ]

    project_entity = project

    schema1 = syn.store(synapseclient.Schema(name=str(uuid.uuid4()), columns=cols, parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [
        ['Chris', 'bar', 11.23, 45, False],
        ['Jen', 'bat', 14.56, 40, False],
        ['Jane', 'bat', 17.89, 6, False],
        ['Henry', 'bar', 10.12, 1, False],
    ]

    syn.store(synapseclient.RowSet(schema=schema1, rows=[synapseclient.Row(r) for r in data1]))

    # Test query
    output = run('synapse', '--skip-checks', 'query', 'select * from %s' % schema1.id)
    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows),)

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + [col.name for col in cols]
    assert my_headers_set == expected_headers_set, "%r != %r" % (my_headers_set, expected_headers_set)
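
# For reference, the CLI `synapse query` exercised above maps onto the client's
# programmatic table query API (`syn.tableQuery`), which this module also uses in
# test_migrate_project. A minimal sketch, assuming an authenticated `syn` client and an
# existing table id; this helper is illustrative only and is not part of the test above.
def _example_table_query(syn, table_id):
    # Returns an iterable query result; each row is prefixed with ROW_ID and ROW_VERSION,
    # matching the header columns the CLI output assertion checks for.
    return syn.tableQuery("select * from %s" % table_id)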
def test_migrate_project(request, syn, schedule_for_cleanup, storage_location_id):
    test_name = request.node.name
    project_name = "{}-{}".format(test_name, uuid.uuid4())
    project = synapseclient.Project(name=project_name)
    project_entity = syn.store(project)

    file_0_path = _create_temp_file()
    schedule_for_cleanup(file_0_path)
    file_0_name = "{}-{}".format(test_name, 1)
    file_0 = synapseclient.File(name=file_0_name, path=file_0_path, parent=project_entity)
    file_0_entity = syn.store(file_0)
    default_storage_location_id = file_0_entity._file_handle['storageLocationId']

    folder_1_name = "{}-{}-{}".format(test_name, 1, uuid.uuid4())
    folder_1 = synapseclient.Folder(parent=project_entity, name=folder_1_name)
    folder_1_entity = syn.store(folder_1)

    file_1_path = _create_temp_file()
    schedule_for_cleanup(file_1_path)
    file_1_name = "{}-{}".format(test_name, 1)
    file_1 = synapseclient.File(name=file_1_name, path=file_1_path, parent=folder_1_entity)
    file_1_entity = syn.store(file_1)

    file_2_path = _create_temp_file()
    schedule_for_cleanup(file_2_path)
    file_2_name = "{}-{}".format(test_name, 2)
    file_2 = synapseclient.File(name=file_2_name, path=file_2_path, parent=folder_1_entity)
    file_2_entity = syn.store(file_2)

    # file 3 shares the same file handle id as file 1
    file_3_path = file_1_path
    file_3_name = "{}-{}".format(test_name, 3)
    file_3 = synapseclient.File(name=file_3_name, path=file_3_path, parent=folder_1_entity)
    file_3.dataFileHandleId = file_1_entity.dataFileHandleId
    file_3_entity = syn.store(file_3)

    table_1_cols = [
        synapseclient.Column(name='file_col_1', columnType='FILEHANDLEID'),
        synapseclient.Column(name='num', columnType='INTEGER'),
        synapseclient.Column(name='file_col_2', columnType='FILEHANDLEID'),
    ]
    table_1 = syn.store(synapseclient.Schema(name=test_name, columns=table_1_cols, parent=folder_1_entity))

    table_1_file_col_1_1 = _create_temp_file()
    table_1_file_handle_1 = syn.uploadFileHandle(table_1_file_col_1_1, table_1)
    table_1_file_col_1_2 = _create_temp_file()
    table_1_file_handle_2 = syn.uploadFileHandle(table_1_file_col_1_2, table_1)
    table_1_file_col_2_1 = _create_temp_file()
    table_1_file_handle_3 = syn.uploadFileHandle(table_1_file_col_2_1, table_1)
    table_1_file_col_2_2 = _create_temp_file()
    table_1_file_handle_4 = syn.uploadFileHandle(table_1_file_col_2_2, table_1)

    data = [
        [table_1_file_handle_1['id'], 1, table_1_file_handle_2['id']],
        [table_1_file_handle_3['id'], 2, table_1_file_handle_4['id']],
    ]

    table_1_entity = syn.store(synapseclient.RowSet(schema=table_1, rows=[synapseclient.Row(r) for r in data]))

    db_path = tempfile.NamedTemporaryFile(delete=False).name
    schedule_for_cleanup(db_path)

    index_result = synapseutils.index_files_for_migration(
        syn,
        project_entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )

    counts_by_status = index_result.get_counts_by_status()
    assert counts_by_status['INDEXED'] == 8
    assert counts_by_status['ERRORED'] == 0

    migration_result = synapseutils.migrate_indexed_files(syn, db_path, force=True)

    file_0_entity_updated = syn.get(utils.id_of(file_0_entity), downloadFile=False)
    file_1_entity_updated = syn.get(utils.id_of(file_1_entity), downloadFile=False)
    file_2_entity_updated = syn.get(utils.id_of(file_2_entity), downloadFile=False)
    file_3_entity_updated = syn.get(utils.id_of(file_3_entity), downloadFile=False)
    file_handles = [
        f['_file_handle'] for f in (
            file_0_entity_updated,
            file_1_entity_updated,
            file_2_entity_updated,
            file_3_entity_updated,
        )
    ]

    table_1_id = utils.id_of(table_1_entity)
    results = syn.tableQuery("select file_col_1, file_col_2 from {}".format(utils.id_of(table_1_entity)))
    table_file_handles = []
    for row in results:
        for file_handle_id in row[2:]:
            file_handle = syn._getFileHandleDownload(
                file_handle_id,
                table_1_id,
                objectType='TableEntity'
            )['fileHandle']
            table_file_handles.append(file_handle)
    file_handles.extend(table_file_handles)

    _assert_storage_location(file_handles, storage_location_id)
    assert storage_location_id != default_storage_location_id

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query_result = cursor.execute(
            "select status, count(*) from migrations where type in (?, ?) group by status",
            (_MigrationType.FILE.value, _MigrationType.TABLE_ATTACHED_FILE.value)
        ).fetchall()

        counts = {r[0]: r[1] for r in query_result}

        # should only be one status and they should all be migrated:
        # 4 migrated file entities + 4 migrated table attached files
        assert len(counts) == 1
        assert counts[_MigrationStatus.MIGRATED.value] == 8

    csv_file = tempfile.NamedTemporaryFile(delete=False)
    schedule_for_cleanup(csv_file.name)
    migration_result.as_csv(csv_file.name)
    with open(csv_file.name, 'r') as csv_file_in:
        csv_contents = csv_file_in.read()

    table_1_id = table_1_entity['tableId']

    # assert the content of the csv. we don't assert any particular order of the lines
    # but the presence of the expected lines and the correct # of lines
    csv_lines = csv_contents.split('\n')
    assert "id,type,version,row_id,col_name,from_storage_location_id,from_file_handle_id,to_file_handle_id,status,exception" in csv_lines  # noqa
    assert f"{file_0_entity.id},file,,,,{default_storage_location_id},{file_0_entity.dataFileHandleId},{file_0_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_1_entity.id},file,,,,{default_storage_location_id},{file_1_entity.dataFileHandleId},{file_1_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_2_entity.id},file,,,,{default_storage_location_id},{file_2_entity.dataFileHandleId},{file_2_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_3_entity.id},file,,,,{default_storage_location_id},{file_3_entity.dataFileHandleId},{file_3_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_1,{default_storage_location_id},{table_1_file_handle_1['id']},{table_file_handles[0]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_2,{default_storage_location_id},{table_1_file_handle_2['id']},{table_file_handles[1]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_1,{default_storage_location_id},{table_1_file_handle_3['id']},{table_file_handles[2]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_2,{default_storage_location_id},{table_1_file_handle_4['id']},{table_file_handles[3]['id']},MIGRATED," in csv_lines  # noqa
    assert "" in csv_lines  # expect trailing newline in a csv
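
# The index-then-migrate flow exercised above, condensed into a standalone sketch.
# Illustrative only: it assumes an authenticated `syn` client, an existing `entity`
# (project, folder, or file), a target `storage_location_id`, and a writable `db_path`
# for the sqlite index; names and defaults mirror the calls made in test_migrate_project.
def _example_migrate_to_storage_location(syn, entity, storage_location_id, db_path):
    # Build the migration index first; indexed entries end up INDEXED or ERRORED.
    index_result = synapseutils.index_files_for_migration(
        syn,
        entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )
    print(index_result.get_counts_by_status())

    # force=True performs the migration without prompting for confirmation, as in the test above.
    migration_result = synapseutils.migrate_indexed_files(syn, db_path, force=True)

    # The result can be dumped to CSV for auditing, which the assertions above rely on.
    migration_result.as_csv(db_path + '.csv')
    return migration_result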
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))
    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []
    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not immediately consistent,
    # but faster than chunked queries.
    time.sleep(2)

    # Test recursive get
    run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]

    schema1 = syn.store(synapseclient.Schema(name='Foo Table', columns=cols, parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]
    syn.store(synapseclient.RowSet(schema=schema1, rows=[synapseclient.Row(r) for r in data1]))

    time.sleep(3)  # get -q is eventually consistent

    # Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q', "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
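
# As the comment in the test above notes, `synapse get -r` is backed by
# synapseutils.syncFromSynapse. A minimal programmatic sketch of the same recursive
# download, assuming `synapseutils` is imported and files go to the current directory
# as in the CLI test; illustrative only, not part of the test.
def _example_recursive_download(syn, folder_entity):
    # Recursively downloads the folder's files and returns the fetched entities.
    return synapseutils.syncFromSynapse(syn, folder_entity, path='.')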
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))
    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []
    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # function under test uses queries which are eventually consistent but not immediately
    # after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" % file_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    ### Test recursive get
    output = run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    ### Test query get
    ### Note: We're not querying on annotations because tests can fail if there
    ### are lots of jobs queued as happens when staging is syncing
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from file where parentId=='%s'" % folder_entity2.id)

    # Verify that we downloaded files from folder_entity2
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    for downloaded, uploaded in zip(new_paths, uploaded_paths[:-1]):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)
    schedule_for_cleanup(new_paths[0])

    ### Test query get using a Table with an entity column
    ### This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]

    schema1 = syn.store(synapseclient.Schema(name='Foo Table', columns=cols, parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]
    print(data1)
    row_reference_set1 = syn.store(
        synapseclient.RowSet(columns=cols, schema=schema1, rows=[synapseclient.Row(r) for r in data1]))

    ### Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q', "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])