def test_get_or_create_schema__call():
    """Makes sure correct parameters are called"""
    schema_name = str(uuid.uuid1())
    parentid = str(uuid.uuid1())
    schema_ent = synapseclient.Schema(name=schema_name, parentId=parentid)
    returned = synapseclient.Schema(name=schema_name,
                                    id=str(uuid.uuid1()),
                                    parentId=parentid)
    with patch.object(CREATE_CLS, "_find_by_obj_or_create",
                      return_value=returned) as patch_find_or_create:
        new_schema = CREATE_CLS.get_or_create_schema(name=schema_name,
                                                     parentId=parentid)
        assert new_schema == returned
        patch_find_or_create.assert_called_once_with(schema_ent)
def test_schema_change(syn, tables, new_project, sample_table):
    source_table = tables["schema"][0]["id"]
    target_table_cols = deepcopy(tables["columns"][0])
    added_col = target_table_cols.pop(2)
    renamed_original_name = target_table_cols[2]["name"]
    target_table_cols[2]["name"] = "renamed_col"
    target_table_cols[3]["maximumSize"] = 100
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=target_table_cols,
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    incomplete_table = incomplete_table.drop(added_col["name"], axis=1)
    incomplete_table = incomplete_table.rename(
        {renamed_original_name: "renamed_col"}, axis=1)
    table = syn.store(sc.Table(schema, incomplete_table))
    exported_table = export_tables(syn,
                                   table_mapping={source_table: table.tableId},
                                   update=False)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    comparison_table = sample_table.drop("raw_data",
                                         axis=1).reset_index(drop=True)
    updated_table_no_fh = updated_table_no_fh[comparison_table.columns]
    print(updated_table_no_fh)
    print(comparison_table)
    pd.testing.assert_frame_equal(updated_table_no_fh, comparison_table)
def update_validated_submissions_table(syn, project_id, valid_df):
    """
    Push the latest version of the combined validated submissions table
    to Synapse.
    """
    try:
        print("Searching for existing 'ValidatedSubmissions' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'ValidatedSubmissions'][0]['id']
        schema = syn.get(schema_id)
        validated_subs_table = syn.tableQuery('select * from {}'.format(schema_id))
        if validated_subs_table.asDataFrame().shape[0] == valid_df.shape[0]:
            print("No new valid submissions since last update.")
        validated_subs_table.schema = schema
        print("Updating 'ValidatedSubmissions' table...")
        update_table = synapseclient.Table(schema, valid_df)
        validated_subs_table = _update_syn_table(validated_subs_table,
                                                 update_table, 'objectId')
    except IndexError:
        print("Creating 'ValidatedSubmissions' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(valid_df)
        schema = synapseclient.Schema(name='ValidatedSubmissions',
                                      columns=cols, parent=project)
        validated_subs_table = synapseclient.Table(schema, valid_df)
    print("Storing 'ValidatedSubmissions' table...")
    validated_subs_table = syn.store(validated_subs_table)
def createMafDatabase(syn, databaseToSynIdMappingDf, testing=False, staging=False):
    mafDatabaseSynId = process_functions.getDatabaseSynId(
        syn, "vcf2maf", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    mafDatabaseEnt = syn.get(mafDatabaseSynId)
    mafCols = list(syn.getTableColumns(mafDatabaseSynId))
    schema = synapseclient.Schema(
        name='Narrow MAF %s Database' % time.time(),
        columns=mafCols,
        parent=process_functions.getDatabaseSynId(
            syn, "main", databaseToSynIdMappingDf=databaseToSynIdMappingDf))
    schema.primaryKey = mafDatabaseEnt.primaryKey
    newMafDb = syn.store(schema)
    # Store in the new database synid
    databaseToSynIdMappingDf['Id'][0] = newMafDb.id
    syn.store(
        synapseclient.Table(
            process_functions.getDatabaseSynId(syn, "dbMapping", test=testing),
            databaseToSynIdMappingDf))
    if not staging and not testing:
        # Make sure to store the newly created maf db synid into the
        # staging synapse mapping
        databaseToSynIdMapping = syn.tableQuery(
            "SELECT * FROM syn12094210 where Database = 'vcf2maf'")
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
        databaseToSynIdMappingDf['Id'][0] = newMafDb.id
        syn.store(synapseclient.Table("syn12094210", databaseToSynIdMappingDf))
    # Move and archive old mafdatabase
    mafDatabaseEnt.parentId = "syn7208886"
    mafDatabaseEnt.name = "ARCHIVED " + mafDatabaseEnt.name
    syn.store(mafDatabaseEnt)
    mafDatabaseSynId = newMafDb.id
    # Remove can download permissions from project GENIE team
    syn.setPermissions(mafDatabaseSynId, 3326313, [])
def test_manually_pass_source_tables_dict(syn, tables, new_project, sample_table):
    source_table = tables["schema"][0]["id"]
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=tables["columns"][0],
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    rest_of_the_table = deepcopy(sample_table.iloc[len(sample_table) // 2:])
    table = syn.store(sc.Table(schema, incomplete_table))
    source_tables = {source_table: rest_of_the_table}
    exported_table = export_tables(syn,
                                   table_mapping={source_table: table.tableId},
                                   source_tables=source_tables,
                                   update=True)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    update = exported_table[source_table][1]
    correct_table_no_fh = incomplete_table.append(update,
                                                  ignore_index=True,
                                                  sort=False)
    correct_table_no_fh = correct_table_no_fh.drop(
        "raw_data", axis=1).reset_index(drop=True)
    print("returned results \n", updated_table_no_fh)
    print("correct result \n", correct_table_no_fh)
    pd.testing.assert_frame_equal(updated_table_no_fh, correct_table_no_fh)
def getOrCreateSchema(syn, parent, name, columns):
    """Get an existing table schema by name and parent or create a new one."""
    schema = synapseclient.Schema(name=name, parent=parent, columns=columns)
    schema = findByNameOrCreate(syn, schema)
    return schema
def get_or_create_schema(parent, name, columns):
    """Get an existing table schema by name and parent or create a new one."""
    schema = synapseclient.Schema(name=name, parent=parent, columns=columns)
    schema = find_by_name_or_create(schema)
    return schema
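# A minimal usage sketch (not from the source): assuming a
# `find_by_name_or_create` helper is in scope, `get_or_create_schema` fetches
# or creates a table schema idempotently. The project id and column below are
# hypothetical placeholders.
example_columns = [synapseclient.Column(name="recordId", columnType="STRING")]
example_schema = get_or_create_schema(parent="syn12345",
                                      name="Example Table",
                                      columns=example_columns)
print(example_schema.name)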
def table(syn, parent, obj):
    df = read(obj)
    cols = synapseclient.as_table_columns(df)
    schema = synapseclient.Schema(name=str(uuid.uuid4()),
                                  columns=cols,
                                  parent=parent)
    schema = syn.store(schema)
    table = syn.store(synapseclient.Table(schema, df))
    return schema
def test_export_multiple_tables_to_preexisting_update(syn, new_project, tables,
                                                      sample_table):
    source_table = tables["schema"][0]["id"]
    source_table_2 = tables["schema"][1]["id"]
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=tables["columns"][0],
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    table = syn.store(sc.Table(schema, incomplete_table))
    schema_2 = sc.Schema(name=tables["schema"][1]["name"],
                         columns=tables["columns"][1],
                         parent=new_project["id"])
    incomplete_table_2 = deepcopy(sample_table.iloc[:len(sample_table) // 3])
    table_2 = syn.store(sc.Table(schema_2, incomplete_table_2))
    exported_table = export_tables(syn,
                                   table_mapping={
                                       source_table: table.tableId,
                                       source_table_2: table_2.tableId
                                   },
                                   update=True)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    update = exported_table[source_table][1]
    correct_table_no_fh = incomplete_table.append(update,
                                                  ignore_index=True,
                                                  sort=False)
    correct_table_no_fh = correct_table_no_fh.drop(
        "raw_data", axis=1).reset_index(drop=True)
    updated_table_2 = syn.tableQuery("select * from {}".format(table_2.tableId))
    updated_table_2 = updated_table_2.asDataFrame().reset_index(drop=True)
    updated_table_2_no_fh = updated_table_2.drop("raw_data", axis=1)
    update_2 = exported_table[source_table_2][1]
    correct_table_no_fh_2 = incomplete_table_2.append(update_2,
                                                      ignore_index=True,
                                                      sort=False)
    correct_table_no_fh_2 = correct_table_no_fh_2.drop(
        "raw_data", axis=1).reset_index(drop=True)
    print("returned results \n", updated_table_no_fh)
    print("correct result \n", correct_table_no_fh)
    assert (updated_table_no_fh.equals(correct_table_no_fh)
            and updated_table_2_no_fh.equals(correct_table_no_fh_2))
def table_schema(project_obj):
    cols = [synapseclient.Column(name="recordId", columnType="INTEGER"),
            synapseclient.Column(name="externalId", columnType="STRING"),
            synapseclient.Column(name="substudyMemberships", columnType="STRING"),
            synapseclient.Column(name="bool_property", columnType="BOOLEAN"),
            synapseclient.Column(name="str_property", columnType="STRING"),
            synapseclient.Column(name="raw_data", columnType="FILEHANDLEID")]
    schema = synapseclient.Schema(name=str(uuid.uuid4()),
                                  columns=cols,
                                  parent=project_obj["id"])
    return schema
def test_table_query():
    """Test command line ability to do table query."""
    cols = []
    cols.append(synapseclient.Column(name='name', columnType='STRING',
                                     maximumSize=1000))
    cols.append(synapseclient.Column(name='foo', columnType='STRING',
                                     enumValues=['foo', 'bar', 'bat']))
    cols.append(synapseclient.Column(name='x', columnType='DOUBLE'))
    cols.append(synapseclient.Column(name='age', columnType='INTEGER'))
    cols.append(synapseclient.Column(name='cartoon', columnType='BOOLEAN'))

    project_entity = project

    schema1 = syn.store(synapseclient.Schema(name=str(uuid.uuid4()),
                                             columns=cols,
                                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]
    row_reference_set1 = syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    # Test query
    output = run('synapse', '--skip-checks', 'query',
                 'select * from %s' % schema1.id)
    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows),)

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + list(
        map(lambda x: x.name, cols))
    assert my_headers_set == expected_headers_set, "%r != %r" % (
        my_headers_set, expected_headers_set)
def test__create_schema():
    """Tests calling of create schema"""
    table_name = str(uuid.uuid1())
    parentid = str(uuid.uuid1())
    columns = [str(uuid.uuid1())]
    annotations = {"foo": "bar"}
    schema = synapseclient.Schema(table_name, columns=columns,
                                  parent=parentid, annotations=annotations)
    with patch.object(syn, "store", return_value=schema) as patch_syn_store:
        new_schema = process_functions._create_schema(
            syn, table_name, parentid,
            columns=columns, annotations=annotations)
        patch_syn_store.assert_called_once_with(schema)
        assert new_schema == schema
def _create_table(syn: Synapse, name: str, col_config: List[dict],
                  parent: str) -> Schema:
    """Create Synapse Table

    Args:
        syn: Synapse connection
        name: Table name
        col_config: Column dict configuration
        parent: Synapse id of project

    Returns:
        Stored Synapse Table
    """
    cols = [synapseclient.Column(**col) for col in col_config]
    schema = synapseclient.Schema(name=name, columns=cols, parent=parent)
    schema = syn.store(schema)
    return schema
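# A hypothetical invocation of `_create_table` (not from the source): the
# column configuration is a list of dicts whose keys mirror the
# synapseclient.Column constructor arguments. The table name and project id
# below are made up.
col_config = [{"name": "id", "columnType": "ENTITYID"},
              {"name": "center", "columnType": "STRING", "maximumSize": 50}]
example_schema = _create_table(syn, name="Example Table",
                               col_config=col_config, parent="syn12345")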
def _store_dataframe_to_table(syn, df, df_cols, table_id=None, parent_id=None,
                              table_name=None, **kwargs):
    """Store a pandas DataFrame to Synapse in a safe way by formatting
    the values so that the store operation is not rejected by Synapse.

    Parameters
    ----------
    syn : synapseclient.Synapse
    df : pandas.DataFrame
    df_cols : list of synapseclient.Column objects
    table_id : str, default None
        Synapse ID of a preexisting Synapse Table to store `df` to.
        Either `table_id` or both `parent_id` and `table_name` must be
        supplied as arguments.
    parent_id : str, default None
        Synapse ID of the project to store `df` to as a table.
        Either `table_id` or both `parent_id` and `table_name` must be
        supplied as arguments.
    table_name : str, default None
        Either `table_id` or both `parent_id` and `table_name` must be
        supplied as arguments.
    **kwargs :
        Keyword arguments to provide to syn.store (useful for provenance)
    """
    if table_id is None and parent_id is None and table_name is None:
        raise TypeError("Either the table Synapse ID must be set or "
                        "the parent ID and table name must be set.")
    sanitized_dataframe = _sanitize_dataframe(syn, records=df, cols=df_cols)
    if table_id is None:
        target_table_schema = sc.Schema(name=table_name,
                                        parent=parent_id,
                                        columns=df_cols)
        target_table = sc.Table(schema=target_table_schema,
                                values=sanitized_dataframe,
                                headers=df_cols)
    else:
        target_table = sc.Table(table_id, sanitized_dataframe, headers=df_cols)
    target_table = syn.store(target_table, **kwargs)
    return target_table
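# A minimal sketch of calling `_store_dataframe_to_table` (assumption: the
# dataframe, columns, and Synapse ids below are illustrative only, and the
# private `_sanitize_dataframe` helper is available in the module). Creating
# a new table requires both `parent_id` and `table_name`; updating an
# existing one requires `table_id`.
df_cols = [sc.Column(name="recordId", columnType="INTEGER")]
df = pd.DataFrame({"recordId": [1, 2, 3]})
# create a new table under a project
_store_dataframe_to_table(syn, df, df_cols, parent_id="syn12345",
                          table_name="Example Table")
# or append to a preexisting table
_store_dataframe_to_table(syn, df, df_cols, table_id="syn67890")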
def _create_schema(syn, table_name, parentid, columns=None, annotations=None):
    """Creates Table Schema

    Args:
        syn: Synapse object
        table_name: Name of table
        parentid: Project synapse id
        columns: Columns of Table
        annotations: Dictionary of annotations to add

    Returns:
        Schema
    """
    schema = synapseclient.Schema(name=table_name,
                                  columns=columns,
                                  parent=parentid,
                                  annotations=annotations)
    new_schema = syn.store(schema)
    return new_schema
def test_export_one_table_to_preexisting_no_update(syn, new_project, tables,
                                                   sample_table):
    source_table = tables["schema"][0]["id"]
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=tables["columns"][0],
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    table = syn.store(sc.Table(schema, incomplete_table))
    exported_table = export_tables(syn,
                                   table_mapping={source_table: table.tableId},
                                   update=False)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    comparison_table = sample_table.drop("raw_data",
                                         axis=1).reset_index(drop=True)
    print(updated_table_no_fh)
    print(comparison_table)
    pd.testing.assert_frame_equal(updated_table_no_fh, comparison_table)
def create_and_archive_maf_database(syn, database_synid_mappingdf):
    '''
    Creates new MAF database and archives the old database in the staging site

    Args:
        syn: Synapse object
        database_synid_mappingdf: Database to synapse id mapping dataframe

    Return:
        Edited database to synapse id mapping dataframe
    '''
    maf_database_synid = process_functions.getDatabaseSynId(
        syn, "vcf2maf", project_id=None,
        databaseToSynIdMappingDf=database_synid_mappingdf)
    maf_database_ent = syn.get(maf_database_synid)
    maf_columns = list(syn.getTableColumns(maf_database_synid))
    schema = synapseclient.Schema(
        name='Narrow MAF {current_time} Database'.format(
            current_time=time.time()),
        columns=maf_columns,
        parent=process_functions.getDatabaseSynId(
            syn, "main", databaseToSynIdMappingDf=database_synid_mappingdf))
    schema.primaryKey = maf_database_ent.primaryKey
    new_maf_database = syn.store(schema)
    # Store in the new database synid
    database_synid_mappingdf['Id'][
        database_synid_mappingdf['Database'] == 'vcf2maf'] = new_maf_database.id
    vcf2maf_mappingdf = database_synid_mappingdf[
        database_synid_mappingdf['Database'] == 'vcf2maf']
    # vcf2maf_mappingdf['Id'][0] = newMafDb.id
    syn.store(synapseclient.Table("syn10967259", vcf2maf_mappingdf))
    # Move and archive old mafdatabase (This is the staging synid)
    maf_database_ent.parentId = "syn7208886"
    maf_database_ent.name = "ARCHIVED " + maf_database_ent.name
    syn.store(maf_database_ent)
    # maf_database_synid = new_maf_database.id
    # Remove can download permissions from project GENIE team
    syn.setPermissions(new_maf_database.id, 3326313, [])
    return database_synid_mappingdf
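# Hedged usage sketch (not in the source): `create_and_archive_maf_database`
# expects the database-to-Synapse-id mapping as a dataframe with 'Database'
# and 'Id' columns; the Synapse ids below are made up and pandas is assumed
# to be imported as pd.
mapping_df = pd.DataFrame({'Database': ['vcf2maf', 'main'],
                           'Id': ['syn0000001', 'syn0000002']})
updated_mapping_df = create_and_archive_maf_database(syn, mapping_df)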
def update_team_stats_table(syn, project_id, team_stats_df):
    """
    Push the latest version of the team stats table to Synapse.
    """
    try:
        print("Searching for existing 'TeamStats' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'TeamStats'][0]['id']
        schema = syn.get(schema_id)
        team_stats_table = syn.tableQuery('select * from {}'.format(schema_id))
        team_stats_table.schema = schema
        print("Updating 'TeamStats' table...")
        update_table = synapseclient.Table(schema, team_stats_df)
        team_stats_table = _update_syn_table(team_stats_table, update_table, 'team')
    except IndexError:
        print("Creating 'TeamStats' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(team_stats_df)
        schema = synapseclient.Schema(name='TeamStats', columns=cols,
                                      parent=project)
        team_stats_table = synapseclient.Table(schema, team_stats_df)
    print("Storing 'TeamStats' table...")
    team_stats_table = syn.store(team_stats_table)
SYN = create_autospec(synapseclient.Synapse)
SET_PERMS = {"set"}


@pytest.mark.parametrize(
    "entity,principalid,permission_level,mapped",
    [
        # tuple with (input, expectedOutput)
        (synapseclient.Project(), None, "view",
         permissions.ENTITY_PERMS_MAPPINGS['view']),
        (synapseclient.Folder(parentId="syn123"), None, "download",
         permissions.ENTITY_PERMS_MAPPINGS['download']),
        (synapseclient.Entity(), None, "edit",
         permissions.ENTITY_PERMS_MAPPINGS['edit']),
        (synapseclient.Schema(parentId="syn123"), None, "edit_and_delete",
         permissions.ENTITY_PERMS_MAPPINGS['edit_and_delete']),
        (synapseclient.File(parentId="syn123"), None, "admin",
         permissions.ENTITY_PERMS_MAPPINGS['admin']),
        (synapseclient.Entity(), None, "remove",
         permissions.ENTITY_PERMS_MAPPINGS['remove']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "view",
         permissions.EVALUATION_PERMS_MAPPINGS['view']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "submit",
         permissions.EVALUATION_PERMS_MAPPINGS['submit']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "score",
         permissions.EVALUATION_PERMS_MAPPINGS['score']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "admin",
         permissions.EVALUATION_PERMS_MAPPINGS['admin']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "remove",
         permissions.EVALUATION_PERMS_MAPPINGS['remove'])
def main(syn):
    # Basic setup of the project
    project_name = "Testing Synapse Genie"

    # Determine the short and long names of the centers.
    center_abbreviations = ['AAA', 'BBB', 'CCC']
    center_names = center_abbreviations

    # Create the project
    project = synapseclient.Project(project_name)
    project = syn.store(project)

    # Create a folder for log files generated by the GENIE processes
    # of validation and updating the database tables
    logs_folder = synapseclient.Folder(name='Logs', parent=project)
    logs_folder = syn.store(logs_folder)

    # Folder for individual center folders
    root_center_folder = synapseclient.Folder(name='Centers', parent=project)
    root_center_folder = syn.store(root_center_folder)

    # The folders for each center where they will upload files for validation
    # and submission. There is one folder per center.
    # This currently deviates from the original GENIE setup of having an
    # 'Input' and 'Staging' folder for each center.
    center_folders = [synapseclient.Folder(name=name, parent=root_center_folder)
                      for name in center_abbreviations]
    center_folders = [syn.store(folder) for folder in center_folders]

    # Make some fake data that only contains basic text to check
    # for validation.
    n_files = 5  # number of files per center to create
    for folder in center_folders:
        for idx in range(n_files):
            tmp = tempfile.NamedTemporaryFile(prefix=f'TEST-{folder.name}',
                                              suffix='.txt')
            with open(tmp.name, mode='w') as fh:
                fh.write(random.choice(['ERROR', 'VALID', 'NOPE']))
            synfile = syn.store(synapseclient.File(tmp.name, parent=folder))

    # Set up the table that holds the validation status of all submitted files.
    status_schema = create_status_table(syn, project)

    # Set up the table that maps the center abbreviation to the folder where
    # their data is uploaded. This is used by the GENIE framework to find the
    # files to validate for a center.
    center_map_table_defs = [
        {'name': 'name',
         'columnType': 'STRING',
         'maximumSize': 250},
        {'name': 'center',
         'columnType': 'STRING',
         'maximumSize': 50},
        {'name': 'inputSynId',
         'columnType': 'ENTITYID'},
        # {'name': 'stagingSynId',
        #  'columnType': 'ENTITYID'},
        {'name': 'release',
         'defaultValue': 'false',
         'columnType': 'BOOLEAN'}
        # {'id': '68438',
        #  'name': 'mutationInCisFilter',
        #  'defaultValue': 'true',
        #  'columnType': 'BOOLEAN',
        #  'concreteType': 'org.sagebionetworks.repo.model.table.ColumnModel'}
    ]

    center_map_cols = [synapseclient.Column(**col)
                       for col in center_map_table_defs]

    center_schema = synapseclient.Schema(name='Center Table',
                                         columns=center_map_cols,
                                         parent=project)
    center_schema = syn.store(center_schema)

    # Add the center folders created above to this table.
    center_folder_ids = [folder.id for folder in center_folders]
    center_df = pandas.DataFrame(dict(name=center_names,
                                      center=center_abbreviations,
                                      inputSynId=center_folder_ids))
    tbl = synapseclient.Table(schema=center_schema, values=center_df)
    tbl = syn.store(tbl)

    # Create a table that stores the error logs for each submitted file.
    error_col_defs = [
        {'name': 'id',
         'columnType': 'ENTITYID'},
        {'name': 'center',
         'columnType': 'STRING',
         'maximumSize': 50,
         'facetType': 'enumeration'},
        {'name': 'errors',
         'columnType': 'LARGETEXT'},
        {'name': 'name',
         'columnType': 'STRING',
         'maximumSize': 500},
        # {'name': 'versionNumber',
        #  'columnType': 'STRING',
        #  'maximumSize': 50},
        {'name': 'fileType',
         'columnType': 'STRING',
         'maximumSize': 50}
    ]

    error_map_cols = [synapseclient.Column(**col) for col in error_col_defs]

    error_schema = synapseclient.Schema(name='Error Table',
                                        columns=error_map_cols,
                                        parent=project)
    error_schema = syn.store(error_schema)

    # Create a table that maps the various database tables to a short name.
    # This table is used in many GENIE functions to find the correct table
    # to update or get the state of something from.
    db_map_col_defs = [
        {'name': 'Database',
         'columnType': 'STRING',
         'maximumSize': 50},
        {'name': 'Id',
         'columnType': 'ENTITYID'}
    ]

    db_map_cols = [synapseclient.Column(**col) for col in db_map_col_defs]
    db_map_schema = synapseclient.Schema(name='DB Mapping Table',
                                         columns=db_map_cols,
                                         parent=project)
    db_map_schema = syn.store(db_map_schema)

    # Add dbMapping annotation
    project.annotations.dbMapping = db_map_schema.tableId
    project = syn.store(project)

    # Add the tables we already created to the mapping table.
    dbmap_df = pandas.DataFrame(
        dict(Database=['centerMapping', 'validationStatus', 'errorTracker',
                       'dbMapping', 'logs'],
             Id=[center_schema.id, status_schema.id, error_schema.id,
                 db_map_schema.id, logs_folder.id]))
    db_map_tbl = synapseclient.Table(schema=db_map_schema, values=dbmap_df)
    db_map_tbl = syn.store(db_map_tbl)

    # Make a top level folder for output. Some processing for
    # file types copy a file from one place to another.
    output_folder = synapseclient.Folder(name='Output', parent=project)
    output_folder = syn.store(output_folder)

    output_folder_map = []

    # default_table_col_defs = status_table_col_defs = [
    #     {'name': 'PRIMARY_KEY',
    #      'columnType': 'STRING'}
    # ]
    # default_table_cols = [synapseclient.Column(**col)
    #                       for col in default_table_col_defs]

    default_primary_key = 'PRIMARY_KEY'

    # For each file type format in the format registry, create an output
    # folder and a table. Some GENIE file types copy a file to a new place,
    # and some update a table. Having both means that both of these
    # operations will be available at the beginning.
    # The mapping between the file type and the folder or table have a
    # consistent naming. The key ('Database' value) is {file_type}_folder
    # or {file_type}_table.

    # Determine which file formats are going to be used.
    format_registry = config.collect_format_types(['example_registry'])

    for file_type, obj in format_registry.items():
        file_type_folder = synapseclient.Folder(name=file_type,
                                                parent=output_folder)
        file_type_folder = syn.store(file_type_folder)
        output_folder_map.append(dict(Database=f"{file_type}_folder",
                                      Id=file_type_folder.id))

        file_type_schema = synapseclient.Schema(name=file_type, parent=project)
        file_type_schema.annotations.primaryKey = default_primary_key
        file_type_schema = syn.store(file_type_schema)
        output_folder_map.append(dict(Database=f"{file_type}_table",
                                      Id=file_type_schema.id))

    # Add the folders and tables created to the mapping table.
    db_map_tbl = synapseclient.Table(
        schema=db_map_schema, values=pandas.DataFrame(output_folder_map))
    db_map_tbl = syn.store(db_map_tbl)
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""
    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                   parent=project_entity))
    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                    parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []
    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # function under test uses queries which are eventually consistent but
    # not immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'"
                    % file_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    ### Test recursive get
    output = run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    ### Test query get
    ### Note: We're not querying on annotations because tests can fail if there
    ### are lots of jobs queued as happens when staging is syncing
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from file where parentId=='%s'" % folder_entity2.id)

    # Verify that we downloaded files from folder_entity2
    new_paths = [os.path.join('.', os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    for downloaded, uploaded in zip(new_paths, uploaded_paths[:-1]):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)
    schedule_for_cleanup(new_paths[0])

    ### Test query get using a Table with an entity column
    ### This should be replaced when Table File Views are implemented in the client
    cols = []
    cols.append(synapseclient.Column(name='id', columnType='ENTITYID'))

    schema1 = syn.store(synapseclient.Schema(name='Foo Table',
                                             columns=cols,
                                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]
    print(data1)
    row_reference_set1 = syn.store(
        synapseclient.RowSet(columns=cols, schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    ### Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)
    schedule_for_cleanup(new_paths[0])
def store_dataframe_to_synapse(syn, df, parent, name, cols):
    df = df[[c['name'] for c in cols]]
    schema = sc.Schema(name=name, columns=cols, parent=parent)
    table = sc.Table(schema, df)
    table = syn.store(table)
    return table
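# Hedged example (not from the source): `store_dataframe_to_synapse` subsets
# the dataframe to the configured columns before storing, so `cols` acts as
# both a schema definition and a column whitelist. The names and project id
# below are hypothetical.
cols = [sc.Column(name="score", columnType="DOUBLE"),
        sc.Column(name="label", columnType="STRING", maximumSize=20)]
df = pd.DataFrame({"score": [0.1, 0.9],
                   "label": ["a", "b"],
                   "extra": [1, 2]})  # "extra" is dropped by the whitelist
stored = store_dataframe_to_synapse(syn, df, parent="syn12345",
                                    name="Scores", cols=cols)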
def test_migrate_project(request, syn, schedule_for_cleanup,
                         storage_location_id):
    test_name = request.node.name
    project_name = "{}-{}".format(test_name, uuid.uuid4())
    project = synapseclient.Project(name=project_name)
    project_entity = syn.store(project)

    file_0_path = _create_temp_file()
    schedule_for_cleanup(file_0_path)
    file_0_name = "{}-{}".format(test_name, 1)
    file_0 = synapseclient.File(name=file_0_name, path=file_0_path,
                                parent=project_entity)
    file_0_entity = syn.store(file_0)
    default_storage_location_id = file_0_entity._file_handle['storageLocationId']

    folder_1_name = "{}-{}-{}".format(test_name, 1, uuid.uuid4())
    folder_1 = synapseclient.Folder(parent=project_entity, name=folder_1_name)
    folder_1_entity = syn.store(folder_1)

    file_1_path = _create_temp_file()
    schedule_for_cleanup(file_1_path)
    file_1_name = "{}-{}".format(test_name, 1)
    file_1 = synapseclient.File(name=file_1_name, path=file_1_path,
                                parent=folder_1_entity)
    file_1_entity = syn.store(file_1)

    file_2_path = _create_temp_file()
    schedule_for_cleanup(file_2_path)
    file_2_name = "{}-{}".format(test_name, 2)
    file_2 = synapseclient.File(name=file_2_name, path=file_2_path,
                                parent=folder_1_entity)
    file_2_entity = syn.store(file_2)

    # file 3 shares the same file handle id as file 1
    file_3_path = file_1_path
    file_3_name = "{}-{}".format(test_name, 3)
    file_3 = synapseclient.File(name=file_3_name, path=file_3_path,
                                parent=folder_1_entity)
    file_3.dataFileHandleId = file_1_entity.dataFileHandleId
    file_3_entity = syn.store(file_3)

    table_1_cols = [
        synapseclient.Column(name='file_col_1', columnType='FILEHANDLEID'),
        synapseclient.Column(name='num', columnType='INTEGER'),
        synapseclient.Column(name='file_col_2', columnType='FILEHANDLEID'),
    ]
    table_1 = syn.store(synapseclient.Schema(name=test_name,
                                             columns=table_1_cols,
                                             parent=folder_1_entity))
    table_1_file_col_1_1 = _create_temp_file()
    table_1_file_handle_1 = syn.uploadFileHandle(table_1_file_col_1_1, table_1)
    table_1_file_col_1_2 = _create_temp_file()
    table_1_file_handle_2 = syn.uploadFileHandle(table_1_file_col_1_2, table_1)
    table_1_file_col_2_1 = _create_temp_file()
    table_1_file_handle_3 = syn.uploadFileHandle(table_1_file_col_2_1, table_1)
    table_1_file_col_2_2 = _create_temp_file()
    table_1_file_handle_4 = syn.uploadFileHandle(table_1_file_col_2_2, table_1)

    data = [
        [table_1_file_handle_1['id'], 1, table_1_file_handle_2['id']],
        [table_1_file_handle_3['id'], 2, table_1_file_handle_4['id']],
    ]

    table_1_entity = syn.store(
        synapseclient.RowSet(schema=table_1,
                             rows=[synapseclient.Row(r) for r in data]))

    db_path = tempfile.NamedTemporaryFile(delete=False).name
    schedule_for_cleanup(db_path)

    index_result = synapseutils.index_files_for_migration(
        syn,
        project_entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )

    counts_by_status = index_result.get_counts_by_status()
    assert counts_by_status['INDEXED'] == 8
    assert counts_by_status['ERRORED'] == 0

    migration_result = synapseutils.migrate_indexed_files(syn, db_path,
                                                          force=True)

    file_0_entity_updated = syn.get(utils.id_of(file_0_entity),
                                    downloadFile=False)
    file_1_entity_updated = syn.get(utils.id_of(file_1_entity),
                                    downloadFile=False)
    file_2_entity_updated = syn.get(utils.id_of(file_2_entity),
                                    downloadFile=False)
    file_3_entity_updated = syn.get(utils.id_of(file_3_entity),
                                    downloadFile=False)
    file_handles = [
        f['_file_handle'] for f in (
            file_0_entity_updated,
            file_1_entity_updated,
            file_2_entity_updated,
            file_3_entity_updated,
        )
    ]

    table_1_id = utils.id_of(table_1_entity)
    results = syn.tableQuery("select file_col_1, file_col_2 from {}".format(
        utils.id_of(table_1_entity)))
    table_file_handles = []
    for row in results:
        for file_handle_id in row[2:]:
            file_handle = syn._getFileHandleDownload(
                file_handle_id, table_1_id,
                objectType='TableEntity')['fileHandle']
            table_file_handles.append(file_handle)
    file_handles.extend(table_file_handles)

    _assert_storage_location(file_handles, storage_location_id)
    assert storage_location_id != default_storage_location_id

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query_result = cursor.execute(
            "select status, count(*) from migrations where type in (?, ?) "
            "group by status",
            (_MigrationType.FILE.value,
             _MigrationType.TABLE_ATTACHED_FILE.value)).fetchall()

        counts = {r[0]: r[1] for r in query_result}

        # should only be one status and they should all be migrated
        # should be 3 migrated files entities + 4 migrated table attached files
        assert len(counts) == 1
        assert counts[_MigrationStatus.MIGRATED.value] == 8

    csv_file = tempfile.NamedTemporaryFile(delete=False)
    schedule_for_cleanup(csv_file.name)
    migration_result.as_csv(csv_file.name)
    with open(csv_file.name, 'r') as csv_file_in:
        csv_contents = csv_file_in.read()

    table_1_id = table_1_entity['tableId']

    # assert the content of the csv. we don't assert any particular order of
    # the lines but the presence of the expected lines and the correct # of lines
    csv_lines = csv_contents.split('\n')
    assert "id,type,version,row_id,col_name,from_storage_location_id,from_file_handle_id,to_file_handle_id,status,exception" in csv_lines  # noqa
    assert f"{file_0_entity.id},file,,,,{default_storage_location_id},{file_0_entity.dataFileHandleId},{file_0_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_1_entity.id},file,,,,{default_storage_location_id},{file_1_entity.dataFileHandleId},{file_1_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_2_entity.id},file,,,,{default_storage_location_id},{file_2_entity.dataFileHandleId},{file_2_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_3_entity.id},file,,,,{default_storage_location_id},{file_3_entity.dataFileHandleId},{file_3_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_1,{default_storage_location_id},{table_1_file_handle_1['id']},{table_file_handles[0]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_2,{default_storage_location_id},{table_1_file_handle_2['id']},{table_file_handles[1]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_1,{default_storage_location_id},{table_1_file_handle_3['id']},{table_file_handles[2]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_2,{default_storage_location_id},{table_1_file_handle_4['id']},{table_file_handles[3]['id']},MIGRATED," in csv_lines  # noqa
    assert "" in csv_lines  # expect trailing newline in a csv
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""
    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))
    folder_entity2 = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []
    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not
    # immediately consistent, but faster than chunked queries.
    time.sleep(2)
    # Test recursive get
    run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]

    schema1 = syn.store(synapseclient.Schema(name='Foo Table',
                                             columns=cols,
                                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]
    syn.store(synapseclient.RowSet(schema=schema1,
                                   rows=[synapseclient.Row(r) for r in data1]))

    time.sleep(3)  # get -q are eventually consistent
    # Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)

    # Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f))
                 for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)
    schedule_for_cleanup(new_paths[0])
def pubmed(args, syn):
    """
    Given a list of grant numbers pulled from a synapse table column, utilizes
    a pubmed API to generate a search query. This query is constructed by the
    union ('or' logic) of all the grant numbers, which aids in pulling down a
    list of all PubMed publication id's associated with the grants. Then it
    goes through the PubMed id's and scrapes each publication for basic
    information.

    :param args:
    :param syn:
    :return:
    """
    projectId = args.projectId
    project = syn.get(projectId)

    if args.grantviewId is not None:
        grantviewId = args.grantviewId
    else:
        grantviewId = "syn10142562"

    csbcGrants = csbcGrantList(syn, grantviewId)
    grantIds = getGrantQuery(csbcGrants)
    pubmedIds = getPubMedIds(grantIds)
    csbcView = getCenterIdsView(syn, grantviewId)

    # for utf encoding and debugging
    # finalTable.to_csv("csbc.csv", sep=',', index=False, encoding="utf-8")
    # finalTable = pandas.read_csv("csbc.csv", delimiter=',', encoding="utf-8")
    # os.remove("csbc.csv")

    if args.tableId:
        # update existing schema
        tableId = args.tableId
        schema = syn.get(tableId)

        publicationTable = syn.tableQuery("select * from %s" % tableId)
        currentTable = publicationTable.asDataFrame()

        new_pubmed_ids = list(
            set(pubmedIds) -
            set([i.split("=")[1] for i in list(currentTable.PubMed)]))

        finalTable = getPMIDDF(new_pubmed_ids, csbcGrants, csbcView)

        table = synapseclient.Table(schema, finalTable.values.tolist())
        table = syn.store(table)
    else:
        # create a new schema
        # cols = synapseclient.as_table_columns(finalTable)
        finalTable = getPMIDDF(pubmedIds, csbcGrants, csbcView)

        cols = [
            Column(name='CSBC PSON Center', columnType='ENTITYID',
                   maximumSize=50),
            Column(name='Consortium', columnType='STRING', maximumSize=100),
            Column(name='PubMed', columnType='LINK', maximumSize=100),
            Column(name='Journal', columnType='STRING', maximumSize=100),
            Column(name='Publication Year', columnType='DATE'),
            Column(name='Title', columnType='STRING', maximumSize=500),
            Column(name='Authors', columnType='STRING', maximumSize=990),
            Column(name='Grant', columnType='STRING', maximumSize=50),
            Column(name='Data Location', columnType='LINK', maximumSize=1000),
            Column(name='Synapse Location', columnType='ENTITYID',
                   maximumSize=50),
            Column(name='Keywords', columnType='STRING', maximumSize=250)
        ]

        schema = synapseclient.Schema(name=args.tableName, columns=cols,
                                      parent=project)
        table = synapseclient.Table(schema, finalTable)
        table = syn.store(table)
    with patch.object(SYN, "findEntityId", return_value=post_return),\
            patch.object(SYN, "get", return_value=obj),\
            pytest.raises(AssertionError,
                          match="Retrieved .* had type .* rather than .*"):
        GET_CLS._find_entity_by_name(parentid="syn12345",
                                     entity_name="foo.txt",
                                     concrete_type="Test")


@pytest.mark.parametrize(
    "obj",
    [synapseclient.Project(name="foo"),
     synapseclient.File(path="foo.txt", parentId="syn12345"),
     synapseclient.Folder(name="foo", parentId="syn12345"),
     synapseclient.Schema(name="foo", parentId="syn12345")]
)
def test__get_obj__entity(obj):
    """Test getting of entities"""
    with patch.object(GET_CLS, "_find_entity_by_name",
                      return_value=obj) as patch_get:
        return_obj = GET_CLS._get_obj(obj)
        patch_get.assert_called_once_with(
            parentid=obj.properties.get("parentId", None),
            entity_name=obj.name,
            concrete_type=obj.properties.concreteType)
        assert obj == return_obj


@pytest.mark.parametrize("obj,get_func",
                         [(synapseclient.Team(name="foo"), "getTeam"),
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Convert JSON to Synapse Table Schema')
    parser.add_argument('path', type=str, help='Path (or URL) to JSON file')
    parser.add_argument('--projectId', type=str,
                        help='Synapse Project ID to store schema')
    parser.add_argument('-n', '--dry_run', action="store_true", default=False,
                        help='Dry run')
    parser.add_argument('--synapseJSONSchema', action="store_true",
                        default=False,
                        help="JSON is already in Synapse Table Schema format")
    args = parser.parse_args()

    syn = synapseclient.login(silent=True)
    project = syn.get(args.projectId)

    # Python 3 equivalents of the original Python 2 urllib calls
    f = urllib.request.urlopen(path2url(args.path))
    data = json.load(f)

    url_path = urllib.parse.splittype(args.path)[1]
    filename = os.path.split(url_path)[1]
    schema_name = os.path.splitext(filename)[0]

    if args.synapseJSONSchema:
        schema = synapseclient.Schema(name=schema_name, parent=project)
        schema.columns_to_store = data
    else:
        cols = []
        for k, v in data.items():
            # Handle null values, assume that they will be strings
            if not v:
                column_type = "STRING"
            elif bool in map(type, v):
                column_type = "BOOLEAN"
            elif int in map(type, v):
                column_type = "INTEGER"
            elif float in map(type, v):
                column_type = "DOUBLE"
            else:
                column_type = "STRING"
            cols.append(synapseclient.Column(name=k, columnType=column_type,
                                             enumValues=v, maximumSize=250))
        schema = synapseclient.Schema(name=schema_name, columns=cols,
                                      parent=project)

    if args.dry_run:
        schema_as_list = list(map(dict, schema.columns_to_store))
        new_schema_as_list = []
        _key_order = ['name', 'description', 'columnType', 'maximumSize',
                      'enumValues']
        for col in schema_as_list:
            col['description'] = ""
            col['source'] = ""
            new_enum_values = []
            for v in col['enumValues']:
                new_value_ordered_dict = collections.OrderedDict()
                new_value_ordered_dict['value'] = v
                new_value_ordered_dict['description'] = ""
                new_value_ordered_dict['source'] = ""
                new_enum_values.append(new_value_ordered_dict)
            col['enumValues'] = new_enum_values
            new_ordered_dict = collections.OrderedDict()
            for k in _key_order:
                new_ordered_dict[k] = col[k]
            new_schema_as_list.append(new_ordered_dict)
        print(json.dumps(new_schema_as_list, indent=2))
    else:
        schema = syn.store(schema)
### Current table is NTAP
existing_table = "syn18496443"
rowset = syn.tableQuery("select * from " + existing_table)
syn.delete(rowset)
# table = synapseclient.table.build_table("NTAP Project Information Integration", 'syn4939478', final_df)
### Table takes in the schema and values (here as a dataframe)
table = syn.store(synapseclient.table.Table(existing_table, final_df))

# %%
cols = syn.getTableColumns(existing_table)

# %%
lst = list(cols)

# In[41]:
# table = syn.store(table)
schema = synapseclient.Schema(columns=lst, parent="syn4939478")

# %%
print(schema)

# %%
synapseclient.table.Table(schema, final_df)

# %%
synapseclient.table.Table(
    synapseclient.Schema(columns=lst, parent="syn4939478"),
    final_df)
def main(syn, project, format_registry=None, centers=None):
    # TODO: add PRIMARY_KEY annotation to each of the tables
    # Dangerous to have lists as default values
    if format_registry is None:
        format_registry = ['example_registry']
    if centers is None:
        centers = []

    # Determine the short and long names of the centers.
    center_abbreviations = centers
    center_names = center_abbreviations

    # Create a folder for log files generated by the GENIE processes
    # of validation and updating the database tables
    logs_folder = synapseclient.Folder(name='Logs', parent=project)
    logs_folder = syn.store(logs_folder)

    # Folder for individual center folders
    root_center_folder = synapseclient.Folder(name='Centers', parent=project)
    root_center_folder = syn.store(root_center_folder)

    # The folders for each center where they will upload files for validation
    # and submission. There is one folder per center.
    # This currently deviates from the original GENIE setup of having an
    # 'Input' and 'Staging' folder for each center.
    center_folders = [synapseclient.Folder(name=name, parent=root_center_folder)
                      for name in center_abbreviations]
    center_folders = [syn.store(folder) for folder in center_folders]

    # Make some fake data that only contains basic text to check
    # for validation.
    n_files = 2  # number of files per center to create
    for folder in center_folders:
        for _ in range(n_files):
            tmp = tempfile.NamedTemporaryFile(prefix=f'TEST-{folder.name}',
                                              suffix='.txt')
            with open(tmp.name, mode='w') as file_h:
                file_h.write(random.choice(['ERROR', 'VALID', 'NOPE']))
            syn.store(synapseclient.File(tmp.name, parent=folder))

    # Set up the table that holds the validation status of all submitted
    # files.
    status_schema = create_status_table(syn, project)

    # Set up the table that maps the center abbreviation to the folder where
    # their data is uploaded. This is used by the GENIE framework to find the
    # files to validate for a center.
    center_schema = create_center_map_table(syn, project)

    # Add the center folders created above to this table.
    center_folder_ids = [folder.id for folder in center_folders]
    center_df = pandas.DataFrame(dict(name=center_names,
                                      center=center_abbreviations,
                                      inputSynId=center_folder_ids))
    center_df['release'] = True

    existing_center = syn.tableQuery(f"select * from {center_schema.id}")
    existing_centerdf = existing_center.asDataFrame()
    process_functions.updateDatabase(syn, existing_centerdf, center_df,
                                     center_schema.id, ["center"],
                                     to_delete=True)
    # TODO: Remove centers that aren't part of the list

    # Create a table that stores the error logs for each submitted file.
    error_schema = create_error_tracking_table(syn, project)

    # Create a table that maps the various database tables to a short name.
    # This table is used in many GENIE functions to find the correct table
    # to update or get the state of something from.
    db_map_schema = create_db_mapping_table(syn, project)

    # Add dbMapping annotation
    project.annotations.dbMapping = db_map_schema.id
    project = syn.store(project)

    # Add the tables we already created to the mapping table.
    dbmap_df = pandas.DataFrame(
        dict(Database=['centerMapping', 'validationStatus', 'errorTracker',
                       'dbMapping', 'logs'],
             Id=[center_schema.id, status_schema.id, error_schema.id,
                 db_map_schema.id, logs_folder.id])
    )

    # Make a top level folder for output. Some processing for
    # file types copy a file from one place to another.
    output_folder = synapseclient.Folder(name='Output', parent=project)
    output_folder = syn.store(output_folder)

    output_folder_map = []

    # default_table_col_defs = status_table_col_defs = [
    #     {'name': 'PRIMARY_KEY',
    #      'columnType': 'STRING'}
    # ]
    # default_table_cols = [synapseclient.Column(**col)
    #                       for col in default_table_col_defs]

    default_primary_key = 'PRIMARY_KEY'

    # For each file type format in the format registry, create an output
    # folder and a table.
    # Some GENIE file types copy a file to a new place, and some update a
    # table. Having both means that both of these operations will be
    # available at the beginning.
    # The mapping between the file type and the folder or table have a
    # consistent naming.
    # The key ('Database' value) is {file_type}_folder or {file_type}_table.

    # Determine which file formats are going to be used.
    format_registry = config.collect_format_types(format_registry)

    # Get existing database tables
    existing_dbmap = syn.tableQuery(f"select * from {db_map_schema.id}")
    existing_dbmapdf = existing_dbmap.asDataFrame()

    for file_type, obj in format_registry.items():
        if file_type not in existing_dbmapdf['Database'].tolist():
            file_type_folder = synapseclient.Folder(name=file_type,
                                                    parent=output_folder)
            file_type_folder = syn.store(file_type_folder)
            output_folder_map.append(dict(Database=f"{file_type}_folder",
                                          Id=file_type_folder.id))

            file_type_schema = synapseclient.Schema(name=file_type,
                                                    parent=project)
            # The DCC will have to set the schema and primary key
            # after this is created.
            file_type_schema.annotations.primaryKey = default_primary_key
            file_type_schema = syn.store(file_type_schema)
            output_folder_map.append(dict(Database=file_type,
                                          Id=file_type_schema.id))
        else:
            print("Database already exists")

    # Add the folders and tables created to the mapping table.
    dbmap_df = dbmap_df.append(pandas.DataFrame(output_folder_map))
    process_functions.updateDatabase(syn, existing_dbmapdf, dbmap_df,
                                     db_map_schema.id, ["Database"])
def transferTables(syn, sourceProjId, uploadProjId, extId_Str='',
                   simpleNameFilters=[], healthCodeList=None):
    """This function transfers tables from a source project to the upload
    project (target project) sorted by external Ids which contain extId_Str,
    groups tables with simpleNameFilters, can also filter tables by
    healthcodes, and then groups by activity."""

    # dataframe of all tables using get_tables from
    # synapsebridgehelper.tableHelpers
    all_tables = synapsebridgehelpers.get_tables(syn, sourceProjId,
                                                 simpleNameFilters)

    # Converting externalIds to healthCodes
    if extId_Str != '':
        res = synapsebridgehelpers.externalIds2healthCodes(
            syn, list(all_tables['table.id']))
        res = res[res['externalId'].str.contains(extId_Str)]
        healthCodeList = list(res['healthCode'])

    # List of tables sorted by activity and filtered using healthcodes
    tables_dict = synapsebridgehelpers.filterTablesByActivity(
        syn, all_tables, healthCodes=healthCodeList)

    # Iterate over each activity in tables_dict
    for activity_, activityTableIds in tables_dict.items():
        df_list = []  # list of dataframes corresponding to that activity
        # list of columns that have type FILEHANDLEID across all dataframes
        # for that activity
        cols_filehandleid = []

        # looping over all tables corresponding to that activity
        for table_index in range(0, len(activityTableIds)):
            result = synapsebridgehelpers.tableWithFileIds(
                syn, table_id=activityTableIds[table_index],
                healthcodes=healthCodeList)
            cols_filehandleid = cols_filehandleid + list(
                set(result['cols']) - set(cols_filehandleid))
            df_list.append(result['df'])

        # Concatenating all tables to form one table for the activity
        df_main = pd.concat(df_list)
        cols = synapseclient.as_table_columns(df_main)

        # Change the type of columns that are FILEHANDLEIDs as calculated before
        for col in cols:
            if col.name in cols_filehandleid:
                col.columnType = 'FILEHANDLEID'

        # If different datatypes happen while merging tables this will change
        # the column type in the resulting dataframe. The following code sets
        # it right and casts the data into its original form / a form that
        # syn.store would accept (for FILEHANDLEID type columns, the input
        # needs to be an integer)
        for col in cols:
            if col.columnType == 'STRING':
                df_main[col.name] = [str(item) if item == item else ''
                                     for item in df_main[col.name]]
            elif col.columnType == 'INTEGER':
                df_main[col.name] = [int(item) if item == item else ''
                                     for item in df_main[col.name]]
            elif col.columnType == 'FILEHANDLEID':
                df_main[col.name] = [int(item) if (item != '' and item == item)
                                     else '' for item in df_main[col.name]]
            else:
                df_main[col.name] = [item if item == item else ''
                                     for item in df_main[col.name]]

        # Updating schema and uploading
        schema = synapseclient.Schema(name=activity_, columns=cols,
                                      parent=uploadProjId)
        table = synapseclient.Table(schema, df_main)
        table = syn.store(table)
        table = syn.setProvenance(table.schema.id,
                                  activity=synapseclient.activity.Activity(
                                      used=tables_dict[activity_]))
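# Hypothetical call (not from the source): transfer all tables whose external
# ids contain "EX-" from a source project to a target project. Both project
# ids and the name filters below are placeholders.
transferTables(syn,
               sourceProjId="syn11111111",
               uploadProjId="syn22222222",
               extId_Str="EX-",
               simpleNameFilters=["tapping", "walking"])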