def update_validated_submissions_table(syn, project_id, valid_df):
    """
    Push the latest version of the combined validated submissions table
    to Synapse.
    """
    try:
        print("Searching for existing 'ValidatedSubmissions' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'ValidatedSubmissions'][0]['id']
        schema = syn.get(schema_id)
        validated_subs_table = syn.tableQuery('select * from {}'.format(schema_id))
        if validated_subs_table.asDataFrame().shape[0] == valid_df.shape[0]:
            print("No new valid submissions since last update.")
        validated_subs_table.schema = schema
        print("Updating 'ValidatedSubmissions' table...")
        update_table = synapseclient.Table(schema, valid_df)
        validated_subs_table = _update_syn_table(validated_subs_table, update_table, 'objectId')
    except IndexError:
        print("Creating 'ValidatedSubmissions' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(valid_df)
        schema = synapseclient.Schema(name='ValidatedSubmissions', columns=cols, parent=project)
        validated_subs_table = synapseclient.Table(schema, valid_df)
    print("Storing 'ValidatedSubmissions' table...")
    validated_subs_table = syn.store(validated_subs_table)
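# Both update functions in this collection ('ValidatedSubmissions' above and
# 'TeamStats' below) rely on a private helper `_update_syn_table` that is not
# included here. The following is a hypothetical minimal sketch of such a
# helper, assuming it overwrites rows whose key matches and appends the rest;
# it is not the original implementation.
def _update_syn_table(syn_table, update_table, update_key):
    """Update `syn_table` with rows from `update_table`, matching on `update_key`."""
    table_df = syn_table.asDataFrame().set_index(update_key, drop=False)
    update_df = update_table.asDataFrame().set_index(update_key, drop=False)
    for key in update_df.index:
        # Overwrite the existing row, or append the row if the key is new
        table_df.loc[key] = update_df.loc[key]
    return synapseclient.Table(syn_table.schema, table_df.reset_index(drop=True))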
def upload(directory, synID, synName, dataFrameList):
    """
    Upload the data to a Synapse table

    Input:
        directory: The name of the directory holding the data
        synID: Synapse ID of the project where the table will be stored
        synName: Name to be given to the new table
        dataFrameList: List of dataframes with all of the data
    """
    print("Creating dataframe")
    # Combine the per-file dataframes into a single dataframe
    df = pd.concat(dataFrameList, ignore_index=True)
    # Some column values are longer than 1000 characters;
    # cut every cell down to 1000 chars max
    df = df.applymap(lambda x: str(x)[:1000])
    print("Writing to file")
    df.to_csv('%s/allData.csv' % directory, encoding='utf-8', index=False)
    print("Uploading to Synapse")
    schema = Schema(name=synName, columns=as_table_columns(df), parent=synID)
    syn.store(Table(schema, df))
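# Hypothetical usage of `upload`, assuming a logged-in Synapse client bound to
# the module-level name `syn`; the directory, project ID, and dataframes below
# are placeholders:
df1 = pd.DataFrame({'recordId': [1, 2], 'notes': ['a', 'b']})
df2 = pd.DataFrame({'recordId': [3, 4], 'notes': ['c', 'd']})
upload('/tmp/data', 'syn1234567', 'AllData', [df1, df2])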
def table(syn, parent, obj):
    # `read` is assumed to be defined elsewhere and to load `obj`
    # into a pandas DataFrame
    df = read(obj)
    cols = synapseclient.as_table_columns(df)
    schema = synapseclient.Schema(name=str(uuid.uuid4()), columns=cols, parent=parent)
    schema = syn.store(schema)
    syn.store(synapseclient.Table(schema, df))
    return schema
def write_synapse_table(table_data, synapse_project_id, table_name='',
                        username='', password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> import os
    >>> import pandas as pd
    >>> from mhealthx.xio import write_synapse_table
    >>> path = os.environ['MHEALTHX_OUTPUT']
    >>> table = os.path.join(path, 'feature_tables',
    ...     'tap_row0_v0_9d44a388-5d7e-4271-8705-2faa66204486.csv')
    >>> table_data = pd.read_csv(table)
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Contents of table'
    >>> write_synapse_table(table_data, synapse_project_id, table_name,
    ...                     username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse(skip_checks=True)

    # Log in to Synapse:
    if username and password:
        syn.login(username, password, silent=True)
    else:
        syn.login(silent=True)

    #table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id, includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
def test_tables_pandas(syn, project):
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id, resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_1' % i for i in range(1, 6)]
    df['string_'] = df['string_'].transform(str)  # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # assert_frame_equal verifies that the retrieved frame matches the
    # original column by column, including values and dtypes
    assert_frame_equal(df2, df)
def update_team_stats_table(syn, project_id, team_stats_df):
    """
    Push the latest version of the team stats table to Synapse.
    """
    try:
        print("Searching for existing 'TeamStats' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'TeamStats'][0]['id']
        schema = syn.get(schema_id)
        team_stats_table = syn.tableQuery('select * from {}'.format(schema_id))
        team_stats_table.schema = schema
        print("Updating 'TeamStats' table...")
        update_table = synapseclient.Table(schema, team_stats_df)
        team_stats_table = _update_syn_table(team_stats_table, update_table, 'team')
    except IndexError:
        print("Creating 'TeamStats' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(team_stats_df)
        schema = synapseclient.Schema(name='TeamStats', columns=cols, parent=project)
        team_stats_table = synapseclient.Table(schema, team_stats_df)
    print("Storing 'TeamStats' table...")
    team_stats_table = syn.store(team_stats_table)
def createAMPADTable(keyFile, clinicalFile):
    """
    Create the AMP AD table with merged data from keyFile and clinicalFile.
    If any of the supplementary files exist for a particular dataset, change
    the binary classifiers to the Synapse ID holding the data and reset 0 to
    null for the table.

    Input:
        keyFile: Dataframe with the keys and information regarding what
            exists for each patient
        clinicalFile: Dataframe with clinical data for various patients
    """
    toUpload = []
    clinicalHeader = clinicalFile.columns.values
    # Iterate through each project within keyFile
    for i, row in keyFile.iterrows():
        # Create empty list for new row to be added to the Synapse table
        newRow = []
        # Ignore binary variables, which all end in '_data'
        # (the *SynID values are module-level constants defined elsewhere)
        for item in row.items():
            if item[0] == 'niagas_data':
                if not pd.isnull(row.niagas_data):
                    newRow.append(arrayExpressionSynID)
                else:
                    newRow.append(float('nan'))
            elif not item[0].endswith('_data'):
                newRow.append(item[1])
        # Check if row has clinical data
        if row.clinical_data:
            # Create reference to clinicalFile project ID
            clinicalKeyList = clinicalFile['projid']
            # Get the index of the projid in the clinical file
            index = clinicalKeyList[clinicalKeyList == row.projid].index.tolist()
            if len(index) == 1:
                index = index[0]
                for entry in clinicalFile.iloc[index][1:]:
                    newRow.append(entry)
            # If the length of the index is 0, the key file thinks there is
            # clinical information for this patient, but it does not exist
            # in the clinical file
            elif len(index) == 0:
                print("Key file indicates that projid %s should have "
                      "clinical information, but it does not exist in "
                      "the clinical information file" % row.projid)
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))
            # If the length of the index list is greater than 1, projid
            # appears more than once in the file. Warn the user.
            else:
                print("projid %s appears more than once in clinical file at "
                      "positions %s" % (row.projid, index))
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))
        else:
            for _ in range(1, len(clinicalHeader)):
                newRow.append(float('nan'))
        # Check if row has GWAS data
        if row.gwas_data:
            newRow.append(genotypeSynID)
            newRow.append(imputedGenotypeSynID)
        else:
            newRow.append(float('nan'))
            newRow.append(float('nan'))
        if row.mwas_data:
            newRow.append(methylationSynID)
        else:
            newRow.append(float('nan'))
        if row.mirna_data:
            newRow.append(mirnaSynID)
        else:
            newRow.append(float('nan'))
        if row.mrna_data:
            newRow.append(rnaseqSynID)
        else:
            newRow.append(float('nan'))
        toUpload.append(newRow)

    df = pd.DataFrame(toUpload)
    columns = list(keyFile.columns.values)
    index = columns.index('clinical_data') - 1
    columns.remove('clinical_data')
    columns.remove('gwas_data')
    columns.insert(index + 1, 'genotype data')
    columns.insert(index + 2, 'imputed genotype data')
    for i in range(1, len(clinicalHeader)):
        columns.insert(index + i, clinicalHeader[i])
    df.columns = columns
    df.to_csv('mergedTables.csv', encoding='utf-8', index=False)
    print("Uploading to Synapse")
    schema = Schema(name='AMP AD Samples Table', columns=as_table_columns(df),
                    parent='syn2580853')
    syn.store(Table(schema, df))
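# Hypothetical invocation of `createAMPADTable`, assuming the key and clinical
# tables have been read into dataframes and the module-level *SynID constants
# are defined; both file names are placeholders:
keyFile = pd.read_csv('ampad_key.csv')
clinicalFile = pd.read_csv('ampad_clinical.csv')
createAMPADTable(keyFile, clinicalFile)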
def opensmile_features_to_synapse(in_files, synapse_project_id, table_name,
                                  username, password):
    """
    Save openSMILE's SMILExtract audio features to a Synapse table.

    Parameters
    ----------
    in_files : list of strings
        full path to the input files
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    table_name : string
        schema name of table
    synapse_table_id : string
        Synapse table ID

    Examples
    --------
    >>> from mhealthx.features import opensmile_features_to_synapse
    >>> in_files = ['/home/arno/smile/test1.wav.csv',
    ...             '/home/arno/smile/test2.wav.csv',
    ...             '/home/arno/smile/test3.wav.csv']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Phonation openSMILE feature table'
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_table_id = opensmile_features_to_synapse(
    ...     in_files, synapse_project_id, table_name, username, password)

    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns
    from mhealthx.io_data import concatenate_tables_to_synapse_table as cat

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Read each feature file into a dataframe, then concatenate the frames
    # into a single table (one input file per row group):
    frames = [pd.read_csv(in_file) for in_file in in_files]
    table_data, project_id = cat(frames, synapse_project_id, table_name,
                                 username, password)

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))
    synapse_table_id = str(table.tableId)

    return table_data, table_name, synapse_table_id
def transferTables(syn, sourceProjId, uploadProjId, extId_Str='',
                   simpleNameFilters=[], healthCodeList=None):
    """
    Transfer tables from a source project to an upload (target) project.
    Tables are selected by external IDs that contain extId_Str, grouped
    using simpleNameFilters, optionally filtered by health codes, and then
    grouped by activity.
    """
    # Dataframe of all tables, via synapsebridgehelpers.get_tables
    all_tables = synapsebridgehelpers.get_tables(syn, sourceProjId, simpleNameFilters)

    # Convert external IDs to health codes
    if extId_Str != '':
        res = synapsebridgehelpers.externalIds2healthCodes(
            syn, list(all_tables['table.id']))
        res = res[res['externalId'].str.contains(extId_Str)]
        healthCodeList = list(res['healthCode'])

    # Tables sorted by activity and filtered using health codes
    tables_dict = synapsebridgehelpers.filterTablesByActivity(
        syn, all_tables, healthCodes=healthCodeList)

    # Iterate over each activity in tables_dict
    for activity_, activityTableIds in tables_dict.items():
        df_list = []  # dataframes corresponding to this activity
        cols_filehandleid = []  # columns of type FILEHANDLEID across all dataframes for this activity

        # Loop over all tables corresponding to this activity
        for table_index in range(len(activityTableIds)):
            result = synapsebridgehelpers.tableWithFileIds(
                syn, table_id=activityTableIds[table_index],
                healthcodes=healthCodeList)
            cols_filehandleid = cols_filehandleid + list(
                set(result['cols']) - set(cols_filehandleid))
            df_list.append(result['df'])

        # Concatenate all tables to form one table for the activity
        df_main = pd.concat(df_list)
        cols = synapseclient.as_table_columns(df_main)

        # Change the type of columns that are FILEHANDLEIDs, as determined above
        for col in cols:
            if col.name in cols_filehandleid:
                col.columnType = 'FILEHANDLEID'

        # Merging tables with mixed datatypes can change column types in the
        # resulting dataframe. The following casts the data back into a form
        # that syn.store will accept (FILEHANDLEID columns need integers).
        # Note: `item == item` is False only for NaN, so it filters out
        # missing values.
        for col in cols:
            if col.columnType == 'STRING':
                df_main[col.name] = [str(item) if item == item else ''
                                     for item in df_main[col.name]]
            elif col.columnType == 'INTEGER':
                df_main[col.name] = [int(item) if item == item else ''
                                     for item in df_main[col.name]]
            elif col.columnType == 'FILEHANDLEID':
                df_main[col.name] = [int(item) if (item != '' and item == item)
                                     else '' for item in df_main[col.name]]
            else:
                df_main[col.name] = [item if item == item else ''
                                     for item in df_main[col.name]]

        # Update the schema and upload
        schema = synapseclient.Schema(name=activity_, columns=cols,
                                      parent=uploadProjId)
        table = synapseclient.Table(schema, df_main)
        table = syn.store(table)
        table = syn.setProvenance(table.schema.id,
                                  activity=synapseclient.activity.Activity(
                                      used=tables_dict[activity_]))
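# Hypothetical invocation of `transferTables`: copy every table whose external
# IDs contain 'STUDY-A' from a source project to a target project; the project
# IDs and filter strings below are placeholders:
transferTables(syn, sourceProjId='syn11111111', uploadProjId='syn22222222',
               extId_Str='STUDY-A', simpleNameFilters=['tapping', 'voice'])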
# `handle` comes from an earlier Entrez query (not shown)
record = Entrez.read(handle)

datasets = []
for geo_id in record['IdList']:
    handle = Entrez.esummary(db="gds", id=geo_id)
    dataset = Entrez.read(handle)[0]
    del dataset['Samples']
    del dataset['SSInfo']
    # Truncate any value longer than 1000 characters; non-sized values
    # (ints, etc.) raise TypeError on len() and are left as-is
    for k, v in dataset.items():
        try:
            if len(v) > 1000:
                dataset[k] = v[:999]
        except TypeError:
            pass
    datasets.append(dataset)

df = pd.DataFrame(datasets)
df.drop(['ExtRelations', 'Projects', 'Relations'], axis=1, inplace=True)

# Replace the auto-generated column for FTPLink with an explicit LINK column
ftpLink = Column(columnType='LINK', maximumSize=84, name='FTPLink')
columns = [ftpLink if col['name'] == 'FTPLink' else col
           for col in as_table_columns(df)]

schema = Schema(name='GEO Datasets', columns=columns, parent='syn4012977')
df.to_csv('skit.csv', encoding='utf-8', index=False)
table = syn.store(Table(schema, 'skit.csv'))
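# As a sanity check, the stored table can be queried back; this assumes the
# store above succeeded and reuses the `tableId` attribute seen earlier:
results = syn.tableQuery('select * from %s' % table.tableId)
print(results.asDataFrame().head())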