def update_validated_submissions_table(syn, project_id, valid_df):
    """
    Push the latest version of the combined validated submissions 
    table to Synapse.
    """
    try:
        print("Searching for existing 'ValidatedSubmissions' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'ValidatedSubmissions'][0]['id']
        schema = syn.get(schema_id)
        validated_subs_table = syn.tableQuery('select * from {}'.format(schema_id))
        if validated_subs_table.asDataFrame().shape[0] == valid_df.shape[0]:
            print("No new valid submissions since last update.")
            return
        validated_subs_table.schema = schema
        print("Updating 'ValidatedSubmissions' table...")
        update_table = synapseclient.Table(schema, valid_df)
        validated_subs_table = _update_syn_table(validated_subs_table, update_table, 'objectId')
    except IndexError:
        print("Creating 'ValidatedSubmissions' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(valid_df)
        schema = synapseclient.Schema(name='ValidatedSubmissions', columns=cols, parent=project)
        validated_subs_table = synapseclient.Table(schema, valid_df)
    print("Storing 'ValidatedSubmissions' table...")
    validated_subs_table = syn.store(validated_subs_table)
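This function (and update_team_stats_table below) relies on a _update_syn_table helper that is not included in these snippets. A minimal sketch of what such a helper might look like, assuming it merges the freshly queried rows with the update frame on the given key column; the helper body here is an assumption, not the original implementation:

def _update_syn_table(syn_table, update_table, update_key):
    """Merge update_table into syn_table, matching rows on update_key.

    NOTE: assumed implementation -- the original helper is not part of
    these snippets.
    """
    orig_df = syn_table.asDataFrame().set_index(update_key, drop=False)
    update_df = update_table.asDataFrame().set_index(update_key, drop=False)
    # Values from the update frame win; rows present in only one frame are kept.
    merged_df = update_df.combine_first(orig_df).reset_index(drop=True)
    return synapseclient.Table(syn_table.schema, merged_df)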
def upload(directory, synID, synName, dataFrameList):
    """
    Upload the data to a Synapse table

    Input:
        directory: The name of the directory holding the data
        synID: Synapse ID of the project where the table will be stored
        synName: Name to be given to the new table
        dataFrameList: List of dataframes with all of the data

    """

    print("Creating dataframe")
    df = pd.concat(dataFrameList, ignore_index=True)

    # Some values exceed the 1000-character limit for STRING columns;
    # truncate every cell to at most 1000 characters.
    df = df.applymap(lambda x: str(x)[:1000])

    print("Writing to file")
    df.to_csv('%s/allData.csv' % directory, encoding='utf-8', index=False)
    
    print("Uploading to Synapse")
    schema = Schema(name=synName, columns=as_table_columns(df),
                    parent=synID)
    syn.store(Table(schema, df))
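A hypothetical invocation; the directory, project ID, and input files below are placeholders, and an authenticated syn client plus the Schema/Table/as_table_columns imports are assumed to be in scope:

frames = [pd.read_csv(f) for f in ['batch1.csv', 'batch2.csv']]  # placeholder inputs
upload('output_dir', 'syn1234567', 'AllData', frames)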
Example #4
def table(syn, parent, obj):
    df = read(obj)
    cols = synapseclient.as_table_columns(df)
    schema = synapseclient.Schema(name=str(uuid.uuid4()),
                                  columns=cols,
                                  parent=parent)
    schema = syn.store(schema)
    # Store the rows; return the stored schema so callers have the table's Synapse ID.
    syn.store(synapseclient.Table(schema, df))
    return schema
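A hypothetical call; read is an external helper from the original module, and the parent project ID here is a placeholder:

stored_schema = table(syn, 'syn1234567', 'data.csv')
print(stored_schema.id)  # Synapse ID of the newly created table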
def write_synapse_table(table_data,
                        synapse_project_id,
                        table_name='',
                        username='',
                        password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> import os
    >>> import pandas as pd
    >>> from mhealthx.xio import write_synapse_table
    >>> path = os.environ['MHEALTHX_OUTPUT']
    >>> table = os.path.join(path, 'feature_tables',
    ...         'tap_row0_v0_9d44a388-5d7e-4271-8705-2faa66204486.csv')
    >>> table_data = pd.read_csv(table)
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Contents of table'
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse(skip_checks=True)

    # Log in to Synapse:
    if username and password:
        syn.login(username, password, silent=True)
    else:
        syn.login(silent=True)

    #table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
Example #7
def test_tables_pandas(syn, project):
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id,
                             resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_1' % i for i in range(1, 6)]

    df['string_'] = df['string_'].transform(str)

    # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # compare the round-tripped frame with the original
    assert_frame_equal(df2, df)
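The index assignment above mirrors how asDataFrame() labels returned rows as rowId_version; a quick sketch of observing this on the stored table, assuming table from the test is in scope:

res = syn.tableQuery('select * from %s limit 2' % table.schema.id)
print(res.asDataFrame().index)  # e.g. Index(['1_1', '2_1'], dtype='object')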
def update_team_stats_table(syn, project_id, team_stats_df):
    """
    Push the latest version of the team stats table to Synapse.
    """
    try:
        print("Searching for existing 'TeamStats' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'TeamStats'][0]['id']
        schema = syn.get(schema_id)
        team_stats_table = syn.tableQuery('select * from {}'.format(schema_id))
        team_stats_table.schema = schema
        print("Updating 'TeamStats' table...")
        update_table = synapseclient.Table(schema, team_stats_df)
        team_stats_table = _update_syn_table(team_stats_table, update_table, 'team')
    except IndexError:
        print("Creating 'TeamStats' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(team_stats_df)
        schema = synapseclient.Schema(name='TeamStats', columns=cols, parent=project)
        team_stats_table = synapseclient.Table(schema, team_stats_df)
    print("Storing 'TeamStats' table...")
    team_stats_table = syn.store(team_stats_table)
Example #9
def createAMPADTable(keyFile, clinicalFile):
    """
    Create the AMP AD table with merged data from keyFile with clinicalFile.
    If any of the supplementary files exist for a particular dataset, change
    the binary classifiers to the synapse ID holding the data and reset 0
    to null for the table.

    Input:
        keyFile: Dataframe with the keys and information regarding what
            exists for each patient
        clinicalFile: Dataframe with clinical data for various patients

    """

    toUpload = []

    clinicalHeader = clinicalFile.columns.values

    #seenList = []
    # Iterate through each project within keyFile
    for i, row in keyFile.iterrows():
        # Create empty list for new row to be added to synapse table
        newRow = []

        # Ignore binary variables, which all end in '_data'
        for item in row.items():
            if (item[0] == 'niagas_data'):
                if (not pd.isnull(row.niagas_data)):
                    newRow.append(arrayExpressionSynID)
                else:
                    newRow.append(float('nan'))

            elif (not item[0].endswith('_data')):
                newRow.append(item[1])

        # Check if row has clinical data
        if (row.clinical_data):
            # Create reference to clinicalFile project ID
            clinicalKeyList = clinicalFile['projid']

            # get the index of the projID in the clinical file
            index = clinicalKeyList[clinicalKeyList ==
                                    row.projid].index.tolist()

            if (len(index) == 1):
                index = index[0]
                #seenList.append(row.projid)
                for entry in clinicalFile.iloc[index][1:]:
                    newRow.append(entry)

            # If the length of the index list is 0, the key file thinks
            # there is clinical information for this patient but it does
            # not exist in the clinical file
            elif (len(index) == 0):
                print("Key file indicates that projID %s should have "\
                    "clinical information, but it does not exist in "\
                    "the clinical information file" % row.projid)
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))

            # If the length of the index list is greater than 1, projID
            # appears more than once in the file. Warn the user
            else:
                print("projID %s appears more than once in clinical file at "\
                    "positions %s" % (row.projid, index))
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))

        else:
            for _ in range(1, len(clinicalHeader)):
                newRow.append(float('nan'))

        # Check if row has gwas data
        if (row.gwas_data):
            newRow.append(genotypeSynID)
            newRow.append(imputedGenotypeSynID)
        else:
            newRow.append(float('nan'))
            newRow.append(float('nan'))

        if (row.mwas_data):
            newRow.append(methylationSynID)
        else:
            newRow.append(float('nan'))

        if (row.mirna_data):
            newRow.append(mirnaSynID)
        else:
            newRow.append(float('nan'))

        if (row.mrna_data):
            newRow.append(rnaseqSynID)
        else:
            newRow.append(float('nan'))

        toUpload.append(newRow)

    df = pd.DataFrame(toUpload)
    columns = list(keyFile.columns.values)
    index = columns.index('clinical_data') - 1
    columns.remove('clinical_data')

    columns.remove('gwas_data')
    columns.insert(index + 1, 'genotype data')
    columns.insert(index + 2, 'imputed genotype data')

    for i in range(1, len(clinicalHeader)):
        columns.insert(index + i, clinicalHeader[i])

    df.columns = columns

    df.to_csv('mergedTables.csv', encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name='AMP AD Samples Table',
                    columns=as_table_columns(df),
                    parent='syn2580853')
    syn.store(Table(schema, df))
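A hypothetical call; the input file names are placeholders, and the module-level Synapse IDs referenced above (arrayExpressionSynID, genotypeSynID, imputedGenotypeSynID, methylationSynID, mirnaSynID, rnaseqSynID) plus an authenticated syn client are assumed to be defined elsewhere:

keyFile = pd.read_csv('keyFile.csv')
clinicalFile = pd.read_csv('clinicalFile.csv')
createAMPADTable(keyFile, clinicalFile)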
Example #10
def opensmile_features_to_synapse(in_files, synapse_project_id,
                                  table_name, username, password):
    """
    Save openSMILE's SMILExtract audio features to a Synapse table.

    Parameters
    ----------
    in_files : list of strings
        full path to the input files
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    table_name : string
        schema name of table
    synapse_table_id : string
        Synapse table ID

    Examples
    --------
    >>> from mhealthx.features import opensmile_features_to_synapse
    >>> in_files = ['/home/arno/smile/test1.wav.csv','/home/arno/smile/test2.wav.csv','/home/arno/smile/test3.wav.csv']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Phonation openSMILE feature table'
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_table_id = opensmile_features_to_synapse(in_files, synapse_project_id, table_name, username, password)

    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    from mhealthx.io_data import concatenate_tables_to_synapse_table as cat

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Read each input file into a DataFrame; the concatenation helper
    # expects a list of frames:
    frames = [pd.read_csv(in_file) for in_file in in_files]

    table_data, project_id = cat(frames, synapse_project_id, table_name,
                                 username, password)

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))
    synapse_table_id = str(table.tableId)

    return table_data, table_name, synapse_table_id
def transferTables(syn,
                   sourceProjId,
                   uploadProjId,
                   extId_Str='',
                   simpleNameFilters=[],
                   healthCodeList=None):
    """ This function transfers tables from a source project to the upload project (target project) 
    sorted by external Ids which contain extId_Str, group tables with simpleNameFilters, also can filter
    tables by healthcodes and then group by activity"""

    # dataframe of all tables using get_tables from synapsebridgehelper.tableHelpers
    all_tables = synapsebridgehelpers.get_tables(syn, sourceProjId,
                                                 simpleNameFilters)

    # Converting externalIds to healthCodes
    if extId_Str != '':
        res = synapsebridgehelpers.externalIds2healthCodes(
            syn, list(all_tables['table.id']))
        res = res[res['externalId'].str.contains(extId_Str)]
        healthCodeList = list(res['healthCode'])

    # List of tables sorted by activity and filtered using healthcodes
    tables_dict = synapsebridgehelpers.filterTablesByActivity(
        syn, all_tables, healthCodes=healthCodeList)

    # Iterate over each activity in tables_dict
    for activity_, activityTableIds in tables_dict.items():
        df_list = []  # list of dataframes corresponding to that activity
        # columns of type FILEHANDLEID across all dataframes for this activity
        cols_filehandleid = []

        # looping over all tables corresponding to that activity
        for table_index in range(0, len(activityTableIds)):
            result = synapsebridgehelpers.tableWithFileIds(
                syn,
                table_id=activityTableIds[table_index],
                healthcodes=healthCodeList)
            cols_filehandleid = cols_filehandleid + list(
                set(result['cols']) - set(cols_filehandleid))
            df_list.append(result['df'])

        # Concatenating all tables to form one table for the activity
        df_main = pd.concat(df_list)
        cols = synapseclient.as_table_columns(df_main)

        # Change the type of columns that are FILEHANDLEIDs as calculated before
        for col in cols:
            if col.name in cols_filehandleid:
                col.columnType = 'FILEHANDLEID'

        # Merging tables with mixed datatypes can change column dtypes in the
        # resulting dataframe. The casts below restore each column to a form
        # syn.store will accept (FILEHANDLEID columns must hold integers).
        for col in cols:
            if col.columnType == 'STRING':
                df_main[col.name] = [
                    str(item) if item == item else ''
                    for item in df_main[col.name]
                ]
            elif col.columnType == 'INTEGER':
                df_main[col.name] = [
                    int(item) if item == item else ''
                    for item in df_main[col.name]
                ]
            elif col.columnType == 'FILEHANDLEID':
                df_main[col.name] = [
                    int(item) if (item != '' and item == item) else ''
                    for item in df_main[col.name]
                ]
            else:
                df_main[col.name] = [
                    item if item == item else '' for item in df_main[col.name]
                ]

        # Updating schema and uploading
        schema = synapseclient.Schema(name=activity_,
                                      columns=cols,
                                      parent=uploadProjId)
        table = synapseclient.Table(schema, df_main)
        table = syn.store(table)
        table = syn.setProvenance(table.schema.id,
                                  activity=synapseclient.activity.Activity(
                                      used=tables_dict[activity_]))
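A hypothetical invocation; the project IDs and filters below are placeholders, and an authenticated syn client plus the synapsebridgehelpers module are assumed to be in scope:

transferTables(syn, sourceProjId='syn11111111', uploadProjId='syn22222222',
               extId_Str='ABC', simpleNameFilters=['tapping', 'voice'])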
# NOTE: this snippet begins mid-stream; the Entrez.esearch call that produced
# `handle` and the surrounding imports were cut off upstream. Based on the
# names used, it assumes something like:
#   from Bio import Entrez
#   import pandas as pd
#   from synapseclient import Column, Schema, Table, as_table_columns
record = Entrez.read(handle)
datasets = []
for geo_id in record['IdList']:
    handle = Entrez.esummary(db="gds", id=geo_id)
    dataset = Entrez.read(handle)[0]
    del dataset['Samples']
    del dataset['SSInfo']
    for k, v in dataset.items():
        try:
            # truncate long string values to fit Synapse STRING column limits
            if len(v) > 1000:
                dataset[k] = v[:999]
        except TypeError:
            pass
    datasets.append(dataset)

df = pd.DataFrame(datasets)
df.drop(['ExtRelations', 'Projects', 'Relations'], axis=1, inplace=True)

ftpLink = Column(columnType='LINK', maximumSize=84, name='FTPLink')
columns = [ftpLink if col['name'] == 'FTPLink' else col for col in as_table_columns(df)]


schema = Schema(name='GEO Datasets', columns=columns, parent='syn4012977')

df.to_csv('skit.csv', encoding='utf-8', index=False)

table = syn.store(Table(schema, 'skit.csv'))
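To spot-check the upload, one might query the new table back; a minimal sketch, assuming the store above succeeded:

results = syn.tableQuery('select * from %s' % table.tableId)
print(results.asDataFrame().head())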