Example #1
def test_synapse_integer_columns_with_missing_values_from_dataframe(
        syn, project, schedule_for_cleanup):
    # SYNPY-267
    cols = [
        Column(name='x', columnType='STRING'),
        Column(name='y', columnType='INTEGER'),
        Column(name='z', columnType='DOUBLE')
    ]
    schema = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    line_terminator = str(os.linesep)
    # write rows to CSV file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv",
                                     delete=False) as temp:
        schedule_for_cleanup(temp.name)
        # 2nd row is missing a value in its integer column
        temp.write('x,y,z' + line_terminator + 'a,1,0.9' + line_terminator +
                   'b,,0.8' + line_terminator + 'c,3,0.7' + line_terminator)
        temp.flush()
        filename = temp.name

    # create a table from csv
    table = Table(schema, filename)
    df = table.asDataFrame()

    table_from_dataframe = Table(schema, df)
    assert table.filepath != table_from_dataframe.filepath
    df2 = table_from_dataframe.asDataFrame()
    assert_frame_equal(df, df2)
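
A minimal companion sketch (not part of the original test) showing why SYNPY-267 matters: when pandas reads the CSV above, the missing value forces the INTEGER column 'y' to float64 with NaN, and that is the dataframe the Table round-trip has to preserve. Assumes only pandas and the standard library.

import io
import pandas as pd

csv_text = "x,y,z\na,1,0.9\nb,,0.8\nc,3,0.7\n"
df = pd.read_csv(io.StringIO(csv_text))
# the missing value in row 'b' turns the integer column into float64 with NaN
print(df['y'].dtype)          # float64
print(df['y'].isna().sum())   # 1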
Example #2
def updateDatabase(database, new_dataset, databaseSynId, checkBy):
	"""
	Updates a Synapse table, matched by a row identifier, with another dataset that has the same number and order of columns

	:param database:       The Synapse table (pandas dataframe)
	:param new_dataset:    New dataset (pandas dataframe)
	:param databaseSynId:  Synapse ID of the database table
	:param checkBy:        Column to compare both datasets by

	:returns:              None
	"""
	updatedSet = database.apply(lambda x: _updateRows(x, new_dataset, checkBy),axis=1)
	updatedSet = updatedSet[~updatedSet[checkBy].isnull()]
	#All new rows
	newSet = new_dataset[~new_dataset[checkBy].isin(database[checkBy])]
	#All deleted rows (This assumes that all data that don't show up in the new uploaded data should be deleted...)
	deleteSets = database[~database[checkBy].isin(new_dataset[checkBy])]
	print(updatedSet.empty)
	print(newSet.empty)
	print(deleteSets.empty)
	if not deleteSets.empty:
		deleteRows = syn.delete(Table(syn.get(databaseSynId), deleteSets))
	else:
		print("No deleted rows")
	#updatedSet = updatedSet.append(newSet)
	if not updatedSet.empty:
		table = syn.store(Table(syn.get(databaseSynId), updatedSet))	
	else:
		print("No updated rows")
	if not newSet.empty:
		table = syn.store(Table(syn.get(databaseSynId), newSet))	
	else:
		print("No new rows")
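
A hedged usage sketch for the function above, assuming a module-level Synapse client named syn (as the function itself does); the table ID, file name, and key column are placeholders.

import pandas as pd

current = syn.tableQuery("select * from syn123").asDataFrame()   # placeholder table ID
incoming = pd.read_csv("new_release.csv")                        # placeholder file with the same columns
updateDatabase(current, incoming, databaseSynId="syn123", checkBy="sample_id")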
def process_new_table(args, syn):
    """
    Function: process_new_table

    Purpose: Create an annotations table with the specified name under the
             specified Synapse parent ID using the specified JSON schema. This
             function is called when the "new_table" option is specified when
             the program is called.

    Arguments: JSON schema file reference
               Synapse parent ID
               Synapse table name
               A Synapse client object
    """

    # Define column names for the synapse table.
    dcc_column_names = [
        Column(name="key", columnType="STRING", maximumSize=100),
        Column(name="description", columnType="STRING", maximumSize=250),
        Column(name="columnType", columnType="STRING", maximumSize=50),
        Column(name="maximumSize", columnType="DOUBLE"),
        Column(name="value", columnType="STRING", maximumSize=250),
        Column(name="valueDescription", columnType="LARGETEXT"),
        Column(name="source", columnType="STRING", maximumSize=250),
        Column(name="module", columnType="STRING", maximumSize=100)
    ]

    syn_table_df = process_schema(args.json_schema_file)

    # Build and populate the Synapse table.
    table_schema = Schema(name=args.synapse_table_name,
                          columns=dcc_column_names,
                          parent=args.parent_synapse_id)
    dcc_table = syn.store(Table(table_schema, syn_table_df))
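
A usage sketch, assuming the function above is driven by argparse; the schema path, parent ID, and table name are placeholders.

from argparse import Namespace

args = Namespace(json_schema_file="annotations_schema.json",   # placeholder path
                 parent_synapse_id="syn000000",                # placeholder project ID
                 synapse_table_name="DCC Annotations")
process_new_table(args, syn)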
Example #4
    def __syn_store(self, data):
        """store data to Synapse

        Notes: Synapse frequently encounters SSL and other connection errors. This method will retry the push however
        many times are defined in the application config setting SYNAPSE_RETRIES. Sleeps three seconds between attempts

        Args:
            data: (dict) should match SYN_SCHEMA defined above

        Returns:
            None
        """
        retries = secrets.SYNAPSE_RETRIES

        while retries > 0:
            try:
                syn.store(Table(SYN_SCHEMA, data))
                break
            except (SSLError, SynapseHTTPError):
                # transient connection error; retry after a short pause
                retries -= 1
                time.sleep(3)
            except Exception as e:
                add_log_entry(
                    f'consent failed to push to Synapse with <{str(e)}>',
                    self.internal_id)
                break
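
The same retry idea written as a small standalone helper, sketched here for comparison; SSLError and SynapseHTTPError are assumed to be imported as in the method above, and the retry count and pause mirror its settings.

import time

def store_with_retries(store_call, retries=3, wait_seconds=3):
    """Run store_call(); retry transient connection errors, re-raise anything else."""
    for attempt in range(retries):
        try:
            return store_call()
        except (SSLError, SynapseHTTPError):
            if attempt == retries - 1:
                raise
            time.sleep(wait_seconds)

# e.g. store_with_retries(lambda: syn.store(Table(SYN_SCHEMA, data)))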
Example #5
def _copyTable(syn, entity, destinationId, updateExisting=False):
    """
    Copies synapse Tables

    :param entity:          A Synapse ID of a Table Schema

    :param destinationId:   Synapse ID of the project that the Table will be copied to

    :param updateExisting:  Whether to update existing entities that have the same name.
                            Defaults to False
    """

    print("Getting table %s" % entity)
    myTableSchema = syn.get(entity)
    # CHECK: If Table name already exists, raise value error
    existingEntity = syn.findEntityId(myTableSchema.name, parent=destinationId)
    if existingEntity is not None:
        raise ValueError(
            'An entity named "%s" already exists in this location. Table could not be copied'
            % myTableSchema.name)

    d = syn.tableQuery('select * from %s' % myTableSchema.id,
                       includeRowIdAndRowVersion=False)

    colIds = myTableSchema.columnIds

    newTableSchema = Schema(name=myTableSchema.name,
                            parent=destinationId,
                            columns=colIds)

    print("Created new table using schema %s" % newTableSchema.name)
    newTable = Table(schema=newTableSchema, values=d.filepath)
    newTable = syn.store(newTable)
    return newTable.schema.id
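
A hedged call sketch for the helper above; both Synapse IDs are placeholders, and the source is assumed to be a table schema the caller can read.

new_schema_id = _copyTable(syn, entity="syn1111111", destinationId="syn2222222")
print("Copied table schema:", new_schema_id)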
Example #6
def test_tables_csv(syn, project):

    # Define schema
    cols = [
        Column(name='Name', columnType='STRING'),
        Column(name='Born', columnType='INTEGER'),
        Column(name='Hipness', columnType='DOUBLE'),
        Column(name='Living', columnType='BOOLEAN')
    ]

    schema = Schema(name='Jazz Guys', columns=cols, parent=project)

    data = [["John Coltrane", 1926, 8.65, False],
            ["Miles Davis", 1926, 9.87, False],
            ["Bill Evans", 1929, 7.65, False],
            ["Paul Chambers", 1935, 5.14, False],
            ["Jimmy Cobb", 1929, 5.78, True],
            ["Scott LaFaro", 1936, 4.21, False],
            ["Sonny Rollins", 1930, 8.99, True],
            ["Kenny Burrel", 1931, 4.37, True]]

    # the following creates a CSV file and uploads it to create a new table
    table = syn.store(Table(schema, data))

    # Query and download an identical CSV
    results = syn.tableQuery("select * from %s" % table.schema.id,
                             resultsAs="csv",
                             includeRowIdAndRowVersion=False)

    # Test that CSV file came back as expected
    for expected_row, row in zip(data, results):
        assert expected_row == row, "expected %s but got %s" % (expected_row,
                                                                row)
def add_new_rows_to_table(df, replace_table=False, dry_run=False):
    """Add rows for synapse IDs not already represented in the table or replace the whole table"""
    schema = syn.get(TABLE_SYNAPSE_ID)
    if replace_table:
        ## delete previous entries in pilot-63-progress table
        results = syn.tableQuery('select * from %s' % utils.id_of(schema),
                                 resultsAs='rowset')
        if not dry_run:
            syn.delete(results)
    else:
        results = syn.tableQuery('select synapse_id from %s' %
                                 utils.id_of(schema),
                                 includeRowIdAndRowVersion=False)
        synapse_ids = [row[0] for row in results]
        df = df[[
            synapse_id not in synapse_ids for synapse_id in df['synapse_id']
        ]]

    if df.shape[0] > 0:
        if dry_run:
            print("Dry run: would have added %d rows to pilot-63-progress table"
                  % df.shape[0])
        else:
            print("Adding %d rows to pilot-63-progress table" % df.shape[0])
            syn.store(Table(schema, df))
        return df.shape[0]
    else:
        print("No new rows for pilot-63-progress table")
        return None
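
A hedged usage sketch; the dataframe columns and the dry-run flag are illustrative, and the function relies on the module-level syn, utils, and TABLE_SYNAPSE_ID shown above.

import pandas as pd

df = pd.DataFrame({"synapse_id": ["syn100", "syn101"],      # placeholder IDs
                   "status": ["complete", "in progress"]})  # placeholder progress column
n_added = add_new_rows_to_table(df, replace_table=False, dry_run=True)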
Example #8
def upload(directory, synID, synName, dataFrameList):
    """
    Upload the data to a Synapse table

    Input:
        directory: The name of the directory holding the data
        synID: Synapse ID of the project where the table will be stored
        synName: Name to be given to the new table
        dataFrameList: List of dataframes with all of the data

    """

    print("Creating dataframe")
    # DataFrame.append was removed in pandas 2.0; concatenate all frames at once
    df = pd.concat(dataFrameList, ignore_index=True)

    # Some values are longer than 1000 characters;
    # truncate every value to 1000 characters max
    df = df.applymap(lambda x: str(x)[:1000])

    print("Writing to file")
    df.to_csv('%s/allData.csv' % directory, encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name=synName, columns=as_table_columns(df), parent=synID)
    syn.store(Table(schema, df))
def write_synapse_table(table_data,
                        synapse_project_id,
                        table_name='',
                        username='',
                        password=''):
    """
    Write data to a Synapse table.

    Parameters
    ----------
    table_data : Pandas DataFrame
        Synapse table contents
    synapse_project_id : string
        Synapse ID for project within which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Examples
    --------
    >>> import os
    >>> import pandas as pd
    >>> from mhealthx.xio import write_synapse_table
    >>> path = os.environ['MHEALTHX_OUTPUT']
    >>> table = os.path.join(path, 'feature_tables',
    ...         'tap_row0_v0_9d44a388-5d7e-4271-8705-2faa66204486.csv')
    >>> table_data = pd.read_csv(table)
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Contents of table'
    >>> write_synapse_table(table_data, synapse_project_id, table_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    syn = synapseclient.Synapse(skip_checks=True)

    # Log in to Synapse:
    if username and password:
        syn.login(username, password, silent=True)
    else:
        syn.login(silent=True)

    #table_data.index = range(table_data.shape[0])

    schema = Schema(name=table_name,
                    columns=as_table_columns(table_data),
                    parent=synapse_project_id,
                    includeRowIdAndRowVersion=False)

    syn.store(Table(schema, table_data))
Example #10
def test_store_table_datetime(syn, project):
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')],
               project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % utils.id_of(schema),
                                  resultsAs="rowset")
    assert current_datetime == query_result.rowset['rows'][0]['values'][0]
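
A short note on the rounding in the test above: Synapse DATE columns hold epoch milliseconds, so the current time is truncated to millisecond precision before storing; a standard-library-only sketch of that step.

import time
from datetime import datetime

now = round(time.time(), 3)            # truncate to millisecond precision, as the test does
stored = datetime.fromtimestamp(now)
print(stored)                          # microseconds are a whole number of milliseconds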
Example #11
def build_synapse_table():
    """build the table in Synapse to match the schema defined above"""
    table = Table(SYN_SCHEMA, values=[BLANK_CONSENT])
    table = syn.store(table)

    results = syn.tableQuery("select * from %s where study_id = '%s'" %
                             (table.tableId, BLANK_CONSENT[0]))
    syn.delete(results)

    syn.setProvenance(entity=table.tableId,
                      activity=synapseclient.Activity(
                          name='Created',
                          description='Table generated by gTap.'))
Example #12
def test_tables_pandas(syn, project):
    # create a pandas DataFrame
    df = pd.DataFrame({
        'A': ("foo", "bar", "baz", "qux", "asdf"),
        'B': tuple(0.42 * i for i in range(5)),
        'C': (101, 202, 303, 404, 505),
        'D': (False, True, False, True, False),
        # additional data types supported since SYNPY-347
        'int64': tuple(np.int64(range(5))),
        'datetime64': tuple(np.datetime64(d) for d in
                            ['2005-02-01', '2005-02-02', '2005-02-03',
                             '2005-02-04', '2005-02-05']),
        'string_': tuple(np.string_(s) for s in
                         ['urgot', 'has', 'dark', 'mysterious', 'past'])
    })

    cols = as_table_columns(df)
    cols[0].maximumSize = 20
    schema = Schema(name="Nifty Table", columns=cols, parent=project)

    # store in Synapse
    table = syn.store(Table(schema, df))

    # retrieve the table and verify
    results = syn.tableQuery('select * from %s' % table.schema.id,
                             resultsAs='csv')
    df2 = results.asDataFrame(convert_to_datetime=True)

    # simulate rowId-version rownames for comparison
    df.index = ['%s_1' % i for i in range(1, 6)]

    df['string_'] = df['string_'].transform(str)

    # SYNPY-717
    df['datetime64'] = df['datetime64'].apply(
        lambda x: pd.Timestamp(x).tz_localize('UTC'))

    # df2 == df gives Dataframe of boolean values; first .all() gives a Series object of ANDed booleans of each column;
    # second .all() gives a bool that is ANDed value of that Series

    assert_frame_equal(df2, df)
def process_overwrite_table(args, syn):
    """
    Function: process_overwrite_table

    Purpose: Overwrite the specified annotations table with data contained in
             the specified JSON schema. This function is called when the
             "overwrite_table" option is specified when the program is called.

    Arguments: JSON schema file reference
               Synapse ID of the table to be overwritten
               A Synapse client object
    """

    syn_table_df = process_schema(args.json_schema_file)

    # Delete the old records from the Synapse table and then write out the
    # new ones.
    dcc_val_table = syn.get(args.table_synapse_id)
    results = syn.tableQuery(f"select * from {dcc_val_table.id}")
    delete_out = syn.delete(results)

    table_out = syn.store(Table(dcc_val_table.id, syn_table_df))
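
A usage sketch mirroring the one for process_new_table above; the schema path and table ID are placeholders.

from argparse import Namespace

args = Namespace(json_schema_file="annotations_schema.json",   # placeholder path
                 table_synapse_id="syn000001")                 # placeholder table ID
process_overwrite_table(args, syn)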
Example #14
def countAndUpdateTable(input, tableId):
    i, fileMeta = input
    print('updating table: %s' % tableId,
          'with file %s (%s)' % (fileMeta['name'], fileMeta.id),
          fileMeta['basename'])
    ent = syn.get(fileMeta.id)
    if fileMeta.fileType == 'bed5':
        data = pd.read_csv(ent.path, sep='\t')
        nFeatures = 0
        samples = list(set(data.Sample.dropna()))
    else: #All other fileTypes
        data = pd.read_csv(ent.path, sep='\t', index_col=0)
        nFeatures, nSamples = data.shape
        samples = data.columns
    metadata = pd.DataFrame([fileMeta]*len(samples))
    metadata['nFeatures'] = nFeatures
    metadata['samples'] = samples
    metadata['patient_barcode'] = [x[:12] for x in metadata.samples]
    metadata.drop(['tissue', u'md5', u'assembly'], axis=1, inplace=True)
    metadata.nFeatures = metadata.nFeatures.astype('int')
    cols = syn.tableQuery('select * from %s limit 1' % tableId).asDataFrame().columns

    #Update rows in table
    print('adding', metadata.shape[0])
    t = syn.store(Table(tableId, metadata[cols]))
    return metadata
Example #15
def update_global_scores_table(global_data):
    import challenge_config as config
    from synapseclient import Schema, Column, Table, Row, RowSet, as_table_columns
    # 'principalId', 'name', 'score_lb', 'score_mean', 'score_ub', 'rank'
    cols = [
        Column(name='UserID', columnType='STRING', maximumSize=100),
        Column(name='Name', columnType='STRING', maximumSize=100),
        Column(name='score_lb', columnType='DOUBLE'),
        Column(name='score_mean', columnType='DOUBLE'),
        Column(name='score_ub', columnType='DOUBLE'),
        Column(name='rank', columnType='DOUBLE'),
    ]
    schema = Schema(name='Global Scores',
                    columns=cols,
                    parent=config.CHALLENGE_SYN_ID)

    results = syn.tableQuery("select * from {}".format('syn7237020'))
    if len(results) > 0:
        a = syn.delete(results.asRowSet())
    table = syn.store(Table(schema, global_data))
    results = syn.tableQuery("select * from {}".format(table.tableId))
    for row in results:
        print(row)
    return
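
A hedged call sketch; the row values are purely illustrative and must match the column order defined in the schema above (UserID, Name, score_lb, score_mean, score_ub, rank). Assumes the module-level syn client and challenge_config used by the function.

global_data = [
    ["3324230", "Example User", 0.61, 0.64, 0.67, 1.0],   # illustrative values only
    ["3324231", "Another User", 0.55, 0.58, 0.61, 2.0],
]
update_global_scores_table(global_data)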
Example #16
def test_create_and_update_file_view(syn, project, schedule_for_cleanup):

    # Create a folder
    folder = Folder(str(uuid.uuid4()),
                    parent=project,
                    description='creating a file-view')
    folder = syn.store(folder)

    # Create dummy file with annotations in our folder
    path = utils.make_bogus_data_file()
    file_annotations = dict(fileFormat='jpg',
                            dataType='image',
                            artist='Banksy',
                            medium='print',
                            title='Girl With Ballon')
    schedule_for_cleanup(path)
    a_file = File(path, parent=folder, annotations=file_annotations)
    a_file = syn.store(a_file)
    schedule_for_cleanup(a_file)

    # Add new columns for the annotations on this file and get their IDs
    my_added_cols = [
        syn.store(Column(name=k, columnType="STRING"))
        for k in file_annotations.keys()
    ]
    my_added_cols_ids = [c['id'] for c in my_added_cols]
    view_default_ids = [
        c['id'] for c in syn._get_default_view_columns(
            "entityview", EntityViewType.FILE.value)
    ]
    col_ids = my_added_cols_ids + view_default_ids
    scopeIds = [folder['id'].lstrip('syn')]

    # Create an empty entity-view with defined scope as folder

    entity_view = EntityViewSchema(name=str(uuid.uuid4()),
                                   scopeIds=scopeIds,
                                   addDefaultViewColumns=True,
                                   addAnnotationColumns=False,
                                   type='file',
                                   columns=my_added_cols,
                                   parent=project)

    entity_view = syn.store(entity_view)
    schedule_for_cleanup(entity_view)

    assert set(scopeIds) == set(entity_view.scopeIds)
    assert set(col_ids) == set(entity_view.columnIds)
    assert EntityViewType.FILE.value == entity_view.viewTypeMask

    # get the current view-schema
    view = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(view.filepath)

    view_dict = list(
        csv.DictReader(io.open(view.filepath, encoding="utf-8", newline='')))

    # check that all of the annotations were retrieved from the view
    assert set(file_annotations.keys()).issubset(set(view_dict[0].keys()))

    updated_a_file = syn.get(a_file.id, downloadFile=False)

    # Check that the values are the same as what was set
    # Both in the view and on the entity itself
    for k, v in file_annotations.items():
        assert view_dict[0][k] == v
        assert updated_a_file.annotations[k][0] == v

    # Make a change to the view and store
    view_dict[0]['fileFormat'] = 'PNG'

    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        temp_filename = temp.name

    with io.open(temp_filename, mode='w', encoding="utf-8",
                 newline='') as temp_file:
        dw = csv.DictWriter(temp_file,
                            fieldnames=view_dict[0].keys(),
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        dw.writeheader()
        dw.writerows(view_dict)
        temp_file.flush()
    syn.store(Table(entity_view.id, temp_filename))
    new_view_dict = list(
        csv.DictReader(io.open(temp_filename, encoding="utf-8", newline='')))
    assert new_view_dict[0]['fileFormat'] == 'PNG'

    # query for the change
    start_time = time.time()

    new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(new_view_results.filepath)
    new_view_dict = list(
        csv.DictReader(
            io.open(new_view_results.filepath, encoding="utf-8", newline='')))
    # query until change is seen.
    while new_view_dict[0]['fileFormat'] != 'PNG':
        # check timeout
        assert time.time() - start_time < QUERY_TIMEOUT_SEC
        # query again
        new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
        new_view_dict = list(
            csv.DictReader(
                io.open(new_view_results.filepath,
                        encoding="utf-8",
                        newline='')))
    # paranoid check
    assert new_view_dict[0]['fileFormat'] == 'PNG'
def process_rows(syn,
                 voice_table_id=VOICE_TABLE,
                 last_row_version=None,
                 limit=None,
                 offset=None,
                 data_columns=DATA_COLUMNS,
                 completed=None,
                 out_files=None,
                 opensmile_conf=OPENSMILE_CONF_PATH,
                 cleanup=True,
                 append=True):
    """
    Perform audio feature extraction for rows in a Synapse table
    containing file references. For each .m4a audio file:
        1. convert to .wav with ffmpeg
        2. extract features using openSMILE
        3. accumulate metadata and features in an output .csv file

    Parameters:
        syn: connection to Synapse
        voice_table_id (str): Synapse ID of voice table.
        last_row_version: Compute features only for rows whose version is
            greater than the one given. Enables processing rows added to
            the source table since the script was last run.
        limit: Maximum number of rows to process
        offset: Process rows starting from an offset from the 1st row
        data_columns (list of str): Column names holding file handles
        completed (list of str): Optional paths to files holding completed
            feature data (parallel to data_columns)
        out_files (list of str): Paths to files into which to write feature
            data (parallel to data_columns)
        opensmile_conf (str): Path to opensmile configuration file
        cleanup (bool): remove temporary files
        append (bool): append rows to output files

    Notes:
        The voice table is expected to hold metadata columns and fileHandles to
        raw voice data from the mPower app in m4a format. For each column in
        data_columns, the program will produce one output file, which will have
        one row of metadata and features for each row in the source table.
    """
    ##----------------------------------------------------------
    ## Query source table with limit and offset
    ##----------------------------------------------------------
    query = "SELECT {cols} FROM {table_id}".format(
        cols=','.join(double_quote(KEEP_COLUMNS + data_columns)),
        table_id=voice_table_id)
    if last_row_version:
        query += " WHERE ROW_VERSION > {0}".format(last_row_version)
    if limit:
        query += " LIMIT {0}".format(limit)
    if offset:
        query += " OFFSET {0}".format(offset)
    results = syn.tableQuery(query)

    ##----------------------------------------------------------
    ## load results as a DataFrame, but specify string dtype for FileHandleId
    ## columns so Pandas doesn't infer their type as int64 or float (with nans
    ## for missing values).
    ##----------------------------------------------------------
    df = pd.read_csv(results.filepath,
                     dtype={col: 'string'
                            for col in data_columns})
    df.index = [
        "%s_%s" % (id, version)
        for id, version in zip(df["ROW_ID"], df["ROW_VERSION"])
    ]
    del df["ROW_ID"]
    del df["ROW_VERSION"]

    ##----------------------------------------------------------
    ## don't redo rows for which all data columns have
    ## already been processed
    ##----------------------------------------------------------
    completed_dfs = {}
    if completed:
        completed_rows = pd.Series(True, index=df.index)
        for i, column in enumerate(data_columns):
            ## read .csv
            completed_dfs[column] = pd.read_csv(
                completed[i], dtype={col: 'string'
                                     for col in data_columns})
            ## fix calculatedMeds column name
            completed_dfs[column] = completed_dfs[column].rename(
                columns={'medTimepoint': 'calculatedMeds'})
            ## reorder metadata columns to match query results,
            ## assuming completed dfs have metadata columns similar to:
            ## recordId, createdOn, appVersion, medTimepoint, ROW_VERSION,
            ## healthCode, phoneInfo, ROW_ID, audio_countdown.m4a, audio_audio.m4a,
            ## and cols 10:72 are the 62 GeMAPS features computed by openSMILE
            ## starting with 'F0semitoneFrom27.5Hz_sma3nz_amean'
            column_index = df.columns.append(
                completed_dfs[column].columns[10:72])
            completed_dfs[column] = completed_dfs[column][column_index]
            ## track rows with all data columns completed
            completed_rows = completed_rows & df.recordId.isin(
                completed_dfs[column].recordId)
    else:
        completed_rows = pd.Series(False, index=df.index)
    df_to_download = df.loc[~completed_rows, :]

    ##----------------------------------------------------------
    ## Bulk download audio data in .m4a format
    ##----------------------------------------------------------
    file_map = syn.downloadTableColumns(Table(results.tableId, df_to_download),
                                        data_columns)

    ##----------------------------------------------------------
    ## unix time stamps -> nicely formatted dates
    ##----------------------------------------------------------
    df.createdOn = df.createdOn.apply(utils.from_unix_epoch_time)

    ##----------------------------------------------------------
    ## process audio files
    ##----------------------------------------------------------
    for i in range(df.shape[0]):
        row = df.iloc[[i], :]
        print("processing:", i, row['recordId'].values[0])

        for column, out_file in zip(data_columns, out_files):

            ## check if we've already processed this record
            if completed_rows.iloc[i]:
                out_row = completed_dfs[column].loc[
                    completed_dfs[column].recordId == df.recordId.iloc[i], :]
                print("already computed!")
            else:

                file_handle_id = df[column].iloc[i]

                ## Pandas represents missing values as nan
                if isinstance(file_handle_id, float):
                    if math.isnan(file_handle_id):
                        continue

                try:
                    filepath = file_map[file_handle_id]
                except KeyError:
                    print('No file path for file handle id "%s".' % file_handle_id)
                    continue

                try:
                    ##----------------------------------------------------------
                    ## convert to wav
                    ##----------------------------------------------------------
                    basename, ext = os.path.splitext(
                        os.path.basename(filepath))
                    wave_file = basename + ".wav"
                    if os.path.exists(wave_file):
                        os.remove(wave_file)
                    command = "ffmpeg -i {infile} -ac 2 {outfile}".format(
                        infile=filepath, outfile=wave_file)
                    output = subprocess.check_output(command,
                                                     shell=True,
                                                     stderr=subprocess.STDOUT)

                    ##----------------------------------------------------------
                    ## extract features with openSMILE
                    ## example: SMILExtract -I output.wav -C ./openSMILE-2.1.0/config/gemaps/GeMAPSv01a.conf --csvoutput features.csv
                    ##----------------------------------------------------------
                    features_file = basename + ".csv"
                    if os.path.exists(features_file):
                        os.remove(features_file)
                    command = "SMILExtract -I {input_file} -C {conf_file} --csvoutput {output_file}".format(
                        input_file=wave_file,
                        conf_file=opensmile_conf,
                        output_file=features_file)
                    output = subprocess.check_output(command,
                                                     shell=True,
                                                     stderr=subprocess.STDOUT)

                    ##----------------------------------------------------------
                    ## merge metadata and features
                    ##----------------------------------------------------------
                    features = pd.read_csv(features_file,
                                           sep=';',
                                           index_col=None)
                    ## get rid of useless column
                    features.drop('name', axis=1, inplace=True)
                    ## force the indexes to be equal so they will concat into 1 row. WTF, pandas?
                    features.index = row.index
                    out_row = pd.concat((row, features), axis=1)

                except Exception as ex1:
                    try:
                        sys.stderr.write(
                            "~~>Exception while processing record:{record}\n".
                            format(record=row['recordId']))
                    except Exception as ex2:
                        sys.stderr.write(
                            "~~~>Exception while processing record.\n")

                finally:
                    if cleanup:
                        ## openSMILE wants to output an .arff file whether you ask it to or not. Worse
                        ## yet, it appends, so it keeps growing. Let's clean it up.
                        opensmile_arff_file = "output.arff"
                        for v in [
                                'wave_file', 'features_file',
                                'opensmile_arff_file'
                        ]:
                            try:
                                if v in locals() and os.path.exists(
                                        locals()[v]):
                                    os.remove(locals()[v])
                            except Exception as ex:
                                sys.stderr.write(
                                    'Error cleaning up temp files: ')
                                sys.stderr.write(str(ex))
                                sys.stderr.write('\n')

            ## append row to output .csv file
            append = (append or i > 0)
            with open(out_file, 'a' if append else 'w') as f:
                out_row.to_csv(f,
                               header=(not append),
                               index=False,
                               quoting=csv.QUOTE_NONNUMERIC)

    print("processing rows complete!")
Example #18
	if not oldValues.empty:
		if not all([old == new for old, new in zip(oldValues.values[0], newValues)]):
			projectTrackerDf[projectTrackerDf['projectEntity'] == synId] = newValues
		else:
			removeSamples.append(synId)
	else:
		projectTrackerDf = pd.concat([projectTrackerDf, pd.DataFrame([newValues], columns=['projectEntity','numberOfFiles','numberOfContributors','lateModified','Active'])])

newUploads = projectTrackerDf[~projectTrackerDf['projectEntity'].isin(removeSamples)].copy()
if not newUploads.empty:
	newUploads['lateModified'] = newUploads['lateModified'].apply(int)
	newUploads['numberOfFiles'] = newUploads['numberOfFiles'].apply(int)
	newUploads['numberOfContributors'] = newUploads['numberOfContributors'].apply(int)
	newUploads.loc[newUploads['lateModified'] == 0, 'lateModified'] = ""
	schema = syn.get(projectUploadActivitySynId)
	tablestore = Table(schema, newUploads, etag=projectTracker.etag)
	tablestore = syn.store(tablestore)
else:
	print("No updates!")


#Table 2: Files by assay type
#Assay Type -- Number of Files -- Number of Cell Lines
#> assay, grab number of unique synapseid, sampleIdentifier
#https://www.synapse.org/#!Synapse:syn4939478/wiki/411658
ntap_generated_data_synId = "syn7805078"
ntap_generated_data = syn.tableQuery('SELECT * FROM %s' % ntap_generated_data_synId)
ntap_generated_data_df = ntap_generated_data.asDataFrame()

annot_synIds = ["syn7506024","syn7805075","syn7992153"]
assaysNumSynId = {}
Example #19
def opensmile_features_to_synapse(in_files, synapse_project_id,
                                  table_name, username, password):
    """
    Save openSMILE's SMILExtract audio features to a Synapse table.

    Parameters
    ----------
    in_files : list of strings
        full path to the input files
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    table_data : Pandas DataFrame
        output table
    table_name : string
        schema name of table
    synapse_table_id : string
        Synapse table ID

    Examples
    --------
    >>> from mhealthx.features import opensmile_features_to_synapse
    >>> in_files = ['/home/arno/smile/test1.wav.csv','/home/arno/smile/test2.wav.csv','/home/arno/smile/test3.wav.csv']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Phonation openSMILE feature table'
    >>> username = ''
    >>> password = ''
    >>> table_data, table_name, synapse_table_id = opensmile_features_to_synapse(in_files, synapse_project_id, table_name, username, password)

    """
    import pandas as pd
    import synapseclient
    from synapseclient import Schema, Table, as_table_columns

    from mhealthx.io_data import concatenate_tables_to_synapse_table as cat

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Read each input file into a DataFrame; the frames are concatenated
    # into a single Synapse table below:
    frames = [pd.read_csv(in_file) for in_file in in_files]

    table_data, project_id = cat(frames, synapse_project_id, table_name,
                                 username, password)

    # Create table schema:
    schema = Schema(name=table_name, columns=as_table_columns(table_data),
                    parent=synapse_project_id)

    # Store as Synapse table:
    table = syn.store(Table(schema, table_data))
    synapse_table_id = str(table.tableId)

    return table_data, table_name, synapse_table_id
Example #20
def createAMPADTable(keyFile, clinicalFile):
    """
    Create the AMP AD table with merged data from keyFile with clinicalFile.
    If any of the supplementary files exist for a particular dataset, change
    the binary classifiers to the synapse ID holding the data and reset 0
    to null for the table.

    Input:
        keyFile: Dataframe with the keys and information regarding what
            exists for each patient
        clinicalFile: Dataframe with clinical data for various patients

    """

    toUpload = []

    clinicalHeader = clinicalFile.columns.values

    #seenList = []
    # Iterate through each project within keyFile
    for i, row in keyFile.iterrows():
        # Create empty list for new row to be added to synapse table
        newRow = []

        # Ignore binary variables, which all end in '_data'
        for item in row.items():
            if (item[0] == 'niagas_data'):
                if (not pd.isnull(row.niagas_data)):
                    newRow.append(arrayExpressionSynID)
                else:
                    newRow.append(float('nan'))

            elif (not item[0].endswith('_data')):
                newRow.append(item[1])

        # Check if row has clinical data
        if (row.clinical_data):
            # Create reference to clinicalFile project ID
            clinicalKeyList = clinicalFile['projid']

            # get the index of the projID in the clinical file
            index = clinicalKeyList[clinicalKeyList ==
                                    row.projid].index.tolist()

            if (len(index) == 1):
                index = index[0]
                #seenList.append(row.projid)
                for entry in clinicalFile.iloc[index][1:]:
                    newRow.append(entry)

            # If the length of the index is 0, it means the key file thinks
            # there is clinical information for this patient but it does
            # not exist in the clinical file
            elif (len(index) == 0):
                print("Key file indicates that projID %s should have "\
                    "clinical information, but it does not exist in "\
                    "the clinical information file" % row.projid)
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))

            # If the length of the index list is greater than 1, that means projID
            # appears more than once in the file. Send warning to user
            else:
                print("projID %s appears more than once in clinical file at "\
                    "positions %s" % (row.projid, index))
                for _ in range(1, len(clinicalHeader)):
                    newRow.append(float('nan'))

        else:
            for _ in range(1, len(clinicalHeader)):
                newRow.append(float('nan'))

        # Check if row has gwas data
        if (row.gwas_data):
            newRow.append(genotypeSynID)
            newRow.append(imputedGenotypeSynID)
        else:
            newRow.append(float('nan'))
            newRow.append(float('nan'))

        if (row.mwas_data):
            newRow.append(methylationSynID)
        else:
            newRow.append(float('nan'))

        if (row.mirna_data):
            newRow.append(mirnaSynID)
        else:
            newRow.append(float('nan'))

        if (row.mrna_data):
            newRow.append(rnaseqSynID)
        else:
            newRow.append(float('nan'))

        toUpload.append(newRow)

    df = pd.DataFrame(toUpload)
    columns = list(keyFile.columns.values)
    index = columns.index('clinical_data') - 1
    columns.remove('clinical_data')

    gwas_index = columns.index('gwas_data')
    columns.remove('gwas_data')
    columns.insert(index + 1, 'genotype data')
    columns.insert(index + 2, 'imputed genotype data')

    for i in range(1, len(clinicalHeader)):
        columns.insert(index + i, clinicalHeader[i])

    df.columns = columns

    df.to_csv('mergedTables.csv', encoding='utf-8', index=False)

    print("Uploading to Synapse")
    schema = Schema(name='AMP AD Samples Table',
                    columns=as_table_columns(df),
                    parent='syn2580853')
    syn.store(Table(schema, df))
Example #21
def updateDatabase(syn,
                   database,
                   new_dataset,
                   databaseSynId,
                   uniqueKeyCols,
                   toDelete=False):
    """
	Updates a Synapse table, matched by a unique key, with another dataset that has the same number and order of columns

	:param database:       The Synapse table (pandas dataframe)
	:param new_dataset:    New dataset (pandas dataframe)
	:param databaseSynId:  Synapse ID of the database table
	:param uniqueKeyCols:  Column(s) that make up the unique key

	:returns:              None
	"""
    checkBy = 'UNIQUE_KEY'
    database = database.fillna("")
    new_dataset = new_dataset.fillna("")
    #Columns must be in the same order
    new_dataset = new_dataset[database.columns]
    database[uniqueKeyCols] = database[uniqueKeyCols].applymap(str)
    database[checkBy] = database[uniqueKeyCols].apply(lambda x: ' '.join(x),
                                                      axis=1)
    new_dataset[uniqueKeyCols] = new_dataset[uniqueKeyCols].applymap(str)
    new_dataset[checkBy] = new_dataset[uniqueKeyCols].apply(
        lambda x: ' '.join(x), axis=1)
    updateSet = new_dataset[new_dataset[checkBy].isin(database[checkBy])]
    updatingDatabase = database[database[checkBy].isin(new_dataset[checkBy])]

    allRowIds = database.index.values
    rowIds = updatingDatabase.index.values
    #If you input the exact same dataframe theres nothing to update
    if updateSet.empty and updatingDatabase.empty:
        differentRows = []
    else:
        allRowIds = database.index.values
        rowIds = updatingDatabase.index.values

        updateSet.index = updateSet[checkBy]
        updatingDatabase.index = updatingDatabase[checkBy]
        updateSet = updateSet.loc[updatingDatabase.index]
        differences = updateSet != updatingDatabase
        differentRows = differences.apply(sum, axis=1) > 0

    if sum(differentRows) > 0:
        updatingDatabase.loc[differentRows] = updateSet.loc[differentRows]
        toUpdate = updatingDatabase.loc[differentRows]
        toUpdate.index = [
            rowId for rowId, row in zip(rowIds, differentRows) if row
        ]
        del toUpdate[checkBy]
        print("Updating rows")
        table = syn.store(Table(syn.get(databaseSynId), toUpdate))
    else:
        print("No updated rows")
    #All deleted rows (This assumes that all data that don't show up in the new uploaded data should be deleted...)
    if toDelete:
        database.index = allRowIds
        deleteSets = database[~database[checkBy].isin(new_dataset[checkBy])]
        del deleteSets[checkBy]
        if not deleteSets.empty:
            print("Deleting Rows")
            deleteRows = syn.delete(Table(syn.get(databaseSynId), deleteSets))
        else:
            print("No deleted rows")
    #All new rows
    newSet = new_dataset[~new_dataset[checkBy].isin(database[checkBy])]
    if not newSet.empty:
        print("Adding Rows")
        del newSet[checkBy]
        table = syn.store(Table(syn.get(databaseSynId), newSet))
    else:
        print("No new rows")
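
A hedged usage sketch for this variant, which takes the client explicitly and a list of key columns; the table ID, file name, and key columns are placeholders.

import pandas as pd

current = syn.tableQuery("select * from syn123").asDataFrame()   # placeholder table ID
incoming = pd.read_csv("new_release.csv")                        # placeholder file with the same columns
updateDatabase(syn, current, incoming, databaseSynId="syn123",
               uniqueKeyCols=["center", "sample_id"], toDelete=False)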
def archive(evaluation, destination=None, token=None, name=None, query=None):
    """
    Archive the submissions for the given evaluation queue and store them in the destination synapse folder.

    :param evaluation: a synapse evaluation queue or its ID
    :param destination: a synapse folder or its ID
    :param query: a query that will return the desired submissions. At least the ID must be returned.
                  defaults to _select * from evaluation_[EVAL_ID] where status=="SCORED"_.
    """
    challenge = {'5877348':'FusionDetection','5952651':'IsoformQuantification'}
    if not query:
        query = 'select * from evaluation_%s where status=="SCORED"' % utils.id_of(evaluation)
    path = challenge[utils.id_of(evaluation)]
    ## for each submission, download its associated file and write a line of metadata
    results = Query(query=query)
    if 'objectId' not in results.headers:
        raise ValueError("Can't find the required field \"objectId\" in the results of the query: \"{0}\"".format(query))
    for result in results:
        #Check if the folder has already been created in synapse 
        #(This is used as a tool to check submissions that have already been cached)
        new_map = []
        mapping = syn.get("syn7348150")
        submissionId = result[results.headers.index('objectId')]
        check = syn.query('select id,name from folder where parentId == "%s" and name == "%s"' % (destination,submissionId))
        if check['totalNumberOfResults']==0:
            os.mkdir(submissionId)
            submission = syn.getSubmission(submissionId, downloadLocation=submissionId)
            if submission.entity.externalURL is None:
                newFilePath = submission.filePath.replace(' ', '_')
                shutil.move(submission.filePath,newFilePath)
                #Store CWL file in bucket
                os.system('gsutil cp -R %s gs://smc-rna-cache/%s' % (submissionId,path))
                with open(newFilePath,"r") as cwlfile:
                    docs = yaml.safe_load(cwlfile)
                    merged = docs['$graph']
                    docker = []
                    for tools in merged:
                        if tools['class'] == 'CommandLineTool':
                            if tools.get('requirements',None) is not None:
                                for i in tools['requirements']:
                                    if i.get('dockerPull',None) is not None:
                                        docker.append(i['dockerPull'])
                            if tools.get('hints', None) is not None:
                                for i in tools['hints']:
                                    if i.get('dockerPull',None) is not None:
                                        docker.append(i['dockerPull']) 
                        if tools['class'] == 'Workflow':
                            hints = tools.get("hints",None)
                            if hints is not None:
                                for i in tools['hints']:
                                    if os.path.basename(i['class']) == "synData":
                                        temp = syn.get(i['entity'])
                                        #create synid and index mapping
                                        new_map.append([temp.id,"gs://smc-rna-cache/%s/%s/%s" %(path,submissionId,temp.name)])
                                        #Store index files
                                        os.system('gsutil cp %s gs://smc-rna-cache/%s/%s' % (temp.path,path,submissionId))
                os.system('rm -rf ~/.synapseCache/*')
            else:
                os.system('rm %s' % os.path.join(submissionId, submission.name))
                test = subprocess.check_call(["python", os.path.join(os.path.dirname(__file__),"../../SMC-RNA-Eval/sbg-download.py"), "--token", token, submission.name, submissionId])
                os.system('gsutil cp -R %s gs://smc-rna-cache/%s' % (submissionId,path))
                #Pull down docker containers
                with open("%s/submission.cwl" % submissionId,"r") as cwlfile:
                    docs = yaml.safe_load(cwlfile)
                    merged = docs['steps']
                    docker = []
                    for tools in merged:
                        for hint in tools['run']['hints']:
                            if hint['class'] == 'DockerRequirement':
                                docker.append(hint['dockerPull'])
                        for require in tools['run']['requirements']:
                            if require.get('requirements') is not None:
                                for i in require.get('requirements'):
                                    if i['class'] == 'DockerRequirement':
                                        docker.append(i['dockerPull'])
            os.system('rm -rf %s' % submissionId)
            if len(new_map) > 0:
                table = syn.store(Table(mapping, new_map))
            #Pull, save, and store docker containers
            docker = set(docker)
            for i in docker:
                fileName = os.path.basename(i).replace(":","_")
                os.system('sudo -i docker pull %s' % i)
                #os.system('sudo -i docker save %s' % i)
                os.system('sudo docker save -o %s.tar %s' %(fileName,i))
                os.system('sudo chmod a+r %s.tar' % fileName)
                os.system('gsutil cp %s.tar gs://smc-rna-cache/%s/%s' % (fileName,path,submissionId))
                os.remove("%s.tar" % fileName)
            submission_parent = syn.store(Folder(submissionId,parent=destination))