Example #1
def test_RowSetTable_len():
    schema = Schema(parentId="syn123",
                    id='syn456',
                    columns=[Column(name='column_name', id='123')])
    rowset = RowSet(schema=schema,
                    rows=[Row(['first row']),
                          Row(['second row'])])
    row_set_table = RowSetTable(schema, rowset)
    assert_equals(2, len(row_set_table))
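
The Table() factory used in a later example (Table(schema, rowset)) dispatches on the type of its values argument; a minimal sketch, reusing the schema and rowset built above, of the same length check going through the factory (the RowSetTable dispatch is an assumption about the factory's behavior):

# Hedged sketch: Table() is assumed to return a RowSetTable when handed a
# RowSet, matching the object constructed directly in the test above.
from synapseclient.table import Table, RowSetTable

table = Table(schema, rowset)
assert isinstance(table, RowSetTable)
assert len(table) == 2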
def dontruntest_big_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='n', columnType='INTEGER'))
    cols.append(Column(name='is_bogus', columnType='BOOLEAN'))

    table1 = syn.store(Schema(name='Big Table', columns=cols, parent=project))

    rows_per_append = 10

    for i in range(1000):
        rows = []
        for j in range(rows_per_append):
            foo = cols[1].enumValues[random.randint(0, 2)]
            rows.append(
                Row(('Robot ' + str(i * rows_per_append + j), foo,
                     random.random() * 200.0, random.randint(0, 100),
                     random.random() >= 0.5)))
        rowset1 = syn.store(RowSet(columns=cols, schema=table1, rows=rows))

    results = syn.tableQuery("select * from %s" % table1.id)

    results = syn.tableQuery(
        "select n, COUNT(n), MIN(x), AVG(x), MAX(x), SUM(x) from %s group by n"
        % table1.id)
    df = results.asDataFrame()
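
asDataFrame() needs pandas; a minimal guarded sketch of the same conversion, following the try/except pattern the long test_rowset_tables example below uses (the sys import is assumed):

# Hedged sketch: guard the DataFrame conversion so a missing pandas install
# skips it instead of failing, as test_rowset_tables does below.
import sys

try:
    import pandas as pd
    df = results.asDataFrame()
except ImportError:
    sys.stderr.write('Pandas is apparently not installed, skipping.\n')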
Example #3
def test_Row_forward_compatibility():
    row = Row("2, 3, 4", rowId=1, versionNumber=1, etag=None, new_field="new")
    assert_equals("2, 3, 4", row.get("values"))
    assert_equals(1, row.get("rowId"))
    assert_equals(1, row.get("versionNumber"))
    assert_is_none(row.get("etag"))
    assert_equals("new", row.get("new_field"))
def test_Row_forward_compatibility():
    row = Row("2, 3, 4", rowId=1, versionNumber=1, etag=None, new_field="new")
    assert "2, 3, 4" == row.get("values")
    assert 1 == row.get("rowId")
    assert 1 == row.get("versionNumber")
    assert row.get("etag") is None
    assert "new" == row.get("new_field")
def test_store_table_datetime():
    current_datetime = datetime.fromtimestamp(round(time.time(), 3))
    schema = syn.store(
        Schema("testTable", [Column(name="testerino", columnType='DATE')],
               project))
    rowset = RowSet(rows=[Row([current_datetime])], schema=schema)
    rowset_table = syn.store(Table(schema, rowset))

    query_result = syn.tableQuery("select * from %s" % id_of(schema),
                                  resultsAs="rowset")
    assert_equals(current_datetime,
                  query_result.rowset['rows'][0]['values'][0])
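
DATE columns round-trip through Unix epoch milliseconds, which is presumably why the test above rounds its timestamp to three decimal places; a minimal sketch of the conversion, using the to_unix_epoch_time helper that a later example imports from synapseclient.utils (from_unix_epoch_time as its inverse is an assumption):

# Hedged sketch: anything finer than millisecond precision is lost when a
# DATE value is stored, hence round(time.time(), 3) in the test above.
from datetime import datetime
from synapseclient import utils

dt = datetime(2008, 1, 3, 12, 30, 45)
millis = utils.to_unix_epoch_time(dt)
assert utils.from_unix_epoch_time(millis) == dt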
def _table_setup(cls):
    # set up a table
    cols = [
        Column(name='foo', columnType='STRING', maximumSize=1000),
        Column(name='bar', columnType='STRING')
    ]
    schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=cols,
               parent=project))
    data = [['foo1', None], [None, 'bar2']]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema
def _table_setup(cls):
    # set up a table
    cols = [
        Column(name='foo', columnType='INTEGER'),
        Column(name='bar', columnType='INTEGER')
    ]
    schema = syn.store(
        Schema(name='PartialRowTest' + str(uuid.uuid4()),
               columns=cols,
               parent=project))
    data = [[1, None], [None, 2]]
    syn.store(RowSet(schema=schema, rows=[Row(r) for r in data]))
    return schema
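
Both _table_setup variants build a sparse two-row table, presumably to exercise partial-row updates; a hedged sketch of such an update, assuming PartialRow and PartialRowset from synapseclient.table (value dicts are keyed by column ID, and the IDs below are hypothetical):

from synapseclient.table import PartialRow, PartialRowset

# Hypothetical IDs: in practice the column ID comes from the stored schema
# and the row ID from a query against the table.
bar_column_id = '456'
first_row_id = 1

# Fill in only the missing 'bar' cell, leaving the 'foo' column untouched.
partial_rowset = PartialRowset(schema, [
    PartialRow({bar_column_id: 'bar2'}, rowId=first_row_id)
])
syn.store(partial_rowset)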
Example #8
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols,
                              parent=project))
    schedule_for_cleanup(schema)

    data = [
        ["John Coltrane", "Blue Train", 1957, "BLP 1577",
         "coltraneBlueTrain.jpg"],
        ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
        ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001",
         "rollinsBN4001.jpg"],
        ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543",
         "burrellWarholBN1543.jpg"]
    ]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn._chunkedUploadFile(path)
        row[4] = file_handle['id']

    row_reference_set = syn.store(
        RowSet(columns=cols, schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        'select artist, album, year, catalog, cover from %s' % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        print "%s_%s" % (row.rowId, row.versionNumber), row.values
        file_info = syn.downloadTableFile(results,
                                          rowId=row.rowId,
                                          versionNumber=row.versionNumber,
                                          column='cover',
                                          downloadLocation='.')
        assert filecmp.cmp(original_files[i], file_info['path'])
        schedule_for_cleanup(file_info['path'])
def test_rowset_tables():
    cols = [
        Column(name='name', columnType='STRING', maximumSize=1000),
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']),
        Column(name='x', columnType='DOUBLE'),
        Column(name='age', columnType='INTEGER'),
        Column(name='cartoon', columnType='BOOLEAN'),
        Column(name='description', columnType='LARGETEXT')
    ]

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert_equals(len(row_reference_set1['rows']), 4)
def test_download_table_files():
    cols = [
        Column(name='artist', columnType='STRING', maximumSize=50),
        Column(name='album', columnType='STRING', maximumSize=50),
        Column(name='year', columnType='INTEGER'),
        Column(name='catalog', columnType='STRING', maximumSize=50),
        Column(name='cover', columnType='FILEHANDLEID')
    ]

    schema = syn.store(Schema(name='Jazz Albums', columns=cols,
                              parent=project))
    schedule_for_cleanup(schema)

    data = [
        ["John Coltrane", "Blue Train", 1957, "BLP 1577",
         "coltraneBlueTrain.jpg"],
        ["Sonny Rollins", "Vol. 2", 1957, "BLP 1558", "rollinsBN1558.jpg"],
        ["Sonny Rollins", "Newk's Time", 1958, "BLP 4001",
         "rollinsBN4001.jpg"],
        ["Kenny Burrel", "Kenny Burrel", 1956, "BLP 1543",
         "burrellWarholBN1543.jpg"]
    ]

    ## upload files and store file handle ids
    original_files = []
    for row in data:
        path = utils.make_bogus_data_file()
        original_files.append(path)
        schedule_for_cleanup(path)
        file_handle = syn.uploadFileHandle(path, project)
        row[4] = file_handle['id']

    row_reference_set = syn.store(
        RowSet(schema=schema, rows=[Row(r) for r in data]))

    ## retrieve the files for each row and verify that they are identical to the originals
    results = syn.tableQuery(
        "select artist, album, 'year', 'catalog', cover from %s" % schema.id,
        resultsAs="rowset")
    for i, row in enumerate(results):
        path = syn.downloadTableFile(results,
                                     rowId=row.rowId,
                                     versionNumber=row.versionNumber,
                                     column='cover')
        assert filecmp.cmp(original_files[i], path)
        schedule_for_cleanup(path)

    ## test that cached copies are returned for already downloaded files
    original_downloadFile_method = syn._downloadFileHandle
    with patch(
            "synapseclient.Synapse._downloadFileHandle") as _downloadFile_mock:
        _downloadFile_mock.side_effect = original_downloadFile_method

        results = syn.tableQuery(
            "select artist, album, 'year', 'catalog', cover from %s where artist = 'John Coltrane'"
            % schema.id,
            resultsAs="rowset")
        for i, row in enumerate(results):
            file_path = syn.downloadTableFile(results,
                                              rowId=row.rowId,
                                              versionNumber=row.versionNumber,
                                              column='cover')
            assert filecmp.cmp(original_files[i], file_path)

        assert not _downloadFile_mock.called, "Should have used cached copy of file and not called _downloadFileHandle"

    ## test download table column
    results = syn.tableQuery('select * from %s' % schema.id)
    ## uncache 2 out of 4 files
    for i, row in enumerate(results):
        if i % 2 == 0:
            syn.cache.remove(row[6])
    file_map = syn.downloadTableColumns(results, ['cover'])
    assert len(file_map) == 4
    ## re-enumerate so each file is compared against its own original
    for i, row in enumerate(results):
        assert filecmp.cmp(original_files[i], file_map[row[6]])
def test_rowset_tables():
    cols = []
    cols.append(Column(name='name', columnType='STRING', maximumSize=1000))
    cols.append(
        Column(name='foo',
               columnType='STRING',
               enumValues=['foo', 'bar', 'bat']))
    cols.append(Column(name='x', columnType='DOUBLE'))
    cols.append(Column(name='age', columnType='INTEGER'))
    cols.append(Column(name='cartoon', columnType='BOOLEAN'))
    cols.append(Column(name='description', columnType='LARGETEXT'))

    schema1 = syn.store(Schema(name='Foo Table', columns=cols, parent=project))

    ## Get columns associated with the given table
    retrieved_cols = list(syn.getTableColumns(schema1))

    ## Test that the columns we get are the same as the ones we stored
    assert len(retrieved_cols) == len(cols)
    for retrieved_col, col in zip(retrieved_cols, cols):
        assert retrieved_col.name == col.name
        assert retrieved_col.columnType == col.columnType

    data1 = [['Chris', 'bar', 11.23, 45, False, 'a'],
             ['Jen', 'bat', 14.56, 40, False, 'b'],
             ['Jane', 'bat', 17.89, 6, False, 'c' * 1002],
             ['Henry', 'bar', 10.12, 1, False, 'd']]
    row_reference_set1 = syn.store(
        RowSet(schema=schema1, rows=[Row(r) for r in data1]))
    assert len(row_reference_set1['rows']) == 4

    ## add more new rows
    data2 = [['Fred', 'bat', 21.45, 20, True, 'e'],
             ['Daphne', 'foo', 27.89, 20, True, 'f'],
             ['Shaggy', 'foo', 23.45, 20, True, 'g'],
             ['Velma', 'bar', 25.67, 20, True, 'h']]
    syn.store(RowSet(schema=schema1, rows=[Row(r) for r in data2]))

    results = syn.tableQuery("select * from %s order by name" % schema1.id,
                             resultsAs="rowset")

    assert results.count == 8
    assert results.tableId == schema1.id

    ## test that the values made the round trip
    expected = sorted(data1 + data2)
    for expected_values, row in zip(expected, results):
        assert expected_values == row['values'], 'got %s but expected %s' % (
            row['values'], expected_values)

    ## To modify rows, we have to select them first.
    result2 = syn.tableQuery('select * from %s where age>18 and age<30' %
                             schema1.id,
                             resultsAs="rowset")

    ## make a change
    rs = result2.asRowSet()
    for row in rs['rows']:
        row['values'][2] = 88.888

    ## store it
    row_reference_set = syn.store(rs)

    ## check if the change sticks
    result3 = syn.tableQuery('select name, x, age from %s' % schema1.id,
                             resultsAs="rowset")
    for row in result3:
        if int(row['values'][2]) == 20:
            assert row['values'][1] == 88.888

    ## Add a column
    bday_column = syn.store(Column(name='birthday', columnType='DATE'))

    column = syn.getColumn(bday_column.id)
    assert column.name == "birthday"
    assert column.columnType == "DATE"

    schema1.addColumn(bday_column)
    schema1 = syn.store(schema1)

    results = syn.tableQuery(
        'select * from %s where cartoon=false order by age' % schema1.id,
        resultsAs="rowset")
    rs = results.asRowSet()

    ## put data in new column
    bdays = ('2013-3-15', '2008-1-3', '1973-12-8', '1969-4-28')
    for bday, row in zip(bdays, rs.rows):
        row['values'][6] = bday
    row_reference_set = syn.store(rs)

    ## query by date and check that we get back two kids
    date_2008_jan_1 = utils.to_unix_epoch_time(datetime(2008, 1, 1))
    results = syn.tableQuery(
        'select name from %s where birthday > %d order by birthday' %
        (schema1.id, date_2008_jan_1),
        resultsAs="rowset")
    assert ["Jane", "Henry"] == [row['values'][0] for row in results]

    try:
        import pandas as pd
        df = results.asDataFrame()
        assert all(df.loc[:, "name"] == ["Jane", "Henry"])
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    results = syn.tableQuery(
        'select birthday from %s where cartoon=false order by age' %
        schema1.id,
        resultsAs="rowset")
    for bday, row in zip(bdays, results):
        assert row['values'][0] == datetime.strptime(
            bday,
            "%Y-%m-%d"), "got %s but expected %s" % (row['values'][0], bday)

    try:
        import pandas as pd
        results = syn.tableQuery(
            "select foo, MAX(x), COUNT(foo), MIN(age) from %s group by foo order by foo"
            % schema1.id,
            resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape == (3, 4)
        assert all(df.iloc[:, 0] == ["bar", "bat", "foo"])
        assert all(df.iloc[:, 1] == [88.888, 88.888, 88.888])
        assert all(df.iloc[:, 2] == [3, 3, 2])
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )

    ## test delete rows by deleting cartoon characters
    syn.delete(
        syn.tableQuery('select name from %s where cartoon = true' % schema1.id,
                       resultsAs="rowset"))

    results = syn.tableQuery('select name from %s order by birthday' %
                             schema1.id,
                             resultsAs="rowset")
    assert ["Chris", "Jen", "Jane",
            "Henry"] == [row['values'][0] for row in results]

    ## check what happens when query result is empty
    results = syn.tableQuery('select * from %s where age > 1000' % schema1.id,
                             resultsAs="rowset")
    assert len(list(results)) == 0

    try:
        import pandas as pd
        results = syn.tableQuery('select * from %s where age > 1000' %
                                 schema1.id,
                                 resultsAs="rowset")
        df = results.asDataFrame()
        assert df.shape[0] == 0
    except ImportError:
        sys.stderr.write(
            'Pandas is apparently not installed, skipping part of test_rowset_tables.\n\n'
        )
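
Every query in this example passes resultsAs="rowset"; omitting it (as the plain select in the previous example does) yields a CSV-backed result that also supports iteration and asDataFrame(), and is generally the faster path for large tables. A minimal sketch of the same round trip without row sets:

# Hedged sketch: the default CSV-backed query result, no row-set paging.
results = syn.tableQuery("select name, x, age from %s" % schema1.id)
df = results.asDataFrame()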
Example #12
def files_to_synapse_table(in_files,
                           synapse_project_id,
                           table_name,
                           column_name='fileID',
                           username='',
                           password=''):
    """
    Upload files and file handle IDs to Synapse.

    Parameters
    ----------
    in_files : list of strings
        paths to files to upload to Synapse
    synapse_project_id : string
        Synapse ID for project to which table is to be written
    table_name : string
        schema name of table
    column_name : string
        header for column of fileIDs
    username : string
        Synapse username (only needed once on a given machine)
    password : string
        Synapse password (only needed once on a given machine)

    Returns
    -------
    synapse_project_id : string
        Synapse ID for project

    Examples
    --------
    >>> from mhealthx.io_data import files_to_synapse_table
    >>> in_files = ['/Users/arno/Local/wav/test1.wav']
    >>> synapse_project_id = 'syn4899451'
    >>> table_name = 'Test to store files and file handle IDs'
    >>> column_name = 'fileID1'
    >>> username = ''
    >>> password = ''
    >>> synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)
    >>> #column_name = 'fileID2'
    >>> #in_files = ['/Users/arno/Local/wav/test2.wav']
    >>> #synapse_project_id = files_to_synapse_table(in_files, synapse_project_id, table_name, column_name, username, password)

    """
    import synapseclient
    from synapseclient import Schema
    from synapseclient.table import Column, RowSet, Row

    syn = synapseclient.Synapse()

    # Log in to Synapse:
    if username and password:
        syn.login(username, password)
    else:
        syn.login()

    # Store file handle IDs:
    files_handles = []
    for in_file in in_files:
        file_handle = syn._chunkedUploadFile(in_file)
        files_handles.append([file_handle['id']])

    # New column headers:
    new_column_header = Column(name=column_name, columnType='FILEHANDLEID')

    # See if Synapse table exists:
    # tex = list(syn.chunkedQuery("select id from Table where parentId=='{0}'"
    #                             " and name=='{1}'".format(synapse_project_id,
    #                                                       table_name)))
    # If Synapse table does not exist, create table schema:
    # if not tex:

    # Create table schema:
    schema = syn.store(
        Schema(name=table_name,
               columns=[new_column_header],
               parent=synapse_project_id))

    # Upload files and file handle IDs with new schema:
    syn.store(
        RowSet(columns=[new_column_header],
               schema=schema,
               rows=[Row(r) for r in files_handles]))

    return synapse_project_id
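
syn._chunkedUploadFile is a private client method (the first test_download_table_files above uses it too, while the second switched to the public syn.uploadFileHandle); a hedged sketch of the same file-handle loop using the public call, with the same names as the function above:

# Hedged sketch: the upload loop rewritten against the public API shown in
# Example #8, storing each new file handle ID as a one-element row.
files_handles = []
for in_file in in_files:
    file_handle = syn.uploadFileHandle(in_file, synapse_project_id)
    files_handles.append([file_handle['id']])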